 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gromacs/domdec/domdec_network.h"
#include "gromacs/domdec/ga2la.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/fileio/gmxfio.h"
#include "gromacs/fileio/pdbio.h"
#include "gromacs/gmxlib/chargegroup.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/hardware/hw_info.h"
#include "gromacs/imd/imd.h"
#include "gromacs/listed-forces/manage-threading.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/vec.h"
#include "gromacs/math/vectypes.h"
#include "gromacs/mdlib/constr.h"
#include "gromacs/mdlib/force.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdlib/genborn.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/mdlib/mdatoms.h"
#include "gromacs/mdlib/mdrun.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_grid.h"
#include "gromacs/mdlib/nsgrid.h"
#include "gromacs/mdlib/shellfc.h"
#include "gromacs/mdlib/vsite.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/df_history.h"
#include "gromacs/mdtypes/forcerec.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
#include "gromacs/mdtypes/mdatom.h"
#include "gromacs/mdtypes/nblist.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pulling/pull.h"
#include "gromacs/pulling/pull_rotation.h"
#include "gromacs/swap/swapcoords.h"
#include "gromacs/timing/wallcycle.h"
#include "gromacs/topology/block.h"
#include "gromacs/topology/idef.h"
#include "gromacs/topology/ifunc.h"
#include "gromacs/topology/mtop_util.h"
#include "gromacs/topology/topology.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/basenetwork.h"
#include "gromacs/utility/cstringutil.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/qsort_threadsafe.h"
#include "gromacs/utility/real.h"
#include "gromacs/utility/smalloc.h"

#include "domdec_constraints.h"
#include "domdec_internal.h"
#include "domdec_vsite.h"
#define DDRANK(dd, rank)    (rank)
#define DDMASTERRANK(dd)    (dd->masterrank)

struct gmx_domdec_master_t
{
    /* The cell boundaries */
    /* The global charge group division */
    int  *ncg;    /* Number of home charge groups for each node */
    int  *index;  /* Index of nnodes+1 into cg */
    int  *cg;     /* Global charge group index */
    int  *nat;    /* Number of home atoms for each node. */
    int  *ibuf;   /* Buffer for communication */
    rvec *vbuf;   /* Buffer for state scattering and gathering */
};

#define DD_NLOAD_MAX 9

const char *edlbs_names[edlbsNR] = { "off", "auto", "locked", "on" };
/* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */

/* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
#define DD_FLAG_NRCG  65535
#define DD_FLAG_FW(d) (1<<(16+(d)*2))
#define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
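/* Worked example of the flag layout: DD_FLAG_NRCG masks the lower 16 bits
 * of a flag word; the per-dimension move flags sit above them, two bits per
 * DD dimension. For dimension index d = 1, DD_FLAG_FW(1) == 1<<18 == 0x40000
 * (forward move) and DD_FLAG_BW(1) == 1<<19 == 0x80000 (backward move).
 */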
/* The DD zone order */
static const ivec dd_zo[DD_MAXZONE] =
{{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};

static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}};

static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}};

static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}};

static const ivec dd_zp0[dd_zp0n] = {{0, 0, 1}};
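/* The dd_zp* tables above pair each i-zone with the j-zone range it searches:
 * each triple appears to be {i-zone, first j-zone, last j-zone + 1}. In the
 * 3D setup, for instance, {1, 3, 6} would mean that i-zone 1 searches
 * j-zones 3, 4 and 5.
 */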
/* Factors used to avoid problems due to rounding issues */
#define DD_CELL_MARGIN       1.0001
#define DD_CELL_MARGIN2      1.00005
/* Factor to account for pressure scaling during nstlist steps */
#define DD_PRES_SCALE_MARGIN 1.02

/* Turn on DLB when the load imbalance causes this amount of total loss.
 * There is a bit of overhead with DLB and it's difficult to achieve
 * a load imbalance of less than 2% with DLB.
 */
#define DD_PERF_LOSS_DLB_ON  0.02

/* Warn about imbalance due to PP or PP/PME load imbalance at this loss */
#define DD_PERF_LOSS_WARN    0.05

#define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))

/* Use separate MPI send and receive commands
 * when nnodes <= GMX_DD_NNODES_SENDRECV.
 * This saves memory (and some copying for small nnodes).
 * For high parallelization scatter and gather calls are used.
 */
#define GMX_DD_NNODES_SENDRECV 4
/* An alternative (unused) index ordering, kept commented out for reference:

   #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])

   static void index2xyz(ivec nc, int ind, ivec xyz)
   {
       xyz[XX] = ind % nc[XX];
       xyz[YY] = (ind / nc[XX]) % nc[YY];
       xyz[ZZ] = ind / (nc[YY]*nc[XX]);
   }
 */

/* This order is required to minimize the coordinate communication in PME
 * which uses decomposition in the x direction.
 */
#define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
static void ddindex2xyz(ivec nc, int ind, ivec xyz)
{
    xyz[XX] = ind / (nc[YY]*nc[ZZ]);
    xyz[YY] = (ind / nc[ZZ]) % nc[YY];
    xyz[ZZ] = ind % nc[ZZ];
}
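/* Worked example: with nc = {2, 2, 2}, dd_index maps the cell coordinate
 * (x, y, z) to 4*x + 2*y + z, so cell (1, 0, 1) gets DD index 5, and
 * ddindex2xyz(nc, 5, xyz) recovers xyz = {1, 0, 1}.
 */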
static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
{
    ddindex = dd_index(dd->nc, c);
    if (dd->comm->bCartesianPP_PME)
    {
        ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
    }
    else if (dd->comm->bCartesianPP)
    {
        MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
    }

    return ddnodeid;
}

static gmx_bool dynamic_dd_box(const gmx_ddbox_t *ddbox, const t_inputrec *ir)
{
    return (ddbox->nboundeddim < DIM || inputrecDynamicBox(ir));
}

int ddglatnr(gmx_domdec_t *dd, int i)
{
    if (i >= dd->comm->nat[ddnatNR-1])
    {
        gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)",
                  i, dd->comm->nat[ddnatNR-1]);
    }
    atnr = dd->gatindex[i] + 1;

    return atnr;
}

t_block *dd_charge_groups_global(gmx_domdec_t *dd)
{
    return &dd->comm->cgs_gl;
}

static bool dlbIsOn(const gmx_domdec_comm_t *comm)
{
    return (comm->dlbState == edlbsOn);
}

static void vec_rvec_init(vec_rvec_t *v)
{
    v->nalloc = 0;
    v->v      = NULL;
}

static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
{
    v->nalloc = over_alloc_dd(n);
    srenew(v->v, v->nalloc);
}

void dd_store_state(gmx_domdec_t *dd, t_state *state)
{
    if (state->ddp_count != dd->ddp_count)
    {
        gmx_incons("The state does not match the domain decomposition state");
    }

    state->ncg_gl = dd->ncg_home;
    if (state->ncg_gl > state->cg_gl_nalloc)
    {
        state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
        srenew(state->cg_gl, state->cg_gl_nalloc);
    }
    for (i = 0; i < state->ncg_gl; i++)
    {
        state->cg_gl[i] = dd->index_gl[i];
    }

    state->ddp_count_cg_gl = dd->ddp_count;
}

gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
{
    return &dd->comm->zones;
}

void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
                      int *jcg0, int *jcg1, ivec shift0, ivec shift1)
{
    gmx_domdec_zones_t *zones;

    zones = &dd->comm->zones;

    while (icg >= zones->izone[izone].cg1)
    {
        izone++;
    }

    if (izone == 0)
    {
        *jcg0 = icg;
    }
    else if (izone < zones->nizone)
    {
        *jcg0 = zones->izone[izone].jcg0;
    }
    else
    {
        gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
                  icg, izone, zones->nizone);
    }

    *jcg1 = zones->izone[izone].jcg1;

    for (d = 0; d < dd->ndim; d++)
    {
        shift0[dim] = zones->izone[izone].shift0[dim];
        shift1[dim] = zones->izone[izone].shift1[dim];
        if (dd->comm->tric_dir[dim] || (dlbIsOn(dd->comm) && d > 0))
        {
            /* A conservative approach, this can be optimized */
        }
    }
}

int dd_natoms_vsite(gmx_domdec_t *dd)
{
    return dd->comm->nat[ddnatVSITE];
}

void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end)
{
    *at_start = dd->comm->nat[ddnatCON-1];
    *at_end   = dd->comm->nat[ddnatCON];
}
void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
{
    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
    int                   *index, *cgindex;
    gmx_domdec_comm_t     *comm;
    gmx_domdec_comm_dim_t *cd;
    gmx_domdec_ind_t      *ind;
    rvec                   shift = {0, 0, 0}, *buf, *rbuf;
    gmx_bool               bPBC, bScrew;

    cgindex = dd->cgindex;

    nat_tot = dd->nat_home;
    for (d = 0; d < dd->ndim; d++)
    {
        bPBC   = (dd->ci[dd->dim[d]] == 0);
        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
        copy_rvec(box[dd->dim[d]], shift);
        for (p = 0; p < cd->np; p++)
        {
            if (!bPBC)
            {
                for (i = 0; i < ind->nsend[nzone]; i++)
                {
                    at0 = cgindex[index[i]];
                    at1 = cgindex[index[i]+1];
                    for (j = at0; j < at1; j++)
                    {
                        copy_rvec(x[j], buf[n]);
                    }
                }
            }
            else if (!bScrew)
            {
                for (i = 0; i < ind->nsend[nzone]; i++)
                {
                    at0 = cgindex[index[i]];
                    at1 = cgindex[index[i]+1];
                    for (j = at0; j < at1; j++)
                    {
                        /* We need to shift the coordinates */
                        rvec_add(x[j], shift, buf[n]);
                    }
                }
            }
            else
            {
                for (i = 0; i < ind->nsend[nzone]; i++)
                {
                    at0 = cgindex[index[i]];
                    at1 = cgindex[index[i]+1];
                    for (j = at0; j < at1; j++)
                    {
                        buf[n][XX] = x[j][XX] + shift[XX];
                        /* This operation requires a special shift force
                         * treatment, which is performed in calc_vir.
                         */
                        buf[n][YY] = box[YY][YY] - x[j][YY];
                        buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
                    }
                }
            }

            rbuf = comm->vbuf2.v;

            /* Send and receive the coordinates */
            dd_sendrecv_rvec(dd, d, dddirBackward,
                             buf, ind->nsend[nzone+1],
                             rbuf, ind->nrecv[nzone+1]);

            for (zone = 0; zone < nzone; zone++)
            {
                for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
                {
                    copy_rvec(rbuf[j], x[i]);
                }
            }

            nat_tot += ind->nrecv[nzone+1];
        }
    }
}
void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
{
    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
    int                   *index, *cgindex;
    gmx_domdec_comm_t     *comm;
    gmx_domdec_comm_dim_t *cd;
    gmx_domdec_ind_t      *ind;
    gmx_bool               bShiftForcesNeedPbc, bScrew;

    cgindex = dd->cgindex;

    nzone   = comm->zones.n/2;
    nat_tot = dd->nat_tot;
    for (d = dd->ndim-1; d >= 0; d--)
    {
        /* Only forces in domains near the PBC boundaries need to
           consider PBC in the treatment of fshift */
        bShiftForcesNeedPbc = (dd->ci[dd->dim[d]] == 0);
        bScrew              = (bShiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX);
        if (fshift == NULL && !bScrew)
        {
            bShiftForcesNeedPbc = FALSE;
        }
        /* Determine which shift vector we need */

        for (p = cd->np-1; p >= 0; p--)
        {
            nat_tot -= ind->nrecv[nzone+1];

            sbuf = comm->vbuf2.v;

            for (zone = 0; zone < nzone; zone++)
            {
                for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
                {
                    copy_rvec(f[i], sbuf[j]);
                }
            }

            /* Communicate the forces */
            dd_sendrecv_rvec(dd, d, dddirForward,
                             sbuf, ind->nrecv[nzone+1],
                             buf, ind->nsend[nzone+1]);

            /* Add the received forces */
            if (!bShiftForcesNeedPbc)
            {
                for (i = 0; i < ind->nsend[nzone]; i++)
                {
                    at0 = cgindex[index[i]];
                    at1 = cgindex[index[i]+1];
                    for (j = at0; j < at1; j++)
                    {
                        rvec_inc(f[j], buf[n]);
                    }
                }
            }
            else if (!bScrew)
            {
                /* fshift should always be defined if this function is
                 * called when bShiftForcesNeedPbc is true */
                assert(NULL != fshift);
                for (i = 0; i < ind->nsend[nzone]; i++)
                {
                    at0 = cgindex[index[i]];
                    at1 = cgindex[index[i]+1];
                    for (j = at0; j < at1; j++)
                    {
                        rvec_inc(f[j], buf[n]);
                        /* Add this force to the shift force */
                        rvec_inc(fshift[is], buf[n]);
                    }
                }
            }
            else
            {
                for (i = 0; i < ind->nsend[nzone]; i++)
                {
                    at0 = cgindex[index[i]];
                    at1 = cgindex[index[i]+1];
                    for (j = at0; j < at1; j++)
                    {
                        /* Rotate the force */
                        f[j][XX] += buf[n][XX];
                        f[j][YY] -= buf[n][YY];
                        f[j][ZZ] -= buf[n][ZZ];
                        /* Add this force to the shift force */
                        rvec_inc(fshift[is], buf[n]);
                    }
                }
            }
        }
    }
}
void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
{
    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
    int                   *index, *cgindex;
    gmx_domdec_comm_t     *comm;
    gmx_domdec_comm_dim_t *cd;
    gmx_domdec_ind_t      *ind;

    cgindex = dd->cgindex;

    buf = &comm->vbuf.v[0][0];

    nat_tot = dd->nat_home;
    for (d = 0; d < dd->ndim; d++)
    {
        for (p = 0; p < cd->np; p++)
        {
            for (i = 0; i < ind->nsend[nzone]; i++)
            {
                at0 = cgindex[index[i]];
                at1 = cgindex[index[i]+1];
                for (j = at0; j < at1; j++)
                {
                    buf[n] = v[j];
                }
            }

            rbuf = &comm->vbuf2.v[0][0];

            /* Send and receive the coordinates */
            dd_sendrecv_real(dd, d, dddirBackward,
                             buf, ind->nsend[nzone+1],
                             rbuf, ind->nrecv[nzone+1]);

            for (zone = 0; zone < nzone; zone++)
            {
                for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
                {
                    v[i] = rbuf[j];
                }
            }

            nat_tot += ind->nrecv[nzone+1];
        }
    }
}

void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
{
    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
    int                   *index, *cgindex;
    gmx_domdec_comm_t     *comm;
    gmx_domdec_comm_dim_t *cd;
    gmx_domdec_ind_t      *ind;

    cgindex = dd->cgindex;

    buf = &comm->vbuf.v[0][0];

    nzone   = comm->zones.n/2;
    nat_tot = dd->nat_tot;
    for (d = dd->ndim-1; d >= 0; d--)
    {
        for (p = cd->np-1; p >= 0; p--)
        {
            nat_tot -= ind->nrecv[nzone+1];

            sbuf = &comm->vbuf2.v[0][0];

            for (zone = 0; zone < nzone; zone++)
            {
                for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
                {
                    sbuf[j] = v[i];
                }
            }

            /* Communicate the forces */
            dd_sendrecv_real(dd, d, dddirForward,
                             sbuf, ind->nrecv[nzone+1],
                             buf, ind->nsend[nzone+1]);

            /* Add the received forces */
            for (i = 0; i < ind->nsend[nzone]; i++)
            {
                at0 = cgindex[index[i]];
                at1 = cgindex[index[i]+1];
                for (j = at0; j < at1; j++)
                {
                    v[j] += buf[n];
                }
            }
        }
    }
}
static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
{
    fprintf(fp, "zone d0 %d d1 %d d2 %d min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
            d, i, j,
            zone->min0, zone->max1,
            zone->mch0, zone->mch1,
            zone->p1_0, zone->p1_1);
}
#define DDZONECOMM_MAXZONE  5
#define DDZONECOMM_BUFSIZE  3
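/* Each gmx_ddzone_t is packed into DDZONECOMM_BUFSIZE = 3 rvecs for the
 * exchange below: the first rvec carries min0/max1/min1, the second
 * mch0/mch1 (third component unused), and the third p1_0/p1_1 (third
 * component unused). At most DDZONECOMM_MAXZONE zones are exchanged per call.
 */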
static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
                               int ddimind, int direction,
                               gmx_ddzone_t *buf_s, int n_s,
                               gmx_ddzone_t *buf_r, int n_r)
{
#define ZBS  DDZONECOMM_BUFSIZE
    rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
    rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];

    for (i = 0; i < n_s; i++)
    {
        vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
        vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
        vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
        vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
        vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
        vbuf_s[i*ZBS+1][2] = 0;
        vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
        vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
        vbuf_s[i*ZBS+2][2] = 0;
    }

    dd_sendrecv_rvec(dd, ddimind, direction,
                     vbuf_s, n_s*ZBS,
                     vbuf_r, n_r*ZBS);

    for (i = 0; i < n_r; i++)
    {
        buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
        buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
        buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
        buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
        buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
        buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
        buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
    }
}
static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
                          rvec cell_ns_x0, rvec cell_ns_x1)
{
    int                d, d1, dim, pos, buf_size, i, j, p, npulse, npulse_min;
    gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
    gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
    gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
    rvec               extr_s[2], extr_r[2];
    real               dist_d, c = 0, det;
    gmx_domdec_comm_t *comm;

    for (d = 1; d < dd->ndim; d++)
    {
        zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
        zp->min0 = cell_ns_x0[dim];
        zp->max1 = cell_ns_x1[dim];
        zp->min1 = cell_ns_x1[dim];
        zp->mch0 = cell_ns_x0[dim];
        zp->mch1 = cell_ns_x1[dim];
        zp->p1_0 = cell_ns_x0[dim];
        zp->p1_1 = cell_ns_x1[dim];
    }

    for (d = dd->ndim-2; d >= 0; d--)
    {
        bPBC = (dim < ddbox->npbcdim);

        /* Use an rvec to store two reals */
        extr_s[d][0] = comm->cell_f0[d+1];
        extr_s[d][1] = comm->cell_f1[d+1];
        extr_s[d][2] = comm->cell_f1[d+1];

        /* Store the extremes in the backward sending buffer,
         * so they get updated separately from the forward communication.
         */
        for (d1 = d; d1 < dd->ndim-1; d1++)
        {
            /* We invert the order to be able to use the same loop for buf_e */
            buf_s[pos].min0 = extr_s[d1][1];
            buf_s[pos].max1 = extr_s[d1][0];
            buf_s[pos].min1 = extr_s[d1][2];
        }

        /* Store the cell corner of the dimension we communicate along */
        buf_s[pos].p1_0 = comm->cell_x0[dim];

        buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];

        if (dd->ndim == 3 && d == 0)
        {
            buf_s[pos] = comm->zone_d2[0][1];
            buf_s[pos] = comm->zone_d1[0];
        }

        /* We only need to communicate the extremes
         * in the forward direction
         */
        npulse = comm->cd[d].np;
        /* Take the minimum to avoid double communication */
        npulse_min = std::min(npulse, dd->nc[dim]-1-npulse);

        /* Without PBC we should really not communicate over
         * the boundaries, but implementing that complicates
         * the communication setup and therefore we simply
         * do all communication, but ignore some data.
         */
        for (p = 0; p < npulse_min; p++)
        {
            /* Communicate the extremes forward */
            bUse = (bPBC || dd->ci[dim] > 0);

            dd_sendrecv_rvec(dd, d, dddirForward,
                             extr_s+d, dd->ndim-d-1,
                             extr_r+d, dd->ndim-d-1);

            for (d1 = d; d1 < dd->ndim-1; d1++)
            {
                extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
                extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
                extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
            }
        }

        for (p = 0; p < npulse; p++)
        {
            /* Communicate all the zone information backward */
            bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);

            dd_sendrecv_ddzone(dd, d, dddirBackward,
                               buf_s, buf_size,
                               buf_r, buf_size);

            for (d1 = d+1; d1 < dd->ndim; d1++)
            {
                /* Determine the decrease of maximum required
                 * communication height along d1 due to the distance along d,
                 * this avoids a lot of useless atom communication.
                 */
                dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;

                if (ddbox->tric_dir[dim])
                {
                    /* c is the off-diagonal coupling between the cell planes
                     * along directions d and d1.
                     */
                    c = ddbox->v[dim][dd->dim[d1]][dim];
                }

                det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;

                dh[d1] = comm->cutoff - (c*dist_d + std::sqrt(det))/(1 + c*c);

                /* A negative value signals out of range */
            }

            /* Accumulate the extremes over all pulses */
            for (i = 0; i < buf_size; i++)
            {
                buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
                buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
                buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);

                if (dd->ndim == 3 && d == 0 && i == buf_size - 1)

                if (bUse && dh[d1] >= 0)
                {
                    buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
                    buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
                }
            }

            /* Copy the received buffer to the send buffer,
             * to pass the data through with the next pulse.
             */

            if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
                (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
            {
                /* Store the extremes */

                for (d1 = d; d1 < dd->ndim-1; d1++)
                {
                    extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
                    extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
                    extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
                }

                if (d == 1 || (d == 0 && dd->ndim == 3))
                {
                    for (i = d; i < 2; i++)
                    {
                        comm->zone_d2[1-d][i] = buf_e[pos];
                    }
                }
                else
                {
                    comm->zone_d1[1] = buf_e[pos];
                }
            }
        }
    }

    for (i = 0; i < 2; i++)
    {
        print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
        cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
        cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
    }

    for (i = 0; i < 2; i++)
    {
        for (j = 0; j < 2; j++)
        {
            print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
            cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
            cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
        }
    }

    for (d = 1; d < dd->ndim; d++)
    {
        comm->cell_f_max0[d] = extr_s[d-1][0];
        comm->cell_f_min1[d] = extr_s[d-1][1];
        fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
                d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
    }
}
static void dd_collect_cg(gmx_domdec_t *dd,
                          t_state      *state_local)
{
    gmx_domdec_master_t *ma = NULL;
    int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;

    if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
    {
        /* The master has the correct distribution */
        return;
    }

    if (state_local->ddp_count == dd->ddp_count)
    {
        /* The local state and DD are in sync, use the DD indices */
        ncg_home = dd->ncg_home;
        nat_home = dd->nat_home;
    }
    else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
    {
        /* The DD is out of sync with the local state, but we have stored
         * the cg indices with the local state, so we can use those.
         */
        cgs_gl = &dd->comm->cgs_gl;

        ncg_home = state_local->ncg_gl;
        cg       = state_local->cg_gl;
        for (i = 0; i < ncg_home; i++)
        {
            nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
        }
    }
    else
    {
        gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
    }

    /* Collect the charge group and atom counts on the master */
    dd_gather(dd, 2*sizeof(int), buf2, ibuf);

    for (i = 0; i < dd->nnodes; i++)
    {
        ma->ncg[i]     = ma->ibuf[2*i];
        ma->nat[i]     = ma->ibuf[2*i+1];
        ma->index[i+1] = ma->index[i] + ma->ncg[i];
    }
    /* Make byte counts and indices */
    for (i = 0; i < dd->nnodes; i++)
    {
        ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
        ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
    }
    fprintf(debug, "Initial charge group distribution: ");
    for (i = 0; i < dd->nnodes; i++)
    {
        fprintf(debug, " %d", ma->ncg[i]);
    }
    fprintf(debug, "\n");

    /* Collect the charge group indices on the master */
    dd_gatherv(dd,
               ncg_home*sizeof(int), cg,
               DDMASTER(dd) ? ma->ibuf : NULL,
               DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
               DDMASTER(dd) ? ma->cg : NULL);

    dd->comm->master_cg_ddp_count = state_local->ddp_count;
}

static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
                                    rvec *lv, rvec *v)
{
    gmx_domdec_master_t *ma;
    int                  n, i, c, a, nalloc = 0;

    if (!DDMASTER(dd))
    {
        MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
                 dd->rank, dd->mpi_comm_all);
    }
    else
    {
        /* Copy the master coordinates to the global array */
        cgs_gl = &dd->comm->cgs_gl;

        n = DDMASTERRANK(dd);
        for (i = ma->index[n]; i < ma->index[n+1]; i++)
        {
            for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
            {
                copy_rvec(lv[a++], v[c]);
            }
        }

        for (n = 0; n < dd->nnodes; n++)
        {
            if (ma->nat[n] > nalloc)
            {
                nalloc = over_alloc_dd(ma->nat[n]);
                srenew(buf, nalloc);
            }
            MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
                     n, dd->mpi_comm_all, MPI_STATUS_IGNORE);

            for (i = ma->index[n]; i < ma->index[n+1]; i++)
            {
                for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
                {
                    copy_rvec(buf[a++], v[c]);
                }
            }
        }
    }
}

static void get_commbuffer_counts(gmx_domdec_t *dd,
                                  int **counts, int **disps)
{
    gmx_domdec_master_t *ma;

    /* Make the rvec count and displacement arrays */
    *disps = ma->ibuf + dd->nnodes;
    for (n = 0; n < dd->nnodes; n++)
    {
        (*counts)[n] = ma->nat[n]*sizeof(rvec);
        (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
    }
}

static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
                                   rvec *lv, rvec *v)
{
    gmx_domdec_master_t *ma;
    int                 *rcounts = NULL, *disps = NULL;

    get_commbuffer_counts(dd, &rcounts, &disps);

    dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);

    cgs_gl = &dd->comm->cgs_gl;

    for (n = 0; n < dd->nnodes; n++)
    {
        for (i = ma->index[n]; i < ma->index[n+1]; i++)
        {
            for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
            {
                copy_rvec(buf[a++], v[c]);
            }
        }
    }
}

void dd_collect_vec(gmx_domdec_t *dd,
                    t_state *state_local, rvec *lv, rvec *v)
{
    dd_collect_cg(dd, state_local);

    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
    {
        dd_collect_vec_sendrecv(dd, lv, v);
    }
    else
    {
        dd_collect_vec_gatherv(dd, lv, v);
    }
}
void dd_collect_state(gmx_domdec_t *dd,
                      t_state *state_local, t_state *state)
{
    nh = state->nhchainlength;

    for (i = 0; i < efptNR; i++)
    {
        state->lambda[i] = state_local->lambda[i];
    }
    state->fep_state = state_local->fep_state;
    state->veta      = state_local->veta;
    state->vol0      = state_local->vol0;
    copy_mat(state_local->box, state->box);
    copy_mat(state_local->boxv, state->boxv);
    copy_mat(state_local->svir_prev, state->svir_prev);
    copy_mat(state_local->fvir_prev, state->fvir_prev);
    copy_mat(state_local->pres_prev, state->pres_prev);

    for (i = 0; i < state_local->ngtc; i++)
    {
        for (j = 0; j < nh; j++)
        {
            state->nosehoover_xi[i*nh+j]  = state_local->nosehoover_xi[i*nh+j];
            state->nosehoover_vxi[i*nh+j] = state_local->nosehoover_vxi[i*nh+j];
        }
        state->therm_integral[i] = state_local->therm_integral[i];
    }
    for (i = 0; i < state_local->nnhpres; i++)
    {
        for (j = 0; j < nh; j++)
        {
            state->nhpres_xi[i*nh+j]  = state_local->nhpres_xi[i*nh+j];
            state->nhpres_vxi[i*nh+j] = state_local->nhpres_vxi[i*nh+j];
        }
    }

    for (est = 0; est < estNR; est++)
    {
        if (EST_DISTR(est) && (state_local->flags & (1<<est)))
        {
            switch (est)
            {
                case estX:
                    dd_collect_vec(dd, state_local, state_local->x, state->x);
                    break;
                case estV:
                    dd_collect_vec(dd, state_local, state_local->v, state->v);
                    break;
                case estSDX:
                    dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X);
                    break;
                case estCGP:
                    dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
                    break;
                case estDISRE_INITF:
                case estDISRE_RM3TAV:
                case estORIRE_INITF:
                    break;
                default:
                    gmx_incons("Unknown state entry encountered in dd_collect_state");
            }
        }
    }
}
static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
{
    fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n",
            state->nalloc, nalloc, over_alloc_dd(nalloc));

    state->nalloc = over_alloc_dd(nalloc);

    for (est = 0; est < estNR; est++)
    {
        if (EST_DISTR(est) && (state->flags & (1<<est)))
        {
            /* We need to allocate one element extra, since we might use
             * (unaligned) 4-wide SIMD loads to access rvec entries.
             */
            switch (est)
            {
                case estX:
                    srenew(state->x, state->nalloc + 1);
                    break;
                case estV:
                    srenew(state->v, state->nalloc + 1);
                    break;
                case estSDX:
                    srenew(state->sd_X, state->nalloc + 1);
                    break;
                case estCGP:
                    srenew(state->cg_p, state->nalloc + 1);
                    break;
                case estDISRE_INITF:
                case estDISRE_RM3TAV:
                case estORIRE_INITF:
                    /* No reallocation required */
                    break;
                default:
                    gmx_incons("Unknown state entry encountered in dd_realloc_state");
            }
        }
    }

    srenew(*f, state->nalloc);
}

static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
                               int nalloc)
{
    if (nalloc > fr->cg_nalloc)
    {
        fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n",
                fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
        fr->cg_nalloc = over_alloc_dd(nalloc);
        srenew(fr->cginfo, fr->cg_nalloc);
        if (fr->cutoff_scheme == ecutsGROUP)
        {
            srenew(fr->cg_cm, fr->cg_nalloc);
        }
    }
    if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
    {
        /* We don't use charge groups, we use x in state to set up
         * the atom communication.
         */
        dd_realloc_state(state, f, nalloc);
    }
}
static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
                                       rvec *v, rvec *lv)
{
    gmx_domdec_master_t *ma;
    int                  n, i, c, a, nalloc = 0;

    if (DDMASTER(dd))
    {
        for (n = 0; n < dd->nnodes; n++)
        {
            if (ma->nat[n] > nalloc)
            {
                nalloc = over_alloc_dd(ma->nat[n]);
                srenew(buf, nalloc);
            }
            /* Use lv as a temporary buffer */
            for (i = ma->index[n]; i < ma->index[n+1]; i++)
            {
                for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
                {
                    copy_rvec(v[c], buf[a++]);
                }
            }
            if (a != ma->nat[n])
            {
                gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
                          a, ma->nat[n]);
            }

            MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
                     DDRANK(dd, n), n, dd->mpi_comm_all);
        }

        n = DDMASTERRANK(dd);
        for (i = ma->index[n]; i < ma->index[n+1]; i++)
        {
            for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
            {
                copy_rvec(v[c], lv[a++]);
            }
        }
    }
    else
    {
        MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
                 MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
    }
}

static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
                                       rvec *v, rvec *lv)
{
    gmx_domdec_master_t *ma;
    int                 *scounts = NULL, *disps = NULL;

    get_commbuffer_counts(dd, &scounts, &disps);

    for (n = 0; n < dd->nnodes; n++)
    {
        for (i = ma->index[n]; i < ma->index[n+1]; i++)
        {
            for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
            {
                copy_rvec(v[c], buf[a++]);
            }
        }
    }

    dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
}

static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
{
    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
    {
        dd_distribute_vec_sendrecv(dd, cgs, v, lv);
    }
    else
    {
        dd_distribute_vec_scatterv(dd, cgs, v, lv);
    }
}

static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
{
    dd_bcast(dd, sizeof(int), &dfhist->bEquil);
    dd_bcast(dd, sizeof(int), &dfhist->nlambda);
    dd_bcast(dd, sizeof(real), &dfhist->wl_delta);

    if (dfhist->nlambda > 0)
    {
        int nlam = dfhist->nlambda;
        dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
        dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);

        for (i = 0; i < nlam; i++)
        {
            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
            dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
            dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
        }
    }
}
static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
                                t_state *state, t_state *state_local,
                                rvec **f)
{
    nh = state->nhchainlength;

    for (i = 0; i < efptNR; i++)
    {
        state_local->lambda[i] = state->lambda[i];
    }
    state_local->fep_state = state->fep_state;
    state_local->veta      = state->veta;
    state_local->vol0      = state->vol0;
    copy_mat(state->box, state_local->box);
    copy_mat(state->box_rel, state_local->box_rel);
    copy_mat(state->boxv, state_local->boxv);
    copy_mat(state->svir_prev, state_local->svir_prev);
    copy_mat(state->fvir_prev, state_local->fvir_prev);
    copy_df_history(&state_local->dfhist, &state->dfhist);
    for (i = 0; i < state_local->ngtc; i++)
    {
        for (j = 0; j < nh; j++)
        {
            state_local->nosehoover_xi[i*nh+j]  = state->nosehoover_xi[i*nh+j];
            state_local->nosehoover_vxi[i*nh+j] = state->nosehoover_vxi[i*nh+j];
        }
        state_local->therm_integral[i] = state->therm_integral[i];
    }
    for (i = 0; i < state_local->nnhpres; i++)
    {
        for (j = 0; j < nh; j++)
        {
            state_local->nhpres_xi[i*nh+j]  = state->nhpres_xi[i*nh+j];
            state_local->nhpres_vxi[i*nh+j] = state->nhpres_vxi[i*nh+j];
        }
    }

    dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
    dd_bcast(dd, sizeof(int), &state_local->fep_state);
    dd_bcast(dd, sizeof(real), &state_local->veta);
    dd_bcast(dd, sizeof(real), &state_local->vol0);
    dd_bcast(dd, sizeof(state_local->box), state_local->box);
    dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
    dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
    dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
    dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
    dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);

    /* communicate df_history -- required for restarting from checkpoint */
    dd_distribute_dfhist(dd, &state_local->dfhist);

    if (dd->nat_home > state_local->nalloc)
    {
        dd_realloc_state(state_local, f, dd->nat_home);
    }
    for (i = 0; i < estNR; i++)
    {
        if (EST_DISTR(i) && (state_local->flags & (1<<i)))
        {
            switch (i)
            {
                case estX:
                    dd_distribute_vec(dd, cgs, state->x, state_local->x);
                    break;
                case estV:
                    dd_distribute_vec(dd, cgs, state->v, state_local->v);
                    break;
                case estSDX:
                    dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
                    break;
                case estCGP:
                    dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
                    break;
                case estDISRE_INITF:
                case estDISRE_RM3TAV:
                case estORIRE_INITF:
                    /* Not implemented yet */
                    break;
                default:
                    gmx_incons("Unknown state entry encountered in dd_distribute_state");
            }
        }
    }
}
static char dim2char(int dim)
{
    switch (dim)
    {
        case XX: c = 'X'; break;
        case YY: c = 'Y'; break;
        case ZZ: c = 'Z'; break;
        default: gmx_fatal(FARGS, "Unknown dim %d", dim);
    }

    return c;
}
static void write_dd_grid_pdb(const char *fn, gmx_int64_t step,
                              gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
{
    rvec grid_s[2], *grid_r = NULL, cx, r;
    char fname[STRLEN], buf[22];
    int  a, i, d, z, y, x;

    copy_rvec(dd->comm->cell_x0, grid_s[0]);
    copy_rvec(dd->comm->cell_x1, grid_s[1]);

    snew(grid_r, 2*dd->nnodes);

    dd_gather(dd, 2*sizeof(rvec), grid_s, DDMASTER(dd) ? grid_r : NULL);

    for (d = 0; d < DIM; d++)
    {
        for (i = 0; i < DIM; i++)
        {
            if (d < ddbox->npbcdim && dd->nc[d] > 1)
            {
                tric[d][i] = box[i][d]/box[i][i];
            }
        }
    }
    sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
    out = gmx_fio_fopen(fname, "w");
    gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);

    for (i = 0; i < dd->nnodes; i++)
    {
        vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
        for (d = 0; d < DIM; d++)
        {
            vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
        }
        for (z = 0; z < 2; z++)
        {
            for (y = 0; y < 2; y++)
            {
                for (x = 0; x < 2; x++)
                {
                    cx[XX] = grid_r[i*2+x][XX];
                    cx[YY] = grid_r[i*2+y][YY];
                    cx[ZZ] = grid_r[i*2+z][ZZ];
                    gmx_fprintf_pdb_atomline(out, epdbATOM, a++, "CA", ' ', "GLY", ' ', i+1, ' ',
                                             10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol, "");
                }
            }
        }
        for (d = 0; d < DIM; d++)
        {
            for (x = 0; x < 4; x++)
            {
                switch (d)
                {
                    case 0: y = 1 + i*8 + 2*x; break;
                    case 1: y = 1 + i*8 + 2*x - (x % 2); break;
                    case 2: y = 1 + i*8 + x; break;
                }
                fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
            }
        }
    }
    gmx_fio_fclose(out);
}

void write_dd_pdb(const char *fn, gmx_int64_t step, const char *title,
                  gmx_mtop_t *mtop, t_commrec *cr,
                  int natoms, rvec x[], matrix box)
{
    char  fname[STRLEN], buf[22];
    int   i, ii, resnr, c;
    char *atomname, *resname;

    natoms = dd->comm->nat[ddnatVSITE];

    sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);

    out = gmx_fio_fopen(fname, "w");

    fprintf(out, "TITLE     %s\n", title);
    gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
    for (i = 0; i < natoms; i++)
    {
        ii = dd->gatindex[i];
        gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
        if (i < dd->comm->nat[ddnatZONE])
        {
            while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
            {
                c++;
            }
        }
        else if (i < dd->comm->nat[ddnatVSITE])
        {
            b = dd->comm->zones.n;
        }
        else
        {
            b = dd->comm->zones.n + 1;
        }
        gmx_fprintf_pdb_atomline(out, epdbATOM, ii+1, atomname, ' ', resname, ' ', resnr, ' ',
                                 10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b, "");
    }
    fprintf(out, "TER\n");

    gmx_fio_fclose(out);
}
real dd_cutoff_multibody(const gmx_domdec_t *dd)
{
    gmx_domdec_comm_t *comm;

    if (comm->bInterCGBondeds)
    {
        if (comm->cutoff_mbody > 0)
        {
            r = comm->cutoff_mbody;
        }
        else
        {
            /* cutoff_mbody=0 means we do not have DLB */
            r = comm->cellsize_min[dd->dim[0]];
            for (di = 1; di < dd->ndim; di++)
            {
                r = std::min(r, comm->cellsize_min[dd->dim[di]]);
            }
            if (comm->bBondComm)
            {
                r = std::max(r, comm->cutoff_mbody);
            }
            else
            {
                r = std::min(r, comm->cutoff);
            }
        }
    }

    return r;
}

real dd_cutoff_twobody(const gmx_domdec_t *dd)
{
    r_mb = dd_cutoff_multibody(dd);

    return std::max(dd->comm->cutoff, r_mb);
}
static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
{
    nc   = dd->nc[dd->comm->cartpmedim];
    ntot = dd->comm->ntot[dd->comm->cartpmedim];
    copy_ivec(coord, coord_pme);
    coord_pme[dd->comm->cartpmedim] =
        nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
}
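/* Worked example: with nc = 4 PP cells and ntot = 6 Cartesian slots along
 * the PME dimension (i.e. 2 PME-only slots), the formula above maps PP
 * coordinates 0 and 1 to PME coordinate 4 + (coord*2 + 1)/4 = 4, and PP
 * coordinates 2 and 3 to PME coordinate 5.
 */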
static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
{
    /* Here we assign a PME node to communicate with this DD node
     * by assuming that the major index of both is x.
     * We add cr->npmenodes/2 to obtain an even distribution.
     */
    return (ddindex*npme + npme/2)/ndd;
}
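/* Worked example: with ndd = 6 PP ranks and npme = 2 PME ranks,
 * (ddindex*2 + 1)/6 assigns DD indices 0-2 to PME rank 0 and
 * DD indices 3-5 to PME rank 1.
 */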
static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
{
    return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex);
}

static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex)
{
    return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex);
}

static int *dd_pmenodes(t_commrec *cr)
{
    snew(pmenodes, cr->npmenodes);
    for (i = 0; i < cr->dd->nnodes; i++)
    {
        p0 = cr_ddindex2pmeindex(cr, i);
        p1 = cr_ddindex2pmeindex(cr, i+1);
        if (i+1 == cr->dd->nnodes || p1 > p0)
        {
            fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n);
            pmenodes[n] = i + 1 + n;
        }
    }

    return pmenodes;
}
static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
{
    /* Older (unused) variant, kept commented out for reference:

       if (dd->comm->bCartesian) {
       gmx_ddindex2xyz(dd->nc,ddindex,coords);
       dd_coords2pmecoords(dd,coords,coords_pme);
       copy_ivec(dd->ntot,nc);
       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];

       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
       } else {
       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
       }
     */
    slab = ddindex2pmeindex(dd, dd_index(dd->nc, coords));

    return slab;
}
static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
{
    gmx_domdec_comm_t *comm;
    int                ddindex, nodeid = -1;

    comm = cr->dd->comm;

    if (comm->bCartesianPP_PME)
    {
        MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
    }
    else
    {
        ddindex = dd_index(cr->dd->nc, coords);
        if (comm->bCartesianPP)
        {
            nodeid = comm->ddindex2simnodeid[ddindex];
        }
        else
        {
            nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
        }
    }

    return nodeid;
}

static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid)
{
    gmx_domdec_comm_t *comm;

    /* This assumes a uniform x domain decomposition grid cell size */
    if (comm->bCartesianPP_PME)
    {
        ivec coord, coord_pme;
        MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
        if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
        {
            /* This is a PP node */
            dd_cart_coord2pmecoord(dd, coord, coord_pme);
            MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
        }
    }
    else if (comm->bCartesianPP)
    {
        if (sim_nodeid < dd->nnodes)
        {
            pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
        }
    }
    else
    {
        /* This assumes DD cells with identical x coordinates
         * are numbered sequentially.
         */
        if (dd->comm->pmenodes == NULL)
        {
            if (sim_nodeid < dd->nnodes)
            {
                /* The DD index equals the nodeid */
                pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
            }
        }
        else
        {
            while (sim_nodeid > dd->comm->pmenodes[i])
            {
                i++;
            }
            if (sim_nodeid < dd->comm->pmenodes[i])
            {
                pmenode = dd->comm->pmenodes[i];
            }
        }
    }

    return pmenode;
}

void get_pme_nnodes(const gmx_domdec_t *dd,
                    int *npmenodes_x, int *npmenodes_y)
{
    *npmenodes_x = dd->comm->npmenodes_x;
    *npmenodes_y = dd->comm->npmenodes_y;
}

void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
                     int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
{
    ivec coord, coord_pme;

    snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);

    for (x = 0; x < dd->nc[XX]; x++)
    {
        for (y = 0; y < dd->nc[YY]; y++)
        {
            for (z = 0; z < dd->nc[ZZ]; z++)
            {
                if (dd->comm->bCartesianPP_PME)
                {
                    dd_cart_coord2pmecoord(dd, coord, coord_pme);
                    if (dd->ci[XX] == coord_pme[XX] &&
                        dd->ci[YY] == coord_pme[YY] &&
                        dd->ci[ZZ] == coord_pme[ZZ])
                    {
                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
                    }
                }
                else
                {
                    /* The slab corresponds to the nodeid in the PME group */
                    if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
                    {
                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
                    }
                }
            }
        }
    }

    /* The last PP-only node is the peer node */
    *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];

    fprintf(debug, "Receive coordinates from PP ranks:");
    for (x = 0; x < *nmy_ddnodes; x++)
    {
        fprintf(debug, " %d", (*my_ddnodes)[x]);
    }
    fprintf(debug, "\n");
}
static gmx_bool receive_vir_ener(t_commrec *cr)
{
    gmx_domdec_comm_t *comm;

    if (cr->npmenodes < cr->dd->nnodes)
    {
        comm = cr->dd->comm;
        if (comm->bCartesianPP_PME)
        {
            pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
            MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
            coords[comm->cartpmedim]++;
            if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
            {
                MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
                if (dd_simnode2pmenode(cr, rank) == pmenode)
                {
                    /* This is not the last PP node for pmenode */
                }
            }
        }
        else
        {
            pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
            if (cr->sim_nodeid+1 < cr->nnodes &&
                dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode)
            {
                /* This is not the last PP node for pmenode */
            }
        }
    }
}

static void set_zones_ncg_home(gmx_domdec_t *dd)
{
    gmx_domdec_zones_t *zones;

    zones = &dd->comm->zones;

    zones->cg_range[0] = 0;
    for (i = 1; i < zones->n+1; i++)
    {
        zones->cg_range[i] = dd->ncg_home;
    }
    /* zone_ncg1[0] should always be equal to ncg_home */
    dd->comm->zone_ncg1[0] = dd->ncg_home;
}
static void rebuild_cgindex(gmx_domdec_t *dd,
                            const int *gcgs_index, t_state *state)
{
    int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;

    dd_cg_gl = dd->index_gl;
    cgindex  = dd->cgindex;

    for (i = 0; i < state->ncg_gl; i++)
    {
        dd_cg_gl[i] = cg_gl;
        nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
    }

    dd->ncg_home = state->ncg_gl;

    set_zones_ncg_home(dd);
}

static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
{
    while (cg >= cginfo_mb->cg_end)
    {
        cginfo_mb++;
    }

    return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
}

static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
                          t_forcerec *fr, char *bLocalCG)
{
    cginfo_mb_t *cginfo_mb;

    cginfo_mb = fr->cginfo_mb;
    cginfo    = fr->cginfo;

    for (cg = cg0; cg < cg1; cg++)
    {
        cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
    }

    if (bLocalCG != NULL)
    {
        for (cg = cg0; cg < cg1; cg++)
        {
            bLocalCG[index_gl[cg]] = TRUE;
        }
    }
}
static void make_dd_indices(gmx_domdec_t *dd,
                            const int *gcgs_index, int cg_start)
{
    int  nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
    int *zone2cg, *zone_ncg1, *index_gl, *gatindex;

    if (dd->nat_tot > dd->gatindex_nalloc)
    {
        dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
        srenew(dd->gatindex, dd->gatindex_nalloc);
    }

    nzone     = dd->comm->zones.n;
    zone2cg   = dd->comm->zones.cg_range;
    zone_ncg1 = dd->comm->zone_ncg1;
    index_gl  = dd->index_gl;
    gatindex  = dd->gatindex;
    bCGs      = dd->comm->bCGs;

    if (zone2cg[1] != dd->ncg_home)
    {
        gmx_incons("dd->ncg_zone is not up to date");
    }

    /* Make the local to global and global to local atom index */
    a = dd->cgindex[cg_start];
    for (zone = 0; zone < nzone; zone++)
    {
        cg0    = zone2cg[zone];
        cg1    = zone2cg[zone+1];
        cg1_p1 = cg0 + zone_ncg1[zone];

        for (cg = cg0; cg < cg1; cg++)
        {
            /* Signal that this cg is from more than one pulse away */

            cg_gl = index_gl[cg];
            if (bCGs)
            {
                for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
                {
                    ga2la_set(dd->ga2la, a_gl, a, zone1);
                }
            }
            else
            {
                gatindex[a] = cg_gl;
                ga2la_set(dd->ga2la, cg_gl, a, zone1);
            }
        }
    }
}
static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
                          const char *where)
{
    if (bLocalCG == NULL)
    {
        return nerr;
    }

    for (i = 0; i < dd->ncg_tot; i++)
    {
        if (!bLocalCG[dd->index_gl[i]])
        {
            fprintf(stderr,
                    "DD rank %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",
                    dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
            nerr++;
        }
    }

    for (i = 0; i < ncg_sys; i++)
    {
        if (bLocalCG[i])
        {
            ngl++;
        }
    }
    if (ngl != dd->ncg_tot)
    {
        fprintf(stderr, "DD rank %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",
                dd->rank, where, ngl, dd->ncg_tot);
        nerr++;
    }

    return nerr;
}

static void check_index_consistency(gmx_domdec_t *dd,
                                    int natoms_sys, int ncg_sys,
                                    const char *where)
{
    int nerr, ngl, i, a, cell;

    if (dd->comm->DD_debug > 1)
    {
        snew(have, natoms_sys);
        for (a = 0; a < dd->nat_tot; a++)
        {
            if (have[dd->gatindex[a]] > 0)
            {
                fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n",
                        dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
            }
            else
            {
                have[dd->gatindex[a]] = a + 1;
            }
        }
    }

    snew(have, dd->nat_tot);

    for (i = 0; i < natoms_sys; i++)
    {
        if (ga2la_get(dd->ga2la, i, &a, &cell))
        {
            if (a >= dd->nat_tot)
            {
                fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",
                        dd->rank, i+1, a+1, dd->nat_tot);
                nerr++;
            }
            else if (dd->gatindex[a] != i)
            {
                fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n",
                        dd->rank, i+1, a+1, dd->gatindex[a]+1);
                nerr++;
            }
        }
    }
    if (ngl != dd->nat_tot)
    {
        fprintf(stderr,
                "DD rank %d, %s: %d global atom indices, %d local atoms\n",
                dd->rank, where, ngl, dd->nat_tot);
        nerr++;
    }
    for (a = 0; a < dd->nat_tot; a++)
    {
        fprintf(stderr,
                "DD rank %d, %s: local atom %d, global %d has no global index\n",
                dd->rank, where, a+1, dd->gatindex[a]+1);
    }

    nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);

    if (nerr > 0)
    {
        gmx_fatal(FARGS, "DD rank %d, %s: %d atom/cg index inconsistencies",
                  dd->rank, where, nerr);
    }
}

static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
{
    if (a_start == 0)
    {
        /* Clear the whole list without searching */
        ga2la_clear(dd->ga2la);
    }
    else
    {
        for (i = a_start; i < dd->nat_tot; i++)
        {
            ga2la_del(dd->ga2la, dd->gatindex[i]);
        }
    }

    bLocalCG = dd->comm->bLocalCG;
    for (i = cg_start; i < dd->ncg_tot; i++)
    {
        bLocalCG[dd->index_gl[i]] = FALSE;
    }

    dd_clear_local_vsite_indices(dd);

    if (dd->constraints)
    {
        dd_clear_local_constraint_indices(dd);
    }
}
/* This function should be used for moving the domain boundaries during DLB,
 * for obtaining the minimum cell size. It checks the initially set limit
 * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
 * and, possibly, a longer cut-off limit set for PME load balancing.
 */
static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
{
    cellsize_min = comm->cellsize_min[dim];

    if (!comm->bVacDLBNoLimit)
    {
        /* The cut-off might have changed, e.g. by PME load balancing,
         * from the value used to set comm->cellsize_min, so check it.
         */
        cellsize_min = std::max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);

        if (comm->bPMELoadBalDLBLimits)
        {
            /* Check for the cut-off limit set by the PME load balancing */
            cellsize_min = std::max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
        }
    }

    return cellsize_min;
}

static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
                            int dim_ind)
{
    real grid_jump_limit;

    /* The distance between the boundaries of cells at distance
     * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
     * and by the fact that cells should not be shifted by more than
     * half their size, such that cg's only shift by one cell
     * at redecomposition.
     */
    grid_jump_limit = comm->cellsize_limit;
    if (!comm->bVacDLBNoLimit)
    {
        if (comm->bPMELoadBalDLBLimits)
        {
            cutoff = std::max(cutoff, comm->PMELoadBal_max_cutoff);
        }
        grid_jump_limit = std::max(grid_jump_limit,
                                   cutoff/comm->cd[dim_ind].np);
    }

    return grid_jump_limit;
}
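/* Numerical example: with comm->cellsize_limit = 0.9 nm, an effective
 * cut-off of 1.2 nm and a single pulse (np = 1) along this dimension,
 * the function returns max(0.9, 1.2/1) = 1.2 nm.
 */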
static gmx_bool check_grid_jump(gmx_int64_t step,
                                gmx_domdec_t *dd, real cutoff,
                                gmx_ddbox_t *ddbox, gmx_bool bFatal)
{
    gmx_domdec_comm_t *comm;

    for (d = 1; d < dd->ndim; d++)
    {
        limit = grid_jump_limit(comm, cutoff, d);
        bfac  = ddbox->box_size[dim];
        if (ddbox->tric_dir[dim])
        {
            bfac *= ddbox->skew_fac[dim];
        }
        if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
            (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
        {
            /* This error should never be triggered under normal
             * circumstances, but you never know ...
             */
            gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
                      gmx_step_str(step, buf),
                      dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
        }
    }
}
static int dd_load_count(gmx_domdec_comm_t *comm)
{
    return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
}

static float dd_force_load(gmx_domdec_comm_t *comm)
{
    if (comm->eFlop > 1)
    {
        load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
    }

    load = comm->cycl[ddCyclF];
    if (comm->cycl_n[ddCyclF] > 1)
    {
        /* Subtract the maximum of the last n cycle counts
         * to get rid of possible high counts due to other sources,
         * for instance system activity, that would otherwise
         * affect the dynamic load balancing.
         */
        load -= comm->cycl_max[ddCyclF];
    }

    if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
    {
        float gpu_wait, gpu_wait_sum;

        gpu_wait = comm->cycl[ddCyclWaitGPU];
        if (comm->cycl_n[ddCyclF] > 1)
        {
            /* We should remove the WaitGPU time of the same MD step
             * as the one with the maximum F time, since the F time
             * and the wait time are not independent.
             * Furthermore, the step for the max F time should be chosen
             * the same on all ranks that share the same GPU.
             * But to keep the code simple, we remove the average instead.
             * The main reason for artificially long times at some steps
             * is spurious CPU activity or MPI time, so we don't expect
             * that changes in the GPU wait time matter a lot here.
             */
            gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
        }
        /* Sum the wait times over the ranks that share the same GPU */
        MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
                      comm->mpi_comm_gpu_shared);
        /* Replace the wait time by the average over the ranks */
        load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
    }

    return load;
}
static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
{
    gmx_domdec_comm_t *comm;

    snew(*dim_f, dd->nc[dim]+1);
    for (i = 1; i < dd->nc[dim]; i++)
    {
        if (comm->slb_frac[dim])
        {
            (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
        }
        else
        {
            (*dim_f)[i] = (real)i/(real)dd->nc[dim];
        }
    }
    (*dim_f)[dd->nc[dim]] = 1;
}

static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
{
    int pmeindex, slab, nso, i;

    if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
    {
        ddpme->dim = YY;
    }
    else
    {
        ddpme->dim = dimind;
    }
    ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);

    ddpme->nslab = (ddpme->dim == 0 ?
                    dd->comm->npmenodes_x :
                    dd->comm->npmenodes_y);

    if (ddpme->nslab <= 1)
    {
        return;
    }

    nso = dd->comm->npmenodes/ddpme->nslab;
    /* Determine for each PME slab the PP location range for dimension dim */
    snew(ddpme->pp_min, ddpme->nslab);
    snew(ddpme->pp_max, ddpme->nslab);
    for (slab = 0; slab < ddpme->nslab; slab++)
    {
        ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
        ddpme->pp_max[slab] = 0;
    }
    for (i = 0; i < dd->nnodes; i++)
    {
        ddindex2xyz(dd->nc, i, xyz);
        /* For y only use our y/z slab.
         * This assumes that the PME x grid size matches the DD grid size.
         */
        if (dimind == 0 || xyz[XX] == dd->ci[XX])
        {
            pmeindex = ddindex2pmeindex(dd, i);
            if (dimind == 0)
            {
                slab = pmeindex/nso;
            }
            else
            {
                slab = pmeindex % ddpme->nslab;
            }
            ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
            ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
        }
    }

    set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
}
2672 int dd_pme_maxshift_x(gmx_domdec_t
*dd
)
2674 if (dd
->comm
->ddpme
[0].dim
== XX
)
2676 return dd
->comm
->ddpme
[0].maxshift
;
2684 int dd_pme_maxshift_y(gmx_domdec_t
*dd
)
2686 if (dd
->comm
->ddpme
[0].dim
== YY
)
2688 return dd
->comm
->ddpme
[0].maxshift
;
2690 else if (dd
->comm
->npmedecompdim
>= 2 && dd
->comm
->ddpme
[1].dim
== YY
)
2692 return dd
->comm
->ddpme
[1].maxshift
;
static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
                             gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f)
{
    gmx_domdec_comm_t *comm;
    int                nc, ns, s;
    int               *xmin, *xmax;
    real               range, pme_boundary;
    int                sh;

    comm = dd->comm;
    nc   = dd->nc[ddpme->dim];
    ns   = ddpme->nslab;

    if (!ddpme->dim_match)
    {
        /* PP decomposition is not along dim: the worst situation */
        sh = ns/2;
    }
    else if (ns <= 3 || (bUniform && ns == nc))
    {
        /* The optimal situation */
        sh = 1;
    }
    else
    {
        /* We need to check for all pme nodes which nodes they
         * could possibly need to communicate with.
         */
        xmin = ddpme->pp_min;
        xmax = ddpme->pp_max;
        /* Allow for atoms to be maximally 2/3 times the cut-off
         * out of their DD cell. This is a reasonable balance between
         * performance and support for most charge-group/cut-off combinations.
         */
        range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
        /* Avoid extra communication when we are exactly at a boundary */
        range *= 0.999;

        sh = 1;
        for (s = 0; s < ns; s++)
        {
            /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
            pme_boundary = (real)s/ns;
            while (sh+1 < ns &&
                   ((s-(sh+1) >= 0 &&
                     cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
                    (s-(sh+1) <  0 &&
                     cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
            {
                sh++;
            }
            pme_boundary = (real)(s+1)/ns;
            while (sh+1 < ns &&
                   ((s+(sh+1) <  ns &&
                     cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
                    (s+(sh+1) >= ns &&
                     cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
            {
                sh++;
            }
        }
    }

    ddpme->maxshift = sh;

    if (debug)
    {
        fprintf(debug, "PME slab communication range for dim %d is %d\n",
                ddpme->dim, ddpme->maxshift);
    }
}
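/* Illustrative example (not part of the original source): maxshift = 1 means
 * a PME rank only needs coordinates from the PP ranks of its own slab and of
 * the two neighboring slabs; larger values widen the PME coordinate
 * redistribution and therefore cost extra communication.
 */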
static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
{
    int d, dim;

    for (d = 0; d < dd->ndim; d++)
    {
        dim = dd->dim[d];
        if (dim < ddbox->nboundeddim &&
            ddbox->box_size[dim]*ddbox->skew_fac[dim] <
            dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
        {
            gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
                      dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
                      dd->nc[dim], dd->comm->cellsize_limit);
        }
    }
}
enum {
    setcellsizeslbLOCAL, setcellsizeslbMASTER, setcellsizeslbPULSE_ONLY
};

/* Set the domain boundaries. Use for static (or no) load balancing,
 * and also for the starting state for dynamic load balancing.
 * setmode determines if and where the boundaries are stored, use enum above.
 * Returns the number of communication pulses in npulse.
 */
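/* Illustrative example (not part of the original source): with a cut-off of
 * 1.2 nm and a (skew-corrected) cell size of 0.9 nm along a dimension, one
 * communication pulse does not cover the cut-off, so npulse becomes 2
 * (2*0.9 = 1.8 >= 1.2); the loops below keep increasing npulse[d] while
 * cellsize*npulse[d] < comm->cutoff.
 */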
2800 static void set_dd_cell_sizes_slb(gmx_domdec_t
*dd
, gmx_ddbox_t
*ddbox
,
2801 int setmode
, ivec npulse
)
2803 gmx_domdec_comm_t
*comm
;
2806 real
*cell_x
, cell_dx
, cellsize
;
2810 for (d
= 0; d
< DIM
; d
++)
2812 cellsize_min
[d
] = ddbox
->box_size
[d
]*ddbox
->skew_fac
[d
];
2814 if (dd
->nc
[d
] == 1 || comm
->slb_frac
[d
] == NULL
)
2817 cell_dx
= ddbox
->box_size
[d
]/dd
->nc
[d
];
2820 case setcellsizeslbMASTER
:
2821 for (j
= 0; j
< dd
->nc
[d
]+1; j
++)
2823 dd
->ma
->cell_x
[d
][j
] = ddbox
->box0
[d
] + j
*cell_dx
;
2826 case setcellsizeslbLOCAL
:
2827 comm
->cell_x0
[d
] = ddbox
->box0
[d
] + (dd
->ci
[d
] )*cell_dx
;
2828 comm
->cell_x1
[d
] = ddbox
->box0
[d
] + (dd
->ci
[d
]+1)*cell_dx
;
2833 cellsize
= cell_dx
*ddbox
->skew_fac
[d
];
2834 while (cellsize
*npulse
[d
] < comm
->cutoff
)
2838 cellsize_min
[d
] = cellsize
;
2842 /* Statically load balanced grid */
2843 /* Also when we are not doing a master distribution we determine
2844 * all cell borders in a loop to obtain identical values
2845 * to the master distribution case and to determine npulse.
2847 if (setmode
== setcellsizeslbMASTER
)
2849 cell_x
= dd
->ma
->cell_x
[d
];
2853 snew(cell_x
, dd
->nc
[d
]+1);
2855 cell_x
[0] = ddbox
->box0
[d
];
2856 for (j
= 0; j
< dd
->nc
[d
]; j
++)
2858 cell_dx
= ddbox
->box_size
[d
]*comm
->slb_frac
[d
][j
];
2859 cell_x
[j
+1] = cell_x
[j
] + cell_dx
;
2860 cellsize
= cell_dx
*ddbox
->skew_fac
[d
];
2861 while (cellsize
*npulse
[d
] < comm
->cutoff
&&
2862 npulse
[d
] < dd
->nc
[d
]-1)
2866 cellsize_min
[d
] = std::min(cellsize_min
[d
], cellsize
);
2868 if (setmode
== setcellsizeslbLOCAL
)
2870 comm
->cell_x0
[d
] = cell_x
[dd
->ci
[d
]];
2871 comm
->cell_x1
[d
] = cell_x
[dd
->ci
[d
]+1];
2873 if (setmode
!= setcellsizeslbMASTER
)
2878 /* The following limitation is to avoid that a cell would receive
2879 * some of its own home charge groups back over the periodic boundary.
2880 * Double charge groups cause trouble with the global indices.
2882 if (d
< ddbox
->npbcdim
&&
2883 dd
->nc
[d
] > 1 && npulse
[d
] >= dd
->nc
[d
])
2885 char error_string
[STRLEN
];
2887 sprintf(error_string
,
2888 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
2889 dim2char(d
), ddbox
->box_size
[d
], ddbox
->skew_fac
[d
],
2891 dd
->nc
[d
], dd
->nc
[d
],
2892 dd
->nnodes
> dd
->nc
[d
] ? "cells" : "ranks");
2894 if (setmode
== setcellsizeslbLOCAL
)
2896 gmx_fatal_collective(FARGS
, dd
->mpi_comm_all
, DDMASTER(dd
),
2901 gmx_fatal(FARGS
, error_string
);
2908 copy_rvec(cellsize_min
, comm
->cellsize_min
);
2911 for (d
= 0; d
< comm
->npmedecompdim
; d
++)
2913 set_pme_maxshift(dd
, &comm
->ddpme
[d
],
2914 comm
->slb_frac
[dd
->dim
[d
]] == NULL
, ddbox
,
2915 comm
->ddpme
[d
].slb_dim_f
);
2920 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t
*dd
,
2921 int d
, int dim
, domdec_root_t
*root
,
2923 gmx_bool bUniform
, gmx_int64_t step
, real cellsize_limit_f
, int range
[])
2925 gmx_domdec_comm_t
*comm
;
2926 int ncd
, i
, j
, nmin
, nmin_old
;
2927 gmx_bool bLimLo
, bLimHi
;
2929 real fac
, halfway
, cellsize_limit_f_i
, region_size
;
2930 gmx_bool bPBC
, bLastHi
= FALSE
;
2931 int nrange
[] = {range
[0], range
[1]};
2933 region_size
= root
->cell_f
[range
[1]]-root
->cell_f
[range
[0]];
2939 bPBC
= (dim
< ddbox
->npbcdim
);
2941 cell_size
= root
->buf_ncd
;
2945 fprintf(debug
, "enforce_limits: %d %d\n", range
[0], range
[1]);
2948 /* First we need to check if the scaling does not make cells
2949 * smaller than the smallest allowed size.
2950 * We need to do this iteratively, since if a cell is too small,
2951 * it needs to be enlarged, which makes all the other cells smaller,
2952 * which could in turn make another cell smaller than allowed.
2954 for (i
= range
[0]; i
< range
[1]; i
++)
2956 root
->bCellMin
[i
] = FALSE
;
2962 /* We need the total for normalization */
2964 for (i
= range
[0]; i
< range
[1]; i
++)
2966 if (root
->bCellMin
[i
] == FALSE
)
2968 fac
+= cell_size
[i
];
2971 fac
= ( region_size
- nmin
*cellsize_limit_f
)/fac
; /* substracting cells already set to cellsize_limit_f */
2972 /* Determine the cell boundaries */
2973 for (i
= range
[0]; i
< range
[1]; i
++)
2975 if (root
->bCellMin
[i
] == FALSE
)
2977 cell_size
[i
] *= fac
;
2978 if (!bPBC
&& (i
== 0 || i
== dd
->nc
[dim
] -1))
2980 cellsize_limit_f_i
= 0;
2984 cellsize_limit_f_i
= cellsize_limit_f
;
2986 if (cell_size
[i
] < cellsize_limit_f_i
)
2988 root
->bCellMin
[i
] = TRUE
;
2989 cell_size
[i
] = cellsize_limit_f_i
;
2993 root
->cell_f
[i
+1] = root
->cell_f
[i
] + cell_size
[i
];
2996 while (nmin
> nmin_old
);
2999 cell_size
[i
] = root
->cell_f
[i
+1] - root
->cell_f
[i
];
3000 /* For this check we should not use DD_CELL_MARGIN,
3001 * but a slightly smaller factor,
3002 * since rounding could get use below the limit.
3004 if (bPBC
&& cell_size
[i
] < cellsize_limit_f
*DD_CELL_MARGIN2
/DD_CELL_MARGIN
)
3007 gmx_fatal(FARGS
, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3008 gmx_step_str(step
, buf
),
3009 dim2char(dim
), ddbox
->box_size
[dim
], ddbox
->skew_fac
[dim
],
3010 ncd
, comm
->cellsize_min
[dim
]);
3013 root
->bLimited
= (nmin
> 0) || (range
[0] > 0) || (range
[1] < ncd
);
3017 /* Check if the boundary did not displace more than halfway
3018 * each of the cells it bounds, as this could cause problems,
3019 * especially when the differences between cell sizes are large.
3020 * If changes are applied, they will not make cells smaller
3021 * than the cut-off, as we check all the boundaries which
3022 * might be affected by a change and if the old state was ok,
3023 * the cells will at most be shrunk back to their old size.
3025 for (i
= range
[0]+1; i
< range
[1]; i
++)
3027 halfway
= 0.5*(root
->old_cell_f
[i
] + root
->old_cell_f
[i
-1]);
3028 if (root
->cell_f
[i
] < halfway
)
3030 root
->cell_f
[i
] = halfway
;
3031 /* Check if the change also causes shifts of the next boundaries */
3032 for (j
= i
+1; j
< range
[1]; j
++)
3034 if (root
->cell_f
[j
] < root
->cell_f
[j
-1] + cellsize_limit_f
)
3036 root
->cell_f
[j
] = root
->cell_f
[j
-1] + cellsize_limit_f
;
3040 halfway
= 0.5*(root
->old_cell_f
[i
] + root
->old_cell_f
[i
+1]);
3041 if (root
->cell_f
[i
] > halfway
)
3043 root
->cell_f
[i
] = halfway
;
3044 /* Check if the change also causes shifts of the next boundaries */
3045 for (j
= i
-1; j
>= range
[0]+1; j
--)
3047 if (root
->cell_f
[j
] > root
->cell_f
[j
+1] - cellsize_limit_f
)
3049 root
->cell_f
[j
] = root
->cell_f
[j
+1] - cellsize_limit_f
;
3056 /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3057 /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3058 * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3059 * for a and b nrange is used */
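            /* Illustrative example (not part of the original source): with
             * range = [0,8), a LimLo violation at boundary a = 3 followed by a
             * LimHi violation at boundary b = 6 leads to recursive
             * enforce_limits calls on the sub-ranges [0,3) and [3,6); the
             * remaining part [6,8) is handled in the following step, as
             * described above.
             */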
3062 /* Take care of the staggering of the cell boundaries */
3065 for (i
= range
[0]; i
< range
[1]; i
++)
3067 root
->cell_f_max0
[i
] = root
->cell_f
[i
];
3068 root
->cell_f_min1
[i
] = root
->cell_f
[i
+1];
3073 for (i
= range
[0]+1; i
< range
[1]; i
++)
3075 bLimLo
= (root
->cell_f
[i
] < root
->bound_min
[i
]);
3076 bLimHi
= (root
->cell_f
[i
] > root
->bound_max
[i
]);
3077 if (bLimLo
&& bLimHi
)
3079 /* Both limits violated, try the best we can */
3080 /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3081 root
->cell_f
[i
] = 0.5*(root
->bound_min
[i
] + root
->bound_max
[i
]);
3082 nrange
[0] = range
[0];
3084 dd_cell_sizes_dlb_root_enforce_limits(dd
, d
, dim
, root
, ddbox
, bUniform
, step
, cellsize_limit_f
, nrange
);
3087 nrange
[1] = range
[1];
3088 dd_cell_sizes_dlb_root_enforce_limits(dd
, d
, dim
, root
, ddbox
, bUniform
, step
, cellsize_limit_f
, nrange
);
3094 /* root->cell_f[i] = root->bound_min[i]; */
3095 nrange
[1] = i
; /* only store violation location. There could be a LimLo violation following with an higher index */
3098 else if (bLimHi
&& !bLastHi
)
3101 if (nrange
[1] < range
[1]) /* found a LimLo before */
3103 root
->cell_f
[nrange
[1]] = root
->bound_min
[nrange
[1]];
3104 dd_cell_sizes_dlb_root_enforce_limits(dd
, d
, dim
, root
, ddbox
, bUniform
, step
, cellsize_limit_f
, nrange
);
3105 nrange
[0] = nrange
[1];
3107 root
->cell_f
[i
] = root
->bound_max
[i
];
3109 dd_cell_sizes_dlb_root_enforce_limits(dd
, d
, dim
, root
, ddbox
, bUniform
, step
, cellsize_limit_f
, nrange
);
3111 nrange
[1] = range
[1];
3114 if (nrange
[1] < range
[1]) /* found last a LimLo */
3116 root
->cell_f
[nrange
[1]] = root
->bound_min
[nrange
[1]];
3117 dd_cell_sizes_dlb_root_enforce_limits(dd
, d
, dim
, root
, ddbox
, bUniform
, step
, cellsize_limit_f
, nrange
);
3118 nrange
[0] = nrange
[1];
3119 nrange
[1] = range
[1];
3120 dd_cell_sizes_dlb_root_enforce_limits(dd
, d
, dim
, root
, ddbox
, bUniform
, step
, cellsize_limit_f
, nrange
);
3122 else if (nrange
[0] > range
[0]) /* found at least one LimHi */
3124 dd_cell_sizes_dlb_root_enforce_limits(dd
, d
, dim
, root
, ddbox
, bUniform
, step
, cellsize_limit_f
, nrange
);
3131 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t
*dd
,
3132 int d
, int dim
, domdec_root_t
*root
,
3133 gmx_ddbox_t
*ddbox
, gmx_bool bDynamicBox
,
3134 gmx_bool bUniform
, gmx_int64_t step
)
3136 gmx_domdec_comm_t
*comm
;
3137 int ncd
, d1
, i
, pos
;
3139 real load_aver
, load_i
, imbalance
, change
, change_max
, sc
;
3140 real cellsize_limit_f
, dist_min_f
, dist_min_f_hard
, space
;
3144 int range
[] = { 0, 0 };
3148 /* Convert the maximum change from the input percentage to a fraction */
3149 change_limit
= comm
->dlb_scale_lim
*0.01;
3153 bPBC
= (dim
< ddbox
->npbcdim
);
3155 cell_size
= root
->buf_ncd
;
3157 /* Store the original boundaries */
3158 for (i
= 0; i
< ncd
+1; i
++)
3160 root
->old_cell_f
[i
] = root
->cell_f
[i
];
3164 for (i
= 0; i
< ncd
; i
++)
3166 cell_size
[i
] = 1.0/ncd
;
3169 else if (dd_load_count(comm
) > 0)
3171 load_aver
= comm
->load
[d
].sum_m
/ncd
;
3173 for (i
= 0; i
< ncd
; i
++)
3175 /* Determine the relative imbalance of cell i */
3176 load_i
= comm
->load
[d
].load
[i
*comm
->load
[d
].nload
+2];
3177 imbalance
= (load_i
- load_aver
)/(load_aver
> 0 ? load_aver
: 1);
3178 /* Determine the change of the cell size using underrelaxation */
3179 change
= -relax
*imbalance
;
3180 change_max
= std::max(change_max
, std::max(change
, -change
));
3182 /* Limit the amount of scaling.
3183 * We need to use the same rescaling for all cells in one row,
3184 * otherwise the load balancing might not converge.
3187 if (change_max
> change_limit
)
3189 sc
*= change_limit
/change_max
;
3191 for (i
= 0; i
< ncd
; i
++)
3193 /* Determine the relative imbalance of cell i */
3194 load_i
= comm
->load
[d
].load
[i
*comm
->load
[d
].nload
+2];
3195 imbalance
= (load_i
- load_aver
)/(load_aver
> 0 ? load_aver
: 1);
3196 /* Determine the change of the cell size using underrelaxation */
3197 change
= -sc
*imbalance
;
3198 cell_size
[i
] = (root
->cell_f
[i
+1]-root
->cell_f
[i
])*(1 + change
);
3202 cellsize_limit_f
= cellsize_min_dlb(comm
, d
, dim
)/ddbox
->box_size
[dim
];
3203 cellsize_limit_f
*= DD_CELL_MARGIN
;
3204 dist_min_f_hard
= grid_jump_limit(comm
, comm
->cutoff
, d
)/ddbox
->box_size
[dim
];
3205 dist_min_f
= dist_min_f_hard
* DD_CELL_MARGIN
;
3206 if (ddbox
->tric_dir
[dim
])
3208 cellsize_limit_f
/= ddbox
->skew_fac
[dim
];
3209 dist_min_f
/= ddbox
->skew_fac
[dim
];
3211 if (bDynamicBox
&& d
> 0)
3213 dist_min_f
*= DD_PRES_SCALE_MARGIN
;
3215 if (d
> 0 && !bUniform
)
3217 /* Make sure that the grid is not shifted too much */
3218 for (i
= 1; i
< ncd
; i
++)
3220 if (root
->cell_f_min1
[i
] - root
->cell_f_max0
[i
-1] < 2 * dist_min_f_hard
)
3222 gmx_incons("Inconsistent DD boundary staggering limits!");
3224 root
->bound_min
[i
] = root
->cell_f_max0
[i
-1] + dist_min_f
;
3225 space
= root
->cell_f
[i
] - (root
->cell_f_max0
[i
-1] + dist_min_f
);
3228 root
->bound_min
[i
] += 0.5*space
;
3230 root
->bound_max
[i
] = root
->cell_f_min1
[i
] - dist_min_f
;
3231 space
= root
->cell_f
[i
] - (root
->cell_f_min1
[i
] - dist_min_f
);
3234 root
->bound_max
[i
] += 0.5*space
;
3239 "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3241 root
->cell_f_max0
[i
-1] + dist_min_f
,
3242 root
->bound_min
[i
], root
->cell_f
[i
], root
->bound_max
[i
],
3243 root
->cell_f_min1
[i
] - dist_min_f
);
3248 root
->cell_f
[0] = 0;
3249 root
->cell_f
[ncd
] = 1;
3250 dd_cell_sizes_dlb_root_enforce_limits(dd
, d
, dim
, root
, ddbox
, bUniform
, step
, cellsize_limit_f
, range
);
3253 /* After the checks above, the cells should obey the cut-off
3254 * restrictions, but it does not hurt to check.
3256 for (i
= 0; i
< ncd
; i
++)
3260 fprintf(debug
, "Relative bounds dim %d cell %d: %f %f\n",
3261 dim
, i
, root
->cell_f
[i
], root
->cell_f
[i
+1]);
3264 if ((bPBC
|| (i
!= 0 && i
!= dd
->nc
[dim
]-1)) &&
3265 root
->cell_f
[i
+1] - root
->cell_f
[i
] <
3266 cellsize_limit_f
/DD_CELL_MARGIN
)
3270 "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3271 gmx_step_str(step
, buf
), dim2char(dim
), i
,
3272 (root
->cell_f
[i
+1] - root
->cell_f
[i
])
3273 *ddbox
->box_size
[dim
]*ddbox
->skew_fac
[dim
]);
3278 /* Store the cell boundaries of the lower dimensions at the end */
3279 for (d1
= 0; d1
< d
; d1
++)
3281 root
->cell_f
[pos
++] = comm
->cell_f0
[d1
];
3282 root
->cell_f
[pos
++] = comm
->cell_f1
[d1
];
3285 if (d
< comm
->npmedecompdim
)
3287 /* The master determines the maximum shift for
3288 * the coordinate communication between separate PME nodes.
3290 set_pme_maxshift(dd
, &comm
->ddpme
[d
], bUniform
, ddbox
, root
->cell_f
);
3292 root
->cell_f
[pos
++] = comm
->ddpme
[0].maxshift
;
3295 root
->cell_f
[pos
++] = comm
->ddpme
[1].maxshift
;
static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
                                             gmx_ddbox_t *ddbox, int dimind)
{
    gmx_domdec_comm_t *comm;
    int                dim;

    comm = dd->comm;

    /* Set the cell dimensions */
    dim                = dd->dim[dimind];
    comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
    comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
    if (dim >= ddbox->nboundeddim)
    {
        comm->cell_x0[dim] += ddbox->box0[dim];
        comm->cell_x1[dim] += ddbox->box0[dim];
    }
}
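/* Worked example (illustrative only, not from the original source): if this
 * rank owns the fraction range cell_f0 = 0.25 to cell_f1 = 0.50 of a 6 nm box
 * along dim, its absolute boundaries become cell_x0 = 1.5 nm and
 * cell_x1 = 3.0 nm; for an unbounded dimension the box origin box0[dim] is
 * added on top.
 */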
static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
                                         int d, int dim, real *cell_f_row,
                                         gmx_ddbox_t *ddbox)
{
    gmx_domdec_comm_t *comm;
    int                d1, pos;

    comm = dd->comm;

    /* Each node would only need to know two fractions,
     * but it is probably cheaper to broadcast the whole array.
     */
    MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
              0, comm->mpi_comm_load[d]);

    /* Copy the fractions for this dimension from the buffer */
    comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
    comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
    /* The whole array was communicated, so set the buffer position */
    pos = dd->nc[dim] + 1;
    for (d1 = 0; d1 <= d; d1++)
    {
        if (d1 < d)
        {
            /* Copy the cell fractions of the lower dimensions */
            comm->cell_f0[d1] = cell_f_row[pos++];
            comm->cell_f1[d1] = cell_f_row[pos++];
        }
        relative_to_absolute_cell_bounds(dd, ddbox, d1);
    }
    /* Convert the communicated shift from float to int */
    comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
    if (d >= 1)
    {
        comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
    }
}
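/* Illustrative layout of cell_f_row (not part of the original source), for a
 * row of nc cells in dimension d:
 *   [0 .. nc]              the nc+1 cell fractions of this dimension
 *   [nc+1 ..]              cell_f0/cell_f1 pairs of the lower dimensions
 *   [last one or two]      the PME maxshift value(s), stored as float
 * Every rank receives the whole array and picks out its own two fractions.
 */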
static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
                                         gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
                                         gmx_bool bUniform, gmx_int64_t step)
{
    gmx_domdec_comm_t *comm;
    int                d, dim, d1;
    gmx_bool           bRowMember, bRowRoot;
    real              *cell_f_row;

    comm = dd->comm;

    for (d = 0; d < dd->ndim; d++)
    {
        dim        = dd->dim[d];
        bRowMember = TRUE;
        bRowRoot   = TRUE;
        for (d1 = d; d1 < dd->ndim; d1++)
        {
            if (dd->ci[dd->dim[d1]] > 0)
            {
                if (d1 != d)
                {
                    bRowMember = FALSE;
                }
                bRowRoot = FALSE;
            }
        }
        if (bRowMember)
        {
            if (bRowRoot)
            {
                set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
                                           ddbox, bDynamicBox, bUniform, step);
                cell_f_row = comm->root[d]->cell_f;
            }
            else
            {
                cell_f_row = comm->cell_f_row;
            }
            distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
        }
    }
}
static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
{
    int d;

    /* This function assumes the box is static and should therefore
     * not be called when the box has changed since the last
     * call to dd_partition_system.
     */
    for (d = 0; d < dd->ndim; d++)
    {
        relative_to_absolute_cell_bounds(dd, ddbox, d);
    }
}
static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
                                  gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
                                  gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
                                  gmx_wallcycle_t wcycle)
{
    gmx_domdec_comm_t *comm;
    int                dim;

    comm = dd->comm;

    if (bDoDLB)
    {
        wallcycle_start(wcycle, ewcDDCOMMBOUND);
        set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
        wallcycle_stop(wcycle, ewcDDCOMMBOUND);
    }
    else if (bDynamicBox)
    {
        set_dd_cell_sizes_dlb_nochange(dd, ddbox);
    }

    /* Set the dimensions for which no DD is used */
    for (dim = 0; dim < DIM; dim++)
    {
        if (dd->nc[dim] == 1)
        {
            comm->cell_x0[dim] = 0;
            comm->cell_x1[dim] = ddbox->box_size[dim];
            if (dim >= ddbox->nboundeddim)
            {
                comm->cell_x0[dim] += ddbox->box0[dim];
                comm->cell_x1[dim] += ddbox->box0[dim];
            }
        }
    }
}
static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
{
    int                    d, np, i;
    gmx_domdec_comm_dim_t *cd;

    for (d = 0; d < dd->ndim; d++)
    {
        cd = &dd->comm->cd[d];
        np = npulse[dd->dim[d]];
        if (np > cd->np_nalloc)
        {
            if (debug)
            {
                fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
                        dim2char(dd->dim[d]), np);
            }
            if (DDMASTER(dd) && cd->np_nalloc > 0)
            {
                fprintf(stderr, "\nIncreasing the number of cells to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
            }
            srenew(cd->ind, np);
            for (i = cd->np_nalloc; i < np; i++)
            {
                cd->ind[i].index  = NULL;
                cd->ind[i].nalloc = 0;
            }
            cd->np_nalloc = np;
        }
    }
}
static void set_dd_cell_sizes(gmx_domdec_t *dd,
                              gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
                              gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
                              gmx_wallcycle_t wcycle)
{
    gmx_domdec_comm_t *comm;
    int                d;
    ivec               npulse;

    comm = dd->comm;

    /* Copy the old cell boundaries for the cg displacement check */
    copy_rvec(comm->cell_x0, comm->old_cell_x0);
    copy_rvec(comm->cell_x1, comm->old_cell_x1);

    if (dlbIsOn(comm))
    {
        if (DDMASTER(dd))
        {
            check_box_size(dd, ddbox);
        }
        set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
    }
    else
    {
        set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbLOCAL, npulse);
        realloc_comm_ind(dd, npulse);
    }

    if (debug)
    {
        for (d = 0; d < DIM; d++)
        {
            fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
                    d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
        }
    }
}
static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
                                  gmx_ddbox_t *ddbox,
                                  rvec cell_ns_x0, rvec cell_ns_x1,
                                  gmx_int64_t step)
{
    gmx_domdec_comm_t *comm;
    int                dim_ind, dim;

    comm = dd->comm;

    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
    {
        dim = dd->dim[dim_ind];

        /* Without PBC we don't have restrictions on the outer cells */
        if (!(dim >= ddbox->npbcdim &&
              (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
            dlbIsOn(comm) &&
            (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
            comm->cellsize_min[dim])
        {
            char buf[22];
            gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
                      gmx_step_str(step, buf), dim2char(dim),
                      comm->cell_x1[dim] - comm->cell_x0[dim],
                      ddbox->skew_fac[dim],
                      dd->comm->cellsize_min[dim],
                      dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
        }
    }

    if ((dlbIsOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
    {
        /* Communicate the boundaries and update cell_ns_x0/1 */
        dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
        if (dlbIsOn(dd->comm) && dd->ndim > 1)
        {
            check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
        }
    }
}
static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
{
    if (YY < npbcdim)
    {
        tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
    }
    else
    {
        tcm[YY][XX] = 0;
    }
    if (ZZ < npbcdim)
    {
        tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
        tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
    }
    else
    {
        tcm[ZZ][XX] = 0;
        tcm[ZZ][YY] = 0;
    }
}

static void check_screw_box(matrix box)
{
    /* Mathematical limitation */
    if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
    {
        gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
    }

    /* Limitation due to the asymmetry of the eighth shell method */
    if (box[ZZ][YY] != 0)
    {
        gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
    }
}
3605 static void distribute_cg(FILE *fplog
,
3606 matrix box
, ivec tric_dir
, t_block
*cgs
, rvec pos
[],
3609 gmx_domdec_master_t
*ma
;
3610 int **tmp_ind
= NULL
, *tmp_nalloc
= NULL
;
3611 int i
, icg
, j
, k
, k0
, k1
, d
;
3615 real nrcg
, inv_ncg
, pos_d
;
3621 if (tmp_ind
== NULL
)
3623 snew(tmp_nalloc
, dd
->nnodes
);
3624 snew(tmp_ind
, dd
->nnodes
);
3625 for (i
= 0; i
< dd
->nnodes
; i
++)
3627 tmp_nalloc
[i
] = over_alloc_large(cgs
->nr
/dd
->nnodes
+1);
3628 snew(tmp_ind
[i
], tmp_nalloc
[i
]);
3632 /* Clear the count */
3633 for (i
= 0; i
< dd
->nnodes
; i
++)
3639 make_tric_corr_matrix(dd
->npbcdim
, box
, tcm
);
3641 cgindex
= cgs
->index
;
3643 /* Compute the center of geometry for all charge groups */
3644 for (icg
= 0; icg
< cgs
->nr
; icg
++)
3647 k1
= cgindex
[icg
+1];
3651 copy_rvec(pos
[k0
], cg_cm
);
3658 for (k
= k0
; (k
< k1
); k
++)
3660 rvec_inc(cg_cm
, pos
[k
]);
3662 for (d
= 0; (d
< DIM
); d
++)
3664 cg_cm
[d
] *= inv_ncg
;
3667 /* Put the charge group in the box and determine the cell index */
3668 for (d
= DIM
-1; d
>= 0; d
--)
3671 if (d
< dd
->npbcdim
)
3673 bScrew
= (dd
->bScrewPBC
&& d
== XX
);
3674 if (tric_dir
[d
] && dd
->nc
[d
] > 1)
3676 /* Use triclinic coordintates for this dimension */
3677 for (j
= d
+1; j
< DIM
; j
++)
3679 pos_d
+= cg_cm
[j
]*tcm
[j
][d
];
3682 while (pos_d
>= box
[d
][d
])
3685 rvec_dec(cg_cm
, box
[d
]);
3688 cg_cm
[YY
] = box
[YY
][YY
] - cg_cm
[YY
];
3689 cg_cm
[ZZ
] = box
[ZZ
][ZZ
] - cg_cm
[ZZ
];
3691 for (k
= k0
; (k
< k1
); k
++)
3693 rvec_dec(pos
[k
], box
[d
]);
3696 pos
[k
][YY
] = box
[YY
][YY
] - pos
[k
][YY
];
3697 pos
[k
][ZZ
] = box
[ZZ
][ZZ
] - pos
[k
][ZZ
];
3704 rvec_inc(cg_cm
, box
[d
]);
3707 cg_cm
[YY
] = box
[YY
][YY
] - cg_cm
[YY
];
3708 cg_cm
[ZZ
] = box
[ZZ
][ZZ
] - cg_cm
[ZZ
];
3710 for (k
= k0
; (k
< k1
); k
++)
3712 rvec_inc(pos
[k
], box
[d
]);
3715 pos
[k
][YY
] = box
[YY
][YY
] - pos
[k
][YY
];
3716 pos
[k
][ZZ
] = box
[ZZ
][ZZ
] - pos
[k
][ZZ
];
3721 /* This could be done more efficiently */
3723 while (ind
[d
]+1 < dd
->nc
[d
] && pos_d
>= ma
->cell_x
[d
][ind
[d
]+1])
3728 i
= dd_index(dd
->nc
, ind
);
3729 if (ma
->ncg
[i
] == tmp_nalloc
[i
])
3731 tmp_nalloc
[i
] = over_alloc_large(ma
->ncg
[i
]+1);
3732 srenew(tmp_ind
[i
], tmp_nalloc
[i
]);
3734 tmp_ind
[i
][ma
->ncg
[i
]] = icg
;
3736 ma
->nat
[i
] += cgindex
[icg
+1] - cgindex
[icg
];
3740 for (i
= 0; i
< dd
->nnodes
; i
++)
3743 for (k
= 0; k
< ma
->ncg
[i
]; k
++)
3745 ma
->cg
[k1
++] = tmp_ind
[i
][k
];
3748 ma
->index
[dd
->nnodes
] = k1
;
3750 for (i
= 0; i
< dd
->nnodes
; i
++)
3759 // Use double for the sums to avoid natoms^2 overflowing
3761 int nat_sum
, nat_min
, nat_max
;
3766 nat_min
= ma
->nat
[0];
3767 nat_max
= ma
->nat
[0];
3768 for (i
= 0; i
< dd
->nnodes
; i
++)
3770 nat_sum
+= ma
->nat
[i
];
3771 // cast to double to avoid integer overflows when squaring
3772 nat2_sum
+= gmx::square(static_cast<double>(ma
->nat
[i
]));
3773 nat_min
= std::min(nat_min
, ma
->nat
[i
]);
3774 nat_max
= std::max(nat_max
, ma
->nat
[i
]);
3776 nat_sum
/= dd
->nnodes
;
3777 nat2_sum
/= dd
->nnodes
;
3779 fprintf(fplog
, "Atom distribution over %d domains: av %d stddev %d min %d max %d\n",
3782 static_cast<int>(std::sqrt(nat2_sum
- gmx::square(static_cast<double>(nat_sum
)) + 0.5)),
3787 static void get_cg_distribution(FILE *fplog
, gmx_domdec_t
*dd
,
3788 t_block
*cgs
, matrix box
, gmx_ddbox_t
*ddbox
,
3791 gmx_domdec_master_t
*ma
= NULL
;
3794 int *ibuf
, buf2
[2] = { 0, 0 };
3795 gmx_bool bMaster
= DDMASTER(dd
);
3803 check_screw_box(box
);
3806 set_dd_cell_sizes_slb(dd
, ddbox
, setcellsizeslbMASTER
, npulse
);
3808 distribute_cg(fplog
, box
, ddbox
->tric_dir
, cgs
, pos
, dd
);
3809 for (i
= 0; i
< dd
->nnodes
; i
++)
3811 ma
->ibuf
[2*i
] = ma
->ncg
[i
];
3812 ma
->ibuf
[2*i
+1] = ma
->nat
[i
];
3820 dd_scatter(dd
, 2*sizeof(int), ibuf
, buf2
);
3822 dd
->ncg_home
= buf2
[0];
3823 dd
->nat_home
= buf2
[1];
3824 dd
->ncg_tot
= dd
->ncg_home
;
3825 dd
->nat_tot
= dd
->nat_home
;
3826 if (dd
->ncg_home
> dd
->cg_nalloc
|| dd
->cg_nalloc
== 0)
3828 dd
->cg_nalloc
= over_alloc_dd(dd
->ncg_home
);
3829 srenew(dd
->index_gl
, dd
->cg_nalloc
);
3830 srenew(dd
->cgindex
, dd
->cg_nalloc
+1);
3834 for (i
= 0; i
< dd
->nnodes
; i
++)
3836 ma
->ibuf
[i
] = ma
->ncg
[i
]*sizeof(int);
3837 ma
->ibuf
[dd
->nnodes
+i
] = ma
->index
[i
]*sizeof(int);
3842 bMaster
? ma
->ibuf
: NULL
,
3843 bMaster
? ma
->ibuf
+dd
->nnodes
: NULL
,
3844 bMaster
? ma
->cg
: NULL
,
3845 dd
->ncg_home
*sizeof(int), dd
->index_gl
);
3847 /* Determine the home charge group sizes */
3849 for (i
= 0; i
< dd
->ncg_home
; i
++)
3851 cg_gl
= dd
->index_gl
[i
];
3853 dd
->cgindex
[i
] + cgs
->index
[cg_gl
+1] - cgs
->index
[cg_gl
];
3858 fprintf(debug
, "Home charge groups:\n");
3859 for (i
= 0; i
< dd
->ncg_home
; i
++)
3861 fprintf(debug
, " %d", dd
->index_gl
[i
]);
3864 fprintf(debug
, "\n");
3867 fprintf(debug
, "\n");
3871 static int compact_and_copy_vec_at(int ncg
, int *move
,
3874 rvec
*src
, gmx_domdec_comm_t
*comm
,
3877 int m
, icg
, i
, i0
, i1
, nrcg
;
3883 for (m
= 0; m
< DIM
*2; m
++)
3889 for (icg
= 0; icg
< ncg
; icg
++)
3891 i1
= cgindex
[icg
+1];
3897 /* Compact the home array in place */
3898 for (i
= i0
; i
< i1
; i
++)
3900 copy_rvec(src
[i
], src
[home_pos
++]);
3906 /* Copy to the communication buffer */
3908 pos_vec
[m
] += 1 + vec
*nrcg
;
3909 for (i
= i0
; i
< i1
; i
++)
3911 copy_rvec(src
[i
], comm
->cgcm_state
[m
][pos_vec
[m
]++]);
3913 pos_vec
[m
] += (nvec
- vec
- 1)*nrcg
;
3917 home_pos
+= i1
- i0
;
3925 static int compact_and_copy_vec_cg(int ncg
, int *move
,
3927 int nvec
, rvec
*src
, gmx_domdec_comm_t
*comm
,
3930 int m
, icg
, i0
, i1
, nrcg
;
3936 for (m
= 0; m
< DIM
*2; m
++)
3942 for (icg
= 0; icg
< ncg
; icg
++)
3944 i1
= cgindex
[icg
+1];
3950 /* Compact the home array in place */
3951 copy_rvec(src
[icg
], src
[home_pos
++]);
3957 /* Copy to the communication buffer */
3958 copy_rvec(src
[icg
], comm
->cgcm_state
[m
][pos_vec
[m
]]);
3959 pos_vec
[m
] += 1 + nrcg
*nvec
;
3971 static int compact_ind(int ncg
, int *move
,
3972 int *index_gl
, int *cgindex
,
3974 gmx_ga2la_t
*ga2la
, char *bLocalCG
,
3977 int cg
, nat
, a0
, a1
, a
, a_gl
;
3982 for (cg
= 0; cg
< ncg
; cg
++)
3988 /* Compact the home arrays in place.
3989 * Anything that can be done here avoids access to global arrays.
3991 cgindex
[home_pos
] = nat
;
3992 for (a
= a0
; a
< a1
; a
++)
3995 gatindex
[nat
] = a_gl
;
3996 /* The cell number stays 0, so we don't need to set it */
3997 ga2la_change_la(ga2la
, a_gl
, nat
);
4000 index_gl
[home_pos
] = index_gl
[cg
];
4001 cginfo
[home_pos
] = cginfo
[cg
];
4002 /* The charge group remains local, so bLocalCG does not change */
4007 /* Clear the global indices */
4008 for (a
= a0
; a
< a1
; a
++)
4010 ga2la_del(ga2la
, gatindex
[a
]);
4014 bLocalCG
[index_gl
[cg
]] = FALSE
;
4018 cgindex
[home_pos
] = nat
;
4023 static void clear_and_mark_ind(int ncg
, int *move
,
4024 int *index_gl
, int *cgindex
, int *gatindex
,
4025 gmx_ga2la_t
*ga2la
, char *bLocalCG
,
4030 for (cg
= 0; cg
< ncg
; cg
++)
4036 /* Clear the global indices */
4037 for (a
= a0
; a
< a1
; a
++)
4039 ga2la_del(ga2la
, gatindex
[a
]);
4043 bLocalCG
[index_gl
[cg
]] = FALSE
;
4045 /* Signal that this cg has moved using the ns cell index.
4046 * Here we set it to -1. fill_grid will change it
4047 * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4049 cell_index
[cg
] = -1;
4054 static void print_cg_move(FILE *fplog
,
4056 gmx_int64_t step
, int cg
, int dim
, int dir
,
4057 gmx_bool bHaveCgcmOld
, real limitd
,
4058 rvec cm_old
, rvec cm_new
, real pos_d
)
4060 gmx_domdec_comm_t
*comm
;
4065 fprintf(fplog
, "\nStep %s:\n", gmx_step_str(step
, buf
));
4068 fprintf(fplog
, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4069 dd
->comm
->bCGs
? "The charge group starting at atom" : "Atom",
4070 ddglatnr(dd
, dd
->cgindex
[cg
]), limitd
, dim2char(dim
));
4074 /* We don't have a limiting distance available: don't print it */
4075 fprintf(fplog
, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n",
4076 dd
->comm
->bCGs
? "The charge group starting at atom" : "Atom",
4077 ddglatnr(dd
, dd
->cgindex
[cg
]), dim2char(dim
));
4079 fprintf(fplog
, "distance out of cell %f\n",
4080 dir
== 1 ? pos_d
- comm
->cell_x1
[dim
] : pos_d
- comm
->cell_x0
[dim
]);
4083 fprintf(fplog
, "Old coordinates: %8.3f %8.3f %8.3f\n",
4084 cm_old
[XX
], cm_old
[YY
], cm_old
[ZZ
]);
4086 fprintf(fplog
, "New coordinates: %8.3f %8.3f %8.3f\n",
4087 cm_new
[XX
], cm_new
[YY
], cm_new
[ZZ
]);
4088 fprintf(fplog
, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4090 comm
->old_cell_x0
[dim
], comm
->old_cell_x1
[dim
]);
4091 fprintf(fplog
, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4093 comm
->cell_x0
[dim
], comm
->cell_x1
[dim
]);
4096 static void cg_move_error(FILE *fplog
,
4098 gmx_int64_t step
, int cg
, int dim
, int dir
,
4099 gmx_bool bHaveCgcmOld
, real limitd
,
4100 rvec cm_old
, rvec cm_new
, real pos_d
)
4104 print_cg_move(fplog
, dd
, step
, cg
, dim
, dir
,
4105 bHaveCgcmOld
, limitd
, cm_old
, cm_new
, pos_d
);
4107 print_cg_move(stderr
, dd
, step
, cg
, dim
, dir
,
4108 bHaveCgcmOld
, limitd
, cm_old
, cm_new
, pos_d
);
4110 "%s moved too far between two domain decomposition steps\n"
4111 "This usually means that your system is not well equilibrated",
4112 dd
->comm
->bCGs
? "A charge group" : "An atom");
4115 static void rotate_state_atom(t_state
*state
, int a
)
4119 for (est
= 0; est
< estNR
; est
++)
4121 if (EST_DISTR(est
) && (state
->flags
& (1<<est
)))
4126 /* Rotate the complete state; for a rectangular box only */
4127 state
->x
[a
][YY
] = state
->box
[YY
][YY
] - state
->x
[a
][YY
];
4128 state
->x
[a
][ZZ
] = state
->box
[ZZ
][ZZ
] - state
->x
[a
][ZZ
];
4131 state
->v
[a
][YY
] = -state
->v
[a
][YY
];
4132 state
->v
[a
][ZZ
] = -state
->v
[a
][ZZ
];
4135 state
->sd_X
[a
][YY
] = -state
->sd_X
[a
][YY
];
4136 state
->sd_X
[a
][ZZ
] = -state
->sd_X
[a
][ZZ
];
4139 state
->cg_p
[a
][YY
] = -state
->cg_p
[a
][YY
];
4140 state
->cg_p
[a
][ZZ
] = -state
->cg_p
[a
][ZZ
];
4142 case estDISRE_INITF
:
4143 case estDISRE_RM3TAV
:
4144 case estORIRE_INITF
:
4146 /* These are distances, so not affected by rotation */
4149 gmx_incons("Unknown state entry encountered in rotate_state_atom");
4155 static int *get_moved(gmx_domdec_comm_t
*comm
, int natoms
)
4157 if (natoms
> comm
->moved_nalloc
)
4159 /* Contents should be preserved here */
4160 comm
->moved_nalloc
= over_alloc_dd(natoms
);
4161 srenew(comm
->moved
, comm
->moved_nalloc
);
4167 static void calc_cg_move(FILE *fplog
, gmx_int64_t step
,
4170 ivec tric_dir
, matrix tcm
,
4171 rvec cell_x0
, rvec cell_x1
,
4172 rvec limitd
, rvec limit0
, rvec limit1
,
4174 int cg_start
, int cg_end
,
4179 int cg
, k
, k0
, k1
, d
, dim
, d2
;
4184 real inv_ncg
, pos_d
;
4187 npbcdim
= dd
->npbcdim
;
4189 for (cg
= cg_start
; cg
< cg_end
; cg
++)
4196 copy_rvec(state
->x
[k0
], cm_new
);
4203 for (k
= k0
; (k
< k1
); k
++)
4205 rvec_inc(cm_new
, state
->x
[k
]);
4207 for (d
= 0; (d
< DIM
); d
++)
4209 cm_new
[d
] = inv_ncg
*cm_new
[d
];
4214 /* Do pbc and check DD cell boundary crossings */
4215 for (d
= DIM
-1; d
>= 0; d
--)
4219 bScrew
= (dd
->bScrewPBC
&& d
== XX
);
4220 /* Determine the location of this cg in lattice coordinates */
4224 for (d2
= d
+1; d2
< DIM
; d2
++)
4226 pos_d
+= cm_new
[d2
]*tcm
[d2
][d
];
4229 /* Put the charge group in the triclinic unit-cell */
4230 if (pos_d
>= cell_x1
[d
])
4232 if (pos_d
>= limit1
[d
])
4234 cg_move_error(fplog
, dd
, step
, cg
, d
, 1,
4235 cg_cm
!= state
->x
, limitd
[d
],
4236 cg_cm
[cg
], cm_new
, pos_d
);
4239 if (dd
->ci
[d
] == dd
->nc
[d
] - 1)
4241 rvec_dec(cm_new
, state
->box
[d
]);
4244 cm_new
[YY
] = state
->box
[YY
][YY
] - cm_new
[YY
];
4245 cm_new
[ZZ
] = state
->box
[ZZ
][ZZ
] - cm_new
[ZZ
];
4247 for (k
= k0
; (k
< k1
); k
++)
4249 rvec_dec(state
->x
[k
], state
->box
[d
]);
4252 rotate_state_atom(state
, k
);
4257 else if (pos_d
< cell_x0
[d
])
4259 if (pos_d
< limit0
[d
])
4261 cg_move_error(fplog
, dd
, step
, cg
, d
, -1,
4262 cg_cm
!= state
->x
, limitd
[d
],
4263 cg_cm
[cg
], cm_new
, pos_d
);
4268 rvec_inc(cm_new
, state
->box
[d
]);
4271 cm_new
[YY
] = state
->box
[YY
][YY
] - cm_new
[YY
];
4272 cm_new
[ZZ
] = state
->box
[ZZ
][ZZ
] - cm_new
[ZZ
];
4274 for (k
= k0
; (k
< k1
); k
++)
4276 rvec_inc(state
->x
[k
], state
->box
[d
]);
4279 rotate_state_atom(state
, k
);
4285 else if (d
< npbcdim
)
4287 /* Put the charge group in the rectangular unit-cell */
4288 while (cm_new
[d
] >= state
->box
[d
][d
])
4290 rvec_dec(cm_new
, state
->box
[d
]);
4291 for (k
= k0
; (k
< k1
); k
++)
4293 rvec_dec(state
->x
[k
], state
->box
[d
]);
4296 while (cm_new
[d
] < 0)
4298 rvec_inc(cm_new
, state
->box
[d
]);
4299 for (k
= k0
; (k
< k1
); k
++)
4301 rvec_inc(state
->x
[k
], state
->box
[d
]);
4307 copy_rvec(cm_new
, cg_cm
[cg
]);
4309 /* Determine where this cg should go */
4312 for (d
= 0; d
< dd
->ndim
; d
++)
4317 flag
|= DD_FLAG_FW(d
);
4323 else if (dev
[dim
] == -1)
4325 flag
|= DD_FLAG_BW(d
);
4328 if (dd
->nc
[dim
] > 2)
4339 /* Temporarily store the flag in move */
4340 move
[cg
] = mc
+ flag
;
4344 static void dd_redistribute_cg(FILE *fplog
, gmx_int64_t step
,
4345 gmx_domdec_t
*dd
, ivec tric_dir
,
4346 t_state
*state
, rvec
**f
,
4355 int ncg
[DIM
*2], nat
[DIM
*2];
4356 int c
, i
, cg
, k
, d
, dim
, dim2
, dir
, d2
, d3
;
4357 int mc
, cdd
, nrcg
, ncg_recv
, nvs
, nvr
, nvec
, vec
;
4358 int sbuf
[2], rbuf
[2];
4359 int home_pos_cg
, home_pos_at
, buf_pos
;
4361 gmx_bool bV
= FALSE
, bSDX
= FALSE
, bCGP
= FALSE
;
4364 rvec
*cg_cm
= NULL
, cell_x0
, cell_x1
, limitd
, limit0
, limit1
;
4366 cginfo_mb_t
*cginfo_mb
;
4367 gmx_domdec_comm_t
*comm
;
4369 int nthread
, thread
;
4373 check_screw_box(state
->box
);
4377 if (fr
->cutoff_scheme
== ecutsGROUP
)
4382 for (i
= 0; i
< estNR
; i
++)
4388 case estX
: /* Always present */ break;
4389 case estV
: bV
= (state
->flags
& (1<<i
)); break;
4390 case estSDX
: bSDX
= (state
->flags
& (1<<i
)); break;
4391 case estCGP
: bCGP
= (state
->flags
& (1<<i
)); break;
4394 case estDISRE_INITF
:
4395 case estDISRE_RM3TAV
:
4396 case estORIRE_INITF
:
4398 /* No processing required */
4401 gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4406 if (dd
->ncg_tot
> comm
->nalloc_int
)
4408 comm
->nalloc_int
= over_alloc_dd(dd
->ncg_tot
);
4409 srenew(comm
->buf_int
, comm
->nalloc_int
);
4411 move
= comm
->buf_int
;
4413 /* Clear the count */
4414 for (c
= 0; c
< dd
->ndim
*2; c
++)
4420 npbcdim
= dd
->npbcdim
;
4422 for (d
= 0; (d
< DIM
); d
++)
4424 limitd
[d
] = dd
->comm
->cellsize_min
[d
];
4425 if (d
>= npbcdim
&& dd
->ci
[d
] == 0)
4427 cell_x0
[d
] = -GMX_FLOAT_MAX
;
4431 cell_x0
[d
] = comm
->cell_x0
[d
];
4433 if (d
>= npbcdim
&& dd
->ci
[d
] == dd
->nc
[d
] - 1)
4435 cell_x1
[d
] = GMX_FLOAT_MAX
;
4439 cell_x1
[d
] = comm
->cell_x1
[d
];
4443 limit0
[d
] = comm
->old_cell_x0
[d
] - limitd
[d
];
4444 limit1
[d
] = comm
->old_cell_x1
[d
] + limitd
[d
];
4448 /* We check after communication if a charge group moved
4449 * more than one cell. Set the pre-comm check limit to float_max.
4451 limit0
[d
] = -GMX_FLOAT_MAX
;
4452 limit1
[d
] = GMX_FLOAT_MAX
;
4456 make_tric_corr_matrix(npbcdim
, state
->box
, tcm
);
4458 cgindex
= dd
->cgindex
;
4460 nthread
= gmx_omp_nthreads_get(emntDomdec
);
4462 /* Compute the center of geometry for all home charge groups
4463 * and put them in the box and determine where they should go.
4465 #pragma omp parallel for num_threads(nthread) schedule(static)
4466 for (thread
= 0; thread
< nthread
; thread
++)
4470 calc_cg_move(fplog
, step
, dd
, state
, tric_dir
, tcm
,
4471 cell_x0
, cell_x1
, limitd
, limit0
, limit1
,
4473 ( thread
*dd
->ncg_home
)/nthread
,
4474 ((thread
+1)*dd
->ncg_home
)/nthread
,
4475 fr
->cutoff_scheme
== ecutsGROUP
? cg_cm
: state
->x
,
4478 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
4481 for (cg
= 0; cg
< dd
->ncg_home
; cg
++)
4486 flag
= mc
& ~DD_FLAG_NRCG
;
4487 mc
= mc
& DD_FLAG_NRCG
;
4490 if (ncg
[mc
]+1 > comm
->cggl_flag_nalloc
[mc
])
4492 comm
->cggl_flag_nalloc
[mc
] = over_alloc_dd(ncg
[mc
]+1);
4493 srenew(comm
->cggl_flag
[mc
], comm
->cggl_flag_nalloc
[mc
]*DD_CGIBS
);
4495 comm
->cggl_flag
[mc
][ncg
[mc
]*DD_CGIBS
] = dd
->index_gl
[cg
];
4496 /* We store the cg size in the lower 16 bits
4497 * and the place where the charge group should go
4498 * in the next 6 bits. This saves some communication volume.
4500 nrcg
= cgindex
[cg
+1] - cgindex
[cg
];
4501 comm
->cggl_flag
[mc
][ncg
[mc
]*DD_CGIBS
+1] = nrcg
| flag
;
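                /* Illustrative example (not part of the original source): for
                 * a charge group of 3 atoms moving forward along one DD
                 * dimension, nrcg = 3 fills the lower 16 bits and the
                 * corresponding DD_FLAG_FW bit is set in the higher bits, so
                 * size and destination travel together in a single int.
                 */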
4507 inc_nrnb(nrnb
, eNR_CGCM
, dd
->nat_home
);
4508 inc_nrnb(nrnb
, eNR_RESETX
, dd
->ncg_home
);
4511 for (i
= 0; i
< dd
->ndim
*2; i
++)
4513 *ncg_moved
+= ncg
[i
];
4530 /* Make sure the communication buffers are large enough */
4531 for (mc
= 0; mc
< dd
->ndim
*2; mc
++)
4533 nvr
= ncg
[mc
] + nat
[mc
]*nvec
;
4534 if (nvr
> comm
->cgcm_state_nalloc
[mc
])
4536 comm
->cgcm_state_nalloc
[mc
] = over_alloc_dd(nvr
);
4537 srenew(comm
->cgcm_state
[mc
], comm
->cgcm_state_nalloc
[mc
]);
4541 switch (fr
->cutoff_scheme
)
4544 /* Recalculating cg_cm might be cheaper than communicating,
4545 * but that could give rise to rounding issues.
4548 compact_and_copy_vec_cg(dd
->ncg_home
, move
, cgindex
,
4549 nvec
, cg_cm
, comm
, bCompact
);
4552 /* Without charge groups we send the moved atom coordinates
4553 * over twice. This is so the code below can be used without
4554 * many conditionals for both for with and without charge groups.
4557 compact_and_copy_vec_cg(dd
->ncg_home
, move
, cgindex
,
4558 nvec
, state
->x
, comm
, FALSE
);
4561 home_pos_cg
-= *ncg_moved
;
4565 gmx_incons("unimplemented");
4571 compact_and_copy_vec_at(dd
->ncg_home
, move
, cgindex
,
4572 nvec
, vec
++, state
->x
, comm
, bCompact
);
4575 compact_and_copy_vec_at(dd
->ncg_home
, move
, cgindex
,
4576 nvec
, vec
++, state
->v
, comm
, bCompact
);
4580 compact_and_copy_vec_at(dd
->ncg_home
, move
, cgindex
,
4581 nvec
, vec
++, state
->sd_X
, comm
, bCompact
);
4585 compact_and_copy_vec_at(dd
->ncg_home
, move
, cgindex
,
4586 nvec
, vec
++, state
->cg_p
, comm
, bCompact
);
4591 compact_ind(dd
->ncg_home
, move
,
4592 dd
->index_gl
, dd
->cgindex
, dd
->gatindex
,
4593 dd
->ga2la
, comm
->bLocalCG
,
4598 if (fr
->cutoff_scheme
== ecutsVERLET
)
4600 moved
= get_moved(comm
, dd
->ncg_home
);
4602 for (k
= 0; k
< dd
->ncg_home
; k
++)
4609 moved
= fr
->ns
->grid
->cell_index
;
4612 clear_and_mark_ind(dd
->ncg_home
, move
,
4613 dd
->index_gl
, dd
->cgindex
, dd
->gatindex
,
4614 dd
->ga2la
, comm
->bLocalCG
,
4618 cginfo_mb
= fr
->cginfo_mb
;
4620 *ncg_stay_home
= home_pos_cg
;
4621 for (d
= 0; d
< dd
->ndim
; d
++)
4626 for (dir
= 0; dir
< (dd
->nc
[dim
] == 2 ? 1 : 2); dir
++)
4629 /* Communicate the cg and atom counts */
4634 fprintf(debug
, "Sending ddim %d dir %d: ncg %d nat %d\n",
4635 d
, dir
, sbuf
[0], sbuf
[1]);
4637 dd_sendrecv_int(dd
, d
, dir
, sbuf
, 2, rbuf
, 2);
4639 if ((ncg_recv
+rbuf
[0])*DD_CGIBS
> comm
->nalloc_int
)
4641 comm
->nalloc_int
= over_alloc_dd((ncg_recv
+rbuf
[0])*DD_CGIBS
);
4642 srenew(comm
->buf_int
, comm
->nalloc_int
);
4645 /* Communicate the charge group indices, sizes and flags */
4646 dd_sendrecv_int(dd
, d
, dir
,
4647 comm
->cggl_flag
[cdd
], sbuf
[0]*DD_CGIBS
,
4648 comm
->buf_int
+ncg_recv
*DD_CGIBS
, rbuf
[0]*DD_CGIBS
);
4650 nvs
= ncg
[cdd
] + nat
[cdd
]*nvec
;
4651 i
= rbuf
[0] + rbuf
[1] *nvec
;
4652 vec_rvec_check_alloc(&comm
->vbuf
, nvr
+i
);
4654 /* Communicate cgcm and state */
4655 dd_sendrecv_rvec(dd
, d
, dir
,
4656 comm
->cgcm_state
[cdd
], nvs
,
4657 comm
->vbuf
.v
+nvr
, i
);
4658 ncg_recv
+= rbuf
[0];
4662 /* Process the received charge groups */
4664 for (cg
= 0; cg
< ncg_recv
; cg
++)
4666 flag
= comm
->buf_int
[cg
*DD_CGIBS
+1];
4668 if (dim
>= npbcdim
&& dd
->nc
[dim
] > 2)
4670 /* No pbc in this dim and more than one domain boundary.
4671 * We do a separate check if a charge group didn't move too far.
4673 if (((flag
& DD_FLAG_FW(d
)) &&
4674 comm
->vbuf
.v
[buf_pos
][dim
] > cell_x1
[dim
]) ||
4675 ((flag
& DD_FLAG_BW(d
)) &&
4676 comm
->vbuf
.v
[buf_pos
][dim
] < cell_x0
[dim
]))
4678 cg_move_error(fplog
, dd
, step
, cg
, dim
,
4679 (flag
& DD_FLAG_FW(d
)) ? 1 : 0,
4680 fr
->cutoff_scheme
== ecutsGROUP
, 0,
4681 comm
->vbuf
.v
[buf_pos
],
4682 comm
->vbuf
.v
[buf_pos
],
4683 comm
->vbuf
.v
[buf_pos
][dim
]);
4690 /* Check which direction this cg should go */
4691 for (d2
= d
+1; (d2
< dd
->ndim
&& mc
== -1); d2
++)
4693 if (dlbIsOn(dd
->comm
))
4695 /* The cell boundaries for dimension d2 are not equal
4696 * for each cell row of the lower dimension(s),
4697 * therefore we might need to redetermine where
4698 * this cg should go.
4701 /* If this cg crosses the box boundary in dimension d2
4702 * we can use the communicated flag, so we do not
4703 * have to worry about pbc.
4705 if (!((dd
->ci
[dim2
] == dd
->nc
[dim2
]-1 &&
4706 (flag
& DD_FLAG_FW(d2
))) ||
4707 (dd
->ci
[dim2
] == 0 &&
4708 (flag
& DD_FLAG_BW(d2
)))))
4710 /* Clear the two flags for this dimension */
4711 flag
&= ~(DD_FLAG_FW(d2
) | DD_FLAG_BW(d2
));
4712 /* Determine the location of this cg
4713 * in lattice coordinates
4715 pos_d
= comm
->vbuf
.v
[buf_pos
][dim2
];
4718 for (d3
= dim2
+1; d3
< DIM
; d3
++)
4721 comm
->vbuf
.v
[buf_pos
][d3
]*tcm
[d3
][dim2
];
4724 /* Check of we are not at the box edge.
4725 * pbc is only handled in the first step above,
4726 * but this check could move over pbc while
4727 * the first step did not due to different rounding.
4729 if (pos_d
>= cell_x1
[dim2
] &&
4730 dd
->ci
[dim2
] != dd
->nc
[dim2
]-1)
4732 flag
|= DD_FLAG_FW(d2
);
4734 else if (pos_d
< cell_x0
[dim2
] &&
4737 flag
|= DD_FLAG_BW(d2
);
4739 comm
->buf_int
[cg
*DD_CGIBS
+1] = flag
;
4742 /* Set to which neighboring cell this cg should go */
4743 if (flag
& DD_FLAG_FW(d2
))
4747 else if (flag
& DD_FLAG_BW(d2
))
4749 if (dd
->nc
[dd
->dim
[d2
]] > 2)
4761 nrcg
= flag
& DD_FLAG_NRCG
;
4764 if (home_pos_cg
+1 > dd
->cg_nalloc
)
4766 dd
->cg_nalloc
= over_alloc_dd(home_pos_cg
+1);
4767 srenew(dd
->index_gl
, dd
->cg_nalloc
);
4768 srenew(dd
->cgindex
, dd
->cg_nalloc
+1);
4770 /* Set the global charge group index and size */
4771 dd
->index_gl
[home_pos_cg
] = comm
->buf_int
[cg
*DD_CGIBS
];
4772 dd
->cgindex
[home_pos_cg
+1] = dd
->cgindex
[home_pos_cg
] + nrcg
;
4773 /* Copy the state from the buffer */
4774 dd_check_alloc_ncg(fr
, state
, f
, home_pos_cg
+1);
4775 if (fr
->cutoff_scheme
== ecutsGROUP
)
4778 copy_rvec(comm
->vbuf
.v
[buf_pos
], cg_cm
[home_pos_cg
]);
4782 /* Set the cginfo */
4783 fr
->cginfo
[home_pos_cg
] = ddcginfo(cginfo_mb
,
4784 dd
->index_gl
[home_pos_cg
]);
4787 comm
->bLocalCG
[dd
->index_gl
[home_pos_cg
]] = TRUE
;
4790 if (home_pos_at
+nrcg
> state
->nalloc
)
4792 dd_realloc_state(state
, f
, home_pos_at
+nrcg
);
4794 for (i
= 0; i
< nrcg
; i
++)
4796 copy_rvec(comm
->vbuf
.v
[buf_pos
++],
4797 state
->x
[home_pos_at
+i
]);
4801 for (i
= 0; i
< nrcg
; i
++)
4803 copy_rvec(comm
->vbuf
.v
[buf_pos
++],
4804 state
->v
[home_pos_at
+i
]);
4809 for (i
= 0; i
< nrcg
; i
++)
4811 copy_rvec(comm
->vbuf
.v
[buf_pos
++],
4812 state
->sd_X
[home_pos_at
+i
]);
4817 for (i
= 0; i
< nrcg
; i
++)
4819 copy_rvec(comm
->vbuf
.v
[buf_pos
++],
4820 state
->cg_p
[home_pos_at
+i
]);
4824 home_pos_at
+= nrcg
;
4828 /* Reallocate the buffers if necessary */
4829 if (ncg
[mc
]+1 > comm
->cggl_flag_nalloc
[mc
])
4831 comm
->cggl_flag_nalloc
[mc
] = over_alloc_dd(ncg
[mc
]+1);
4832 srenew(comm
->cggl_flag
[mc
], comm
->cggl_flag_nalloc
[mc
]*DD_CGIBS
);
4834 nvr
= ncg
[mc
] + nat
[mc
]*nvec
;
4835 if (nvr
+ 1 + nrcg
*nvec
> comm
->cgcm_state_nalloc
[mc
])
4837 comm
->cgcm_state_nalloc
[mc
] = over_alloc_dd(nvr
+ 1 + nrcg
*nvec
);
4838 srenew(comm
->cgcm_state
[mc
], comm
->cgcm_state_nalloc
[mc
]);
4840 /* Copy from the receive to the send buffers */
4841 memcpy(comm
->cggl_flag
[mc
] + ncg
[mc
]*DD_CGIBS
,
4842 comm
->buf_int
+ cg
*DD_CGIBS
,
4843 DD_CGIBS
*sizeof(int));
4844 memcpy(comm
->cgcm_state
[mc
][nvr
],
4845 comm
->vbuf
.v
[buf_pos
],
4846 (1+nrcg
*nvec
)*sizeof(rvec
));
4847 buf_pos
+= 1 + nrcg
*nvec
;
4854 /* With sorting (!bCompact) the indices are now only partially up to date
4855 * and ncg_home and nat_home are not the real count, since there are
4856 * "holes" in the arrays for the charge groups that moved to neighbors.
4858 if (fr
->cutoff_scheme
== ecutsVERLET
)
4860 moved
= get_moved(comm
, home_pos_cg
);
4862 for (i
= dd
->ncg_home
; i
< home_pos_cg
; i
++)
4867 dd
->ncg_home
= home_pos_cg
;
4868 dd
->nat_home
= home_pos_at
;
4873 "Finished repartitioning: cgs moved out %d, new home %d\n",
4874 *ncg_moved
, dd
->ncg_home
-*ncg_moved
);
void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
{
    /* Note that the cycles value can be incorrect, either 0 or some
     * extremely large value, when our thread migrated to another core
     * with an unsynchronized cycle counter. If this happens less often
     * than once per nstlist steps, this will not cause issues, since
     * we later subtract the maximum value from the sum over nstlist steps.
     * A zero count will slightly lower the total, but that's a small effect.
     * Note that the main purpose of the subtraction of the maximum value
     * is to avoid throwing off the load balancing when stalls occur due to
     * e.g. system activity or network congestion.
     */
    dd->comm->cycl[ddCycl] += cycles;
    dd->comm->cycl_n[ddCycl]++;
    if (cycles > dd->comm->cycl_max[ddCycl])
    {
        dd->comm->cycl_max[ddCycl] = cycles;
    }
}
static double force_flop_count(t_nrnb *nrnb)
{
    int         i;
    double      sum;
    const char *name;

    sum = 0;
    for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
    {
        /* To get closer to the real timings, we halve the count
         * for the normal loops and again halve it for water loops.
         */
        name = nrnb_str(i);
        if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
        {
            sum += nrnb->n[i]*0.25*cost_nrnb(i);
        }
        else
        {
            sum += nrnb->n[i]*0.50*cost_nrnb(i);
        }
    }
    for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
    {
        name = nrnb_str(i);
        if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
        {
            sum += nrnb->n[i]*cost_nrnb(i);
        }
    }
    for (i = eNR_BONDS; i <= eNR_WALLS; i++)
    {
        sum += nrnb->n[i]*cost_nrnb(i);
    }

    return sum;
}

void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
{
    if (dd->comm->eFlop)
    {
        dd->comm->flop -= force_flop_count(nrnb);
    }
}

void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
{
    if (dd->comm->eFlop)
    {
        dd->comm->flop += force_flop_count(nrnb);
    }
}

static void clear_dd_cycle_counts(gmx_domdec_t *dd)
{
    int i;

    for (i = 0; i < ddCyclNr; i++)
    {
        dd->comm->cycl[i]     = 0;
        dd->comm->cycl_n[i]   = 0;
        dd->comm->cycl_max[i] = 0;
    }
    dd->comm->flop   = 0;
    dd->comm->flop_n = 0;
}
4967 static void get_load_distribution(gmx_domdec_t
*dd
, gmx_wallcycle_t wcycle
)
4969 gmx_domdec_comm_t
*comm
;
4970 domdec_load_t
*load
;
4971 domdec_root_t
*root
= NULL
;
4973 float cell_frac
= 0, sbuf
[DD_NLOAD_MAX
];
4978 fprintf(debug
, "get_load_distribution start\n");
4981 wallcycle_start(wcycle
, ewcDDCOMMLOAD
);
4985 bSepPME
= (dd
->pme_nodeid
>= 0);
4987 if (dd
->ndim
== 0 && bSepPME
)
4989 /* Without decomposition, but with PME nodes, we need the load */
4990 comm
->load
[0].mdf
= comm
->cycl
[ddCyclPPduringPME
];
4991 comm
->load
[0].pme
= comm
->cycl
[ddCyclPME
];
4994 for (d
= dd
->ndim
-1; d
>= 0; d
--)
4997 /* Check if we participate in the communication in this dimension */
4998 if (d
== dd
->ndim
-1 ||
4999 (dd
->ci
[dd
->dim
[d
+1]] == 0 && dd
->ci
[dd
->dim
[dd
->ndim
-1]] == 0))
5001 load
= &comm
->load
[d
];
5002 if (dlbIsOn(dd
->comm
))
5004 cell_frac
= comm
->cell_f1
[d
] - comm
->cell_f0
[d
];
5007 if (d
== dd
->ndim
-1)
5009 sbuf
[pos
++] = dd_force_load(comm
);
5010 sbuf
[pos
++] = sbuf
[0];
5011 if (dlbIsOn(dd
->comm
))
5013 sbuf
[pos
++] = sbuf
[0];
5014 sbuf
[pos
++] = cell_frac
;
5017 sbuf
[pos
++] = comm
->cell_f_max0
[d
];
5018 sbuf
[pos
++] = comm
->cell_f_min1
[d
];
5023 sbuf
[pos
++] = comm
->cycl
[ddCyclPPduringPME
];
5024 sbuf
[pos
++] = comm
->cycl
[ddCyclPME
];
5029 sbuf
[pos
++] = comm
->load
[d
+1].sum
;
5030 sbuf
[pos
++] = comm
->load
[d
+1].max
;
5031 if (dlbIsOn(dd
->comm
))
5033 sbuf
[pos
++] = comm
->load
[d
+1].sum_m
;
5034 sbuf
[pos
++] = comm
->load
[d
+1].cvol_min
*cell_frac
;
5035 sbuf
[pos
++] = comm
->load
[d
+1].flags
;
5038 sbuf
[pos
++] = comm
->cell_f_max0
[d
];
5039 sbuf
[pos
++] = comm
->cell_f_min1
[d
];
5044 sbuf
[pos
++] = comm
->load
[d
+1].mdf
;
5045 sbuf
[pos
++] = comm
->load
[d
+1].pme
;
5049 /* Communicate a row in DD direction d.
5050 * The communicators are setup such that the root always has rank 0.
5053 MPI_Gather(sbuf
, load
->nload
*sizeof(float), MPI_BYTE
,
5054 load
->load
, load
->nload
*sizeof(float), MPI_BYTE
,
5055 0, comm
->mpi_comm_load
[d
]);
5057 if (dd
->ci
[dim
] == dd
->master_ci
[dim
])
5059 /* We are the root, process this row */
5062 root
= comm
->root
[d
];
5072 for (i
= 0; i
< dd
->nc
[dim
]; i
++)
5074 load
->sum
+= load
->load
[pos
++];
5075 load
->max
= std::max(load
->max
, load
->load
[pos
]);
5077 if (dlbIsOn(dd
->comm
))
5081 /* This direction could not be load balanced properly,
5082 * therefore we need to use the maximum iso the average load.
5084 load
->sum_m
= std::max(load
->sum_m
, load
->load
[pos
]);
5088 load
->sum_m
+= load
->load
[pos
];
5091 load
->cvol_min
= std::min(load
->cvol_min
, load
->load
[pos
]);
5095 load
->flags
= (int)(load
->load
[pos
++] + 0.5);
5099 root
->cell_f_max0
[i
] = load
->load
[pos
++];
5100 root
->cell_f_min1
[i
] = load
->load
[pos
++];
5105 load
->mdf
= std::max(load
->mdf
, load
->load
[pos
]);
5107 load
->pme
= std::max(load
->pme
, load
->load
[pos
]);
5111 if (dlbIsOn(comm
) && root
->bLimited
)
5113 load
->sum_m
*= dd
->nc
[dim
];
5114 load
->flags
|= (1<<d
);
5122 comm
->nload
+= dd_load_count(comm
);
5123 comm
->load_step
+= comm
->cycl
[ddCyclStep
];
5124 comm
->load_sum
+= comm
->load
[0].sum
;
5125 comm
->load_max
+= comm
->load
[0].max
;
5128 for (d
= 0; d
< dd
->ndim
; d
++)
5130 if (comm
->load
[0].flags
& (1<<d
))
5132 comm
->load_lim
[d
]++;
5138 comm
->load_mdf
+= comm
->load
[0].mdf
;
5139 comm
->load_pme
+= comm
->load
[0].pme
;
5143 wallcycle_stop(wcycle
, ewcDDCOMMLOAD
);
5147 fprintf(debug
, "get_load_distribution finished\n");
static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
{
    /* Return the relative performance loss on the total run time
     * due to the force calculation load imbalance.
     */
    if (dd->comm->nload > 0 && dd->comm->load_step > 0)
    {
        return
            (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
            (dd->comm->load_step*dd->nnodes);
    }
    else
    {
        return 0;
    }
}
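/* Worked example (illustrative only, not from the original source): with 4 PP
 * ranks, an accumulated per-step maximum force load of 120 cycles, an
 * accumulated load sum of 400 cycles, and load_step = 400 total cycles, the
 * loss is (120*4 - 400)/(400*4) = 0.05, i.e. 5 % of the run time is spent
 * waiting due to force-load imbalance.
 */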
5168 static void print_dd_load_av(FILE *fplog
, gmx_domdec_t
*dd
)
5171 int npp
, npme
, nnodes
, d
, limp
;
5172 float imbal
, pme_f_ratio
, lossf
= 0, lossp
= 0;
5174 gmx_domdec_comm_t
*comm
;
5177 if (DDMASTER(dd
) && comm
->nload
> 0)
5180 npme
= (dd
->pme_nodeid
>= 0) ? comm
->npmenodes
: 0;
5181 nnodes
= npp
+ npme
;
5182 if (dd
->nnodes
> 1 && comm
->load_sum
> 0)
5184 imbal
= comm
->load_max
*npp
/comm
->load_sum
- 1;
5185 lossf
= dd_force_imb_perf_loss(dd
);
5186 sprintf(buf
, " Average load imbalance: %.1f %%\n", imbal
*100);
5187 fprintf(fplog
, "%s", buf
);
5188 fprintf(stderr
, "\n");
5189 fprintf(stderr
, "%s", buf
);
5190 sprintf(buf
, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf
*100);
5191 fprintf(fplog
, "%s", buf
);
5192 fprintf(stderr
, "%s", buf
);
5197 sprintf(buf
, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5198 for (d
= 0; d
< dd
->ndim
; d
++)
5200 limp
= (200*comm
->load_lim
[d
]+1)/(2*comm
->nload
);
5201 sprintf(buf
+strlen(buf
), " %c %d %%", dim2char(dd
->dim
[d
]), limp
);
5207 sprintf(buf
+strlen(buf
), "\n");
5208 fprintf(fplog
, "%s", buf
);
5209 fprintf(stderr
, "%s", buf
);
5211 if (npme
> 0 && comm
->load_mdf
> 0 && comm
->load_step
> 0)
5213 pme_f_ratio
= comm
->load_pme
/comm
->load_mdf
;
5214 lossp
= (comm
->load_pme
- comm
->load_mdf
)/comm
->load_step
;
5217 lossp
*= (float)npme
/(float)nnodes
;
5221 lossp
*= (float)npp
/(float)nnodes
;
5223 sprintf(buf
, " Average PME mesh/force load: %5.3f\n", pme_f_ratio
);
5224 fprintf(fplog
, "%s", buf
);
5225 fprintf(stderr
, "%s", buf
);
5226 sprintf(buf
, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp
)*100);
5227 fprintf(fplog
, "%s", buf
);
5228 fprintf(stderr
, "%s", buf
);
5230 fprintf(fplog
, "\n");
5231 fprintf(stderr
, "\n");
5233 if (lossf
>= DD_PERF_LOSS_WARN
)
5236 "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5237 " in the domain decomposition.\n", lossf
*100);
5240 sprintf(buf
+strlen(buf
), " You might want to use dynamic load balancing (option -dlb.)\n");
5244 sprintf(buf
+strlen(buf
), " You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5246 fprintf(fplog
, "%s\n", buf
);
5247 fprintf(stderr
, "%s\n", buf
);
5249 if (npme
> 0 && fabs(lossp
) >= DD_PERF_LOSS_WARN
)
5252 "NOTE: %.1f %% performance was lost because the PME ranks\n"
5253 " had %s work to do than the PP ranks.\n"
5254 " You might want to %s the number of PME ranks\n"
5255 " or %s the cut-off and the grid spacing.\n",
5257 (lossp
< 0) ? "less" : "more",
5258 (lossp
< 0) ? "decrease" : "increase",
5259 (lossp
< 0) ? "decrease" : "increase");
5260 fprintf(fplog
, "%s\n", buf
);
5261 fprintf(stderr
, "%s\n", buf
);
5266 static float dd_vol_min(gmx_domdec_t
*dd
)
5268 return dd
->comm
->load
[0].cvol_min
*dd
->nnodes
;
5271 static gmx_bool
dd_load_flags(gmx_domdec_t
*dd
)
5273 return dd
->comm
->load
[0].flags
;
5276 static float dd_f_imbal(gmx_domdec_t
*dd
)
5278 if (dd
->comm
->load
[0].sum
> 0)
5280 return dd
->comm
->load
[0].max
*dd
->nnodes
/dd
->comm
->load
[0].sum
- 1.0f
;
5284 /* Something is wrong in the cycle counting, report no load imbalance */
float dd_pme_f_ratio(gmx_domdec_t *dd)
{
    /* Should only be called on the DD master rank */
    assert(DDMASTER(dd));

    if (dd->comm->load[0].mdf > 0 && dd->comm->cycl_n[ddCyclPME] > 0)
    {
        return dd->comm->load[0].pme/dd->comm->load[0].mdf;
    }
    else
    {
        return -1;
    }
}
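/* A ratio above 1 means the PME mesh part took longer than the
 * particle-particle force part; this is the quantity reported as
 * "pme mesh/force" in the load output below.
 */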
static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
{
    char buf[STRLEN];
    int  flags, d;

    flags = dd_load_flags(dd);
    if (flags)
    {
        fprintf(fplog,
                "DD load balancing is limited by minimum cell size in dimension");
        for (d = 0; d < dd->ndim; d++)
        {
            if (flags & (1<<d))
            {
                fprintf(fplog, " %c", dim2char(dd->dim[d]));
            }
        }
        fprintf(fplog, "\n");
    }
    fprintf(fplog, "DD step %s", gmx_step_str(step, buf));
    if (dlbIsOn(dd->comm))
    {
        fprintf(fplog, " vol min/aver %5.3f%c",
                dd_vol_min(dd), flags ? '!' : ' ');
    }
    if (dd->nnodes > 1)
    {
        fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
    }
    if (dd->comm->cycl_n[ddCyclPME])
    {
        fprintf(fplog, " pme mesh/force %5.3f", dd_pme_f_ratio(dd));
    }
    fprintf(fplog, "\n\n");
}
static void dd_print_load_verbose(gmx_domdec_t *dd)
{
    if (dlbIsOn(dd->comm))
    {
        fprintf(stderr, "vol %4.2f%c ",
                dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
    }
    if (dd->nnodes > 1)
    {
        fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
    }
    if (dd->comm->cycl_n[ddCyclPME])
    {
        fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
    }
}
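/* With verbose output this produces fields such as
 * "vol 0.87! imb F  5% pme/F 0.89" on the step line; the '!' marks steps
 * on which dynamic load balancing was limited by the minimum cell size.
 */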
5358 static void make_load_communicator(gmx_domdec_t
*dd
, int dim_ind
, ivec loc
)
5363 domdec_root_t
*root
;
5364 gmx_bool bPartOfGroup
= FALSE
;
5366 dim
= dd
->dim
[dim_ind
];
5367 copy_ivec(loc
, loc_c
);
5368 for (i
= 0; i
< dd
->nc
[dim
]; i
++)
5371 rank
= dd_index(dd
->nc
, loc_c
);
5372 if (rank
== dd
->rank
)
5374 /* This process is part of the group */
5375 bPartOfGroup
= TRUE
;
5378 MPI_Comm_split(dd
->mpi_comm_all
, bPartOfGroup
? 0 : MPI_UNDEFINED
, dd
->rank
,
5382 dd
->comm
->mpi_comm_load
[dim_ind
] = c_row
;
5383 if (dd
->comm
->dlbState
!= edlbsOffForever
)
5385 if (dd
->ci
[dim
] == dd
->master_ci
[dim
])
5387 /* This is the root process of this row */
5388 snew(dd
->comm
->root
[dim_ind
], 1);
5389 root
= dd
->comm
->root
[dim_ind
];
5390 snew(root
->cell_f
, DD_CELL_F_SIZE(dd
, dim_ind
));
5391 snew(root
->old_cell_f
, dd
->nc
[dim
]+1);
5392 snew(root
->bCellMin
, dd
->nc
[dim
]);
5395 snew(root
->cell_f_max0
, dd
->nc
[dim
]);
5396 snew(root
->cell_f_min1
, dd
->nc
[dim
]);
5397 snew(root
->bound_min
, dd
->nc
[dim
]);
5398 snew(root
->bound_max
, dd
->nc
[dim
]);
5400 snew(root
->buf_ncd
, dd
->nc
[dim
]);
5404 /* This is not a root process, we only need to receive cell_f */
5405 snew(dd
->comm
->cell_f_row
, DD_CELL_F_SIZE(dd
, dim_ind
));
5408 if (dd
->ci
[dim
] == dd
->master_ci
[dim
])
5410 snew(dd
->comm
->load
[dim_ind
].load
, dd
->nc
[dim
]*DD_NLOAD_MAX
);
void dd_setup_dlb_resource_sharing(t_commrec           gmx_unused *cr,
                                   const gmx_hw_info_t gmx_unused *hwinfo,
                                   const gmx_hw_opt_t  gmx_unused *hw_opt)
{
    int           physicalnode_id_hash;
    int           gpu_id;
    gmx_domdec_t *dd;
    MPI_Comm      mpi_comm_pp_physicalnode;

    if (!(cr->duty & DUTY_PP) || hw_opt->gpu_opt.n_dev_use == 0)
    {
        /* Only PP nodes (currently) use GPUs.
         * If we don't have GPUs, there are no resources to share.
         */
        return;
    }

    physicalnode_id_hash = gmx_physicalnode_id_hash();

    dd = cr->dd;

    gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);

    if (debug)
    {
        fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
        fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
                dd->rank, physicalnode_id_hash, gpu_id);
    }
    /* Split the PP communicator over the physical nodes */
    /* TODO: See if we should store this (before), as it's also used
     * for the nodecomm summation.
     */
    MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
                   &mpi_comm_pp_physicalnode);
    MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
                   &dd->comm->mpi_comm_gpu_shared);
    MPI_Comm_free(&mpi_comm_pp_physicalnode);
    MPI_Comm_size(dd->comm->mpi_comm_gpu_shared,
                  &dd->comm->nrank_gpu_shared);

    if (debug)
    {
        fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
    }

    /* Note that some ranks could share a GPU, while others don't */

    if (dd->comm->nrank_gpu_shared == 1)
    {
        MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
    }
}
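/* The resulting mpi_comm_gpu_shared groups the PP ranks that drive the same
 * GPU on the same physical node; nrank_gpu_shared records the group size,
 * which the load measurement can later use so that ranks waiting on the
 * same device are treated consistently.
 */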
static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
{
    int  dim0, dim1, i, j;
    ivec loc;

    if (debug)
    {
        fprintf(debug, "Making load communicators\n");
    }

    snew(dd->comm->load,          std::max(dd->ndim, 1));
    snew(dd->comm->mpi_comm_load, std::max(dd->ndim, 1));

    clear_ivec(loc);
    make_load_communicator(dd, 0, loc);
    if (dd->ndim > 1)
    {
        dim0 = dd->dim[0];
        for (i = 0; i < dd->nc[dim0]; i++)
        {
            loc[dim0] = i;
            make_load_communicator(dd, 1, loc);
        }
    }
    if (dd->ndim > 2)
    {
        dim0 = dd->dim[0];
        for (i = 0; i < dd->nc[dim0]; i++)
        {
            loc[dim0] = i;
            dim1 = dd->dim[1];
            for (j = 0; j < dd->nc[dim1]; j++)
            {
                loc[dim1] = j;
                make_load_communicator(dd, 2, loc);
            }
        }
    }

    if (debug)
    {
        fprintf(debug, "Finished making load communicators\n");
    }
}
5523 void setup_dd_grid(FILE *fplog
, gmx_domdec_t
*dd
)
5525 int d
, dim
, i
, j
, m
;
5528 ivec dd_zp
[DD_MAXIZONE
];
5529 gmx_domdec_zones_t
*zones
;
5530 gmx_domdec_ns_ranges_t
*izone
;
5532 for (d
= 0; d
< dd
->ndim
; d
++)
5535 copy_ivec(dd
->ci
, tmp
);
5536 tmp
[dim
] = (tmp
[dim
] + 1) % dd
->nc
[dim
];
5537 dd
->neighbor
[d
][0] = ddcoord2ddnodeid(dd
, tmp
);
5538 copy_ivec(dd
->ci
, tmp
);
5539 tmp
[dim
] = (tmp
[dim
] - 1 + dd
->nc
[dim
]) % dd
->nc
[dim
];
5540 dd
->neighbor
[d
][1] = ddcoord2ddnodeid(dd
, tmp
);
5543 fprintf(debug
, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5546 dd
->neighbor
[d
][1]);
5552 fprintf(fplog
, "\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5554 dd
->nc
[XX
], dd
->nc
[YY
], dd
->nc
[ZZ
],
5555 dd
->ci
[XX
], dd
->ci
[YY
], dd
->ci
[ZZ
]);
5562 for (i
= 0; i
< nzonep
; i
++)
5564 copy_ivec(dd_zp3
[i
], dd_zp
[i
]);
5570 for (i
= 0; i
< nzonep
; i
++)
5572 copy_ivec(dd_zp2
[i
], dd_zp
[i
]);
5578 for (i
= 0; i
< nzonep
; i
++)
5580 copy_ivec(dd_zp1
[i
], dd_zp
[i
]);
5586 for (i
= 0; i
< nzonep
; i
++)
5588 copy_ivec(dd_zp0
[i
], dd_zp
[i
]);
5592 gmx_fatal(FARGS
, "Can only do 1, 2 or 3D domain decomposition");
5597 zones
= &dd
->comm
->zones
;
5599 for (i
= 0; i
< nzone
; i
++)
5602 clear_ivec(zones
->shift
[i
]);
5603 for (d
= 0; d
< dd
->ndim
; d
++)
5605 zones
->shift
[i
][dd
->dim
[d
]] = dd_zo
[i
][m
++];
5610 for (i
= 0; i
< nzone
; i
++)
5612 for (d
= 0; d
< DIM
; d
++)
5614 s
[d
] = dd
->ci
[d
] - zones
->shift
[i
][d
];
5619 else if (s
[d
] >= dd
->nc
[d
])
5625 zones
->nizone
= nzonep
;
5626 for (i
= 0; i
< zones
->nizone
; i
++)
5628 if (dd_zp
[i
][0] != i
)
5630 gmx_fatal(FARGS
, "Internal inconsistency in the dd grid setup");
5632 izone
= &zones
->izone
[i
];
5633 izone
->j0
= dd_zp
[i
][1];
5634 izone
->j1
= dd_zp
[i
][2];
5635 for (dim
= 0; dim
< DIM
; dim
++)
5637 if (dd
->nc
[dim
] == 1)
5639 /* All shifts should be allowed */
5640 izone
->shift0
[dim
] = -1;
5641 izone
->shift1
[dim
] = 1;
5646 izone->shift0[d] = 0;
5647 izone->shift1[d] = 0;
5648 for(j=izone->j0; j<izone->j1; j++) {
5649 if (dd->shift[j][d] > dd->shift[i][d])
5650 izone->shift0[d] = -1;
5651 if (dd->shift[j][d] < dd->shift[i][d])
5652 izone->shift1[d] = 1;
5658 /* Assume the shift are not more than 1 cell */
5659 izone
->shift0
[dim
] = 1;
5660 izone
->shift1
[dim
] = -1;
5661 for (j
= izone
->j0
; j
< izone
->j1
; j
++)
5663 shift_diff
= zones
->shift
[j
][dim
] - zones
->shift
[i
][dim
];
5664 if (shift_diff
< izone
->shift0
[dim
])
5666 izone
->shift0
[dim
] = shift_diff
;
5668 if (shift_diff
> izone
->shift1
[dim
])
5670 izone
->shift1
[dim
] = shift_diff
;
5677 if (dd
->comm
->dlbState
!= edlbsOffForever
)
5679 snew(dd
->comm
->root
, dd
->ndim
);
5682 if (dd
->comm
->bRecordLoad
)
5684 make_load_communicators(dd
);
5688 static void make_pp_communicator(FILE *fplog
, t_commrec
*cr
, int gmx_unused reorder
)
5694 gmx_domdec_comm_t
*comm
;
5701 if (comm
->bCartesianPP
)
5703 /* Set up cartesian communication for the particle-particle part */
5706 fprintf(fplog
, "Will use a Cartesian communicator: %d x %d x %d\n",
5707 dd
->nc
[XX
], dd
->nc
[YY
], dd
->nc
[ZZ
]);
5710 for (int i
= 0; i
< DIM
; i
++)
5714 MPI_Cart_create(cr
->mpi_comm_mygroup
, DIM
, dd
->nc
, periods
, reorder
,
5716 /* We overwrite the old communicator with the new cartesian one */
5717 cr
->mpi_comm_mygroup
= comm_cart
;
5720 dd
->mpi_comm_all
= cr
->mpi_comm_mygroup
;
5721 MPI_Comm_rank(dd
->mpi_comm_all
, &dd
->rank
);
5723 if (comm
->bCartesianPP_PME
)
5725 /* Since we want to use the original cartesian setup for sim,
5726 * and not the one after split, we need to make an index.
5728 snew(comm
->ddindex2ddnodeid
, dd
->nnodes
);
5729 comm
->ddindex2ddnodeid
[dd_index(dd
->nc
, dd
->ci
)] = dd
->rank
;
5730 gmx_sumi(dd
->nnodes
, comm
->ddindex2ddnodeid
, cr
);
5731 /* Get the rank of the DD master,
5732 * above we made sure that the master node is a PP node.
5742 MPI_Allreduce(&rank
, &dd
->masterrank
, 1, MPI_INT
, MPI_SUM
, dd
->mpi_comm_all
);
5744 else if (comm
->bCartesianPP
)
5746 if (cr
->npmenodes
== 0)
5748 /* The PP communicator is also
5749 * the communicator for this simulation
5751 cr
->mpi_comm_mysim
= cr
->mpi_comm_mygroup
;
5753 cr
->nodeid
= dd
->rank
;
5755 MPI_Cart_coords(dd
->mpi_comm_all
, dd
->rank
, DIM
, dd
->ci
);
5757 /* We need to make an index to go from the coordinates
5758 * to the nodeid of this simulation.
5760 snew(comm
->ddindex2simnodeid
, dd
->nnodes
);
5761 snew(buf
, dd
->nnodes
);
5762 if (cr
->duty
& DUTY_PP
)
5764 buf
[dd_index(dd
->nc
, dd
->ci
)] = cr
->sim_nodeid
;
5766 /* Communicate the ddindex to simulation nodeid index */
5767 MPI_Allreduce(buf
, comm
->ddindex2simnodeid
, dd
->nnodes
, MPI_INT
, MPI_SUM
,
5768 cr
->mpi_comm_mysim
);
5771 /* Determine the master coordinates and rank.
5772 * The DD master should be the same node as the master of this sim.
5774 for (int i
= 0; i
< dd
->nnodes
; i
++)
5776 if (comm
->ddindex2simnodeid
[i
] == 0)
5778 ddindex2xyz(dd
->nc
, i
, dd
->master_ci
);
5779 MPI_Cart_rank(dd
->mpi_comm_all
, dd
->master_ci
, &dd
->masterrank
);
5784 fprintf(debug
, "The master rank is %d\n", dd
->masterrank
);
5789 /* No Cartesian communicators */
5790 /* We use the rank in dd->comm->all as DD index */
5791 ddindex2xyz(dd
->nc
, dd
->rank
, dd
->ci
);
5792 /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5794 clear_ivec(dd
->master_ci
);
5801 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
5802 dd
->rank
, dd
->ci
[XX
], dd
->ci
[YY
], dd
->ci
[ZZ
]);
5807 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
5808 dd
->rank
, dd
->ci
[XX
], dd
->ci
[YY
], dd
->ci
[ZZ
]);
static void receive_ddindex2simnodeid(t_commrec gmx_unused *cr)
{
    gmx_domdec_t      *dd;
    gmx_domdec_comm_t *comm;
    int               *buf;

    dd   = cr->dd;
    comm = dd->comm;

    if (!comm->bCartesianPP_PME && comm->bCartesianPP)
    {
        snew(comm->ddindex2simnodeid, dd->nnodes);
        snew(buf, dd->nnodes);
        if (cr->duty & DUTY_PP)
        {
            buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
        }
        /* Communicate the ddindex to simulation nodeid index */
        MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
                      cr->mpi_comm_mysim);
        sfree(buf);
    }
}
static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
                                                     int ncg, int natoms)
{
    gmx_domdec_master_t *ma;
    int                  i;

    snew(ma, 1);

    snew(ma->ncg, dd->nnodes);
    snew(ma->index, dd->nnodes+1);
    snew(ma->cg, ncg);
    snew(ma->nat, dd->nnodes);
    snew(ma->ibuf, dd->nnodes*2);
    snew(ma->cell_x, DIM);
    for (i = 0; i < DIM; i++)
    {
        snew(ma->cell_x[i], dd->nc[i]+1);
    }

    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
    {
        ma->vbuf = NULL;
    }
    else
    {
        snew(ma->vbuf, natoms);
    }

    return ma;
}
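/* The master struct holds the per-rank charge-group counts, atom counts and
 * cell boundaries used when distributing and collecting the global state.
 * The coordinate buffer is only allocated up front when there are more ranks
 * than GMX_DD_NNODES_SENDRECV, i.e. when collective communication is used
 * instead of per-rank send/receive.
 */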
5869 static void split_communicator(FILE *fplog
, t_commrec
*cr
, int gmx_unused dd_node_order
,
5870 int gmx_unused reorder
)
5873 gmx_domdec_comm_t
*comm
;
5883 if (comm
->bCartesianPP
)
5885 for (i
= 1; i
< DIM
; i
++)
5887 bDiv
[i
] = ((cr
->npmenodes
*dd
->nc
[i
]) % (dd
->nnodes
) == 0);
5889 if (bDiv
[YY
] || bDiv
[ZZ
])
5891 comm
->bCartesianPP_PME
= TRUE
;
5892 /* If we have 2D PME decomposition, which is always in x+y,
5893 * we stack the PME only nodes in z.
5894 * Otherwise we choose the direction that provides the thinnest slab
5895 * of PME only nodes as this will have the least effect
5896 * on the PP communication.
5897 * But for the PME communication the opposite might be better.
5899 if (bDiv
[ZZ
] && (comm
->npmenodes_y
> 1 ||
5901 dd
->nc
[YY
] > dd
->nc
[ZZ
]))
5903 comm
->cartpmedim
= ZZ
;
5907 comm
->cartpmedim
= YY
;
5909 comm
->ntot
[comm
->cartpmedim
]
5910 += (cr
->npmenodes
*dd
->nc
[comm
->cartpmedim
])/dd
->nnodes
;
5914 fprintf(fplog
, "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr
->npmenodes
, dd
->nc
[XX
], dd
->nc
[YY
], dd
->nc
[XX
], dd
->nc
[ZZ
]);
5916 "Will not use a Cartesian communicator for PP <-> PME\n\n");
5921 if (comm
->bCartesianPP_PME
)
5928 fprintf(fplog
, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm
->ntot
[XX
], comm
->ntot
[YY
], comm
->ntot
[ZZ
]);
5931 for (i
= 0; i
< DIM
; i
++)
5935 MPI_Cart_create(cr
->mpi_comm_mysim
, DIM
, comm
->ntot
, periods
, reorder
,
5937 MPI_Comm_rank(comm_cart
, &rank
);
5938 if (MASTER(cr
) && rank
!= 0)
5940 gmx_fatal(FARGS
, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5943 /* With this assigment we loose the link to the original communicator
5944 * which will usually be MPI_COMM_WORLD, unless have multisim.
5946 cr
->mpi_comm_mysim
= comm_cart
;
5947 cr
->sim_nodeid
= rank
;
5949 MPI_Cart_coords(cr
->mpi_comm_mysim
, cr
->sim_nodeid
, DIM
, dd
->ci
);
5953 fprintf(fplog
, "Cartesian rank %d, coordinates %d %d %d\n\n",
5954 cr
->sim_nodeid
, dd
->ci
[XX
], dd
->ci
[YY
], dd
->ci
[ZZ
]);
5957 if (dd
->ci
[comm
->cartpmedim
] < dd
->nc
[comm
->cartpmedim
])
5961 if (cr
->npmenodes
== 0 ||
5962 dd
->ci
[comm
->cartpmedim
] >= dd
->nc
[comm
->cartpmedim
])
5964 cr
->duty
= DUTY_PME
;
5967 /* Split the sim communicator into PP and PME only nodes */
5968 MPI_Comm_split(cr
->mpi_comm_mysim
,
5970 dd_index(comm
->ntot
, dd
->ci
),
5971 &cr
->mpi_comm_mygroup
);
5975 switch (dd_node_order
)
5980 fprintf(fplog
, "Order of the ranks: PP first, PME last\n");
5983 case ddnoINTERLEAVE
:
5984 /* Interleave the PP-only and PME-only nodes,
5985 * as on clusters with dual-core machines this will double
5986 * the communication bandwidth of the PME processes
5987 * and thus speed up the PP <-> PME and inter PME communication.
5991 fprintf(fplog
, "Interleaving PP and PME ranks\n");
5993 comm
->pmenodes
= dd_pmenodes(cr
);
5998 gmx_fatal(FARGS
, "Unknown dd_node_order=%d", dd_node_order
);
6001 if (dd_simnode2pmenode(cr
, cr
->sim_nodeid
) == -1)
6003 cr
->duty
= DUTY_PME
;
6010 /* Split the sim communicator into PP and PME only nodes */
6011 MPI_Comm_split(cr
->mpi_comm_mysim
,
6014 &cr
->mpi_comm_mygroup
);
6015 MPI_Comm_rank(cr
->mpi_comm_mygroup
, &cr
->nodeid
);
6021 fprintf(fplog
, "This rank does only %s work.\n\n",
6022 (cr
->duty
& DUTY_PP
) ? "particle-particle" : "PME-mesh");
6026 void make_dd_communicators(FILE *fplog
, t_commrec
*cr
, int dd_node_order
)
6029 gmx_domdec_comm_t
*comm
;
6035 copy_ivec(dd
->nc
, comm
->ntot
);
6037 comm
->bCartesianPP
= (dd_node_order
== ddnoCARTESIAN
);
6038 comm
->bCartesianPP_PME
= FALSE
;
6040 /* Reorder the nodes by default. This might change the MPI ranks.
6041 * Real reordering is only supported on very few architectures,
6042 * Blue Gene is one of them.
6044 CartReorder
= (getenv("GMX_NO_CART_REORDER") == NULL
);
6046 if (cr
->npmenodes
> 0)
6048 /* Split the communicator into a PP and PME part */
6049 split_communicator(fplog
, cr
, dd_node_order
, CartReorder
);
6050 if (comm
->bCartesianPP_PME
)
6052 /* We (possibly) reordered the nodes in split_communicator,
6053 * so it is no longer required in make_pp_communicator.
6055 CartReorder
= FALSE
;
6060 /* All nodes do PP and PME */
6062 /* We do not require separate communicators */
6063 cr
->mpi_comm_mygroup
= cr
->mpi_comm_mysim
;
6067 if (cr
->duty
& DUTY_PP
)
6069 /* Copy or make a new PP communicator */
6070 make_pp_communicator(fplog
, cr
, CartReorder
);
6074 receive_ddindex2simnodeid(cr
);
6077 if (!(cr
->duty
& DUTY_PME
))
6079 /* Set up the commnuication to our PME node */
6080 dd
->pme_nodeid
= dd_simnode2pmenode(cr
, cr
->sim_nodeid
);
6081 dd
->pme_receive_vir_ener
= receive_vir_ener(cr
);
6084 fprintf(debug
, "My pme_nodeid %d receive ener %d\n",
6085 dd
->pme_nodeid
, dd
->pme_receive_vir_ener
);
6090 dd
->pme_nodeid
= -1;
6095 dd
->ma
= init_gmx_domdec_master_t(dd
,
6097 comm
->cgs_gl
.index
[comm
->cgs_gl
.nr
]);
6101 static real
*get_slb_frac(FILE *fplog
, const char *dir
, int nc
, const char *size_string
)
6103 real
*slb_frac
, tot
;
6108 if (nc
> 1 && size_string
!= NULL
)
6112 fprintf(fplog
, "Using static load balancing for the %s direction\n",
6117 for (i
= 0; i
< nc
; i
++)
6120 sscanf(size_string
, "%20lf%n", &dbl
, &n
);
6123 gmx_fatal(FARGS
, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir
, size_string
);
6132 fprintf(fplog
, "Relative cell sizes:");
6134 for (i
= 0; i
< nc
; i
++)
6139 fprintf(fplog
, " %5.3f", slb_frac
[i
]);
6144 fprintf(fplog
, "\n");
static int multi_body_bondeds_count(gmx_mtop_t *mtop)
{
    int                  n, nmol, ftype;
    gmx_mtop_ilistloop_t iloop;
    t_ilist             *il;

    n     = 0;
    iloop = gmx_mtop_ilistloop_init(mtop);
    while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
    {
        for (ftype = 0; ftype < F_NRE; ftype++)
        {
            if ((interaction_function[ftype].flags & IF_BOND) &&
                NRAL(ftype) > 2)
            {
                n += nmol*il[ftype].nr/(1 + NRAL(ftype));
            }
        }
    }

    return n;
}

static int dd_getenv(FILE *fplog, const char *env_var, int def)
{
    char *val;
    int   nst;

    nst = def;
    val = getenv(env_var);
    if (val)
    {
        if (sscanf(val, "%20d", &nst) <= 0)
        {
            nst = 1;
        }
        if (fplog)
        {
            fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
                    env_var, val, nst);
        }
    }

    return nst;
}

static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
{
    if (MASTER(cr))
    {
        fprintf(stderr, "\n%s\n", warn_string);
    }
    if (fplog)
    {
        fprintf(fplog, "\n%s\n", warn_string);
    }
}
static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
                                  t_inputrec *ir, FILE *fplog)
{
    if (ir->ePBC == epbcSCREW &&
        (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
    {
        gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
    }

    if (ir->ns_type == ensSIMPLE)
    {
        gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
    }

    if (ir->nstlist == 0)
    {
        gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
    }

    if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
    {
        dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
    }
}
static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
{
    int  di, d;
    real r;

    r = ddbox->box_size[XX];
    for (di = 0; di < dd->ndim; di++)
    {
        d = dd->dim[di];
        /* Check using the initial average cell size */
        r = std::min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
    }

    return r;
}
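/* Example: a 9 x 6 x 6 nm rectangular box decomposed 3 x 2 x 1 has average
 * cells of 3 x 3 x 6 nm, so this returns 3 nm; the skew factor only reduces
 * the effective size for triclinic boxes.
 */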
static int check_dlb_support(FILE *fplog, t_commrec *cr,
                             const char *dlb_opt, gmx_bool bRecordLoad,
                             unsigned long Flags, t_inputrec *ir)
{
    int  dlbState = -1;
    char buf[STRLEN];

    switch (dlb_opt[0])
    {
        case 'a': dlbState = edlbsOffCanTurnOn; break;
        case 'n': dlbState = edlbsOffForever; break;
        case 'y': dlbState = edlbsOn; break;
        default: gmx_incons("Unknown dlb_opt");
    }

    if (Flags & MD_RERUN)
    {
        return edlbsOffForever;
    }

    if (!EI_DYNAMICS(ir->eI))
    {
        if (dlbState == edlbsOn)
        {
            sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
            dd_warning(cr, fplog, buf);
        }

        return edlbsOffForever;
    }

    if (!bRecordLoad)
    {
        dd_warning(cr, fplog, "NOTE: Cycle counters unsupported or not enabled in kernel. Cannot use dynamic load balancing.\n");
        return edlbsOffForever;
    }

    if (Flags & MD_REPRODUCIBLE)
    {
        switch (dlbState)
        {
            case edlbsOffForever:
                break;
            case edlbsOffCanTurnOn:
                dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
                dlbState = edlbsOffForever;
                break;
            case edlbsOn:
                dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
                break;
            default:
                gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", dlbState);
                break;
        }
    }

    return dlbState;
}
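/* Summary of the outcome: -dlb auto starts as "off, can turn on", -dlb no as
 * "off forever" and -dlb yes as "on"; reruns, non-dynamical integrators,
 * missing cycle counters and (for auto) -reprod all force it off for the
 * whole run.
 */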
static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
{
    int dim;

    dd->ndim = 0;
    if (getenv("GMX_DD_ORDER_ZYX") != NULL)
    {
        /* Decomposition order z,y,x */
        if (fplog)
        {
            fprintf(fplog, "Using domain decomposition order z, y, x\n");
        }
        for (dim = DIM-1; dim >= 0; dim--)
        {
            if (dd->nc[dim] > 1)
            {
                dd->dim[dd->ndim++] = dim;
            }
        }
    }
    else
    {
        /* Decomposition order x,y,z */
        for (dim = 0; dim < DIM; dim++)
        {
            if (dd->nc[dim] > 1)
            {
                dd->dim[dd->ndim++] = dim;
            }
        }
    }
}
static gmx_domdec_comm_t *init_dd_comm()
{
    gmx_domdec_comm_t *comm;
    int                i;

    snew(comm, 1);
    snew(comm->cggl_flag, DIM*2);
    snew(comm->cgcm_state, DIM*2);
    for (i = 0; i < DIM*2; i++)
    {
        comm->cggl_flag_nalloc[i]  = 0;
        comm->cgcm_state_nalloc[i] = 0;
    }

    comm->nalloc_int = 0;
    comm->buf_int    = NULL;

    vec_rvec_init(&comm->vbuf);

    comm->n_load_have    = 0;
    comm->n_load_collect = 0;

    for (i = 0; i < ddnatNR-ddnatZONE; i++)
    {
        comm->sum_nat[i] = 0;
    }
    comm->load_step = 0;
    clear_ivec(comm->load_lim);

    return comm;
}
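/* All load counters start at zero here; they are accumulated during the run
 * and summarized by print_dd_load_av at the end.
 */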
6380 gmx_domdec_t
*init_domain_decomposition(FILE *fplog
, t_commrec
*cr
,
6381 unsigned long Flags
,
6383 real comm_distance_min
, real rconstr
,
6384 const char *dlb_opt
, real dlb_scale
,
6385 const char *sizex
, const char *sizey
, const char *sizez
,
6386 gmx_mtop_t
*mtop
, t_inputrec
*ir
,
6387 matrix box
, rvec
*x
,
6389 int *npme_x
, int *npme_y
)
6392 gmx_domdec_comm_t
*comm
;
6394 real r_2b
, r_mb
, r_bonded
= -1, r_bonded_limit
= -1, limit
, acs
;
6397 const real tenPercentMargin
= 1.1;
6402 "\nInitializing Domain Decomposition on %d ranks\n", cr
->nnodes
);
6407 dd
->comm
= init_dd_comm();
6409 snew(comm
->cggl_flag
, DIM
*2);
6410 snew(comm
->cgcm_state
, DIM
*2);
6412 dd
->npbcdim
= ePBC2npbcdim(ir
->ePBC
);
6413 dd
->bScrewPBC
= (ir
->ePBC
== epbcSCREW
);
6415 dd
->bSendRecv2
= dd_getenv(fplog
, "GMX_DD_USE_SENDRECV2", 0);
6416 comm
->dlb_scale_lim
= dd_getenv(fplog
, "GMX_DLB_MAX_BOX_SCALING", 10);
6417 comm
->eFlop
= dd_getenv(fplog
, "GMX_DLB_BASED_ON_FLOPS", 0);
6418 recload
= dd_getenv(fplog
, "GMX_DD_RECORD_LOAD", 1);
6419 comm
->nstSortCG
= dd_getenv(fplog
, "GMX_DD_NST_SORT_CHARGE_GROUPS", 1);
6420 comm
->nstDDDump
= dd_getenv(fplog
, "GMX_DD_NST_DUMP", 0);
6421 comm
->nstDDDumpGrid
= dd_getenv(fplog
, "GMX_DD_NST_DUMP_GRID", 0);
6422 comm
->DD_debug
= dd_getenv(fplog
, "GMX_DD_DEBUG", 0);
6424 dd
->pme_recv_f_alloc
= 0;
6425 dd
->pme_recv_f_buf
= NULL
;
6427 if (dd
->bSendRecv2
&& fplog
)
6429 fprintf(fplog
, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6435 fprintf(fplog
, "Will load balance based on FLOP count\n");
6437 if (comm
->eFlop
> 1)
6439 srand(1+cr
->nodeid
);
6441 comm
->bRecordLoad
= TRUE
;
6445 comm
->bRecordLoad
= (wallcycle_have_counter() && recload
> 0);
6449 /* Initialize to GPU share count to 0, might change later */
6450 comm
->nrank_gpu_shared
= 0;
6452 comm
->dlbState
= check_dlb_support(fplog
, cr
, dlb_opt
, comm
->bRecordLoad
, Flags
, ir
);
6453 comm
->bCheckWhetherToTurnDlbOn
= TRUE
;
6457 fprintf(fplog
, "Dynamic load balancing: %s\n",
6458 edlbs_names
[comm
->dlbState
]);
6460 comm
->bPMELoadBalDLBLimits
= FALSE
;
6462 if (comm
->nstSortCG
)
6466 if (comm
->nstSortCG
== 1)
6468 fprintf(fplog
, "Will sort the charge groups at every domain (re)decomposition\n");
6472 fprintf(fplog
, "Will sort the charge groups every %d steps\n",
6476 snew(comm
->sort
, 1);
6482 fprintf(fplog
, "Will not sort the charge groups\n");
6486 comm
->bCGs
= (ncg_mtop(mtop
) < mtop
->natoms
);
6488 comm
->bInterCGBondeds
= ((ncg_mtop(mtop
) > mtop
->mols
.nr
) ||
6489 mtop
->bIntermolecularInteractions
);
6490 if (comm
->bInterCGBondeds
)
6492 comm
->bInterCGMultiBody
= (multi_body_bondeds_count(mtop
) > 0);
6496 comm
->bInterCGMultiBody
= FALSE
;
6499 dd
->bInterCGcons
= inter_charge_group_constraints(mtop
);
6500 dd
->bInterCGsettles
= inter_charge_group_settles(mtop
);
6504 /* Set the cut-off to some very large value,
6505 * so we don't need if statements everywhere in the code.
6506 * We use sqrt, since the cut-off is squared in some places.
6508 comm
->cutoff
= GMX_CUTOFF_INF
;
6512 comm
->cutoff
= ir
->rlist
;
6514 comm
->cutoff_mbody
= 0;
6516 comm
->cellsize_limit
= 0;
6517 comm
->bBondComm
= FALSE
;
6519 /* Atoms should be able to move by up to half the list buffer size (if > 0)
6520 * within nstlist steps. Since boundaries are allowed to displace by half
6521 * a cell size, DD cells should be at least the size of the list buffer.
6523 comm
->cellsize_limit
= std::max(comm
->cellsize_limit
,
6524 ir
->rlist
- std::max(ir
->rvdw
, ir
->rcoulomb
));
6526 if (comm
->bInterCGBondeds
)
6528 if (comm_distance_min
> 0)
6530 comm
->cutoff_mbody
= comm_distance_min
;
6531 if (Flags
& MD_DDBONDCOMM
)
6533 comm
->bBondComm
= (comm
->cutoff_mbody
> comm
->cutoff
);
6537 comm
->cutoff
= std::max(comm
->cutoff
, comm
->cutoff_mbody
);
6539 r_bonded_limit
= comm
->cutoff_mbody
;
6541 else if (ir
->bPeriodicMols
)
6543 /* Can not easily determine the required cut-off */
6544 dd_warning(cr
, fplog
, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6545 comm
->cutoff_mbody
= comm
->cutoff
/2;
6546 r_bonded_limit
= comm
->cutoff_mbody
;
6552 dd_bonded_cg_distance(fplog
, mtop
, ir
, x
, box
,
6553 Flags
& MD_DDBONDCHECK
, &r_2b
, &r_mb
);
6555 gmx_bcast(sizeof(r_2b
), &r_2b
, cr
);
6556 gmx_bcast(sizeof(r_mb
), &r_mb
, cr
);
6558 /* We use an initial margin of 10% for the minimum cell size,
6559 * except when we are just below the non-bonded cut-off.
6561 if (Flags
& MD_DDBONDCOMM
)
6563 if (std::max(r_2b
, r_mb
) > comm
->cutoff
)
6565 r_bonded
= std::max(r_2b
, r_mb
);
6566 r_bonded_limit
= tenPercentMargin
*r_bonded
;
6567 comm
->bBondComm
= TRUE
;
6572 r_bonded_limit
= std::min(tenPercentMargin
*r_bonded
, comm
->cutoff
);
6574 /* We determine cutoff_mbody later */
6578 /* No special bonded communication,
6579 * simply increase the DD cut-off.
6581 r_bonded_limit
= tenPercentMargin
*std::max(r_2b
, r_mb
);
6582 comm
->cutoff_mbody
= r_bonded_limit
;
6583 comm
->cutoff
= std::max(comm
->cutoff
, comm
->cutoff_mbody
);
6589 "Minimum cell size due to bonded interactions: %.3f nm\n",
6592 comm
->cellsize_limit
= std::max(comm
->cellsize_limit
, r_bonded_limit
);
6595 if (dd
->bInterCGcons
&& rconstr
<= 0)
6597 /* There is a cell size limit due to the constraints (P-LINCS) */
6598 rconstr
= constr_r_max(fplog
, mtop
, ir
);
6602 "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6604 if (rconstr
> comm
->cellsize_limit
)
6606 fprintf(fplog
, "This distance will limit the DD cell size, you can override this with -rcon\n");
6610 else if (rconstr
> 0 && fplog
)
6612 /* Here we do not check for dd->bInterCGcons,
6613 * because one can also set a cell size limit for virtual sites only
6614 * and at this point we don't know yet if there are intercg v-sites.
6617 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6620 comm
->cellsize_limit
= std::max(comm
->cellsize_limit
, rconstr
);
6622 comm
->cgs_gl
= gmx_mtop_global_cgs(mtop
);
6626 copy_ivec(nc
, dd
->nc
);
6627 set_dd_dim(fplog
, dd
);
6628 set_ddbox_cr(cr
, &dd
->nc
, ir
, box
, &comm
->cgs_gl
, x
, ddbox
);
6630 if (cr
->npmenodes
== -1)
6634 acs
= average_cellsize_min(dd
, ddbox
);
6635 if (acs
< comm
->cellsize_limit
)
6639 fprintf(fplog
, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs
, comm
->cellsize_limit
);
6641 gmx_fatal_collective(FARGS
, cr
->mpi_comm_mysim
, MASTER(cr
),
6642 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6643 acs
, comm
->cellsize_limit
);
6648 set_ddbox_cr(cr
, NULL
, ir
, box
, &comm
->cgs_gl
, x
, ddbox
);
6650 /* We need to choose the optimal DD grid and possibly PME nodes */
6651 limit
= dd_choose_grid(fplog
, cr
, dd
, ir
, mtop
, box
, ddbox
,
6652 comm
->dlbState
!= edlbsOffForever
, dlb_scale
,
6653 comm
->cellsize_limit
, comm
->cutoff
,
6654 comm
->bInterCGBondeds
);
6656 if (dd
->nc
[XX
] == 0)
6658 bC
= (dd
->bInterCGcons
&& rconstr
> r_bonded_limit
);
6659 sprintf(buf
, "Change the number of ranks or mdrun option %s%s%s",
6660 !bC
? "-rdd" : "-rcon",
6661 comm
->dlbState
!= edlbsOffForever
? " or -dds" : "",
6662 bC
? " or your LINCS settings" : "");
6664 gmx_fatal_collective(FARGS
, cr
->mpi_comm_mysim
, MASTER(cr
),
6665 "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
6667 "Look in the log file for details on the domain decomposition",
6668 cr
->nnodes
-cr
->npmenodes
, limit
, buf
);
6670 set_dd_dim(fplog
, dd
);
6676 "Domain decomposition grid %d x %d x %d, separate PME ranks %d\n",
6677 dd
->nc
[XX
], dd
->nc
[YY
], dd
->nc
[ZZ
], cr
->npmenodes
);
6680 dd
->nnodes
= dd
->nc
[XX
]*dd
->nc
[YY
]*dd
->nc
[ZZ
];
6681 if (cr
->nnodes
- dd
->nnodes
!= cr
->npmenodes
)
6683 gmx_fatal_collective(FARGS
, cr
->mpi_comm_mysim
, MASTER(cr
),
6684 "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
6685 dd
->nnodes
, cr
->nnodes
- cr
->npmenodes
, cr
->nnodes
);
6687 if (cr
->npmenodes
> dd
->nnodes
)
6689 gmx_fatal_collective(FARGS
, cr
->mpi_comm_mysim
, MASTER(cr
),
6690 "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr
->npmenodes
, dd
->nnodes
);
6692 if (cr
->npmenodes
> 0)
6694 comm
->npmenodes
= cr
->npmenodes
;
6698 comm
->npmenodes
= dd
->nnodes
;
6701 if (EEL_PME(ir
->coulombtype
) || EVDW_PME(ir
->vdwtype
))
6703 /* The following choices should match those
6704 * in comm_cost_est in domdec_setup.c.
6705 * Note that here the checks have to take into account
6706 * that the decomposition might occur in a different order than xyz
6707 * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6708 * in which case they will not match those in comm_cost_est,
6709 * but since that is mainly for testing purposes that's fine.
6711 if (dd
->ndim
>= 2 && dd
->dim
[0] == XX
&& dd
->dim
[1] == YY
&&
6712 comm
->npmenodes
> dd
->nc
[XX
] && comm
->npmenodes
% dd
->nc
[XX
] == 0 &&
6713 getenv("GMX_PMEONEDD") == NULL
)
6715 comm
->npmedecompdim
= 2;
6716 comm
->npmenodes_x
= dd
->nc
[XX
];
6717 comm
->npmenodes_y
= comm
->npmenodes
/comm
->npmenodes_x
;
6721 /* In case nc is 1 in both x and y we could still choose to
6722 * decompose pme in y instead of x, but we use x for simplicity.
6724 comm
->npmedecompdim
= 1;
6725 if (dd
->dim
[0] == YY
)
6727 comm
->npmenodes_x
= 1;
6728 comm
->npmenodes_y
= comm
->npmenodes
;
6732 comm
->npmenodes_x
= comm
->npmenodes
;
6733 comm
->npmenodes_y
= 1;
6738 fprintf(fplog
, "PME domain decomposition: %d x %d x %d\n",
6739 comm
->npmenodes_x
, comm
->npmenodes_y
, 1);
6744 comm
->npmedecompdim
= 0;
6745 comm
->npmenodes_x
= 0;
6746 comm
->npmenodes_y
= 0;
6749 /* Technically we don't need both of these,
6750 * but it simplifies code not having to recalculate it.
6752 *npme_x
= comm
->npmenodes_x
;
6753 *npme_y
= comm
->npmenodes_y
;
6755 snew(comm
->slb_frac
, DIM
);
6756 if (comm
->dlbState
== edlbsOffForever
)
6758 comm
->slb_frac
[XX
] = get_slb_frac(fplog
, "x", dd
->nc
[XX
], sizex
);
6759 comm
->slb_frac
[YY
] = get_slb_frac(fplog
, "y", dd
->nc
[YY
], sizey
);
6760 comm
->slb_frac
[ZZ
] = get_slb_frac(fplog
, "z", dd
->nc
[ZZ
], sizez
);
6763 if (comm
->bInterCGBondeds
&& comm
->cutoff_mbody
== 0)
6765 if (comm
->bBondComm
|| comm
->dlbState
!= edlbsOffForever
)
6767 /* Set the bonded communication distance to halfway
6768 * the minimum and the maximum,
6769 * since the extra communication cost is nearly zero.
6771 acs
= average_cellsize_min(dd
, ddbox
);
6772 comm
->cutoff_mbody
= 0.5*(r_bonded
+ acs
);
6773 if (comm
->dlbState
!= edlbsOffForever
)
6775 /* Check if this does not limit the scaling */
6776 comm
->cutoff_mbody
= std::min(comm
->cutoff_mbody
, dlb_scale
*acs
);
6778 if (!comm
->bBondComm
)
6780 /* Without bBondComm do not go beyond the n.b. cut-off */
6781 comm
->cutoff_mbody
= std::min(comm
->cutoff_mbody
, comm
->cutoff
);
6782 if (comm
->cellsize_limit
>= comm
->cutoff
)
6784 /* We don't loose a lot of efficieny
6785 * when increasing it to the n.b. cut-off.
6786 * It can even be slightly faster, because we need
6787 * less checks for the communication setup.
6789 comm
->cutoff_mbody
= comm
->cutoff
;
6792 /* Check if we did not end up below our original limit */
6793 comm
->cutoff_mbody
= std::max(comm
->cutoff_mbody
, r_bonded_limit
);
6795 if (comm
->cutoff_mbody
> comm
->cellsize_limit
)
6797 comm
->cellsize_limit
= comm
->cutoff_mbody
;
6800 /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6805 fprintf(debug
, "Bonded atom communication beyond the cut-off: %d\n"
6806 "cellsize limit %f\n",
6807 comm
->bBondComm
, comm
->cellsize_limit
);
6812 check_dd_restrictions(cr
, dd
, ir
, fplog
);
6815 comm
->partition_step
= INT_MIN
;
6818 clear_dd_cycle_counts(dd
);
static void set_dlb_limits(gmx_domdec_t *dd)
{
    int d;

    for (d = 0; d < dd->ndim; d++)
    {
        dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
        dd->comm->cellsize_min[dd->dim[d]] =
            dd->comm->cellsize_min_dlb[dd->dim[d]];
    }
}
static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
{
    gmx_domdec_t      *dd;
    gmx_domdec_comm_t *comm;
    real               cellsize_min;
    int                d, nc, i;
    char               buf[STRLEN];

    dd   = cr->dd;
    comm = dd->comm;

    if (fplog)
    {
        fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
    }

    cellsize_min = comm->cellsize_min[dd->dim[0]];
    for (d = 1; d < dd->ndim; d++)
    {
        cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
    }

    if (cellsize_min < comm->cellsize_limit*1.05)
    {
        dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");

        /* Change DLB from "auto" to "no". */
        comm->dlbState = edlbsOffForever;

        return;
    }

    dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
    comm->dlbState = edlbsOn;

    set_dlb_limits(dd);

    /* We can set the required cell size info here,
     * so we do not need to communicate this.
     * The grid is completely uniform.
     */
    for (d = 0; d < dd->ndim; d++)
    {
        if (comm->root[d])
        {
            comm->load[d].sum_m = comm->load[d].sum;

            nc = dd->nc[dd->dim[d]];
            for (i = 0; i < nc; i++)
            {
                comm->root[d]->cell_f[i] = i/(real)nc;
                if (d > 0)
                {
                    comm->root[d]->cell_f_max0[i] = i/(real)nc;
                    comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
                }
            }
            comm->root[d]->cell_f[nc] = 1.0;
        }
    }
}
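/* When DLB is switched on mid-run the cell boundaries are reset to a uniform
 * grid (cell_f[i] = i/nc), so the first balanced step starts from a geometry
 * every rank already agrees on, without extra communication.
 */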
static char *init_bLocalCG(gmx_mtop_t *mtop)
{
    int   ncg, cg;
    char *bLocalCG;

    ncg = ncg_mtop(mtop);
    snew(bLocalCG, ncg);
    for (cg = 0; cg < ncg; cg++)
    {
        bLocalCG[cg] = FALSE;
    }

    return bLocalCG;
}

void dd_init_bondeds(FILE *fplog,
                     gmx_domdec_t *dd, gmx_mtop_t *mtop,
                     gmx_vsite_t *vsite,
                     t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
{
    gmx_domdec_comm_t *comm;

    dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);

    comm = dd->comm;

    if (comm->bBondComm)
    {
        /* Communicate atoms beyond the cut-off for bonded interactions */
        comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);

        comm->bLocalCG = init_bLocalCG(mtop);
    }
    else
    {
        /* Only communicate atoms based on cut-off */
        comm->cglink   = NULL;
        comm->bLocalCG = NULL;
    }
}
6942 static void print_dd_settings(FILE *fplog
, gmx_domdec_t
*dd
,
6944 gmx_bool bDynLoadBal
, real dlb_scale
,
6947 gmx_domdec_comm_t
*comm
;
6962 fprintf(fplog
, "The maximum number of communication pulses is:");
6963 for (d
= 0; d
< dd
->ndim
; d
++)
6965 fprintf(fplog
, " %c %d", dim2char(dd
->dim
[d
]), comm
->cd
[d
].np_dlb
);
6967 fprintf(fplog
, "\n");
6968 fprintf(fplog
, "The minimum size for domain decomposition cells is %.3f nm\n", comm
->cellsize_limit
);
6969 fprintf(fplog
, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale
);
6970 fprintf(fplog
, "The allowed shrink of domain decomposition cells is:");
6971 for (d
= 0; d
< DIM
; d
++)
6975 if (d
>= ddbox
->npbcdim
&& dd
->nc
[d
] == 2)
6982 comm
->cellsize_min_dlb
[d
]/
6983 (ddbox
->box_size
[d
]*ddbox
->skew_fac
[d
]/dd
->nc
[d
]);
6985 fprintf(fplog
, " %c %.2f", dim2char(d
), shrink
);
6988 fprintf(fplog
, "\n");
6992 set_dd_cell_sizes_slb(dd
, ddbox
, setcellsizeslbPULSE_ONLY
, np
);
6993 fprintf(fplog
, "The initial number of communication pulses is:");
6994 for (d
= 0; d
< dd
->ndim
; d
++)
6996 fprintf(fplog
, " %c %d", dim2char(dd
->dim
[d
]), np
[dd
->dim
[d
]]);
6998 fprintf(fplog
, "\n");
6999 fprintf(fplog
, "The initial domain decomposition cell size is:");
7000 for (d
= 0; d
< DIM
; d
++)
7004 fprintf(fplog
, " %c %.2f nm",
7005 dim2char(d
), dd
->comm
->cellsize_min
[d
]);
7008 fprintf(fplog
, "\n\n");
7011 if (comm
->bInterCGBondeds
|| dd
->vsite_comm
|| dd
->constraint_comm
)
7013 fprintf(fplog
, "The maximum allowed distance for charge groups involved in interactions is:\n");
7014 fprintf(fplog
, "%40s %-7s %6.3f nm\n",
7015 "non-bonded interactions", "", comm
->cutoff
);
7019 limit
= dd
->comm
->cellsize_limit
;
7023 if (dynamic_dd_box(ddbox
, ir
))
7025 fprintf(fplog
, "(the following are initial values, they could change due to box deformation)\n");
7027 limit
= dd
->comm
->cellsize_min
[XX
];
7028 for (d
= 1; d
< DIM
; d
++)
7030 limit
= std::min(limit
, dd
->comm
->cellsize_min
[d
]);
7034 if (comm
->bInterCGBondeds
)
7036 fprintf(fplog
, "%40s %-7s %6.3f nm\n",
7037 "two-body bonded interactions", "(-rdd)",
7038 std::max(comm
->cutoff
, comm
->cutoff_mbody
));
7039 fprintf(fplog
, "%40s %-7s %6.3f nm\n",
7040 "multi-body bonded interactions", "(-rdd)",
7041 (comm
->bBondComm
|| dlbIsOn(dd
->comm
)) ? comm
->cutoff_mbody
: std::min(comm
->cutoff
, limit
));
7045 fprintf(fplog
, "%40s %-7s %6.3f nm\n",
7046 "virtual site constructions", "(-rcon)", limit
);
7048 if (dd
->constraint_comm
)
7050 sprintf(buf
, "atoms separated by up to %d constraints",
7052 fprintf(fplog
, "%40s %-7s %6.3f nm\n",
7053 buf
, "(-rcon)", limit
);
7055 fprintf(fplog
, "\n");
7061 static void set_cell_limits_dlb(gmx_domdec_t
*dd
,
7063 const t_inputrec
*ir
,
7064 const gmx_ddbox_t
*ddbox
)
7066 gmx_domdec_comm_t
*comm
;
7067 int d
, dim
, npulse
, npulse_d_max
, npulse_d
;
7072 bNoCutOff
= (ir
->rvdw
== 0 || ir
->rcoulomb
== 0);
7074 /* Determine the maximum number of comm. pulses in one dimension */
7076 comm
->cellsize_limit
= std::max(comm
->cellsize_limit
, comm
->cutoff_mbody
);
7078 /* Determine the maximum required number of grid pulses */
7079 if (comm
->cellsize_limit
>= comm
->cutoff
)
7081 /* Only a single pulse is required */
7084 else if (!bNoCutOff
&& comm
->cellsize_limit
> 0)
7086 /* We round down slightly here to avoid overhead due to the latency
7087 * of extra communication calls when the cut-off
7088 * would be only slightly longer than the cell size.
7089 * Later cellsize_limit is redetermined,
7090 * so we can not miss interactions due to this rounding.
7092 npulse
= (int)(0.96 + comm
->cutoff
/comm
->cellsize_limit
);
7096 /* There is no cell size limit */
7097 npulse
= std::max(dd
->nc
[XX
]-1, std::max(dd
->nc
[YY
]-1, dd
->nc
[ZZ
]-1));
7100 if (!bNoCutOff
&& npulse
> 1)
7102 /* See if we can do with less pulses, based on dlb_scale */
7104 for (d
= 0; d
< dd
->ndim
; d
++)
7107 npulse_d
= (int)(1 + dd
->nc
[dim
]*comm
->cutoff
7108 /(ddbox
->box_size
[dim
]*ddbox
->skew_fac
[dim
]*dlb_scale
));
7109 npulse_d_max
= std::max(npulse_d_max
, npulse_d
);
7111 npulse
= std::min(npulse
, npulse_d_max
);
7114 /* This env var can override npulse */
7115 d
= dd_getenv(debug
, "GMX_DD_NPULSE", 0);
7122 comm
->bVacDLBNoLimit
= (ir
->ePBC
== epbcNONE
);
7123 for (d
= 0; d
< dd
->ndim
; d
++)
7125 comm
->cd
[d
].np_dlb
= std::min(npulse
, dd
->nc
[dd
->dim
[d
]]-1);
7126 comm
->cd
[d
].np_nalloc
= comm
->cd
[d
].np_dlb
;
7127 snew(comm
->cd
[d
].ind
, comm
->cd
[d
].np_nalloc
);
7128 comm
->maxpulse
= std::max(comm
->maxpulse
, comm
->cd
[d
].np_dlb
);
7129 if (comm
->cd
[d
].np_dlb
< dd
->nc
[dd
->dim
[d
]]-1)
7131 comm
->bVacDLBNoLimit
= FALSE
;
7135 /* cellsize_limit is set for LINCS in init_domain_decomposition */
7136 if (!comm
->bVacDLBNoLimit
)
7138 comm
->cellsize_limit
= std::max(comm
->cellsize_limit
,
7139 comm
->cutoff
/comm
->maxpulse
);
7141 comm
->cellsize_limit
= std::max(comm
->cellsize_limit
, comm
->cutoff_mbody
);
7142 /* Set the minimum cell size for each DD dimension */
7143 for (d
= 0; d
< dd
->ndim
; d
++)
7145 if (comm
->bVacDLBNoLimit
||
7146 comm
->cd
[d
].np_dlb
*comm
->cellsize_limit
>= comm
->cutoff
)
7148 comm
->cellsize_min_dlb
[dd
->dim
[d
]] = comm
->cellsize_limit
;
7152 comm
->cellsize_min_dlb
[dd
->dim
[d
]] =
7153 comm
->cutoff
/comm
->cd
[d
].np_dlb
;
7156 if (comm
->cutoff_mbody
<= 0)
7158 comm
->cutoff_mbody
= std::min(comm
->cutoff
, comm
->cellsize_limit
);
gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC)
{
    /* If each molecule is a single charge group
     * or we use domain decomposition for each periodic dimension,
     * we do not need to take pbc into account for the bonded interactions.
     */
    return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
            !(dd->nc[XX] > 1 &&
              dd->nc[YY] > 1 &&
              (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
}
7178 void set_dd_parameters(FILE *fplog
, gmx_domdec_t
*dd
, real dlb_scale
,
7179 t_inputrec
*ir
, gmx_ddbox_t
*ddbox
)
7181 gmx_domdec_comm_t
*comm
;
7187 /* Initialize the thread data.
7188 * This can not be done in init_domain_decomposition,
7189 * as the numbers of threads is determined later.
7191 comm
->nth
= gmx_omp_nthreads_get(emntDomdec
);
7194 snew(comm
->dth
, comm
->nth
);
7197 if (EEL_PME(ir
->coulombtype
) || EVDW_PME(ir
->vdwtype
))
7199 init_ddpme(dd
, &comm
->ddpme
[0], 0);
7200 if (comm
->npmedecompdim
>= 2)
7202 init_ddpme(dd
, &comm
->ddpme
[1], 1);
7207 comm
->npmenodes
= 0;
7208 if (dd
->pme_nodeid
>= 0)
7210 gmx_fatal_collective(FARGS
, dd
->mpi_comm_all
, DDMASTER(dd
),
7211 "Can not have separate PME ranks without PME electrostatics");
7217 fprintf(debug
, "The DD cut-off is %f\n", comm
->cutoff
);
7219 if (comm
->dlbState
!= edlbsOffForever
)
7221 set_cell_limits_dlb(dd
, dlb_scale
, ir
, ddbox
);
7224 print_dd_settings(fplog
, dd
, ir
, dlbIsOn(comm
), dlb_scale
, ddbox
);
7225 if (comm
->dlbState
== edlbsOffCanTurnOn
)
7229 fprintf(fplog
, "When dynamic load balancing gets turned on, these settings will change to:\n");
7231 print_dd_settings(fplog
, dd
, ir
, TRUE
, dlb_scale
, ddbox
);
7234 if (ir
->ePBC
== epbcNONE
)
7236 vol_frac
= 1 - 1/(double)dd
->nnodes
;
7241 (1 + comm_box_frac(dd
->nc
, comm
->cutoff
, ddbox
))/(double)dd
->nnodes
;
7245 fprintf(debug
, "Volume fraction for all DD zones: %f\n", vol_frac
);
7247 natoms_tot
= comm
->cgs_gl
.index
[comm
->cgs_gl
.nr
];
7249 dd
->ga2la
= ga2la_init(natoms_tot
, static_cast<int>(vol_frac
*natoms_tot
));
7252 static gmx_bool
test_dd_cutoff(t_commrec
*cr
,
7253 t_state
*state
, const t_inputrec
*ir
,
7264 set_ddbox(dd
, FALSE
, cr
, ir
, state
->box
,
7265 TRUE
, &dd
->comm
->cgs_gl
, state
->x
, &ddbox
);
7269 for (d
= 0; d
< dd
->ndim
; d
++)
7273 inv_cell_size
= DD_CELL_MARGIN
*dd
->nc
[dim
]/ddbox
.box_size
[dim
];
7274 if (dynamic_dd_box(&ddbox
, ir
))
7276 inv_cell_size
*= DD_PRES_SCALE_MARGIN
;
7279 np
= 1 + (int)(cutoff_req
*inv_cell_size
*ddbox
.skew_fac
[dim
]);
7281 if (dd
->comm
->dlbState
!= edlbsOffForever
&& dim
< ddbox
.npbcdim
&&
7282 dd
->comm
->cd
[d
].np_dlb
> 0)
7284 if (np
> dd
->comm
->cd
[d
].np_dlb
)
7289 /* If a current local cell size is smaller than the requested
7290 * cut-off, we could still fix it, but this gets very complicated.
7291 * Without fixing here, we might actually need more checks.
7293 if ((dd
->comm
->cell_x1
[dim
] - dd
->comm
->cell_x0
[dim
])*ddbox
.skew_fac
[dim
]*dd
->comm
->cd
[d
].np_dlb
< cutoff_req
)
7300 if (dd
->comm
->dlbState
!= edlbsOffForever
)
7302 /* If DLB is not active yet, we don't need to check the grid jumps.
7303 * Actually we shouldn't, because then the grid jump data is not set.
7305 if (dlbIsOn(dd
->comm
) &&
7306 check_grid_jump(0, dd
, cutoff_req
, &ddbox
, FALSE
))
7311 gmx_sumi(1, &LocallyLimited
, cr
);
7313 if (LocallyLimited
> 0)
7322 gmx_bool
change_dd_cutoff(t_commrec
*cr
, t_state
*state
, const t_inputrec
*ir
,
7325 gmx_bool bCutoffAllowed
;
7327 bCutoffAllowed
= test_dd_cutoff(cr
, state
, ir
, cutoff_req
);
7331 cr
->dd
->comm
->cutoff
= cutoff_req
;
7334 return bCutoffAllowed
;
7337 void set_dd_dlb_max_cutoff(t_commrec
*cr
, real cutoff
)
7339 gmx_domdec_comm_t
*comm
;
7341 comm
= cr
->dd
->comm
;
7343 /* Turn on the DLB limiting (might have been on already) */
7344 comm
->bPMELoadBalDLBLimits
= TRUE
;
7346 /* Change the cut-off limit */
7347 comm
->PMELoadBal_max_cutoff
= cutoff
;
7351 fprintf(debug
, "PME load balancing set a limit to the DLB staggering such that a %f cut-off will continue to fit\n",
7352 comm
->PMELoadBal_max_cutoff
);
/* Sets whether we should later check the load imbalance data, so that
 * we can trigger dynamic load balancing if enough imbalance has
 * arisen.
 *
 * Used after PME load balancing unlocks DLB, so that the check
 * whether DLB will be useful can happen immediately.
 */
static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx_bool bValue)
{
    if (dd->comm->dlbState == edlbsOffCanTurnOn)
    {
        dd->comm->bCheckWhetherToTurnDlbOn = bValue;
    }
}

/* Returns if we should check whether there has been enough load
 * imbalance to trigger dynamic load balancing.
 */
static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd)
{
    const int nddp_chk_dlb = 100;

    if (dd->comm->dlbState != edlbsOffCanTurnOn)
    {
        return FALSE;
    }

    /* We should check whether we should use DLB directly after
     * unlocking DLB. */
    if (dd->comm->bCheckWhetherToTurnDlbOn)
    {
        /* This flag was set when the PME load-balancing routines
           unlocked DLB, and should now be cleared. */
        dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, FALSE);
        return TRUE;
    }
    /* We should also check whether we should use DLB every 100
     * partitionings (we do not do this every partitioning, so that we
     * avoid excessive communication). */
    if (dd->comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1)
    {
        return TRUE;
    }

    return FALSE;
}

gmx_bool dd_dlb_is_on(const gmx_domdec_t *dd)
{
    return (dd->comm->dlbState == edlbsOn);
}

gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
{
    return (dd->comm->dlbState == edlbsOffTemporarilyLocked);
}

void dd_dlb_lock(gmx_domdec_t *dd)
{
    /* We can only lock the DLB when it is set to auto, otherwise don't do anything */
    if (dd->comm->dlbState == edlbsOffCanTurnOn)
    {
        dd->comm->dlbState = edlbsOffTemporarilyLocked;
    }
}

void dd_dlb_unlock(gmx_domdec_t *dd)
{
    /* We can only unlock the DLB when it was temporarily locked, otherwise don't do anything */
    if (dd->comm->dlbState == edlbsOffTemporarilyLocked)
    {
        dd->comm->dlbState = edlbsOffCanTurnOn;
        dd_dlb_set_should_check_whether_to_turn_dlb_on(dd, TRUE);
    }
}
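/* The DLB state machine used by these helpers: "auto" starts in
 * edlbsOffCanTurnOn; dd_dlb_lock() parks it in edlbsOffTemporarilyLocked
 * while PME load balancing is tuning the cut-off, dd_dlb_unlock() returns it
 * to edlbsOffCanTurnOn and requests an immediate imbalance check, and
 * turn_on_dlb() finally moves it to edlbsOn (or edlbsOffForever if the cells
 * are already close to their size limit).
 */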
7432 static void merge_cg_buffers(int ncell
,
7433 gmx_domdec_comm_dim_t
*cd
, int pulse
,
7435 int *index_gl
, int *recv_i
,
7436 rvec
*cg_cm
, rvec
*recv_vr
,
7438 cginfo_mb_t
*cginfo_mb
, int *cginfo
)
7440 gmx_domdec_ind_t
*ind
, *ind_p
;
7441 int p
, cell
, c
, cg
, cg0
, cg1
, cg_gl
, nat
;
7442 int shift
, shift_at
;
7444 ind
= &cd
->ind
[pulse
];
7446 /* First correct the already stored data */
7447 shift
= ind
->nrecv
[ncell
];
7448 for (cell
= ncell
-1; cell
>= 0; cell
--)
7450 shift
-= ind
->nrecv
[cell
];
7453 /* Move the cg's present from previous grid pulses */
7454 cg0
= ncg_cell
[ncell
+cell
];
7455 cg1
= ncg_cell
[ncell
+cell
+1];
7456 cgindex
[cg1
+shift
] = cgindex
[cg1
];
7457 for (cg
= cg1
-1; cg
>= cg0
; cg
--)
7459 index_gl
[cg
+shift
] = index_gl
[cg
];
7460 copy_rvec(cg_cm
[cg
], cg_cm
[cg
+shift
]);
7461 cgindex
[cg
+shift
] = cgindex
[cg
];
7462 cginfo
[cg
+shift
] = cginfo
[cg
];
7464 /* Correct the already stored send indices for the shift */
7465 for (p
= 1; p
<= pulse
; p
++)
7467 ind_p
= &cd
->ind
[p
];
7469 for (c
= 0; c
< cell
; c
++)
7471 cg0
+= ind_p
->nsend
[c
];
7473 cg1
= cg0
+ ind_p
->nsend
[cell
];
7474 for (cg
= cg0
; cg
< cg1
; cg
++)
7476 ind_p
->index
[cg
] += shift
;
7482 /* Merge in the communicated buffers */
7486 for (cell
= 0; cell
< ncell
; cell
++)
7488 cg1
= ncg_cell
[ncell
+cell
+1] + shift
;
7491 /* Correct the old cg indices */
7492 for (cg
= ncg_cell
[ncell
+cell
]; cg
< cg1
; cg
++)
7494 cgindex
[cg
+1] += shift_at
;
7497 for (cg
= 0; cg
< ind
->nrecv
[cell
]; cg
++)
7499 /* Copy this charge group from the buffer */
7500 index_gl
[cg1
] = recv_i
[cg0
];
7501 copy_rvec(recv_vr
[cg0
], cg_cm
[cg1
]);
7502 /* Add it to the cgindex */
7503 cg_gl
= index_gl
[cg1
];
7504 cginfo
[cg1
] = ddcginfo(cginfo_mb
, cg_gl
);
7505 nat
= GET_CGINFO_NATOMS(cginfo
[cg1
]);
7506 cgindex
[cg1
+1] = cgindex
[cg1
] + nat
;
7511 shift
+= ind
->nrecv
[cell
];
7512 ncg_cell
[ncell
+cell
+1] = cg1
;
static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
                               int nzone, int cg0, const int *cgindex)
{
    int cg, zone, p;

    /* Store the atom block boundaries for easy copying of communication buffers
     */
    cg = cg0;
    for (zone = 0; zone < nzone; zone++)
    {
        for (p = 0; p < cd->np; p++)
        {
            cd->ind[p].cell2at0[zone] = cgindex[cg];
            cg += cd->ind[p].nrecv[zone];
            cd->ind[p].cell2at1[zone] = cgindex[cg];
        }
    }
}

static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
{
    int      i;
    gmx_bool bMiss;

    bMiss = FALSE;
    for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
    {
        if (!bLocalCG[link->a[i]])
        {
            bMiss = TRUE;
        }
    }

    return bMiss;
}

/* Domain corners for communication, a maximum of 4 i-zones see a j domain */
typedef struct {
    real c[DIM][4];  /* the corners for the non-bonded communication */
    real cr0;        /* corner for rounding */
    real cr1[4];     /* corners for rounding */
    real bc[DIM];    /* corners for bonded communication */
    real bcr1;       /* corner for rounding for bonded communication */
} dd_corners_t;
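/* The corners stored above are the boundary planes of the zones we
 * communicate with; get_zone_pulse_cgs() measures each charge group's
 * distance to these planes to decide whether it needs to be sent.
 */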
7561 /* Determine the corners of the domain(s) we are communicating with */
7563 set_dd_corners(const gmx_domdec_t
*dd
,
7564 int dim0
, int dim1
, int dim2
,
7568 const gmx_domdec_comm_t
*comm
;
7569 const gmx_domdec_zones_t
*zones
;
7574 zones
= &comm
->zones
;
7576 /* Keep the compiler happy */
7580 /* The first dimension is equal for all cells */
7581 c
->c
[0][0] = comm
->cell_x0
[dim0
];
7584 c
->bc
[0] = c
->c
[0][0];
7589 /* This cell row is only seen from the first row */
7590 c
->c
[1][0] = comm
->cell_x0
[dim1
];
7591 /* All rows can see this row */
7592 c
->c
[1][1] = comm
->cell_x0
[dim1
];
7593 if (dlbIsOn(dd
->comm
))
7595 c
->c
[1][1] = std::max(comm
->cell_x0
[dim1
], comm
->zone_d1
[1].mch0
);
7598 /* For the multi-body distance we need the maximum */
7599 c
->bc
[1] = std::max(comm
->cell_x0
[dim1
], comm
->zone_d1
[1].p1_0
);
7602 /* Set the upper-right corner for rounding */
7603 c
->cr0
= comm
->cell_x1
[dim0
];
7608 for (j
= 0; j
< 4; j
++)
7610 c
->c
[2][j
] = comm
->cell_x0
[dim2
];
7612 if (dlbIsOn(dd
->comm
))
7614 /* Use the maximum of the i-cells that see a j-cell */
7615 for (i
= 0; i
< zones
->nizone
; i
++)
7617 for (j
= zones
->izone
[i
].j0
; j
< zones
->izone
[i
].j1
; j
++)
7622 std::max(c
->c
[2][j
-4],
7623 comm
->zone_d2
[zones
->shift
[i
][dim0
]][zones
->shift
[i
][dim1
]].mch0
);
7629 /* For the multi-body distance we need the maximum */
7630 c
->bc
[2] = comm
->cell_x0
[dim2
];
7631 for (i
= 0; i
< 2; i
++)
7633 for (j
= 0; j
< 2; j
++)
7635 c
->bc
[2] = std::max(c
->bc
[2], comm
->zone_d2
[i
][j
].p1_0
);
7641 /* Set the upper-right corner for rounding */
7642 /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7643 * Only cell (0,0,0) can see cell 7 (1,1,1)
7645 c
->cr1
[0] = comm
->cell_x1
[dim1
];
7646 c
->cr1
[3] = comm
->cell_x1
[dim1
];
7647 if (dlbIsOn(dd
->comm
))
7649 c
->cr1
[0] = std::max(comm
->cell_x1
[dim1
], comm
->zone_d1
[1].mch1
);
7652 /* For the multi-body distance we need the maximum */
7653 c
->bcr1
= std::max(comm
->cell_x1
[dim1
], comm
->zone_d1
[1].p1_1
);
7660 /* Determine which cg's we need to send in this pulse from this zone */
7662 get_zone_pulse_cgs(gmx_domdec_t
*dd
,
7663 int zonei
, int zone
,
7665 const int *index_gl
,
7667 int dim
, int dim_ind
,
7668 int dim0
, int dim1
, int dim2
,
7669 real r_comm2
, real r_bcomm2
,
7673 real skew_fac2_d
, real skew_fac_01
,
7674 rvec
*v_d
, rvec
*v_0
, rvec
*v_1
,
7675 const dd_corners_t
*c
,
7677 gmx_bool bDistBonded
,
7683 gmx_domdec_ind_t
*ind
,
7684 int **ibuf
, int *ibuf_nalloc
,
7690 gmx_domdec_comm_t
*comm
;
7692 gmx_bool bDistMB_pulse
;
7694 real r2
, rb2
, r
, tric_sh
;
7697 int nsend_z
, nsend
, nat
;
7701 bScrew
= (dd
->bScrewPBC
&& dim
== XX
);
7703 bDistMB_pulse
= (bDistMB
&& bDistBonded
);
7709 for (cg
= cg0
; cg
< cg1
; cg
++)
7713 if (tric_dist
[dim_ind
] == 0)
7715 /* Rectangular direction, easy */
7716 r
= cg_cm
[cg
][dim
] - c
->c
[dim_ind
][zone
];
7723 r
= cg_cm
[cg
][dim
                    ] - c->bc[dim_ind];
                    if (r > 0)
                    {
                        rb2 += r*r;
                    }
                }
                /* Rounding gives at most a 16% reduction
                 * in communicated atoms
                 */
                if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
                {
                    r = cg_cm[cg][dim0] - c->cr0;
                    /* This is the first dimension, so always r >= 0 */
                    r2 += r*r;
                    if (bDistMB_pulse)
                    {
                        rb2 += r*r;
                    }
                }
                if (dim_ind == 2 && (zonei == 2 || zonei == 3))
                {
                    r = cg_cm[cg][dim1] - c->cr1[zone];
                    if (r > 0)
                    {
                        r2 += r*r;
                    }
                    if (bDistMB_pulse)
                    {
                        r = cg_cm[cg][dim1] - c->bcr1;
                        if (r > 0)
                        {
                            rb2 += r*r;
                        }
                    }
                }
            }
            else
            {
                /* Triclinic direction, more complicated */
                clear_rvec(rn);
                clear_rvec(rb);
                /* Rounding, conservative as the skew_fac multiplication
                 * will slightly underestimate the distance.
                 */
                if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
                {
                    rn[dim0] = cg_cm[cg][dim0] - c->cr0;
                    for (i = dim0+1; i < DIM; i++)
                    {
                        rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
                    }
                    r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
                    if (bDistMB_pulse)
                    {
                        rb[dim0] = rn[dim0];
                        rb2      = r2;
                    }
                    /* Take care that the cell planes along dim0 might not
                     * be orthogonal to those along dim1 and dim2.
                     */
                    for (i = 1; i <= dim_ind; i++)
                    {
                        dimd = dd->dim[i];
                        if (normal[dim0][dimd] > 0)
                        {
                            rn[dimd] -= rn[dim0]*normal[dim0][dimd];
                            if (bDistMB_pulse)
                            {
                                rb[dimd] -= rb[dim0]*normal[dim0][dimd];
                            }
                        }
                    }
                }
                if (dim_ind == 2 && (zonei == 2 || zonei == 3))
                {
                    rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
                    tric_sh   = 0;
                    for (i = dim1+1; i < DIM; i++)
                    {
                        tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
                    }
                    rn[dim1] += tric_sh;
                    if (rn[dim1] > 0)
                    {
                        r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
                        /* Take care of coupling of the distances
                         * to the planes along dim0 and dim1 through dim2.
                         */
                        r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
                        /* Take care that the cell planes along dim1
                         * might not be orthogonal to that along dim2.
                         */
                        if (normal[dim1][dim2] > 0)
                        {
                            rn[dim2] -= rn[dim1]*normal[dim1][dim2];
                        }
                    }
                    if (bDistMB_pulse)
                    {
                        rb[dim1] +=
                            cg_cm[cg][dim1] - c->bcr1 + tric_sh;
                        if (rb[dim1] > 0)
                        {
                            rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
                            /* Take care of coupling of the distances
                             * to the planes along dim0 and dim1 through dim2.
                             */
                            rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
                            /* Take care that the cell planes along dim1
                             * might not be orthogonal to that along dim2.
                             */
                            if (normal[dim1][dim2] > 0)
                            {
                                rb[dim2] -= rb[dim1]*normal[dim1][dim2];
                            }
                        }
                    }
                }
                /* The distance along the communication direction */
                rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
                tric_sh  = 0;
                for (i = dim+1; i < DIM; i++)
                {
                    tric_sh -= cg_cm[cg][i]*v_d[i][dim];
                }
                rn[dim] += tric_sh;
                if (rn[dim] > 0)
                {
                    r2 += rn[dim]*rn[dim]*skew_fac2_d;
                    /* Take care of coupling of the distances
                     * to the planes along dim0 and dim1 through dim2.
                     */
                    if (dim_ind == 1 && zonei == 1)
                    {
                        r2 -= rn[dim0]*rn[dim]*skew_fac_01;
                    }
                }
                if (bDistMB_pulse)
                {
                    rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
                    if (rb[dim] > 0)
                    {
                        rb2 += rb[dim]*rb[dim]*skew_fac2_d;
                        /* Take care of coupling of the distances
                         * to the planes along dim0 and dim1 through dim2.
                         */
                        if (dim_ind == 1 && zonei == 1)
                        {
                            rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
                        }
                    }
                }
            }

            if (r2 < r_comm2 ||
                (bDistBonded &&
                 ((bDistMB && rb2 < r_bcomm2) ||
                  (bDist2B && r2  < r_bcomm2)) &&
                 (!bBondComm ||
                  (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
                   missing_link(comm->cglink, index_gl[cg],
                                comm->bLocalCG)))))
            {
                /* Make an index to the local charge groups */
                if (nsend+1 > ind->nalloc)
                {
                    ind->nalloc = over_alloc_large(nsend+1);
                    srenew(ind->index, ind->nalloc);
                }
                if (nsend+1 > *ibuf_nalloc)
                {
                    *ibuf_nalloc = over_alloc_large(nsend+1);
                    srenew(*ibuf, *ibuf_nalloc);
                }
                ind->index[nsend] = cg;
                (*ibuf)[nsend]    = index_gl[cg];
                nsend_z++;
                vec_rvec_check_alloc(vbuf, nsend+1);

                if (dd->ci[dim] == 0)
                {
                    /* Correct cg_cm for pbc */
                    rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
                    if (bScrew)
                    {
                        vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
                        vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
                    }
                }
                else
                {
                    copy_rvec(cg_cm[cg], vbuf->v[nsend]);
                }
                nsend++;
                nat += cgindex[cg+1] - cgindex[cg];
            }
        }
    }

    *nsend_ptr   = nsend;
    *nat_ptr     = nat;
    *nsend_z_ptr = nsend_z;
}
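/* Set up the halo communication for all DD dimensions and pulses:
 * select the charge groups to send with get_zone_pulse_cgs(), exchange
 * the counts, global cg indices and cg centers with the neighboring
 * domains, and update the zone cg ranges accordingly.
 */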
static void setup_dd_communication(gmx_domdec_t *dd,
                                   matrix box, gmx_ddbox_t *ddbox,
                                   t_forcerec *fr, t_state *state, rvec **f)
{
    int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
    int                    nzone, nzone_send, zone, zonei, cg0, cg1;
    int                    c, i, cg, cg_gl, nrcg;
    int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
    int                    nsend, nat, th;
    gmx_domdec_comm_t     *comm;
    gmx_domdec_zones_t    *zones;
    gmx_domdec_comm_dim_t *cd;
    gmx_domdec_ind_t      *ind;
    cginfo_mb_t           *cginfo_mb;
    gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
    real                   r_comm2, r_bcomm2;
    dd_corners_t           corners;
    ivec                   tric_dist;
    rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
    real                   skew_fac2_d, skew_fac_01;
    rvec                   sf2_round;

    comm = dd->comm;

    if (debug)
    {
        fprintf(debug, "Setting up DD communication\n");
    }

    switch (fr->cutoff_scheme)
    {
        case ecutsGROUP:
            cg_cm = fr->cg_cm;
            break;
        case ecutsVERLET:
            cg_cm = state->x;
            break;
        default:
            gmx_incons("unimplemented");
            cg_cm = NULL;
    }

    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
    {
        /* Check if we need to use triclinic distances */
        tric_dist[dim_ind] = 0;
        for (i = 0; i <= dim_ind; i++)
        {
            if (ddbox->tric_dir[dd->dim[i]])
            {
                tric_dist[dim_ind] = 1;
            }
        }
    }

    bBondComm = comm->bBondComm;

    /* Do we need to determine extra distances for multi-body bondeds? */
    bDistMB = (comm->bInterCGMultiBody && dlbIsOn(dd->comm) && dd->ndim > 1);

    /* Do we need to determine extra distances for only two-body bondeds? */
    bDist2B = (bBondComm && !bDistMB);

    r_comm2  = gmx::square(comm->cutoff);
    r_bcomm2 = gmx::square(comm->cutoff_mbody);

    if (debug)
    {
        fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, std::sqrt(r_bcomm2));
    }

    zones = &comm->zones;

    dim0 = dd->dim[0];
    dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
    dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);

    set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);

    /* Triclinic stuff */
    normal      = ddbox->normal;
    skew_fac_01 = 0;
    if (dd->ndim >= 2)
    {
        v_0 = ddbox->v[dim0];
        if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
        {
            /* Determine the coupling coefficient for the distances
             * to the cell planes along dim0 and dim1 through dim2.
             * This is required for correct rounding.
             */
            skew_fac_01 =
                ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
            if (debug)
            {
                fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
            }
        }
    }
    if (dd->ndim >= 3)
    {
        v_1 = ddbox->v[dim1];
    }

    zone_cg_range = zones->cg_range;
    index_gl      = dd->index_gl;
    cgindex       = dd->cgindex;
    cginfo_mb     = fr->cginfo_mb;

    zone_cg_range[0]   = 0;
    zone_cg_range[1]   = dd->ncg_home;
    comm->zone_ncg1[0] = dd->ncg_home;
    pos_cg             = dd->ncg_home;

    nat_tot = dd->nat_home;
    nzone   = 1;
    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
    {
        dim = dd->dim[dim_ind];
        cd  = &comm->cd[dim_ind];

        if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
        {
            /* No pbc in this dimension, the first node should not comm. */
            nzone_send = 0;
        }
        else
        {
            nzone_send = nzone;
        }

        v_d         = ddbox->v[dim];
        skew_fac2_d = gmx::square(ddbox->skew_fac[dim]);

        cd->bInPlace = TRUE;
        for (p = 0; p < cd->np; p++)
        {
            /* Only atoms communicated in the first pulse are used
             * for multi-body bonded interactions or for bBondComm.
             */
            bDistBonded = ((bDistMB || bDist2B) && p == 0);

            ind   = &cd->ind[p];
            nsend = 0;
            nat   = 0;
            for (zone = 0; zone < nzone_send; zone++)
            {
                if (tric_dist[dim_ind] && dim_ind > 0)
                {
                    /* Determine slightly more optimized skew_fac's
                     * for rounding.
                     * This reduces the number of communicated atoms
                     * by about 10% for 3D DD of rhombic dodecahedra.
                     */
                    for (dimd = 0; dimd < dim; dimd++)
                    {
                        sf2_round[dimd] = 1;
                        if (ddbox->tric_dir[dimd])
                        {
                            for (i = dd->dim[dimd]+1; i < DIM; i++)
                            {
                                /* If we are shifted in dimension i
                                 * and the cell plane is tilted forward
                                 * in dimension i, skip this coupling.
                                 */
                                if (!(zones->shift[nzone+zone][i] &&
                                      ddbox->v[dimd][i][dimd] >= 0))
                                {
                                    sf2_round[dimd] +=
                                        gmx::square(ddbox->v[dimd][i][dimd]);
                                }
                            }
                            sf2_round[dimd] = 1/sf2_round[dimd];
                        }
                    }
                }

                zonei = zone_perm[dim_ind][zone];
                if (p == 0)
                {
                    /* Here we permutate the zones to obtain a convenient order
                     * for neighbor searching
                     */
                    cg0 = zone_cg_range[zonei];
                    cg1 = zone_cg_range[zonei+1];
                }
                else
                {
                    /* Look only at the cg's received in the previous grid pulse
                     */
                    cg1 = zone_cg_range[nzone+zone+1];
                    cg0 = cg1 - cd->ind[p-1].nrecv[zone];
                }

#pragma omp parallel for num_threads(comm->nth) schedule(static)
                for (th = 0; th < comm->nth; th++)
                {
                    try
                    {
                        gmx_domdec_ind_t *ind_p;
                        int             **ibuf_p, *ibuf_nalloc_p;
                        vec_rvec_t       *vbuf_p;
                        int              *nsend_p, *nat_p;
                        int              *nsend_zone_p;
                        int               cg0_th, cg1_th;

                        if (th == 0)
                        {
                            /* Thread 0 writes in the comm buffers */
                            ind_p         = ind;
                            ibuf_p        = &comm->buf_int;
                            ibuf_nalloc_p = &comm->nalloc_int;
                            vbuf_p        = &comm->vbuf;
                            nsend_p       = &nsend;
                            nat_p         = &nat;
                            nsend_zone_p  = &ind->nsend[zone];
                        }
                        else
                        {
                            /* Other threads write into temp buffers */
                            ind_p         = &comm->dth[th].ind;
                            ibuf_p        = &comm->dth[th].ibuf;
                            ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
                            vbuf_p        = &comm->dth[th].vbuf;
                            nsend_p       = &comm->dth[th].nsend;
                            nat_p         = &comm->dth[th].nat;
                            nsend_zone_p  = &comm->dth[th].nsend_zone;

                            comm->dth[th].nsend      = 0;
                            comm->dth[th].nat        = 0;
                            comm->dth[th].nsend_zone = 0;
                        }

                        if (comm->nth == 1)
                        {
                            cg0_th = cg0;
                            cg1_th = cg1;
                        }
                        else
                        {
                            cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
                            cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
                        }

                        /* Get the cg's for this pulse in this zone */
                        get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
                                           index_gl, cgindex,
                                           dim, dim_ind, dim0, dim1, dim2,
                                           r_comm2, r_bcomm2,
                                           box, tric_dist,
                                           normal, skew_fac2_d, skew_fac_01,
                                           v_d, v_0, v_1, &corners, sf2_round,
                                           bDistBonded, bBondComm,
                                           fr->cginfo,
                                           ind_p,
                                           ibuf_p, ibuf_nalloc_p,
                                           vbuf_p,
                                           nsend_p, nat_p,
                                           nsend_zone_p);
                    }
                    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
                }

                /* Append data of threads>=1 to the communication buffers */
                for (th = 1; th < comm->nth; th++)
                {
                    dd_comm_setup_work_t *dth;
                    int                   ns1;

                    dth = &comm->dth[th];

                    ns1 = nsend + dth->nsend_zone;
                    if (ns1 > ind->nalloc)
                    {
                        ind->nalloc = over_alloc_dd(ns1);
                        srenew(ind->index, ind->nalloc);
                    }
                    if (ns1 > comm->nalloc_int)
                    {
                        comm->nalloc_int = over_alloc_dd(ns1);
                        srenew(comm->buf_int, comm->nalloc_int);
                    }
                    if (ns1 > comm->vbuf.nalloc)
                    {
                        comm->vbuf.nalloc = over_alloc_dd(ns1);
                        srenew(comm->vbuf.v, comm->vbuf.nalloc);
                    }

                    for (i = 0; i < dth->nsend_zone; i++)
                    {
                        ind->index[nsend]    = dth->ind.index[i];
                        comm->buf_int[nsend] = dth->ibuf[i];
                        copy_rvec(dth->vbuf.v[i],
                                  comm->vbuf.v[nsend]);
                        nsend++;
                    }
                    nat              += dth->nat;
                    ind->nsend[zone] += dth->nsend_zone;
                }
            }
            /* Clear the counts in case we do not have pbc */
            for (zone = nzone_send; zone < nzone; zone++)
            {
                ind->nsend[zone] = 0;
            }
            ind->nsend[nzone]   = nsend;
            ind->nsend[nzone+1] = nat;
            /* Communicate the number of cg's and atoms to receive */
            dd_sendrecv_int(dd, dim_ind, dddirBackward,
                            ind->nsend, nzone+2,
                            ind->nrecv, nzone+2);

            /* The rvec buffer is also required for atom buffers of size nsend
             * in dd_move_x and dd_move_f.
             */
            vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);

            if (p > 0)
            {
                /* We can receive in place if only the last zone is not empty */
                for (zone = 0; zone < nzone-1; zone++)
                {
                    if (ind->nrecv[zone] > 0)
                    {
                        cd->bInPlace = FALSE;
                    }
                }
                if (!cd->bInPlace)
                {
                    /* The int buffer is only required here for the cg indices */
                    if (ind->nrecv[nzone] > comm->nalloc_int2)
                    {
                        comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
                        srenew(comm->buf_int2, comm->nalloc_int2);
                    }
                    /* The rvec buffer is also required for atom buffers
                     * of size nrecv in dd_move_x and dd_move_f.
                     */
                    i = std::max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
                    vec_rvec_check_alloc(&comm->vbuf2, i);
                }
            }

            /* Make space for the global cg indices */
            if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
                || dd->cg_nalloc == 0)
            {
                dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
                srenew(index_gl, dd->cg_nalloc);
                srenew(cgindex, dd->cg_nalloc+1);
            }
            /* Communicate the global cg indices */
            if (cd->bInPlace)
            {
                recv_i = index_gl + pos_cg;
            }
            else
            {
                recv_i = comm->buf_int2;
            }
            dd_sendrecv_int(dd, dim_ind, dddirBackward,
                            comm->buf_int, nsend,
                            recv_i,        ind->nrecv[nzone]);

            /* Make space for cg_cm */
            dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
            if (fr->cutoff_scheme == ecutsGROUP)
            {
                cg_cm = fr->cg_cm;
            }
            else
            {
                cg_cm = state->x;
            }
            /* Communicate cg_cm */
            if (cd->bInPlace)
            {
                recv_vr = cg_cm + pos_cg;
            }
            else
            {
                recv_vr = comm->vbuf2.v;
            }
            dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
                             comm->vbuf.v, nsend,
                             recv_vr,      ind->nrecv[nzone]);

            /* Make the charge group index */
            if (cd->bInPlace)
            {
                zone = (p == 0 ? 0 : nzone - 1);
                while (zone < nzone)
                {
                    for (cg = 0; cg < ind->nrecv[zone]; cg++)
                    {
                        cg_gl              = index_gl[pos_cg];
                        fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
                        nrcg               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
                        cgindex[pos_cg+1]  = cgindex[pos_cg] + nrcg;
                        if (bBondComm)
                        {
                            /* Update the charge group presence,
                             * so we can use it in the next pass of the loop.
                             */
                            comm->bLocalCG[cg_gl] = TRUE;
                        }
                        pos_cg++;
                    }
                    if (p == 0)
                    {
                        comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
                    }
                    zone++;
                    zone_cg_range[nzone+zone] = pos_cg;
                }
            }
            else
            {
                /* This part of the code is never executed with bBondComm. */
                merge_cg_buffers(nzone, cd, p, zone_cg_range,
                                 index_gl, recv_i, cg_cm, recv_vr,
                                 cgindex, fr->cginfo_mb, fr->cginfo);
                pos_cg += ind->nrecv[nzone];
            }
            nat_tot += ind->nrecv[nzone+1];
        }
        if (!cd->bInPlace)
        {
            /* Store the atom block for easy copying of communication buffers */
            make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
        }
        nzone += nzone;
    }
    dd->index_gl = index_gl;
    dd->cgindex  = cgindex;

    dd->ncg_tot          = zone_cg_range[zones->n];
    dd->nat_tot          = nat_tot;
    comm->nat[ddnatHOME] = dd->nat_home;
    for (i = ddnatZONE; i < ddnatNR; i++)
    {
        comm->nat[i] = dd->nat_tot;
    }

    if (!bBondComm)
    {
        /* We don't need to update cginfo, since that was already done above.
         * So we pass NULL for the forcerec.
         */
        dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
                      NULL, comm->bLocalCG);
    }

    if (debug)
    {
        fprintf(debug, "Finished setting up DD communication, zones:");
        for (c = 0; c < zones->n; c++)
        {
            fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
        }
        fprintf(debug, "\n");
    }
}
static void set_cg_boundaries(gmx_domdec_zones_t *zones)
{
    int c;

    for (c = 0; c < zones->nizone; c++)
    {
        zones->izone[c].cg1  = zones->cg_range[c+1];
        zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
        zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
    }
}
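/* Determine the zone limits and bounding boxes for zones zone_start to
 * zone_end, taking staggered DLB grids and triclinic couplings into account.
 */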
static void set_zones_size(gmx_domdec_t *dd,
                           matrix box, const gmx_ddbox_t *ddbox,
                           int zone_start, int zone_end)
{
    gmx_domdec_comm_t  *comm;
    gmx_domdec_zones_t *zones;
    gmx_bool            bDistMB;
    int                 z, zi, d, dim;
    real                rcs, rcmbs;
    int                 i, j;
    real                vol;

    comm = dd->comm;

    zones = &comm->zones;

    /* Do we need to determine extra distances for multi-body bondeds? */
    bDistMB = (comm->bInterCGMultiBody && dlbIsOn(dd->comm) && dd->ndim > 1);

    for (z = zone_start; z < zone_end; z++)
    {
        /* Copy cell limits to zone limits.
         * Valid for non-DD dims and non-shifted dims.
         */
        copy_rvec(comm->cell_x0, zones->size[z].x0);
        copy_rvec(comm->cell_x1, zones->size[z].x1);
    }

    for (d = 0; d < dd->ndim; d++)
    {
        dim = dd->dim[d];

        for (z = 0; z < zones->n; z++)
        {
            /* With a staggered grid we have different sizes
             * for non-shifted dimensions.
             */
            if (dlbIsOn(dd->comm) && zones->shift[z][dim] == 0)
            {
                if (d == 1)
                {
                    zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
                    zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
                }
                else if (d == 2)
                {
                    zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
                    zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
                }
            }
        }

        rcs   = comm->cutoff;
        rcmbs = comm->cutoff_mbody;
        if (ddbox->tric_dir[dim])
        {
            rcs   /= ddbox->skew_fac[dim];
            rcmbs /= ddbox->skew_fac[dim];
        }

        /* Set the lower limit for the shifted zone dimensions */
        for (z = zone_start; z < zone_end; z++)
        {
            if (zones->shift[z][dim] > 0)
            {
                if (!dlbIsOn(dd->comm) || d == 0)
                {
                    zones->size[z].x0[dim] = comm->cell_x1[dim];
                    zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
                }
                else
                {
                    /* Here we take the lower limit of the zone from
                     * the lowest domain of the zone below.
                     */
                    if (z < 4)
                    {
                        zones->size[z].x0[dim] =
                            comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
                    }
                    else
                    {
                        if (d == 1)
                        {
                            zones->size[z].x0[dim] =
                                zones->size[zone_perm[2][z-4]].x0[dim];
                        }
                        else
                        {
                            zones->size[z].x0[dim] =
                                comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
                        }
                    }
                    /* A temporary limit, is updated below */
                    zones->size[z].x1[dim] = zones->size[z].x0[dim];

                    if (bDistMB)
                    {
                        for (zi = 0; zi < zones->nizone; zi++)
                        {
                            if (zones->shift[zi][dim] == 0)
                            {
                                /* This takes the whole zone into account.
                                 * With multiple pulses this will lead
                                 * to a larger zone than strictly necessary.
                                 */
                                zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
                                                                  zones->size[zi].x1[dim]+rcmbs);
                            }
                        }
                    }
                }
            }
        }

        /* Loop over the i-zones to set the upper limit of each
         * j-zone they see.
         */
        for (zi = 0; zi < zones->nizone; zi++)
        {
            if (zones->shift[zi][dim] == 0)
            {
                for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
                {
                    if (zones->shift[z][dim] > 0)
                    {
                        zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
                                                          zones->size[zi].x1[dim]+rcs);
                    }
                }
            }
        }
    }

    for (z = zone_start; z < zone_end; z++)
    {
        /* Initialization only required to keep the compiler happy */
        rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
        int  nc, c;

        /* To determine the bounding box for a zone we need to find
         * the extreme corners of 4, 2 or 1 corners.
         */
        nc = 1 << (ddbox->nboundeddim - 1);

        for (c = 0; c < nc; c++)
        {
            /* Set up a zone corner at x=0, ignoring triclinic couplings */
            corner[XX] = 0;
            if ((c & 1) == 0)
            {
                corner[YY] = zones->size[z].x0[YY];
            }
            else
            {
                corner[YY] = zones->size[z].x1[YY];
            }
            if ((c & 2) == 0)
            {
                corner[ZZ] = zones->size[z].x0[ZZ];
            }
            else
            {
                corner[ZZ] = zones->size[z].x1[ZZ];
            }
            if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->npbcdim &&
                box[ZZ][1 - dd->dim[0]] != 0)
            {
                /* With 1D domain decomposition the cg's are not in
                 * the triclinic box, but triclinic x-y and rectangular y/x-z.
                 * Shift the corner of the z-vector back to along the box
                 * vector of dimension d, so it will later end up at 0 along d.
                 * This can affect the location of this corner along dd->dim[0]
                 * through the matrix operation below if box[d][dd->dim[0]]!=0.
                 */
                int d = 1 - dd->dim[0];

                corner[d] -= corner[ZZ]*box[ZZ][d]/box[ZZ][ZZ];
            }
            /* Apply the triclinic couplings */
            assert(ddbox->npbcdim <= DIM);
            for (i = YY; i < ddbox->npbcdim; i++)
            {
                for (j = XX; j < i; j++)
                {
                    corner[j] += corner[i]*box[i][j]/box[i][i];
                }
            }
            if (c == 0)
            {
                copy_rvec(corner, corner_min);
                copy_rvec(corner, corner_max);
            }
            else
            {
                for (i = 0; i < DIM; i++)
                {
                    corner_min[i] = std::min(corner_min[i], corner[i]);
                    corner_max[i] = std::max(corner_max[i], corner[i]);
                }
            }
        }
        /* Copy the extreme corners without offset along x */
        for (i = 0; i < DIM; i++)
        {
            zones->size[z].bb_x0[i] = corner_min[i];
            zones->size[z].bb_x1[i] = corner_max[i];
        }
        /* Add the offset along x */
        zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
        zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
    }

    if (zone_start == 0)
    {
        vol = 1;
        for (dim = 0; dim < DIM; dim++)
        {
            vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
        }
        zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
    }

    if (debug)
    {
        for (z = zone_start; z < zone_end; z++)
        {
            fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
                    z,
                    zones->size[z].x0[XX], zones->size[z].x1[XX],
                    zones->size[z].x0[YY], zones->size[z].x1[YY],
                    zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
            fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
                    z,
                    zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
                    zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
                    zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
        }
    }
}
static int comp_cgsort(const void *a, const void *b)
{
    int           comp;
    gmx_cgsort_t *cga, *cgb;

    cga = (gmx_cgsort_t *)a;
    cgb = (gmx_cgsort_t *)b;

    comp = cga->nsc - cgb->nsc;
    if (comp == 0)
    {
        comp = cga->ind_gl - cgb->ind_gl;
    }

    return comp;
}
static void order_int_cg(int n, const gmx_cgsort_t *sort,
                         int *a, int *buf)
{
    int i;

    /* Order the data */
    for (i = 0; i < n; i++)
    {
        buf[i] = a[sort[i].ind];
    }

    /* Copy back to the original array */
    for (i = 0; i < n; i++)
    {
        a[i] = buf[i];
    }
}
static void order_vec_cg(int n, const gmx_cgsort_t *sort,
                         rvec *v, rvec *buf)
{
    int i;

    /* Order the data */
    for (i = 0; i < n; i++)
    {
        copy_rvec(v[sort[i].ind], buf[i]);
    }

    /* Copy back to the original array */
    for (i = 0; i < n; i++)
    {
        copy_rvec(buf[i], v[i]);
    }
}
static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
                           rvec *v, rvec *buf)
{
    int a, atot, cg, cg0, cg1, i;

    if (cgindex == NULL)
    {
        /* Avoid the useless loop of the atoms within a cg */
        order_vec_cg(ncg, sort, v, buf);

        return;
    }

    /* Order the data */
    a = 0;
    for (cg = 0; cg < ncg; cg++)
    {
        cg0 = cgindex[sort[cg].ind];
        cg1 = cgindex[sort[cg].ind+1];
        for (i = cg0; i < cg1; i++)
        {
            copy_rvec(v[i], buf[a]);
            a++;
        }
    }
    atot = a;

    /* Copy back to the original array */
    for (a = 0; a < atot; a++)
    {
        copy_rvec(buf[a], v[a]);
    }
}
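/* Merge two sorted charge group lists: sort2 (groups that stayed in their
 * ns grid cell, already ordered) and sort_new (moved groups, qsorted here)
 * into the fully ordered output list sort1.
 */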
static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
                         int nsort_new, gmx_cgsort_t *sort_new,
                         gmx_cgsort_t *sort1)
{
    int i1, i2, i_new;

    /* The new indices are not very ordered, so we qsort them */
    gmx_qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);

    /* sort2 is already ordered, so now we can merge the two arrays */
    i1    = 0;
    i2    = 0;
    i_new = 0;
    while (i2 < nsort2 || i_new < nsort_new)
    {
        if (i2 == nsort2)
        {
            sort1[i1++] = sort_new[i_new++];
        }
        else if (i_new == nsort_new)
        {
            sort1[i1++] = sort2[i2++];
        }
        else if (sort2[i2].nsc < sort_new[i_new].nsc ||
                 (sort2[i2].nsc == sort_new[i_new].nsc &&
                  sort2[i2].ind_gl < sort_new[i_new].ind_gl))
        {
            sort1[i1++] = sort2[i2++];
        }
        else
        {
            sort1[i1++] = sort_new[i_new++];
        }
    }
}
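/* Determine the sort order of the home charge groups for the group cut-off
 * scheme and return the number of charge groups that remain home.
 * When ncg_home_old >= 0 the previous order is reused, so only the charge
 * groups that moved ns grid cell need to be qsorted.
 */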
static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
{
    gmx_domdec_sort_t *sort;
    gmx_cgsort_t      *cgsort, *sort_i;
    int                ncg_new, nsort2, nsort_new, i, *a, moved;

    sort = dd->comm->sort;

    a = fr->ns->grid->cell_index;

    moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns->grid->ncells;

    if (ncg_home_old >= 0)
    {
        /* The charge groups that remained in the same ns grid cell
         * are completely ordered. So we can sort efficiently by sorting
         * the charge groups that did move into the stationary list.
         */
        ncg_new   = 0;
        nsort2    = 0;
        nsort_new = 0;
        for (i = 0; i < dd->ncg_home; i++)
        {
            /* Check if this cg did not move to another node */
            if (a[i] < moved)
            {
                if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
                {
                    /* This cg is new on this node or moved ns grid cell */
                    if (nsort_new >= sort->sort_new_nalloc)
                    {
                        sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
                        srenew(sort->sort_new, sort->sort_new_nalloc);
                    }
                    sort_i = &(sort->sort_new[nsort_new++]);
                }
                else
                {
                    /* This cg did not move */
                    sort_i = &(sort->sort2[nsort2++]);
                }
                /* Sort on the ns grid cell indices
                 * and the global topology index.
                 * index_gl is irrelevant with cell ns,
                 * but we set it here anyhow to avoid a conditional.
                 */
                sort_i->nsc    = a[i];
                sort_i->ind_gl = dd->index_gl[i];
                sort_i->ind    = i;
                ncg_new++;
            }
        }
        if (debug)
        {
            fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
                    nsort2, nsort_new);
        }
        /* Sort efficiently */
        ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
                     sort->sort);
    }
    else
    {
        cgsort  = sort->sort;
        ncg_new = 0;
        for (i = 0; i < dd->ncg_home; i++)
        {
            /* Sort on the ns grid cell indices
             * and the global topology index
             */
            cgsort[i].nsc    = a[i];
            cgsort[i].ind_gl = dd->index_gl[i];
            cgsort[i].ind    = i;
            if (cgsort[i].nsc < moved)
            {
                ncg_new++;
            }
        }
        if (debug)
        {
            fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
        }
        /* Determine the order of the charge groups using qsort */
        gmx_qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
    }

    return ncg_new;
}
static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
{
    gmx_cgsort_t *sort;
    int           ncg_new, i, na;
    const int    *a;

    sort = dd->comm->sort->sort;

    nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);

    ncg_new = 0;
    for (i = 0; i < na; i++)
    {
        if (a[i] >= 0)
        {
            sort[ncg_new].ind = a[i];
            ncg_new++;
        }
    }

    return ncg_new;
}
static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
                          int ncg_home_old)
{
    gmx_domdec_sort_t *sort;
    gmx_cgsort_t      *cgsort;
    int               *cgindex;
    int                ncg_new, i, *ibuf, cgsize;
    rvec              *vbuf;

    sort = dd->comm->sort;

    if (dd->ncg_home > sort->sort_nalloc)
    {
        sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
        srenew(sort->sort, sort->sort_nalloc);
        srenew(sort->sort2, sort->sort_nalloc);
    }
    cgsort = sort->sort;

    switch (fr->cutoff_scheme)
    {
        case ecutsGROUP:
            ncg_new = dd_sort_order(dd, fr, ncg_home_old);
            break;
        case ecutsVERLET:
            ncg_new = dd_sort_order_nbnxn(dd, fr);
            break;
        default:
            gmx_incons("unimplemented");
            ncg_new = 0;
    }

    /* We alloc with the old size, since cgindex is still old */
    vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
    vbuf = dd->comm->vbuf.v;

    if (dd->comm->bCGs)
    {
        cgindex = dd->cgindex;
    }
    else
    {
        cgindex = NULL;
    }

    /* Remove the charge groups which are no longer at home here */
    dd->ncg_home = ncg_new;
    if (debug)
    {
        fprintf(debug, "Set the new home charge group count to %d\n",
                dd->ncg_home);
    }

    /* Reorder the state */
    for (i = 0; i < estNR; i++)
    {
        if (EST_DISTR(i) && (state->flags & (1<<i)))
        {
            switch (i)
            {
                case estX:
                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
                    break;
                case estV:
                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
                    break;
                case estSDX:
                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
                    break;
                case estCGP:
                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
                    break;
                case estLD_RNG:
                case estLD_RNGI:
                case estDISRE_INITF:
                case estDISRE_RM3TAV:
                case estORIRE_INITF:
                case estORIRE_DTAV:
                    /* No ordering required */
                    break;
                default:
                    gmx_incons("Unknown state entry encountered in dd_sort_state");
                    break;
            }
        }
    }
    if (fr->cutoff_scheme == ecutsGROUP)
    {
        /* Reorder cgcm */
        order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
    }

    if (dd->ncg_home+1 > sort->ibuf_nalloc)
    {
        sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
        srenew(sort->ibuf, sort->ibuf_nalloc);
    }
    ibuf = sort->ibuf;
    /* Reorder the global cg index */
    order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
    /* Reorder the cginfo */
    order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
    /* Rebuild the local cg index */
    if (dd->comm->bCGs)
    {
        ibuf[0] = 0;
        for (i = 0; i < dd->ncg_home; i++)
        {
            cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
            ibuf[i+1] = ibuf[i] + cgsize;
        }
        for (i = 0; i < dd->ncg_home+1; i++)
        {
            dd->cgindex[i] = ibuf[i];
        }
    }
    else
    {
        for (i = 0; i < dd->ncg_home+1; i++)
        {
            dd->cgindex[i] = i;
        }
    }
    /* Set the home atom number */
    dd->nat_home = dd->cgindex[dd->ncg_home];

    if (fr->cutoff_scheme == ecutsVERLET)
    {
        /* The atoms are now exactly in grid order, update the grid order */
        nbnxn_set_atomorder(fr->nbv->nbs);
    }
    else
    {
        /* Copy the sorted ns cell indices back to the ns grid struct */
        for (i = 0; i < dd->ncg_home; i++)
        {
            fr->ns->grid->cell_index[i] = cgsort[i].nsc;
        }
        fr->ns->grid->nr = dd->ncg_home;
    }
}
static void add_dd_statistics(gmx_domdec_t *dd)
{
    gmx_domdec_comm_t *comm;
    int                ddnat;

    comm = dd->comm;

    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
    {
        comm->sum_nat[ddnat-ddnatZONE] +=
            comm->nat[ddnat] - comm->nat[ddnat-1];
    }
    comm->ndecomp++;
}
void reset_dd_statistics_counters(gmx_domdec_t *dd)
{
    gmx_domdec_comm_t *comm;
    int                ddnat;

    comm = dd->comm;

    /* Reset all the statistics and counters for total run counting */
    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
    {
        comm->sum_nat[ddnat-ddnatZONE] = 0;
    }
    comm->ndecomp   = 0;
    comm->nload     = 0;
    comm->load_step = 0;
    comm->load_sum  = 0;
    comm->load_max  = 0;
    clear_ivec(comm->load_lim);
    comm->load_mdf = 0;
    comm->load_pme = 0;
}
void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
{
    gmx_domdec_comm_t *comm;
    int                ddnat;
    double             av;

    comm = cr->dd->comm;

    gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);

    if (fplog == NULL)
    {
        return;
    }

    fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");

    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
    {
        av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
        switch (ddnat)
        {
            case ddnatZONE:
                fprintf(fplog,
                        " av. #atoms communicated per step for force:  %d x %.1f\n",
                        2, av);
                break;
            case ddnatVSITE:
                if (cr->dd->vsite_comm)
                {
                    fprintf(fplog,
                            " av. #atoms communicated per step for vsites: %d x %.1f\n",
                            (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
                            av);
                }
                break;
            case ddnatCON:
                if (cr->dd->constraint_comm)
                {
                    fprintf(fplog,
                            " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
                            1 + ir->nLincsIter, av);
                }
                break;
            default:
                gmx_incons(" Unknown type for DD statistics");
        }
    }
    fprintf(fplog, "\n");

    if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
    {
        print_dd_load_av(fplog, cr->dd);
    }
}
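/* Repartition the system over the DD grid: redistribute the charge groups,
 * set up the halo communication, extract the local topology and update all
 * per-rank data structures. This is the main entry point called from the
 * MD loop whenever domain decomposition needs to be redone.
 */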
void dd_partition_system(FILE                *fplog,
                         gmx_int64_t          step,
                         t_commrec           *cr,
                         gmx_bool             bMasterState,
                         int                  nstglobalcomm,
                         t_state             *state_global,
                         gmx_mtop_t          *top_global,
                         t_inputrec          *ir,
                         t_state             *state_local,
                         rvec               **f,
                         t_mdatoms           *mdatoms,
                         gmx_localtop_t      *top_local,
                         t_forcerec          *fr,
                         gmx_vsite_t         *vsite,
                         gmx_shellfc_t       *shellfc,
                         gmx_constr_t         constr,
                         t_nrnb              *nrnb,
                         gmx_wallcycle_t      wcycle,
                         gmx_bool             bVerbose)
{
    gmx_domdec_t      *dd;
    gmx_domdec_comm_t *comm;
    gmx_ddbox_t        ddbox = {0};
    t_block           *cgs_gl;
    gmx_int64_t        step_pcoupl;
    rvec               cell_ns_x0, cell_ns_x1;
    int                i, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
    gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bTurnOnDLB, bLogLoad;
    gmx_bool           bRedist, bSortCG, bResortAll;
    ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
    real               grid_density;
    char               sbuf[22];

    wallcycle_start(wcycle, ewcDOMDEC);

    dd   = cr->dd;
    comm = dd->comm;

    bBoxChanged = (bMasterState || inputrecDeform(ir));
    if (ir->epc != epcNO)
    {
        /* With nstpcouple > 1 pressure coupling happens
         * one step after calculating the pressure.
         * Box scaling happens at the end of the MD step,
         * after the DD partitioning.
         * We therefore have to do DLB in the first partitioning
         * after an MD step where P-coupling occurred.
         * We need to determine the last step in which p-coupling occurred.
         * MRS -- need to validate this for vv?
         */
        n = ir->nstpcouple;
        if (n == 1)
        {
            step_pcoupl = step - 1;
        }
        else
        {
            step_pcoupl = ((step - 1)/n)*n + 1;
        }
        if (step_pcoupl >= comm->partition_step)
        {
            bBoxChanged = TRUE;
        }
    }

    bNStGlobalComm = (step % nstglobalcomm == 0);

    if (!dlbIsOn(comm))
    {
        bDoDLB = FALSE;
    }
    else
    {
        /* Should we do dynamic load balancing this step?
         * Since it requires (possibly expensive) global communication,
         * we might want to do DLB less frequently.
         */
        if (bBoxChanged || ir->epc != epcNO)
        {
            bDoDLB = bBoxChanged;
        }
        else
        {
            bDoDLB = bNStGlobalComm;
        }
    }

    /* Check if we have recorded loads on the nodes */
    if (comm->bRecordLoad && dd_load_count(comm) > 0)
    {
        bCheckWhetherToTurnDlbOn = dd_dlb_get_should_check_whether_to_turn_dlb_on(dd);

        /* Print load every nstlog, first and last step to the log file */
        bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
                    comm->n_load_collect == 0 ||
                    (ir->nsteps >= 0 &&
                     (step + ir->nstlist > ir->init_step + ir->nsteps)));

        /* Avoid extra communication due to verbose screen output
         * when nstglobalcomm is set.
         */
        if (bDoDLB || bLogLoad || bCheckWhetherToTurnDlbOn ||
            (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
        {
            get_load_distribution(dd, wcycle);
            if (DDMASTER(dd))
            {
                if (bLogLoad)
                {
                    dd_print_load(fplog, dd, step-1);
                }
                if (bVerbose)
                {
                    dd_print_load_verbose(dd);
                }
            }
            comm->n_load_collect++;

            if (bCheckWhetherToTurnDlbOn)
            {
                /* Since the timings are node dependent, the master decides */
                if (DDMASTER(dd))
                {
                    /* Here we check if the max PME rank load is more than 0.98
                     * the max PP force load. If so, PP DLB will not help,
                     * since we are (almost) limited by PME. Furthermore,
                     * DLB will cause a significant extra x/f redistribution
                     * cost on the PME ranks, which will then surely result
                     * in lower total performance.
                     * This check might be fragile, since one measurement
                     * below 0.98 (although only done once every 100 DD part.)
                     * could turn on DLB for the rest of the run.
                     */
                    if (cr->npmenodes > 0 &&
                        dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
                    {
                        bTurnOnDLB = FALSE;
                    }
                    else
                    {
                        bTurnOnDLB =
                            (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
                    }
                    if (debug)
                    {
                        fprintf(debug, "step %s, imb loss %f\n",
                                gmx_step_str(step, sbuf),
                                dd_force_imb_perf_loss(dd));
                    }
                }
                dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
                if (bTurnOnDLB)
                {
                    turn_on_dlb(fplog, cr, step);
                    bDoDLB = TRUE;
                }
            }
        }
        comm->n_load_have++;
    }

    cgs_gl = &comm->cgs_gl;

    bRedist = FALSE;
    if (bMasterState)
    {
        /* Clear the old state */
        clear_dd_indices(dd, 0, 0);
        ncgindex_set = 0;

        set_ddbox(dd, bMasterState, cr, ir, state_global->box,
                  TRUE, cgs_gl, state_global->x, &ddbox);

        get_cg_distribution(fplog, dd, cgs_gl,
                            state_global->box, &ddbox, state_global->x);

        dd_distribute_state(dd, cgs_gl,
                            state_global, state_local, f);

        dd_make_local_cgs(dd, &top_local->cgs);

        /* Ensure that we have space for the new distribution */
        dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);

        if (fr->cutoff_scheme == ecutsGROUP)
        {
            calc_cgcm(fplog, 0, dd->ncg_home,
                      &top_local->cgs, state_local->x, fr->cg_cm);
        }

        inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);

        dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
    }
    else if (state_local->ddp_count != dd->ddp_count)
    {
        if (state_local->ddp_count > dd->ddp_count)
        {
            gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
        }

        if (state_local->ddp_count_cg_gl != state_local->ddp_count)
        {
            gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
        }

        /* Clear the old state */
        clear_dd_indices(dd, 0, 0);

        /* Build the new indices */
        rebuild_cgindex(dd, cgs_gl->index, state_local);
        make_dd_indices(dd, cgs_gl->index, 0);
        ncgindex_set = dd->ncg_home;

        if (fr->cutoff_scheme == ecutsGROUP)
        {
            /* Redetermine the cg COMs */
            calc_cgcm(fplog, 0, dd->ncg_home,
                      &top_local->cgs, state_local->x, fr->cg_cm);
        }

        inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);

        dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);

        set_ddbox(dd, bMasterState, cr, ir, state_local->box,
                  TRUE, &top_local->cgs, state_local->x, &ddbox);

        bRedist = dlbIsOn(comm);
    }
    else
    {
        /* We have the full state, only redistribute the cgs */

        /* Clear the non-home indices */
        clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
        ncgindex_set = 0;

        /* Avoid global communication for dim's without pbc and -gcom */
        if (!bNStGlobalComm)
        {
            copy_rvec(comm->box0, ddbox.box0);
            copy_rvec(comm->box_size, ddbox.box_size);
        }
        set_ddbox(dd, bMasterState, cr, ir, state_local->box,
                  bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);

        bBoxChanged = TRUE;
        bRedist     = TRUE;
    }
    /* For dim's without pbc and -gcom */
    copy_rvec(ddbox.box0, comm->box0);
    copy_rvec(ddbox.box_size, comm->box_size);

    set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
                      step, wcycle);

    if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
    {
        write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
    }

    /* Check if we should sort the charge groups */
    if (comm->nstSortCG > 0)
    {
        bSortCG = (bMasterState ||
                   (bRedist && (step % comm->nstSortCG == 0)));
    }
    else
    {
        bSortCG = FALSE;
    }

    ncg_home_old = dd->ncg_home;

    ncg_moved = 0;
    if (bRedist)
    {
        wallcycle_sub_start(wcycle, ewcsDD_REDIST);

        dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
                           state_local, f, fr,
                           !bSortCG, nrnb, &ncgindex_set, &ncg_moved);

        wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
    }

    get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
                          dd, &ddbox,
                          &comm->cell_x0, &comm->cell_x1,
                          dd->ncg_home, fr->cg_cm,
                          cell_ns_x0, cell_ns_x1, &grid_density);

    if (bBoxChanged)
    {
        comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
    }

    switch (fr->cutoff_scheme)
    {
        case ecutsGROUP:
            copy_ivec(fr->ns->grid->n, ncells_old);
            grid_first(fplog, fr->ns->grid, dd, &ddbox,
                       state_local->box, cell_ns_x0, cell_ns_x1,
                       fr->rlist, grid_density);
            break;
        case ecutsVERLET:
            nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
            break;
        default:
            gmx_incons("unimplemented");
    }
    /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
    copy_ivec(ddbox.tric_dir, comm->tric_dir);

    if (bSortCG)
    {
        wallcycle_sub_start(wcycle, ewcsDD_GRID);

        /* Sort the state on charge group position.
         * This enables exact restarts from this step.
         * It also improves performance by about 15% with larger numbers
         * of atoms per node.
         */

        /* Fill the ns grid with the home cell,
         * so we can sort with the indices.
         */
        set_zones_ncg_home(dd);

        switch (fr->cutoff_scheme)
        {
            case ecutsVERLET:
                set_zones_size(dd, state_local->box, &ddbox, 0, 1);

                nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
                                  0,
                                  comm->zones.size[0].bb_x0,
                                  comm->zones.size[0].bb_x1,
                                  0, dd->ncg_home,
                                  comm->zones.dens_zone0,
                                  fr->cginfo,
                                  state_local->x,
                                  ncg_moved, bRedist ? comm->moved : NULL,
                                  fr->nbv->grp[eintLocal].kernel_type,
                                  fr->nbv->grp[eintLocal].nbat);

                nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
                break;
            case ecutsGROUP:
                fill_grid(&comm->zones, fr->ns->grid, dd->ncg_home,
                          0, dd->ncg_home, fr->cg_cm);

                copy_ivec(fr->ns->grid->n, ncells_new);
                break;
            default:
                gmx_incons("unimplemented");
        }

        bResortAll = bMasterState;

        /* Check if we can use the old order and ns grid cell indices
         * of the charge groups to sort the charge groups efficiently.
         */
        if (ncells_new[XX] != ncells_old[XX] ||
            ncells_new[YY] != ncells_old[YY] ||
            ncells_new[ZZ] != ncells_old[ZZ])
        {
            bResortAll = TRUE;
        }

        if (debug)
        {
            fprintf(debug, "Step %s, sorting the %d home charge groups\n",
                    gmx_step_str(step, sbuf), dd->ncg_home);
        }
        dd_sort_state(dd, fr->cg_cm, fr, state_local,
                      bResortAll ? -1 : ncg_home_old);
        /* Rebuild all the indices */
        ga2la_clear(dd->ga2la);
        ncgindex_set = 0;

        wallcycle_sub_stop(wcycle, ewcsDD_GRID);
    }

    wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);

    /* Setup up the communication and communicate the coordinates */
    setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);

    /* Set the indices */
    make_dd_indices(dd, cgs_gl->index, ncgindex_set);

    /* Set the charge group boundaries for neighbor searching */
    set_cg_boundaries(&comm->zones);

    if (fr->cutoff_scheme == ecutsVERLET)
    {
        set_zones_size(dd, state_local->box, &ddbox,
                       bSortCG ? 1 : 0, comm->zones.n);
    }

    wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);

    /*
       write_dd_pdb("dd_home",step,"dump",top_global,cr,
                    -1,state_local->x,state_local->box);
     */

    wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);

    /* Extract a local topology from the global topology */
    for (i = 0; i < dd->ndim; i++)
    {
        np[dd->dim[i]] = comm->cd[i].np;
    }
    dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
                      comm->cellsize_min, np,
                      fr,
                      fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
                      vsite, top_global, top_local);

    wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);

    wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);

    /* Set up the special atom communication */
    n = comm->nat[ddnatZONE];
    for (i = ddnatZONE+1; i < ddnatNR; i++)
    {
        switch (i)
        {
            case ddnatVSITE:
                if (vsite && vsite->n_intercg_vsite)
                {
                    n = dd_make_local_vsites(dd, n, top_local->idef.il);
                }
                break;
            case ddnatCON:
                if (dd->bInterCGcons || dd->bInterCGsettles)
                {
                    /* Only for inter-cg constraints we need special code */
                    n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
                                                  constr, ir->nProjOrder,
                                                  top_local->idef.il);
                }
                break;
            default:
                gmx_incons("Unknown special atom type setup");
        }
        comm->nat[i] = n;
    }

    wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);

    wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);

    /* Make space for the extra coordinates for virtual site
     * or constraint communication.
     */
    state_local->natoms = comm->nat[ddnatNR-1];
    if (state_local->natoms > state_local->nalloc)
    {
        dd_realloc_state(state_local, f, state_local->natoms);
    }

    if (fr->bF_NoVirSum)
    {
        if (vsite && vsite->n_intercg_vsite)
        {
            nat_f_novirsum = comm->nat[ddnatVSITE];
        }
        else
        {
            if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
            {
                nat_f_novirsum = dd->nat_tot;
            }
            else
            {
                nat_f_novirsum = dd->nat_home;
            }
        }
    }
    else
    {
        nat_f_novirsum = 0;
    }

    /* Set the number of atoms required for the force calculation.
     * Forces need to be constrained when doing energy
     * minimization. For simple simulations we could avoid some
     * allocation, zeroing and copying, but this is probably not worth
     * the complications and checking.
     */
    forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
                        dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);

    /* We make the all mdatoms up to nat_tot_con.
     * We could save some work by only setting invmass
     * between nat_tot and nat_tot_con.
     */
    /* This call also sets the new number of home particles to dd->nat_home */
    atoms2md(top_global, ir,
             comm->nat[ddnatCON], dd->gatindex, dd->nat_home, mdatoms);

    /* Now we have the charges we can sort the FE interactions */
    dd_sort_local_top(dd, mdatoms, top_local);

    if (vsite != NULL)
    {
        /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
        split_vsites_over_threads(top_local->idef.il, top_local->idef.iparams,
                                  mdatoms, FALSE, vsite);
    }

    if (shellfc)
    {
        /* Make the local shell stuff, currently no communication is done */
        make_local_shells(cr, mdatoms, shellfc);
    }

    if (ir->implicit_solvent)
    {
        make_local_gb(cr, fr->born, ir->gb_algorithm);
    }

    setup_bonded_threading(fr, &top_local->idef);

    if (!(cr->duty & DUTY_PME))
    {
        /* Send the charges and/or c6/sigmas to our PME only node */
        gmx_pme_send_parameters(cr,
                                fr->ic,
                                mdatoms->nChargePerturbed, mdatoms->nTypePerturbed,
                                mdatoms->chargeA, mdatoms->chargeB,
                                mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
                                mdatoms->sigmaA, mdatoms->sigmaB,
                                dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
    }

    if (constr)
    {
        set_constraints(constr, top_local, ir, mdatoms, cr);
    }

    if (ir->bPull)
    {
        /* Update the local pull groups */
        dd_make_local_pull_groups(cr, ir->pull_work, mdatoms);
    }

    if (ir->bRot)
    {
        /* Update the local rotation groups */
        dd_make_local_rotation_groups(dd, ir->rot);
    }

    if (ir->eSwapCoords != eswapNO)
    {
        /* Update the local groups needed for ion swapping */
        dd_make_local_swap_groups(dd, ir->swap);
    }

    /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
    dd_make_local_IMD_atoms(ir->bIMD, dd, ir->imd);

    add_dd_statistics(dd);

    /* Make sure we only count the cycles for this DD partitioning */
    clear_dd_cycle_counts(dd);

    /* Because the order of the atoms might have changed since
     * the last vsite construction, we need to communicate the constructing
     * atom coordinates again (for spreading the forces this MD step).
     */
    dd_move_x_vsites(dd, state_local->box, state_local->x);

    wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);

    if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
    {
        dd_move_x(dd, state_local->box, state_local->x);
        write_dd_pdb("dd_dump", step, "dump", top_global, cr,
                     -1, state_local->x, state_local->box);
    }

    /* Store the partitioning step */
    comm->partition_step = step;

    /* Increase the DD partitioning counter */
    dd->ddp_count++;
    /* The state currently matches this DD partitioning count, store it */
    state_local->ddp_count = dd->ddp_count;
    if (bMasterState)
    {
        /* The DD master node knows the complete cg distribution,
         * store the count so we can possibly skip the cg info communication.
         */
        comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
    }

    if (comm->DD_debug > 0)
    {
        /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
        check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
                                "after partitioning");
    }

    wallcycle_stop(wcycle, ewcDOMDEC);
}