1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This file is part of Gromacs Copyright (c) 1991-2008
5 * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
12 * To help us fund GROMACS development, we humbly ask that you cite
13 * the research papers on the package. Check out http://www.gromacs.org
16 * Gnomes, ROck Monsters And Chili Sauce
33 /* Margin for setting up the DD grid */
34 #define DD_GRID_MARGIN_PRES_SCALE 1.05
36 static int factorize(int n
,int **fac
,int **mfac
)
40 /* Decompose n in factors */
49 if (ndiv
== 0 || (*fac
)[ndiv
-1] != d
)
63 static bool fits_pme_ratio(int nnodes
,int npme
,float ratio
)
65 return ((double)npme
/(double)nnodes
> 0.95*ratio
);
68 static bool fits_pp_pme_perf(FILE *fplog
,
69 t_inputrec
*ir
,matrix box
,gmx_mtop_t
*mtop
,
70 int nnodes
,int npme
,float ratio
)
72 int ndiv
,*div
,*mdiv
,ldiv
;
74 ndiv
= factorize(nnodes
-npme
,&div
,&mdiv
);
78 /* The check below gives a reasonable division:
79 * factor 5 allowed at 5 or more PP nodes,
80 * factor 7 allowed at 49 or more PP nodes.
82 if (ldiv
> 3 + (int)(pow(nnodes
-npme
,1.0/3.0) + 0.5))
87 /* Does this division gives a reasonable PME load? */
88 return fits_pme_ratio(nnodes
,npme
,ratio
);
91 static int guess_npme(FILE *fplog
,gmx_mtop_t
*mtop
,t_inputrec
*ir
,matrix box
,
98 ratio
= pme_load_estimate(mtop
,ir
,box
);
102 fprintf(fplog
,"Guess for relative PME load: %.2f\n",ratio
);
105 /* We assume the optimal node ratio is close to the load ratio.
106 * The communication load is neglected,
107 * but (hopefully) this will balance out between PP and PME.
110 if (!fits_pme_ratio(nnodes
,nnodes
/2,ratio
))
112 /* We would need more than nnodes/2 PME only nodes,
113 * which is not possible. Since the PME load is very high,
114 * we will not loose much performance when all nodes do PME.
120 /* First try to find npme as a factor of nnodes up to nnodes/3.
121 * We start with a minimum PME node fraction of 1/16
122 * and avoid ratios which lead to large prime factors in nnodes-npme.
124 npme
= (nnodes
+ 15)/16;
125 while (npme
<= nnodes
/3) {
126 if (nnodes
% npme
== 0)
128 /* Note that fits_perf might change the PME grid,
129 * in the current implementation it does not.
131 if (fits_pp_pme_perf(fplog
,ir
,box
,mtop
,nnodes
,npme
,ratio
))
140 /* Try any possible number for npme */
142 while (npme
<= nnodes
/2)
144 /* Note that fits_perf may change the PME grid */
145 if (fits_pp_pme_perf(fplog
,ir
,box
,mtop
,nnodes
,npme
,ratio
))
154 gmx_fatal(FARGS
,"Could not find an appropriate number of separate PME nodes. i.e. >= %5f*#nodes (%d) and <= #nodes/2 (%d) and reasonable performance wise (grid_x=%d, grid_y=%d).\n"
155 "Use the -npme option of mdrun or change the number of processors or the PME grid dimensions, see the manual for details.",
156 ratio
,(int)(0.95*ratio
*nnodes
+0.5),nnodes
/2,ir
->nkx
,ir
->nky
);
157 /* Keep the compiler happy */
165 "Will use %d particle-particle and %d PME only nodes\n"
166 "This is a guess, check the performance at the end of the log file\n",
170 "Will use %d particle-particle and %d PME only nodes\n"
171 "This is a guess, check the performance at the end of the log file\n",
178 static int lcd(int n1
,int n2
)
183 for(i
=2; (i
<=n1
&& i
<=n2
); i
++)
185 if (n1
% i
== 0 && n2
% i
== 0)
194 static int div_up(int n
,int f
)
196 return (n
+ f
- 1)/f
;
199 real
comm_box_frac(ivec dd_nc
,real cutoff
,gmx_ddbox_t
*ddbox
)
207 bt
[i
] = ddbox
->box_size
[i
]*ddbox
->skew_fac
[i
];
208 nw
[i
] = dd_nc
[i
]*cutoff
/bt
[i
];
219 for(j
=i
+1; j
<DIM
; j
++)
223 comm_vol
+= nw
[i
]*nw
[j
]*M_PI
/4;
224 for(k
=j
+1; k
<DIM
; k
++)
228 comm_vol
+= nw
[i
]*nw
[j
]*nw
[k
]*M_PI
/6;
239 static bool inhomogeneous_z(const t_inputrec
*ir
)
241 return ((EEL_PME(ir
->coulombtype
) || ir
->coulombtype
==eelEWALD
) &&
242 ir
->ePBC
==epbcXYZ
&& ir
->ewald_geometry
==eewg3DC
);
245 static float comm_cost_est(gmx_domdec_t
*dd
,real limit
,real cutoff
,
246 matrix box
,gmx_ddbox_t
*ddbox
,
247 int natoms
,t_inputrec
*ir
,
249 int npme_tot
,ivec nc
)
252 int i
,j
,k
,nk
,overlap
;
254 float comm_vol
,comm_vol_xf
,comm_pme
,cost_pbcdx
;
255 /* This is the cost of a pbc_dx call relative to the cost
256 * of communicating the coordinate and force of an atom.
257 * This will be machine dependent.
258 * These factors are for x86 with SMP or Infiniband.
260 float pbcdx_rect_fac
= 0.1;
261 float pbcdx_tric_fac
= 0.2;
263 /* Check the DD algorithm restrictions */
264 if ((ir
->ePBC
== epbcXY
&& ir
->nwall
< 2 && nc
[ZZ
] > 1) ||
265 (ir
->ePBC
== epbcSCREW
&& (nc
[XX
] == 1 || nc
[YY
] > 1 || nc
[ZZ
] > 1)))
270 if (inhomogeneous_z(ir
) && nc
[ZZ
] > 1)
275 /* Check if the triclinic requirements are met */
278 for(j
=i
+1; j
<ddbox
->npbcdim
; j
++)
280 if (box
[j
][i
] != 0 || ir
->deform
[j
][i
] != 0 ||
281 (ir
->epc
!= epcNO
&& ir
->compress
[j
][i
] != 0))
283 if (nc
[j
] > 1 && nc
[i
] == 1)
293 bt
[i
] = ddbox
->box_size
[i
]*ddbox
->skew_fac
[i
];
295 /* Without PBC there are no cell size limits with 2 cells */
296 if (!(i
>= ddbox
->npbcdim
&& nc
[i
] <= 2) && bt
[i
] < nc
[i
]*limit
)
304 /* Will we use 1D or 2D PME decomposition? */
305 npme
[XX
] = (npme_tot
% nc
[XX
] == 0) ? nc
[XX
] : npme_tot
;
306 npme
[YY
] = npme_tot
/npme
[XX
];
309 /* When two dimensions are (nearly) equal, use more cells
310 * for the smallest index, so the decomposition does not
311 * depend sensitively on the rounding of the box elements.
315 for(j
=i
+1; j
<DIM
; j
++)
317 /* Check if dimension i and j are equivalent,
318 * both for PME and for the box size.
319 * The XX/YY check is a bit compact. If nc[YY]==npme[YY]
320 * this means the swapped nc has nc[XX]=npme[XX],
321 * and we can also swap X and Y for PME.
323 if (!((i
== XX
&& j
== YY
&& npme
[YY
] > 1 && nc
[YY
] != npme
[YY
]) ||
324 (i
== YY
&& j
== ZZ
&& npme
[YY
] > 1)) &&
325 fabs(bt
[j
] - bt
[i
]) < 0.01*bt
[i
] && nc
[j
] > nc
[i
])
332 /* This function determines only half of the communication cost.
333 * All PP, PME and PP-PME communication is symmetric
334 * and the "back"-communication cost is identical to the forward cost.
337 comm_vol
= comm_box_frac(nc
,cutoff
,ddbox
);
342 /* Determine the largest volume for PME x/f redistribution */
343 if (nc
[i
] % npme
[i
] != 0)
347 comm_vol_xf
= (npme
[i
]==2 ? 1.0/3.0 : 0.5);
351 comm_vol_xf
= 1.0 - lcd(nc
[i
],npme
[i
])/(double)npme
[i
];
353 comm_pme
+= 3*natoms
*comm_vol_xf
;
356 /* Grid overlap communication */
359 nk
= (i
==0 ? ir
->nkx
: ir
->nky
);
360 overlap
= (nk
% npme
[i
] == 0 ? ir
->pme_order
-1 : ir
->pme_order
);
361 comm_pme
+= npme
[i
]*overlap
*ir
->nkx
*ir
->nky
*ir
->nkz
/nk
;
365 /* PME FFT communication volume.
366 * This only takes the communication into account and not imbalance
367 * in the calculation. But the imbalance in communication and calculation
368 * are similar and therefore these formulas also prefer load balance
369 * in the FFT and pme_solve calculation.
371 comm_pme
+= (npme
[YY
] - 1)*npme
[YY
]*div_up(ir
->nky
,npme
[YY
])*div_up(ir
->nkz
,npme
[YY
])*ir
->nkx
;
372 comm_pme
+= (npme
[XX
] - 1)*npme
[XX
]*div_up(ir
->nkx
,npme
[XX
])*div_up(ir
->nky
,npme
[XX
])*ir
->nkz
;
374 /* Add cost of pbc_dx for bondeds */
376 if ((nc
[XX
] == 1 || nc
[YY
] == 1) || (nc
[ZZ
] == 1 && ir
->ePBC
!= epbcXY
))
378 if ((ddbox
->tric_dir
[XX
] && nc
[XX
] == 1) ||
379 (ddbox
->tric_dir
[YY
] && nc
[YY
] == 1))
381 cost_pbcdx
= pbcdxr
*pbcdx_tric_fac
;
385 cost_pbcdx
= pbcdxr
*pbcdx_rect_fac
;
392 "nc %2d %2d %2d %2d %2d vol pp %6.4f pbcdx %6.4f pme %9.3e tot %9.3e\n",
393 nc
[XX
],nc
[YY
],nc
[ZZ
],npme
[XX
],npme
[YY
],
394 comm_vol
,cost_pbcdx
,comm_pme
,
395 3*natoms
*(comm_vol
+ cost_pbcdx
) + comm_pme
);
398 return 3*natoms
*(comm_vol
+ cost_pbcdx
) + comm_pme
;
401 static void assign_factors(gmx_domdec_t
*dd
,
402 real limit
,real cutoff
,
403 matrix box
,gmx_ddbox_t
*ddbox
,
404 int natoms
,t_inputrec
*ir
,
405 float pbcdxr
,int npme
,
406 int ndiv
,int *div
,int *mdiv
,ivec ir_try
,ivec opt
)
413 ce
= comm_cost_est(dd
,limit
,cutoff
,box
,ddbox
,
414 natoms
,ir
,pbcdxr
,npme
,ir_try
);
415 if (ce
>= 0 && (opt
[XX
] == 0 ||
416 ce
< comm_cost_est(dd
,limit
,cutoff
,box
,ddbox
,
420 copy_ivec(ir_try
,opt
);
426 for(x
=mdiv
[0]; x
>=0; x
--)
430 ir_try
[XX
] *= div
[0];
432 for(y
=mdiv
[0]-x
; y
>=0; y
--)
436 ir_try
[YY
] *= div
[0];
438 for(i
=0; i
<mdiv
[0]-x
-y
; i
++)
440 ir_try
[ZZ
] *= div
[0];
444 assign_factors(dd
,limit
,cutoff
,box
,ddbox
,natoms
,ir
,pbcdxr
,npme
,
445 ndiv
-1,div
+1,mdiv
+1,ir_try
,opt
);
447 for(i
=0; i
<mdiv
[0]-x
-y
; i
++)
449 ir_try
[ZZ
] /= div
[0];
453 ir_try
[YY
] /= div
[0];
458 ir_try
[XX
] /= div
[0];
463 static real
optimize_ncells(FILE *fplog
,
464 int nnodes_tot
,int npme_only
,
465 bool bDynLoadBal
,real dlb_scale
,
466 gmx_mtop_t
*mtop
,matrix box
,gmx_ddbox_t
*ddbox
,
469 real cellsize_limit
,real cutoff
,
470 bool bInterCGBondeds
,bool bInterCGMultiBody
,
473 int npp
,npme
,ndiv
,*div
,*mdiv
,d
,nmax
;
479 limit
= cellsize_limit
;
485 npp
= nnodes_tot
- npme_only
;
486 if (EEL_PME(ir
->coulombtype
))
488 npme
= (npme_only
> 0 ? npme_only
: npp
);
497 /* For Ewald exclusions pbc_dx is not called */
499 (IR_EXCL_FORCES(*ir
) && !EEL_FULL(ir
->coulombtype
));
500 pbcdxr
= (double)n_bonded_dx(mtop
,bExcl_pbcdx
)/(double)mtop
->natoms
;
504 /* Every molecule is a single charge group: no pbc required */
507 /* Add a margin for DLB and/or pressure scaling */
510 if (dlb_scale
>= 1.0)
512 gmx_fatal(FARGS
,"The value for option -dds should be smaller than 1");
516 fprintf(fplog
,"Scaling the initial minimum size with 1/%g (option -dds) = %g\n",dlb_scale
,1/dlb_scale
);
520 else if (ir
->epc
!= epcNO
)
524 fprintf(fplog
,"To account for pressure scaling, scaling the initial minimum size with %g\n",DD_GRID_MARGIN_PRES_SCALE
);
525 limit
*= DD_GRID_MARGIN_PRES_SCALE
;
531 fprintf(fplog
,"Optimizing the DD grid for %d cells with a minimum initial size of %.3f nm\n",npp
,limit
);
533 if (inhomogeneous_z(ir
))
535 fprintf(fplog
,"Ewald_geometry=%s: assuming inhomogeneous particle distribution in z, will not decompose in z.\n",eewg_names
[ir
->ewald_geometry
]);
540 fprintf(fplog
,"The maximum allowed number of cells is:");
543 nmax
= (int)(ddbox
->box_size
[d
]*ddbox
->skew_fac
[d
]/limit
);
544 if (d
>= ddbox
->npbcdim
&& nmax
< 2)
548 if (d
== ZZ
&& inhomogeneous_z(ir
))
552 fprintf(fplog
," %c %d",'X' + d
,nmax
);
560 fprintf(debug
,"Average nr of pbc_dx calls per atom %.2f\n",pbcdxr
);
563 /* Decompose npp in factors */
564 ndiv
= factorize(npp
,&div
,&mdiv
);
570 assign_factors(dd
,limit
,cutoff
,box
,ddbox
,mtop
->natoms
,ir
,pbcdxr
,
571 npme
,ndiv
,div
,mdiv
,itry
,nc
);
579 real
dd_choose_grid(FILE *fplog
,
580 t_commrec
*cr
,gmx_domdec_t
*dd
,t_inputrec
*ir
,
581 gmx_mtop_t
*mtop
,matrix box
,gmx_ddbox_t
*ddbox
,
582 bool bDynLoadBal
,real dlb_scale
,
583 real cellsize_limit
,real cutoff_dd
,
584 bool bInterCGBondeds
,bool bInterCGMultiBody
)
591 if (EEL_PME(ir
->coulombtype
))
593 if (cr
->npmenodes
>= 0)
595 if (cr
->nnodes
<= 2 && cr
->npmenodes
> 0)
598 "Can not have separate PME nodes with 2 or less nodes");
603 if (cr
->nnodes
<= 10)
609 cr
->npmenodes
= guess_npme(fplog
,mtop
,ir
,box
,cr
->nnodes
);
614 fprintf(fplog
,"Using %d separate PME nodes\n",cr
->npmenodes
);
619 if (cr
->npmenodes
< 0)
625 limit
= optimize_ncells(fplog
,cr
->nnodes
,cr
->npmenodes
,
626 bDynLoadBal
,dlb_scale
,
627 mtop
,box
,ddbox
,ir
,dd
,
628 cellsize_limit
,cutoff_dd
,
629 bInterCGBondeds
,bInterCGMultiBody
,
636 /* Communicate the information set by the master to all nodes */
637 gmx_bcast(sizeof(dd
->nc
),dd
->nc
,cr
);
638 if (EEL_PME(ir
->coulombtype
))
640 gmx_bcast(sizeof(ir
->nkx
),&ir
->nkx
,cr
);
641 gmx_bcast(sizeof(ir
->nky
),&ir
->nky
,cr
);
642 gmx_bcast(sizeof(cr
->npmenodes
),&cr
->npmenodes
,cr
);