1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This file is part of Gromacs Copyright (c) 1991-2008
5 * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
12 * To help us fund GROMACS development, we humbly ask that you cite
13 * the research papers on the package. Check out http://www.gromacs.org
16 * Gnomes, ROck Monsters And Chili Sauce
33 /* Margin for setting up the DD grid */
/* 5% head room on the minimum cell size; applied below when pressure
 * coupling is active so cells stay large enough while the box scales. */
34 #define DD_GRID_MARGIN_PRES_SCALE 1.05
/* Decompose n into its (prime) factors, returned via *fac with
 * multiplicities in *mfac.
 * NOTE(review): this block is truncated by the extraction (allocation of
 * *fac/*mfac, the division loop and the return are missing); the visible
 * condition suggests a factor d is appended only when it differs from the
 * last stored factor - confirm against the full source.
 * Returns: presumably the number of distinct factors (ndiv) - TODO confirm. */
36 static int factorize(int n
,int **fac
,int **mfac
)
40 /* Decompose n in factors */
/* Append d as a new factor only if it is not already the last one stored */
49 if (ndiv
== 0 || (*fac
)[ndiv
-1] != d
)
/* Returns TRUE when n is a prime number.
 * NOTE(review): the function body is missing from this extraction;
 * only the signature is visible.  Used below to warn when the total
 * node count is a large prime. */
63 static gmx_bool
is_prime(int n
)
/* Returns the largest common divisor of n1 and n2.
 *
 * Tries every candidate i in [2, min(n1,n2)] and keeps the largest i
 * that divides both; returns 1 when n1 and n2 are coprime.
 * O(min(n1,n2)), which is fine for the small node counts used here.
 * (Restored: the extraction had dropped the declaration, the update of
 * the running divisor and the return statement.)
 */
static int lcd(int n1,int n2)
{
    int i,d;

    d = 1;
    for(i=2; (i<=n1 && i<=n2); i++)
    {
        if (n1 % i == 0 && n2 % i == 0)
        {
            /* i divides both: remember it; the loop runs upward,
             * so the last hit is the largest common divisor. */
            d = i;
        }
    }

    return d;
}
96 static gmx_bool
fits_pme_ratio(int nnodes
,int npme
,float ratio
)
98 return ((double)npme
/(double)nnodes
> 0.95*ratio
);
/* Returns whether splitting nnodes into npme PME-only nodes and
 * nnodes-npme PP nodes is acceptable: no large prime factors in the PP
 * count, a usable common denominator for 2D PME decomposition, and a
 * sufficient PME node fraction.
 * NOTE(review): several body lines are missing from this extraction,
 * notably where ldiv is assigned from the factorization result and the
 * early "return FALSE" branches - verify against the full source. */
101 static gmx_bool
fits_pp_pme_perf(FILE *fplog
,
102 t_inputrec
*ir
,matrix box
,gmx_mtop_t
*mtop
,
103 int nnodes
,int npme
,float ratio
)
105 int ndiv
,*div
,*mdiv
,ldiv
;
106 int npp_root3
,npme_root2
;
/* Prime factorization of the PP node count */
108 ndiv
= factorize(nnodes
-npme
,&div
,&mdiv
);
/* Rounded cube root of the PP count and square root of the PME count,
 * used as scale references in the checks below */
113 npp_root3
= (int)(pow(nnodes
-npme
,1.0/3.0) + 0.5);
114 npme_root2
= (int)(sqrt(npme
) + 0.5);
116 /* The check below gives a reasonable division:
117 * factor 5 allowed at 5 or more PP nodes,
118 * factor 7 allowed at 49 or more PP nodes.
120 if (ldiv
> 3 + npp_root3
)
125 /* Check if the number of PP and PME nodes have a reasonable sized
126 * denominator in common, such that we can use 2D PME decomposition
127 * when required (which requires nx_pp == nx_pme).
128 * The factor of 2 allows for a maximum ratio of 2^2=4
129 * between nx_pme and ny_pme.
131 if (lcd(nnodes
-npme
,npme
)*2 < npme_root2
)
136 /* Does this division gives a reasonable PME load? */
137 return fits_pme_ratio(nnodes
,npme
,ratio
);
/* Guesses a good number of separate PME-only nodes for the given system
 * and total node count, based on the estimated relative PME load.
 * Tries factors of nnodes first (starting at a 1/16 PME fraction, up to
 * nnodes/3), then any count up to nnodes/2; fatals out if nothing fits.
 * NOTE(review): many body lines (braces, returns, the nnodes parameter
 * declaration and the final fprintf calls) are missing from this
 * extraction - verify against the full source. */
140 static int guess_npme(FILE *fplog
,gmx_mtop_t
*mtop
,t_inputrec
*ir
,matrix box
,
/* Estimate the fraction of the total load taken by PME */
147 ratio
= pme_load_estimate(mtop
,ir
,box
);
151 fprintf(fplog
,"Guess for relative PME load: %.2f\n",ratio
);
154 /* We assume the optimal node ratio is close to the load ratio.
155 * The communication load is neglected,
156 * but (hopefully) this will balance out between PP and PME.
159 if (!fits_pme_ratio(nnodes
,nnodes
/2,ratio
))
161 /* We would need more than nnodes/2 PME only nodes,
162 * which is not possible. Since the PME load is very high,
163 * we will not loose much performance when all nodes do PME.
169 /* First try to find npme as a factor of nnodes up to nnodes/3.
170 * We start with a minimum PME node fraction of 1/16
171 * and avoid ratios which lead to large prime factors in nnodes-npme.
173 npme
= (nnodes
+ 15)/16;
174 while (npme
<= nnodes
/3) {
175 if (nnodes
% npme
== 0)
177 /* Note that fits_perf might change the PME grid,
178 * in the current implementation it does not.
180 if (fits_pp_pme_perf(fplog
,ir
,box
,mtop
,nnodes
,npme
,ratio
))
189 /* Try any possible number for npme */
/* Fallback pass: presumably npme restarts from a small value here;
 * the reset is not visible in this extraction */
191 while (npme
<= nnodes
/2)
193 /* Note that fits_perf may change the PME grid */
194 if (fits_pp_pme_perf(fplog
,ir
,box
,mtop
,nnodes
,npme
,ratio
))
203 gmx_fatal(FARGS
,"Could not find an appropriate number of separate PME nodes. i.e. >= %5f*#nodes (%d) and <= #nodes/2 (%d) and reasonable performance wise (grid_x=%d, grid_y=%d).\n"
204 "Use the -npme option of mdrun or change the number of processors or the PME grid dimensions, see the manual for details.",
205 ratio
,(int)(0.95*ratio
*nnodes
+0.5),nnodes
/2,ir
->nkx
,ir
->nky
);
206 /* Keep the compiler happy */
/* The chosen split is reported twice (stderr and log, presumably);
 * the surrounding fprintf calls are missing from this extraction */
214 "Will use %d particle-particle and %d PME only nodes\n"
215 "This is a guess, check the performance at the end of the log file\n",
219 "Will use %d particle-particle and %d PME only nodes\n"
220 "This is a guess, check the performance at the end of the log file\n",
/* Returns n/f rounded up, i.e. ceil(n/f) for n >= 0 and f > 0.
 * (Restored the function braces that the extraction had dropped.) */
static int div_up(int n,int f)
{
    return (n + f - 1)/f;
}
/* Estimates the fraction of the box volume communicated for halo
 * exchange: per dimension the number of cut-off widths per box length
 * (nw), combined into 2D (pi/4) and 3D (pi/6) corner-volume terms.
 * NOTE(review): the declarations of bt/nw/comm_vol, the outer loop over
 * i and the return statement are missing from this extraction - verify
 * against the full source. */
232 real
comm_box_frac(ivec dd_nc
,real cutoff
,gmx_ddbox_t
*ddbox
)
/* Effective (skew-corrected) box length in dimension i */
240 bt
[i
] = ddbox
->box_size
[i
]*ddbox
->skew_fac
[i
];
/* Number of cut-off distances spanned by the cells along i */
241 nw
[i
] = dd_nc
[i
]*cutoff
/bt
[i
];
252 for(j
=i
+1; j
<DIM
; j
++)
/* Edge (2D) contribution for the i,j dimension pair */
256 comm_vol
+= nw
[i
]*nw
[j
]*M_PI
/4;
257 for(k
=j
+1; k
<DIM
; k
++)
/* Corner (3D) contribution for the i,j,k triple */
261 comm_vol
+= nw
[i
]*nw
[j
]*nw
[k
]*M_PI
/6;
272 static gmx_bool
inhomogeneous_z(const t_inputrec
*ir
)
274 return ((EEL_PME(ir
->coulombtype
) || ir
->coulombtype
==eelEWALD
) &&
275 ir
->ePBC
==epbcXYZ
&& ir
->ewald_geometry
==eewg3DC
);
/* Estimates half the relative communication cost of the DD grid nc with
 * npme_tot PME nodes: PP halo volume + PME x/f redistribution + PME grid
 * overlap + PME FFT transpose + pbc_dx cost for bondeds.  Returns the
 * cost, or (presumably) a negative value when the grid violates a DD
 * restriction - the early-return lines are missing from this extraction.
 * NOTE(review): many interior lines (braces, loop headers over i, the
 * npme/bt declarations, returns) are missing - verify against the full
 * source before editing logic. */
278 static float comm_cost_est(gmx_domdec_t
*dd
,real limit
,real cutoff
,
279 matrix box
,gmx_ddbox_t
*ddbox
,
280 int natoms
,t_inputrec
*ir
,
282 int npme_tot
,ivec nc
)
285 int i
,j
,k
,nk
,overlap
;
287 float comm_vol
,comm_vol_xf
,comm_pme
,cost_pbcdx
;
288 /* This is the cost of a pbc_dx call relative to the cost
289 * of communicating the coordinate and force of an atom.
290 * This will be machine dependent.
291 * These factors are for x86 with SMP or Infiniband.
293 float pbcdx_rect_fac
= 0.1;
294 float pbcdx_tric_fac
= 0.2;
296 /* Check the DD algorithm restrictions */
/* 2D PBC with <2 walls cannot decompose z; screw PBC needs x-only DD */
297 if ((ir
->ePBC
== epbcXY
&& ir
->nwall
< 2 && nc
[ZZ
] > 1) ||
298 (ir
->ePBC
== epbcSCREW
&& (nc
[XX
] == 1 || nc
[YY
] > 1 || nc
[ZZ
] > 1)))
/* Slab-like z distribution: refuse decomposition along z */
303 if (inhomogeneous_z(ir
) && nc
[ZZ
] > 1)
308 /* Check if the triclinic requirements are met */
311 for(j
=i
+1; j
<ddbox
->npbcdim
; j
++)
313 if (box
[j
][i
] != 0 || ir
->deform
[j
][i
] != 0 ||
314 (ir
->epc
!= epcNO
&& ir
->compress
[j
][i
] != 0))
316 if (nc
[j
] > 1 && nc
[i
] == 1)
/* Effective (skew-corrected) box length per dimension */
326 bt
[i
] = ddbox
->box_size
[i
]*ddbox
->skew_fac
[i
];
328 /* Without PBC there are no cell size limits with 2 cells */
329 if (!(i
>= ddbox
->npbcdim
&& nc
[i
] <= 2) && bt
[i
] < nc
[i
]*limit
)
337 /* The following choices should match those
338 * in init_domain_decomposition in domdec.c.
340 if (nc
[XX
] == 1 && nc
[YY
] > 1)
345 else if (nc
[YY
] == 1)
352 /* Will we use 1D or 2D PME decomposition? */
353 npme
[XX
] = (npme_tot
% nc
[XX
] == 0) ? nc
[XX
] : npme_tot
;
354 npme
[YY
] = npme_tot
/npme
[XX
];
358 /* When two dimensions are (nearly) equal, use more cells
359 * for the smallest index, so the decomposition does not
360 * depend sensitively on the rounding of the box elements.
364 for(j
=i
+1; j
<DIM
; j
++)
366 /* Check if the box size is nearly identical,
367 * in that case we prefer nx > ny and ny > nz.
369 if (fabs(bt
[j
] - bt
[i
]) < 0.01*bt
[i
] && nc
[j
] > nc
[i
])
371 /* The XX/YY check is a bit compact. If nc[YY]==npme[YY]
372 * this means the swapped nc has nc[XX]==npme[XX],
373 * and we can also swap X and Y for PME.
375 /* Check if dimension i and j are equivalent for PME.
376 * For x/y: if nc[YY]!=npme[YY], we can not swap x/y
377 * For y/z: we can not have PME decomposition in z
380 !((i
== XX
&& j
== YY
&& nc
[YY
] != npme
[YY
]) ||
381 (i
== YY
&& j
== ZZ
&& npme
[YY
] > 1)))
389 /* This function determines only half of the communication cost.
390 * All PP, PME and PP-PME communication is symmetric
391 * and the "back"-communication cost is identical to the forward cost.
394 comm_vol
= comm_box_frac(nc
,cutoff
,ddbox
);
399 /* Determine the largest volume for PME x/f redistribution */
400 if (nc
[i
] % npme
[i
] != 0)
/* Non-commensurate grids: worst-case redistribution fraction */
404 comm_vol_xf
= (npme
[i
]==2 ? 1.0/3.0 : 0.5);
/* Commensurate case: fraction shrinks with the common divisor */
408 comm_vol_xf
= 1.0 - lcd(nc
[i
],npme
[i
])/(double)npme
[i
];
410 comm_pme
+= 3*natoms
*comm_vol_xf
;
413 /* Grid overlap communication */
/* Grid size along this PME-decomposed dimension (x for i==0, else y) */
416 nk
= (i
==0 ? ir
->nkx
: ir
->nky
);
/* One fewer overlap row when the grid divides evenly over the nodes */
417 overlap
= (nk
% npme
[i
] == 0 ? ir
->pme_order
-1 : ir
->pme_order
);
418 comm_pme
+= npme
[i
]*overlap
*ir
->nkx
*ir
->nky
*ir
->nkz
/nk
;
422 /* PME FFT communication volume.
423 * This only takes the communication into account and not imbalance
424 * in the calculation. But the imbalance in communication and calculation
425 * are similar and therefore these formulas also prefer load balance
426 * in the FFT and pme_solve calculation.
428 comm_pme
+= (npme
[YY
] - 1)*npme
[YY
]*div_up(ir
->nky
,npme
[YY
])*div_up(ir
->nkz
,npme
[YY
])*ir
->nkx
;
429 comm_pme
+= (npme
[XX
] - 1)*npme
[XX
]*div_up(ir
->nkx
,npme
[XX
])*div_up(ir
->nky
,npme
[XX
])*ir
->nkz
;
431 /* Add cost of pbc_dx for bondeds */
433 if ((nc
[XX
] == 1 || nc
[YY
] == 1) || (nc
[ZZ
] == 1 && ir
->ePBC
!= epbcXY
))
435 if ((ddbox
->tric_dir
[XX
] && nc
[XX
] == 1) ||
436 (ddbox
->tric_dir
[YY
] && nc
[YY
] == 1))
438 cost_pbcdx
= pbcdxr
*pbcdx_tric_fac
;
442 cost_pbcdx
= pbcdxr
*pbcdx_rect_fac
;
/* Debug dump of the per-grid cost terms; the fprintf head is missing */
449 "nc %2d %2d %2d %2d %2d vol pp %6.4f pbcdx %6.4f pme %9.3e tot %9.3e\n",
450 nc
[XX
],nc
[YY
],nc
[ZZ
],npme
[XX
],npme
[YY
],
451 comm_vol
,cost_pbcdx
,comm_pme
,
452 3*natoms
*(comm_vol
+ cost_pbcdx
) + comm_pme
);
/* Total: atom-communication terms scale with 3*natoms (x and f, 3 dims) */
455 return 3*natoms
*(comm_vol
+ cost_pbcdx
) + comm_pme
;
/* Recursively distributes the prime factors in div/mdiv over the three
 * grid dimensions of ir_try, evaluating every assignment with
 * comm_cost_est and keeping the cheapest grid in opt.
 * ir_try is restored after each branch, so it is unchanged on return.
 * NOTE(review): the recursion base case (ndiv==0 guard), local
 * declarations and braces are missing from this extraction - verify
 * against the full source. */
458 static void assign_factors(gmx_domdec_t
*dd
,
459 real limit
,real cutoff
,
460 matrix box
,gmx_ddbox_t
*ddbox
,
461 int natoms
,t_inputrec
*ir
,
462 float pbcdxr
,int npme
,
463 int ndiv
,int *div
,int *mdiv
,ivec ir_try
,ivec opt
)
/* All factors assigned: score this grid */
470 ce
= comm_cost_est(dd
,limit
,cutoff
,box
,ddbox
,
471 natoms
,ir
,pbcdxr
,npme
,ir_try
);
/* Accept if valid and either first candidate or cheaper than opt */
472 if (ce
>= 0 && (opt
[XX
] == 0 ||
473 ce
< comm_cost_est(dd
,limit
,cutoff
,box
,ddbox
,
477 copy_ivec(ir_try
,opt
);
/* Try every split of the first factor's multiplicity over x, y, z */
483 for(x
=mdiv
[0]; x
>=0; x
--)
487 ir_try
[XX
] *= div
[0];
489 for(y
=mdiv
[0]-x
; y
>=0; y
--)
493 ir_try
[YY
] *= div
[0];
495 for(i
=0; i
<mdiv
[0]-x
-y
; i
++)
497 ir_try
[ZZ
] *= div
[0];
/* Recurse on the remaining factors */
501 assign_factors(dd
,limit
,cutoff
,box
,ddbox
,natoms
,ir
,pbcdxr
,npme
,
502 ndiv
-1,div
+1,mdiv
+1,ir_try
,opt
);
/* Undo the multiplications before trying the next split */
504 for(i
=0; i
<mdiv
[0]-x
-y
; i
++)
506 ir_try
[ZZ
] /= div
[0];
510 ir_try
[YY
] /= div
[0];
515 ir_try
[XX
] /= div
[0];
/* Chooses the optimal DD grid for npp = nnodes_tot - npme_only PP nodes:
 * estimates the pbc_dx load, inflates the minimum cell size for DLB or
 * pressure scaling, reports the per-dimension cell-count limits, then
 * factorizes npp and searches all factor assignments via assign_factors.
 * Returns: presumably the cell size limit actually used - TODO confirm;
 * the return statement is missing from this extraction, as are many
 * braces, loop headers and the declarations of limit/pbcdxr/itry/nc. */
520 static real
optimize_ncells(FILE *fplog
,
521 int nnodes_tot
,int npme_only
,
522 gmx_bool bDynLoadBal
,real dlb_scale
,
523 gmx_mtop_t
*mtop
,matrix box
,gmx_ddbox_t
*ddbox
,
526 real cellsize_limit
,real cutoff
,
527 gmx_bool bInterCGBondeds
,gmx_bool bInterCGMultiBody
,
530 int npp
,npme
,ndiv
,*div
,*mdiv
,d
,nmax
;
531 gmx_bool bExcl_pbcdx
;
536 limit
= cellsize_limit
;
542 npp
= nnodes_tot
- npme_only
;
543 if (EEL_PME(ir
->coulombtype
))
/* With PME but no separate PME nodes, all PP nodes also do PME */
545 npme
= (npme_only
> 0 ? npme_only
: npp
);
554 /* For Ewald exclusions pbc_dx is not called */
556 (IR_EXCL_FORCES(*ir
) && !EEL_FULL(ir
->coulombtype
));
/* Average number of pbc_dx calls per atom for the bondeds */
557 pbcdxr
= (double)n_bonded_dx(mtop
,bExcl_pbcdx
)/(double)mtop
->natoms
;
561 /* Every molecule is a single charge group: no pbc required */
564 /* Add a margin for DLB and/or pressure scaling */
567 if (dlb_scale
>= 1.0)
569 gmx_fatal(FARGS
,"The value for option -dds should be smaller than 1");
573 fprintf(fplog
,"Scaling the initial minimum size with 1/%g (option -dds) = %g\n",dlb_scale
,1/dlb_scale
);
577 else if (ir
->epc
!= epcNO
)
581 fprintf(fplog
,"To account for pressure scaling, scaling the initial minimum size with %g\n",DD_GRID_MARGIN_PRES_SCALE
);
582 limit
*= DD_GRID_MARGIN_PRES_SCALE
;
588 fprintf(fplog
,"Optimizing the DD grid for %d cells with a minimum initial size of %.3f nm\n",npp
,limit
);
590 if (inhomogeneous_z(ir
))
592 fprintf(fplog
,"Ewald_geometry=%s: assuming inhomogeneous particle distribution in z, will not decompose in z.\n",eewg_names
[ir
->ewald_geometry
]);
597 fprintf(fplog
,"The maximum allowed number of cells is:");
/* Per-dimension cell-count limit from the skew-corrected box length */
600 nmax
= (int)(ddbox
->box_size
[d
]*ddbox
->skew_fac
[d
]/limit
);
601 if (d
>= ddbox
->npbcdim
&& nmax
< 2)
605 if (d
== ZZ
&& inhomogeneous_z(ir
))
609 fprintf(fplog
," %c %d",'X' + d
,nmax
);
617 fprintf(debug
,"Average nr of pbc_dx calls per atom %.2f\n",pbcdxr
);
620 /* Decompose npp in factors */
621 ndiv
= factorize(npp
,&div
,&mdiv
);
/* Exhaustive search over all factor assignments; best grid ends in nc */
627 assign_factors(dd
,limit
,cutoff
,box
,ddbox
,mtop
->natoms
,ir
,pbcdxr
,
628 npme
,ndiv
,div
,mdiv
,itry
,nc
);
/* Top-level entry: on the master node chooses the number of separate PME
 * nodes (guessing via guess_npme when not set and nnodes > 10) and the
 * DD grid via optimize_ncells, then broadcasts dd->nc, the PME grid
 * dimensions and cr->npmenodes to all nodes.
 * NOTE(review): this definition continues past the end of the visible
 * source (the final return and closing brace are not shown), and many
 * interior lines (MASTER(cr) guards, braces, the npmenodes=0 defaults)
 * are missing from this extraction - verify against the full source. */
636 real
dd_choose_grid(FILE *fplog
,
637 t_commrec
*cr
,gmx_domdec_t
*dd
,t_inputrec
*ir
,
638 gmx_mtop_t
*mtop
,matrix box
,gmx_ddbox_t
*ddbox
,
639 gmx_bool bDynLoadBal
,real dlb_scale
,
640 real cellsize_limit
,real cutoff_dd
,
641 gmx_bool bInterCGBondeds
,gmx_bool bInterCGMultiBody
)
/* Large prime node counts force a 1D decomposition: warn the user */
648 if (cr
->nnodes
> 12 && is_prime(cr
->nnodes
))
650 gmx_fatal(FARGS
,"The number of nodes you selected (%d) is a large prime. In most cases this will lead to bad performance. Choose a non-prime number, or set the decomposition (option -dd) manually.",cr
->nnodes
);
653 if (EEL_PME(ir
->coulombtype
))
/* npmenodes >= 0 means the user fixed it with -npme */
655 if (cr
->npmenodes
>= 0)
657 if (cr
->nnodes
<= 2 && cr
->npmenodes
> 0)
660 "Can not have separate PME nodes with 2 or less nodes");
/* Few nodes: presumably no separate PME nodes are used - confirm */
665 if (cr
->nnodes
<= 10)
671 cr
->npmenodes
= guess_npme(fplog
,mtop
,ir
,box
,cr
->nnodes
);
676 fprintf(fplog
,"Using %d separate PME nodes\n",cr
->npmenodes
);
681 if (cr
->npmenodes
< 0)
687 limit
= optimize_ncells(fplog
,cr
->nnodes
,cr
->npmenodes
,
688 bDynLoadBal
,dlb_scale
,
689 mtop
,box
,ddbox
,ir
,dd
,
690 cellsize_limit
,cutoff_dd
,
691 bInterCGBondeds
,bInterCGMultiBody
,
698 /* Communicate the information set by the master to all nodes */
699 gmx_bcast(sizeof(dd
->nc
),dd
->nc
,cr
);
700 if (EEL_PME(ir
->coulombtype
))
702 gmx_bcast(sizeof(ir
->nkx
),&ir
->nkx
,cr
);
703 gmx_bcast(sizeof(ir
->nky
),&ir
->nky
,cr
);
704 gmx_bcast(sizeof(cr
->npmenodes
),&cr
->npmenodes
,cr
);