2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2009-2018, The GROMACS development team.
5 * Copyright (c) 2019,2020, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
51 #ifdef HAVE_SYS_TIME_H
52 # include <sys/time.h>
55 #include "gromacs/commandline/pargs.h"
56 #include "gromacs/ewald/pme.h"
57 #include "gromacs/fft/calcgrid.h"
58 #include "gromacs/fileio/checkpoint.h"
59 #include "gromacs/fileio/tpxio.h"
60 #include "gromacs/math/utilities.h"
61 #include "gromacs/math/vec.h"
62 #include "gromacs/mdlib/perf_est.h"
63 #include "gromacs/mdtypes/commrec.h"
64 #include "gromacs/mdtypes/inputrec.h"
65 #include "gromacs/mdtypes/md_enums.h"
66 #include "gromacs/mdtypes/state.h"
67 #include "gromacs/pbcutil/pbc.h"
68 #include "gromacs/taskassignment/usergpuids.h"
69 #include "gromacs/timing/walltime_accounting.h"
70 #include "gromacs/topology/topology.h"
71 #include "gromacs/utility/arraysize.h"
72 #include "gromacs/utility/baseversion.h"
73 #include "gromacs/utility/cstringutil.h"
74 #include "gromacs/utility/fatalerror.h"
75 #include "gromacs/utility/futil.h"
76 #include "gromacs/utility/gmxassert.h"
77 #include "gromacs/utility/path.h"
78 #include "gromacs/utility/smalloc.h"
79 #include "gromacs/utility/stringutil.h"
81 /* Enum for situations that can occur during log file parsing, the
82 * corresponding string entries can be found in do_the_tests() in
83 * const char* ParseLog[] */
84 /* TODO clean up CamelCasing of these enum names */
91 eParselogResetProblem
,
95 eParselogLargePrimeFactor
,
96 eParselogMismatchOfNumberOfPPRanksAndAvailableGPUs
,
105 int nPMEnodes
; /* number of PME-only nodes used in this test */
106 int nx
, ny
, nz
; /* DD grid */
107 int guessPME
; /* if nPMEnodes == -1, this is the guessed number of PME nodes */
108 double* Gcycles
; /* This can contain more than one value if doing multiple tests */
112 float* PME_f_load
; /* PME mesh/force load average*/
113 float PME_f_load_Av
; /* Average average ;) ... */
114 char* mdrun_cmd_line
; /* Mdrun command line used for this test */
120 int nr_inputfiles
; /* The number of tpr and mdp input files */
121 int64_t orig_sim_steps
; /* Number of steps to be done in the real simulation */
122 int64_t orig_init_step
; /* Init step for the real simulation */
123 real
* rcoulomb
; /* The coulomb radii [0...nr_inputfiles] */
124 real
* rvdw
; /* The vdW radii */
125 real
* rlist
; /* Neighbourlist cutoff radius */
126 int * nkx
, *nky
, *nkz
;
127 real
* fsx
, *fsy
, *fsz
; /* Fourierspacing in x,y,z dimension */
131 static void sep_line(FILE* fp
)
133 fprintf(fp
, "\n------------------------------------------------------------\n");
137 /* Wrapper for system calls */
138 static int gmx_system_call(char* command
)
140 return (system(command
));
144 /* Check if string starts with substring */
145 static gmx_bool
str_starts(const char* string
, const char* substring
)
147 return (std::strncmp(string
, substring
, std::strlen(substring
)) == 0);
151 static void cleandata(t_perf
* perfdata
, int test_nr
)
153 perfdata
->Gcycles
[test_nr
] = 0.0;
154 perfdata
->ns_per_day
[test_nr
] = 0.0;
155 perfdata
->PME_f_load
[test_nr
] = 0.0;
159 static void remove_if_exists(const char* fn
)
163 fprintf(stdout
, "Deleting %s\n", fn
);
169 static void finalize(const char* fn_out
)
175 fp
= fopen(fn_out
, "r");
176 fprintf(stdout
, "\n\n");
178 while (fgets(buf
, STRLEN
- 1, fp
) != nullptr)
180 fprintf(stdout
, "%s", buf
);
183 fprintf(stdout
, "\n\n");
195 static int parse_logfile(const char* logfile
,
204 char line
[STRLEN
], dumstring
[STRLEN
], dumstring2
[STRLEN
];
205 const char matchstrdd
[] = "Domain decomposition grid";
206 const char matchstrcr
[] = "resetting all time and cycle counters";
207 const char matchstrbal
[] = "Average PME mesh/force load:";
208 const char matchstring
[] = "R E A L C Y C L E A N D T I M E A C C O U N T I N G";
209 const char errSIG
[] = "signal, stopping at the next";
211 float dum1
, dum2
, dum3
, dum4
;
214 int64_t resetsteps
= -1;
215 gmx_bool bFoundResetStr
= FALSE
;
216 gmx_bool bResetChecked
= FALSE
;
219 if (!gmx_fexist(logfile
))
221 fprintf(stderr
, "WARNING: Could not find logfile %s.\n", logfile
);
222 cleandata(perfdata
, test_nr
);
223 return eParselogNotFound
;
226 fp
= fopen(logfile
, "r");
227 perfdata
->PME_f_load
[test_nr
] = -1.0;
228 perfdata
->guessPME
= -1;
230 iFound
= eFoundNothing
;
233 iFound
= eFoundDDStr
; /* Skip some case statements */
236 while (fgets(line
, STRLEN
, fp
) != nullptr)
238 /* Remove leading spaces */
241 /* Check for TERM and INT signals from user: */
242 if (std::strstr(line
, errSIG
) != nullptr)
245 cleandata(perfdata
, test_nr
);
246 return eParselogTerm
;
249 /* Check whether cycle resetting worked */
250 if (presteps
> 0 && !bFoundResetStr
)
252 if (std::strstr(line
, matchstrcr
) != nullptr)
254 sprintf(dumstring
, "step %s", "%" SCNd64
);
255 sscanf(line
, dumstring
, &resetsteps
);
256 bFoundResetStr
= TRUE
;
257 if (resetsteps
== presteps
+ cpt_steps
)
259 bResetChecked
= TRUE
;
263 sprintf(dumstring
, "%" PRId64
, resetsteps
);
264 sprintf(dumstring2
, "%" PRId64
, presteps
+ cpt_steps
);
266 "WARNING: Time step counters were reset at step %s,\n"
267 " though they were supposed to be reset at step %s!\n",
268 dumstring
, dumstring2
);
273 /* Look for strings that appear in a certain order in the log file: */
277 /* Look for domain decomp grid and separate PME nodes: */
278 if (str_starts(line
, matchstrdd
))
280 sscanf(line
, "Domain decomposition grid %d x %d x %d, separate PME ranks %d",
281 &(perfdata
->nx
), &(perfdata
->ny
), &(perfdata
->nz
), &npme
);
282 if (perfdata
->nPMEnodes
== -1)
284 perfdata
->guessPME
= npme
;
286 else if (perfdata
->nPMEnodes
!= npme
)
289 "PME ranks from command line and output file are not identical");
291 iFound
= eFoundDDStr
;
293 /* Catch a few errors that might have occurred: */
294 else if (str_starts(line
, "There is no domain decomposition for"))
297 return eParselogNoDDGrid
;
299 else if (str_starts(line
, "The number of ranks you selected"))
302 return eParselogLargePrimeFactor
;
304 else if (str_starts(line
, "reading tpx file"))
307 return eParselogTPXVersion
;
309 else if (str_starts(line
, "The -dd or -npme option request a parallel simulation"))
312 return eParselogNotParallel
;
316 /* Even after the "Domain decomposition grid" string was found,
317 * it could be that mdrun had to quit due to some error. */
318 if (str_starts(line
, "Incorrect launch configuration: mismatching number of"))
321 return eParselogMismatchOfNumberOfPPRanksAndAvailableGPUs
;
323 else if (str_starts(line
, "Some of the requested GPUs do not exist"))
326 return eParselogGpuProblem
;
328 /* Look for PME mesh/force balance (not necessarily present, though) */
329 else if (str_starts(line
, matchstrbal
))
331 sscanf(&line
[std::strlen(matchstrbal
)], "%f", &(perfdata
->PME_f_load
[test_nr
]));
333 /* Look for matchstring */
334 else if (str_starts(line
, matchstring
))
336 iFound
= eFoundAccountingStr
;
339 case eFoundAccountingStr
:
340 /* Already found matchstring - look for cycle data */
341 if (str_starts(line
, "Total "))
343 sscanf(line
, "Total %*f %lf", &(perfdata
->Gcycles
[test_nr
]));
344 iFound
= eFoundCycleStr
;
348 /* Already found cycle data - look for remaining performance info and return */
349 if (str_starts(line
, "Performance:"))
351 ndum
= sscanf(line
, "%s %f %f %f %f", dumstring
, &dum1
, &dum2
, &dum3
, &dum4
);
352 /* (ns/day) is the second last entry, depending on whether GMX_DETAILED_PERF_STATS was set in print_perf(), nrnb.c */
353 perfdata
->ns_per_day
[test_nr
] = (ndum
== 5) ? dum3
: dum1
;
355 if (bResetChecked
|| presteps
== 0)
361 return eParselogResetProblem
;
368 /* Close the log file */
371 /* Check why there is no performance data in the log file.
372 * Did a fatal errors occur? */
373 if (gmx_fexist(errfile
))
375 fp
= fopen(errfile
, "r");
376 while (fgets(line
, STRLEN
, fp
) != nullptr)
378 if (str_starts(line
, "Fatal error:"))
380 if (fgets(line
, STRLEN
, fp
) != nullptr)
383 "\nWARNING: An error occurred during this benchmark:\n"
388 cleandata(perfdata
, test_nr
);
389 return eParselogFatal
;
396 fprintf(stderr
, "WARNING: Could not find stderr file %s.\n", errfile
);
399 /* Giving up ... we could not find out why there is no performance data in
401 fprintf(stdout
, "No performance data in log file.\n");
402 cleandata(perfdata
, test_nr
);
404 return eParselogNoPerfData
;
408 static gmx_bool
analyze_data(FILE* fp
,
416 int* index_tpr
, /* OUT: Nr of mdp file with best settings */
417 int* npme_optimal
) /* OUT: Optimal number of PME nodes */
420 int line
= 0, line_win
= -1;
421 int k_win
= -1, i_win
= -1, winPME
;
422 double s
= 0.0; /* standard deviation */
425 char str_PME_f_load
[13];
426 gmx_bool bCanUseOrigTPR
;
427 gmx_bool bRefinedCoul
, bRefinedVdW
, bRefinedGrid
;
433 fprintf(fp
, "Summary of successful runs:\n");
434 fprintf(fp
, "Line tpr PME ranks Gcycles Av. Std.dev. ns/day PME/f");
437 fprintf(fp
, " DD grid");
443 for (k
= 0; k
< ntprs
; k
++)
445 for (i
= 0; i
< ntests
; i
++)
447 /* Select the right dataset: */
448 pd
= &(perfdata
[k
][i
]);
450 pd
->Gcycles_Av
= 0.0;
451 pd
->PME_f_load_Av
= 0.0;
452 pd
->ns_per_day_Av
= 0.0;
454 if (pd
->nPMEnodes
== -1)
456 sprintf(strbuf
, "(%3d)", pd
->guessPME
);
460 sprintf(strbuf
, " ");
463 /* Get the average run time of a setting */
464 for (j
= 0; j
< nrepeats
; j
++)
466 pd
->Gcycles_Av
+= pd
->Gcycles
[j
];
467 pd
->PME_f_load_Av
+= pd
->PME_f_load
[j
];
469 pd
->Gcycles_Av
/= nrepeats
;
470 pd
->PME_f_load_Av
/= nrepeats
;
472 for (j
= 0; j
< nrepeats
; j
++)
474 if (pd
->ns_per_day
[j
] > 0.0)
476 pd
->ns_per_day_Av
+= pd
->ns_per_day
[j
];
480 /* Somehow the performance number was not aquired for this run,
481 * therefor set the average to some negative value: */
482 pd
->ns_per_day_Av
= -1.0F
* nrepeats
;
486 pd
->ns_per_day_Av
/= nrepeats
;
489 if (pd
->PME_f_load_Av
> 0.0)
491 sprintf(str_PME_f_load
, "%12.3f", pd
->PME_f_load_Av
);
495 sprintf(str_PME_f_load
, "%s", " - ");
499 /* We assume we had a successful run if both averages are positive */
500 if (pd
->Gcycles_Av
> 0.0 && pd
->ns_per_day_Av
> 0.0)
502 /* Output statistics if repeats were done */
505 /* Calculate the standard deviation */
507 for (j
= 0; j
< nrepeats
; j
++)
509 s
+= gmx::square(pd
->Gcycles
[j
] - pd
->Gcycles_Av
);
514 fprintf(fp
, "%4d %3d %4d%s %12.3f %12.3f %12.3f %s", line
, k
, pd
->nPMEnodes
,
515 strbuf
, pd
->Gcycles_Av
, s
, pd
->ns_per_day_Av
, str_PME_f_load
);
518 fprintf(fp
, " %3d %3d %3d", pd
->nx
, pd
->ny
, pd
->nz
);
522 /* Store the index of the best run found so far in 'winner': */
523 if ((k_win
== -1) || (pd
->Gcycles_Av
< perfdata
[k_win
][i_win
].Gcycles_Av
))
536 gmx_fatal(FARGS
, "None of the runs was successful! Check %s for problems.", fn
);
541 winPME
= perfdata
[k_win
][i_win
].nPMEnodes
;
545 /* We stuck to a fixed number of PME-only nodes */
546 sprintf(strbuf
, "settings No. %d", k_win
);
550 /* We have optimized the number of PME-only nodes */
553 sprintf(strbuf
, "%s", "the automatic number of PME ranks");
557 sprintf(strbuf
, "%d PME ranks", winPME
);
560 fprintf(fp
, "Best performance was achieved with %s", strbuf
);
561 if ((nrepeats
> 1) && (ntests
> 1))
563 fprintf(fp
, " (see line %d)", line_win
);
567 /* Only mention settings if they were modified: */
568 bRefinedCoul
= !gmx_within_tol(info
->rcoulomb
[k_win
], info
->rcoulomb
[0], GMX_REAL_EPS
);
569 bRefinedVdW
= !gmx_within_tol(info
->rvdw
[k_win
], info
->rvdw
[0], GMX_REAL_EPS
);
570 bRefinedGrid
= !(info
->nkx
[k_win
] == info
->nkx
[0] && info
->nky
[k_win
] == info
->nky
[0]
571 && info
->nkz
[k_win
] == info
->nkz
[0]);
573 if (bRefinedCoul
|| bRefinedVdW
|| bRefinedGrid
)
575 fprintf(fp
, "Optimized PME settings:\n");
576 bCanUseOrigTPR
= FALSE
;
580 bCanUseOrigTPR
= TRUE
;
585 fprintf(fp
, " New Coulomb radius: %f nm (was %f nm)\n", info
->rcoulomb
[k_win
],
591 fprintf(fp
, " New Van der Waals radius: %f nm (was %f nm)\n", info
->rvdw
[k_win
], info
->rvdw
[0]);
596 fprintf(fp
, " New Fourier grid xyz: %d %d %d (was %d %d %d)\n", info
->nkx
[k_win
],
597 info
->nky
[k_win
], info
->nkz
[k_win
], info
->nkx
[0], info
->nky
[0], info
->nkz
[0]);
600 if (bCanUseOrigTPR
&& ntprs
> 1)
602 fprintf(fp
, "and original PME settings.\n");
607 /* Return the index of the mdp file that showed the highest performance
608 * and the optimal number of PME nodes */
610 *npme_optimal
= winPME
;
612 return bCanUseOrigTPR
;
616 /* Get the commands we need to set up the runs from environment variables */
617 static void get_program_paths(gmx_bool bThreads
, char* cmd_mpirun
[], char* cmd_mdrun
[])
620 const char def_mpirun
[] = "mpirun";
622 const char empty_mpirun
[] = "";
624 /* Get the commands we need to set up the runs from environment variables */
627 if ((cp
= getenv("MPIRUN")) != nullptr)
629 *cmd_mpirun
= gmx_strdup(cp
);
633 *cmd_mpirun
= gmx_strdup(def_mpirun
);
638 *cmd_mpirun
= gmx_strdup(empty_mpirun
);
641 if (*cmd_mdrun
== nullptr)
643 /* The use of MDRUN is deprecated, but made available in 5.1
644 for backward compatibility. It may be removed in a future
646 if ((cp
= getenv("MDRUN")) != nullptr)
648 *cmd_mdrun
= gmx_strdup(cp
);
652 gmx_fatal(FARGS
, "The way to call mdrun must be set in the -mdrun command-line flag.");
657 /* Check that the commands will run mdrun (perhaps via mpirun) by
658 * running a very quick test simulation. Requires MPI environment or
659 * GPU support to be available if applicable. */
660 /* TODO implement feature to parse the log file to get the list of
661 compatible GPUs from mdrun, if the user of gmx tune-pme has not
663 static void check_mdrun_works(gmx_bool bThreads
,
664 const char* cmd_mpirun
,
666 const char* cmd_mdrun
,
667 gmx_bool bNeedGpuSupport
)
669 char* command
= nullptr;
673 const char filename
[] = "benchtest.log";
675 /* This string should always be identical to the one in copyrite.c,
676 * gmx_print_version_info() in the GMX_MPI section */
677 const char match_mpi
[] = "MPI library: MPI";
678 const char match_mdrun
[] = "Executable: ";
679 const char match_nogpu
[] = "GPU support: disabled";
680 gmx_bool bMdrun
= FALSE
;
681 gmx_bool bMPI
= FALSE
;
682 gmx_bool bHaveGpuSupport
= TRUE
;
684 /* Run a small test to see whether mpirun + mdrun work */
685 fprintf(stdout
, "Making sure that mdrun can be executed. ");
688 snew(command
, std::strlen(cmd_mdrun
) + std::strlen(cmd_np
) + std::strlen(filename
) + 50);
689 sprintf(command
, "%s%s -version -maxh 0.001 1> %s 2>&1", cmd_mdrun
, cmd_np
, filename
);
693 snew(command
, std::strlen(cmd_mpirun
) + std::strlen(cmd_np
) + std::strlen(cmd_mdrun
)
694 + std::strlen(filename
) + 50);
695 sprintf(command
, "%s%s%s -version -maxh 0.001 1> %s 2>&1", cmd_mpirun
, cmd_np
, cmd_mdrun
, filename
);
697 fprintf(stdout
, "Trying '%s' ... ", command
);
698 make_backup(filename
);
699 gmx_system_call(command
);
701 /* Check if we find the characteristic string in the output: */
702 if (!gmx_fexist(filename
))
704 gmx_fatal(FARGS
, "Output from test run could not be found.");
707 fp
= fopen(filename
, "r");
708 /* We need to scan the whole output file, since sometimes the queuing system
709 * also writes stuff to stdout/err */
712 cp
= fgets(line
, STRLEN
, fp
);
715 if (str_starts(line
, match_mdrun
))
719 if (str_starts(line
, match_mpi
))
723 if (str_starts(line
, match_nogpu
))
725 bHaveGpuSupport
= FALSE
;
736 "Need a threaded version of mdrun. This one\n"
738 "seems to have been compiled with MPI instead.",
747 "Need an MPI-enabled version of mdrun. This one\n"
749 "seems to have been compiled without MPI support.",
756 gmx_fatal(FARGS
, "Cannot execute mdrun. Please check %s for problems!", filename
);
759 if (bNeedGpuSupport
&& !bHaveGpuSupport
)
761 gmx_fatal(FARGS
, "The mdrun executable did not have the expected GPU support.");
764 fprintf(stdout
, "passed.\n");
771 /* Handles the no-GPU case by emitting an empty string. */
772 static std::string
make_gpu_id_command_line(const char* eligible_gpu_ids
)
774 /* If the user has given no eligible GPU IDs, or we're trying the
775 * default behaviour, then there is nothing for tune_pme to give
776 * to mdrun -gpu_id */
777 if (eligible_gpu_ids
!= nullptr)
779 return gmx::formatString("-gpu_id %s", eligible_gpu_ids
);
783 return std::string();
786 static void launch_simulation(gmx_bool bLaunch
, /* Should the simulation be launched? */
787 FILE* fp
, /* General log file */
788 gmx_bool bThreads
, /* whether to use threads */
789 char* cmd_mpirun
, /* Command for mpirun */
790 char* cmd_np
, /* Switch for -np or -ntmpi or empty */
791 char* cmd_mdrun
, /* Command for mdrun */
792 char* args_for_mdrun
, /* Arguments for mdrun */
793 const char* simulation_tpr
, /* This tpr will be simulated */
794 int nPMEnodes
, /* Number of PME ranks to use */
795 const char* eligible_gpu_ids
) /* Available GPU IDs for
796 * constructing mdrun command lines */
801 /* Make enough space for the system call command,
802 * (200 extra chars for -npme ... etc. options should suffice): */
803 snew(command
, std::strlen(cmd_mpirun
) + std::strlen(cmd_mdrun
) + std::strlen(cmd_np
)
804 + std::strlen(args_for_mdrun
) + std::strlen(simulation_tpr
) + 200);
806 auto cmd_gpu_ids
= make_gpu_id_command_line(eligible_gpu_ids
);
808 /* Note that the -passall options requires args_for_mdrun to be at the end
809 * of the command line string */
812 sprintf(command
, "%s%s-npme %d -s %s %s %s", cmd_mdrun
, cmd_np
, nPMEnodes
, simulation_tpr
,
813 args_for_mdrun
, cmd_gpu_ids
.c_str());
817 sprintf(command
, "%s%s%s -npme %d -s %s %s %s", cmd_mpirun
, cmd_np
, cmd_mdrun
, nPMEnodes
,
818 simulation_tpr
, args_for_mdrun
, cmd_gpu_ids
.c_str());
821 fprintf(fp
, "%s this command line to launch the simulation:\n\n%s",
822 bLaunch
? "Using" : "Please use", command
);
826 /* Now the real thing! */
829 fprintf(stdout
, "\nLaunching simulation with best parameters now.\nExecuting '%s'", command
);
832 gmx_system_call(command
);
837 static void modify_PMEsettings(int64_t simsteps
, /* Set this value as number of time steps */
838 int64_t init_step
, /* Set this value as init_step */
839 const char* fn_best_tpr
, /* tpr file with the best performance */
840 const char* fn_sim_tpr
) /* name of tpr file to be launched */
846 t_inputrec irInstance
;
847 t_inputrec
* ir
= &irInstance
;
848 read_tpx_state(fn_best_tpr
, ir
, &state
, &mtop
);
850 /* Reset nsteps and init_step to the value of the input .tpr file */
851 ir
->nsteps
= simsteps
;
852 ir
->init_step
= init_step
;
854 /* Write the tpr file which will be launched */
855 sprintf(buf
, "Writing optimized simulation file %s with nsteps=%s.\n", fn_sim_tpr
, "%" PRId64
);
856 fprintf(stdout
, buf
, ir
->nsteps
);
858 write_tpx_state(fn_sim_tpr
, ir
, &state
, &mtop
);
861 static gmx_bool
can_scale_rvdw(int vdwtype
)
863 return (evdwCUT
== vdwtype
|| evdwPME
== vdwtype
);
866 #define EPME_SWITCHED(e) ((e) == eelPMESWITCH || (e) == eelPMEUSERSWITCH)
868 /* Make additional TPR files with more computational load for the
869 * direct space processors: */
870 static void make_benchmark_tprs(const char* fn_sim_tpr
, /* READ : User-provided tpr file */
871 char* fn_bench_tprs
[], /* WRITE: Names of benchmark tpr files */
872 int64_t benchsteps
, /* Number of time steps for benchmark runs */
873 int64_t statesteps
, /* Step counter in checkpoint file */
874 real rmin
, /* Minimal Coulomb radius */
875 real rmax
, /* Maximal Coulomb radius */
876 bool bScaleRvdw
, /* Scale rvdw along with rcoulomb */
877 const int* ntprs
, /* No. of TPRs to write, each with a different
878 rcoulomb and fourierspacing */
879 t_inputinfo
* info
, /* Contains information about mdp file options */
880 FILE* fp
) /* Write the output here */
885 real nlist_buffer
; /* Thickness of the buffer regions for PME-switch potentials */
888 gmx_bool bNote
= FALSE
;
889 real add
; /* Add this to rcoul for the next test */
890 real fac
= 1.0; /* Scaling factor for Coulomb radius */
891 real fourierspacing
; /* Basic fourierspacing from tpr */
894 sprintf(buf
, "Making benchmark tpr file%s with %s time step%s", *ntprs
> 1 ? "s" : "",
895 "%" PRId64
, benchsteps
> 1 ? "s" : "");
896 fprintf(stdout
, buf
, benchsteps
);
899 sprintf(buf
, " (adding %s steps from checkpoint file)", "%" PRId64
);
900 fprintf(stdout
, buf
, statesteps
);
901 benchsteps
+= statesteps
;
903 fprintf(stdout
, ".\n");
905 t_inputrec irInstance
;
906 t_inputrec
* ir
= &irInstance
;
907 read_tpx_state(fn_sim_tpr
, ir
, &state
, &mtop
);
909 /* Check if some kind of PME was chosen */
910 if (EEL_PME(ir
->coulombtype
) == FALSE
)
912 gmx_fatal(FARGS
, "Can only do optimizations for simulations with %s electrostatics.",
916 /* Check if rcoulomb == rlist, which is necessary for plain PME. */
917 if ((ir
->cutoff_scheme
!= ecutsVERLET
) && (eelPME
== ir
->coulombtype
) && !(ir
->rcoulomb
== ir
->rlist
))
919 gmx_fatal(FARGS
, "%s requires rcoulomb (%f) to be equal to rlist (%f).", EELTYPE(eelPME
),
920 ir
->rcoulomb
, ir
->rlist
);
922 /* For other PME types, rcoulomb is allowed to be smaller than rlist */
923 else if (ir
->rcoulomb
> ir
->rlist
)
925 gmx_fatal(FARGS
, "%s requires rcoulomb (%f) to be equal to or smaller than rlist (%f)",
926 EELTYPE(ir
->coulombtype
), ir
->rcoulomb
, ir
->rlist
);
929 if (bScaleRvdw
&& ir
->rvdw
!= ir
->rcoulomb
)
931 fprintf(stdout
, "NOTE: input rvdw != rcoulomb, will not scale rvdw\n");
935 /* Reduce the number of steps for the benchmarks */
936 info
->orig_sim_steps
= ir
->nsteps
;
937 ir
->nsteps
= benchsteps
;
938 /* We must not use init_step from the input tpr file for the benchmarks */
939 info
->orig_init_step
= ir
->init_step
;
942 /* For PME-switch potentials, keep the radial distance of the buffer region */
943 nlist_buffer
= ir
->rlist
- ir
->rcoulomb
;
945 /* Determine length of triclinic box vectors */
946 for (d
= 0; d
< DIM
; d
++)
949 for (i
= 0; i
< DIM
; i
++)
951 box_size
[d
] += state
.box
[d
][i
] * state
.box
[d
][i
];
953 box_size
[d
] = std::sqrt(box_size
[d
]);
956 if (ir
->fourier_spacing
> 0)
958 info
->fsx
[0] = ir
->fourier_spacing
;
959 info
->fsy
[0] = ir
->fourier_spacing
;
960 info
->fsz
[0] = ir
->fourier_spacing
;
964 /* Reconstruct fourierspacing per dimension from the number of grid points and box size */
965 info
->fsx
[0] = box_size
[XX
] / ir
->nkx
;
966 info
->fsy
[0] = box_size
[YY
] / ir
->nky
;
967 info
->fsz
[0] = box_size
[ZZ
] / ir
->nkz
;
970 /* If no value for the fourierspacing was provided on the command line, we
971 * use the reconstruction from the tpr file */
972 if (ir
->fourier_spacing
> 0)
974 /* Use the spacing from the tpr */
975 fourierspacing
= ir
->fourier_spacing
;
979 /* Use the maximum observed spacing */
980 fourierspacing
= std::max(std::max(info
->fsx
[0], info
->fsy
[0]), info
->fsz
[0]);
983 fprintf(stdout
, "Calculating PME grid points on the basis of a fourierspacing of %f nm\n",
986 /* For performance comparisons the number of particles is useful to have */
987 fprintf(fp
, " Number of particles : %d\n", mtop
.natoms
);
989 /* Print information about settings of which some are potentially modified: */
990 fprintf(fp
, " Coulomb type : %s\n", EELTYPE(ir
->coulombtype
));
991 fprintf(fp
, " Grid spacing x y z : %f %f %f\n", box_size
[XX
] / ir
->nkx
,
992 box_size
[YY
] / ir
->nky
, box_size
[ZZ
] / ir
->nkz
);
993 fprintf(fp
, " Van der Waals type : %s\n", EVDWTYPE(ir
->vdwtype
));
994 if (ir_vdw_switched(ir
))
996 fprintf(fp
, " rvdw_switch : %f nm\n", ir
->rvdw_switch
);
998 if (EPME_SWITCHED(ir
->coulombtype
))
1000 fprintf(fp
, " rlist : %f nm\n", ir
->rlist
);
1003 /* Print a descriptive line about the tpr settings tested */
1004 fprintf(fp
, "\nWill try these real/reciprocal workload settings:\n");
1005 fprintf(fp
, " No. scaling rcoulomb");
1006 fprintf(fp
, " nkx nky nkz");
1007 fprintf(fp
, " spacing");
1008 if (can_scale_rvdw(ir
->vdwtype
))
1010 fprintf(fp
, " rvdw");
1012 if (EPME_SWITCHED(ir
->coulombtype
))
1014 fprintf(fp
, " rlist");
1016 fprintf(fp
, " tpr file\n");
1018 /* Loop to create the requested number of tpr input files */
1019 for (j
= 0; j
< *ntprs
; j
++)
1021 /* The first .tpr is the provided one, just need to modify nsteps,
1022 * so skip the following block */
1025 /* Determine which Coulomb radii rc to use in the benchmarks */
1026 add
= (rmax
- rmin
) / (*ntprs
- 1);
1027 if (gmx_within_tol(rmin
, info
->rcoulomb
[0], GMX_REAL_EPS
))
1029 ir
->rcoulomb
= rmin
+ j
* add
;
1031 else if (gmx_within_tol(rmax
, info
->rcoulomb
[0], GMX_REAL_EPS
))
1033 ir
->rcoulomb
= rmin
+ (j
- 1) * add
;
1037 /* rmin != rcoul != rmax, ergo test between rmin and rmax */
1038 add
= (rmax
- rmin
) / (*ntprs
- 2);
1039 ir
->rcoulomb
= rmin
+ (j
- 1) * add
;
1042 /* Determine the scaling factor fac */
1043 fac
= ir
->rcoulomb
/ info
->rcoulomb
[0];
1045 /* Scale the Fourier grid spacing */
1046 ir
->nkx
= ir
->nky
= ir
->nkz
= 0;
1047 calcFftGrid(nullptr, state
.box
, fourierspacing
* fac
, minimalPmeGridSize(ir
->pme_order
),
1048 &ir
->nkx
, &ir
->nky
, &ir
->nkz
);
1050 /* Adjust other radii since various conditions need to be fulfilled */
1051 if (eelPME
== ir
->coulombtype
)
1053 /* plain PME, rcoulomb must be equal to rlist TODO only in the group scheme? */
1054 ir
->rlist
= ir
->rcoulomb
;
1058 /* rlist must be >= rcoulomb, we keep the size of the buffer region */
1059 ir
->rlist
= ir
->rcoulomb
+ nlist_buffer
;
1062 if (bScaleRvdw
&& can_scale_rvdw(ir
->vdwtype
))
1064 if (ecutsVERLET
== ir
->cutoff_scheme
|| evdwPME
== ir
->vdwtype
)
1066 /* With either the Verlet cutoff-scheme or LJ-PME,
1067 the van der Waals radius must always equal the
1069 ir
->rvdw
= ir
->rcoulomb
;
1073 /* For vdw cutoff, rvdw >= rlist */
1074 ir
->rvdw
= std::max(info
->rvdw
[0], ir
->rlist
);
1077 } /* end of "if (j != 0)" */
1079 /* for j==0: Save the original settings
1080 * for j >0: Save modified radii and Fourier grids */
1081 info
->rcoulomb
[j
] = ir
->rcoulomb
;
1082 info
->rvdw
[j
] = ir
->rvdw
;
1083 info
->nkx
[j
] = ir
->nkx
;
1084 info
->nky
[j
] = ir
->nky
;
1085 info
->nkz
[j
] = ir
->nkz
;
1086 info
->rlist
[j
] = ir
->rlist
;
1087 info
->fsx
[j
] = fac
* fourierspacing
;
1088 info
->fsy
[j
] = fac
* fourierspacing
;
1089 info
->fsz
[j
] = fac
* fourierspacing
;
1091 /* Write the benchmark tpr file */
1092 fn_bench_tprs
[j
] = gmx_strdup(
1093 gmx::Path::concatenateBeforeExtension(fn_sim_tpr
, gmx::formatString("_bench%.2d", j
))
1096 fprintf(stdout
, "Writing benchmark tpr %s with nsteps=", fn_bench_tprs
[j
]);
1097 fprintf(stdout
, "%" PRId64
, ir
->nsteps
);
1100 fprintf(stdout
, ", scaling factor %f\n", fac
);
1104 fprintf(stdout
, ", unmodified settings\n");
1107 write_tpx_state(fn_bench_tprs
[j
], ir
, &state
, &mtop
);
1109 /* Write information about modified tpr settings to log file */
1110 fprintf(fp
, "%4d%10f%10f", j
, fac
, ir
->rcoulomb
);
1111 fprintf(fp
, "%5d%5d%5d", ir
->nkx
, ir
->nky
, ir
->nkz
);
1112 fprintf(fp
, " %9f ", info
->fsx
[j
]);
1113 if (can_scale_rvdw(ir
->vdwtype
))
1115 fprintf(fp
, "%10f", ir
->rvdw
);
1117 if (EPME_SWITCHED(ir
->coulombtype
))
1119 fprintf(fp
, "%10f", ir
->rlist
);
1121 fprintf(fp
, " %-14s\n", fn_bench_tprs
[j
]);
1123 /* Make it clear to the user that some additional settings were modified */
1124 if (!gmx_within_tol(ir
->rvdw
, info
->rvdw
[0], GMX_REAL_EPS
)
1125 || !gmx_within_tol(ir
->rlist
, info
->rlist
[0], GMX_REAL_EPS
))
1133 "\nNote that in addition to the Coulomb radius and the Fourier grid\n"
1134 "other input settings were also changed (see table above).\n"
1135 "Please check if the modified settings are appropriate.\n");
1142 /* Rename the files we want to keep to some meaningful filename and
1143 * delete the rest */
1144 static void cleanup(const t_filenm
* fnm
, int nfile
, int k
, int nnodes
, int nPMEnodes
, int nr
, gmx_bool bKeepStderr
)
1146 char numstring
[STRLEN
];
1147 const char* fn
= nullptr;
1152 fprintf(stdout
, "Cleaning up, deleting benchmark temp files ...\n");
1154 for (i
= 0; i
< nfile
; i
++)
1156 opt
= const_cast<char*>(fnm
[i
].opt
);
1157 if (std::strcmp(opt
, "-p") == 0)
1159 /* do nothing; keep this file */
1161 else if (std::strcmp(opt
, "-bg") == 0)
1163 /* Give the log file a nice name so one can later see which parameters were used */
1164 numstring
[0] = '\0';
1167 sprintf(numstring
, "_%d", nr
);
1169 std::string newfilename
= gmx::formatString(
1170 "%s_no%d_np%d_npme%d%s", opt2fn("-bg", nfile
, fnm
), k
, nnodes
, nPMEnodes
, numstring
);
1171 if (gmx_fexist(opt2fn("-bg", nfile
, fnm
)))
1173 fprintf(stdout
, "renaming log file to %s\n", newfilename
.c_str());
1174 make_backup(newfilename
);
1175 rename(opt2fn("-bg", nfile
, fnm
), newfilename
.c_str());
1178 else if (std::strcmp(opt
, "-err") == 0)
1180 /* This file contains the output of stderr. We want to keep it in
1181 * cases where there have been problems. */
1182 fn
= opt2fn(opt
, nfile
, fnm
);
1183 numstring
[0] = '\0';
1186 sprintf(numstring
, "_%d", nr
);
1188 std::string newfilename
=
1189 gmx::formatString("%s_no%d_np%d_npme%d%s", fn
, k
, nnodes
, nPMEnodes
, numstring
);
1194 fprintf(stdout
, "Saving stderr output in %s\n", newfilename
.c_str());
1195 make_backup(newfilename
);
1196 rename(fn
, newfilename
.c_str());
1200 fprintf(stdout
, "Deleting %s\n", fn
);
1205 /* Delete the files which are created for each benchmark run: (options -b*) */
1206 else if ((0 == std::strncmp(opt
, "-b", 2)) && (opt2bSet(opt
, nfile
, fnm
) || !is_optional(&fnm
[i
])))
1208 remove_if_exists(opt2fn(opt
, nfile
, fnm
));
1223 /* Create a list of numbers of PME nodes to test */
1224 static void make_npme_list(const char* npmevalues_opt
, /* Make a complete list with all
1225 * possibilities or a short list that keeps
1226 * only reasonable numbers of PME nodes */
1227 int* nentries
, /* Number of entries we put in the nPMEnodes list */
1228 int* nPMEnodes
[], /* Each entry contains the value for -npme */
1229 int nnodes
, /* Total number of nodes to do the tests on */
1230 int minPMEnodes
, /* Minimum number of PME nodes */
1231 int maxPMEnodes
) /* Maximum number of PME nodes */
1234 int min_factor
= 1; /* We request that npp and npme have this minimal
1235 * largest common factor (depends on npp) */
1236 int nlistmax
; /* Max. list size */
1237 int nlist
; /* Actual number of entries in list */
1241 /* Do we need to check all possible values for -npme or is a reduced list enough? */
1242 if (!std::strcmp(npmevalues_opt
, "all"))
1246 else if (!std::strcmp(npmevalues_opt
, "subset"))
1248 eNPME
= eNpmeSubset
;
1250 else /* "auto" or "range" */
1256 else if (nnodes
< 128)
1258 eNPME
= eNpmeReduced
;
1262 eNPME
= eNpmeSubset
;
1266 /* Calculate how many entries we could possibly have (in case of -npme all) */
1269 nlistmax
= maxPMEnodes
- minPMEnodes
+ 3;
1270 if (0 == minPMEnodes
)
1280 /* Now make the actual list which is at most of size nlist */
1281 snew(*nPMEnodes
, nlistmax
);
1282 nlist
= 0; /* start counting again, now the real entries in the list */
1283 for (i
= 0; i
< nlistmax
- 2; i
++)
1285 npme
= maxPMEnodes
- i
;
1286 npp
= nnodes
- npme
;
1289 case eNpmeAll
: min_factor
= 1; break;
1290 case eNpmeReduced
: min_factor
= 2; break;
1292 /* For 2d PME we want a common largest factor of at least the cube
1293 * root of the number of PP nodes */
1294 min_factor
= static_cast<int>(std::cbrt(npp
));
1296 default: gmx_fatal(FARGS
, "Unknown option for eNPME in make_npme_list");
1298 if (std::gcd(npp
, npme
) >= min_factor
)
1300 (*nPMEnodes
)[nlist
] = npme
;
1304 /* We always test 0 PME nodes and the automatic number */
1305 *nentries
= nlist
+ 2;
1306 (*nPMEnodes
)[nlist
] = 0;
1307 (*nPMEnodes
)[nlist
+ 1] = -1;
1309 fprintf(stderr
, "Will try the following %d different values for -npme:\n", *nentries
);
1310 for (i
= 0; i
< *nentries
- 1; i
++)
1312 fprintf(stderr
, "%d, ", (*nPMEnodes
)[i
]);
1314 fprintf(stderr
, "and %d (auto).\n", (*nPMEnodes
)[*nentries
- 1]);
1318 /* Allocate memory to store the performance data */
1319 static void init_perfdata(t_perf
* perfdata
[], int ntprs
, int datasets
, int repeats
)
1324 for (k
= 0; k
< ntprs
; k
++)
1326 snew(perfdata
[k
], datasets
);
1327 for (i
= 0; i
< datasets
; i
++)
1329 for (j
= 0; j
< repeats
; j
++)
1331 snew(perfdata
[k
][i
].Gcycles
, repeats
);
1332 snew(perfdata
[k
][i
].ns_per_day
, repeats
);
1333 snew(perfdata
[k
][i
].PME_f_load
, repeats
);
1340 /* Check for errors on mdrun -h */
1341 static void make_sure_it_runs(char* mdrun_cmd_line
, int length
, FILE* fp
, const t_filenm
* fnm
, int nfile
)
1343 char *command
, *msg
;
1346 snew(command
, length
+ 15);
1347 snew(msg
, length
+ 500);
1349 fprintf(stdout
, "Making sure the benchmarks can be executed by running just 1 step...\n");
1350 sprintf(command
, "%s -nsteps 1 -quiet", mdrun_cmd_line
);
1351 fprintf(stdout
, "Executing '%s' ...\n", command
);
1352 ret
= gmx_system_call(command
);
1356 /* To prevent confusion, do not again issue a gmx_fatal here since we already
1357 * get the error message from mdrun itself */
1359 "Cannot run the first benchmark simulation! Please check the error message of\n"
1360 "mdrun for the source of the problem. Did you provide a command line\n"
1361 "argument that neither gmx tune_pme nor mdrun understands? If you're\n"
1362 "sure your command line should work, you can bypass this check with \n"
1363 "gmx tune_pme -nocheck. The failing command was:\n"
1367 fprintf(stderr
, "%s", msg
);
1369 fprintf(fp
, "%s", msg
);
1373 fprintf(stdout
, "Benchmarks can be executed!\n");
1375 /* Clean up the benchmark output files we just created */
1376 fprintf(stdout
, "Cleaning up ...\n");
1377 remove_if_exists(opt2fn("-bc", nfile
, fnm
));
1378 remove_if_exists(opt2fn("-be", nfile
, fnm
));
1379 remove_if_exists(opt2fn("-bcpo", nfile
, fnm
));
1380 remove_if_exists(opt2fn("-bg", nfile
, fnm
));
1381 remove_if_exists(opt2fn("-bo", nfile
, fnm
));
1382 remove_if_exists(opt2fn("-bx", nfile
, fnm
));
1388 static void do_the_tests(FILE* fp
, /* General tune_pme output file */
1389 char** tpr_names
, /* Filenames of the input files to test */
1390 int maxPMEnodes
, /* Max fraction of nodes to use for PME */
1391 int minPMEnodes
, /* Min fraction of nodes to use for PME */
1392 int npme_fixed
, /* If >= -1, test fixed number of PME
1394 const char* npmevalues_opt
, /* Which -npme values should be tested */
1395 t_perf
** perfdata
, /* Here the performace data is stored */
1396 int* pmeentries
, /* Entries in the nPMEnodes list */
1397 int repeats
, /* Repeat each test this often */
1398 int nnodes
, /* Total number of nodes = nPP + nPME */
1399 int nr_tprs
, /* Total number of tpr files to test */
1400 gmx_bool bThreads
, /* Threads or MPI? */
1401 char* cmd_mpirun
, /* mpirun command string */
1402 char* cmd_np
, /* "-np", "-n", whatever mpirun needs */
1403 char* cmd_mdrun
, /* mdrun command string */
1404 char* cmd_args_bench
, /* arguments for mdrun in a string */
1405 const t_filenm
* fnm
, /* List of filenames from command line */
1406 int nfile
, /* Number of files specified on the cmdl. */
1407 int presteps
, /* DLB equilibration steps, is checked */
1408 int64_t cpt_steps
, /* Time step counter in the checkpoint */
1409 gmx_bool bCheck
, /* Check whether benchmark mdrun works */
1410 const char* eligible_gpu_ids
) /* GPU IDs for
1411 * constructing mdrun command lines */
1413 int i
, nr
, k
, ret
, count
= 0, totaltests
;
1414 int* nPMEnodes
= nullptr;
1415 t_perf
* pd
= nullptr;
1417 char * command
, *cmd_stub
;
1419 gmx_bool bResetProblem
= FALSE
;
1420 gmx_bool bFirst
= TRUE
;
1422 /* This string array corresponds to the eParselog enum type at the start
1424 const char* ParseLog
[] = { "OK.",
1425 "Logfile not found!",
1426 "No timings, logfile truncated?",
1427 "Run was terminated.",
1428 "Counters were not reset properly.",
1429 "No DD grid found for these settings.",
1430 "TPX version conflict!",
1431 "mdrun was not started in parallel!",
1432 "Number of PP ranks has a prime factor that is too large.",
1433 "The number of PP ranks did not suit the number of GPUs.",
1434 "Some GPUs were not detected or are incompatible.",
1435 "An error occurred." };
1436 char str_PME_f_load
[13];
1439 /* Allocate space for the mdrun command line. 100 extra characters should
1440 be more than enough for the -npme etcetera arguments */
1441 cmdline_length
= std::strlen(cmd_mpirun
) + std::strlen(cmd_np
) + std::strlen(cmd_mdrun
)
1442 + std::strlen(cmd_args_bench
) + std::strlen(tpr_names
[0]) + 100;
1443 snew(command
, cmdline_length
);
1444 snew(cmd_stub
, cmdline_length
);
1446 /* Construct the part of the command line that stays the same for all tests: */
1449 sprintf(cmd_stub
, "%s%s", cmd_mdrun
, cmd_np
);
1453 sprintf(cmd_stub
, "%s%s%s ", cmd_mpirun
, cmd_np
, cmd_mdrun
);
1456 /* Create a list of numbers of PME nodes to test */
1457 if (npme_fixed
< -1)
1459 make_npme_list(npmevalues_opt
, pmeentries
, &nPMEnodes
, nnodes
, minPMEnodes
, maxPMEnodes
);
1465 nPMEnodes
[0] = npme_fixed
;
1466 fprintf(stderr
, "Will use a fixed number of %d PME-only ranks.\n", nPMEnodes
[0]);
1471 fprintf(fp
, "\nNo benchmarks done since number of repeats (-r) is 0.\n");
1473 finalize(opt2fn("-p", nfile
, fnm
));
1477 /* Allocate one dataset for each tpr input file: */
1478 init_perfdata(perfdata
, nr_tprs
, *pmeentries
, repeats
);
1480 /*****************************************/
1481 /* Main loop over all tpr files to test: */
1482 /*****************************************/
1483 totaltests
= nr_tprs
* (*pmeentries
) * repeats
;
1484 for (k
= 0; k
< nr_tprs
; k
++)
1486 fprintf(fp
, "\nIndividual timings for input file %d (%s):\n", k
, tpr_names
[k
]);
1487 fprintf(fp
, "PME ranks Gcycles ns/day PME/f Remark\n");
1488 /* Loop over various numbers of PME nodes: */
1489 for (i
= 0; i
< *pmeentries
; i
++)
1491 pd
= &perfdata
[k
][i
];
1493 auto cmd_gpu_ids
= make_gpu_id_command_line(eligible_gpu_ids
);
1495 /* Loop over the repeats for each scenario: */
1496 for (nr
= 0; nr
< repeats
; nr
++)
1498 pd
->nPMEnodes
= nPMEnodes
[i
];
1500 /* Add -npme and -s to the command line and save it. Note that
1501 * the -passall (if set) options requires cmd_args_bench to be
1502 * at the end of the command line string */
1503 snew(pd
->mdrun_cmd_line
, cmdline_length
);
1504 sprintf(pd
->mdrun_cmd_line
, "%s-npme %d -s %s %s %s", cmd_stub
, pd
->nPMEnodes
,
1505 tpr_names
[k
], cmd_args_bench
, cmd_gpu_ids
.c_str());
1507 /* To prevent that all benchmarks fail due to a show-stopper argument
1508 * on the mdrun command line, we make a quick check first.
1509 * This check can be turned off in cases where the automatically chosen
1510 * number of PME-only ranks leads to a number of PP ranks for which no
1511 * decomposition can be found (e.g. for large prime numbers) */
1512 if (bFirst
&& bCheck
)
1514 /* TODO This check is really for a functional
1515 * .tpr, and if we need it, it should take place
1516 * for every .tpr, and the logic for it should be
1517 * immediately inside the loop over k, not in
1518 * this inner loop. */
1519 char* temporary_cmd_line
;
1521 snew(temporary_cmd_line
, cmdline_length
);
1522 /* TODO -npme 0 is more likely to succeed at low
1523 parallelism than the default of -npme -1, but
1524 is more likely to fail at the scaling limit
1525 when the PP domains may be too small. "mpirun
1526 -np 1 mdrun" is probably a reasonable thing to
1527 do for this check, but it'll be easier to
1528 implement that after some refactoring of how
1529 the number of MPI ranks is managed. */
1530 sprintf(temporary_cmd_line
, "%s-npme 0 -nb cpu -s %s %s", cmd_stub
,
1531 tpr_names
[k
], cmd_args_bench
);
1532 make_sure_it_runs(temporary_cmd_line
, cmdline_length
, fp
, fnm
, nfile
);
1536 /* Do a benchmark simulation: */
1539 sprintf(buf
, ", pass %d/%d", nr
+ 1, repeats
);
1545 fprintf(stdout
, "\n=== Progress %2.0f%%, tpr %d/%d, run %d/%d%s:\n",
1546 (100.0 * count
) / totaltests
, k
+ 1, nr_tprs
, i
+ 1, *pmeentries
, buf
);
1547 make_backup(opt2fn("-err", nfile
, fnm
));
1548 sprintf(command
, "%s 1> /dev/null 2>%s", pd
->mdrun_cmd_line
, opt2fn("-err", nfile
, fnm
));
1549 fprintf(stdout
, "%s\n", pd
->mdrun_cmd_line
);
1550 gmx_system_call(command
);
1552 /* Collect the performance data from the log file; also check stderr
1553 * for fatal errors */
1554 ret
= parse_logfile(opt2fn("-bg", nfile
, fnm
), opt2fn("-err", nfile
, fnm
), pd
, nr
,
1555 presteps
, cpt_steps
, nnodes
);
1556 if ((presteps
> 0) && (ret
== eParselogResetProblem
))
1558 bResetProblem
= TRUE
;
1561 if (-1 == pd
->nPMEnodes
)
1563 sprintf(buf
, "(%3d)", pd
->guessPME
);
1571 if (pd
->PME_f_load
[nr
] > 0.0)
1573 sprintf(str_PME_f_load
, "%12.3f", pd
->PME_f_load
[nr
]);
1577 sprintf(str_PME_f_load
, "%s", " - ");
1580 /* Write the data we got to disk */
1581 fprintf(fp
, "%4d%s %12.3f %12.3f %s %s", pd
->nPMEnodes
, buf
, pd
->Gcycles
[nr
],
1582 pd
->ns_per_day
[nr
], str_PME_f_load
, ParseLog
[ret
]);
1583 if (!(ret
== eParselogOK
|| ret
== eParselogNoDDGrid
|| ret
== eParselogNotFound
))
1585 fprintf(fp
, " Check %s file for problems.", ret
== eParselogFatal
? "err" : "log");
1591 /* Do some cleaning up and delete the files we do not need any more */
1592 cleanup(fnm
, nfile
, k
, nnodes
, pd
->nPMEnodes
, nr
, ret
== eParselogFatal
);
1594 /* If the first run with this number of processors already failed, do not try again: */
1595 if (pd
->Gcycles
[0] <= 0.0 && repeats
> 1)
1598 "Skipping remaining passes of unsuccessful setting, see log file for "
1600 count
+= repeats
- (nr
+ 1);
1603 } /* end of repeats loop */
1604 } /* end of -npme loop */
1605 } /* end of tpr file loop */
1610 fprintf(fp
, "WARNING: The cycle and time step counters could not be reset properly. ");
1618 static void check_input(int nnodes
,
1624 real maxPMEfraction
,
1625 real minPMEfraction
,
1627 int64_t bench_nsteps
,
1628 const t_filenm
* fnm
,
1638 /* Make sure the input file exists */
1639 if (!gmx_fexist(opt2fn("-s", nfile
, fnm
)))
1641 gmx_fatal(FARGS
, "File %s not found.", opt2fn("-s", nfile
, fnm
));
1644 /* Make sure that the checkpoint file is not overwritten during benchmarking */
1645 if ((0 == std::strcmp(opt2fn("-cpi", nfile
, fnm
), opt2fn("-bcpo", nfile
, fnm
))) && (sim_part
> 1))
1648 "Checkpoint input (-cpi) and benchmark checkpoint output (-bcpo) files must not "
1650 "The checkpoint input file must not be overwritten during the benchmarks.\n");
1653 /* Make sure that repeats is >= 0 (if == 0, only write tpr files) */
1656 gmx_fatal(FARGS
, "Number of repeats < 0!");
1659 /* Check number of nodes */
1662 gmx_fatal(FARGS
, "Number of ranks/threads must be a positive integer.");
1665 /* Automatically choose -ntpr if not set */
1675 /* Set a reasonable scaling factor for rcoulomb */
1678 *rmax
= rcoulomb
* 1.2;
1681 fprintf(stderr
, "Will test %d tpr file%s.\n", *ntprs
, *ntprs
== 1 ? "" : "s");
1688 "Note: Choose ntpr>1 to shift PME load between real and reciprocal space.\n");
1692 /* Make shure that rmin <= rcoulomb <= rmax */
1701 if (!(*rmin
<= *rmax
))
1704 "Please choose the Coulomb radii such that rmin <= rmax.\n"
1705 "rmin = %g, rmax = %g, actual rcoul from .tpr file = %g\n",
1706 *rmin
, *rmax
, rcoulomb
);
1708 /* Add test scenarios if rmin or rmax were set */
1711 if (!gmx_within_tol(*rmin
, rcoulomb
, GMX_REAL_EPS
) && (*ntprs
== 1))
1714 fprintf(stderr
, "NOTE: Setting -rmin to %g changed -ntpr to %d\n", *rmin
, *ntprs
);
1716 if (!gmx_within_tol(*rmax
, rcoulomb
, GMX_REAL_EPS
) && (*ntprs
== 1))
1719 fprintf(stderr
, "NOTE: Setting -rmax to %g changed -ntpr to %d\n", *rmax
, *ntprs
);
1723 /* If one of rmin, rmax is set, we need 2 tpr files at minimum */
1724 if (!gmx_within_tol(*rmax
, rcoulomb
, GMX_REAL_EPS
) || !gmx_within_tol(*rmin
, rcoulomb
, GMX_REAL_EPS
))
1726 *ntprs
= std::max(*ntprs
, 2);
1729 /* If both rmin, rmax are set, we need 3 tpr files at minimum */
1730 if (!gmx_within_tol(*rmax
, rcoulomb
, GMX_REAL_EPS
) && !gmx_within_tol(*rmin
, rcoulomb
, GMX_REAL_EPS
))
1732 *ntprs
= std::max(*ntprs
, 3);
1737 fprintf(stderr
, "NOTE: Your rmin, rmax setting changed -ntpr to %d\n", *ntprs
);
1742 if (gmx_within_tol(*rmin
, rcoulomb
, GMX_REAL_EPS
)
1743 && gmx_within_tol(rcoulomb
, *rmax
, GMX_REAL_EPS
)) /* We have just a single rc */
1746 "WARNING: Resetting -ntpr to 1 since no Coulomb radius scaling is requested.\n"
1747 "Please set rmin < rmax to test Coulomb radii in the [rmin, rmax] interval\n"
1748 "with correspondingly adjusted PME grid settings\n");
1753 /* Check whether max and min fraction are within required values */
1754 if (maxPMEfraction
> 0.5 || maxPMEfraction
< 0)
1756 gmx_fatal(FARGS
, "-max must be between 0 and 0.5");
1758 if (minPMEfraction
> 0.5 || minPMEfraction
< 0)
1760 gmx_fatal(FARGS
, "-min must be between 0 and 0.5");
1762 if (maxPMEfraction
< minPMEfraction
)
1764 gmx_fatal(FARGS
, "-max must be larger or equal to -min");
1767 /* Check whether the number of steps - if it was set - has a reasonable value */
1768 if (bench_nsteps
< 0)
1770 gmx_fatal(FARGS
, "Number of steps must be positive.");
1773 if (bench_nsteps
> 10000 || bench_nsteps
< 100)
1775 fprintf(stderr
, "WARNING: steps=");
1776 fprintf(stderr
, "%" PRId64
, bench_nsteps
);
1777 fprintf(stderr
, ". Are you sure you want to perform so %s steps for each benchmark?\n",
1778 (bench_nsteps
< 100) ? "few" : "many");
1783 gmx_fatal(FARGS
, "Cannot have a negative number of presteps.\n");
1786 /* Check for rcoulomb scaling if more than one .tpr file is tested */
1789 if (*rmin
/ rcoulomb
< 0.75 || *rmax
/ rcoulomb
> 1.25)
1792 "WARNING: Applying extreme scaling factor. I hope you know what you are "
1797 /* If a fixed number of PME nodes is set we do rcoulomb and PME gird tuning
1798 * only. We need to check whether the requested number of PME-only nodes
1800 if (npme_fixed
> -1)
1802 /* No more than 50% of all nodes can be assigned as PME-only nodes. */
1803 if (2 * npme_fixed
> nnodes
)
1806 "Cannot have more than %d PME-only ranks for a total of %d ranks (you chose "
1808 nnodes
/ 2, nnodes
, npme_fixed
);
1810 if ((npme_fixed
> 0) && (5 * npme_fixed
< nnodes
))
1813 "WARNING: Only %g percent of the ranks are assigned as PME-only ranks.\n",
1814 (100.0 * npme_fixed
) / nnodes
);
1816 if (opt2parg_bSet("-min", npargs
, pa
) || opt2parg_bSet("-max", npargs
, pa
))
1819 "NOTE: The -min, -max, and -npme options have no effect when a\n"
1820 " fixed number of PME-only ranks is requested with -fix.\n");
1826 /* Returns TRUE when "opt" is needed at launch time */
1827 static gmx_bool
is_launch_file(char* opt
, gmx_bool bSet
)
1829 if (0 == std::strncmp(opt
, "-swap", 5))
1834 /* Apart from the input .tpr and the output log files we need all options that
1835 * were set on the command line and that do not start with -b */
1836 if (0 == std::strncmp(opt
, "-b", 2) || 0 == std::strncmp(opt
, "-s", 2)
1837 || 0 == std::strncmp(opt
, "-err", 4) || 0 == std::strncmp(opt
, "-p", 2))
1846 /* Returns TRUE when "opt" defines a file which is needed for the benchmarks runs */
1847 static gmx_bool
is_bench_file(char* opt
, gmx_bool bSet
, gmx_bool bOptional
, gmx_bool bIsOutput
)
1849 /* Apart from the input .tpr, all files starting with "-b" are for
1850 * _b_enchmark files exclusively */
1851 if (0 == std::strncmp(opt
, "-s", 2))
1856 if (0 == std::strncmp(opt
, "-b", 2) || 0 == std::strncmp(opt
, "-s", 2))
1858 return !bOptional
|| bSet
;
1868 return bSet
; /* These are additional input files like -cpi -ei */
1874 /* Adds 'buf' to 'str' */
1875 static void add_to_string(char** str
, const char* buf
)
1880 len
= std::strlen(*str
) + std::strlen(buf
) + 1;
1882 std::strcat(*str
, buf
);
1886 /* Create the command line for the benchmark as well as for the real run */
1888 create_command_line_snippets(gmx_bool bAppendFiles
,
1889 gmx_bool bKeepAndNumCPT
,
1890 gmx_bool bResetHWay
,
1894 char* cmd_args_bench
[], /* command line arguments for benchmark runs */
1895 char* cmd_args_launch
[], /* command line arguments for simulation run */
1896 char extra_args
[], /* Add this to the end of the command line */
1897 char* deffnm
) /* Default file names, or NULL if not set */
1902 char strbuf
[STRLEN
];
1905 /* strlen needs at least '\0' as a string: */
1906 snew(*cmd_args_bench
, 1);
1907 snew(*cmd_args_launch
, 1);
1908 *cmd_args_launch
[0] = '\0';
1909 *cmd_args_bench
[0] = '\0';
1912 /*******************************************/
1913 /* 1. Process other command line arguments */
1914 /*******************************************/
1917 /* Add equilibration steps to benchmark options */
1918 sprintf(strbuf
, "-resetstep %d ", presteps
);
1919 add_to_string(cmd_args_bench
, strbuf
);
1921 /* These switches take effect only at launch time */
1924 sprintf(strbuf
, "-deffnm %s ", deffnm
);
1925 add_to_string(cmd_args_launch
, strbuf
);
1929 add_to_string(cmd_args_launch
, "-noappend ");
1933 add_to_string(cmd_args_launch
, "-cpnum ");
1937 add_to_string(cmd_args_launch
, "-resethway ");
1940 /********************/
1941 /* 2. Process files */
1942 /********************/
1943 for (i
= 0; i
< nfile
; i
++)
1945 opt
= const_cast<char*>(fnm
[i
].opt
);
1946 name
= opt2fn(opt
, nfile
, fnm
);
1948 /* Strbuf contains the options, now let's sort out where we need that */
1949 sprintf(strbuf
, "%s %s ", opt
, name
);
1951 if (is_bench_file(opt
, opt2bSet(opt
, nfile
, fnm
), is_optional(&fnm
[i
]), is_output(&fnm
[i
])))
1953 /* All options starting with -b* need the 'b' removed,
1954 * therefore overwrite strbuf */
1955 if (0 == std::strncmp(opt
, "-b", 2))
1957 sprintf(strbuf
, "-%s %s ", &opt
[2], name
);
1960 add_to_string(cmd_args_bench
, strbuf
);
1963 if (is_launch_file(opt
, opt2bSet(opt
, nfile
, fnm
)))
1965 add_to_string(cmd_args_launch
, strbuf
);
1969 add_to_string(cmd_args_bench
, extra_args
);
1970 add_to_string(cmd_args_launch
, extra_args
);
1974 /* Set option opt */
1975 static void setopt(const char* opt
, int nfile
, t_filenm fnm
[])
1979 for (i
= 0; (i
< nfile
); i
++)
1981 if (std::strcmp(opt
, fnm
[i
].opt
) == 0)
1983 fnm
[i
].flag
|= ffSET
;
1989 /* This routine inspects the tpr file and ...
1990 * 1. checks for output files that get triggered by a tpr option. These output
1991 * files are marked as 'set' to allow for a proper cleanup after each
1993 * 2. returns the PME:PP load ratio
1994 * 3. returns rcoulomb from the tpr */
1995 static float inspect_tpr(int nfile
, t_filenm fnm
[], real
* rcoulomb
)
1997 gmx_bool bTpi
; /* Is test particle insertion requested? */
1998 gmx_bool bFree
; /* Is a free energy simulation requested? */
1999 gmx_bool bNM
; /* Is a normal mode analysis requested? */
2000 gmx_bool bSwap
; /* Is water/ion position swapping requested? */
2005 /* Check tpr file for options that trigger extra output files */
2006 t_inputrec irInstance
;
2007 t_inputrec
* ir
= &irInstance
;
2008 read_tpx_state(opt2fn("-s", nfile
, fnm
), ir
, &state
, &mtop
);
2009 bFree
= (efepNO
!= ir
->efep
);
2010 bNM
= (eiNM
== ir
->eI
);
2011 bSwap
= (eswapNO
!= ir
->eSwapCoords
);
2012 bTpi
= EI_TPI(ir
->eI
);
2014 /* Set these output files on the tuning command-line */
2017 setopt("-pf", nfile
, fnm
);
2018 setopt("-px", nfile
, fnm
);
2022 setopt("-dhdl", nfile
, fnm
);
2026 setopt("-tpi", nfile
, fnm
);
2027 setopt("-tpid", nfile
, fnm
);
2031 setopt("-mtx", nfile
, fnm
);
2035 setopt("-swap", nfile
, fnm
);
2038 *rcoulomb
= ir
->rcoulomb
;
2040 /* Return the estimate for the number of PME nodes */
2041 float npme
= pme_load_estimate(mtop
, *ir
, state
.box
);
2046 static void couple_files_options(int nfile
, t_filenm fnm
[])
2049 gmx_bool bSet
, bBench
;
2054 for (i
= 0; i
< nfile
; i
++)
2056 opt
= const_cast<char*>(fnm
[i
].opt
);
2057 bSet
= ((fnm
[i
].flag
& ffSET
) != 0);
2058 bBench
= (0 == std::strncmp(opt
, "-b", 2));
2060 /* Check optional files */
2061 /* If e.g. -eo is set, then -beo also needs to be set */
2062 if (is_optional(&fnm
[i
]) && bSet
&& !bBench
)
2064 sprintf(buf
, "-b%s", &opt
[1]);
2065 setopt(buf
, nfile
, fnm
);
2067 /* If -beo is set, then -eo also needs to be! */
2068 if (is_optional(&fnm
[i
]) && bSet
&& bBench
)
2070 sprintf(buf
, "-%s", &opt
[2]);
2071 setopt(buf
, nfile
, fnm
);
2077 #define BENCHSTEPS (1000)
2079 int gmx_tune_pme(int argc
, char* argv
[])
2081 const char* desc
[] = {
2082 "For a given number [TT]-np[tt] or [TT]-ntmpi[tt] of ranks, [THISMODULE] systematically",
2083 "times [gmx-mdrun] with various numbers of PME-only ranks and determines",
2084 "which setting is fastest. It will also test whether performance can",
2085 "be enhanced by shifting load from the reciprocal to the real space",
2086 "part of the Ewald sum. ",
2087 "Simply pass your [REF].tpr[ref] file to [THISMODULE] together with other options",
2088 "for [gmx-mdrun] as needed.[PAR]",
2089 "[THISMODULE] needs to call [gmx-mdrun] and so requires that you",
2090 "specify how to call mdrun with the argument to the [TT]-mdrun[tt]",
2091 "parameter. Depending how you have built GROMACS, values such as",
2092 "'gmx mdrun', 'gmx_d mdrun', or 'mdrun_mpi' might be needed.[PAR]",
2093 "The program that runs MPI programs can be set in the environment variable",
2094 "MPIRUN (defaults to 'mpirun'). Note that for certain MPI frameworks,",
2095 "you need to provide a machine- or hostfile. This can also be passed",
2096 "via the MPIRUN variable, e.g.[PAR]",
2097 "[TT]export MPIRUN=\"/usr/local/mpirun -machinefile hosts\"[tt]",
2098 "Note that in such cases it is normally necessary to compile",
2099 "and/or run [THISMODULE] without MPI support, so that it can call",
2100 "the MPIRUN program.[PAR]",
2101 "Before doing the actual benchmark runs, [THISMODULE] will do a quick",
2102 "check whether [gmx-mdrun] works as expected with the provided parallel settings",
2103 "if the [TT]-check[tt] option is activated (the default).",
2104 "Please call [THISMODULE] with the normal options you would pass to",
2105 "[gmx-mdrun] and add [TT]-np[tt] for the number of ranks to perform the",
2106 "tests on, or [TT]-ntmpi[tt] for the number of threads. You can also add [TT]-r[tt]",
2107 "to repeat each test several times to get better statistics. [PAR]",
2108 "[THISMODULE] can test various real space / reciprocal space workloads",
2109 "for you. With [TT]-ntpr[tt] you control how many extra [REF].tpr[ref] files will be",
2110 "written with enlarged cutoffs and smaller Fourier grids respectively.",
2111 "Typically, the first test (number 0) will be with the settings from the input",
2112 "[REF].tpr[ref] file; the last test (number [TT]ntpr[tt]) will have the Coulomb cutoff",
2113 "specified by [TT]-rmax[tt] with a somewhat smaller PME grid at the same time. ",
2114 "In this last test, the Fourier spacing is multiplied with [TT]rmax[tt]/rcoulomb. ",
2115 "The remaining [REF].tpr[ref] files will have equally-spaced Coulomb radii (and Fourier ",
2116 "spacings) between these extremes. [BB]Note[bb] that you can set [TT]-ntpr[tt] to 1",
2117 "if you just seek the optimal number of PME-only ranks; in that case",
2118 "your input [REF].tpr[ref] file will remain unchanged.[PAR]",
2119 "For the benchmark runs, the default of 1000 time steps should suffice for most",
2120 "MD systems. The dynamic load balancing needs about 100 time steps",
2121 "to adapt to local load imbalances, therefore the time step counters",
2122 "are by default reset after 100 steps. For large systems (>1M atoms), as well as ",
2123 "for a higher accuracy of the measurements, you should set [TT]-resetstep[tt] to a higher ",
2125 "From the 'DD' load imbalance entries in the md.log output file you",
2126 "can tell after how many steps the load is sufficiently balanced. Example call:[PAR]",
2127 "[TT]gmx tune_pme -np 64 -s protein.tpr -launch[tt][PAR]",
2128 "After calling [gmx-mdrun] several times, detailed performance information",
2129 "is available in the output file [TT]perf.out[tt].",
2130 "[BB]Note[bb] that during the benchmarks, a couple of temporary files are written",
2131 "(options [TT]-b*[tt]), these will be automatically deleted after each test.[PAR]",
2132 "If you want the simulation to be started automatically with the",
2133 "optimized parameters, use the command line option [TT]-launch[tt].[PAR]",
2134 "Basic support for GPU-enabled [TT]mdrun[tt] exists. Give a string containing the IDs",
2135 "of the GPUs that you wish to use in the optimization in the [TT]-gpu_id[tt]",
2136 "command-line argument. This works exactly like [TT]mdrun -gpu_id[tt], does not imply a ",
2138 "and merely declares the eligible set of GPU devices. [TT]gmx-tune_pme[tt] will construct ",
2140 "mdrun that use this set appropriately. [TT]gmx-tune_pme[tt] does not support",
2141 "[TT]-gputasks[tt].[PAR]",
2146 int pmeentries
= 0; /* How many values for -npme do we actually test for each tpr file */
2147 real maxPMEfraction
= 0.50;
2148 real minPMEfraction
= 0.25;
2149 int maxPMEnodes
, minPMEnodes
;
2150 float guessPMEratio
; /* guessed PME:PP ratio based on the tpr file */
2151 float guessPMEnodes
;
2152 int npme_fixed
= -2; /* If >= -1, use only this number
2153 * of PME-only nodes */
2155 real rmin
= 0.0, rmax
= 0.0; /* min and max value for rcoulomb if scaling is requested */
2156 real rcoulomb
= -1.0; /* Coulomb radius as set in .tpr file */
2157 gmx_bool bScaleRvdw
= TRUE
;
2158 int64_t bench_nsteps
= BENCHSTEPS
;
2159 int64_t new_sim_nsteps
= -1; /* -1 indicates: not set by the user */
2160 int64_t cpt_steps
= 0; /* Step counter in .cpt input file */
2161 int presteps
= 1500; /* Do a full cycle reset after presteps steps */
2162 gmx_bool bOverwrite
= FALSE
, bKeepTPR
;
2163 gmx_bool bLaunch
= FALSE
;
2164 char* ExtraArgs
= nullptr;
2165 char** tpr_names
= nullptr;
2166 const char* simulation_tpr
= nullptr;
2167 char* deffnm
= nullptr;
2168 int best_npme
, best_tpr
;
2169 int sim_part
= 1; /* For benchmarks with checkpoint files */
2173 /* Default program names if nothing else is found */
2174 char *cmd_mpirun
= nullptr, *cmd_mdrun
= nullptr;
2175 char *cmd_args_bench
, *cmd_args_launch
;
2176 char* cmd_np
= nullptr;
2178 /* IDs of GPUs that are eligible for computation */
2179 char* eligible_gpu_ids
= nullptr;
2181 t_perf
** perfdata
= nullptr;
2186 /* Print out how long the tuning took */
2189 static t_filenm fnm
[] = { /* tune_pme */
2190 { efOUT
, "-p", "perf", ffWRITE
},
2191 { efLOG
, "-err", "bencherr", ffWRITE
},
2192 { efTPR
, "-so", "tuned", ffWRITE
},
2194 { efTPR
, "-s", nullptr, ffREAD
},
2195 { efTRN
, "-o", nullptr, ffWRITE
},
2196 { efCOMPRESSED
, "-x", nullptr, ffOPTWR
},
2197 { efCPT
, "-cpi", nullptr, ffOPTRD
},
2198 { efCPT
, "-cpo", nullptr, ffOPTWR
},
2199 { efSTO
, "-c", "confout", ffWRITE
},
2200 { efEDR
, "-e", "ener", ffWRITE
},
2201 { efLOG
, "-g", "md", ffWRITE
},
2202 { efXVG
, "-dhdl", "dhdl", ffOPTWR
},
2203 { efXVG
, "-field", "field", ffOPTWR
},
2204 { efXVG
, "-table", "table", ffOPTRD
},
2205 { efXVG
, "-tablep", "tablep", ffOPTRD
},
2206 { efXVG
, "-tableb", "table", ffOPTRD
},
2207 { efTRX
, "-rerun", "rerun", ffOPTRD
},
2208 { efXVG
, "-tpi", "tpi", ffOPTWR
},
2209 { efXVG
, "-tpid", "tpidist", ffOPTWR
},
2210 { efEDI
, "-ei", "sam", ffOPTRD
},
2211 { efXVG
, "-eo", "edsam", ffOPTWR
},
2212 { efXVG
, "-px", "pullx", ffOPTWR
},
2213 { efXVG
, "-pf", "pullf", ffOPTWR
},
2214 { efXVG
, "-ro", "rotation", ffOPTWR
},
2215 { efLOG
, "-ra", "rotangles", ffOPTWR
},
2216 { efLOG
, "-rs", "rotslabs", ffOPTWR
},
2217 { efLOG
, "-rt", "rottorque", ffOPTWR
},
2218 { efMTX
, "-mtx", "nm", ffOPTWR
},
2219 { efXVG
, "-swap", "swapions", ffOPTWR
},
2220 /* Output files that are deleted after each benchmark run */
2221 { efTRN
, "-bo", "bench", ffWRITE
},
2222 { efXTC
, "-bx", "bench", ffWRITE
},
2223 { efCPT
, "-bcpo", "bench", ffWRITE
},
2224 { efSTO
, "-bc", "bench", ffWRITE
},
2225 { efEDR
, "-be", "bench", ffWRITE
},
2226 { efLOG
, "-bg", "bench", ffWRITE
},
2227 { efXVG
, "-beo", "benchedo", ffOPTWR
},
2228 { efXVG
, "-bdhdl", "benchdhdl", ffOPTWR
},
2229 { efXVG
, "-bfield", "benchfld", ffOPTWR
},
2230 { efXVG
, "-btpi", "benchtpi", ffOPTWR
},
2231 { efXVG
, "-btpid", "benchtpid", ffOPTWR
},
2232 { efXVG
, "-bdevout", "benchdev", ffOPTWR
},
2233 { efXVG
, "-brunav", "benchrnav", ffOPTWR
},
2234 { efXVG
, "-bpx", "benchpx", ffOPTWR
},
2235 { efXVG
, "-bpf", "benchpf", ffOPTWR
},
2236 { efXVG
, "-bro", "benchrot", ffOPTWR
},
2237 { efLOG
, "-bra", "benchrota", ffOPTWR
},
2238 { efLOG
, "-brs", "benchrots", ffOPTWR
},
2239 { efLOG
, "-brt", "benchrott", ffOPTWR
},
2240 { efMTX
, "-bmtx", "benchn", ffOPTWR
},
2241 { efNDX
, "-bdn", "bench", ffOPTWR
},
2242 { efXVG
, "-bswap", "benchswp", ffOPTWR
}
2245 gmx_bool bThreads
= FALSE
;
2249 const char* procstring
[] = { nullptr, "np", "n", "none", nullptr };
2250 const char* npmevalues_opt
[] = { nullptr, "auto", "all", "subset", nullptr };
2252 gmx_bool bAppendFiles
= TRUE
;
2253 gmx_bool bKeepAndNumCPT
= FALSE
;
2254 gmx_bool bResetCountersHalfWay
= FALSE
;
2255 gmx_bool bBenchmark
= TRUE
;
2256 gmx_bool bCheck
= TRUE
;
2258 gmx_output_env_t
* oenv
= nullptr;
2261 /***********************/
2262 /* tune_pme options: */
2263 /***********************/
2268 "Command line to run a simulation, e.g. 'gmx mdrun' or 'mdrun_mpi'" },
2273 "Number of ranks to run the tests on (must be > 2 for separate PME ranks)" },
2278 "Name of the [TT]$MPIRUN[tt] option that specifies the number of ranks to use ('np', or "
2279 "'n'; use 'none' if there is no such option)" },
2284 "Number of MPI-threads to run the tests on (turns MPI & mpirun off)" },
2285 { "-r", FALSE
, etINT
, { &repeats
}, "Repeat each test this often" },
2286 { "-max", FALSE
, etREAL
, { &maxPMEfraction
}, "Max fraction of PME ranks to test with" },
2287 { "-min", FALSE
, etREAL
, { &minPMEfraction
}, "Min fraction of PME ranks to test with" },
2292 "Within -min and -max, benchmark all possible values for [TT]-npme[tt], or just a "
2293 "reasonable subset. "
2294 "Auto neglects -min and -max and chooses reasonable values around a guess for npme "
2295 "derived from the .tpr" },
2300 "If >= -1, do not vary the number of PME-only ranks, instead use this fixed value and "
2301 "only vary rcoulomb and the PME grid spacing." },
2306 "If >0, maximal rcoulomb for -ntpr>1 (rcoulomb upscaling results in fourier grid "
2308 { "-rmin", FALSE
, etREAL
, { &rmin
}, "If >0, minimal rcoulomb for -ntpr>1" },
2309 { "-scalevdw", FALSE
, etBOOL
, { &bScaleRvdw
}, "Scale rvdw along with rcoulomb" },
2314 "Number of [REF].tpr[ref] files to benchmark. Create this many files with different "
2315 "rcoulomb scaling factors depending on -rmin and -rmax. "
2316 "If < 1, automatically choose the number of [REF].tpr[ref] files to test" },
2321 "Take timings for this many steps in the benchmark runs" },
2326 "Let dlb equilibrate this many steps before timings are taken (reset cycle counters "
2327 "after this many steps)" },
2331 { &new_sim_nsteps
},
2332 "If non-negative, perform this many steps in the real run (overwrites nsteps from "
2333 "[REF].tpr[ref], add [REF].cpt[ref] steps)" },
2334 { "-launch", FALSE
, etBOOL
, { &bLaunch
}, "Launch the real simulation after optimization" },
2339 "Run the benchmarks or just create the input [REF].tpr[ref] files?" },
2344 "Before the benchmark runs, check whether mdrun works in parallel" },
2348 { &eligible_gpu_ids
},
2349 "List of unique GPU device IDs that are eligible for use" },
2350 /******************/
2351 /* mdrun options: */
2352 /******************/
2353 /* We let tune_pme parse and understand these options, because we need to
2354 * prevent that they appear on the mdrun command line for the benchmarks */
2359 "Append to previous output files when continuing from checkpoint instead of adding the "
2360 "simulation part number to all file names (for launch only)" },
2364 { &bKeepAndNumCPT
},
2365 "Keep and number checkpoint files (launch only)" },
2366 { "-deffnm", FALSE
, etSTR
, { &deffnm
}, "Set the default filenames (launch only)" },
2370 { &bResetCountersHalfWay
},
2371 "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt] "
2375 #define NFILE asize(fnm)
2377 seconds
= gmx_gettime();
2379 if (!parse_common_args(&argc
, argv
, PCA_NOEXIT_ON_ARGS
, NFILE
, fnm
, asize(pa
), pa
, asize(desc
),
2380 desc
, 0, nullptr, &oenv
))
2385 // procstring[0]Â is used inside two different conditionals further down
2386 GMX_RELEASE_ASSERT(procstring
[0] != nullptr, "Options inconsistency; procstring[0]Â is NULL");
2388 /* Store the remaining unparsed command line entries in a string which
2389 * is then attached to the mdrun command line */
2391 ExtraArgs
[0] = '\0';
2392 for (i
= 1; i
< argc
; i
++) /* argc will now be 1 if everything was understood */
2394 add_to_string(&ExtraArgs
, argv
[i
]);
2395 add_to_string(&ExtraArgs
, " ");
2398 if (opt2parg_bSet("-ntmpi", asize(pa
), pa
))
2401 if (opt2parg_bSet("-npstring", asize(pa
), pa
))
2403 fprintf(stderr
, "WARNING: -npstring has no effect when using threads.\n");
2408 gmx_fatal(FARGS
, "Can't run multi-threaded MPI simulation yet!");
2410 /* and now we just set this; a bit of an ugly hack*/
2413 /* Check for PME:PP ratio and whether tpr triggers additional output files */
2414 guessPMEratio
= inspect_tpr(NFILE
, fnm
, &rcoulomb
);
2416 /* Automatically set -beo options if -eo is set etc. */
2417 couple_files_options(NFILE
, fnm
);
2419 /* Construct the command line arguments for benchmark runs
2420 * as well as for the simulation run */
2423 sprintf(bbuf
, " -ntmpi %d ", nthreads
);
2427 /* This string will be used for MPI runs and will appear after the
2428 * mpirun command. */
2429 if (std::strcmp(procstring
[0], "none") != 0)
2431 sprintf(bbuf
, " -%s %d ", procstring
[0], nnodes
);
2441 create_command_line_snippets(bAppendFiles
, bKeepAndNumCPT
, bResetCountersHalfWay
, presteps
,
2442 NFILE
, fnm
, &cmd_args_bench
, &cmd_args_launch
, ExtraArgs
, deffnm
);
2444 /* Prepare to use checkpoint file if requested */
2446 if (opt2bSet("-cpi", NFILE
, fnm
))
2448 const char* filename
= opt2fn("-cpi", NFILE
, fnm
);
2450 read_checkpoint_part_and_step(filename
, &cpt_sim_part
, &cpt_steps
);
2451 if (cpt_sim_part
== 0)
2453 gmx_fatal(FARGS
, "Checkpoint file %s could not be read!", filename
);
2455 /* tune_pme will run the next part of the simulation */
2456 sim_part
= cpt_sim_part
+ 1;
2459 /* Open performance output file and write header info */
2460 fp
= gmx_ffopen(opt2fn("-p", NFILE
, fnm
), "w");
2462 /* Make a quick consistency check of command line parameters */
2463 check_input(nnodes
, repeats
, &ntprs
, &rmin
, rcoulomb
, &rmax
, maxPMEfraction
, minPMEfraction
,
2464 npme_fixed
, bench_nsteps
, fnm
, NFILE
, sim_part
, presteps
, asize(pa
), pa
);
2466 /* Determine the maximum and minimum number of PME nodes to test,
2467 * the actual list of settings is build in do_the_tests(). */
2468 if ((nnodes
> 2) && (npme_fixed
< -1))
2470 if (0 == std::strcmp(npmevalues_opt
[0], "auto"))
2472 /* Determine the npme range automatically based on the PME:PP load guess */
2473 if (guessPMEratio
> 1.0)
2475 /* More PME than PP work, probably we do not need separate PME nodes at all! */
2476 maxPMEnodes
= nnodes
/ 2;
2477 minPMEnodes
= nnodes
/ 2;
2481 /* PME : PP load is in the range 0..1, let's test around the guess */
2482 guessPMEnodes
= static_cast<int>(nnodes
/ (1.0 + 1.0 / guessPMEratio
));
2483 minPMEnodes
= static_cast<int>(std::floor(0.7 * guessPMEnodes
));
2484 maxPMEnodes
= static_cast<int>(std::ceil(1.6 * guessPMEnodes
));
2485 maxPMEnodes
= std::min(maxPMEnodes
, nnodes
/ 2);
2490 /* Determine the npme range based on user input */
2491 maxPMEnodes
= static_cast<int>(std::floor(maxPMEfraction
* nnodes
));
2492 minPMEnodes
= std::max(static_cast<int>(std::floor(minPMEfraction
* nnodes
)), 0);
2493 fprintf(stdout
, "Will try runs with %d ", minPMEnodes
);
2494 if (maxPMEnodes
!= minPMEnodes
)
2496 fprintf(stdout
, "- %d ", maxPMEnodes
);
2499 "PME-only ranks.\n Note that the automatic number of PME-only ranks and no "
2500 "separate PME ranks are always tested.\n");
2509 /* Get the commands we need to set up the runs from environment variables */
2510 get_program_paths(bThreads
, &cmd_mpirun
, &cmd_mdrun
);
2511 if (bBenchmark
&& repeats
> 0)
2513 check_mdrun_works(bThreads
, cmd_mpirun
, cmd_np
, cmd_mdrun
, nullptr != eligible_gpu_ids
);
2516 /* Print some header info to file */
2518 fprintf(fp
, "\n P E R F O R M A N C E R E S U L T S\n");
2520 fprintf(fp
, "%s for GROMACS %s\n", output_env_get_program_display_name(oenv
), gmx_version());
2523 fprintf(fp
, "Number of ranks : %d\n", nnodes
);
2524 fprintf(fp
, "The mpirun command is : %s\n", cmd_mpirun
);
2525 if (std::strcmp(procstring
[0], "none") != 0)
2527 fprintf(fp
, "Passing # of ranks via : -%s\n", procstring
[0]);
2531 fprintf(fp
, "Not setting number of ranks in system call\n");
2536 fprintf(fp
, "Number of threads : %d\n", nnodes
);
2539 fprintf(fp
, "The mdrun command is : %s\n", cmd_mdrun
);
2540 fprintf(fp
, "mdrun args benchmarks : %s\n", cmd_args_bench
);
2541 fprintf(fp
, "Benchmark steps : ");
2542 fprintf(fp
, "%" PRId64
, bench_nsteps
);
2544 fprintf(fp
, "dlb equilibration steps : %d\n", presteps
);
2547 fprintf(fp
, "Checkpoint time step : ");
2548 fprintf(fp
, "%" PRId64
, cpt_steps
);
2551 fprintf(fp
, "mdrun args at launchtime: %s\n", cmd_args_launch
);
2553 if (new_sim_nsteps
>= 0)
2556 fprintf(stderr
, "Note: Simulation input file %s will have ", opt2fn("-so", NFILE
, fnm
));
2557 fprintf(stderr
, "%" PRId64
, new_sim_nsteps
+ cpt_steps
);
2558 fprintf(stderr
, " steps.\n");
2559 fprintf(fp
, "Simulation steps : ");
2560 fprintf(fp
, "%" PRId64
, new_sim_nsteps
);
2565 fprintf(fp
, "Repeats for each test : %d\n", repeats
);
2568 if (npme_fixed
>= -1)
2570 fprintf(fp
, "Fixing -npme at : %d\n", npme_fixed
);
2573 fprintf(fp
, "Input file : %s\n", opt2fn("-s", NFILE
, fnm
));
2574 fprintf(fp
, " PME/PP load estimate : %g\n", guessPMEratio
);
2576 /* Allocate memory for the inputinfo struct: */
2578 info
->nr_inputfiles
= ntprs
;
2579 for (i
= 0; i
< ntprs
; i
++)
2581 snew(info
->rcoulomb
, ntprs
);
2582 snew(info
->rvdw
, ntprs
);
2583 snew(info
->rlist
, ntprs
);
2584 snew(info
->nkx
, ntprs
);
2585 snew(info
->nky
, ntprs
);
2586 snew(info
->nkz
, ntprs
);
2587 snew(info
->fsx
, ntprs
);
2588 snew(info
->fsy
, ntprs
);
2589 snew(info
->fsz
, ntprs
);
2591 /* Make alternative tpr files to test: */
2592 snew(tpr_names
, ntprs
);
2593 for (i
= 0; i
< ntprs
; i
++)
2595 snew(tpr_names
[i
], STRLEN
);
2598 /* It can be that ntprs is reduced by make_benchmark_tprs if not enough
2599 * different grids could be found. */
2600 make_benchmark_tprs(opt2fn("-s", NFILE
, fnm
), tpr_names
, bench_nsteps
+ presteps
, cpt_steps
,
2601 rmin
, rmax
, bScaleRvdw
, &ntprs
, info
, fp
);
2603 /********************************************************************************/
2604 /* Main loop over all scenarios we need to test: tpr files, PME nodes, repeats */
2605 /********************************************************************************/
2606 snew(perfdata
, ntprs
);
2609 GMX_RELEASE_ASSERT(npmevalues_opt
[0] != nullptr,
2610 "Options inconsistency; npmevalues_opt[0] is NULL");
2611 do_the_tests(fp
, tpr_names
, maxPMEnodes
, minPMEnodes
, npme_fixed
, npmevalues_opt
[0], perfdata
,
2612 &pmeentries
, repeats
, nnodes
, ntprs
, bThreads
, cmd_mpirun
, cmd_np
, cmd_mdrun
,
2613 cmd_args_bench
, fnm
, NFILE
, presteps
, cpt_steps
, bCheck
, eligible_gpu_ids
);
2615 fprintf(fp
, "\nTuning took%8.1f minutes.\n", (gmx_gettime() - seconds
) / 60.0);
2617 /* Analyse the results and give a suggestion for optimal settings: */
2618 bKeepTPR
= analyze_data(fp
, opt2fn("-p", NFILE
, fnm
), perfdata
, nnodes
, ntprs
, pmeentries
,
2619 repeats
, info
, &best_tpr
, &best_npme
);
2621 /* Take the best-performing tpr file and enlarge nsteps to original value */
2622 if (bKeepTPR
&& !bOverwrite
)
2624 simulation_tpr
= opt2fn("-s", NFILE
, fnm
);
2628 simulation_tpr
= opt2fn("-so", NFILE
, fnm
);
2629 modify_PMEsettings(bOverwrite
? (new_sim_nsteps
+ cpt_steps
) : info
->orig_sim_steps
,
2630 info
->orig_init_step
, tpr_names
[best_tpr
], simulation_tpr
);
2633 /* Let's get rid of the temporary benchmark input files */
2634 for (i
= 0; i
< ntprs
; i
++)
2636 fprintf(stdout
, "Deleting temporary benchmark input file %s\n", tpr_names
[i
]);
2637 remove(tpr_names
[i
]);
2640 /* Now start the real simulation if the user requested it ... */
2641 launch_simulation(bLaunch
, fp
, bThreads
, cmd_mpirun
, cmd_np
, cmd_mdrun
, cmd_args_launch
,
2642 simulation_tpr
, best_npme
, eligible_gpu_ids
);
2646 /* ... or simply print the performance results to screen: */
2649 finalize(opt2fn("-p", NFILE
, fnm
));