Update instructions in containers.rst
[gromacs.git] / src / gromacs / timing / wallcycle.cpp
bloba80254755c4a149b363745bb98029b60e4861860
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2008, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
7 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
8 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9 * and including many others, as listed in the AUTHORS file in the
10 * top-level source directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
38 #include "gmxpre.h"
40 #include "wallcycle.h"
42 #include "config.h"
44 #include <cstdlib>
46 #include <array>
47 #include <vector>
49 #include "gromacs/math/functions.h"
50 #include "gromacs/mdtypes/commrec.h"
51 #include "gromacs/timing/cyclecounter.h"
52 #include "gromacs/timing/gpu_timing.h"
53 #include "gromacs/timing/wallcyclereporting.h"
54 #include "gromacs/utility/cstringutil.h"
55 #include "gromacs/utility/gmxassert.h"
56 #include "gromacs/utility/gmxmpi.h"
57 #include "gromacs/utility/logger.h"
58 #include "gromacs/utility/smalloc.h"
59 #include "gromacs/utility/snprintf.h"
61 static const bool useCycleSubcounters = GMX_CYCLE_SUBCOUNTERS;
63 #ifndef DEBUG_WCYCLE
64 /*! \brief Enables consistency checking for the counters.
66 * If the macro is set to 1, code checks if you stop a counter different from the last
67 * one that was opened and if you do nest too deep.
69 # define DEBUG_WCYCLE 0
70 #endif
71 //! Whether wallcycle debugging is enabled
72 constexpr bool gmx_unused enableWallcycleDebug = (DEBUG_WCYCLE != 0);
73 //! True if only the master rank should print debugging output
74 constexpr bool gmx_unused onlyMasterDebugPrints = true;
75 //! True if cycle counter nesting depth debuggin prints are enabled
76 constexpr bool gmx_unused debugPrintDepth = false /* enableWallcycleDebug */;
78 #if DEBUG_WCYCLE
79 # include "gromacs/utility/fatalerror.h"
80 #endif
82 typedef struct
84 int n;
85 gmx_cycles_t c;
86 gmx_cycles_t start;
87 } wallcc_t;
89 struct gmx_wallcycle
91 wallcc_t* wcc;
92 /* did we detect one or more invalid cycle counts */
93 gmx_bool haveInvalidCount;
94 /* variables for testing/debugging */
95 gmx_bool wc_barrier;
96 wallcc_t* wcc_all;
97 int wc_depth;
98 #if DEBUG_WCYCLE
99 # define DEPTH_MAX 6
100 int counterlist[DEPTH_MAX];
101 int count_depth;
102 bool isMasterRank;
103 #endif
104 int ewc_prev;
105 gmx_cycles_t cycle_prev;
106 int64_t reset_counters;
107 #if GMX_MPI
108 MPI_Comm mpi_comm_mygroup;
109 #endif
110 wallcc_t* wcsc;
113 /* Each name should not exceed 19 printing characters
114 (ie. terminating null can be twentieth) */
115 static const char* wcn[ewcNR] = { "Run",
116 "Step",
117 "PP during PME",
118 "Domain decomp.",
119 "DD comm. load",
120 "DD comm. bounds",
121 "Vsite constr.",
122 "Send X to PME",
123 "Neighbor search",
124 "Launch GPU ops.",
125 "Comm. coord.",
126 "Force",
127 "Wait + Comm. F",
128 "PME mesh",
129 "PME redist. X/F",
130 "PME spread",
131 "PME gather",
132 "PME 3D-FFT",
133 "PME 3D-FFT Comm.",
134 "PME solve LJ",
135 "PME solve Elec",
136 "PME wait for PP",
137 "Wait + Recv. PME F",
138 "Wait PME GPU spread",
139 "PME 3D-FFT",
140 "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
141 "Wait PME GPU gather",
142 "Wait Bonded GPU",
143 "Reduce GPU PME F",
144 "Wait GPU NB nonloc.",
145 "Wait GPU NB local",
146 "Wait GPU state copy",
147 "NB X/F buffer ops.",
148 "Vsite spread",
149 "COM pull force",
150 "AWH",
151 "Write traj.",
152 "Update",
153 "Constraints",
154 "Comm. energies",
155 "Enforced rotation",
156 "Add rot. forces",
157 "Position swapping",
158 "IMD",
159 "Test" };
161 static const char* wcsn[ewcsNR] = {
162 "DD redist.",
163 "DD NS grid + sort",
164 "DD setup comm.",
165 "DD make top.",
166 "DD make constr.",
167 "DD top. other",
168 "DD GPU ops.",
169 "NS grid local",
170 "NS grid non-loc.",
171 "NS search local",
172 "NS search non-loc.",
173 "Bonded F",
174 "Bonded-FEP F",
175 "Restraints F",
176 "Listed buffer ops.",
177 "Nonbonded pruning",
178 "Nonbonded F kernel",
179 "Nonbonded F clear",
180 "Nonbonded FEP",
181 "Launch NB GPU tasks",
182 "Launch Bonded GPU tasks",
183 "Launch PME GPU tasks",
184 "Launch state copy",
185 "Ewald F correction",
186 "NB X buffer ops.",
187 "NB F buffer ops.",
188 "Clear force buffer",
189 "Launch GPU NB X buffer ops.",
190 "Launch GPU NB F buffer ops.",
191 "Launch GPU Comm. coord.",
192 "Launch GPU Comm. force.",
193 "Launch GPU update",
194 "Test subcounter",
197 /* PME GPU timing events' names - correspond to the enum in the gpu_timing.h */
198 static const char* PMEStageNames[] = {
199 "PME spline", "PME spread", "PME spline + spread", "PME 3D-FFT r2c",
200 "PME solve", "PME 3D-FFT c2r", "PME gather",
203 gmx_bool wallcycle_have_counter()
205 return gmx_cycles_have_counter();
208 gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, t_commrec gmx_unused* cr)
210 gmx_wallcycle_t wc;
213 if (!wallcycle_have_counter())
215 return nullptr;
218 snew(wc, 1);
220 wc->haveInvalidCount = FALSE;
221 wc->wc_barrier = FALSE;
222 wc->wcc_all = nullptr;
223 wc->wc_depth = 0;
224 wc->ewc_prev = -1;
225 wc->reset_counters = resetstep;
228 #if GMX_MPI
229 if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr)
231 if (fplog)
233 fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
235 wc->wc_barrier = TRUE;
236 wc->mpi_comm_mygroup = cr->mpi_comm_mygroup;
238 #endif
240 snew(wc->wcc, ewcNR);
241 if (getenv("GMX_CYCLE_ALL") != nullptr)
243 if (fplog)
245 fprintf(fplog, "\nWill time all the code during the run\n\n");
247 snew(wc->wcc_all, ewcNR * ewcNR);
250 if (useCycleSubcounters)
252 snew(wc->wcsc, ewcsNR);
255 #if DEBUG_WCYCLE
256 wc->count_depth = 0;
257 wc->isMasterRank = MASTER(cr);
258 #endif
260 return wc;
263 void wallcycle_destroy(gmx_wallcycle_t wc)
265 if (wc == nullptr)
267 return;
270 if (wc->wcc != nullptr)
272 sfree(wc->wcc);
274 if (wc->wcc_all != nullptr)
276 sfree(wc->wcc_all);
278 if (wc->wcsc != nullptr)
280 sfree(wc->wcsc);
282 sfree(wc);
285 static void wallcycle_all_start(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
287 wc->ewc_prev = ewc;
288 wc->cycle_prev = cycle;
291 static void wallcycle_all_stop(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
293 wc->wcc_all[wc->ewc_prev * ewcNR + ewc].n += 1;
294 wc->wcc_all[wc->ewc_prev * ewcNR + ewc].c += cycle - wc->cycle_prev;
298 #if DEBUG_WCYCLE
299 static void debug_start_check(gmx_wallcycle_t wc, int ewc)
301 if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
303 gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth + 1);
305 wc->counterlist[wc->count_depth] = ewc;
306 wc->count_depth++;
308 if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
310 std::string indentStr(4 * wc->count_depth, ' ');
311 fprintf(stderr, "%swcycle_start depth %d, %s\n", indentStr.c_str(), wc->count_depth, wcn[ewc]);
315 static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
317 if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
319 std::string indentStr(4 * wc->count_depth, ' ');
320 fprintf(stderr, "%swcycle_stop depth %d, %s\n", indentStr.c_str(), wc->count_depth, wcn[ewc]);
323 wc->count_depth--;
325 if (wc->count_depth < 0)
327 gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc],
328 wc->count_depth);
330 if (wc->counterlist[wc->count_depth] != ewc)
332 gmx_fatal(FARGS, "wallcycle mismatch at stop, start %s, stop %s",
333 wcn[wc->counterlist[wc->count_depth]], wcn[ewc]);
336 #endif
338 void wallcycle_start(gmx_wallcycle_t wc, int ewc)
340 gmx_cycles_t cycle;
342 if (wc == nullptr)
344 return;
347 #if GMX_MPI
348 if (wc->wc_barrier)
350 MPI_Barrier(wc->mpi_comm_mygroup);
352 #endif
354 #if DEBUG_WCYCLE
355 debug_start_check(wc, ewc);
356 #endif
358 cycle = gmx_cycles_read();
359 wc->wcc[ewc].start = cycle;
360 if (wc->wcc_all != nullptr)
362 wc->wc_depth++;
363 if (ewc == ewcRUN)
365 wallcycle_all_start(wc, ewc, cycle);
367 else if (wc->wc_depth == 3)
369 wallcycle_all_stop(wc, ewc, cycle);
374 void wallcycle_increment_event_count(gmx_wallcycle_t wc, int ewc)
376 if (wc == nullptr)
378 return;
380 wc->wcc[ewc].n++;
383 void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
385 if (wc == nullptr)
387 return;
390 wallcycle_start(wc, ewc);
391 wc->wcc[ewc].n--;
394 double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
396 gmx_cycles_t cycle, last;
398 if (wc == nullptr)
400 return 0;
403 #if GMX_MPI
404 if (wc->wc_barrier)
406 MPI_Barrier(wc->mpi_comm_mygroup);
408 #endif
410 #if DEBUG_WCYCLE
411 debug_stop_check(wc, ewc);
412 #endif
414 /* When processes or threads migrate between cores, the cycle counting
415 * can get messed up if the cycle counter on different cores are not
416 * synchronized. When this happens we expect both large negative and
417 * positive cycle differences. We can detect negative cycle differences.
418 * Detecting too large positive counts if difficult, since count can be
419 * large, especially for ewcRUN. If we detect a negative count,
420 * we will not print the cycle accounting table.
422 cycle = gmx_cycles_read();
423 if (cycle >= wc->wcc[ewc].start)
425 last = cycle - wc->wcc[ewc].start;
427 else
429 last = 0;
430 wc->haveInvalidCount = TRUE;
432 wc->wcc[ewc].c += last;
433 wc->wcc[ewc].n++;
434 if (wc->wcc_all)
436 wc->wc_depth--;
437 if (ewc == ewcRUN)
439 wallcycle_all_stop(wc, ewc, cycle);
441 else if (wc->wc_depth == 2)
443 wallcycle_all_start(wc, ewc, cycle);
447 return last;
450 void wallcycle_get(gmx_wallcycle_t wc, int ewc, int* n, double* c)
452 *n = wc->wcc[ewc].n;
453 *c = static_cast<double>(wc->wcc[ewc].c);
456 void wallcycle_reset_all(gmx_wallcycle_t wc)
458 int i;
460 if (wc == nullptr)
462 return;
465 for (i = 0; i < ewcNR; i++)
467 wc->wcc[i].n = 0;
468 wc->wcc[i].c = 0;
470 wc->haveInvalidCount = FALSE;
472 if (wc->wcc_all)
474 for (i = 0; i < ewcNR * ewcNR; i++)
476 wc->wcc_all[i].n = 0;
477 wc->wcc_all[i].c = 0;
480 if (wc->wcsc)
482 for (i = 0; i < ewcsNR; i++)
484 wc->wcsc[i].n = 0;
485 wc->wcsc[i].c = 0;
490 static gmx_bool is_pme_counter(int ewc)
492 return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
495 static gmx_bool is_pme_subcounter(int ewc)
497 return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
500 /* Subtract counter ewc_sub timed inside a timing block for ewc_main */
501 static void subtract_cycles(wallcc_t* wcc, int ewc_main, int ewc_sub)
503 if (wcc[ewc_sub].n > 0)
505 if (wcc[ewc_main].c >= wcc[ewc_sub].c)
507 wcc[ewc_main].c -= wcc[ewc_sub].c;
509 else
511 /* Something is wrong with the cycle counting */
512 wcc[ewc_main].c = 0;
517 void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
519 if (wc == nullptr)
521 return;
524 for (int i = 0; i < ewcNR; i++)
526 if (is_pme_counter(i) || (i == ewcRUN && isPmeRank))
528 wc->wcc[i].c *= nthreads_pme;
530 if (wc->wcc_all)
532 for (int j = 0; j < ewcNR; j++)
534 wc->wcc_all[i * ewcNR + j].c *= nthreads_pme;
538 else
540 wc->wcc[i].c *= nthreads_pp;
542 if (wc->wcc_all)
544 for (int j = 0; j < ewcNR; j++)
546 wc->wcc_all[i * ewcNR + j].c *= nthreads_pp;
551 if (useCycleSubcounters && wc->wcsc && !isPmeRank)
553 for (int i = 0; i < ewcsNR; i++)
555 wc->wcsc[i].c *= nthreads_pp;
560 /* TODO Make an object for this function to return, containing some
561 * vectors of something like wallcc_t for the summed wcc, wcc_all and
562 * wcsc, AND the original wcc for rank 0.
564 * The GPU timing is reported only for rank 0, so we want to preserve
565 * the original wcycle on that rank. Rank 0 also reports the global
566 * counts before that, so needs something to contain the global data
567 * without over-writing the rank-0 data. The current implementation
568 * uses cycles_sum to manage this, which works OK now because wcsc and
569 * wcc_all are unused by the GPU reporting, but it is not satisfactory
570 * for the future. Also, there's no need for MPI_Allreduce, since
571 * only MASTERRANK uses any of the results. */
572 WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle_t wc)
574 WallcycleCounts cycles_sum;
575 wallcc_t* wcc;
576 double cycles[int(ewcNR) + int(ewcsNR)];
577 #if GMX_MPI
578 double cycles_n[int(ewcNR) + int(ewcsNR) + 1];
579 #endif
580 int i;
581 int nsum;
583 if (wc == nullptr)
585 /* Default construction of std::array of non-class T can leave
586 the values indeterminate, just like a C array, and icc
587 warns about it. */
588 cycles_sum.fill(0);
589 return cycles_sum;
592 wcc = wc->wcc;
594 subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
595 subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);
597 subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);
599 if (cr->npmenodes == 0)
601 /* All nodes do PME (or no PME at all) */
602 subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
604 else
606 /* The are PME-only nodes */
607 if (wcc[ewcPMEMESH].n > 0)
609 /* This must be a PME only node, calculate the Wait + Comm. time */
610 GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c,
611 "Total run ticks must be greater than PME-only ticks");
612 wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
616 /* Store the cycles in a double buffer for summing */
617 for (i = 0; i < ewcNR; i++)
619 #if GMX_MPI
620 cycles_n[i] = static_cast<double>(wcc[i].n);
621 #endif
622 cycles[i] = static_cast<double>(wcc[i].c);
624 nsum = ewcNR;
625 if (wc->wcsc)
627 for (i = 0; i < ewcsNR; i++)
629 #if GMX_MPI
630 cycles_n[ewcNR + i] = static_cast<double>(wc->wcsc[i].n);
631 #endif
632 cycles[ewcNR + i] = static_cast<double>(wc->wcsc[i].c);
634 nsum += ewcsNR;
637 #if GMX_MPI
638 if (cr->nnodes > 1)
640 double buf[int(ewcNR) + int(ewcsNR) + 1];
642 // TODO this code is used only at the end of the run, so we
643 // can just do a simple reduce of haveInvalidCount in
644 // wallcycle_print, and avoid bugs
645 cycles_n[nsum] = (wc->haveInvalidCount ? 1 : 0);
646 // TODO Use MPI_Reduce
647 MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
648 for (i = 0; i < ewcNR; i++)
650 wcc[i].n = gmx::roundToInt(buf[i]);
652 wc->haveInvalidCount = (buf[nsum] > 0);
653 if (wc->wcsc)
655 for (i = 0; i < ewcsNR; i++)
657 wc->wcsc[i].n = gmx::roundToInt(buf[ewcNR + i]);
661 // TODO Use MPI_Reduce
662 MPI_Allreduce(cycles, cycles_sum.data(), nsum, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
664 if (wc->wcc_all != nullptr)
666 double *buf_all, *cyc_all;
668 snew(cyc_all, ewcNR * ewcNR);
669 snew(buf_all, ewcNR * ewcNR);
670 for (i = 0; i < ewcNR * ewcNR; i++)
672 cyc_all[i] = wc->wcc_all[i].c;
674 // TODO Use MPI_Reduce
675 MPI_Allreduce(cyc_all, buf_all, ewcNR * ewcNR, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
676 for (i = 0; i < ewcNR * ewcNR; i++)
678 wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
680 sfree(buf_all);
681 sfree(cyc_all);
684 else
685 #endif
687 for (i = 0; i < nsum; i++)
689 cycles_sum[i] = cycles[i];
693 return cycles_sum;
696 static void
697 print_cycles(FILE* fplog, double c2t, const char* name, int nnodes, int nthreads, int ncalls, double c_sum, double tot)
699 char nnodes_str[STRLEN];
700 char nthreads_str[STRLEN];
701 char ncalls_str[STRLEN];
702 double wallt;
703 double percentage = (tot > 0.) ? (100. * c_sum / tot) : 0.;
705 if (c_sum > 0)
707 if (ncalls > 0)
709 snprintf(ncalls_str, sizeof(ncalls_str), "%10d", ncalls);
710 if (nnodes < 0)
712 snprintf(nnodes_str, sizeof(nnodes_str), "N/A");
714 else
716 snprintf(nnodes_str, sizeof(nnodes_str), "%4d", nnodes);
718 if (nthreads < 0)
720 snprintf(nthreads_str, sizeof(nthreads_str), "N/A");
722 else
724 snprintf(nthreads_str, sizeof(nthreads_str), "%4d", nthreads);
727 else
729 nnodes_str[0] = 0;
730 nthreads_str[0] = 0;
731 ncalls_str[0] = 0;
733 /* Convert the cycle count to wallclock time for this task */
734 wallt = c_sum * c2t;
736 fprintf(fplog, " %-19.19s %4s %4s %10s %10.3f %14.3f %5.1f\n", name, nnodes_str,
737 nthreads_str, ncalls_str, wallt, c_sum * 1e-9, percentage);
741 static void print_gputimes(FILE* fplog, const char* name, int n, double t, double tot_t)
743 char num[11];
744 char avg_perf[11];
746 if (n > 0)
748 snprintf(num, sizeof(num), "%10d", n);
749 snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t / n);
751 else
753 sprintf(num, " ");
754 sprintf(avg_perf, " ");
756 if (t != tot_t && tot_t > 0)
758 fprintf(fplog, " %-29s %10s%12.3f %s %5.1f\n", name, num, t / 1000, avg_perf, 100 * t / tot_t);
760 else
762 fprintf(fplog, " %-29s %10s%12.3f %s %5.1f\n", name, "", t / 1000, avg_perf, 100.0);
766 static void print_header(FILE* fplog, int nrank_pp, int nth_pp, int nrank_pme, int nth_pme)
768 int nrank_tot = nrank_pp + nrank_pme;
769 if (0 == nrank_pme)
771 fprintf(fplog, "On %d MPI rank%s", nrank_tot, nrank_tot == 1 ? "" : "s");
772 if (nth_pp > 1)
774 fprintf(fplog, ", each using %d OpenMP threads", nth_pp);
776 /* Don't report doing PP+PME, because we can't tell here if
777 * this is RF, etc. */
779 else
781 fprintf(fplog, "On %d MPI rank%s doing PP", nrank_pp, nrank_pp == 1 ? "" : "s");
782 if (nth_pp > 1)
784 fprintf(fplog, ",%s using %d OpenMP threads", nrank_pp > 1 ? " each" : "", nth_pp);
786 fprintf(fplog, ", and\non %d MPI rank%s doing PME", nrank_pme, nrank_pme == 1 ? "" : "s");
787 if (nth_pme > 1)
789 fprintf(fplog, ",%s using %d OpenMP threads", nrank_pme > 1 ? " each" : "", nth_pme);
793 fprintf(fplog, "\n\n");
794 fprintf(fplog, " Computing: Num Num Call Wall time Giga-Cycles\n");
795 fprintf(fplog, " Ranks Threads Count (s) total sum %%\n");
799 void wallcycle_print(FILE* fplog,
800 const gmx::MDLogger& mdlog,
801 int nnodes,
802 int npme,
803 int nth_pp,
804 int nth_pme,
805 double realtime,
806 gmx_wallcycle_t wc,
807 const WallcycleCounts& cyc_sum,
808 const gmx_wallclock_gpu_nbnxn_t* gpu_nbnxn_t,
809 const gmx_wallclock_gpu_pme_t* gpu_pme_t)
811 double tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio;
812 double c2t, c2t_pp, c2t_pme = 0;
813 int i, j, npp, nth_tot;
814 char buf[STRLEN];
815 const char* hline =
816 "-----------------------------------------------------------------------------";
818 if (wc == nullptr)
820 return;
823 GMX_ASSERT(nth_pp > 0, "Number of particle-particle threads must be >0");
824 GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0");
825 GMX_ASSERT(nnodes > 0, "Number of nodes must be >0");
826 GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative");
827 npp = nnodes - npme;
828 /* npme is the number of PME-only ranks used, and we always do PP work */
829 GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0");
831 nth_tot = npp * nth_pp + npme * nth_pme;
833 /* When using PME-only nodes, the next line is valid for both
834 PP-only and PME-only nodes because they started ewcRUN at the
835 same time. */
836 tot = cyc_sum[ewcRUN];
837 tot_for_pp = 0;
839 if (tot <= 0.0)
841 /* TODO This is heavy handed, but until someone reworks the
842 code so that it is provably robust with respect to
843 non-positive values for all possible timer and cycle
844 counters, there is less value gained from printing whatever
845 timing data might still be sensible for some non-Jenkins
846 run, than is lost from diagnosing Jenkins FP exceptions on
847 runs about whose execution time we don't care. */
848 GMX_LOG(mdlog.warning)
849 .asParagraph()
850 .appendTextFormatted(
851 "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a "
852 "time accounting",
853 tot);
854 return;
857 if (wc->haveInvalidCount)
859 GMX_LOG(mdlog.warning)
860 .asParagraph()
861 .appendText(
862 "NOTE: Detected invalid cycle counts, probably because threads moved "
863 "between CPU cores that do not have synchronized cycle counters. Will not "
864 "print the cycle accounting.");
865 return;
869 /* Conversion factor from cycles to seconds */
870 c2t = realtime / tot;
871 c2t_pp = c2t * nth_tot / static_cast<double>(npp * nth_pp);
872 if (npme > 0)
874 c2t_pme = c2t * nth_tot / static_cast<double>(npme * nth_pme);
876 else
878 c2t_pme = 0;
881 fprintf(fplog, "\n R E A L C Y C L E A N D T I M E A C C O U N T I N G\n\n");
883 print_header(fplog, npp, nth_pp, npme, nth_pme);
885 fprintf(fplog, "%s\n", hline);
886 for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
888 if (is_pme_subcounter(i))
890 /* Do not count these at all */
892 else if (npme > 0 && is_pme_counter(i))
894 /* Print timing information for PME-only nodes, but add an
895 * asterisk so the reader of the table can know that the
896 * walltimes are not meant to add up. The asterisk still
897 * fits in the required maximum of 19 characters. */
898 char buffer[STRLEN];
899 snprintf(buffer, STRLEN, "%s *", wcn[i]);
900 print_cycles(fplog, c2t_pme, buffer, npme, nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
902 else
904 /* Print timing information when it is for a PP or PP+PME
905 node */
906 print_cycles(fplog, c2t_pp, wcn[i], npp, nth_pp, wc->wcc[i].n, cyc_sum[i], tot);
907 tot_for_pp += cyc_sum[i];
910 if (wc->wcc_all != nullptr)
912 for (i = 0; i < ewcNR; i++)
914 for (j = 0; j < ewcNR; j++)
916 snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
917 print_cycles(fplog, c2t_pp, buf, npp, nth_pp, wc->wcc_all[i * ewcNR + j].n,
918 wc->wcc_all[i * ewcNR + j].c, tot);
922 tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot);
923 print_cycles(fplog, c2t_pp, "Rest", npp, nth_pp, -1, tot_for_rest - tot_for_pp, tot);
924 fprintf(fplog, "%s\n", hline);
925 print_cycles(fplog, c2t, "Total", npp, nth_pp, -1, tot, tot);
926 fprintf(fplog, "%s\n", hline);
928 if (npme > 0)
930 fprintf(fplog,
931 "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
932 " twice the total reported, but the cycle count total and %% are correct.\n"
933 "%s\n",
934 hline);
937 if (wc->wcc[ewcPMEMESH].n > 0)
939 // A workaround to not print breakdown when no subcounters were recorded.
940 // TODO: figure out and record PME GPU counters (what to do with the waiting ones?)
941 std::vector<int> validPmeSubcounterIndices;
942 for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
944 if (is_pme_subcounter(i) && wc->wcc[i].n > 0)
946 validPmeSubcounterIndices.push_back(i);
950 if (!validPmeSubcounterIndices.empty())
952 fprintf(fplog, " Breakdown of PME mesh computation\n");
953 fprintf(fplog, "%s\n", hline);
954 for (auto i : validPmeSubcounterIndices)
956 print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i], npme > 0 ? npme : npp,
957 nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
959 fprintf(fplog, "%s\n", hline);
963 if (useCycleSubcounters && wc->wcsc)
965 fprintf(fplog, " Breakdown of PP computation\n");
966 fprintf(fplog, "%s\n", hline);
967 for (i = 0; i < ewcsNR; i++)
969 print_cycles(fplog, c2t_pp, wcsn[i], npp, nth_pp, wc->wcsc[i].n, cyc_sum[ewcNR + i], tot);
971 fprintf(fplog, "%s\n", hline);
974 /* print GPU timing summary */
975 double tot_gpu = 0.0;
976 if (gpu_pme_t)
978 for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
980 tot_gpu += gpu_pme_t->timing[k].t;
983 if (gpu_nbnxn_t)
985 const char* k_log_str[2][2] = { { "Nonbonded F kernel", "Nonbonded F+ene k." },
986 { "Nonbonded F+prune k.", "Nonbonded F+ene+prune k." } };
987 tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t;
989 /* add up the kernel timings */
990 for (i = 0; i < 2; i++)
992 for (j = 0; j < 2; j++)
994 tot_gpu += gpu_nbnxn_t->ktime[i][j].t;
997 tot_gpu += gpu_nbnxn_t->pruneTime.t;
999 tot_cpu_overlap = wc->wcc[ewcFORCE].c;
1000 if (wc->wcc[ewcPMEMESH].n > 0)
1002 tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
1004 tot_cpu_overlap *= realtime * 1000 / tot; /* convert s to ms */
1006 fprintf(fplog, "\n GPU timings\n%s\n", hline);
1007 fprintf(fplog,
1008 " Computing: Count Wall t (s) ms/step %c\n", '%');
1009 fprintf(fplog, "%s\n", hline);
1010 print_gputimes(fplog, "Pair list H2D", gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
1011 print_gputimes(fplog, "X / q H2D", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);
1013 for (i = 0; i < 2; i++)
1015 for (j = 0; j < 2; j++)
1017 if (gpu_nbnxn_t->ktime[i][j].c)
1019 print_gputimes(fplog, k_log_str[i][j], gpu_nbnxn_t->ktime[i][j].c,
1020 gpu_nbnxn_t->ktime[i][j].t, tot_gpu);
1024 if (gpu_pme_t)
1026 for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
1028 if (gpu_pme_t->timing[k].c)
1030 print_gputimes(fplog, PMEStageNames[k], gpu_pme_t->timing[k].c,
1031 gpu_pme_t->timing[k].t, tot_gpu);
1035 if (gpu_nbnxn_t->pruneTime.c)
1037 print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c,
1038 gpu_nbnxn_t->pruneTime.t, tot_gpu);
1040 print_gputimes(fplog, "F D2H", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu);
1041 fprintf(fplog, "%s\n", hline);
1042 print_gputimes(fplog, "Total ", gpu_nbnxn_t->nb_c, tot_gpu, tot_gpu);
1043 fprintf(fplog, "%s\n", hline);
1044 if (gpu_nbnxn_t->dynamicPruneTime.c)
1046 /* We print the dynamic pruning kernel timings after a separator
1047 * and avoid adding it to tot_gpu as this is not in the force
1048 * overlap. We print the fraction as relative to the rest.
1050 print_gputimes(fplog, "*Dynamic pruning", gpu_nbnxn_t->dynamicPruneTime.c,
1051 gpu_nbnxn_t->dynamicPruneTime.t, tot_gpu);
1052 fprintf(fplog, "%s\n", hline);
1054 gpu_cpu_ratio = tot_gpu / tot_cpu_overlap;
1055 if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
1057 fprintf(fplog,
1058 "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = "
1059 "%.3f\n",
1060 tot_gpu / gpu_nbnxn_t->nb_c, tot_cpu_overlap / wc->wcc[ewcFORCE].n, gpu_cpu_ratio);
1063 /* only print notes related to CPU-GPU load balance with PME */
1064 if (wc->wcc[ewcPMEMESH].n > 0)
1066 fprintf(fplog, "For optimal resource utilization this ratio should be close to 1\n");
1068 /* print note if the imbalance is high with PME case in which
1069 * CPU-GPU load balancing is possible */
1070 if (gpu_cpu_ratio < 0.8 || gpu_cpu_ratio > 1.25)
1072 /* Only the sim master calls this function, so always print to stderr */
1073 if (gpu_cpu_ratio < 0.8)
1075 if (npp > 1)
1077 /* The user could have used -notunepme,
1078 * but we currently can't check that here.
1080 GMX_LOG(mdlog.warning)
1081 .asParagraph()
1082 .appendText(
1083 "NOTE: The CPU has >25% more load than the GPU. This "
1084 "imbalance wastes\n"
1085 " GPU resources. Maybe the domain decomposition "
1086 "limits the PME tuning.\n"
1087 " In that case, try setting the DD grid manually "
1088 "(-dd) or lowering -dds.");
1090 else
1092 /* We should not end up here, unless the box is
1093 * too small for increasing the cut-off for PME tuning.
1095 GMX_LOG(mdlog.warning)
1096 .asParagraph()
1097 .appendText(
1098 "NOTE: The CPU has >25% more load than the GPU. This "
1099 "imbalance wastes\n"
1100 " GPU resources.");
1103 if (gpu_cpu_ratio > 1.25)
1105 GMX_LOG(mdlog.warning)
1106 .asParagraph()
1107 .appendText(
1108 "NOTE: The GPU has >25% more load than the CPU. This imbalance "
1109 "wastes\n"
1110 " CPU resources.");
1116 if (wc->wc_barrier)
1118 GMX_LOG(mdlog.warning)
1119 .asParagraph()
1120 .appendText(
1121 "MPI_Barrier was called before each cycle start/stop\n"
1122 "call, so timings are not those of real runs.");
1125 if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 && (cyc_sum[ewcDOMDEC] > tot * 0.1 || cyc_sum[ewcNS] > tot * 0.1))
1127 /* Only the sim master calls this function, so always print to stderr */
1128 if (wc->wcc[ewcDOMDEC].n == 0)
1130 GMX_LOG(mdlog.warning)
1131 .asParagraph()
1132 .appendTextFormatted(
1133 "NOTE: %d %% of the run time was spent in pair search,\n"
1134 " you might want to increase nstlist (this has no effect on "
1135 "accuracy)\n",
1136 gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
1138 else
1140 GMX_LOG(mdlog.warning)
1141 .asParagraph()
1142 .appendTextFormatted(
1143 "NOTE: %d %% of the run time was spent in domain decomposition,\n"
1144 " %d %% of the run time was spent in pair search,\n"
1145 " you might want to increase nstlist (this has no effect on "
1146 "accuracy)\n",
1147 gmx::roundToInt(100 * cyc_sum[ewcDOMDEC] / tot),
1148 gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
1152 if (cyc_sum[ewcMoveE] > tot * 0.05)
1154 GMX_LOG(mdlog.warning)
1155 .asParagraph()
1156 .appendTextFormatted(
1157 "NOTE: %d %% of the run time was spent communicating energies,\n"
1158 " you might want to increase some nst* mdp options\n",
1159 gmx::roundToInt(100 * cyc_sum[ewcMoveE] / tot));
1163 extern int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
1165 if (wc == nullptr)
1167 return -1;
1170 return wc->reset_counters;
1173 extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, int64_t reset_counters)
1175 if (wc == nullptr)
1177 return;
1180 wc->reset_counters = reset_counters;
1183 void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
1185 if (useCycleSubcounters && wc != nullptr)
1187 wc->wcsc[ewcs].start = gmx_cycles_read();
1191 void wallcycle_sub_start_nocount(gmx_wallcycle_t wc, int ewcs)
1193 if (useCycleSubcounters && wc != nullptr)
1195 wallcycle_sub_start(wc, ewcs);
1196 wc->wcsc[ewcs].n--;
1200 void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
1202 if (useCycleSubcounters && wc != nullptr)
1204 wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start;
1205 wc->wcsc[ewcs].n++;