Fix cycle counters for "comm.coord" and "Wait + Comm. F" to support GPU halo exchange...
[gromacs.git] / src / gromacs / timing / wallcycle.cpp
blob 4e3e241d9660d6e9f15f582b6c9ce2e362150037
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
38 #include "gmxpre.h"
40 #include "wallcycle.h"
42 #include "config.h"
44 #include <cstdlib>
46 #include <array>
47 #include <vector>
49 #include "gromacs/math/functions.h"
50 #include "gromacs/mdtypes/commrec.h"
51 #include "gromacs/timing/cyclecounter.h"
52 #include "gromacs/timing/gpu_timing.h"
53 #include "gromacs/timing/wallcyclereporting.h"
54 #include "gromacs/utility/cstringutil.h"
55 #include "gromacs/utility/gmxassert.h"
56 #include "gromacs/utility/gmxmpi.h"
57 #include "gromacs/utility/logger.h"
58 #include "gromacs/utility/smalloc.h"
59 #include "gromacs/utility/snprintf.h"
61 static const bool useCycleSubcounters = GMX_CYCLE_SUBCOUNTERS;
/* DEBUG_WCYCLE adds consistency checking for the counters.
 * It checks that you stop the same counter that was last started
 * and that you do not nest counters too deeply.
 */
/* #define DEBUG_WCYCLE */
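/* Illustrative sketch (not code in this file): with DEBUG_WCYCLE enabled,
 * a mismatched pair such as
 *
 *     wallcycle_start(wc, ewcFORCE);
 *     wallcycle_stop(wc, ewcUPDATE);
 *
 * aborts with the "wallcycle mismatch at stop" fatal error defined below,
 * and nesting more than DEPTH_MAX counters aborts with the
 * depth-out-of-range error.
 */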
#ifdef DEBUG_WCYCLE
#    include "gromacs/utility/fatalerror.h"
#endif

typedef struct
{
    int          n;
    gmx_cycles_t c;
    gmx_cycles_t start;
} wallcc_t;
struct gmx_wallcycle
{
    wallcc_t* wcc;
    /* did we detect one or more invalid cycle counts */
    gmx_bool haveInvalidCount;
    /* variables for testing/debugging */
    gmx_bool  wc_barrier;
    wallcc_t* wcc_all;
    int       wc_depth;
#ifdef DEBUG_WCYCLE
#    define DEPTH_MAX 6
    int counterlist[DEPTH_MAX];
    int count_depth;
#endif
    int          ewc_prev;
    gmx_cycles_t cycle_prev;
    int64_t      reset_counters;
#if GMX_MPI
    MPI_Comm mpi_comm_mygroup;
#endif
    wallcc_t* wcsc;
};
/* Each name should not exceed 19 printing characters
   (i.e. the terminating null can be the twentieth) */
static const char* wcn[ewcNR] = { "Run",
                                  "Step",
                                  "PP during PME",
                                  "Domain decomp.",
                                  "DD comm. load",
                                  "DD comm. bounds",
                                  "Vsite constr.",
                                  "Send X to PME",
                                  "Neighbor search",
                                  "Launch GPU ops.",
                                  "Comm. coord.",
                                  "Force",
                                  "Wait + Comm. F",
                                  "PME mesh",
                                  "PME redist. X/F",
                                  "PME spread",
                                  "PME gather",
                                  "PME 3D-FFT",
                                  "PME 3D-FFT Comm.",
                                  "PME solve LJ",
                                  "PME solve Elec",
                                  "PME wait for PP",
                                  "Wait + Recv. PME F",
                                  "Wait PME GPU spread",
                                  "PME 3D-FFT",
                                  "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
                                  "Wait PME GPU gather",
                                  "Wait Bonded GPU",
                                  "Reduce GPU PME F",
                                  "Wait GPU NB nonloc.",
                                  "Wait GPU NB local",
                                  "Wait GPU state copy",
                                  "NB X/F buffer ops.",
                                  "Vsite spread",
                                  "COM pull force",
                                  "AWH",
                                  "Write traj.",
                                  "Update",
                                  "Constraints",
                                  "Comm. energies",
                                  "Enforced rotation",
                                  "Add rot. forces",
                                  "Position swapping",
                                  "IMD",
                                  "Test" };
static const char* wcsn[ewcsNR] = {
    "DD redist.",
    "DD NS grid + sort",
    "DD setup comm.",
    "DD make top.",
    "DD make constr.",
    "DD top. other",
    "NS grid local",
    "NS grid non-loc.",
    "NS search local",
    "NS search non-loc.",
    "Bonded F",
    "Bonded-FEP F",
    "Restraints F",
    "Listed buffer ops.",
    "Nonbonded pruning",
    "Nonbonded F kernel",
    "Nonbonded F clear",
    "Nonbonded FEP",
    "Launch NB GPU tasks",
    "Launch Bonded GPU tasks",
    "Launch PME GPU tasks",
    "Launch state copy",
    "Ewald F correction",
    "NB X buffer ops.",
    "NB F buffer ops.",
    "Clear force buffer",
    "Launch GPU NB X buffer ops.",
    "Launch GPU NB F buffer ops.",
    "Launch GPU Comm. coord.",
    "Launch GPU Comm. force.",
    "Test subcounter"
};
/* PME GPU timing events' names - correspond to the enum in the gpu_timing.h */
static const char* PMEStageNames[] = {
    "PME spline", "PME spread", "PME spline + spread", "PME 3D-FFT r2c",
    "PME solve",  "PME 3D-FFT c2r", "PME gather",
};
gmx_bool wallcycle_have_counter()
{
    return gmx_cycles_have_counter();
}
gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, t_commrec gmx_unused* cr)
{
    gmx_wallcycle_t wc;

    if (!wallcycle_have_counter())
    {
        return nullptr;
    }

    snew(wc, 1);

    wc->haveInvalidCount = FALSE;
    wc->wc_barrier       = FALSE;
    wc->wcc_all          = nullptr;
    wc->wc_depth         = 0;
    wc->ewc_prev         = -1;
    wc->reset_counters   = resetstep;

#if GMX_MPI
    if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr)
    {
        if (fplog)
        {
            fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
        }
        wc->wc_barrier       = TRUE;
        wc->mpi_comm_mygroup = cr->mpi_comm_mygroup;
    }
#endif

    snew(wc->wcc, ewcNR);
    if (getenv("GMX_CYCLE_ALL") != nullptr)
    {
        if (fplog)
        {
            fprintf(fplog, "\nWill time all the code during the run\n\n");
        }
        snew(wc->wcc_all, ewcNR * ewcNR);
    }

    if (useCycleSubcounters)
    {
        snew(wc->wcsc, ewcsNR);
    }

#ifdef DEBUG_WCYCLE
    wc->count_depth = 0;
#endif

    return wc;
}
void wallcycle_destroy(gmx_wallcycle_t wc)
{
    if (wc == nullptr)
    {
        return;
    }

    if (wc->wcc != nullptr)
    {
        sfree(wc->wcc);
    }
    if (wc->wcc_all != nullptr)
    {
        sfree(wc->wcc_all);
    }
    if (wc->wcsc != nullptr)
    {
        sfree(wc->wcsc);
    }
    sfree(wc);
}
static void wallcycle_all_start(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
{
    wc->ewc_prev   = ewc;
    wc->cycle_prev = cycle;
}

static void wallcycle_all_stop(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
{
    wc->wcc_all[wc->ewc_prev * ewcNR + ewc].n += 1;
    wc->wcc_all[wc->ewc_prev * ewcNR + ewc].c += cycle - wc->cycle_prev;
}
#ifdef DEBUG_WCYCLE
static void debug_start_check(gmx_wallcycle_t wc, int ewc)
{
    /* fprintf(stderr,"wcycle_start depth %d, %s\n",wc->count_depth,wcn[ewc]); */

    if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
    {
        gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth);
    }
    wc->counterlist[wc->count_depth] = ewc;
    wc->count_depth++;
}

static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
{
    wc->count_depth--;

    /* fprintf(stderr,"wcycle_stop depth %d, %s\n",wc->count_depth,wcn[ewc]); */

    if (wc->count_depth < 0)
    {
        gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc],
                  wc->count_depth);
    }
    if (wc->counterlist[wc->count_depth] != ewc)
    {
        gmx_fatal(FARGS, "wallcycle mismatch at stop, start %s, stop %s",
                  wcn[wc->counterlist[wc->count_depth]], wcn[ewc]);
    }
}
#endif
void wallcycle_start(gmx_wallcycle_t wc, int ewc)
{
    gmx_cycles_t cycle;

    if (wc == nullptr)
    {
        return;
    }

#if GMX_MPI
    if (wc->wc_barrier)
    {
        MPI_Barrier(wc->mpi_comm_mygroup);
    }
#endif

#ifdef DEBUG_WCYCLE
    debug_start_check(wc, ewc);
#endif

    cycle              = gmx_cycles_read();
    wc->wcc[ewc].start = cycle;
    if (wc->wcc_all != nullptr)
    {
        wc->wc_depth++;
        if (ewc == ewcRUN)
        {
            wallcycle_all_start(wc, ewc, cycle);
        }
        else if (wc->wc_depth == 3)
        {
            wallcycle_all_stop(wc, ewc, cycle);
        }
    }
}
void wallcycle_increment_event_count(gmx_wallcycle_t wc, int ewc)
{
    if (wc == nullptr)
    {
        return;
    }
    wc->wcc[ewc].n++;
}

void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
{
    if (wc == nullptr)
    {
        return;
    }
    wallcycle_start(wc, ewc);
    wc->wcc[ewc].n--;
}
double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
{
    gmx_cycles_t cycle, last;

    if (wc == nullptr)
    {
        return 0;
    }

#if GMX_MPI
    if (wc->wc_barrier)
    {
        MPI_Barrier(wc->mpi_comm_mygroup);
    }
#endif

#ifdef DEBUG_WCYCLE
    debug_stop_check(wc, ewc);
#endif

    /* When processes or threads migrate between cores, the cycle counting
     * can get messed up if the cycle counters on different cores are not
     * synchronized. When this happens we expect both large negative and
     * positive cycle differences. We can detect negative cycle differences.
     * Detecting too large positive counts is difficult, since the count can be
     * large, especially for ewcRUN. If we detect a negative count,
     * we will not print the cycle accounting table.
     */
    cycle = gmx_cycles_read();
    if (cycle >= wc->wcc[ewc].start)
    {
        last = cycle - wc->wcc[ewc].start;
    }
    else
    {
        last                 = 0;
        wc->haveInvalidCount = TRUE;
    }
    wc->wcc[ewc].c += last;
    wc->wcc[ewc].n++;
    if (wc->wcc_all)
    {
        wc->wc_depth--;
        if (ewc == ewcRUN)
        {
            wallcycle_all_stop(wc, ewc, cycle);
        }
        else if (wc->wc_depth == 2)
        {
            wallcycle_all_start(wc, ewc, cycle);
        }
    }

    return last;
}
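/* Illustrative usage sketch (callers live elsewhere in the code base):
 * a code region is bracketed by matching calls on the same counter, e.g.
 *
 *     wallcycle_start(wcycle, ewcFORCE);
 *     // ... compute forces ...
 *     wallcycle_stop(wcycle, ewcFORCE);
 *
 * Counters may be nested; each stop must match the most recently started
 * counter (this is only checked when DEBUG_WCYCLE is defined).
 */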
void wallcycle_get(gmx_wallcycle_t wc, int ewc, int* n, double* c)
{
    *n = wc->wcc[ewc].n;
    *c = static_cast<double>(wc->wcc[ewc].c);
}

void wallcycle_reset_all(gmx_wallcycle_t wc)
{
    int i;

    if (wc == nullptr)
    {
        return;
    }

    for (i = 0; i < ewcNR; i++)
    {
        wc->wcc[i].n = 0;
        wc->wcc[i].c = 0;
    }
    wc->haveInvalidCount = FALSE;

    if (wc->wcc_all)
    {
        for (i = 0; i < ewcNR * ewcNR; i++)
        {
            wc->wcc_all[i].n = 0;
            wc->wcc_all[i].c = 0;
        }
    }
    if (wc->wcsc)
    {
        for (i = 0; i < ewcsNR; i++)
        {
            wc->wcsc[i].n = 0;
            wc->wcsc[i].c = 0;
        }
    }
}
static gmx_bool is_pme_counter(int ewc)
{
    return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
}

static gmx_bool is_pme_subcounter(int ewc)
{
    return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
}
/* Subtract counter ewc_sub timed inside a timing block for ewc_main */
static void subtract_cycles(wallcc_t* wcc, int ewc_main, int ewc_sub)
{
    if (wcc[ewc_sub].n > 0)
    {
        if (wcc[ewc_main].c >= wcc[ewc_sub].c)
        {
            wcc[ewc_main].c -= wcc[ewc_sub].c;
        }
        else
        {
            /* Something is wrong with the cycle counting */
            wcc[ewc_main].c = 0;
        }
    }
}
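/* For example (as done in wallcycle_sum below), the cycles measured for
 * ewcPME_FFTCOMM are subtracted from ewcPME_FFT, so the "PME 3D-FFT" row
 * in the report excludes time already attributed to "PME 3D-FFT Comm.".
 * If bookkeeping ever makes a sub-counter larger than its parent, the
 * parent is zeroed rather than reported as negative.
 */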
void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
{
    if (wc == nullptr)
    {
        return;
    }

    for (int i = 0; i < ewcNR; i++)
    {
        if (is_pme_counter(i) || (i == ewcRUN && isPmeRank))
        {
            wc->wcc[i].c *= nthreads_pme;

            if (wc->wcc_all)
            {
                for (int j = 0; j < ewcNR; j++)
                {
                    wc->wcc_all[i * ewcNR + j].c *= nthreads_pme;
                }
            }
        }
        else
        {
            wc->wcc[i].c *= nthreads_pp;

            if (wc->wcc_all)
            {
                for (int j = 0; j < ewcNR; j++)
                {
                    wc->wcc_all[i * ewcNR + j].c *= nthreads_pp;
                }
            }
        }
    }
    if (useCycleSubcounters && wc->wcsc && !isPmeRank)
    {
        for (int i = 0; i < ewcsNR; i++)
        {
            wc->wcsc[i].c *= nthreads_pp;
        }
    }
}
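/* Worked example (illustrative interpretation): the counters are read on a
 * single timing thread per rank, so a PP counter measured on a rank running
 * 8 OpenMP threads is multiplied by 8 here. The "Giga-Cycles total sum"
 * column printed later then approximates the cycles spent by all cores,
 * not only the threads that did the timing.
 */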
/* TODO Make an object for this function to return, containing some
 * vectors of something like wallcc_t for the summed wcc, wcc_all and
 * wcsc, AND the original wcc for rank 0.
 *
 * The GPU timing is reported only for rank 0, so we want to preserve
 * the original wcycle on that rank. Rank 0 also reports the global
 * counts before that, so needs something to contain the global data
 * without over-writing the rank-0 data. The current implementation
 * uses cycles_sum to manage this, which works OK now because wcsc and
 * wcc_all are unused by the GPU reporting, but it is not satisfactory
 * for the future. Also, there's no need for MPI_Allreduce, since
 * only MASTERRANK uses any of the results. */
WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle_t wc)
{
    WallcycleCounts cycles_sum;
    wallcc_t*       wcc;
    double          cycles[int(ewcNR) + int(ewcsNR)];
#if GMX_MPI
    double cycles_n[int(ewcNR) + int(ewcsNR) + 1];
#endif
    int i;
    int nsum;

    if (wc == nullptr)
    {
        /* Default construction of std::array of non-class T can leave
           the values indeterminate, just like a C array, and icc
           warns about it. */
        cycles_sum.fill(0);
        return cycles_sum;
    }

    wcc = wc->wcc;

    subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
    subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);

    subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);

    if (cr->npmenodes == 0)
    {
        /* All nodes do PME (or no PME at all) */
        subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
    }
    else
    {
        /* There are PME-only nodes */
        if (wcc[ewcPMEMESH].n > 0)
        {
            /* This must be a PME-only node, calculate the Wait + Comm. time */
            GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c,
                       "Total run ticks must be greater than PME-only ticks");
            wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
        }
    }

    /* Store the cycles in a double buffer for summing */
    for (i = 0; i < ewcNR; i++)
    {
#if GMX_MPI
        cycles_n[i] = static_cast<double>(wcc[i].n);
#endif
        cycles[i] = static_cast<double>(wcc[i].c);
    }
    nsum = ewcNR;
    if (wc->wcsc)
    {
        for (i = 0; i < ewcsNR; i++)
        {
#if GMX_MPI
            cycles_n[ewcNR + i] = static_cast<double>(wc->wcsc[i].n);
#endif
            cycles[ewcNR + i] = static_cast<double>(wc->wcsc[i].c);
        }
        nsum += ewcsNR;
    }

#if GMX_MPI
    if (cr->nnodes > 1)
    {
        double buf[int(ewcNR) + int(ewcsNR) + 1];

        // TODO this code is used only at the end of the run, so we
        // can just do a simple reduce of haveInvalidCount in
        // wallcycle_print, and avoid bugs
        cycles_n[nsum] = (wc->haveInvalidCount ? 1 : 0);
        // TODO Use MPI_Reduce
        MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
        for (i = 0; i < ewcNR; i++)
        {
            wcc[i].n = gmx::roundToInt(buf[i]);
        }
        wc->haveInvalidCount = (buf[nsum] > 0);
        if (wc->wcsc)
        {
            for (i = 0; i < ewcsNR; i++)
            {
                wc->wcsc[i].n = gmx::roundToInt(buf[ewcNR + i]);
            }
        }

        // TODO Use MPI_Reduce
        MPI_Allreduce(cycles, cycles_sum.data(), nsum, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);

        if (wc->wcc_all != nullptr)
        {
            double *buf_all, *cyc_all;

            snew(cyc_all, ewcNR * ewcNR);
            snew(buf_all, ewcNR * ewcNR);
            for (i = 0; i < ewcNR * ewcNR; i++)
            {
                cyc_all[i] = wc->wcc_all[i].c;
            }
            // TODO Use MPI_Reduce
            MPI_Allreduce(cyc_all, buf_all, ewcNR * ewcNR, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
            for (i = 0; i < ewcNR * ewcNR; i++)
            {
                wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
            }
            sfree(buf_all);
            sfree(cyc_all);
        }
    }
    else
#endif
    {
        for (i = 0; i < nsum; i++)
        {
            cycles_sum[i] = cycles[i];
        }
    }

    return cycles_sum;
}
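/* Layout of the returned WallcycleCounts (a sketch of what callers rely on):
 * entries [0, ewcNR) hold the summed main counters and, when subcounters
 * were allocated, entries [ewcNR, ewcNR + ewcsNR) hold the summed
 * subcounters, mirroring how cycles[] is filled above. wallcycle_print()
 * indexes cyc_sum[ewcNR + i] for the PP breakdown accordingly.
 */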
static void
print_cycles(FILE* fplog, double c2t, const char* name, int nnodes, int nthreads, int ncalls, double c_sum, double tot)
{
    char   nnodes_str[STRLEN];
    char   nthreads_str[STRLEN];
    char   ncalls_str[STRLEN];
    double wallt;
    double percentage = (tot > 0.) ? (100. * c_sum / tot) : 0.;

    if (c_sum > 0)
    {
        if (ncalls > 0)
        {
            snprintf(ncalls_str, sizeof(ncalls_str), "%10d", ncalls);
            if (nnodes < 0)
            {
                snprintf(nnodes_str, sizeof(nnodes_str), "N/A");
            }
            else
            {
                snprintf(nnodes_str, sizeof(nnodes_str), "%4d", nnodes);
            }
            if (nthreads < 0)
            {
                snprintf(nthreads_str, sizeof(nthreads_str), "N/A");
            }
            else
            {
                snprintf(nthreads_str, sizeof(nthreads_str), "%4d", nthreads);
            }
        }
        else
        {
            nnodes_str[0]   = 0;
            nthreads_str[0] = 0;
            ncalls_str[0]   = 0;
        }
        /* Convert the cycle count to wallclock time for this task */
        wallt = c_sum * c2t;

        fprintf(fplog, " %-19.19s %4s %4s %10s %10.3f %14.3f %5.1f\n", name, nnodes_str,
                nthreads_str, ncalls_str, wallt, c_sum * 1e-9, percentage);
    }
}
static void print_gputimes(FILE* fplog, const char* name, int n, double t, double tot_t)
{
    char num[11];
    char avg_perf[11];

    if (n > 0)
    {
        snprintf(num, sizeof(num), "%10d", n);
        snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t / n);
    }
    else
    {
        sprintf(num, "          ");
        sprintf(avg_perf, "          ");
    }
    if (t != tot_t && tot_t > 0)
    {
        fprintf(fplog, " %-29s %10s%12.3f %s %5.1f\n", name, num, t / 1000, avg_perf, 100 * t / tot_t);
    }
    else
    {
        fprintf(fplog, " %-29s %10s%12.3f %s %5.1f\n", name, "", t / 1000, avg_perf, 100.0);
    }
}
static void print_header(FILE* fplog, int nrank_pp, int nth_pp, int nrank_pme, int nth_pme)
{
    int nrank_tot = nrank_pp + nrank_pme;
    if (0 == nrank_pme)
    {
        fprintf(fplog, "On %d MPI rank%s", nrank_tot, nrank_tot == 1 ? "" : "s");
        if (nth_pp > 1)
        {
            fprintf(fplog, ", each using %d OpenMP threads", nth_pp);
        }
        /* Don't report doing PP+PME, because we can't tell here if
         * this is RF, etc. */
    }
    else
    {
        fprintf(fplog, "On %d MPI rank%s doing PP", nrank_pp, nrank_pp == 1 ? "" : "s");
        if (nth_pp > 1)
        {
            fprintf(fplog, ",%s using %d OpenMP threads", nrank_pp > 1 ? " each" : "", nth_pp);
        }
        fprintf(fplog, ", and\non %d MPI rank%s doing PME", nrank_pme, nrank_pme == 1 ? "" : "s");
        if (nth_pme > 1)
        {
            fprintf(fplog, ",%s using %d OpenMP threads", nrank_pme > 1 ? " each" : "", nth_pme);
        }
    }

    fprintf(fplog, "\n\n");
    fprintf(fplog, " Computing:          Num   Num      Call    Wall time         Giga-Cycles\n");
    fprintf(fplog, "                     Ranks Threads  Count      (s)         total sum    %%\n");
}
void wallcycle_print(FILE*                            fplog,
                     const gmx::MDLogger&             mdlog,
                     int                              nnodes,
                     int                              npme,
                     int                              nth_pp,
                     int                              nth_pme,
                     double                           realtime,
                     gmx_wallcycle_t                  wc,
                     const WallcycleCounts&           cyc_sum,
                     const gmx_wallclock_gpu_nbnxn_t* gpu_nbnxn_t,
                     const gmx_wallclock_gpu_pme_t*   gpu_pme_t)
{
    double      tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio;
    double      c2t, c2t_pp, c2t_pme = 0;
    int         i, j, npp, nth_tot;
    char        buf[STRLEN];
    const char* hline =
            "-----------------------------------------------------------------------------";

    if (wc == nullptr)
    {
        return;
    }

    GMX_ASSERT(nth_pp > 0, "Number of particle-particle threads must be >0");
    GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0");
    GMX_ASSERT(nnodes > 0, "Number of nodes must be >0");
    GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative");
    npp = nnodes - npme;
    /* npme is the number of PME-only ranks used, and we always do PP work */
    GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0");

    nth_tot = npp * nth_pp + npme * nth_pme;

    /* When using PME-only nodes, the next line is valid for both
       PP-only and PME-only nodes because they started ewcRUN at the
       same time. */
    tot        = cyc_sum[ewcRUN];
    tot_for_pp = 0;

    if (tot <= 0.0)
    {
        /* TODO This is heavy handed, but until someone reworks the
           code so that it is provably robust with respect to
           non-positive values for all possible timer and cycle
           counters, there is less value gained from printing whatever
           timing data might still be sensible for some non-Jenkins
           run, than is lost from diagnosing Jenkins FP exceptions on
           runs about whose execution time we don't care. */
        GMX_LOG(mdlog.warning)
                .asParagraph()
                .appendTextFormatted(
                        "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a "
                        "time accounting",
                        tot);
        return;
    }

    if (wc->haveInvalidCount)
    {
        GMX_LOG(mdlog.warning)
                .asParagraph()
                .appendText(
                        "NOTE: Detected invalid cycle counts, probably because threads moved "
                        "between CPU cores that do not have synchronized cycle counters. Will not "
                        "print the cycle accounting.");
        return;
    }

    /* Conversion factor from cycles to seconds */
    c2t    = realtime / tot;
    c2t_pp = c2t * nth_tot / static_cast<double>(npp * nth_pp);
    if (npme > 0)
    {
        c2t_pme = c2t * nth_tot / static_cast<double>(npme * nth_pme);
    }
    else
    {
        c2t_pme = 0;
    }

    fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");

    print_header(fplog, npp, nth_pp, npme, nth_pme);

    fprintf(fplog, "%s\n", hline);
    for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
    {
        if (is_pme_subcounter(i))
        {
            /* Do not count these at all */
        }
        else if (npme > 0 && is_pme_counter(i))
        {
            /* Print timing information for PME-only nodes, but add an
             * asterisk so the reader of the table can know that the
             * walltimes are not meant to add up. The asterisk still
             * fits in the required maximum of 19 characters. */
            char buffer[STRLEN];
            snprintf(buffer, STRLEN, "%s *", wcn[i]);
            print_cycles(fplog, c2t_pme, buffer, npme, nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
        }
        else
        {
            /* Print timing information when it is for a PP or PP+PME
               node */
            print_cycles(fplog, c2t_pp, wcn[i], npp, nth_pp, wc->wcc[i].n, cyc_sum[i], tot);
            tot_for_pp += cyc_sum[i];
        }
    }
    if (wc->wcc_all != nullptr)
    {
        for (i = 0; i < ewcNR; i++)
        {
            for (j = 0; j < ewcNR; j++)
            {
                snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
                print_cycles(fplog, c2t_pp, buf, npp, nth_pp, wc->wcc_all[i * ewcNR + j].n,
                             wc->wcc_all[i * ewcNR + j].c, tot);
            }
        }
    }
    tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot);
    print_cycles(fplog, c2t_pp, "Rest", npp, nth_pp, -1, tot_for_rest - tot_for_pp, tot);
    fprintf(fplog, "%s\n", hline);
    print_cycles(fplog, c2t, "Total", npp, nth_pp, -1, tot, tot);
    fprintf(fplog, "%s\n", hline);

    if (npme > 0)
    {
        fprintf(fplog,
                "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
                "    twice the total reported, but the cycle count total and %% are correct.\n"
                "%s\n",
                hline);
    }

    if (wc->wcc[ewcPMEMESH].n > 0)
    {
        // A workaround to not print breakdown when no subcounters were recorded.
        // TODO: figure out and record PME GPU counters (what to do with the waiting ones?)
        std::vector<int> validPmeSubcounterIndices;
        for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
        {
            if (is_pme_subcounter(i) && wc->wcc[i].n > 0)
            {
                validPmeSubcounterIndices.push_back(i);
            }
        }

        if (!validPmeSubcounterIndices.empty())
        {
            fprintf(fplog, " Breakdown of PME mesh computation\n");
            fprintf(fplog, "%s\n", hline);
            for (auto i : validPmeSubcounterIndices)
            {
                print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i], npme > 0 ? npme : npp,
                             nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
            }
            fprintf(fplog, "%s\n", hline);
        }
    }

    if (useCycleSubcounters && wc->wcsc)
    {
        fprintf(fplog, " Breakdown of PP computation\n");
        fprintf(fplog, "%s\n", hline);
        for (i = 0; i < ewcsNR; i++)
        {
            print_cycles(fplog, c2t_pp, wcsn[i], npp, nth_pp, wc->wcsc[i].n, cyc_sum[ewcNR + i], tot);
        }
        fprintf(fplog, "%s\n", hline);
    }

    /* print GPU timing summary */
    double tot_gpu = 0.0;
    if (gpu_pme_t)
    {
        for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
        {
            tot_gpu += gpu_pme_t->timing[k].t;
        }
    }
    if (gpu_nbnxn_t)
    {
        const char* k_log_str[2][2] = { { "Nonbonded F kernel", "Nonbonded F+ene k." },
                                        { "Nonbonded F+prune k.", "Nonbonded F+ene+prune k." } };
        tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t;

        /* add up the kernel timings */
        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                tot_gpu += gpu_nbnxn_t->ktime[i][j].t;
            }
        }
        tot_gpu += gpu_nbnxn_t->pruneTime.t;

        tot_cpu_overlap = wc->wcc[ewcFORCE].c;
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
        }
        tot_cpu_overlap *= realtime * 1000 / tot; /* convert s to ms */

        fprintf(fplog, "\n GPU timings\n%s\n", hline);
        fprintf(fplog,
                " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Pair list H2D", gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
        print_gputimes(fplog, "X / q H2D", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                if (gpu_nbnxn_t->ktime[i][j].c)
                {
                    print_gputimes(fplog, k_log_str[i][j], gpu_nbnxn_t->ktime[i][j].c,
                                   gpu_nbnxn_t->ktime[i][j].t, tot_gpu);
                }
            }
        }
        if (gpu_pme_t)
        {
            for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
            {
                if (gpu_pme_t->timing[k].c)
                {
                    print_gputimes(fplog, PMEStageNames[k], gpu_pme_t->timing[k].c,
                                   gpu_pme_t->timing[k].t, tot_gpu);
                }
            }
        }
        if (gpu_nbnxn_t->pruneTime.c)
        {
            print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c,
                           gpu_nbnxn_t->pruneTime.t, tot_gpu);
        }
        print_gputimes(fplog, "F D2H", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu);
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Total ", gpu_nbnxn_t->nb_c, tot_gpu, tot_gpu);
        fprintf(fplog, "%s\n", hline);
        if (gpu_nbnxn_t->dynamicPruneTime.c)
        {
            /* We print the dynamic pruning kernel timings after a separator
             * and avoid adding it to tot_gpu as this is not in the force
             * overlap. We print the fraction as relative to the rest.
             */
            print_gputimes(fplog, "*Dynamic pruning", gpu_nbnxn_t->dynamicPruneTime.c,
                           gpu_nbnxn_t->dynamicPruneTime.t, tot_gpu);
            fprintf(fplog, "%s\n", hline);
        }
        gpu_cpu_ratio = tot_gpu / tot_cpu_overlap;
        if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
        {
            fprintf(fplog,
                    "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = "
                    "%.3f\n",
                    tot_gpu / gpu_nbnxn_t->nb_c, tot_cpu_overlap / wc->wcc[ewcFORCE].n, gpu_cpu_ratio);
        }

        /* only print notes related to CPU-GPU load balance with PME */
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            fprintf(fplog, "For optimal resource utilization this ratio should be close to 1\n");

            /* print note if the imbalance is high with PME case in which
             * CPU-GPU load balancing is possible */
            if (gpu_cpu_ratio < 0.8 || gpu_cpu_ratio > 1.25)
            {
                /* Only the sim master calls this function, so always print to stderr */
                if (gpu_cpu_ratio < 0.8)
                {
                    if (npp > 1)
                    {
                        /* The user could have used -notunepme,
                         * but we currently can't check that here.
                         */
                        GMX_LOG(mdlog.warning)
                                .asParagraph()
                                .appendText(
                                        "NOTE: The CPU has >25% more load than the GPU. This "
                                        "imbalance wastes\n"
                                        "      GPU resources. Maybe the domain decomposition "
                                        "limits the PME tuning.\n"
                                        "      In that case, try setting the DD grid manually "
                                        "(-dd) or lowering -dds.");
                    }
                    else
                    {
                        /* We should not end up here, unless the box is
                         * too small for increasing the cut-off for PME tuning.
                         */
                        GMX_LOG(mdlog.warning)
                                .asParagraph()
                                .appendText(
                                        "NOTE: The CPU has >25% more load than the GPU. This "
                                        "imbalance wastes\n"
                                        "      GPU resources.");
                    }
                }
                if (gpu_cpu_ratio > 1.25)
                {
                    GMX_LOG(mdlog.warning)
                            .asParagraph()
                            .appendText(
                                    "NOTE: The GPU has >25% more load than the CPU. This imbalance "
                                    "wastes\n"
                                    "      CPU resources.");
                }
            }
        }
    }

    if (wc->wc_barrier)
    {
        GMX_LOG(mdlog.warning)
                .asParagraph()
                .appendText(
                        "MPI_Barrier was called before each cycle start/stop\n"
                        "call, so timings are not those of real runs.");
    }

    if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 && (cyc_sum[ewcDOMDEC] > tot * 0.1 || cyc_sum[ewcNS] > tot * 0.1))
    {
        /* Only the sim master calls this function, so always print to stderr */
        if (wc->wcc[ewcDOMDEC].n == 0)
        {
            GMX_LOG(mdlog.warning)
                    .asParagraph()
                    .appendTextFormatted(
                            "NOTE: %d %% of the run time was spent in pair search,\n"
                            "      you might want to increase nstlist (this has no effect on "
                            "accuracy)\n",
                            gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
        }
        else
        {
            GMX_LOG(mdlog.warning)
                    .asParagraph()
                    .appendTextFormatted(
                            "NOTE: %d %% of the run time was spent in domain decomposition,\n"
                            "      %d %% of the run time was spent in pair search,\n"
                            "      you might want to increase nstlist (this has no effect on "
                            "accuracy)\n",
                            gmx::roundToInt(100 * cyc_sum[ewcDOMDEC] / tot),
                            gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
        }
    }

    if (cyc_sum[ewcMoveE] > tot * 0.05)
    {
        GMX_LOG(mdlog.warning)
                .asParagraph()
                .appendTextFormatted(
                        "NOTE: %d %% of the run time was spent communicating energies,\n"
                        "      you might want to increase some nst* mdp options\n",
                        gmx::roundToInt(100 * cyc_sum[ewcMoveE] / tot));
    }
}
extern int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
{
    if (wc == nullptr)
    {
        return -1;
    }

    return wc->reset_counters;
}

extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, int64_t reset_counters)
{
    if (wc == nullptr)
    {
        return;
    }

    wc->reset_counters = reset_counters;
}
void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
{
    if (useCycleSubcounters && wc != nullptr)
    {
        wc->wcsc[ewcs].start = gmx_cycles_read();
    }
}

void wallcycle_sub_start_nocount(gmx_wallcycle_t wc, int ewcs)
{
    if (useCycleSubcounters && wc != nullptr)
    {
        wallcycle_sub_start(wc, ewcs);
        wc->wcsc[ewcs].n--;
    }
}

void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
{
    if (useCycleSubcounters && wc != nullptr)
    {
        wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start;
        wc->wcsc[ewcs].n++;
    }
}
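/* Illustrative usage sketch for the sub-counters (only active when GROMACS
 * is built with GMX_CYCLE_SUBCOUNTERS): callers bracket a fine-grained
 * region inside an already-running main counter with matching calls, e.g.
 * around the region reported as "Nonbonded F kernel":
 *
 *     wallcycle_sub_start(wcycle, ewcsSubcounterIndex); // placeholder for the
 *                                                       // real ewcs enum value
 *     // ... run the kernel ...
 *     wallcycle_sub_stop(wcycle, ewcsSubcounterIndex);
 *
 * When sub-counters are disabled, useCycleSubcounters is false and these
 * calls effectively do nothing.
 */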