From 602d904648a7d7274f7eb840820535a420988b95 Mon Sep 17 00:00:00 2001 From: Mark Abraham Date: Mon, 25 Nov 2019 13:10:50 +0100 Subject: [PATCH] Improve handling of task-assignment exceptions Resolves several current TODOs to improve the reporting of error messages derived from exceptions. This improves the robustness of the regressiontests logic that depends on parsing the error message to understand whether and how to retry running mdrun. Change-Id: I8c9eac95a2ef4861e90350e77fb9beafb08e9d2e --- src/gromacs/taskassignment/taskassignment.cpp | 69 ++++++++++++++++++++------- 1 file changed, 53 insertions(+), 16 deletions(-) diff --git a/src/gromacs/taskassignment/taskassignment.cpp b/src/gromacs/taskassignment/taskassignment.cpp index e88a78b240..6a85949ff4 100644 --- a/src/gromacs/taskassignment/taskassignment.cpp +++ b/src/gromacs/taskassignment/taskassignment.cpp @@ -52,6 +52,7 @@ #include "taskassignment.h" #include <algorithm> +#include <exception> #include <string> #include <vector> @@ -189,6 +190,31 @@ size_t countGpuTasksOnThisNode(const GpuTasksOnRanks& gpuTasksOnRanksOfThisNode) return numGpuTasksOnThisNode; } +/*! \brief Return on each rank the total count over all ranks of all + * simulations. */ +int countOverAllRanks(const t_commrec* cr, const gmx_multisim_t* ms, const int countOnThisRank) +{ + int countOverAllRanksValue = countOnThisRank; + if (PAR(cr)) + { + // Count over the ranks of this simulation. + gmx_sumi(1, &countOverAllRanksValue, cr); + } + if (isMultiSim(ms)) + { + // Count over the ranks of all simulations. + gmx_sumi_sim(1, &countOverAllRanksValue, ms); + if (PAR(cr)) + { + // Propagate the information from other simulations back + // to non-master ranks so they can all agree on future + // behavior.
+ gmx_bcast(sizeof(decltype(countOverAllRanksValue)), &countOverAllRanksValue, cr); + } + } + return countOverAllRanksValue; +} + } // namespace GpuTaskAssignmentsBuilder::GpuTaskAssignmentsBuilder() = default; @@ -217,6 +243,7 @@ GpuTaskAssignments GpuTaskAssignmentsBuilder::build(const std::vector<int>& gpuI auto gpuTasksOnRanksOfThisNode = findAllGpuTasksOnThisNode(gpuTasksOnThisRank, physicalNodeComm); size_t numGpuTasksOnThisNode = countGpuTasksOnThisNode(gpuTasksOnRanksOfThisNode); + std::exception_ptr exceptionPtr; std::vector<GpuTaskAssignment> taskAssignmentOnRanksOfThisNode; try { @@ -296,27 +323,37 @@ GpuTaskAssignments GpuTaskAssignmentsBuilder::build(const std::vector<int>& gpuI taskAssignmentOnRanksOfThisNode = buildTaskAssignment(gpuTasksOnRanksOfThisNode, gpuIdsForTaskAssignment); } - catch (const std::exception& ex) + catch (...) + { + exceptionPtr = std::current_exception(); + } + int countOfExceptionsOnThisRank = int(bool(exceptionPtr)); + int countOfExceptionsOverAllRanks = countOverAllRanks(cr, ms, countOfExceptionsOnThisRank); + + // Avoid all ranks spamming the error stream + // + // TODO improve this so that unique errors on different ranks + // are all reported. + if (countOfExceptionsOnThisRank > 0 && physicalNodeComm.rank_ == 0) { - // TODO This implementation is quite similar to that of - // processExceptionAsFatalError (which implements - // GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR), but it is unclear - // how we should involve MPI in the implementation of error - // handling. - if (physicalNodeComm.rank_ == 0) + try + { + if (exceptionPtr) + { + std::rethrow_exception(exceptionPtr); + } + } + catch (const std::exception& ex) { printFatalErrorMessage(stderr, ex); } - - gmx_exit_on_fatal_error(ExitType_Abort, 1); } - // TODO This implements a global barrier so that MPI runtimes can - // organize an orderly shutdown if one of the ranks has had to - // issue a fatal error after an exception detected only on one - // rank.
When we have MPI-aware error handling and reporting, this - // should be improved. - multiSimBarrier(ms); - simulationBarrier(cr); + if (countOfExceptionsOverAllRanks > 0) + { + gmx_fatal(FARGS, + "Exiting because task assignment failed. If there is no descriptive " + "error message above this, please report this failure as a bug."); + } // TODO There is no check that mdrun -nb gpu or -pme gpu or // -gpu_id is actually being implemented such that nonbonded tasks -- 2.11.4.GIT