From 45878d4363cfd50d218ff53c8aa3502b9d073016 Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Thu, 13 Jun 2013 11:56:25 +1000 Subject: [PATCH] eventscripts: New configuration variable $CTDB_NFS_DUMP_STUCK_THREADS If some nfsd threads are still alive after a shutdown during a restart then this indicates the maximum number of threads for which a stack trace should be dumped. This can be useful for trying to determine why nfsd is stuck. Signed-off-by: Martin Schwenke (This used to be ctdb commit 2503245db10d567af708a04edd3a3b488c24f401) --- ctdb/config/ctdb.sysconfig | 6 ++++++ ctdb/config/functions | 24 ++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/ctdb/config/ctdb.sysconfig b/ctdb/config/ctdb.sysconfig index 25ad2aef95d..7e775a2a797 100644 --- a/ctdb/config/ctdb.sysconfig +++ b/ctdb/config/ctdb.sysconfig @@ -142,6 +142,12 @@ CTDB_RECOVERY_LOCK="/some/place/on/shared/storage" # to not do this check. # CTDB_MONITOR_NFS_THREAD_COUNT="yes" + +# The number of nfsd threads to dump stack traces for if some are +# still alive after stopping NFS during a restart. The default is to +# dump no stack traces. +# CTDB_NFS_DUMP_STUCK_THREADS=5 + # When set to yes, the CTDB node will start in DISABLED mode and not host # any public ip addresses. The administrator needs to explicitely enable # the node with "ctdb enable" diff --git a/ctdb/config/functions b/ctdb/config/functions index f4707a799d7..0a806cbe280 100755 --- a/ctdb/config/functions +++ b/ctdb/config/functions @@ -779,6 +779,7 @@ startstop_nfs() { set_proc "fs/nfsd/threads" 0 service nfsserver stop > /dev/null 2>&1 pkill -9 nfsd + nfs_dump_some_threads service nfsserver start ;; esac @@ -798,6 +799,7 @@ startstop_nfs() { service nfs stop > /dev/null 2>&1 service nfslock stop > /dev/null 2>&1 pkill -9 nfsd + nfs_dump_some_threads service nfslock start service nfs start ;; @@ -810,6 +812,28 @@ startstop_nfs() { esac } +# Dump up to the configured number of nfsd thread backtraces.
# Print stack traces for nfsd threads that are still present, limited
# to at most $CTDB_NFS_DUMP_STUCK_THREADS traces.
#
# Configuration:
#   CTDB_NFS_DUMP_STUCK_THREADS
#       Maximum number of traces to print.  If it is unset, empty,
#       not an integer, or not greater than 0 then nothing is dumped.
#
# Output (stdout), for each thread that yields a non-empty stack:
#   Stack trace for stuck nfsd thread [<pid>]:
#   <output of get_proc "<pid>/stack">
#
# Uses the global scratch variables _count, _pid and _stack.
# NOTE(review): get_proc is defined outside this block; presumably it
# reads the named file under /proc - confirm against its definition.
nfs_dump_some_threads ()
{
    # Feature is disabled unless a limit has been configured
    [ -n "$CTDB_NFS_DUMP_STUCK_THREADS" ] || return 0

    # Optimisation to avoid running an unnecessary pidof.  The value is
    # quoted so that a setting containing whitespace is handled as a
    # single (invalid) operand; a failing test simply dumps nothing.
    [ "$CTDB_NFS_DUMP_STUCK_THREADS" -gt 0 ] || return 0

    _count=0
    for _pid in $(pidof nfsd) ; do
        # _count is the number of traces already printed, so the
        # comparison must be strict: "-le" would print one trace more
        # than the configured limit.
        [ "$_count" -lt "$CTDB_NFS_DUMP_STUCK_THREADS" ] || break

        # Do this first to avoid racing with thread exit: the header
        # is only printed once a non-empty stack has been obtained.
        _stack=$(get_proc "${_pid}/stack" 2>/dev/null)
        if [ -n "$_stack" ] ; then
            echo "Stack trace for stuck nfsd thread [${_pid}]:"
            echo "$_stack"
            # Only threads that were actually dumped count towards
            # the limit
            _count=$((_count + 1))
        fi
    done
}

########################################################
# start/stop the nfs lockmanager service on different platforms
########################################################