From 45878d4363cfd50d218ff53c8aa3502b9d073016 Mon Sep 17 00:00:00 2001
From: Martin Schwenke <martin@meltin.net>
Date: Thu, 13 Jun 2013 11:56:25 +1000
Subject: [PATCH] eventscripts: New configuration varable
 $CTDB_NFS_DUMP_STUCK_THREADS

If some nfsd threads are still alive after a shutdown during a restart
then this indicates the maximum number of threads for which a stack
trace should be dumped.  This can be useful for trying to determine
why nfsd is stuck.

Signed-off-by: Martin Schwenke <martin@meltin.net>

(This used to be ctdb commit 2503245db10d567af708a04edd3a3b488c24f401)
---
 ctdb/config/ctdb.sysconfig |  6 ++++++
 ctdb/config/functions      | 24 ++++++++++++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/ctdb/config/ctdb.sysconfig b/ctdb/config/ctdb.sysconfig
index 25ad2aef95d..7e775a2a797 100644
--- a/ctdb/config/ctdb.sysconfig
+++ b/ctdb/config/ctdb.sysconfig
@@ -142,6 +142,12 @@ CTDB_RECOVERY_LOCK="/some/place/on/shared/storage"
 # to not do this check.
 # CTDB_MONITOR_NFS_THREAD_COUNT="yes"
 
+
+# The number of nfsd threads to dump stack traces for if some are
+# still alive after stopping NFS during a restart.  The default is to
+# dump no stack traces.
+# CTDB_NFS_DUMP_STUCK_THREADS=5
+
 # When set to yes, the CTDB node will start in DISABLED mode and not host
 # any public ip addresses. The administrator needs to explicitely enable
 # the node with "ctdb enable"
diff --git a/ctdb/config/functions b/ctdb/config/functions
index f4707a799d7..0a806cbe280 100755
--- a/ctdb/config/functions
+++ b/ctdb/config/functions
@@ -779,6 +779,7 @@ startstop_nfs() {
 			set_proc "fs/nfsd/threads" 0
 			service nfsserver stop > /dev/null 2>&1
 			pkill -9 nfsd
+			nfs_dump_some_threads
 			service nfsserver start
 			;;
 		esac
@@ -798,6 +799,7 @@ startstop_nfs() {
 			service nfs stop > /dev/null 2>&1
 			service nfslock stop > /dev/null 2>&1
 			pkill -9 nfsd
+			nfs_dump_some_threads
 			service nfslock start
 			service nfs start
 			;;
@@ -810,6 +812,28 @@ startstop_nfs() {
 	esac
 }
 
+# Dump up to the configured number of nfsd thread backtraces.
+nfs_dump_some_threads ()
+{
+    [ -n "$CTDB_NFS_DUMP_STUCK_THREADS" ] || return 0
+
+    # Optimisation to avoid running an unnecessary pidof
+    [ $CTDB_NFS_DUMP_STUCK_THREADS -gt 0 ] || return 0
+
+    _count=0
+    for _pid in $(pidof nfsd) ; do
+	[ $_count -le $CTDB_NFS_DUMP_STUCK_THREADS ] || break
+
+	# Do this first to avoid racing with thread exit
+	_stack=$(get_proc "${_pid}/stack" 2>/dev/null)
+	if [ -n "$_stack" ] ; then
+	    echo "Stack trace for stuck nfsd thread [${_pid}]:"
+	    echo "$_stack"
+	    _count=$(($_count + 1))
+	fi
+    done
+}
+
 ########################################################
 # start/stop the nfs lockmanager service on different platforms
 ########################################################
-- 
2.11.4.GIT