From 0f28ccf87af4e90867eaab213a640f6d0cdaa12d Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 14 Aug 2015 17:08:45 +1000 Subject: [PATCH] ctdb-scripts: Add default system memory usage warnings CTDB should warn by default if too much system memory or swap is used. The tests have also been tweaked. In particular, the filesystem-only tests need to initialise the memory information to avoid errors where meminfo isn't set. Document the defaults, warning against disabling them. Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs --- ctdb/config/events.d/05.system | 9 ++++++--- ctdb/doc/ctdbd.conf.5.xml | 18 ++++++++++++++---- ctdb/tests/eventscripts/05.system.monitor.001.sh | 2 ++ ctdb/tests/eventscripts/05.system.monitor.002.sh | 2 ++ ctdb/tests/eventscripts/05.system.monitor.003.sh | 2 ++ ctdb/tests/eventscripts/05.system.monitor.004.sh | 2 ++ ctdb/tests/eventscripts/05.system.monitor.005.sh | 2 ++ ctdb/tests/eventscripts/05.system.monitor.006.sh | 2 ++ ctdb/tests/eventscripts/05.system.monitor.007.sh | 2 ++ ctdb/tests/eventscripts/05.system.monitor.011.sh | 8 +++++--- ctdb/tests/eventscripts/05.system.monitor.012.sh | 2 +- ctdb/tests/eventscripts/05.system.monitor.013.sh | 3 ++- ctdb/tests/eventscripts/05.system.monitor.014.sh | 2 +- ctdb/tests/eventscripts/05.system.monitor.015.sh | 2 +- ctdb/tests/eventscripts/05.system.monitor.017.sh | 2 +- 15 files changed, 45 insertions(+), 15 deletions(-) diff --git a/ctdb/config/events.d/05.system b/ctdb/config/events.d/05.system index 6aafd283942..c305c4b1548 100755 --- a/ctdb/config/events.d/05.system +++ b/ctdb/config/events.d/05.system @@ -103,9 +103,12 @@ dump_memory_info () monitor_memory_usage () { - if [ -z "$CTDB_MONITOR_MEMORY_USAGE" -a \ - -z "$CTDB_MONITOR_SWAP_USAGE" ] ; then - return + # Defaults + if [ -z "$CTDB_MONITOR_MEMORY_USAGE" ] ; then + CTDB_MONITOR_MEMORY_USAGE=80 + fi + if [ -z "$CTDB_MONITOR_SWAP_USAGE" ] ; then + CTDB_MONITOR_SWAP_USAGE=25 fi _meminfo=$(get_proc "meminfo") diff --git 
a/ctdb/doc/ctdbd.conf.5.xml b/ctdb/doc/ctdbd.conf.5.xml index b568fc0d3d1..e04c89aafab 100644 --- a/ctdb/doc/ctdbd.conf.5.xml +++ b/ctdb/doc/ctdbd.conf.5.xml @@ -1280,8 +1280,16 @@ CTDB_PER_IP_ROUTING_TABLE_ID_HIGH=9000 CTDB can experience seemingly random (performance and other) issues if system resources become too constrained. Options in - this section can be enabled to allow certain system resources to - be checked. + this section can be enabled to allow certain system resources + to be checked. They allow warnings to be logged and nodes to + be marked unhealthy when system resource usage reaches the + configured thresholds. + + + + Some checks are enabled by default. It is recommended that + these checks remain enabled or are augmented by extra checks. + There is no supported way of completely disabling the checks. @@ -1331,7 +1339,8 @@ CTDB_PER_IP_ROUTING_TABLE_ID_HIGH=9000 left blank, meaning that check will be omitted. - No default. + Default is 80, so warnings will be logged when memory + usage reaches 80%. @@ -1349,7 +1358,8 @@ CTDB_PER_IP_ROUTING_TABLE_ID_HIGH=9000 left blank, meaning that check will be omitted. - No default. + Default is 25, so warnings will be logged when swap + usage reaches 25%. 
diff --git a/ctdb/tests/eventscripts/05.system.monitor.001.sh b/ctdb/tests/eventscripts/05.system.monitor.001.sh index 72262c19f14..5d513b6de41 100755 --- a/ctdb/tests/eventscripts/05.system.monitor.001.sh +++ b/ctdb/tests/eventscripts/05.system.monitor.001.sh @@ -4,6 +4,8 @@ define_test "Filesystem use check, error situation, no checks enabled" +setup_memcheck + CTDB_MONITOR_FILESYSTEM_USAGE="" setup_fscheck 100 ok_null diff --git a/ctdb/tests/eventscripts/05.system.monitor.002.sh b/ctdb/tests/eventscripts/05.system.monitor.002.sh index 22b479cc947..3a734895771 100755 --- a/ctdb/tests/eventscripts/05.system.monitor.002.sh +++ b/ctdb/tests/eventscripts/05.system.monitor.002.sh @@ -4,6 +4,8 @@ define_test "Filesystem use check, good situation, 1 error check enabled" +setup_memcheck + CTDB_MONITOR_FILESYSTEM_USAGE="/var::80" setup_fscheck ok_null diff --git a/ctdb/tests/eventscripts/05.system.monitor.003.sh b/ctdb/tests/eventscripts/05.system.monitor.003.sh index 3a0ad1aa387..8576e725e58 100755 --- a/ctdb/tests/eventscripts/05.system.monitor.003.sh +++ b/ctdb/tests/eventscripts/05.system.monitor.003.sh @@ -4,6 +4,8 @@ define_test "Filesystem use check, error situation, 1 error check enabled" +setup_memcheck + CTDB_MONITOR_FILESYSTEM_USAGE="/var::80" setup_fscheck 90 required_result 1 <= threshold 80% +WARNING: System swap utilization 100% >= threshold 25% +EOF simple_test diff --git a/ctdb/tests/eventscripts/05.system.monitor.012.sh b/ctdb/tests/eventscripts/05.system.monitor.012.sh index bb2c7b57811..aad789bee0a 100755 --- a/ctdb/tests/eventscripts/05.system.monitor.012.sh +++ b/ctdb/tests/eventscripts/05.system.monitor.012.sh @@ -2,7 +2,7 @@ . 
"${TEST_SCRIPTS_DIR}/unit.sh" -define_test "Memory check, good situation, all enabled" +define_test "Memory check, good situation, all memory checks enabled" setup_memcheck diff --git a/ctdb/tests/eventscripts/05.system.monitor.013.sh b/ctdb/tests/eventscripts/05.system.monitor.013.sh index a238a591b5a..4bf9f7be18c 100755 --- a/ctdb/tests/eventscripts/05.system.monitor.013.sh +++ b/ctdb/tests/eventscripts/05.system.monitor.013.sh @@ -2,7 +2,7 @@ . "${TEST_SCRIPTS_DIR}/unit.sh" -define_test "Memory check, bad situation, only swap check" +define_test "Memory check, bad situation, custom swap critical" setup_memcheck 100 90 @@ -10,6 +10,7 @@ CTDB_MONITOR_MEMORY_USAGE="" CTDB_MONITOR_SWAP_USAGE=":50" required_result 1 <= threshold 80% ERROR: System swap utilization 90% >= threshold 50% $FAKE_PROC_MEMINFO $(ps foobar) diff --git a/ctdb/tests/eventscripts/05.system.monitor.014.sh b/ctdb/tests/eventscripts/05.system.monitor.014.sh index 46955f34020..48630c4911e 100755 --- a/ctdb/tests/eventscripts/05.system.monitor.014.sh +++ b/ctdb/tests/eventscripts/05.system.monitor.014.sh @@ -2,7 +2,7 @@ . "${TEST_SCRIPTS_DIR}/unit.sh" -define_test "Memory check, bad situation, only memory warning" +define_test "Memory check, bad memory situation, custom memory warning" setup_memcheck 90 10 diff --git a/ctdb/tests/eventscripts/05.system.monitor.015.sh b/ctdb/tests/eventscripts/05.system.monitor.015.sh index 383e06db001..7d73bcf3ac8 100755 --- a/ctdb/tests/eventscripts/05.system.monitor.015.sh +++ b/ctdb/tests/eventscripts/05.system.monitor.015.sh @@ -2,7 +2,7 @@ . 
"${TEST_SCRIPTS_DIR}/unit.sh" -define_test "Memory check, bad situation, only memory critical" +define_test "Memory check, bad situation, custom memory critical" setup_memcheck 90 0 diff --git a/ctdb/tests/eventscripts/05.system.monitor.017.sh b/ctdb/tests/eventscripts/05.system.monitor.017.sh index 4a838744b9d..b976dba29a8 100755 --- a/ctdb/tests/eventscripts/05.system.monitor.017.sh +++ b/ctdb/tests/eventscripts/05.system.monitor.017.sh @@ -2,7 +2,7 @@ . "${TEST_SCRIPTS_DIR}/unit.sh" -define_test "Memory check, bad situation, both memory checks, causes unhealthy" +define_test "Memory check, bad situation, both custom memory checks, causes unhealthy" setup_memcheck 87 0 -- 2.11.4.GIT