From 695da0b90ff32e53c080e7666d07cac1f3065af1 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Fri, 24 Jun 2016 15:01:24 +1000 Subject: [PATCH] pmlogger: auto-reconnect to pmcd whenever possible Extend Dave's consistency checking to also allow pmlogger to reconnect automatically to pmcd (just like pmie always has). This is important for the common case of a pmcd restart on a host monitored by a central pmlogger farm. Previously, we were open to some minutes of missed data until the cron pmlogger_check kicked in - now it is instantaneous. Its worth noting that we still always start a new pmlogger with each new day, even with this change in place. Verify the auto-reconnection and presence of a mark record via the qa/389 test script. --- qa/389 | 21 +++++++++++------- qa/389.out | 18 +++++++++------- src/pmlogger/src/check.c | 4 ++-- src/pmlogger/src/fetch.c | 2 -- src/pmlogger/src/logger.h | 2 -- src/pmlogger/src/pmlogger.c | 52 ++++++++++++++++++++++++++++----------------- 6 files changed, 58 insertions(+), 41 deletions(-) diff --git a/qa/389 b/qa/389 index 42bffa28e..4a2c0773a 100755 --- a/qa/389 +++ b/qa/389 @@ -1,7 +1,6 @@ #! /bin/sh # PCP QA Test No. 389 -# pmlogger does not exit with -L when it loses pmcd connection? -# #528442 +# Exercise pmlogger behaviour when it loses pmcd connection # # Copyright (c) 1995-2002 Silicon Graphics, Inc. All Rights Reserved. # @@ -14,6 +13,7 @@ echo "QA output created by $seq" . ./common.filter . ./common.check +signal=$PCP_BINADM_DIR/pmsignal host=`hostname` status=1 # failure is the default! @@ -35,27 +35,32 @@ ps $PCP_PS_ALL_FLAGS | $PCP_AWK_PROG '$2 == "'$!'" { print }' _filter_pmlogger_log <$tmp.log | sed -e "s/$host/HOST/" echo -echo "=== empty config and -L, should exit when pmcd restarted ===" +echo "=== empty config and -L, no reconnect and no exit ===" $sudo rm -f $tmp.* _start_up_pmlogger -L -c /dev/null -l $tmp.log $tmp _wait_for_pmlogger $pid $tmp.log 5 $sudo $PCP_RC_DIR/pcp restart | _filter_pcp_start _wait_for_pmcd -echo "expect no pmlogger process ..." -ps $PCP_PS_ALL_FLAGS | $PCP_AWK_PROG '$2 == "'$!'" { print }' +echo "expect pmlogger process ..." +ps $PCP_PS_ALL_FLAGS | $PCP_AWK_PROG '$2 == "'$pid'" { print "OK"}' _filter_pmlogger_log <$tmp.log | sed -e "s/$host/HOST/" +$sudo $signal -s TERM $pid echo -echo "=== non-empty config, should exit when pmcd restarted ===" +echo "=== non-empty config, reconnect when pmcd restarted ===" $sudo rm -f $tmp.* echo "log mandatory on 1 sec pmcd.version" >$tmp.config _start_up_pmlogger -c $tmp.config -l $tmp.log $tmp _wait_for_pmlogger $pid $tmp.log 5 $sudo $PCP_RC_DIR/pcp restart | _filter_pcp_start _wait_for_pmcd -echo "expect no pmlogger process ..." -ps $PCP_PS_ALL_FLAGS | $PCP_AWK_PROG '$2 == "'$!'" { print }' +echo "expect pmlogger process ..." +ps $PCP_PS_ALL_FLAGS | $PCP_AWK_PROG '$2 == "'$pid'" { print "OK"}' _filter_pmlogger_log <$tmp.log | _filter | sed -e "s/$host/HOST/" +$sudo $signal -s TERM $pid +sleep 1 +echo "expect one mark record ..." +pmdumplog -M $tmp | grep '' | wc -l # success, all done status=0 diff --git a/qa/389.out b/qa/389.out index 41f53fc35..c316822e2 100644 --- a/qa/389.out +++ b/qa/389.out @@ -9,11 +9,12 @@ Nothing to log, and not the primary logger instance ... good-bye Log finished DATE -=== empty config and -L, should exit when pmcd restarted === +=== empty config and -L, no reconnect and no exit === Waiting for pmcd to terminate ... Starting pmcd ... Starting pmlogger ... -expect no pmlogger process ... +expect pmlogger process ... +OK Log for pmlogger on HOST started DATE Config parsed @@ -21,18 +22,19 @@ Starting logger for host "HOST" Archive basename: ARCHIVE pmlogger: Lost connection to PMCD on "HOST" at DATE -Log finished DATE - -=== non-empty config, should exit when pmcd restarted === +=== non-empty config, reconnect when pmcd restarted === Waiting for pmcd to terminate ... Starting pmcd ... Starting pmlogger ... -expect no pmlogger process ... +expect pmlogger process ... +OK Log for pmlogger on HOST started DATE Config parsed Starting logger for host "HOST" Archive basename: ARCHIVE pmlogger: Lost connection to PMCD on "HOST" at DATE - -Log finished DATE +pmlogger: re-established connection to PMCD on "HOST" at DATE +pmlogger: Validating metrics after PMCD state changed at DATE +expect one mark record ... +1 diff --git a/src/pmlogger/src/check.c b/src/pmlogger/src/check.c index 2f3ecb33d..830c269a4 100644 --- a/src/pmlogger/src/check.c +++ b/src/pmlogger/src/check.c @@ -186,8 +186,8 @@ validate_metrics(void) char buf1[20], buf2[20]; time(&now); - fprintf(stderr, "\n%s", ctime(&now)); - fprintf(stderr, "Validating configured metrics after pmcd state change\n"); + fprintf(stderr, "%s: Validating metrics after PMCD state changed at %s", + pmProgname, ctime(&now)); /* * Check each metric in each element of the task list, whether it is diff --git a/src/pmlogger/src/fetch.c b/src/pmlogger/src/fetch.c index 313495efc..cbb23675c 100644 --- a/src/pmlogger/src/fetch.c +++ b/src/pmlogger/src/fetch.c @@ -79,7 +79,6 @@ myFetch(int numpmid, pmID pmidlist[], __pmPDU **pdup) else return PM_ERR_NOCONTEXT; -#if CAN_RECONNECT if (ctxp->c_pmcd->pc_fd == -1) { /* lost connection, try to get it back */ n = reconnect(); @@ -88,7 +87,6 @@ myFetch(int numpmid, pmID pmidlist[], __pmPDU **pdup) return n; } } -#endif if (ctxp->c_sent == 0) { /* diff --git a/src/pmlogger/src/logger.h b/src/pmlogger/src/logger.h index a5859df87..b33faa659 100644 --- a/src/pmlogger/src/logger.h +++ b/src/pmlogger/src/logger.h @@ -128,9 +128,7 @@ extern int newvolume(int); extern void validate_metrics(void); extern void disconnect(int); -#if CAN_RECONNECT extern int reconnect(void); -#endif extern int do_preamble(void); extern void run_done(int,char *); extern __pmPDU *rewrite_pdu(__pmPDU *, int); diff --git a/src/pmlogger/src/pmlogger.c b/src/pmlogger/src/pmlogger.c index 45d70755e..ba27a8333 100644 --- a/src/pmlogger/src/pmlogger.c +++ b/src/pmlogger/src/pmlogger.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2015 Red Hat. + * Copyright (c) 2012-2016 Red Hat. * Copyright (c) 1995-2001,2003 Silicon Graphics, Inc. All Rights Reserved. * * This program is free software; you can redistribute it and/or modify it @@ -1281,21 +1281,14 @@ void disconnect(int sts) { time_t now; -#if CAN_RECONNECT int ctx; __pmContext *ctxp; -#endif time(&now); if (sts != 0) fprintf(stderr, "%s: Error: %s\n", pmProgname, pmErrStr(sts)); fprintf(stderr, "%s: Lost connection to PMCD on \"%s\" at %s", pmProgname, pmcd_host, ctime(&now)); -#if CAN_RECONNECT - if (primary) { - fprintf(stderr, "This is fatal for the primary logger."); - exit(1); - } if (pmcdfd != -1) { close(pmcdfd); __pmFD_CLR(pmcdfd, &fds); @@ -1310,36 +1303,57 @@ disconnect(int sts) } ctxp->c_pmcd->pc_fd = -1; PM_UNLOCK(ctxp->c_lock); -#else - exit(1); -#endif } -#if CAN_RECONNECT int reconnect(void) { int sts; int ctx; + time_t now; __pmContext *ctxp; if ((ctx = pmWhichContext()) >= 0) ctxp = __pmHandleToPtr(ctx); if (ctx < 0 || ctxp == NULL) { - fprintf(stderr, "%s: reconnect botch: cannot get context: %s\n", pmProgname, pmErrStr(ctx)); + fprintf(stderr, "%s: reconnect botch: cannot get context: %s\n", + pmProgname, pmErrStr(ctx)); exit(1); } sts = pmReconnectContext(ctx); if (sts >= 0) { - time_t now; - time(&now); - fprintf(stderr, "%s: re-established connection to PMCD on \"%s\" at %s\n", - pmProgname, pmcd_host, ctime(&now)); pmcdfd = ctxp->c_pmcd->pc_fd; __pmFD_SET(pmcdfd, &fds); numfds = maxfd() + 1; } PM_UNLOCK(ctxp->c_lock); - return sts; + if (sts < 0) + return sts; + + time(&now); + fprintf(stderr, "%s: re-established connection to PMCD on \"%s\" at %s", + pmProgname, pmcd_host, ctime(&now)); + + /* + * Metrics may have changed while PMCD was unreachable, so we + * need to recheck each metric to make sure that its PMID and + * semantics have not changed. We cannot recover if there is + * an incompatible change - must defer to controlling scripts + * or processes (a new-named archive will have to be created, + * from a new pmlogger process, and pmlogrewrite/pmlogextract + * will need to become involved if they need to be merged). + */ + validate_metrics(); + + /* + * All metrics have been validated, however, this state change + * represents a potential gap in the stream of metrics. So we + * must store a record at this point. + */ + if ((sts = putmark()) < 0) { + fprintf(stderr, "putmark: %s\n", pmErrStr(sts)); + exit(1); + } + + return 0; } -#endif -- 2.11.4.GIT