compresslog: migrated to "logger" infra to make better use of syslog
#!/usr/bin/python

## echo | compresslog /path/to/my/log
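##
## compresslog reads log lines on stdin (e.g. as an Apache piped
## logger) and appends them, gzip-compressed, to the file named on
## the command line; with --stats it also tallies simple traffic
## statistics from the lines as they stream through.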
import os
import sys
import select
import signal
import gzip
import zlib
import subprocess
import fcntl
import errno
from optparse import OptionParser
import logging
import logging.handlers
from datetime import datetime
import apache_log_parser
import numpy as np
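# third-party dependencies: apache_log_parser and numpy
# (e.g. via "pip install apache-log-parser numpy" - package names assumed)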
class parse_stats:

    def __init__(self, logfmt):
        self._parse_line = apache_log_parser.make_parser(logfmt)
        # per-instance state; kept out of the class body so the
        # mutable containers are not shared between instances
        self.ls_users = set()
        self.ct_hits = 0
        self.ct_pviews = 0
        self.pf_pviews_time = list()

    def parse_line(self, logline):
        try:
            log_parsed = self._parse_line(logline)
        except Exception:
            # unparseable line - skip it rather than fall through
            # to an undefined log_parsed below
            return
        if log_parsed['status'] != '200':
            return
        self.ct_hits = self.ct_hits + 1
        if log_parsed['response_header_content_type'] != 'text/html':
            return
        self.ct_pviews = self.ct_pviews + 1
        self.pf_pviews_time.append(int(log_parsed['time_us']))
        user = log_parsed['note_moodleuser']
        if user != '-':
            self.ls_users.add(user)
    def print_stats(self):
        print "hits %s" % self.ct_hits
        print "pageviews %s" % self.ct_pviews
        # only compute timing stats when we saw at least one
        # pageview - numpy raises on an empty sample
        if self.pf_pviews_time:
            print "avg pview time us %s" % np.average(self.pf_pviews_time)
            print "50th percentile %s" % np.percentile(self.pf_pviews_time, 50)
            print "60th percentile %s" % np.percentile(self.pf_pviews_time, 60)
            print "70th percentile %s" % np.percentile(self.pf_pviews_time, 70)
            print "80th percentile %s" % np.percentile(self.pf_pviews_time, 80)
        print "unique users %s" % len(self.ls_users)
# TODO: Find a reasonable way to make this configurable...
pstats = parse_stats('%h %l %{MOODLEUSER}n %t %D \"%{Content-Type}o\" \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"')
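# A matching Apache configuration might look like this (assumed
# example - the LogFormat string must mirror the parser format above,
# and the compresslog path and nickname are hypothetical):
#   LogFormat "%h %l %{MOODLEUSER}n %t %D \"%{Content-Type}o\" \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" moodle_log
#   CustomLog "|/usr/local/bin/compresslog --stats /var/log/apache2/access_log" moodle_log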
parser = OptionParser()
parser.add_option('-d', '--debug', dest='debug', default=False, action='store_true',
                  help='Print debugging info to stderr, no output to syslog')
parser.add_option('--stats', dest='stats', default=False, action='store_true',
                  help='Parse the input and report stats, assuming it is a specially-formatted apache access log')
(opts, args) = parser.parse_args()
if not args:
    parser.error('destination log file name required')
fname = args[0]
## Set up logging to syslog bright and early
##
# when not in debug mode, this will route stderr to syslog,
# including stack traces from unhandled exceptions
logger = logging.getLogger('compresslog')
if opts.debug:  # "debug" mode disables logging to syslog
    logh = logging.StreamHandler()
    # the docs suggest setting the level on the handler, but in
    # practice only setting it on the logger itself works here
    #logh.setLevel(logging.DEBUG)
    logger.setLevel(logging.DEBUG)
else:
    # writing to /dev/log keeps us compatible with systemd's journal
    logh = logging.handlers.SysLogHandler(address='/dev/log')
    # logger.setLevel(logging.DEBUG)
logf = logging.Formatter('compresslog: %(message)s')
logh.setFormatter(logf)
logger.addHandler(logh)
def errorprint(msg):
    logger.error(msg)
    sys.stderr.write(msg + '\n')

def alt_name(fname):
    # using microseconds (%f) to sidestep races
    datestr = datetime.now().strftime('_%Y%m%d-%H%M%S-%f')
    if fname.endswith('_log'):
        return fname[0:-4] + datestr + '_log'
    else:
        return fname + datestr
## Work around several issues
##
## - gzip files, due to their nature, cannot sustain two simultaneous
##   writers. Pure text log files can, thanks to promises from POSIX,
##   implemented in the kernel, about the integrity of writes of
##   newline-delimited data.
##
## - Apache 2.2 will sometimes open multiple piped loggers to the same
##   file. Apache 2.4 seems to do it only if the piped-logger config
##   string is different; v2.2 is less predictable about this.
##   This means several compresslog processes are started at the same
##   time and race to create the file.
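##
## The strategy below: open the file and try to take an exclusive
## flock(); if another compresslog already holds the lock, retry once
## under a unique timestamped name from alt_name() instead of fighting
## over the same gzip stream.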
renamed = False
while True:
    ## Try to open the file and lock it exclusively
    f = gzip.open(fname, 'a')
    try:
        fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError, e:
        if e.errno == errno.EACCES or e.errno == errno.EAGAIN:
            sys.stderr.write("Cannot flock %s\n" % fname)
            if renamed:
                # we already retried under an alternative name, give up
                # (note: renamed must be initialized outside the loop,
                # or this branch can never trigger)
                sys.exit(1)
            else:
                fname = alt_name(fname)
                sys.stderr.write("Retrying with %s\n" % fname)
                renamed = True
                continue
        else:
            errorprint("Unexpected error in flock(): %u" % e.errno)
            raise

    # make sure we are appending to an empty or a gzip file,
    # to avoid appending compressed data to a plaintext log
    if os.stat(fname).st_size != 0:
        mimetype = subprocess.Popen(['/usr/bin/file', '--mime-type', '-b', fname],
                                    stdout=subprocess.PIPE).communicate()[0]
        mimetype = mimetype.rstrip()
        # newer versions of file(1) report 'application/gzip'
        if mimetype not in ('application/x-gzip', 'application/gzip'):
            mvfname = alt_name(fname)
            errorprint("Destination file exists and is not compressed (%s), renaming old file to %s"
                       % (mimetype, mvfname))
            os.rename(fname, mvfname)
            # release the lock, restart the loop
            # unfortunately this will append a gzip header
            f.close()
            continue

    # success - out of the error handling loop
    break
sig_wants_flush = False
sig_wants_flushexit = False

def sig_flush(signum, frame):
    logger.debug('sig_flush')
    global sig_wants_flush
    sig_wants_flush = True

def sig_flushexit(signum, frame):
    logger.debug('sig_flushexit')
    global sig_wants_flushexit
    global timeout
    sig_wants_flushexit = True
    timeout = 0
## ensure we flush on various signals
# USR1/USR2: user asked us to flush out
# HUP: apache is no longer writing to us (hung up)
signal.signal(signal.SIGUSR1, sig_flush)
signal.signal(signal.SIGUSR2, sig_flush)
signal.signal(signal.SIGHUP, sig_flushexit)
signal.signal(signal.SIGTERM, sig_flushexit)
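# for example, a log-shipping job could force a flush point with
#   kill -USR1 $(pgrep -f compresslog)
# (hypothetical invocation - adjust to how the process is supervised)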
# while True / select / read(int) / O_NONBLOCK put
# input buffering explicitly in our hands
# (preserve existing flags rather than clobbering them)
flags = fcntl.fcntl(sys.stdin, fcntl.F_GETFL)
fcntl.fcntl(sys.stdin, fcntl.F_SETFL, flags | os.O_NONBLOCK)

at_eof = False
timeout = 60

buf = ''
buftail = ''
linecount = 0
while True:

    try:
        rfds = select.select([sys.stdin], [], [], timeout)[0]
        logger.debug('select: %u' % len(rfds))
        if len(rfds):  # only read() when select says there is data
            logger.debug('read()')
            # will not block
            buf = sys.stdin.read(4096)
            logger.debug('read %u' % len(buf))
            if buf:
                # If we do get EINTR during write, in Python < 2.7.2
                # the write may be incomplete, but we have no way to
                # know and retry.
                # http://bugs.python.org/issue10956
                f.write(buf)
                if opts.stats:
                    # reassemble lines across read() boundaries
                    if buftail:
                        logger.debug('line tail length %u' % len(buftail))
                        buf = buftail + buf
                    loglines = buf.split('\n')
                    # the last element is an incomplete line, save it for later
                    buftail = loglines.pop()
                    for logline in loglines:
                        linecount = linecount + 1
                        pstats.parse_line(logline)
            else:
                at_eof = True
                logger.debug('at_eof')
    except select.error, e:
        if e[0] == errno.EINTR:
            logger.debug('E in select: %u' % e[0])
            continue  # on signal, restart at the top
        else:
            raise
    except IOError, e:
        logger.debug('E in read() or write(): %u' % e[0])
        if e[0] == errno.EINTR:
            continue  # on signal, restart at the top
        else:
            raise
    if at_eof or sig_wants_flushexit:
        f.close()
        sys.stdin.close()
        logger.debug('fh closed, exiting')
        if opts.stats:
            pstats.print_stats()
        sys.exit(0)
    if sig_wants_flush:
        sig_wants_flush = False
        try:
            # Z_FULL_FLUSH makes everything written so far
            # decompressible even if we die later
            f.flush(zlib.Z_FULL_FLUSH)
            logger.debug('flush')
        except:
            logger.debug('E in flush')
            # ignore exceptions, try to keep rolling
            pass