compresslog: migrate to optparse to cleanup opt handling
[compresslog.git] / compresslog
blob1d4a5c5e8c1cec796c4c64892fb732ce99180566
1 #!/usr/bin/python
3 ## echo | compresslog /path/to/my/log
5 import os
6 import sys
7 import select
8 import signal
9 import gzip
10 import zlib
11 import subprocess
12 import fcntl
13 import errno
14 from optparse import OptionParser
15 from datetime import datetime
16 import apache_log_parser
17 import numpy as np
class parse_stats:
    """Accumulate request statistics from a Moodle-flavoured Apache access log.

    Tracks: hits (status 200), pageviews (200 + text/html), per-pageview
    service time in microseconds (%D), and the set of distinct logged-in
    users (%{MOODLEUSER}n).
    """

    def __init__(self, logfmt):
        # Compile the log-format parser once; parse_line() runs per line.
        self._parse_line = apache_log_parser.make_parser(logfmt)
        # Per-instance state. These used to be class attributes, which
        # would have been shared between instances.
        self.ls_users = set()       # distinct non-'-' MOODLEUSER values
        self.ct_hits = 0            # requests with status 200
        self.ct_pviews = 0          # 200 responses with text/html body
        self.pf_pviews_time = []    # service time (us) per pageview

    def parse_line(self, logline):
        """Parse one access-log line and update the counters.

        Unparseable lines are skipped without counting anything.
        """
        try:
            log_parsed = self._parse_line(logline)
        except Exception:
            # Bug fix: the original swallowed the error and fell through
            # with log_parsed unbound, raising NameError on the next line.
            return
        if log_parsed['status'] != '200':
            return
        self.ct_hits += 1
        if log_parsed['response_header_content_type'] != 'text/html':
            return
        self.ct_pviews += 1
        self.pf_pviews_time.append(int(log_parsed['time_us']))
        user = log_parsed['note_moodleuser']
        if user != '-':
            self.ls_users.add(user)

    def print_stats(self):
        """Print the accumulated statistics to stdout."""
        print("hits %s" % self.ct_hits)
        print("pageviews %s" % self.ct_pviews)
        # Guard: numpy average/percentile of an empty sequence warns/raises.
        if self.pf_pviews_time:
            print("avg pview time us %s" % np.average(self.pf_pviews_time))
            print("50th percentile %s" % np.percentile(self.pf_pviews_time, 50))
            print("60th percentile %s" % np.percentile(self.pf_pviews_time, 60))
            print("70th percentile %s" % np.percentile(self.pf_pviews_time, 70))
            print("80th percentile %s" % np.percentile(self.pf_pviews_time, 80))
        print("unique users %s" % len(self.ls_users))
# TODO: Find a reasonable way to make this configurable...
# NOTE: the format string must match the Apache CustomLog line that pipes
# into this script; %D (service time in us) and %{MOODLEUSER}n feed the stats.
pstats = parse_stats('%h %l %{MOODLEUSER}n %t %D \"%{Content-Type}o\" \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"')

parser = OptionParser(usage='%prog [options] /path/to/logfile')
parser.add_option('-d', '--debug', dest='debug', default=False, action='store_true',
                  help='Print debugging info to stderr, no output to syslog')
parser.add_option('--stats', dest='stats', default=False, action='store_true',
                  help='Parse and process stats assuming it is a specially-formatted apache access log')
(opts, args) = parser.parse_args()
if not args:
    # Bug fix: a missing positional argument used to die with a bare
    # IndexError; report usage and exit(2) instead.
    parser.error('destination log file argument is required')
fname = args[0]
## Setup logging to syslog bright and early
##
# this will route stderr to syslog, including
# stacktraces from unhandled exceptions
if not opts.debug: # "debug" mode disables logging to syslog
    # Spawn /usr/bin/logger tagged with our pid; everything written to its
    # stdin ends up in syslog.
    plogger = subprocess.Popen(['/usr/bin/logger', '-t',
                                'compresslog[%u]' % os.getpid()],
                               stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                               close_fds=True )
    # Replace our stderr fd (2) with the pipe to logger, so every later
    # sys.stderr.write() and any traceback goes to syslog.
    os.dup2(plogger.stdin.fileno(), sys.stderr.fileno())
79 ## init globals used in sig handler
80 # fh in append, binary
def debugprint(msg):
    """Write a newline-terminated trace to stderr, only under --debug."""
    if not opts.debug:
        return
    sys.stderr.write('%s\n' % msg)
def errorprint(msg):
    """Write msg to stderr unconditionally, newline-terminated.

    Reaches syslog when stderr has been redirected to the logger pipe.
    """
    sys.stderr.write('%s\n' % msg)
def alt_name(fname):
    """Derive a unique alternate file name, preserving a trailing '_log'.

    The timestamp goes before the '_log' suffix (if any) so log-rotation
    tools that match '*_log' still pick the file up.
    """
    # using microsecond (%f) to sidestep races
    stamp = datetime.now().strftime('_%Y%m%d-%H%M%S-%f')
    if fname.endswith('_log'):
        return fname[:-4] + stamp + '_log'
    return fname + stamp
## Work around several issues
##
## - gzip files, due to their nature cannot sustain two simultaneous
##   writers. Pure text log files can, thanks to promises from POSIX
##   implemented in the kernel about the integrity of writes of
##   newline-delimited data.
##
## - Apache 2.2 will sometimes open multiple piped loggers to the same
##   file. Apache 2.4 seems to only do it if the piped logger config
##   string is different. v2.2 is less predictable on this.
##   This means several compresslog processes are started at the same
##   time and race to create the file.

# Bug fix: 'renamed' must survive loop iterations. It used to be reset to
# False at the top of every pass, so the give-up exit below was dead code.
renamed = False
while True:
    ## Try to open file exclusively
    f = gzip.open(fname, 'a')
    try:
        fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError as e:
        if e.errno == errno.EACCES or e.errno == errno.EAGAIN:
            # Another compresslog holds the lock.
            sys.stderr.write("Cannot flock %s\n" % fname)
            if renamed:
                # Already retried once with a unique name; give up.
                sys.exit(1)
            # Bug fix: close the unlocked handle before retrying so we
            # don't leak an fd per attempt.
            f.close()
            fname = alt_name(fname)
            sys.stderr.write("Retrying with %s\n" % fname)
            renamed = True
            continue
        else:
            errorprint("Unexpected error in flock(): %u" % e.errno)
            raise

    # make sure we are appending to an empty or a gzip file
    # to avoid appending to a plaintext log
    if os.stat(fname).st_size != 0:
        mimetype = subprocess.Popen(['/usr/bin/file', '--mime-type', '-b', fname],
                                    stdout=subprocess.PIPE).communicate()[0]
        mimetype = mimetype.rstrip()
        if mimetype != 'application/x-gzip':
            mvfname = alt_name(fname)
            # Bug fix: the old message ran mimetype and the new name
            # together with no separator, producing garbled log output.
            errorprint("Destination file exists and is not compressed (%s), renaming old file to %s"
                       % (mimetype, mvfname))
            os.rename(fname, mvfname)
            # release the lock, restart the process
            # unfortunately this will append a gzip header
            f.close()
            continue

    # success - out of the error handling loop
    break
# Flags flipped from signal context and polled by the main loop below.
sig_wants_flush = False
sig_wants_flushexit = False

def sig_flush(signum, frame):
    """USR1/USR2 handler: ask the main loop to flush the compressor."""
    debugprint('sig_flush')
    global sig_wants_flush
    sig_wants_flush = True

def sig_flushexit(signum, frame):
    """HUP/TERM handler: ask the main loop to flush and exit.

    Zeroing the select() timeout makes the main loop notice immediately.
    """
    debugprint('sig_flushexit')
    global sig_wants_flushexit, timeout
    sig_wants_flushexit = True
    timeout = 0
## ensure we flush on various signals
# USR1/USR2: user asked to flush out
# HUP: apache is no longer writing to us (hung up)
signal.signal(signal.SIGUSR1, sig_flush)
signal.signal(signal.SIGUSR2, sig_flush)
signal.signal(signal.SIGHUP, sig_flushexit)
signal.signal(signal.SIGTERM, sig_flushexit)

# while True / select / read(int) / O_NONBLOCK put
# input buffering explicitly in our hands
# NOTE(review): F_SETFL overwrites the status flags rather than OR-ing in
# O_NONBLOCK; harmless for a fresh stdin pipe but worth confirming.
fcntl.fcntl(sys.stdin, fcntl.F_SETFL, os.O_NONBLOCK)

# Main-loop state:
at_eof=False        # set when read() returns '' (writer closed the pipe)
timeout=60          # select() timeout in seconds; dropped to 0 on HUP/TERM
buf = False         # current read chunk (False until first read)
buftail = False     # partial last line carried over between chunks
linecount = 0       # complete log lines handed to the stats parser
# Main pump: wait for input with select(), copy raw bytes into the gzip
# stream, optionally feed complete lines to the stats parser, and honor
# the flush/exit flags set by the signal handlers.
while True:

    try:
        rfds = select.select([sys.stdin], [], [], timeout)[0]
        debugprint('select: %u' % len(rfds))
        if len(rfds): # only read() when select says
            debugprint('read()')
            # will not block
            buf = sys.stdin.read(4096)
            debugprint('read %u' % len(buf))
            if buf:
                # If we do get EINTR during write, in Python < 2.7.2
                # the write may be screwed/incomplete, but we
                # have no way to know&retry.
                # http://bugs.python.org/issue10956
                f.write(buf)
                debugprint('wrote')
                if opts.stats:
                    # collect lines
                    if buftail:
                        debugprint('line tail length %u' % len(buftail))
                        # prepend the partial line left over from the
                        # previous chunk
                        buf = buftail + buf
                    loglines = buf.split('\n')
                    # save the tail for later
                    # (last element is '' when the chunk ended on '\n')
                    buftail = loglines.pop()
                    for logline in loglines:
                        linecount = linecount + 1
                        pstats.parse_line(logline)
            else:
                # empty read after select readiness means the writer
                # (apache) closed the pipe
                at_eof = True
                debugprint('at_eof')
    except select.error, e:
        if e[0] == errno.EINTR:
            debugprint('E in select: %u' % e[0])
            continue # on signal, restart at the top
        else:
            raise
    except IOError, e:
        debugprint('E in read() or write(): %u' % e[0])
        if e[0] == errno.EINTR:
            continue # on signal, restart at the top
        else:
            raise

    # HUP/TERM or EOF: finalize the gzip stream and exit cleanly.
    if at_eof or sig_wants_flushexit:
        f.close()
        sys.stdin.close()
        debugprint('fh closed, exiting')
        if opts.stats:
            pstats.print_stats()
        sys.exit(0)

    # USR1/USR2: emit a full flush point so the file is readable up to
    # here without closing the stream.
    if sig_wants_flush:
        sig_wants_flush = False
        try:
            f.flush(zlib.Z_FULL_FLUSH)
            debugprint('flush')
        except:
            debugprint('E in flush')
            # ignore exceptions, try to keep rolling
            pass