#!/usr/bin/python -OO

## echo | compresslog /path/to/my/log
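##
## Intended to run as an Apache piped logger. A sketch of the config,
## assuming a combined-style log (path is illustrative, not part of
## this script):
##   CustomLog "|/usr/local/bin/compresslog /var/log/apache2/access_log" combined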

import os
import sys
import select
import signal
import gzip
import zlib
import subprocess
import fcntl
import errno
from optparse import OptionParser
from datetime import datetime
import syslog
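
# Minimal stand-in for the stdlib logging module: same level constants
# and method names, but messages go to syslog(3), or to stderr when
# syslog is disabled for debugging.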
class logging:

    CRITICAL = 50
    FATAL = CRITICAL
    ERROR = 40
    WARNING = 30
    WARN = WARNING
    INFO = 20
    DEBUG = 10
    NOTSET = 0

    def __init__(self, tagname, use_syslog=True):
        self.loglevel = self.WARN
        self.syslog = use_syslog
        if use_syslog:
            syslog.openlog(tagname)

    def setLevel(self, loglevel):
        self.loglevel = loglevel

    def warn(self, msg):
        if self.WARN >= self.loglevel:
            self._logit(msg)

    def error(self, msg):
        if self.ERROR >= self.loglevel:
            self._logit(msg)

    def info(self, msg):
        if self.INFO >= self.loglevel:
            self._logit(msg)

    def debug(self, msg):
        if self.DEBUG >= self.loglevel:
            self._logit(msg)

    def critical(self, msg):
        self._logit(msg)

    def _logit(self, msg):
        if self.syslog:
            syslog.syslog(msg)
        else:
            sys.stderr.write(msg + '\n')

class parse_stats:

    def __init__(self, logfmt):
        self._parse_line = apache_log_parser.make_parser(logfmt)
        self._reset()
        self.ajax_mimetypes = set(['application/ajax', 'text/xml'])

    def _reset(self):
        self.ls_users = set()
        self.ct_hits = 0
        self.ct_pviews = 0
        self.pf_pviews_time = list()
        self.ct_ajax_hits = 0

    def parse_line(self, logline):
        try:
            log_parsed = self._parse_line(logline)
        except:
            logger.error("could not parse line: %s" % logline)
            return
        if log_parsed['status'] != '200':
            return
        self.ct_hits = self.ct_hits + 1
        if log_parsed['request_method'] == 'POST':
            if log_parsed['response_header_content_type'] in self.ajax_mimetypes:
                self.ct_ajax_hits = self.ct_ajax_hits + 1
            if log_parsed['response_header_content_type'] == 'text/plain':
                if log_parsed['request_url'].find('ajax') > -1:
                    self.ct_ajax_hits = self.ct_ajax_hits + 1

        if log_parsed['response_header_content_type'] != 'text/html':
            return
        self.ct_pviews = self.ct_pviews + 1
        self.pf_pviews_time.append(int(log_parsed['time_us']))
        user = log_parsed['note_moodleuser']
        if user != '-':
            self.ls_users.add(user)

    def percentile(self, N, P):
        """
        http://stackoverflow.com/questions/2374640/how-do-i-calculate-percentiles-with-python-numpy
        Find the percentile of a list of values

        @parameter N - A list of values. N must be sorted.
        @parameter P - A float value from 0.0 to 1.0

        @return - The percentile of the values.
        """
        n = int(round(P * len(N) + 0.5))
        if n > 1:
            return N[n-2]
        else:
            return 0
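
    # Worked example of percentile() above: for the sorted list
    # N = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100] and P = 0.5,
    # n = int(round(0.5 * 10 + 0.5)) = 6, so N[6-2] = 50 is returned.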

    def log_stats(self, boundary):
        stats = { 'hits': 0,
                  'pviews': 0,
                  'pview_avg_us': 0,
                  'pview_50th_us': 0,
                  'pview_80th_us': 0,
                  'unique_users': 0,
                  'ajax_hits': 0,
                  'boundary_epoch': boundary }
        if self.ct_hits:
            stats['hits'] = self.ct_hits
            stats['ajax_hits'] = self.ct_ajax_hits
        if self.ct_pviews:
            stats['pviews'] = self.ct_pviews
            self.pf_pviews_time.sort()
            stats['pview_avg_us'] = sum(self.pf_pviews_time) / float(len(self.pf_pviews_time))
            stats['pview_50th_us'] = self.percentile(self.pf_pviews_time, 0.5)
            stats['pview_80th_us'] = self.percentile(self.pf_pviews_time, 0.8)
            stats['unique_users'] = len(self.ls_users)
        # passing separators explicitly gives us more compact encoding
        logger.info("stats %s" % json.dumps(stats, separators=(',',':')))
        self._reset()
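
# log_stats() emits one syslog line per period; an illustrative example
# (field values and key ordering are made up):
#   stats {"hits":212,"pviews":180,"pview_avg_us":48211.5,"pview_50th_us":45120,"pview_80th_us":61200,"unique_users":17,"ajax_hits":23,"boundary_epoch":1409652060}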

parser = OptionParser()
parser.add_option('-d', '--debug', dest='debug', default=0, action='count',
                  help='Print debugging info to stderr, no output to syslog')
parser.add_option('--stats', dest='stats', default=False, action='store_true',
                  help='Parse and process stats, assuming input is a specially-formatted apache access log')
(opts, args) = parser.parse_args()
fname = args[0]

## Setup logging to syslog bright and early

logger = logging('www_stats', (opts.debug == 0))
logger.setLevel(logging.INFO)
if opts.debug > 1:
    logger.setLevel(logging.DEBUG)

if opts.stats:
    # these libs add nontrivial memory footprint
    # import them only if we are going to use them...
    import apache_log_parser
    import math
    import json
    import time
    # TODO: Find a reasonable way to make this configurable...
    pstats = parse_stats('%h %l \"%{MOODLEUSER}n\" %t %D \"%{Content-Type}o\" \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"')
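    # Assumed matching Apache config (illustrative, not shipped with
    # this script):
    #   LogFormat "%h %l \"%{MOODLEUSER}n\" %t %D \"%{Content-Type}o\" \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" moodle_stats
    # which produces lines like (values made up):
    #   1.2.3.4 - "alice" [02/Sep/2014:10:15:30 +0000] 48211 "text/html" "GET /course/view.php?id=2 HTTP/1.1" 200 5120 "-" "Mozilla/5.0"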

def errorprint(msg):
    logger.error(msg)
    sys.stderr.write(msg + '\n')

def alt_name(fname):
    # using microsecond (%f) to sidestep races
    datestr = datetime.now().strftime('_%Y%m%d-%H%M%S-%f')
    if fname.endswith('_log'):
        return fname[0:-4] + datestr + '_log'
    else:
        return fname + datestr
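
# e.g. alt_name('access_log') -> 'access_20140902-153012-123456_log'
# and  alt_name('error.out')  -> 'error.out_20140902-153012-123456'
# (timestamps illustrative)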

## Work around several issues:
##
## - gzip files, due to their nature, cannot sustain two simultaneous
##   writers. Pure text log files can, thanks to promises from POSIX
##   implemented in the kernel about the integrity of writes of
##   newline-delimited data.
##
## - Apache 2.2 will sometimes open multiple piped loggers to the same
##   file. Apache 2.4 seems to only do it if the piped logger config
##   string is different; v2.2 is less predictable on this.
##   This means several compresslog processes are started at the same
##   time and race to create the file.

renamed = False
while True:
    ## Try to open file exclusively
    f = gzip.open(fname, 'a')
    try:
        fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError, e:
        if e.errno == errno.EACCES or e.errno == errno.EAGAIN:
            logger.warn("Cannot flock %s" % fname)
            # rename at most once; if the renamed file cannot be
            # locked either, give up
            if renamed:
                sys.exit(1)
            else:
                fname = alt_name(fname)
                logger.warn("Retrying with %s" % fname)
                renamed = True
                continue
        else:
            errorprint("Unexpected error in flock(): %u" % e.errno)
            raise

    # make sure we are appending to an empty or a gzip file
    # to avoid appending to a plaintext log
    if os.stat(fname)[6] != 0:
        mimetype = subprocess.Popen(['/usr/bin/file', '--mime-type', '-b', fname],
                                    stdout=subprocess.PIPE).communicate()[0]
        mimetype = mimetype.rstrip()
        if mimetype != 'application/x-gzip':
            mvfname = alt_name(fname)
            errorprint("Destination file exists and is not compressed (%s), renaming old file to %s"
                       % (mimetype, mvfname))
            os.rename(fname, mvfname)
            # release the lock, restart the loop
            # unfortunately this will append a gzip header
            f.close()
            continue

    # success - out of the error handling loop
    break
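
# At this point f is a flock()ed gzip handle that is safe to append to:
# the file was either empty or already gzip-compressed.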

sig_wants_flush = False
sig_wants_flushexit = False

def sig_flush(signum, frame):
    logger.debug('sig_flush')
    global sig_wants_flush
    sig_wants_flush = True

def sig_flushexit(signum, frame):
    logger.debug('sig_flushexit')
    global sig_wants_flushexit
    global timeout
    sig_wants_flushexit = True
    timeout = 0

## ensure we flush on various signals
# USR1/USR2: user asked to flush out
# HUP: apache is no longer writing to us (hung up)
signal.signal(signal.SIGUSR1, sig_flush)
signal.signal(signal.SIGUSR2, sig_flush)
signal.signal(signal.SIGHUP, sig_flushexit)
signal.signal(signal.SIGTERM, sig_flushexit)
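# e.g. to ask a running compresslog to flush its compressed stream
# without stopping it (illustrative):
#   pkill -USR1 -f compresslog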

# while True / select / read(int) / O_NONBLOCK put
# input buffering explicitly in our hands
fcntl.fcntl(sys.stdin, fcntl.F_SETFL, os.O_NONBLOCK)

at_eof = False
timeout = 60
statsboundary = False
buf = False
buftail = False
linecount = 0
PERIOD = 60  # 1m

while True:
    if opts.stats:
        # calculate next period boundary
        # and use that as timeout
        t = time.time()
        if (statsboundary == False) or (statsboundary < t):
            # next PERIOD boundary
            statsboundary = t - (t % PERIOD) + PERIOD
        timeout = statsboundary - t
        logger.debug('timeout in: %u s' % timeout)
    try:
        rfds = select.select([sys.stdin], [], [], timeout)[0]
        logger.debug('select: %u' % len(rfds))
        if len(rfds):  # only read() when select says
            logger.debug('read()')
            # will not block
            buf = sys.stdin.read(4096)
            logger.debug('read %u' % len(buf))
            if buf:
                # If we do get EINTR during write, in Python < 2.7.2
                # the write may be screwed/incomplete, but we
                # have no way to know & retry.
                # http://bugs.python.org/issue10956
                f.write(buf)
                if opts.stats:
                    # collect lines
                    if buftail:
                        logger.debug('line tail length %u' % len(buftail))
                        buf = buftail + buf
                    loglines = buf.split('\n')
                    # save the tail for later
                    buftail = loglines.pop()
                    for logline in loglines:
                        linecount = linecount + 1
                        pstats.parse_line(logline)
            else:
                at_eof = True
                logger.debug('at_eof')
        if opts.stats:
            t = time.time()
            if t > statsboundary:
                pstats.log_stats(statsboundary)
    except select.error, e:
        if e[0] == errno.EINTR:
            logger.debug('E in select: %u' % e[0])
            continue  # on signal, restart at the top
        else:
            raise
    except IOError, e:
        logger.debug('E in read() or write(): %u' % e[0])
        if e[0] == errno.EINTR:
            continue  # on signal, restart at the top
        else:
            raise

    if at_eof or sig_wants_flushexit:
        f.close()
        sys.stdin.close()
        if opts.stats:
            pstats.log_stats(statsboundary)
        logger.debug('fh closed, exiting')
        sys.exit(0)

    if sig_wants_flush:
        sig_wants_flush = False
        try:
            f.flush(zlib.Z_FULL_FLUSH)
            logger.debug('flush')
        except:
            logger.debug('E in flush')
            # ignore exceptions, try to keep rolling
            pass