#!/usr/bin/python2.5
#
# Copyright 2010 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
18 """Tool to parse appcfg request_logs to a sqlite database for further analysis.
20 Example:
22 # Download request logs (with applogs) into requests.txt
23 appcfg.py request_logs --severity 0 --include_all <appdirectory> requests.txt
24 # Run the logparser to insert them into requests.db.
25 # (You can specify multiple input files.)
26 logparser.py --db requests.db requests.txt
27 # Query them using the sqlite3 interactive command line interface.
28 sqlite3 requests.db
29 sqlite> -- what are the most common 404s?
30 sqlite> select distinct request_line, count(*) from requests
31 ...> where status == 404 group by request_line order by request_line desc;
32 sqlite> -- what requests see 'Deadline Exceeded'?
33 sqlite> select distinct request_line from requests
34 ...> where applog like '%DeadlineExceeded%';
35 sqlite> -- How many loading requests were seen?
36 sqlite> select count(*) from requests where loading_request=1;
37 sqlite> -- What was the average cpm across all pages?
38 sqlite> select sum(cpm_usd)/count(cpm_usd) from requests;


Contents of the Database Table:

The database contains request logs parsed from the downloaded log file. That
file contains Apache Combined Log Format* lines, each followed by zero or
more tab-indented logging statements from the application.

The default format of the request line is
"%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\""

If you specify --include_vhost or --include_all, it becomes
"%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\" \"%v\""

If you add --include_all, it is the same as above plus zero or more of the
following tags (an illustrative excerpt follows the list):
  ms (elapsed request time)
  cpu_ms
  api_cpu_ms
  cpm_usd (float)
  queue_name (task queue name)
  task_name (task queue task name)
  loading_request=1 (if this is a loading request)
  pending_ms (time in pending queue)
  exit_code=int
  throttle_code=int
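
For illustration, a downloaded log line with tags, followed by one
tab-indented applog line, might look like this (all values here are made up,
and the second line begins with a tab in the real file):

  10.1.2.3 - - [05/Oct/2010:17:57:03 -0700] "GET /w HTTP/1.1" 200 532 "-" "curl/7.19" "myapp.appspot.com" ms=112 cpu_ms=85 loading_request=0
  	1:1286325423.286856 Found 12 widgets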

These are recorded in a sqlite database in a table 'requests', which
has the following columns (source noted in parentheses):
  remotehost (%h)
  user (%u)
  request_time_str (%t)
  request_line (%r)
  status (%>s)
  bytes (%b)
  referer (%{Referer}i)
  useragent (%{User-agent}i)
  host (%v)

  ms
  cpu_ms
  api_cpu_ms
  cpm_usd
  queue_name
  task_name
  loading_request
  pending_ms
  exit_code
  throttle_code
  instance
  app_engine_release

TODO: Parse method, path, querystring, and protocol from request_line.

The additional logging lines are combined into the following columns:
  applog, applog0, applog1, applog2, applog3, applog4
The first (applog) combines all severity levels; each of the others includes
only a single severity level to allow for precise queries.
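
For example (the log content here is hypothetical), to find requests whose
Info-level (severity 1) applogs mention a cache miss:

sqlite> select request_line from requests where applog1 like '%cache miss%';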

* http://httpd.apache.org/docs/1.3/mod/mod_log_config.html#formats
  The above is the particular format which App Engine outputs; this log
  parser is not a general parser.


Custom Columns:

You can also specify regular expressions to run over each applog line to
create one or more custom columns.

Example:

  --custom_column "widgets:^1:[0-9.]+ Found ([0-9]+) widgets"

This will locate all Info logs (severity 1) which look like this:

  1:1286325423.286856 Found 12 widgets

and pull out the "12" part. Now you can run queries like these:

sqlite> -- How many widgets were seen?
sqlite> select sum(widgets) from requests;
sqlite> -- What was the approximate processing time per widget?
sqlite> select sum(cpu_ms)/cast(sum(widgets) as float)
   ...> from requests where widgets > '';

You can specify multiple --custom_column flags.
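
For instance (the second column's name and pattern here are purely
illustrative):

  logparser.py --db requests.db \
      --custom_column "widgets:^1:[0-9.]+ Found ([0-9]+) widgets" \
      --custom_column "cache:^1:[0-9.]+ Cache (HIT|MISS)" \
      requests.txt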


Handling Duplicate Items:

If --discard_duplicates is specified, the entire request log line is stored
in "request_log" as the primary key. This allows you to run this tool
multiple times over several files to combine them--only the first occurrence
of each line is recorded. However, you may lose legitimate duplicate records;
adding --include_all will reduce the chance of legitimate duplicates
occurring.

This is particularly useful because request_logs has no mechanism to download
all request log and app log items at once. A common pattern might be:

# Download all applogs.
appcfg.py request_logs --severity 0 --include_all <appdirectory> applogs.txt
# Download all requestlogs. There may be overlap with the applogs.
appcfg.py request_logs --include_all <appdirectory> requests.txt
# Run the logparser to insert them into requests.db.
logparser.py --discard_duplicates --db requests.db requests.txt applogs.txt
"""

# pylint: disable-msg=C6409

import logging
import optparse
import re
import sqlite3
import sys


class Error(Exception):
  """Base logparser error type."""


class LineParsingFailure(Error):
  """Cannot parse the line."""


def create_database(db_filename, discard_duplicates, custom_columns):
  """Create a connection to the database, create needed tables.

  Args:
    db_filename: Path to the db file.
    discard_duplicates: If it should attempt to discard duplicate log lines.
    custom_columns: Dictionary, column name: regexp.

  Returns:
    Sqlite database connection, with tables created.
  """
  connection = sqlite3.connect(db_filename, isolation_level=None)

  if discard_duplicates:
    key_column = ' request_log text primary key,\n'
  else:
    key_column = ''

  if custom_columns:
    custom_column_signature = (
        ', %s text\n' % ' text\n, '.join(custom_columns))
  else:
    custom_column_signature = ''

  table_signature = (
      'create table requests (\n'
      '%s'
      ' remotehost text,\n'
      ' user text,\n'
      ' request_time_str text,\n'
      ' request_time DATETIME,\n'  # Not yet really implemented.
      ' request_line text,\n'
      ' status int,\n'
      ' bytes int,\n'
      ' referer text,\n'
      ' useragent text,\n'
      ' host text,\n'
      ' ms int,\n'
      ' cpu_ms int,\n'
      ' api_cpu_ms int,\n'
      ' cpm_usd float,\n'
      ' queue_name text,\n'
      ' task_name text,\n'
      ' loading_request boolean,\n'
      ' pending_ms int,\n'
      ' exit_code int,\n'
      ' throttle_code int,\n'
      ' instance text,\n'
      ' app_engine_release text,\n'
      ' method text,\n'  # Not yet implemented.
      ' path text,\n'  # Not yet implemented.
      ' querystring text,\n'  # Not yet implemented.
      ' protocol text,\n'  # Not yet implemented.
      ' applog text,\n'
      ' applog0 text,\n'
      ' applog1 text,\n'
      ' applog2 text,\n'
      ' applog3 text,\n'
      ' applog4 text\n'
      ' %s'
      ')'
  ) % (key_column, custom_column_signature)
  try:
    connection.execute(table_signature)
  except sqlite3.OperationalError, e:
    if 'already exists' not in e.message:
      logging.exception('Exception creating table:')
      raise
  # TODO: Check that the schema matches the requested columns.

  return connection
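

# With discard_duplicates=True and no custom columns, the generated
# statement begins and ends like this (abbreviated):
#   create table requests (
#    request_log text primary key,
#    remotehost text,
#    ...
#    applog4 text
#   )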


# An ugly regex to match the start of the request log line:
#   %h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"
LINE_RE = ('([^ ]+) - ([^ ]+) \[([^]]+)\] (-|"(\\\\.|[^"])*") ([^ ]+) '
           '([^ ]+) (-|"(\\\\.|[^"])*") (-|"(\\\\.|[^"])*")')
LINE_RE_COMPILED = re.compile(LINE_RE)
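
# A hypothetical line that LINE_RE matches; groups 1-4, 6-8 and 10 become
# remotehost, user, request_time_str, request_line, status, bytes, referer
# and useragent in parse_line below:
#   10.1.2.3 - - [05/Oct/2010:17:57:03 -0700] "GET / HTTP/1.1" 200 532 "-" "curl/7.19"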


def parse_line(line, custom_columns):
  """Parse a line and return a dict of values.

  This parsing is ad hoc and hard-coded to the App Engine log format.

  Args:
    line: A line from the log file.
    custom_columns: Dict of colname: regexp.

  Returns:
    Dictionary of values. If this is a request log line, it contains the
    various values from it. If it's an applog line, it contains 'applog'
    with the value and, when present, 'applog_severity' with the severity;
    a missing severity means a continuation of the previous applog.

  Raises:
    LineParsingFailure: cannot parse the line.
  """
  results = {}
  if line.startswith('\t'):
    applog = line.strip()  # Remove leading tab and trailing newline.
    results['applog'] = applog
    # Some applogs have severity; some are continuations and do not.
    if (len(applog) > 2 and applog[1] == ':'
        and applog[0] >= '0' and applog[0] <= '9'):
      results['applog_severity'] = applog[0]
    for column, regexp in custom_columns.iteritems():
      match = re.search(regexp, applog)
      if match:
        # TODO: Consider allowing named groups.
        results[column] = match.group(1)
    return results

  matches = LINE_RE_COMPILED.match(line)
  if not matches:
    raise LineParsingFailure('Cannot parse line: %s' % line)

  results['remotehost'] = matches.group(1)
  results['user'] = matches.group(2)
  results['request_time_str'] = matches.group(3)
  results['request_line'] = matches.group(4)
  # matches.group(5) is an inner submatch of the quoted request_line.
  results['status'] = matches.group(6)
  results['bytes'] = matches.group(7)
  results['referer'] = matches.group(8)
  # matches.group(9) is an inner submatch of the quoted referer.
  results['useragent'] = matches.group(10)

  # split() (rather than split(' ')) returns [] for an empty remainder, so
  # 'host' is only set when --include_vhost or --include_all was used.
  more_values = line[len(matches.group(0)):].split()
  if more_values:
    results['host'] = more_values.pop(0)
  for pair in more_values:
    key, value = pair.split('=')
    results[key] = value
  if more_values and 'loading_request' not in results:
    # The user requested --include_all, so if this were a loading request
    # it would be in the line. But it's not in the line, so it's not a loading
    # request. Otherwise, we want it to be NULL for unknown.
    results['loading_request'] = '0'

  return results
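

# A minimal sketch of parse_line on an applog line (values illustrative):
#   parse_line('\t1:1286325423.3 Found 12 widgets\n',
#              {'widgets': 'Found ([0-9]+)'})
#   => {'applog': '1:1286325423.3 Found 12 widgets',
#       'applog_severity': '1', 'widgets': '12'}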


def insert_row(row_dict, insert_cursor, discard_duplicates):
  """Insert a row into the database, including conversions if necessary.

  Args:
    row_dict: Dictionary of data to insert.
    insert_cursor: A cursor to the database to execute the insert on.
    discard_duplicates: If duplicates should be discarded.

  Returns:
    Boolean, True if the row was successfully inserted into the db, else False.
  """
  columns = []
  values = []
  # Building the insert statement by string interpolation could be vulnerable
  # to sql injection, but this is an offline tool and the column names come
  # from our own parser; the values are bound as '?' parameters.
  for column, value in row_dict.iteritems():
    columns.append(column)
    values.append(value)

  try:
    statement = ('insert into requests (%s) values (%s)' %
                 (','.join(columns), ','.join(['?'] * len(columns))))
    insert_cursor.execute(statement, values)
  except sqlite3.IntegrityError:
    if not discard_duplicates:
      # This should never happen anyway as we shouldn't have specified
      # a primary key. But if someone re-uses the db, it might.
      raise
    return False
  return True
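

# For example, a row_dict of {'status': '200', 'bytes': '532'} yields a
# parameterized statement like (column order follows dict iteration order):
#   insert into requests (status,bytes) values (?,?)
# with ['200', '532'] bound to the placeholders.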


def parse_log_file(lines, connection, discard_duplicates, custom_columns):
  """Parse every input line, insert it into the db as appropriate.

  Args:
    lines: Iterator returning log lines to parse.
    connection: Sqlite3 connection with the requests table created.
    discard_duplicates: If it should attempt to discard duplicate log lines.
    custom_columns: Dictionary, column name: regexp.

  Returns:
    request_count, insert_count: The number of requests seen, and the number
      inserted. (These will differ if discard_duplicates is true.)
  """
  result_dict = {}
  insert_cursor = connection.cursor()
  request_count = 0
  insert_count = 0
  applog_severity = ''
  for line in lines:
    parsed_line = parse_line(line, custom_columns)
    if 'applog' in parsed_line:
      # TODO: Discard the first severity 0 applog line, which seems to be
      # a variant of the requestlog line.
      applog = parsed_line.pop('applog')
      applog_severity = parsed_line.pop('applog_severity', applog_severity)
      result_dict['applog'] = result_dict.get('applog', '') + '\n' + applog
      if applog_severity:
        severity = 'applog%s' % applog_severity
        result_dict[severity] = result_dict.get(severity, '') + '\n' + applog
      # Everything else is a custom column. Last one wins.
      result_dict.update(parsed_line)
      continue
    # At this point we've reached a new request line. Write the old one.
    if request_count % 100 == 0:
      print '.',
      sys.stdout.flush()
    request_count += 1
    if result_dict:
      inserted = insert_row(result_dict, insert_cursor, discard_duplicates)
      if inserted:
        insert_count += 1
    result_dict = parsed_line
    applog_severity = ''
    if discard_duplicates:
      result_dict['request_log'] = line

  # Add the last row.
  if result_dict:
    inserted = insert_row(result_dict, insert_cursor, discard_duplicates)
    if inserted:
      insert_count += 1

  connection.commit()
  insert_cursor.close()
  print ''
  return request_count, insert_count


def parse_arguments():
  """Parse arguments from sys.argv; exit if they are no good."""
  usage = 'usage: %prog [options] input_filename(s)'
  parser = optparse.OptionParser(usage=usage)
  parser.add_option('--db', dest='db', help='Filename of sqlite3 database.')
  parser.add_option('--discard_duplicates', dest='discard_duplicates',
                    action='store_true', default=False,
                    help='Discard duplicate request rows.')
  parser.add_option('--custom_column', dest='custom_columns', action='append',
                    help='Custom column (can be multiple); format is '
                    'name:regexp. Run across app logs.')

  options, args = parser.parse_args()

  errors = []
  if not args:
    errors.append('At least one input_filename is required.')
  required_flags = ('db',)
  for flag in required_flags:
    if not getattr(options, flag):
      errors.append('--%s is required' % flag)
  if errors:
    parser.print_help()
    print '\nErrors:'
    print '  ' + '\n  '.join(errors)
    sys.exit(1)

  input_filenames = args

  return options, input_filenames


def main():
  """Main function. Parse args then run over the log files."""
  options, input_filenames = parse_arguments()
  if options.custom_columns:
    custom_columns = dict(value.split(':', 1)
                          for value in options.custom_columns)
  else:
    custom_columns = {}

  connection = create_database(options.db, options.discard_duplicates,
                               custom_columns)

  request_count_total = 0
  insert_count_total = 0
  for filename in input_filenames:
    print 'Parsing %s' % filename
    f = open(filename, 'r')
    request_count, insert_count = parse_log_file(f, connection,
                                                 options.discard_duplicates,
                                                 custom_columns)
    f.close()
    request_count_total += request_count
    insert_count_total += insert_count

  print 'Done! Parsed %d requests; skipped %d duplicate rows.' % (
      request_count_total, request_count_total - insert_count_total)


if __name__ == '__main__':
  main()
453 main()