3 # Copyright 2010 Google Inc.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
18 """Tool to parse appcfg request_logs to a sqlite database for further analysis.
22 # Download request logs (with applogs) into requests.txt
23 appcfg.py request_logs --severity 0 --include_all <appdirectory> requests.txt
24 # Run the logparser to insert them into requests.db.
25 # (You can specify multiple input files.)
26 logparser.py --db requests.db requests.txt
27 # Query them using the sqlite3 interactive command line interface.
29 sqlite> -- what are the most common 404s?
30 sqlite> select distinct request_line, count(*) from requests
31 ...> where status == 404 group by request_line order by request_line desc;
32 sqlite> -- what requests see 'Deadline Exceeded'?
33 sqlite> select distinct request_line from requests
34 ...> where applog like '%DeadlineExceeded%';
35 sqlite> -- How many loading requests were seen?
36 sqlite> select count(*) from requests where loading_request=1;
37 sqlite> -- What was the average cpm across all pages?
38 sqlite> select sum(cpm_usd)/count(cpm_usd) from requests;
41 Contents of the Database Table:
43 The database contains request logs parsed from the downloaded log. That log
44 contains Apache Combined Log Format* lines followed by zero or more
45 tab-indented logging statements from the application.
47 The default format of the request line is
48 "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\""
50 If you specify --include_vhost or --include_all, it becomes
51 "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\" \"%v\""
53 If you add --include_all, it is the same as above plus zero or more
54 of the following tags:
55 ms (elapsed request time)
59 queue_name (task queue name)
60 task_name (task queue task name)
61 loading_request=1 (if this is a loading request)
62 pending_ms (time in pending queue)
66 These are recorded in a sqlite database in a table 'requests', which
67 has the following columns: (Source noted in parentheses.)
75 useragent (%{User-agent}i)
88 TODO: Parse method, path, querystring, and protocol from request_line.
90 The additional logging lines are combined into the following columns:
91 applog, applog0, applog1, applog2, applog3, applog4
92 The first (applog) combines all severity levels; the others include only that
93 one severity level to allow for precise queries.
95 * http://httpd.apache.org/docs/1.3/mod/mod_log_config.html#formats
96 The above format is the particular format which App Engine outputs. This
97 log parser is not a general parser.
102 You can also specify regular expressions over an applog to create
103 one or more custom columns.
107 --custom_column "widgets:^1:[0-9.]+ Found ([0-9]+) widgets"
109 This will locate all Info logs (severity 1) which look like this:
111 1:1286325423.286856 Found 12 widgets
113 and pull out the "12" part. Now you can run a query like these:
115 sqlite> -- How many widgets were seen?
116 sqlite> select sum(widgets) from requests;
117 sqlite> -- What was the approximate processing time per widget?
118 sqlite> select sum(cpu_ms)/cast(sum(widgets) as float)
119 ...> from requests where widgets > '';
121 You can specify multiple --custom_column flags.
124 Handling Duplicate Items:
126 If --discard_duplicates is specified, the entire request log line is stored
127 in "request_log", as primary key. This allows you to run this tool multiple
128 times over several files to combine them--it'll only record the first one
129 seen. However, you may lose legitimate duplicate records; adding --include_all
130 will reduce the chance of legitimate duplicates occurring.
132 This is particularly useful as request_logs has no mechanism to download
133 all request logs and app logs items at once. A common pattern might be:
135 # Download all applogs.
136 appcfg.py request_logs --severity 0 --include_all <appdirectory> applogs.txt
137 # Download all requestlogs. There may be overlap with the applogs.
138 appcfg.py request_logs --include_all <appdirectory> requests.txt
139 # Run the logparser to insert them into requests.db.
140 logparser.py --discard_duplicates --db requests.db requests.txt applogs.txt
142 # pylint: disable-msg=C6409
class Error(Exception):
  """Base class for all errors raised by this module."""
class LineParsingFailure(Error):
  """Raised when a log line cannot be parsed."""
def create_database(db_filename, discard_duplicates, custom_columns):
  """Create a connection to the database, create needed tables.

  Args:
    db_filename: Path to the db file.
    discard_duplicates: If it should attempt to discard duplicate log lines.
    custom_columns: Dictionary, column name: regexp.

  Returns:
    Sqlite database connection, with tables created.
  """
  # isolation_level=None puts sqlite3 in autocommit mode, so every insert is
  # committed immediately and no explicit commit() calls are needed.
  connection = sqlite3.connect(db_filename, isolation_level=None)
  if discard_duplicates:
    # The entire request log line doubles as the primary key; a duplicate
    # insert then raises IntegrityError and is dropped by insert_row().
    key_column = ' request_log text primary key,\n'
  else:
    key_column = ''
  if custom_columns:
    custom_column_signature = (
        ', %s text\n' % ' text\n, '.join(custom_columns))
  else:
    custom_column_signature = ''
  # NOTE(review): part of this column list was reconstructed from the module
  # docstring; confirm it matches the tags emitted by parse_line().
  table_signature = (
      'create table requests (\n'
      '%s'
      ' remotehost text,\n'
      ' user text,\n'
      ' request_time_str text,\n'
      ' request_time DATETIME,\n'  # Not yet really implemented.
      ' request_line text,\n'
      ' status int,\n'
      ' bytes int,\n'
      ' referer text,\n'
      ' useragent text,\n'
      ' host text,\n'
      ' ms int,\n'
      ' cpu_ms int,\n'
      ' api_cpu_ms int,\n'
      ' cpm_usd float,\n'
      ' queue_name text,\n'
      ' task_name text,\n'
      ' loading_request boolean,\n'
      ' pending_ms int,\n'
      ' exit_code int,\n'
      ' throttle_code int,\n'
      ' app_engine_release text,\n'
      ' method text,\n'  # Not yet implemented.
      ' path text,\n'  # Not yet implemented.
      ' querystring text,\n'  # Not yet implemented.
      ' protocol text,\n'  # Not yet implemented.
      ' applog text,\n'
      ' applog0 text,\n'
      ' applog1 text,\n'
      ' applog2 text,\n'
      ' applog3 text,\n'
      ' applog4 text\n'
      '%s'
      ')') % (key_column, custom_column_signature)
  try:
    connection.execute(table_signature)
  except sqlite3.OperationalError as e:
    # Re-running over an existing database is expected and fine; anything
    # else gets logged. str(e) replaces the deprecated (and Python 3-removed)
    # e.message attribute.
    if 'already exists' not in str(e):
      logging.exception('Exception creating table:')
    # TODO: Check that the schema matches.
  return connection
# An ugly regex to match the start of the request log line:
# %h %l %u %t \"%r\" %>s
# %b \"%{Referer}i\" \"%{User-agent}i\""
# Raw strings replace the old plain literals: '\[' in a non-raw string is an
# invalid escape sequence (DeprecationWarning since Python 3.6); the resulting
# pattern text is byte-identical.
LINE_RE = (r'([^ ]+) - ([^ ]+) \[([^]]+)\] (-|"(\\.|[^"])*") ([^ ]+) '
           r'([^ ]+) (-|"(\\.|[^"])*") (-|"(\\.|[^"])*")')
LINE_RE_COMPILED = re.compile(LINE_RE)
def parse_line(line, custom_columns):
  """Parse a line and return a dict of values.

  This is very cheesy code and totally hard coded.

  Args:
    line: A line from the log file.
    custom_columns: Dict of colname: regexp.

  Returns:
    Dictionary of values. If this is a request log line, it contains the
    various values from it. If it's an applog line, it contains two values
    'applog' with the value and 'applog_severity' with the severity, or ''
    meaning a continuation of the previous applog.

  Raises:
    LineParsingFailure: cannot parse the line.
  """
  results = {}
  if line.startswith('\t'):
    applog = line.strip()  # remove leading tab and trailing newline.
    results['applog'] = applog
    # Some applogs have severity; some are continuations and do not.
    # A severity-tagged applog looks like "1:1286325423.28 message".
    if (len(applog) > 2 and applog[1] == ':'
        and applog[0] >= '0' and applog[0] <= '9'):
      results['applog_severity'] = applog[0]
    for column, regexp in custom_columns.items():
      match = re.search(regexp, applog)
      # TODO: Consider allowing named groups.
      if match:
        results[column] = match.group(1)
    return results

  matches = LINE_RE_COMPILED.match(line)
  if not matches:
    # Bug fix: the message used to be passed as two constructor arguments
    # ('Fail. %s', line) and was never actually interpolated.
    raise LineParsingFailure('Fail. %s' % line)
  results['remotehost'] = matches.group(1)
  results['user'] = matches.group(2)
  results['request_time_str'] = matches.group(3)
  results['request_line'] = matches.group(4)
  # matches.group(5) is an inner submatch of the quoted request; skipped.
  results['status'] = matches.group(6)
  results['bytes'] = matches.group(7)
  results['referer'] = matches.group(8)
  # matches.group(9) is an inner submatch of the quoted referer; skipped.
  results['useragent'] = matches.group(10)

  # Everything after the combined-log prefix is the optional vhost followed
  # by zero or more space-separated key=value tags from --include_all.
  more_values = line[len(matches.group(0)):].strip().split(' ')
  if more_values and '=' not in more_values[0]:
    results['host'] = more_values.pop(0)
  for pair in more_values:
    if not pair:
      continue
    # split('=', 1) keeps values that themselves contain '=' intact.
    key, value = pair.split('=', 1)
    results[key] = value
  if more_values and 'loading_request' not in results:
    # The user requested --include_all, so if this were a loading request
    # it would be in the line. But it's not in the line, so it's not a loading
    # request. Otherwise, we want it to be NULL for unknown.
    results['loading_request'] = '0'
  return results
def insert_row(row_dict, insert_cursor, discard_duplicates):
  """Insert a row into the database, including conversions if necessary.

  Args:
    row_dict: Dictionary of data to insert.
    insert_cursor: A cursor to the database to execute the insert on.
    discard_duplicates: If duplicates should be discarded.

  Returns:
    Boolean, True if the row was successfully inserted into the db, else False.
  """
  # Building the column list by string interpolation could be vulnerable to
  # sql injection, but this is an offline tool and the column names come from
  # the local command line / log format, not remote input. The values are
  # bound through '?' placeholders, which is safe.
  columns = []
  values = []
  for column, value in row_dict.items():
    columns.append(column)
    values.append(value)
  statement = ('insert into requests (%s) values (%s)' %
               (','.join(columns), ','.join(['?'] * len(columns))))
  try:
    insert_cursor.execute(statement, values)
  except sqlite3.IntegrityError:
    if not discard_duplicates:
      # This should never happen anyway as we shouldn't have specified
      # a primary key. But if someone re-uses the db, it might.
      logging.exception('Unexpected integrity error inserting a row:')
    return False
  return True
def parse_log_file(lines, connection, discard_duplicates, custom_columns):
  """Parse every input line, insert it into the db as appropriate.

  Args:
    lines: Iterator returning log lines to parse.
    connection: Sqlite3 connection with the requests table created.
    discard_duplicates: If it should attempt to discard duplicate log lines.
    custom_columns: Dictionary, column name: regexp.

  Returns:
    request_count, insert_count: The number of requests seen, and the number
    inserted. (Will differ if discard_duplicates is true.)
  """
  insert_cursor = connection.cursor()
  request_count = 0
  insert_count = 0
  # Severity of the most recent tagged applog; continuation lines inherit it.
  applog_severity = '0'
  # Accumulates the current request plus its applog lines until the next
  # request line (or end of input) triggers the insert.
  result_dict = {}
  for line in lines:
    parsed_line = parse_line(line, custom_columns)
    if 'applog' in parsed_line:
      # TODO: Discard the first severity 0 applog line, which seems to be
      # a variant of the requestlog line.
      applog = parsed_line.pop('applog')
      applog_severity = parsed_line.pop('applog_severity', applog_severity)
      result_dict['applog'] = result_dict.get('applog', '') + '\n' + applog
      severity = 'applog%s' % applog_severity
      result_dict[severity] = result_dict.get(severity, '') + '\n' + applog
      # Everything else is a custom column. Last one wins.
      result_dict.update(parsed_line)
    else:
      # At this point we've reached a new line. Write the old one.
      # Guard added: on the very first request line there is no previous
      # request, so don't insert the initial empty dict.
      if result_dict:
        request_count += 1
        if request_count % 100 == 0:
          # NOTE(review): original progress-report body not visible in this
          # chunk; a lazy %-style log line stands in for it.
          logging.info('Parsed %d requests.', request_count)
        inserted = insert_row(result_dict, insert_cursor, discard_duplicates)
        if inserted:
          insert_count += 1
      result_dict = parsed_line
      if discard_duplicates:
        result_dict['request_log'] = line
  # Flush the final request, which has no following line to trigger it.
  if result_dict:
    request_count += 1
    inserted = insert_row(result_dict, insert_cursor, discard_duplicates)
    if inserted:
      insert_count += 1
  insert_cursor.close()
  return request_count, insert_count
def parse_arguments():
  """Parse arguments from sys.argv; exit if they are no good.

  Returns:
    options, input_filenames: The parsed optparse options object and the
    list of positional input filenames.
  """
  usage = 'usage: %prog [options] input_filename(s)'
  parser = optparse.OptionParser(usage=usage)
  parser.add_option('--db', dest='db', help='Filename of sqlite3 database.')
  parser.add_option('--discard_duplicates', dest='discard_duplicates',
                    action='store_true', default=False,
                    help='Discard duplicate request rows.')
  parser.add_option('--custom_column', dest='custom_columns', action='append',
                    help='Custom column (can be multiple); format is '
                         'name:regexp. Run across app logs.')
  options, args = parser.parse_args()

  errors = []
  if not args:
    errors.append('At least one input_filename is required.')
  required_flags = ('db',)
  for flag in required_flags:
    if not getattr(options, flag):
      errors.append('--%s is required' % flag)
  if errors:
    parser.print_usage()
    # print(...) with a single argument behaves identically on Python 2 and 3.
    print(' ' + '\n '.join(errors))
    sys.exit(1)

  input_filenames = args
  return options, input_filenames
426 """Main function. Parse args then run over the log files."""
427 options
, input_filenames
= parse_arguments()
428 if options
.custom_columns
:
429 custom_columns
= dict(value
.split(':', 1)
430 for value
in options
.custom_columns
)
434 connection
= create_database(options
.db
, options
.discard_duplicates
,
437 request_count_total
= 0
438 insert_count_total
= 0
439 for filename
in input_filenames
:
440 print 'Parsing %s' % filename
441 f
= open(filename
, 'r')
442 request_count
, insert_count
= parse_log_file(f
, connection
,
443 options
.discard_duplicates
,
445 request_count_total
+= request_count
446 insert_count_total
+= insert_count
448 print 'Done! Parsed %d requests. %d duplicate rows.' % (
449 request_count_total
, request_count_total
- insert_count_total
)
if __name__ == '__main__':
  # Script entry point; restored the main() call truncated in this chunk.
  main()