# App Engine Python SDK version 1.8.9
# [gae.git] / python / google / appengine / ext / analytics / process.py
# blob 6a7a324b0f3825d02c69cc7ad08bcf91d85b565f
1 #!/usr/bin/env python
3 # Copyright 2007 Google Inc.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
21 """Compute statistics on Appstats data and prepare data for UI.
23 Appstats data is processed to compute information necessary for
24 charts. For e.g., for the main page, request counts in different
25 latency bins are computed, and the information is summarized in
26 a manner convenient for the UI.
27 """
30 try:
31 import json
32 except ImportError:
33 import simplejson as json
34 import math
37 class _ExponentialBinner(object):
38 """Bins data in intervals with exponentially increasing sizes.
40 Helps with preparation of histograms. E.g., histograms that
41 plot number of requests within each latency range.
42 """
44 def __init__(self, start, exponent):
45 """Initialize parameters for histograms.
47 E.g., start = 10, and exponent = 2 will bin data using intervals
48 [0, 10], [11, 20], [21, 40], and so on.
50 Args:
51 start: upper bound of first interval
52 exponent: ratio of upper bounds of two consecutive intervals.
53 """
54 self.start = start
55 self.exponent = exponent
57 def Bin(self, data):
58 """Compute counts of data items in various bins.
60 Args:
61 data: sorted list of integer or long data items.
62 Returns:
63 A list, with each element being count of data items in each bin
64 """
65 bincounts = []
67 numbins = self._BinIndex(data[-1]) + 1
69 for bin_index in range(numbins):
70 bincounts.append(0)
71 for item in data:
72 bin_index = self._BinIndex(item)
73 bincounts[bin_index] += 1
74 return bincounts
76 def Intervals(self, numbins):
77 """Returns the upper bounds of intervals under exponential binning.
79 E.g., if intervals are [0, 10], [11, 20], [21, 40], [41, 80], this
80 function returns the list [10, 20, 40, 80].
82 Args:
83 numbins: Number of bins.
84 Returns:
85 A list which contains upper bounds of each interval range.
86 """
87 if numbins < 1:
88 return []
89 intervals = [self.start]
90 for _ in range(1, numbins):
91 intervals.append(intervals[-1] * self.exponent)
92 return intervals
94 def _BinIndex(self, item):
95 """Get bin to which item belongs.
97 E.g., if intervals are [0, 10], [10, 20], [20, 40], [40, 80],
98 _BinIndex(25) is 2, and _BinIndex(50) is 3.
99 Bin numbers are 0-based.
101 Args:
102 item: data item
104 Returns:
105 bin to which item belongs, assuming 0-based binning.
110 if item <= self.start:
112 return 0
113 else:
119 itembin = math.ceil(math.log(float(item)/self.start, self.exponent))
120 return int(itembin)
def URLFreqRespTime(urlstatsdict):
  """Computes request counts in different response time ranges for histograms.

  Args:
    urlstatsdict: A dictionary. Key is url path. Value is appropriate
      URLStats object which contains appstats statistics for the path.

  Returns:
    resptime_byfreq: A list of 3-tuples, one per URL, sorted in descending
      order of the number of requests seen by each URL. The elements of each
      tuple are (i) URL path; (ii) sorted list of response times of all
      requests corresponding to that URL; and (iii) a list of request counts
      in each latency bin for that URL.
    intervals: A list of latency ranges that requests of each URL are
      binned into. Each latency range is represented by the upper end of the
      range. E.g., if we are binning requests into latency ranges
      [0, 10], [11, 20], [21, 40], ... [1601, 3200]. Then, intervals is
      represented by the list [10, 20, 40,...,3200]
  """
  resptime = []
  # Latency bins [0, 10], [11, 20], [21, 40], ...: upper bounds double.
  binner = _ExponentialBinner(10, 2)
  maxbins = 0
  # .items() (rather than Py2-only iteritems) works on Python 2 and 3.
  for url, urlstats in urlstatsdict.items():
    urlresptime = sorted(urlstats.GetResponseTimeList())
    urlbin = binner.Bin(urlresptime)
    # Track the longest bin list so the interval labels cover every URL.
    maxbins = max(maxbins, len(urlbin))
    resptime.append((url, urlresptime, urlbin))
  # Most frequently requested URLs first.
  resptime.sort(key=lambda triple: len(triple[1]), reverse=True)
  intervals = binner.Intervals(maxbins)
  return resptime, intervals
160 def _GetPercentile(sortedlist, percent):
161 """Returns a desired percentile value of a sorted list of numbers.
163 E.g., if a list of request latencies is
164 [1, 4, 7, 14, 34, 89, 100, 123, 149, 345], and percent is 0.9, the result
165 is 149. If percent is 0.5 (median), result is 34.
167 Args:
168 sortedlist: A sorted list of integers, longs or floats.
169 percent: A fraction between 0 and 1 that indicates desired
170 percentile value. E.g., 0.9 means 90th percentile is desired.
171 Returns:
172 None if list is empty. Else, the desired percentile value.
174 if not sortedlist:
175 return None
180 k = int(math.ceil(len(sortedlist) * percent)) - 1
181 if k < 0:
184 k = 0
185 return sortedlist[k]
def _GetPercentileList(items, percentilelist):
  """Given a list, returns a list of desired percentile values.

  Args:
    items: A list of integers, longs or floats.
    percentilelist: A list of fractions, each between 0 and 1 that indicates
      desired percentile value. E.g., [0.1, 0.9] means 10th and 90th
      percentiles are desired.
  Returns:
    None if list is empty. Else, the list of desired percentile values.
  """
  if not items:
    return None
  # Sort once; each percentile lookup is then a constant-time index.
  ordered = sorted(items)
  results = []
  for fraction in percentilelist:
    results.append(_GetPercentile(ordered, fraction))
  return results
class RequestSummary(object):
  """Summarizes request statistics for UI.

  The class summarizes the timestamps, latencies and total rpc time of all
  requests of a given URL path. An object of this class will then be passed
  to the UI for display of the page that drills into specific a URL path.
  """

  def __init__(self):
    # Parallel lists, one entry per request of the URL path.
    self.timestamps = []       # request timestamps
    self.totaltimes = []       # total response latency per request
    self.totalrpctimes = []    # total time spent in RPCs per request
def Summary(urlstats):
  """Summarize relevant statistics for requests.

  Args:
    urlstats: A list of URLStat objects, which provide statistics for
      each request of a given URL path.

  Returns:
    A RequestSummary object which provides the timestamps, latencies
    and total rpc times for all requests of a given URL path. Each list
    is ordered in chronological order.
  """
  # The request list is newest-first; reverse it for chronological order.
  requests = list(reversed(urlstats.urlrequestlist))
  summary = RequestSummary()
  summary.timestamps = [req.timestamp for req in requests]
  summary.totaltimes = [req.totalresponsetime for req in requests]
  summary.totalrpctimes = [req.totalrpctime for req in requests]
  return summary
class RPCSummary(object):
  """Summarize RPC statistics for UI.

  The class summarizes information relevant to each RPC category
  such as the number of requests, number of calls, time spent in
  each RPC etc. There is one object per RPC category. Objects of
  this class will be passed to the UI for display of the page that
  drills into specific a URL path.
  """

  def __init__(self):
    self.requests = 0       # number of requests that used this RPC
    self.calls = 0          # total calls to this RPC across requests
    self.times = []         # time spent in this RPC, per request
    self.indices = []       # chronological index of each request
    self.stats = []         # per-request (calls, reads, writes, misses)
    self.summary_time = 0   # percentile summary of self.times
def SortedRPCSummaries(urlstats, summary_percentile):
  """Summarize RPC statistics of requests for UI.

  Args:
    urlstats: A list of URLStat objects, which provide statistics for
      each request of a given URL path.
    summary_percentile: Summarize the time spent in an RPC across
      different requests by this percentile value. RPCs are sorted in
      the decreasing order of this percentile value. E.g., 0.5 indicates
      RPC times are summarized and sorted by the median.

  Returns:
    A list of tuples. The first element of each tuple is an RPC category
    label. The second element is an RPCSummary object which summarizes
    statistics about that RPC category. Summarizing data in this form is
    convenient for rendering UI on the drill page, particularly for bar
    charts showing times spent in various RPCs across different requests.
    The list is sorted in decreasing order of the summary_percentile of time
    spent in that RPC. This is the order in which RPCs will be rendered in
    the UI.
  """
  rpcsummary = {}
  # Iterate oldest-first so `indices` line up with chronological order.
  for (index, request) in enumerate(reversed(urlstats.urlrequestlist)):
    for rpc in request.rpcstatslist:
      label = rpc.GetLabel()
      if label not in rpcsummary:
        rpcsummary[label] = RPCSummary()
      summary = rpcsummary[label]
      summary.requests += 1
      summary.calls += rpc.numcalls
      summary.times.append(rpc.time)
      summary.indices.append(index)
      # Failed gets are counted separately from successful reads.
      successful_reads = len(rpc.keys_read) - len(rpc.keys_failed_get)
      summary.stats.append((rpc.numcalls,
                            successful_reads,
                            len(rpc.keys_written),
                            len(rpc.keys_failed_get)))

  # Summarize each RPC category by the requested percentile of its times.
  for label in rpcsummary:
    rpcsummary[label].summary_time = _GetPercentile(
        sorted(rpcsummary[label].times), summary_percentile)
  # .items() (rather than Py2-only iteritems) works on Python 2 and 3.
  rpcsummary_sort = sorted(rpcsummary.items(),
                           key=lambda pair: pair[1].summary_time,
                           reverse=True)
  return rpcsummary_sort
def RPCVariation(reqsummary, rpcsummaries):
  """Generates desired percentiles of times spent in each RPC.

  Produces results useful for a candlestick chart that shows variation
  in time spent across different RPCs. Currently, the candlestick chart
  shows the 10th, 25th, 75th and 90th percentiles of RPC times.

  Args:
    reqsummary: A reqsummary object.
    rpcsummaries: a list of tuples generated by the SortedRPCSummaries
      function. In each tuple, the first element is an RPC category name
      and the second element is a dictionary containing information
      about the RPC category, particularly time spent in that RPC category
      across URL requests.

  Returns:
    A list of lists. Each inner list contains delay percentiles for each RPC.
  """
  rpc_variation = []
  markers = [0.1, 0.25, 0.75, 0.9]

  def _AppendRow(label, times):
    """Appends [label, percentiles...] for non-empty time lists."""
    percentiles = _GetPercentileList(times, markers)
    # _GetPercentileList returns None for empty input; skip such rows
    # instead of crashing on None.insert.
    if percentiles is not None:
      rpc_variation.append([label] + percentiles)

  _AppendRow('Total', reqsummary.totaltimes)
  _AppendRow('TotalRPCTime', reqsummary.totalrpctimes)
  for label, summary in rpcsummaries:
    _AppendRow(label, summary.times)
  return rpc_variation
def SplitByKind(freqdict):
  """Arranges entity/entity group access counts by their kind.

  Args:
    freqdict: a dict with keys corresponding to entities or entity
      groups. Value is a dict with 3 keys, 'read', 'write', 'missed',
      the values of which correspond to the appropriate counts for
      that entity.

  Returns:
    kinds_bycount: A list of <kind, entitiesOfKind> tuples, one per entity
      (group) kind sorted in decreasing order of number of entities
      (entity groups) of each kind. entitiesOfKind is a list of
      tuples, one per entity (group) of that kind, sorted in decreasing order
      of the access count of that entity (group). Each tuple consists of the
      name of the entity (group), along with read, write and miss counts.
    maxcount: The maximum access count seen by any entity of any kind.
  """
  kinds = {}
  for kind_fullname, freq in freqdict.items():
    # Keys are '<kind>,<name>'; split only on the first comma so entity
    # names that themselves contain commas are preserved.
    (kind, fullname) = kind_fullname.split(',', 1)
    kinds.setdefault(kind, []).append((fullname, freq['read'],
                                       freq['write'], freq['miss']))

  # Within a kind, order entities by combined read+write access count.
  for kind in kinds:
    kinds[kind].sort(key=lambda ent: ent[1] + ent[2], reverse=True)

  # Order kinds by how many distinct entities of that kind were seen.
  # (.items() rather than Py2-only iteritems, consistent with above.)
  kinds_bycount = sorted(kinds.items(),
                         key=lambda pair: len(pair[1]), reverse=True)
  maxcount = 0
  for kind in kinds:
    # Each per-kind list is sorted, so element 0 has the largest count.
    maxcount = max(maxcount, kinds[kind][0][1] + kinds[kind][0][2])
  return kinds_bycount, maxcount
class Drill(object):
  """Data structures to be passed to UI for rendering drill page."""

  def __init__(self):
    self.reqsummary = None      # RequestSummary for this URL path
    self.rpcsummaries = []      # (label, RPCSummary) tuples
    self.groupcounts = []       # entity-group counts, arranged by kind
    self.maxgroupcount = None   # largest entity-group access count
    self.entitycounts = []      # entity counts, arranged by kind
    self.maxentitycount = None  # largest entity access count
    self.rpc_variation = []     # per-RPC latency percentile rows

  def _ToJsonDrill(self):
    """Encodes data for drill page in JSON for UI.

    Returns:
      drill_json: A dictionary representation of the class with attributes
        encoded into JSON as necessary for the UI.
    """
    drill_json = self.__dict__.copy()
    # RPCSummary objects need a custom encoder to become JSON.
    drill_json['rpcsummaries'] = [
        (label, summ.requests, summ.calls,
         json.dumps(summ, cls=_RPCSummaryEncoder))
        for (label, summ) in self.rpcsummaries]
    drill_json['groupcounts'] = [
        (kind, len(counts), json.dumps(counts))
        for (kind, counts) in self.groupcounts]
    drill_json['entitycounts'] = [
        (kind, len(counts), json.dumps(counts))
        for (kind, counts) in self.entitycounts]
    return drill_json
433 class _RPCSummaryEncoder(json.JSONEncoder):
434 """JSON encoder for class RPCSummary."""
436 def default(self, obj):
437 """Arranges entity/entity group access counts by their kind.
439 Args:
440 obj: an object whose JSON encoding is desired.
441 Returns:
442 JSON encoding of obj.
444 if not isinstance(obj, RPCSummary):
445 return json.JSONEncoder.default(self, obj)
446 return obj.__dict__
def DrillURL(urlstats):
  """Analyzes URL statistics and generates data for drill page.

  Master function that calls all necessary functions to compute
  various data structures needed for rendering the drill page
  which shows details about a particular URL path.

  Args:
    urlstats: An URLStats object which holds appstats information
      about all requests of an URL path.
  Returns:
    A dictionary representation of a Drill object with attributes
    encoded into JSON as necessary for the UI (see Drill._ToJsonDrill).
  """
  drill = Drill()
  drill.reqsummary = Summary(urlstats)
  # Summarize and rank RPC categories by their 90th-percentile time.
  drill.rpcsummaries = SortedRPCSummaries(urlstats, 0.9)
  drill.rpc_variation = RPCVariation(drill.reqsummary, drill.rpcsummaries)
  groupcounts = urlstats.EntityGroupCount()
  drill.groupcounts, drill.maxgroupcount = SplitByKind(groupcounts)
  entitycounts = urlstats.EntityCount()
  drill.entitycounts, drill.maxentitycount = SplitByKind(entitycounts)
  drill_json = drill._ToJsonDrill()
  return drill_json