3 # Copyright 2007 Google Inc.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
21 """Compute statistics on Appstats data and prepare data for UI.
23 Appstats data is processed to compute information necessary for
24 charts. For e.g., for the main page, request counts in different
25 latency bins are computed, and the information is summarized in
26 a manner convenient for the UI.
33 import simplejson
as json
37 class _ExponentialBinner(object):
38 """Bins data in intervals with exponentially increasing sizes.
40 Helps with preparation of histograms. E.g., histograms that
41 plot number of requests within each latency range.
44 def __init__(self
, start
, exponent
):
45 """Initialize parameters for histograms.
47 E.g., start = 10, and exponent = 2 will bin data using intervals
48 [0, 10], [11, 20], [21, 40], and so on.
51 start: upper bound of first interval
52 exponent: ratio of upper bounds of two consecutive intervals.
55 self
.exponent
= exponent
58 """Compute counts of data items in various bins.
61 data: sorted list of integer or long data items.
63 A list, with each element being count of data items in each bin
67 numbins
= self
._BinIndex
(data
[-1]) + 1
69 for bin_index
in range(numbins
):
72 bin_index
= self
._BinIndex
(item
)
73 bincounts
[bin_index
] += 1
76 def Intervals(self
, numbins
):
77 """Returns the upper bounds of intervals under exponential binning.
79 E.g., if intervals are [0, 10], [11, 20], [21, 40], [41, 80], this
80 function returns the list [10, 20, 40, 80].
83 numbins: Number of bins.
85 A list which contains upper bounds of each interval range.
89 intervals
= [self
.start
]
90 for _
in range(1, numbins
):
91 intervals
.append(intervals
[-1] * self
.exponent
)
94 def _BinIndex(self
, item
):
95 """Get bin to which item belongs.
97 E.g., if intervals are [0, 10], [10, 20], [20, 40], [40, 80],
98 _BinIndex(25) is 2, and _BinIndex(50) is 3.
99 Bin numbers are 0-based.
105 bin to which item belongs, assuming 0-based binning.
110 if item
<= self
.start
:
119 itembin
= math
.ceil(math
.log(float(item
)/self
.start
, self
.exponent
))
def URLFreqRespTime(urlstatsdict):
  """Computes request counts in different response time ranges for histograms.

  Args:
    urlstatsdict: A dictionary. Key is url path. Value is appropriate
      URLStats object which contains appstats statistics for the path.

  Returns:
    resptime_byfreq: A list of 3-tuples, one per URL, sorted in descending
      order of the number of requests seen by each URL. The elements of each
      tuple are (i) URL path; (ii) sorted list of response times of all
      requests corresponding to that URL; and (iii) a list of request counts
      in each latency bin for that URL.
    intervals: A list of latency ranges that requests of each URL are
      binned into. Each latency range is represented by the upper end of the
      range. E.g., if we are binning requests into latency ranges
      [0, 10], [11, 20], [21, 40], ... [1601, 3200]. Then, intervals is
      represented by the list [10, 20, 40,...,3200]
  """
  resptime = []
  binner = _ExponentialBinner(10, 2)
  maxbins = 0
  # items() is available on both Python 2 and Python 3 dictionaries.
  for url, urlstats in urlstatsdict.items():
    urlresptime = sorted(urlstats.GetResponseTimeList())
    # Bin() looks at the last item of its input, so a URL with no recorded
    # response times is given an empty bin list instead.
    urlbin = binner.Bin(urlresptime) if urlresptime else []
    # Track the widest histogram so every URL shares one interval list.
    maxbins = max(maxbins, len(urlbin))
    resptime.append((url, urlresptime, urlbin))

  # The number of response times is the number of requests for the URL.
  resptime.sort(key=lambda triple: len(triple[1]), reverse=True)
  intervals = binner.Intervals(maxbins)
  return resptime, intervals
160 def _GetPercentile(sortedlist
, percent
):
161 """Returns a desired percentile value of a sorted list of numbers.
163 E.g., if a list of request latencies is
164 [1, 4, 7, 14, 34, 89, 100, 123, 149, 345], and percent is 0.9, the result
165 is 149. If percent is 0.5 (median), result is 34.
168 sortedlist: A sorted list of integers, longs or floats.
169 percent: A fraction between 0 and 1 that indicates desired
170 percentile value. E.g., 0.9 means 90th percentile is desired.
172 None if list is empty. Else, the desired percentile value.
180 k
= int(math
.ceil(len(sortedlist
) * percent
)) - 1
def _GetPercentileList(items, percentilelist):
  """Given a list, returns a list of desired percentile values.

  Args:
    items: A list of integers, longs or floats. It need not be sorted;
      a sorted copy is made here.
    percentilelist: A list of fractions, each between 0 and 1 that indicates
      desired percentile value. E.g., [0.1, 0.9] means 10th and 90th
      percentiles are desired.

  Returns:
    None if list is empty. Else, the list of desired percentile values,
    in the same order as percentilelist.
  """
  if not items:
    return None

  # Sort once and reuse the result for every requested percentile.
  ordered = sorted(items)
  return [_GetPercentile(ordered, fraction) for fraction in percentilelist]
class RequestSummary(object):
  """Summarizes request statistics for UI.

  The class summarizes the timestamps, latencies and total rpc time of all
  requests of a given URL path. An object of this class will then be passed
  to the UI for display of the page that drills into a specific URL path.
  """

  def __init__(self):
    # Summary() appends one value per request to each list, so entries at
    # the same position in the three lists describe the same request.
    self.timestamps = []
    self.totaltimes = []
    self.totalrpctimes = []
def Summary(urlstats):
  """Summarize relevant statistics for requests.

  Args:
    urlstats: An object whose urlrequestlist attribute holds one entry per
      request of a given URL path. Each entry provides timestamp,
      totalresponsetime and totalrpctime attributes.

  Returns:
    A RequestSummary object which provides the timestamps, latencies
    and total rpc times for all requests of a given URL path. Each list
    is ordered in chronological order.
  """
  result = RequestSummary()

  # urlrequestlist is walked back to front; presumably it is stored
  # newest-first, which is what makes the output chronological.
  for req in reversed(urlstats.urlrequestlist):
    result.timestamps.append(req.timestamp)
    result.totaltimes.append(req.totalresponsetime)
    result.totalrpctimes.append(req.totalrpctime)
  return result
class RPCSummary(object):
  """Summarize RPC statistics for UI.

  The class summarizes information relevant to each RPC category
  such as the number of requests, number of calls, time spent in
  each RPC etc. There is one object per RPC category. Objects of
  this class will be passed to the UI for display of the page that
  drills into a specific URL path.
  """

  def __init__(self):
    # The attributes below are filled in by SortedRPCSummaries().
    # Incremented once for every RPC stats entry carrying this label.
    self.requests = 0
    # Running sum of the numcalls of those entries.
    self.calls = 0
    # Time of each entry, in the order the entries were visited.
    self.times = []
    # Position (within the reversed request list) of the request that
    # each entry of times came from.
    self.indices = []
    # One tuple per entry: (numcalls, successful reads, keys written,
    # failed gets).
    self.stats = []
    # Percentile of times by which RPC categories are ranked.
    self.summary_time = 0
def SortedRPCSummaries(urlstats, summary_percentile):
  """Summarize RPC statistics of requests for UI.

  Args:
    urlstats: An object whose urlrequestlist attribute holds one entry per
      request of a given URL path. Each entry exposes rpcstatslist, the
      per-RPC-category statistics of that request.
    summary_percentile: Summarize the time spent in an RPC across
      different requests by this percentile value. RPCs are sorted in
      the decreasing order of this percentile value. E.g., 0.5 indicates
      RPC times are summarized and sorted by the median.

  Returns:
    A list of tuples. The first element of each tuple is an RPC category
    label. The second element is an RPCSummary object which summarizes
    statistics about that RPC category. Summarizing data in this form is
    convenient for rendering UI on the drill page, particularly for bar
    charts showing times spent in various RPCs across different requests.
    The list is sorted in decreasing order of the summary_percentile of time
    spent in that RPC.
  """
  rpcsummary = {}

  # Requests are visited in the same (reversed) order that Summary() uses,
  # so each stored index lines up with the request lists built there.
  for index, request in enumerate(reversed(urlstats.urlrequestlist)):
    for rpc in request.rpcstatslist:
      label = rpc.GetLabel()
      if label not in rpcsummary:
        rpcsummary[label] = RPCSummary()
      summary = rpcsummary[label]
      summary.requests += 1
      summary.calls += rpc.numcalls
      summary.times.append(rpc.time)
      summary.indices.append(index)
      # Reads that did not end up in keys_failed_get.
      successful_reads = len(rpc.keys_read) - len(rpc.keys_failed_get)
      summary.stats.append((rpc.numcalls,
                            successful_reads,
                            len(rpc.keys_written),
                            len(rpc.keys_failed_get)))

  # Attach to every category the percentile it will be ranked by.
  for summary in rpcsummary.values():
    summary.summary_time = _GetPercentile(sorted(summary.times),
                                          summary_percentile)

  # items() is available on both Python 2 and Python 3 dictionaries.
  rpcsummary_sort = sorted(rpcsummary.items(),
                           key=lambda pair: pair[1].summary_time,
                           reverse=True)
  return rpcsummary_sort
def RPCVariation(reqsummary, rpcsummaries):
  """Generates desired percentiles of times spent in each RPC.

  Produces results useful for a candlestick chart that shows variation
  in time spent across different RPCs. Currently, the candlestick chart
  shows the 10th, 25th, 75th and 90th percentiles of RPC times.

  Args:
    reqsummary: A RequestSummary object; its totaltimes and totalrpctimes
      lists are read.
    rpcsummaries: a list of tuples generated by the SortedRPCSummaries
      function. In each tuple, the first element is an RPC category name
      and the second element is an object whose times attribute lists the
      time spent in that RPC category.

  Returns:
    A list of lists. Each inner list starts with a label ('Total',
    'TotalRPCTime', then one RPC category name per tuple of rpcsummaries,
    in the given order) followed by the percentiles of the corresponding
    times. A series without any times contributes its label only.
  """
  markers = [0.1, 0.25, 0.75, 0.9]

  series = [('Total', reqsummary.totaltimes),
            ('TotalRPCTime', reqsummary.totalrpctimes)]
  series.extend((pair[0], pair[1].times) for pair in rpcsummaries)

  rpc_variation = []
  for label, times in series:
    # _GetPercentileList is documented to return None for an empty list;
    # substitute an empty list so that building the row cannot fail.
    percentiles = _GetPercentileList(times, markers) or []
    rpc_variation.append([label] + list(percentiles))
  return rpc_variation
def SplitByKind(freqdict):
  """Arranges entity/entity group access counts by their kind.

  Args:
    freqdict: a dict with keys corresponding to entities or entity
      groups, written as '<kind>,<name>'. Value is a dict with 3 keys,
      'read', 'write', 'miss', the values of which correspond to the
      appropriate counts for that entity (group).

  Returns:
    kinds_bycount: A list of <kind, entitiesOfKind> tuples, one per entity
      (group) kind sorted in decreasing order of number of entities
      (entity groups) of each kind. entitiesOfKind is a list of
      tuples, one per entity (group) of that kind, sorted in decreasing order
      of the access count of that entity (group). Each tuple consists of the
      name of the entity (group), along with read, write and miss counts.
    maxcount: The maximum access count seen by any entity of any kind;
      0 when freqdict is empty. The access count is reads plus writes;
      misses are not included.

  Raises:
    ValueError: if a key of freqdict contains no comma.
  """
  kinds = {}
  for kind_fullname, freq in freqdict.items():
    # Split on the first comma only: the name part may itself contain
    # commas and must stay intact.
    kind, fullname = kind_fullname.split(',', 1)
    kinds.setdefault(kind, []).append(
        (fullname, freq['read'], freq['write'], freq['miss']))

  maxcount = 0
  for entities in kinds.values():
    # Most-accessed entity first within each kind.
    entities.sort(key=lambda ent: ent[1] + ent[2], reverse=True)
    # After sorting, the head of the list is the kind's busiest entity.
    maxcount = max(maxcount, entities[0][1] + entities[0][2])

  # items() is available on both Python 2 and Python 3 dictionaries.
  kinds_bycount = sorted(kinds.items(),
                         key=lambda pair: len(pair[1]), reverse=True)
  return kinds_bycount, maxcount
398 """Data structures to be passed to UI for rendering drill page."""
401 self
.reqsummary
= None
402 self
.rpcsummaries
= []
403 self
.groupcounts
= []
404 self
.maxgroupcount
= None
405 self
.entitycounts
= []
406 self
.maxentitycount
= None
407 self
.rpc_variation
= []
409 def _ToJsonDrill(self
):
410 """Encodes data for drill page in JSON for UI.
413 drill_json: A dictionary representation of the class with attributes
414 encoded into JSON as necessary for the UI.
416 drill_json
= dict(self
.__dict
__)
420 drill_json
['rpcsummaries'] = [(l
, s
.requests
, s
.calls
,
421 json
.dumps(s
, cls
=_RPCSummaryEncoder
))
422 for (l
, s
) in self
.rpcsummaries
]
426 drill_json
['groupcounts'] = [(k
, len(v
), json
.dumps(v
))
427 for (k
, v
) in self
.groupcounts
]
428 drill_json
['entitycounts'] = [(k
, len(v
), json
.dumps(v
))
429 for (k
, v
) in self
.entitycounts
]
class _RPCSummaryEncoder(json.JSONEncoder):
  """JSON encoder for class RPCSummary."""

  def default(self, obj):
    """Provides a JSON-serializable form of an RPCSummary object.

    Args:
      obj: an object whose JSON encoding is desired.

    Returns:
      JSON encoding of obj.
    """
    if isinstance(obj, RPCSummary):
      # NOTE(review): the statement handling RPCSummary objects is not
      # visible in the text under review; returning the attribute
      # dictionary is presumed. Confirm against the original file.
      return obj.__dict__
    # Anything else is left to the base class, which raises TypeError.
    return json.JSONEncoder.default(self, obj)
def DrillURL(urlstats):
  """Analyzes URL statistics and generates data for drill page.

  Master function that calls all necessary functions to compute
  various data structures needed for rendering the drill page
  which shows details about a particular URL path.

  Args:
    urlstats: An URLStats object which holds appstats information
      about all requests of an URL path.

  Returns:
    The value produced by Drill._ToJsonDrill() for the populated Drill
    object, i.e. its attributes encoded into JSON as necessary for the UI.
  """
  drill = Drill()

  # Per-request series first; RPCVariation below consumes both results.
  drill.reqsummary = Summary(urlstats)
  # RPC categories are ranked by the 90th percentile of their times.
  drill.rpcsummaries = SortedRPCSummaries(urlstats, 0.9)
  drill.rpc_variation = RPCVariation(drill.reqsummary, drill.rpcsummaries)

  # Access counts, first per entity group and then per entity.
  drill.groupcounts, drill.maxgroupcount = SplitByKind(
      urlstats.EntityGroupCount())
  drill.entitycounts, drill.maxentitycount = SplitByKind(
      urlstats.EntityCount())

  return drill._ToJsonDrill()