chrome/tools/history-viz.py

   1 #!/usr/bin/python
   2 # Copyright (c) 2009 The Chromium Authors. All rights reserved.
   3 # Use of this source code is governed by a BSD-style license that can be
   4 # found in the LICENSE file.
   5
   6 """Process a History database and dump a .dot file suitable for GraphViz.
   7
   8 This is useful for debugging history redirect flows.
   9
  10 An example run of this program:
  11   python /src/history-viz.py History > foo.dot
  12   /c/Program\ Files/Graphviz2.18/bin/dot -Tpng foo.dot -o foo.png
  13 """
  14
  15 import struct
  16 import subprocess
  17 import sys
  18 import urlparse
  19
  20 class URL:
  21   """Represents a broken-down URL from our most visited database."""
  22
  23   def __init__(self, id, url):
  24     """Initialize a new URL object.  |id| is the database id of the URL."""
  25     self.id = id
  26     self.url = url
  27     scheme, loc, path, query, fragment = urlparse.urlsplit(url)
  28     if scheme == 'http':
  29       scheme = ''  # Shorten for display purposes.
  30     if len(scheme) > 0:
  31       scheme += '://'
  32     self.host = scheme + loc
  33     self.path = path
  34
  35     extra = ''
  36     if len(query) > 0:
  37       extra += '?' + query
  38     if len(fragment) > 0 or url.find('#') > 0:
  39       extra += '#' + fragment
  40     self.extra = extra
  41
  42   def PrettyPrint(self, include_host=True, include_path=True):
  43     """Pretty-print this URL in a form more suitable for the graph.
  44
  45     This will elide very long paths and potentially puts newlines between parts
  46     of long components.  include_host and include_path determine whether to
  47     include the host and path in the output.
  48
  49     Returns: the pretty-printed string."""
  50     MAX_LEN = 30  # Maximum length of a line in the output.
  51     parts = []
  52     if include_host:
  53       parts.append(self.host)
  54     if include_path:
  55       parts.append(self.path)
  56     parts.append(self.extra)
  57     lines = []
  58     line = ''
  59     for part in parts:
  60       if len(part) > MAX_LEN:
  61         part = part[0:MAX_LEN-3] + '...'
  62       if len(line)+len(part) > MAX_LEN:
  63         lines.append(line)
  64         line = ''
  65       line += part
  66     if len(line) > 0:
  67       lines.append(line)
  68     return '\n'.join(lines)
  69
  70 class Edge:
  71   """Represents an edge in the history graph, connecting two pages.
  72
  73   If a link is traversed twice, it is one Edge with two entries in
  74   the .transitions array."""
  75   def __init__(self, src, dst):
  76     self.src = src
  77     self.dst = dst
  78     self.transitions = []
  79
  80   def Transitions(self):
  81     """Return a dictionary mapping transition type -> occurences."""
  82     all = {}
  83     for trans in self.transitions:
  84       all[trans] = all.get(trans, 0) + 1
  85       # We currently don't use the chain type.
  86       # TODO(evanm): make this a command-line option.
  87       # if trans & 0x30000000 != 0:
  88       #   chain = ''
  89       #   if trans & 0x10000000:
  90       #     chain = 'start'
  91       #   if trans & 0x20000000:
  92       #     if len(chain) == 0:
  93       #       chain = 'end'
  94       #     else:
  95       #       chain = ''
  96       #   if len(chain) > 0:
  97       #     edge['chain'] = chain
  98     return all
  99
 100 def ClusterBy(objs, pred):
 101   """Group a list of objects by a predicate.
 102
 103   Given a list of objects and a predicate over the objects, return a
 104   dictionary mapping pred(obj) -> all objs with the same pred(obj)."""
 105   clusters = {}
 106   for obj in objs:
 107     cluster = pred(obj)
 108     clusters[cluster] = clusters.get(cluster, [])
 109     clusters[cluster].append(obj)
 110   return clusters
 111
 112 def EscapeDot(str):
 113   """Escape a string suitable for embedding in a graphviz graph."""
 114   # TODO(evanm): this is likely not sufficient.
 115   return str.replace('\n', '\\n')
 116
 117 class SQLite:
 118   """Trivial interface to executing SQLite queries.
 119   Spawns a new process with each call."""
 120   def __init__(self, file=None):
 121     self.file = file
 122
 123   def Run(self, sql):
 124     """Execute |sql|, yielding each row of results as an array."""
 125     subproc = subprocess.Popen(['sqlite', self.file],
 126                                stdin=subprocess.PIPE,
 127                                stdout=subprocess.PIPE)
 128     subproc.stdin.write('.mode tabs\n')
 129     subproc.stdin.write(sql + ';')
 130     subproc.stdin.close()
 131     for line in subproc.stdout:
 132       row = line.strip().split('\t')
 133       yield row
 134
 135 def LoadHistory(filename):
 136   db = SQLite(filename)
 137
 138   urls = {}  # Map of urlid => url.
 139   urls['0'] = URL('0', 'start')  # Node name '0' is our special 'start' node.
 140   for id, url in db.Run('SELECT id, url FROM urls'):
 141     urls[id] = URL(id, url)
 142
 143   visiturlids = {}  # Map of visitid => urlid.
 144   visiturlids['0'] = '0'  # '0' is our special 'start' node.
 145   edges = {}  # Map of urlid->urlid->Edge.
 146   for src, dst, url, trans in db.Run('SELECT from_visit, id, url, transition '
 147                                      'FROM visits ORDER BY id'):
 148     visiturlids[dst] = url
 149     src = visiturlids[src]
 150     dst = visiturlids[dst]
 151     edges[src] = edges.get(src, {})
 152     edge = edges[src][dst] = edges[src].get(dst, Edge(src, dst))
 153     # SQLite outputs transition values as signed integers, but they're really
 154     # a bitfield.  Below does "unsigned trans = static_cast<unsigned>(trans)".
 155     trans = struct.unpack('I', struct.pack('i', int(trans)))[0]
 156     edge.transitions.append(trans)
 157
 158   return urls, edges
 159
# Some transition types, copied from page_transition_types.h.
# Keys are compared against the low byte (trans & 0xFF) of a transition value
# when edge labels are built; a value with no entry here is displayed as
# 'trans<N>' instead.
TRANS_TYPES = {
  0: 'link',
  1: 'typed',
  2: 'most-visited',
  3: 'auto subframe',
  7: 'form',
}
 168
 169 urls, edges = LoadHistory(sys.argv[1])
 170
 171 print 'digraph G {'
 172 print '  graph [rankdir=LR]'  # Display left to right.
 173 print '  node [shape=box]'    # Display nodes as boxes.
 174 print '  subgraph { rank=source; 0 [label="start"] }'
 175
 176 # Output all the nodes within graph clusters.
 177 hosts = ClusterBy(urls.values(), lambda url: url.host)
 178 for i, (host, urls) in enumerate(hosts.items()):
 179   # Cluster all URLs under this host if it has more than one entry.
 180   host_clustered = len(urls) > 1
 181   if host_clustered:
 182     print 'subgraph clusterhost%d {' % i
 183     print '  label="%s"' % host
 184   paths = ClusterBy(urls, lambda url: url.path)
 185   for j, (path, urls) in enumerate(paths.items()):
 186     # Cluster all URLs under this host if it has more than one entry.
 187     path_clustered = host_clustered and len(urls) > 1
 188     if path_clustered:
 189       print '  subgraph cluster%d%d {' % (i, j)
 190       print '    label="%s"' % path
 191     for url in urls:
 192       if url.id == '0': continue  # We already output the special start node.
 193       pretty = url.PrettyPrint(include_host=not host_clustered,
 194                                include_path=not path_clustered)
 195       print '    %s [label="%s"]' % (url.id, EscapeDot(pretty))
 196     if path_clustered:
 197       print '  }'
 198   if host_clustered:
 199     print '}'
 200
 201 # Output all the edges between nodes.
 202 for src, dsts in edges.items():
 203   for dst, edge in dsts.items():
 204     # Gather up all the transitions into the label.
 205     label = []      # Label for the edge.
 206     transitions = edge.Transitions()
 207     for trans, count in transitions.items():
 208       text = ''
 209       if count > 1:
 210         text = '%dx ' % count
 211       base_type = trans & 0xFF
 212       redir = (trans & 0xC0000000) != 0
 213       start = (trans & 0x10000000) != 0
 214       end = (trans & 0x20000000) != 0
 215       if start or end:
 216         if start:
 217           text += '<'
 218         if end:
 219           text += '>'
 220         text += ' '
 221       if redir:
 222         text += 'R '
 223       text += TRANS_TYPES.get(base_type, 'trans%d' % base_type)
 224       label.append(text)
 225     if len(label) == 0:
 226       continue
 227
 228     edgeattrs = []  # Graphviz attributes for the edge.
 229     # If the edge is from the start and the transitions are fishy, make it
 230     # display as a dotted line.
 231     if src == '0' and len(transitions.keys()) == 1 and transitions.has_key(0):
 232       edgeattrs.append('style=dashed')
 233     if len(label) > 0:
 234       edgeattrs.append('label="%s"' % EscapeDot('\n'.join(label)))
 235
 236     out = '%s -> %s' % (src, dst)
 237     if len(edgeattrs) > 0:
 238       out += ' [%s]' % ','.join(edgeattrs)
 239     print out
 240 print '}'
 241