2 # Copyright (c) 2011 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
6 """Process a History database and dump a .dot file suitable for GraphViz.
8 This is useful for debugging history redirect flows.
10 An example run of this program:
11 python /src/history-viz.py History > foo.dot
12 /c/Program\ Files/Graphviz2.18/bin/dot -Tpng foo.dot -o foo.png
21 # Some transition types, copied from page_transition_types.h.
32 """Represents a broken-down URL from our most visited database."""
34 def __init__(self
, id, url
):
35 """Initialize a new URL object. |id| is the database id of the URL."""
38 scheme
, loc
, path
, query
, fragment
= urlparse
.urlsplit(url
)
40 scheme
= '' # Shorten for display purposes.
43 self
.host
= scheme
+ loc
49 if len(fragment
) > 0 or url
.find('#') > 0:
50 extra
+= '#' + fragment
53 def PrettyPrint(self
, include_host
=True, include_path
=True):
54 """Pretty-print this URL in a form more suitable for the graph.
56 This will elide very long paths and potentially puts newlines between parts
57 of long components. include_host and include_path determine whether to
58 include the host and path in the output.
60 Returns: the pretty-printed string."""
61 MAX_LEN
= 30 # Maximum length of a line in the output.
64 parts
.append(self
.host
)
66 parts
.append(self
.path
)
67 parts
.append(self
.extra
)
71 if len(part
) > MAX_LEN
:
72 part
= part
[0:MAX_LEN
-3] + '...'
73 if len(line
)+len(part
) > MAX_LEN
:
79 return '\n'.join(lines
)
83 """Represents an edge in the history graph, connecting two pages.
85 If a link is traversed twice, it is one Edge with two entries in
86 the .transitions array."""
87 def __init__(self
, src
, dst
):
92 def Transitions(self
):
93 """Return a dictionary mapping transition type -> occurences."""
95 for trans
in self
.transitions
:
96 all
[trans
] = all
.get(trans
, 0) + 1
97 # We currently don't use the chain type.
98 # TODO(evanm): make this a command-line option.
99 # if trans & 0x30000000 != 0:
101 # if trans & 0x10000000:
103 # if trans & 0x20000000:
104 # if len(chain) == 0:
109 # edge['chain'] = chain
113 def ClusterBy(objs
, pred
):
114 """Group a list of objects by a predicate.
116 Given a list of objects and a predicate over the objects, return a
117 dictionary mapping pred(obj) -> all objs with the same pred(obj)."""
121 clusters
[cluster
] = clusters
.get(cluster
, [])
122 clusters
[cluster
].append(obj
)
126 def EscapeDot(string
):
127 """Escape a string suitable for embedding in a graphviz graph."""
128 # TODO(evanm): this is likely not sufficient.
129 return string
.replace('\n', '\\n')
132 class SQLite(object):
133 """Trivial interface to executing SQLite queries.
134 Spawns a new process with each call."""
135 def __init__(self
, file=None):
139 """Execute |sql|, yielding each row of results as an array."""
140 subproc
= subprocess
.Popen(['sqlite', self
.file],
141 stdin
=subprocess
.PIPE
,
142 stdout
=subprocess
.PIPE
)
143 subproc
.stdin
.write('.mode tabs\n')
144 subproc
.stdin
.write(sql
+ ';')
145 subproc
.stdin
.close()
146 for line
in subproc
.stdout
:
147 row
= line
.strip().split('\t')
151 def LoadHistory(filename
):
152 db
= SQLite(filename
)
154 urls
= {} # Map of urlid => url.
155 urls
['0'] = URL('0', 'start') # Node name '0' is our special 'start' node.
156 for id, url
in db
.Run('SELECT id, url FROM urls'):
157 urls
[id] = URL(id, url
)
159 visiturlids
= {} # Map of visitid => urlid.
160 visiturlids
['0'] = '0' # '0' is our special 'start' node.
161 edges
= {} # Map of urlid->urlid->Edge.
162 for src
, dst
, url
, trans
in db
.Run('SELECT from_visit, id, url, transition '
163 'FROM visits ORDER BY id'):
164 visiturlids
[dst
] = url
165 src
= visiturlids
[src
]
166 dst
= visiturlids
[dst
]
167 edges
[src
] = edges
.get(src
, {})
168 edge
= edges
[src
][dst
] = edges
[src
].get(dst
, Edge(src
, dst
))
169 # SQLite outputs transition values as signed integers, but they're really
170 # a bitfield. Below does "unsigned trans = static_cast<unsigned>(trans)".
171 trans
= struct
.unpack('I', struct
.pack('i', int(trans
)))[0]
172 edge
.transitions
.append(trans
)
178 urls
, edges
= LoadHistory(sys
.argv
[1])
180 print ' graph [rankdir=LR]' # Display left to right.
181 print ' node [shape=box]' # Display nodes as boxes.
182 print ' subgraph { rank=source; 0 [label="start"] }'
184 # Output all the nodes within graph clusters.
185 hosts
= ClusterBy(urls
.values(), lambda url
: url
.host
)
186 for i
, (host
, urls
) in enumerate(hosts
.items()):
187 # Cluster all URLs under this host if it has more than one entry.
188 host_clustered
= len(urls
) > 1
190 print 'subgraph clusterhost%d {' % i
191 print ' label="%s"' % host
192 paths
= ClusterBy(urls
, lambda url
: url
.path
)
193 for j
, (path
, urls
) in enumerate(paths
.items()):
194 # Cluster all URLs under this host if it has more than one entry.
195 path_clustered
= host_clustered
and len(urls
) > 1
197 print ' subgraph cluster%d%d {' % (i
, j
)
198 print ' label="%s"' % path
200 if url
.id == '0': continue # We already output the special start node.
201 pretty
= url
.PrettyPrint(include_host
=not host_clustered
,
202 include_path
=not path_clustered
)
203 print ' %s [label="%s"]' % (url
.id, EscapeDot(pretty
))
209 # Output all the edges between nodes.
210 for src
, dsts
in edges
.items():
211 for dst
, edge
in dsts
.items():
212 # Gather up all the transitions into the label.
213 label
= [] # Label for the edge.
214 transitions
= edge
.Transitions()
215 for trans
, count
in transitions
.items():
218 text
= '%dx ' % count
219 base_type
= trans
& 0xFF
220 redir
= (trans
& 0xC0000000) != 0
221 start
= (trans
& 0x10000000) != 0
222 end
= (trans
& 0x20000000) != 0
231 text
+= TRANS_TYPES
.get(base_type
, 'trans%d' % base_type
)
236 edgeattrs
= [] # Graphviz attributes for the edge.
237 # If the edge is from the start and the transitions are fishy, make it
238 # display as a dotted line.
239 if src
== '0' and len(transitions
.keys()) == 1 and transitions
.has_key(0):
240 edgeattrs
.append('style=dashed')
242 edgeattrs
.append('label="%s"' % EscapeDot('\n'.join(label
)))
244 out
= '%s -> %s' % (src
, dst
)
245 if len(edgeattrs
) > 0:
246 out
+= ' [%s]' % ','.join(edgeattrs
)
252 if __name__
== '__main__':