Script to grab stop codes from allstops.xml and sort
[ottawa-travel-planner.git] / HTMLTreeParser.py
blob e69ed975927c955ea1db3368301f87d0228ba19a
2 # vi: set softtabstop=4 shiftwidth=4 tabstop=8 expandtab:
4 """Parses an HTML document into a DOMmy structure."""
6 from HTMLParser import HTMLParser, HTMLParseError
class HTMLTreeParser(HTMLParser):
    """HTMLParser subclass that assembles the parsed document into a tree.

    Every start tag and every run of character data becomes an Element.
    The tree hangs off a synthetic, nameless root Element that is
    available from getRoot() once the document has been fed in.
    """

    # These tags don't have children (or </closing> tags.)
    # Elements with these names are attached to the tree but never pushed
    # onto the parse stack, so whatever follows them becomes a sibling
    # rather than a child, and their closing tags are ignored.
    NO_CHILDREN = set(("img", "hr", "p", "dd", "link", "meta"))

    def __init__(self):
        HTMLParser.__init__(self)

        # Synthetic nameless root; top-level nodes become its children.
        self.root = Element(None)
        # Stack of currently open elements.  The last entry is the parent
        # of whatever gets parsed next; the root always stays at index 0.
        self.parseStack = [self.root]

    def getRoot(self):
        """Return the root Element of the tree built so far."""
        return self.root

    def handle_starttag(self, tag, attrs):
        """Attach a new Element for <tag> to the innermost open element.

        The new element also becomes the innermost open element unless
        its name is listed in NO_CHILDREN.
        """
        newel = Element(tag, attrs)
        self.parseStack[-1].addChild(newel)
        if tag not in self.NO_CHILDREN:
            self.parseStack.append(newel)

    def handle_data(self, data):
        """Record character data under the innermost open element."""
        parent = self.parseStack[-1]
        siblings = parent.elementList

        # concatenate to previous text element if any
        if siblings and siblings[-1].name is None:
            siblings[-1].text += data
        else:
            textel = Element(None)
            textel.text = data
            parent.addChild(textel)

    def handle_endtag(self, tag):
        """Close the innermost open element named <tag>.

        Any elements opened after it that are still open are closed
        implicitly along with it.

        Raises HTMLParseError if no open element has that name.  The
        parse stack is left untouched in that case, so the parser is
        still usable by a caller that chooses to catch the error.
        """
        # We're not expecting closing tags for names in NO_CHILDREN, but
        # if they show up, whatever.
        if tag in self.NO_CHILDREN:
            return

        # Look for the match before discarding anything: scanning from
        # the top of the stack finds the innermost open element first.
        for depth in range(len(self.parseStack) - 1, -1, -1):
            if self.parseStack[depth].name == tag:
                # Drop the match and everything opened inside it.
                del self.parseStack[depth:]
                return

        raise HTMLParseError("Unexpected close tag </" + tag + ">",
                             self.getpos())
class Element:
    """Represents an HTML element or character data.

    Child elements are available through both a dictionary and a list.

    elementList is a list of immediate descendants, in the order they
    were added.

    elementDict maps a tag name to a list of immediate descendants with
    that name.

    attrs is a dictionary of the element's attributes.

    The current tag name is available as the member variable "name".  For
    elements that are just data, name is None and the data is held in the
    member variable "text"."""

    def __init__(self, elementName, attrs=None):
        """Create an element with no children.

        elementName is the tag name, or None for character data.
        attrs is an optional iterable of (name, value) pairs, or a
        mapping; omitting it or passing None gives an element with no
        attributes.
        """
        self.name = elementName
        self.elementList = []
        self.elementDict = dict()

        # dictify the attributes: when a name is repeated, only the last
        # value given for it is kept
        self.attrs = dict(attrs or ())

        self.text = None

    def addChild(self, newel):
        """Append newel as the last child and index it under its name."""
        self.elementList.append(newel)
        self.elementDict.setdefault(newel.name, []).append(newel)