# vi: set softtabstop=4 shiftwidth=4 tabstop=8 expandtab:

"""Parses an HTML document into a DOMmy structure."""
from HTMLParser import HTMLParser, HTMLParseError
class HTMLTreeParser(HTMLParser):
    """Builds a tree of Element objects from the HTML fed to the parser.

    The tree hangs off ``self.root``, an Element whose name is None.
    ``self.parseStack`` holds the chain of currently open elements; its last
    entry is the parent that receives the next tag or run of character data.
    """

    # These tags don't have children (or </closing> tags.)
    NO_CHILDREN = set(("img", "hr", "p", "dd", "link", "meta"))

    def __init__(self):
        """Initialise the base parser and start with an empty tree."""
        HTMLParser.__init__(self)

        self.root = Element(None)
        self.parseStack = [self.root]

    def handle_starttag(self, tag, attrs):
        """Attach a new Element for *tag* to the innermost open element.

        The new element becomes the current parent unless its tag is listed
        in NO_CHILDREN.
        """
        newel = Element(tag, attrs)
        self.parseStack[-1].addChild(newel)
        if tag not in self.NO_CHILDREN:
            self.parseStack.append(newel)

    def handle_data(self, data):
        """Record character data under the innermost open element."""
        parent = self.parseStack[-1]
        siblings = parent.elementList

        # concatenate to previous text element if any
        if siblings and siblings[-1].name is None:
            siblings[-1].text += data
        else:
            # Data elements are Elements with no name; keep the text on them.
            textel = Element(None)
            textel.text = data
            parent.addChild(textel)

    def handle_endtag(self, tag):
        """Close the innermost open element named *tag*.

        Elements opened inside the match are closed along with it.

        Raises:
            HTMLParseError: if no open element has this name.  The stack is
                left untouched in that case.
        """
        # We're not expecting closing tags for names in NO_CHILDREN, but
        # if they show up, whatever.
        if tag in self.NO_CHILDREN:
            return

        # Search from the top of the stack downwards.  Index 0 is the root
        # placeholder: it must never be popped, or later tags would have no
        # parent to attach to.
        for depth in range(len(self.parseStack) - 1, 0, -1):
            if self.parseStack[depth].name == tag:
                del self.parseStack[depth:]
                return

        raise HTMLParseError("Unexpected close tag </" + tag + ">",
                             self.getpos())
class Element(object):
    """Represents an HTML element or character data.

    Child elements are available through both a dictionary and a list.

    elementList is a list of immediate descendants.

    elementDict maps a tag name to a list of immediate descendants with
    that name.

    The current tag name is available as the member variable "name". For
    elements that are just data, name is None.
    """

    def __init__(self, elementName, attrs=None):
        """Create an element with no children.

        Args:
            elementName: the tag name, or None for character data.
            attrs: optional iterable of (name, value) pairs; omitted or None
                means no attributes.
        """
        self.name = elementName
        self.elementList = []
        self.elementDict = dict()
        # Character data; the parser appends to this on nameless elements,
        # so it has to start out as a string.
        self.text = ""

        # dictify the attributes: duplicate attributes are dropped
        # (None instead of a shared [] default avoids a mutable default.)
        self.attrs = dict(attrs) if attrs is not None else {}

    def addChild(self, newel):
        """Append *newel* as the last child and index it under its name."""
        self.elementList.append(newel)
        # setdefault creates the per-name list on first use.
        self.elementDict.setdefault(newel.name, []).append(newel)