3 See the HTML 2.0 specification:
4 http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
9 from formatter
import AS_IS
# Public names exported by ``from <module> import *``.
__all__ = ["HTMLParser", "HTMLParseError"]
class HTMLParseError(sgmllib.SGMLParseError):
    """Error raised when an HTML document can't be parsed."""
18 class HTMLParser(sgmllib
.SGMLParser
):
19 """This is the basic HTML parser class.
21 It supports all entity names required by the XHTML 1.0 Recommendation.
22 It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
27 from htmlentitydefs
import entitydefs
def __init__(self, formatter, verbose=0):
    """Create an instance of the HTMLParser class.

    formatter -- the formatter instance associated with this parser;
                 it is stored as ``self.formatter``.
    verbose   -- passed through unchanged to
                 ``sgmllib.SGMLParser.__init__`` (default 0).
    """
    # Initialise the SGML base class first, then attach the formatter.
    sgmllib.SGMLParser.__init__(self, verbose)
    self.formatter = formatter
def error(self, message):
    """Raise HTMLParseError carrying *message*."""
    raise HTMLParseError(message)
43 sgmllib
.SGMLParser
.reset(self
)
# ------ Methods used internally; some may be overridden

# --- Formatter interface, taking care of 'savedata' mode;
# shouldn't need to be overridden
58 def handle_data(self
, data
):
59 if self
.savedata
is not None:
60 self
.savedata
= self
.savedata
+ data
63 self
.formatter
.add_literal_data(data
)
65 self
.formatter
.add_flowing_data(data
)
67 # --- Hooks to save data; shouldn't need to be overridden
70 """Begins saving character data in a buffer instead of sending it
71 to the formatter object.
73 Retrieve the stored data via the save_end() method. Use of the
74 save_bgn() / save_end() pair may not be nested.
80 """Ends buffering character data and returns all data saved since
81 the preceding call to the save_bgn() method.
83 If the nofill flag is false, whitespace is collapsed to single
84 spaces. A call to this method without a preceding call to the
85 save_bgn() method will raise a TypeError exception.
91 data
= ' '.join(data
.split())
94 # --- Hooks for anchors; should probably be overridden
96 def anchor_bgn(self
, href
, name
, type):
97 """This method is called at the start of an anchor region.
99 The arguments correspond to the attributes of the <A> tag with
100 the same names. The default implementation maintains a list of
101 hyperlinks (defined by the HREF attribute for <A> tags) within
102 the document. The list of hyperlinks is available as the data
103 attribute anchorlist.
108 self
.anchorlist
.append(href
)
110 def anchor_end(self
):
111 """This method is called at the end of an anchor region.
113 The default implementation adds a textual footnote marker using an
index into the list of hyperlinks created by the anchor_bgn() method.
118 self
.handle_data("[%d]" % len(self
.anchorlist
))
121 # --- Hook for images; should probably be overridden
def handle_image(self, src, alt, *args):
    """This method is called to handle images.

    The default implementation simply passes the alt value to the
    handle_data() method; src and any extra positional arguments
    are accepted but not used here.
    """
    self.handle_data(alt)
# --------- Top level elements
def start_html(self, attrs):
    """Handle an <html> start tag; the default does nothing."""
    pass
def end_html(self):
    """Handle an </html> end tag; the default does nothing."""
    pass
def start_head(self, attrs):
    """Handle a <head> start tag; the default does nothing."""
    pass
def end_head(self):
    """Handle a </head> end tag; the default does nothing."""
    pass
def start_body(self, attrs):
    """Handle a <body> start tag; the default does nothing."""
    pass
def end_body(self):
    """Handle a </body> end tag; the default does nothing."""
    pass
143 # ------ Head elements
145 def start_title(self
, attrs
):
149 self
.title
= self
.save_end()
151 def do_base(self
, attrs
):
156 def do_isindex(self
, attrs
):
159 def do_link(self
, attrs
):
162 def do_meta(self
, attrs
):
165 def do_nextid(self
, attrs
): # Deprecated
168 # ------ Body elements
def start_h1(self, attrs):
    """Handle an <h1> start tag.

    Ends the current paragraph via the formatter, then pushes the
    font spec ('h1', 0, 1, 0).  attrs is not used.
    """
    self.formatter.end_paragraph(1)
    self.formatter.push_font(('h1', 0, 1, 0))
177 self
.formatter
.end_paragraph(1)
178 self
.formatter
.pop_font()
def start_h2(self, attrs):
    """Handle an <h2> start tag.

    Ends the current paragraph via the formatter, then pushes the
    font spec ('h2', 0, 1, 0).  attrs is not used.
    """
    self.formatter.end_paragraph(1)
    self.formatter.push_font(('h2', 0, 1, 0))
185 self
.formatter
.end_paragraph(1)
186 self
.formatter
.pop_font()
def start_h3(self, attrs):
    """Handle an <h3> start tag.

    Ends the current paragraph via the formatter, then pushes the
    font spec ('h3', 0, 1, 0).  attrs is not used.
    """
    self.formatter.end_paragraph(1)
    self.formatter.push_font(('h3', 0, 1, 0))
193 self
.formatter
.end_paragraph(1)
194 self
.formatter
.pop_font()
def start_h4(self, attrs):
    """Handle an <h4> start tag.

    Ends the current paragraph via the formatter, then pushes the
    font spec ('h4', 0, 1, 0).  attrs is not used.
    """
    self.formatter.end_paragraph(1)
    self.formatter.push_font(('h4', 0, 1, 0))
201 self
.formatter
.end_paragraph(1)
202 self
.formatter
.pop_font()
def start_h5(self, attrs):
    """Handle an <h5> start tag.

    Ends the current paragraph via the formatter, then pushes the
    font spec ('h5', 0, 1, 0).  attrs is not used.
    """
    self.formatter.end_paragraph(1)
    self.formatter.push_font(('h5', 0, 1, 0))
209 self
.formatter
.end_paragraph(1)
210 self
.formatter
.pop_font()
def start_h6(self, attrs):
    """Handle an <h6> start tag.

    Ends the current paragraph via the formatter, then pushes the
    font spec ('h6', 0, 1, 0).  attrs is not used.
    """
    self.formatter.end_paragraph(1)
    self.formatter.push_font(('h6', 0, 1, 0))
217 self
.formatter
.end_paragraph(1)
218 self
.formatter
.pop_font()
220 # --- Block Structuring Elements
def do_p(self, attrs):
    """Handle a <p> tag by ending the current paragraph.

    Calls ``self.formatter.end_paragraph(1)``; attrs is not used.
    """
    self.formatter.end_paragraph(1)
def start_pre(self, attrs):
    """Handle a <pre> start tag.

    Ends the current paragraph, pushes the font spec
    (AS_IS, AS_IS, AS_IS, 1) -- the same spec start_tt() pushes --
    and increments the ``self.nofill`` counter.  attrs is not used.
    """
    self.formatter.end_paragraph(1)
    self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
    # nofill is a counter rather than a flag, so it is incremented here.
    self.nofill = self.nofill + 1
231 self
.formatter
.end_paragraph(1)
232 self
.formatter
.pop_font()
233 self
.nofill
= max(0, self
.nofill
- 1)
def start_xmp(self, attrs):
    """Handle an <xmp> start tag.

    Delegates to start_pre(attrs), then calls setliteral('xmp').
    """
    self.start_pre(attrs)
    self.setliteral('xmp')  # Tell SGML parser
def start_listing(self, attrs):
    """Handle a <listing> start tag.

    Delegates to start_pre(attrs), then calls setliteral('listing').
    """
    self.start_pre(attrs)
    self.setliteral('listing')  # Tell SGML parser
246 def end_listing(self
):
def start_address(self, attrs):
    """Handle an <address> start tag.

    Ends the current paragraph with ``end_paragraph(0)`` and pushes
    the font spec (AS_IS, 1, AS_IS, AS_IS) -- the same spec
    start_i() pushes.  attrs is not used.
    """
    self.formatter.end_paragraph(0)
    self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
def end_address(self):
    """Handle an </address> end tag.

    Ends the current paragraph with ``end_paragraph(0)`` and pops the
    font pushed by start_address().
    """
    self.formatter.end_paragraph(0)
    self.formatter.pop_font()
def start_blockquote(self, attrs):
    """Handle a <blockquote> start tag.

    Ends the current paragraph and pushes the 'blockquote' margin on
    the formatter.  attrs is not used.
    """
    self.formatter.end_paragraph(1)
    self.formatter.push_margin('blockquote')
def end_blockquote(self):
    """Handle a </blockquote> end tag.

    Ends the current paragraph and pops the margin pushed by
    start_blockquote().
    """
    self.formatter.end_paragraph(1)
    self.formatter.pop_margin()
def start_ul(self, attrs):
    """Handle a <ul> start tag.

    Ends the current paragraph, passing ``not self.list_stack`` so the
    argument is true only for an outermost list, pushes the 'ul'
    margin, and appends a ['ul', '*', 0] entry to self.list_stack.
    attrs is not used.
    """
    # Evaluate emptiness before the new entry is pushed below.
    self.formatter.end_paragraph(not self.list_stack)
    self.formatter.push_margin('ul')
    self.list_stack.append(['ul', '*', 0])
273 if self
.list_stack
: del self
.list_stack
[-1]
274 self
.formatter
.end_paragraph(not self
.list_stack
)
275 self
.formatter
.pop_margin()
277 def do_li(self
, attrs
):
278 self
.formatter
.end_paragraph(0)
280 [dummy
, label
, counter
] = top
= self
.list_stack
[-1]
281 top
[2] = counter
= counter
+1
283 label
, counter
= '*', 0
284 self
.formatter
.add_label_data(label
, counter
)
286 def start_ol(self
, attrs
):
287 self
.formatter
.end_paragraph(not self
.list_stack
)
288 self
.formatter
.push_margin('ol')
292 if len(v
) == 1: v
= v
+ '.'
294 self
.list_stack
.append(['ol', label
, 0])
297 if self
.list_stack
: del self
.list_stack
[-1]
298 self
.formatter
.end_paragraph(not self
.list_stack
)
299 self
.formatter
.pop_margin()
301 def start_menu(self
, attrs
):
307 def start_dir(self
, attrs
):
def start_dl(self, attrs):
    """Handle a <dl> start tag.

    Ends the current paragraph and appends a ['dl', '', 0] entry to
    self.list_stack.  attrs is not used.
    """
    self.formatter.end_paragraph(1)
    self.list_stack.append(['dl', '', 0])
319 if self
.list_stack
: del self
.list_stack
[-1]
321 def do_dt(self
, attrs
):
324 def do_dd(self
, attrs
):
326 self
.formatter
.push_margin('dd')
327 self
.list_stack
.append(['dd', '', 0])
329 def ddpop(self
, bl
=0):
330 self
.formatter
.end_paragraph(bl
)
332 if self
.list_stack
[-1][0] == 'dd':
333 del self
.list_stack
[-1]
334 self
.formatter
.pop_margin()
def start_cite(self, attrs):
    """Handle a <cite> start tag by delegating to start_i()."""
    self.start_i(attrs)
def end_cite(self):
    """Handle a </cite> end tag by delegating to end_i()."""
    self.end_i()
def start_code(self, attrs):
    """Handle a <code> start tag by delegating to start_tt()."""
    self.start_tt(attrs)
def end_code(self):
    """Handle a </code> end tag by delegating to end_tt()."""
    self.end_tt()
def start_em(self, attrs):
    """Handle an <em> start tag by delegating to start_i()."""
    self.start_i(attrs)
def end_em(self):
    """Handle an </em> end tag by delegating to end_i()."""
    self.end_i()
def start_kbd(self, attrs):
    """Handle a <kbd> start tag by delegating to start_tt()."""
    self.start_tt(attrs)
def end_kbd(self):
    """Handle a </kbd> end tag by delegating to end_tt()."""
    self.end_tt()
def start_samp(self, attrs):
    """Handle a <samp> start tag by delegating to start_tt()."""
    self.start_tt(attrs)
def end_samp(self):
    """Handle a </samp> end tag by delegating to end_tt()."""
    self.end_tt()
def start_strong(self, attrs):
    """Handle a <strong> start tag by delegating to start_b()."""
    self.start_b(attrs)
def end_strong(self):
    """Handle a </strong> end tag by delegating to end_b()."""
    self.end_b()
def start_var(self, attrs):
    """Handle a <var> start tag by delegating to start_i()."""
    self.start_i(attrs)
def end_var(self):
    """Handle a </var> end tag by delegating to end_i()."""
    self.end_i()
361 # Typographic Elements
def start_i(self, attrs):
    """Handle an <i> start tag.

    Pushes the font spec (AS_IS, 1, AS_IS, AS_IS) on the formatter:
    only the second field is set, the others are left as they are.
    attrs is not used.
    """
    self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
366 self
.formatter
.pop_font()
def start_b(self, attrs):
    """Handle a <b> start tag.

    Pushes the font spec (AS_IS, AS_IS, 1, AS_IS) on the formatter:
    only the third field is set, the others are left as they are.
    attrs is not used.
    """
    self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
371 self
.formatter
.pop_font()
def start_tt(self, attrs):
    """Handle a <tt> start tag.

    Pushes the font spec (AS_IS, AS_IS, AS_IS, 1) on the formatter:
    only the fourth field is set, the others are left as they are.
    attrs is not used.
    """
    self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
376 self
.formatter
.pop_font()
378 def start_a(self
, attrs
):
382 for attrname
, value
in attrs
:
383 value
= value
.strip()
384 if attrname
== 'href':
386 if attrname
== 'name':
388 if attrname
== 'type':
390 self
.anchor_bgn(href
, name
, type)
def do_br(self, attrs):
    """Handle a <br> tag by asking the formatter for a line break.

    attrs is not used.
    """
    self.formatter.add_line_break()
400 # --- Horizontal Rule
def do_hr(self, attrs):
    """Handle an <hr> tag by asking the formatter for a horizontal rule.

    attrs is not used.
    """
    self.formatter.add_hor_rule()
407 def do_img(self
, attrs
):
414 for attrname
, value
in attrs
:
415 if attrname
== 'align':
417 if attrname
== 'alt':
419 if attrname
== 'ismap':
421 if attrname
== 'src':
423 if attrname
== 'width':
424 try: width
= int(value
)
425 except ValueError: pass
426 if attrname
== 'height':
427 try: height
= int(value
)
428 except ValueError: pass
429 self
.handle_image(src
, alt
, ismap
, align
, width
, height
)
431 # --- Really Old Unofficial Deprecated Stuff
def do_plaintext(self, attrs):
    """Handle a <plaintext> tag.

    Delegates to start_pre(attrs), then calls setnomoretags().
    """
    self.start_pre(attrs)
    self.setnomoretags()  # Tell SGML parser
439 def unknown_starttag(self
, tag
, attrs
):
442 def unknown_endtag(self
, tag
):
446 def test(args
= None):
447 import sys
, formatter
452 silent
= args
and args
[0] == '-s'
472 if f
is not sys
.stdin
:
476 f
= formatter
.NullFormatter()
478 f
= formatter
.AbstractFormatter(formatter
.DumbWriter())
485 if __name__
== '__main__':