3 See the HTML 2.0 specification:
4 http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
7 from warnings
import warnpy3k
8 warnpy3k("the htmllib module has been removed in Python 3.0",
14 from formatter
import AS_IS
16 __all__
= ["HTMLParser", "HTMLParseError"]
class HTMLParseError(sgmllib.SGMLParseError):
    """Error raised when an HTML document can't be parsed.

    Subclass of ``sgmllib.SGMLParseError``; ``HTMLParser.error()`` raises it
    with the message it is given.
    """
23 class HTMLParser(sgmllib
.SGMLParser
):
24 """This is the basic HTML parser class.
26 It supports all entity names required by the XHTML 1.0 Recommendation.
27 It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
32 from htmlentitydefs
import entitydefs
def __init__(self, formatter, verbose=0):
    """Create an HTMLParser instance bound to *formatter*.

    *verbose* is forwarded unchanged to ``sgmllib.SGMLParser.__init__``.
    Once the base class has been initialised, the formatter instance is
    stored on ``self.formatter`` for the tag handlers to use.
    """
    initialise_base = sgmllib.SGMLParser.__init__
    initialise_base(self, verbose)
    self.formatter = formatter
def error(self, message):
    """Report a parse failure by raising ``HTMLParseError(message)``."""
    failure = HTMLParseError(message)
    raise failure
48 sgmllib
.SGMLParser
.reset(self
)
58 # ------ Methods used internally; some may be overridden
60 # --- Formatter interface, taking care of 'savedata' mode;
61 # shouldn't need to be overridden
63 def handle_data(self
, data
):
64 if self
.savedata
is not None:
65 self
.savedata
= self
.savedata
+ data
68 self
.formatter
.add_literal_data(data
)
70 self
.formatter
.add_flowing_data(data
)
72 # --- Hooks to save data; shouldn't need to be overridden
75 """Begins saving character data in a buffer instead of sending it
76 to the formatter object.
78 Retrieve the stored data via the save_end() method. Use of the
79 save_bgn() / save_end() pair may not be nested.
85 """Ends buffering character data and returns all data saved since
86 the preceding call to the save_bgn() method.
88 If the nofill flag is false, whitespace is collapsed to single
89 spaces. A call to this method without a preceding call to the
90 save_bgn() method will raise a TypeError exception.
96 data
= ' '.join(data
.split())
99 # --- Hooks for anchors; should probably be overridden
101 def anchor_bgn(self
, href
, name
, type):
102 """This method is called at the start of an anchor region.
104 The arguments correspond to the attributes of the <A> tag with
105 the same names. The default implementation maintains a list of
106 hyperlinks (defined by the HREF attribute for <A> tags) within
107 the document. The list of hyperlinks is available as the data
108 attribute anchorlist.
113 self
.anchorlist
.append(href
)
115 def anchor_end(self
):
116 """This method is called at the end of an anchor region.
118 The default implementation adds a textual footnote marker using an
119 index into the list of hyperlinks created by the anchor_bgn()method.
123 self
.handle_data("[%d]" % len(self
.anchorlist
))
126 # --- Hook for images; should probably be overridden
def handle_image(self, src, alt, *args):
    """Handle an image.

    This default implementation forwards only the *alt* value to
    ``handle_data()``; *src* and any extra positional arguments are
    accepted but not used.
    """
    alternate_text = alt
    self.handle_data(alternate_text)
137 # --------- Top level elements
def start_html(self, attrs):
    """Start handler for the ``html`` tag; intentionally does nothing."""
    return None
def end_html(self):
    """End handler for the ``html`` tag; intentionally does nothing."""
    return None
def start_head(self, attrs):
    """Start handler for the ``head`` tag; intentionally does nothing."""
    return None
def end_head(self):
    """End handler for the ``head`` tag; intentionally does nothing."""
    return None
def start_body(self, attrs):
    """Start handler for the ``body`` tag; intentionally does nothing."""
    return None
def end_body(self):
    """End handler for the ``body`` tag; intentionally does nothing."""
    return None
148 # ------ Head elements
150 def start_title(self
, attrs
):
154 self
.title
= self
.save_end()
156 def do_base(self
, attrs
):
161 def do_isindex(self
, attrs
):
164 def do_link(self
, attrs
):
167 def do_meta(self
, attrs
):
170 def do_nextid(self
, attrs
): # Deprecated
173 # ------ Body elements
def start_h1(self, attrs):
    """Start handler for ``h1``.

    Ends the current paragraph (``end_paragraph(1)``) and pushes the font
    spec ``('h1', 0, 1, 0)`` on the formatter. *attrs* is not used.
    """
    fmt = self.formatter
    heading_font = ('h1', 0, 1, 0)
    fmt.end_paragraph(1)
    fmt.push_font(heading_font)
182 self
.formatter
.end_paragraph(1)
183 self
.formatter
.pop_font()
def start_h2(self, attrs):
    """Start handler for ``h2``.

    Ends the current paragraph (``end_paragraph(1)``) and pushes the font
    spec ``('h2', 0, 1, 0)`` on the formatter. *attrs* is not used.
    """
    fmt = self.formatter
    heading_font = ('h2', 0, 1, 0)
    fmt.end_paragraph(1)
    fmt.push_font(heading_font)
190 self
.formatter
.end_paragraph(1)
191 self
.formatter
.pop_font()
def start_h3(self, attrs):
    """Start handler for ``h3``.

    Ends the current paragraph (``end_paragraph(1)``) and pushes the font
    spec ``('h3', 0, 1, 0)`` on the formatter. *attrs* is not used.
    """
    fmt = self.formatter
    heading_font = ('h3', 0, 1, 0)
    fmt.end_paragraph(1)
    fmt.push_font(heading_font)
198 self
.formatter
.end_paragraph(1)
199 self
.formatter
.pop_font()
def start_h4(self, attrs):
    """Start handler for ``h4``.

    Ends the current paragraph (``end_paragraph(1)``) and pushes the font
    spec ``('h4', 0, 1, 0)`` on the formatter. *attrs* is not used.
    """
    fmt = self.formatter
    heading_font = ('h4', 0, 1, 0)
    fmt.end_paragraph(1)
    fmt.push_font(heading_font)
206 self
.formatter
.end_paragraph(1)
207 self
.formatter
.pop_font()
def start_h5(self, attrs):
    """Start handler for ``h5``.

    Ends the current paragraph (``end_paragraph(1)``) and pushes the font
    spec ``('h5', 0, 1, 0)`` on the formatter. *attrs* is not used.
    """
    fmt = self.formatter
    heading_font = ('h5', 0, 1, 0)
    fmt.end_paragraph(1)
    fmt.push_font(heading_font)
214 self
.formatter
.end_paragraph(1)
215 self
.formatter
.pop_font()
def start_h6(self, attrs):
    """Start handler for ``h6``.

    Ends the current paragraph (``end_paragraph(1)``) and pushes the font
    spec ``('h6', 0, 1, 0)`` on the formatter. *attrs* is not used.
    """
    fmt = self.formatter
    heading_font = ('h6', 0, 1, 0)
    fmt.end_paragraph(1)
    fmt.push_font(heading_font)
222 self
.formatter
.end_paragraph(1)
223 self
.formatter
.pop_font()
225 # --- Block Structuring Elements
def do_p(self, attrs):
    """Handler for ``p``: ends the current paragraph via ``end_paragraph(1)``.

    *attrs* is not used.
    """
    fmt = self.formatter
    fmt.end_paragraph(1)
def start_pre(self, attrs):
    """Start handler for ``pre``.

    Ends the current paragraph, pushes a font spec whose last slot is 1 and
    whose other slots are ``AS_IS``, and increments ``self.nofill`` by one.
    *attrs* is not used.
    """
    fmt = self.formatter
    fmt.end_paragraph(1)
    preformatted_font = (AS_IS, AS_IS, AS_IS, 1)
    fmt.push_font(preformatted_font)
    # nofill is a counter rather than a flag, so each start adds one level.
    self.nofill += 1
236 self
.formatter
.end_paragraph(1)
237 self
.formatter
.pop_font()
238 self
.nofill
= max(0, self
.nofill
- 1)
def start_xmp(self, attrs):
    """Start handler for ``xmp``.

    Behaves like ``start_pre(attrs)`` and then calls ``setliteral('xmp')``.
    """
    self.start_pre(attrs)
    # Tell the SGML parser
    literal_tag = 'xmp'
    self.setliteral(literal_tag)
def start_listing(self, attrs):
    """Start handler for ``listing``.

    Behaves like ``start_pre(attrs)`` and then calls
    ``setliteral('listing')``.
    """
    self.start_pre(attrs)
    # Tell the SGML parser
    literal_tag = 'listing'
    self.setliteral(literal_tag)
251 def end_listing(self
):
def start_address(self, attrs):
    """Start handler for ``address``.

    Ends the current paragraph with ``end_paragraph(0)`` and pushes a font
    spec whose second slot is 1 and whose other slots are ``AS_IS``.
    *attrs* is not used.
    """
    fmt = self.formatter
    fmt.end_paragraph(0)
    address_font = (AS_IS, 1, AS_IS, AS_IS)
    fmt.push_font(address_font)
def end_address(self):
    """End handler for ``address``.

    Ends the current paragraph with ``end_paragraph(0)`` and pops the font
    most recently pushed on the formatter.
    """
    fmt = self.formatter
    fmt.end_paragraph(0)
    fmt.pop_font()
def start_blockquote(self, attrs):
    """Start handler for ``blockquote``.

    Ends the current paragraph with ``end_paragraph(1)`` and pushes a
    margin labelled ``'blockquote'``. *attrs* is not used.
    """
    fmt = self.formatter
    fmt.end_paragraph(1)
    margin_name = 'blockquote'
    fmt.push_margin(margin_name)
def end_blockquote(self):
    """End handler for ``blockquote``.

    Ends the current paragraph with ``end_paragraph(1)`` and pops the most
    recently pushed margin.
    """
    fmt = self.formatter
    fmt.end_paragraph(1)
    fmt.pop_margin()
def start_ul(self, attrs):
    """Start handler for ``ul``.

    Ends the current paragraph, passing True when no list is currently open
    (``list_stack`` is empty) and False when nested inside another list.
    Then pushes a ``'ul'`` margin and records ``['ul', '*', 0]`` on
    ``self.list_stack``. *attrs* is not used.
    """
    fmt = self.formatter
    # Must be computed before the new entry is appended below.
    is_outermost = not self.list_stack
    fmt.end_paragraph(is_outermost)
    fmt.push_margin('ul')
    new_entry = ['ul', '*', 0]
    self.list_stack.append(new_entry)
278 if self
.list_stack
: del self
.list_stack
[-1]
279 self
.formatter
.end_paragraph(not self
.list_stack
)
280 self
.formatter
.pop_margin()
def do_li(self, attrs):
    """Handler for ``li``: emit the label for the next list item.

    Ends the current paragraph with ``end_paragraph(0)``. If a list is open
    (``self.list_stack`` is non-empty), the innermost entry's counter (its
    third element) is incremented in place and that entry's label (its
    second element) is passed to ``formatter.add_label_data`` together with
    the new counter. Outside any list the label ``'*'`` with counter 0 is
    used instead. *attrs* is not used.
    """
    self.formatter.end_paragraph(0)
    if self.list_stack:
        # Mutate the stack entry itself so the count persists across items.
        top = self.list_stack[-1]
        top[2] += 1
        label, counter = top[1], top[2]
    else:
        # An <li> outside any list: fall back to a plain bullet rather than
        # indexing an empty stack.
        label, counter = '*', 0
    self.formatter.add_label_data(label, counter)
291 def start_ol(self
, attrs
):
292 self
.formatter
.end_paragraph(not self
.list_stack
)
293 self
.formatter
.push_margin('ol')
297 if len(v
) == 1: v
= v
+ '.'
299 self
.list_stack
.append(['ol', label
, 0])
302 if self
.list_stack
: del self
.list_stack
[-1]
303 self
.formatter
.end_paragraph(not self
.list_stack
)
304 self
.formatter
.pop_margin()
306 def start_menu(self
, attrs
):
312 def start_dir(self
, attrs
):
def start_dl(self, attrs):
    """Start handler for ``dl``.

    Ends the current paragraph with ``end_paragraph(1)`` and records
    ``['dl', '', 0]`` on ``self.list_stack``. *attrs* is not used.
    """
    fmt = self.formatter
    fmt.end_paragraph(1)
    new_entry = ['dl', '', 0]
    self.list_stack.append(new_entry)
324 if self
.list_stack
: del self
.list_stack
[-1]
326 def do_dt(self
, attrs
):
329 def do_dd(self
, attrs
):
331 self
.formatter
.push_margin('dd')
332 self
.list_stack
.append(['dd', '', 0])
def ddpop(self, bl=0):
    """Close the innermost ``dd`` level, if one is open.

    Always ends the current paragraph, passing *bl* (default 0) straight
    through to ``formatter.end_paragraph``. Then, only when
    ``self.list_stack`` is non-empty and its last entry is tagged ``'dd'``,
    removes that entry and pops the formatter margin.
    """
    self.formatter.end_paragraph(bl)
    # Check for an empty stack first: [-1] on an empty list raises
    # IndexError, and the sibling end handlers guard the same way.
    if self.list_stack and self.list_stack[-1][0] == 'dd':
        del self.list_stack[-1]
        self.formatter.pop_margin()
def start_cite(self, attrs):
    """Start handler for ``cite``; delegates to ``start_i(attrs)``."""
    self.start_i(attrs)
def end_cite(self):
    """End handler for ``cite``; delegates to ``end_i()``."""
    self.end_i()
def start_code(self, attrs):
    """Start handler for ``code``; delegates to ``start_tt(attrs)``."""
    self.start_tt(attrs)
def end_code(self):
    """End handler for ``code``; delegates to ``end_tt()``."""
    self.end_tt()
def start_em(self, attrs):
    """Start handler for ``em``; delegates to ``start_i(attrs)``."""
    self.start_i(attrs)
def end_em(self):
    """End handler for ``em``; delegates to ``end_i()``."""
    self.end_i()
def start_kbd(self, attrs):
    """Start handler for ``kbd``; delegates to ``start_tt(attrs)``."""
    self.start_tt(attrs)
def end_kbd(self):
    """End handler for ``kbd``; delegates to ``end_tt()``."""
    self.end_tt()
def start_samp(self, attrs):
    """Start handler for ``samp``; delegates to ``start_tt(attrs)``."""
    self.start_tt(attrs)
def end_samp(self):
    """End handler for ``samp``; delegates to ``end_tt()``."""
    self.end_tt()
def start_strong(self, attrs):
    """Start handler for ``strong``; delegates to ``start_b(attrs)``."""
    self.start_b(attrs)
def end_strong(self):
    """End handler for ``strong``; delegates to ``end_b()``."""
    self.end_b()
def start_var(self, attrs):
    """Start handler for ``var``; delegates to ``start_i(attrs)``."""
    self.start_i(attrs)
def end_var(self):
    """End handler for ``var``; delegates to ``end_i()``."""
    self.end_i()
366 # Typographic Elements
def start_i(self, attrs):
    """Start handler for ``i``.

    Pushes a font spec whose second slot is 1 and whose other slots are
    ``AS_IS``. *attrs* is not used.
    """
    italic_font = (AS_IS, 1, AS_IS, AS_IS)
    self.formatter.push_font(italic_font)
371 self
.formatter
.pop_font()
def start_b(self, attrs):
    """Start handler for ``b``.

    Pushes a font spec whose third slot is 1 and whose other slots are
    ``AS_IS``. *attrs* is not used.
    """
    bold_font = (AS_IS, AS_IS, 1, AS_IS)
    self.formatter.push_font(bold_font)
376 self
.formatter
.pop_font()
def start_tt(self, attrs):
    """Start handler for ``tt``.

    Pushes a font spec whose fourth slot is 1 and whose other slots are
    ``AS_IS``. *attrs* is not used.
    """
    teletype_font = (AS_IS, AS_IS, AS_IS, 1)
    self.formatter.push_font(teletype_font)
381 self
.formatter
.pop_font()
383 def start_a(self
, attrs
):
387 for attrname
, value
in attrs
:
388 value
= value
.strip()
389 if attrname
== 'href':
391 if attrname
== 'name':
393 if attrname
== 'type':
395 self
.anchor_bgn(href
, name
, type)
def do_br(self, attrs):
    """Handler for ``br``: asks the formatter to add a line break.

    *attrs* is not used.
    """
    fmt = self.formatter
    fmt.add_line_break()
405 # --- Horizontal Rule
def do_hr(self, attrs):
    """Handler for ``hr``: asks the formatter to add a horizontal rule.

    *attrs* is not used.
    """
    fmt = self.formatter
    fmt.add_hor_rule()
412 def do_img(self
, attrs
):
419 for attrname
, value
in attrs
:
420 if attrname
== 'align':
422 if attrname
== 'alt':
424 if attrname
== 'ismap':
426 if attrname
== 'src':
428 if attrname
== 'width':
429 try: width
= int(value
)
430 except ValueError: pass
431 if attrname
== 'height':
432 try: height
= int(value
)
433 except ValueError: pass
434 self
.handle_image(src
, alt
, ismap
, align
, width
, height
)
436 # --- Really Old Unofficial Deprecated Stuff
def do_plaintext(self, attrs):
    """Handler for ``plaintext``.

    Behaves like ``start_pre(attrs)`` and then calls ``setnomoretags()``.
    """
    self.start_pre(attrs)
    # Tell the SGML parser
    self.setnomoretags()
444 def unknown_starttag(self
, tag
, attrs
):
447 def unknown_endtag(self
, tag
):
451 def test(args
= None):
452 import sys
, formatter
457 silent
= args
and args
[0] == '-s'
477 if f
is not sys
.stdin
:
481 f
= formatter
.NullFormatter()
483 f
= formatter
.AbstractFormatter(formatter
.DumbWriter())
490 if __name__
== '__main__':