Patch by Jeremy Katz (SF #1609407)
[python.git] / Lib / htmllib.py
blob24a2e2f3c71ff45f9649519a81cf46cc99162028
1 """HTML 2.0 parser.
3 See the HTML 2.0 specification:
4 http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
5 """
7 import sgmllib
9 from formatter import AS_IS
11 __all__ = ["HTMLParser", "HTMLParseError"]
14 class HTMLParseError(sgmllib.SGMLParseError):
15 """Error raised when an HTML document can't be parsed."""
18 class HTMLParser(sgmllib.SGMLParser):
19 """This is the basic HTML parser class.
21 It supports all entity names required by the XHTML 1.0 Recommendation.
22 It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
23 elements.
25 """
27 from htmlentitydefs import entitydefs
29 def __init__(self, formatter, verbose=0):
30 """Creates an instance of the HTMLParser class.
32 The formatter parameter is the formatter instance associated with
33 the parser.
35 """
36 sgmllib.SGMLParser.__init__(self, verbose)
37 self.formatter = formatter
39 def error(self, message):
40 raise HTMLParseError(message)
42 def reset(self):
43 sgmllib.SGMLParser.reset(self)
44 self.savedata = None
45 self.isindex = 0
46 self.title = None
47 self.base = None
48 self.anchor = None
49 self.anchorlist = []
50 self.nofill = 0
51 self.list_stack = []
53 # ------ Methods used internally; some may be overridden
55 # --- Formatter interface, taking care of 'savedata' mode;
56 # shouldn't need to be overridden
58 def handle_data(self, data):
59 if self.savedata is not None:
60 self.savedata = self.savedata + data
61 else:
62 if self.nofill:
63 self.formatter.add_literal_data(data)
64 else:
65 self.formatter.add_flowing_data(data)
67 # --- Hooks to save data; shouldn't need to be overridden
69 def save_bgn(self):
70 """Begins saving character data in a buffer instead of sending it
71 to the formatter object.
73 Retrieve the stored data via the save_end() method. Use of the
74 save_bgn() / save_end() pair may not be nested.
76 """
77 self.savedata = ''
79 def save_end(self):
80 """Ends buffering character data and returns all data saved since
81 the preceding call to the save_bgn() method.
83 If the nofill flag is false, whitespace is collapsed to single
84 spaces. A call to this method without a preceding call to the
85 save_bgn() method will raise a TypeError exception.
87 """
88 data = self.savedata
89 self.savedata = None
90 if not self.nofill:
91 data = ' '.join(data.split())
92 return data
94 # --- Hooks for anchors; should probably be overridden
96 def anchor_bgn(self, href, name, type):
97 """This method is called at the start of an anchor region.
99 The arguments correspond to the attributes of the <A> tag with
100 the same names. The default implementation maintains a list of
101 hyperlinks (defined by the HREF attribute for <A> tags) within
102 the document. The list of hyperlinks is available as the data
103 attribute anchorlist.
106 self.anchor = href
107 if self.anchor:
108 self.anchorlist.append(href)
110 def anchor_end(self):
111 """This method is called at the end of an anchor region.
113 The default implementation adds a textual footnote marker using an
114 index into the list of hyperlinks created by the anchor_bgn()method.
117 if self.anchor:
118 self.handle_data("[%d]" % len(self.anchorlist))
119 self.anchor = None
121 # --- Hook for images; should probably be overridden
123 def handle_image(self, src, alt, *args):
124 """This method is called to handle images.
126 The default implementation simply passes the alt value to the
127 handle_data() method.
130 self.handle_data(alt)
132 # --------- Top level elememts
134 def start_html(self, attrs): pass
135 def end_html(self): pass
137 def start_head(self, attrs): pass
138 def end_head(self): pass
140 def start_body(self, attrs): pass
141 def end_body(self): pass
143 # ------ Head elements
145 def start_title(self, attrs):
146 self.save_bgn()
148 def end_title(self):
149 self.title = self.save_end()
151 def do_base(self, attrs):
152 for a, v in attrs:
153 if a == 'href':
154 self.base = v
156 def do_isindex(self, attrs):
157 self.isindex = 1
159 def do_link(self, attrs):
160 pass
162 def do_meta(self, attrs):
163 pass
165 def do_nextid(self, attrs): # Deprecated
166 pass
168 # ------ Body elements
170 # --- Headings
172 def start_h1(self, attrs):
173 self.formatter.end_paragraph(1)
174 self.formatter.push_font(('h1', 0, 1, 0))
176 def end_h1(self):
177 self.formatter.end_paragraph(1)
178 self.formatter.pop_font()
180 def start_h2(self, attrs):
181 self.formatter.end_paragraph(1)
182 self.formatter.push_font(('h2', 0, 1, 0))
184 def end_h2(self):
185 self.formatter.end_paragraph(1)
186 self.formatter.pop_font()
188 def start_h3(self, attrs):
189 self.formatter.end_paragraph(1)
190 self.formatter.push_font(('h3', 0, 1, 0))
192 def end_h3(self):
193 self.formatter.end_paragraph(1)
194 self.formatter.pop_font()
196 def start_h4(self, attrs):
197 self.formatter.end_paragraph(1)
198 self.formatter.push_font(('h4', 0, 1, 0))
200 def end_h4(self):
201 self.formatter.end_paragraph(1)
202 self.formatter.pop_font()
204 def start_h5(self, attrs):
205 self.formatter.end_paragraph(1)
206 self.formatter.push_font(('h5', 0, 1, 0))
208 def end_h5(self):
209 self.formatter.end_paragraph(1)
210 self.formatter.pop_font()
212 def start_h6(self, attrs):
213 self.formatter.end_paragraph(1)
214 self.formatter.push_font(('h6', 0, 1, 0))
216 def end_h6(self):
217 self.formatter.end_paragraph(1)
218 self.formatter.pop_font()
220 # --- Block Structuring Elements
222 def do_p(self, attrs):
223 self.formatter.end_paragraph(1)
225 def start_pre(self, attrs):
226 self.formatter.end_paragraph(1)
227 self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
228 self.nofill = self.nofill + 1
230 def end_pre(self):
231 self.formatter.end_paragraph(1)
232 self.formatter.pop_font()
233 self.nofill = max(0, self.nofill - 1)
235 def start_xmp(self, attrs):
236 self.start_pre(attrs)
237 self.setliteral('xmp') # Tell SGML parser
239 def end_xmp(self):
240 self.end_pre()
242 def start_listing(self, attrs):
243 self.start_pre(attrs)
244 self.setliteral('listing') # Tell SGML parser
246 def end_listing(self):
247 self.end_pre()
249 def start_address(self, attrs):
250 self.formatter.end_paragraph(0)
251 self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
253 def end_address(self):
254 self.formatter.end_paragraph(0)
255 self.formatter.pop_font()
257 def start_blockquote(self, attrs):
258 self.formatter.end_paragraph(1)
259 self.formatter.push_margin('blockquote')
261 def end_blockquote(self):
262 self.formatter.end_paragraph(1)
263 self.formatter.pop_margin()
265 # --- List Elements
267 def start_ul(self, attrs):
268 self.formatter.end_paragraph(not self.list_stack)
269 self.formatter.push_margin('ul')
270 self.list_stack.append(['ul', '*', 0])
272 def end_ul(self):
273 if self.list_stack: del self.list_stack[-1]
274 self.formatter.end_paragraph(not self.list_stack)
275 self.formatter.pop_margin()
277 def do_li(self, attrs):
278 self.formatter.end_paragraph(0)
279 if self.list_stack:
280 [dummy, label, counter] = top = self.list_stack[-1]
281 top[2] = counter = counter+1
282 else:
283 label, counter = '*', 0
284 self.formatter.add_label_data(label, counter)
286 def start_ol(self, attrs):
287 self.formatter.end_paragraph(not self.list_stack)
288 self.formatter.push_margin('ol')
289 label = '1.'
290 for a, v in attrs:
291 if a == 'type':
292 if len(v) == 1: v = v + '.'
293 label = v
294 self.list_stack.append(['ol', label, 0])
296 def end_ol(self):
297 if self.list_stack: del self.list_stack[-1]
298 self.formatter.end_paragraph(not self.list_stack)
299 self.formatter.pop_margin()
301 def start_menu(self, attrs):
302 self.start_ul(attrs)
304 def end_menu(self):
305 self.end_ul()
307 def start_dir(self, attrs):
308 self.start_ul(attrs)
310 def end_dir(self):
311 self.end_ul()
313 def start_dl(self, attrs):
314 self.formatter.end_paragraph(1)
315 self.list_stack.append(['dl', '', 0])
317 def end_dl(self):
318 self.ddpop(1)
319 if self.list_stack: del self.list_stack[-1]
321 def do_dt(self, attrs):
322 self.ddpop()
324 def do_dd(self, attrs):
325 self.ddpop()
326 self.formatter.push_margin('dd')
327 self.list_stack.append(['dd', '', 0])
329 def ddpop(self, bl=0):
330 self.formatter.end_paragraph(bl)
331 if self.list_stack:
332 if self.list_stack[-1][0] == 'dd':
333 del self.list_stack[-1]
334 self.formatter.pop_margin()
336 # --- Phrase Markup
338 # Idiomatic Elements
340 def start_cite(self, attrs): self.start_i(attrs)
341 def end_cite(self): self.end_i()
343 def start_code(self, attrs): self.start_tt(attrs)
344 def end_code(self): self.end_tt()
346 def start_em(self, attrs): self.start_i(attrs)
347 def end_em(self): self.end_i()
349 def start_kbd(self, attrs): self.start_tt(attrs)
350 def end_kbd(self): self.end_tt()
352 def start_samp(self, attrs): self.start_tt(attrs)
353 def end_samp(self): self.end_tt()
355 def start_strong(self, attrs): self.start_b(attrs)
356 def end_strong(self): self.end_b()
358 def start_var(self, attrs): self.start_i(attrs)
359 def end_var(self): self.end_i()
361 # Typographic Elements
363 def start_i(self, attrs):
364 self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
365 def end_i(self):
366 self.formatter.pop_font()
368 def start_b(self, attrs):
369 self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
370 def end_b(self):
371 self.formatter.pop_font()
373 def start_tt(self, attrs):
374 self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
375 def end_tt(self):
376 self.formatter.pop_font()
378 def start_a(self, attrs):
379 href = ''
380 name = ''
381 type = ''
382 for attrname, value in attrs:
383 value = value.strip()
384 if attrname == 'href':
385 href = value
386 if attrname == 'name':
387 name = value
388 if attrname == 'type':
389 type = value.lower()
390 self.anchor_bgn(href, name, type)
392 def end_a(self):
393 self.anchor_end()
395 # --- Line Break
397 def do_br(self, attrs):
398 self.formatter.add_line_break()
400 # --- Horizontal Rule
402 def do_hr(self, attrs):
403 self.formatter.add_hor_rule()
405 # --- Image
407 def do_img(self, attrs):
408 align = ''
409 alt = '(image)'
410 ismap = ''
411 src = ''
412 width = 0
413 height = 0
414 for attrname, value in attrs:
415 if attrname == 'align':
416 align = value
417 if attrname == 'alt':
418 alt = value
419 if attrname == 'ismap':
420 ismap = value
421 if attrname == 'src':
422 src = value
423 if attrname == 'width':
424 try: width = int(value)
425 except ValueError: pass
426 if attrname == 'height':
427 try: height = int(value)
428 except ValueError: pass
429 self.handle_image(src, alt, ismap, align, width, height)
431 # --- Really Old Unofficial Deprecated Stuff
433 def do_plaintext(self, attrs):
434 self.start_pre(attrs)
435 self.setnomoretags() # Tell SGML parser
437 # --- Unhandled tags
439 def unknown_starttag(self, tag, attrs):
440 pass
442 def unknown_endtag(self, tag):
443 pass
def test(args = None):
    """Parse one HTML file and render it through a formatter (self-test).

    args is a list of command-line style arguments; when it is empty or
    None, sys.argv[1:] is used instead.  A leading '-s' selects a
    formatter.NullFormatter (silent run); otherwise the output goes
    through formatter.AbstractFormatter(formatter.DumbWriter()).  The
    next argument names the input file, defaulting to 'test.html'; the
    name '-' reads standard input.

    If the file cannot be opened, "<name> : <error>" is written to
    standard output and the process exits with status 1.  A list passed
    in as args is not modified.
    """
    import sys
    import formatter

    if not args:
        args = sys.argv[1:]

    silent = args and args[0] == '-s'
    if silent:
        # Slice instead of 'del args[0]' so that a caller-supplied list
        # is left untouched.
        args = args[1:]

    if args:
        path = args[0]
    else:
        path = 'test.html'

    if path == '-':
        data = sys.stdin.read()
    else:
        try:
            infile = open(path, 'r')
        except IOError:
            # sys.exc_info() and an explicit write keep this block free of
            # version-specific 'except'/'print' syntax; the text produced
            # is the same as 'print path, ":", msg'.
            msg = sys.exc_info()[1]
            sys.stdout.write("%s : %s\n" % (path, msg))
            sys.exit(1)
        try:
            data = infile.read()
        finally:
            # Close the file even when read() fails.
            infile.close()

    if silent:
        fmt = formatter.NullFormatter()
    else:
        fmt = formatter.AbstractFormatter(formatter.DumbWriter())

    p = HTMLParser(fmt)
    p.feed(data)
    p.close()
# Executed as a script: run the self-test driver on the command line
# arguments.
if __name__ == '__main__':
    test()