Add better error reporting for MemoryErrors caused by str->float conversions.
[python.git] / Lib / htmllib.py
blob44647dbf026c054c563a78dee19020927cfd5d88
1 """HTML 2.0 parser.
3 See the HTML 2.0 specification:
4 http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
5 """
7 from warnings import warnpy3k
8 warnpy3k("the htmllib module has been removed in Python 3.0",
9 stacklevel=2)
10 del warnpy3k
12 import sgmllib
14 from formatter import AS_IS
16 __all__ = ["HTMLParser", "HTMLParseError"]
class HTMLParseError(sgmllib.SGMLParseError):
    """Exception signalling that an HTML document could not be parsed.

    Derives from sgmllib.SGMLParseError, so handlers written for the SGML
    error also catch this one.
    """
class HTMLParser(sgmllib.SGMLParser):
    """This is the basic HTML parser class.

    It supports all entity names required by the XHTML 1.0 Recommendation.
    It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
    elements.

    Tag handlers follow three naming patterns: start_<tag>(attrs) and
    end_<tag>() for container elements, and do_<tag>(attrs) for elements
    without an end tag.  In every handler, attrs is a sequence of
    (name, value) pairs.  Output is produced by calling methods on the
    formatter object given to the constructor.

    Font tuples handed to formatter.push_font() have four slots.  Judging
    by the handlers below they are (size-or-heading-name, italic, bold,
    teletype); AS_IS leaves a slot unchanged.
    """

    # Entity name -> replacement table, exposed as a class attribute.
    from htmlentitydefs import entitydefs

    def __init__(self, formatter, verbose=0):
        """Creates an instance of the HTMLParser class.

        The formatter parameter is the formatter instance associated with
        the parser.  The verbose parameter is passed through unchanged to
        sgmllib.SGMLParser.__init__().

        """
        sgmllib.SGMLParser.__init__(self, verbose)
        self.formatter = formatter

    def error(self, message):
        """Report a parse failure by raising HTMLParseError(message)."""
        raise HTMLParseError(message)

    def reset(self):
        """Reset the base parser and all HTML-specific parser state."""
        # NOTE(review): none of these attributes are assigned in __init__,
        # so this presumably runs from the base-class constructor -- confirm
        # against sgmllib.
        sgmllib.SGMLParser.reset(self)
        self.savedata = None    # text buffer while saving, else None
        self.isindex = 0        # set to 1 once <ISINDEX> is seen
        self.title = None       # text collected between <TITLE> tags
        self.base = None        # HREF of the last <BASE> tag
        self.anchor = None      # HREF of the anchor currently open
        self.anchorlist = []    # every non-empty anchor HREF, in order
        self.nofill = 0         # nesting depth of literal (<PRE>-like) text
        self.list_stack = []    # open lists as [kind, label, counter]

    # ------ Methods used internally; some may be overridden

    # --- Formatter interface, taking care of 'savedata' mode;
    # shouldn't need to be overridden

    def handle_data(self, data):
        """Route character data to the save buffer or to the formatter.

        While saving is active the data is appended to the buffer.
        Otherwise it goes to the formatter as literal data when nofill is
        set and as flowing data when it is not.
        """
        if self.savedata is not None:
            self.savedata = self.savedata + data
        elif self.nofill:
            self.formatter.add_literal_data(data)
        else:
            self.formatter.add_flowing_data(data)

    # --- Hooks to save data; shouldn't need to be overridden

    def save_bgn(self):
        """Begins saving character data in a buffer instead of sending it
        to the formatter object.

        Retrieve the stored data via the save_end() method.  Use of the
        save_bgn() / save_end() pair may not be nested.

        """
        self.savedata = ''

    def save_end(self):
        """Ends buffering character data and returns all data saved since
        the preceding call to the save_bgn() method.

        If the nofill flag is false, whitespace is collapsed to single
        spaces.  A call to this method without a preceding call to the
        save_bgn() method will raise a TypeError exception.

        """
        saved = self.savedata
        if saved is None:
            # Honour the documented contract explicitly, whatever the
            # value of nofill, instead of failing later with an
            # AttributeError from None.split() or returning None.
            raise TypeError(
                "save_end() called without a preceding save_bgn()")
        self.savedata = None
        if not self.nofill:
            saved = ' '.join(saved.split())
        return saved

    # --- Hooks for anchors; should probably be overridden

    def anchor_bgn(self, href, name, type):
        """This method is called at the start of an anchor region.

        The arguments correspond to the attributes of the <A> tag with
        the same names.  The default implementation maintains a list of
        hyperlinks (defined by the HREF attribute for <A> tags) within
        the document.  The list of hyperlinks is available as the data
        attribute anchorlist.

        The name and type arguments are accepted but not used by this
        default implementation.

        """
        self.anchor = href
        if self.anchor:
            self.anchorlist.append(href)

    def anchor_end(self):
        """This method is called at the end of an anchor region.

        The default implementation adds a textual footnote marker using an
        index into the list of hyperlinks created by the anchor_bgn()
        method.

        """
        if self.anchor:
            # The marker is the 1-based position of the most recent link.
            self.handle_data("[%d]" % len(self.anchorlist))
            self.anchor = None

    # --- Hook for images; should probably be overridden

    def handle_image(self, src, alt, *args):
        """This method is called to handle images.

        The default implementation simply passes the alt value to the
        handle_data() method.  do_img() also supplies ismap, align, width
        and height as extra positional arguments; they are ignored here.

        """
        self.handle_data(alt)

    # --------- Top level elements

    def start_html(self, attrs): pass
    def end_html(self): pass

    def start_head(self, attrs): pass
    def end_head(self): pass

    def start_body(self, attrs): pass
    def end_body(self): pass

    # ------ Head elements

    def start_title(self, attrs):
        """Start collecting the document title instead of emitting it."""
        self.save_bgn()

    def end_title(self):
        """Store the collected title text in self.title."""
        self.title = self.save_end()

    def do_base(self, attrs):
        """Record the HREF attribute of a <BASE> tag in self.base."""
        for attrname, value in attrs:
            if attrname == 'href':
                self.base = value

    def do_isindex(self, attrs):
        """Flag that the document contains an <ISINDEX> tag."""
        self.isindex = 1

    def do_link(self, attrs):
        pass

    def do_meta(self, attrs):
        pass

    def do_nextid(self, attrs): # Deprecated
        pass

    # ------ Body elements

    # --- Headings
    # Every heading ends the current paragraph with one blank line and
    # pushes a font whose first slot names the heading level and whose
    # third slot is set to 1.

    def start_h1(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h1', 0, 1, 0))

    def end_h1(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h2(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h2', 0, 1, 0))

    def end_h2(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h3(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h3', 0, 1, 0))

    def end_h3(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h4(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h4', 0, 1, 0))

    def end_h4(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h5(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h5', 0, 1, 0))

    def end_h5(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h6(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h6', 0, 1, 0))

    def end_h6(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    # --- Block Structuring Elements

    def do_p(self, attrs):
        """End the current paragraph, leaving one blank line."""
        self.formatter.end_paragraph(1)

    def start_pre(self, attrs):
        """Enter preformatted text: set the fourth font slot, bump nofill."""
        self.formatter.end_paragraph(1)
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
        self.nofill += 1

    def end_pre(self):
        """Leave preformatted text; nofill never drops below zero."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()
        self.nofill = max(0, self.nofill - 1)

    def start_xmp(self, attrs):
        self.start_pre(attrs)
        self.setliteral('xmp') # Tell SGML parser

    def end_xmp(self):
        self.end_pre()

    def start_listing(self, attrs):
        self.start_pre(attrs)
        self.setliteral('listing') # Tell SGML parser

    def end_listing(self):
        self.end_pre()

    def start_address(self, attrs):
        self.formatter.end_paragraph(0)
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_address(self):
        self.formatter.end_paragraph(0)
        self.formatter.pop_font()

    def start_blockquote(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_margin('blockquote')

    def end_blockquote(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_margin()

    # --- List Elements
    # Entries on list_stack are three-item lists: [kind, label, counter].

    def start_ul(self, attrs):
        """Open an unordered list with '*' as the item label."""
        # A blank line is requested only for an outermost list.
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ul')
        self.list_stack.append(['ul', '*', 0])

    def end_ul(self):
        """Close the innermost list and restore the margin."""
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def do_li(self, attrs):
        """Emit the label for the next item of the innermost list.

        The innermost list's counter is incremented and stored back on the
        stack.  Outside any list the label is '*' with a counter of 0.
        """
        self.formatter.end_paragraph(0)
        if self.list_stack:
            top = self.list_stack[-1]
            _kind, label, counter = top
            counter = counter + 1
            top[2] = counter
        else:
            label, counter = '*', 0
        self.formatter.add_label_data(label, counter)

    def start_ol(self, attrs):
        """Open an ordered list.

        The label format defaults to '1.' and can be overridden with the
        TYPE attribute; a one-character TYPE value gets a '.' appended.
        """
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ol')
        label = '1.'
        for attrname, value in attrs:
            if attrname == 'type':
                if len(value) == 1:
                    value = value + '.'
                label = value
        self.list_stack.append(['ol', label, 0])

    def end_ol(self):
        """Close the innermost list and restore the margin."""
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def start_menu(self, attrs):
        self.start_ul(attrs)

    def end_menu(self):
        self.end_ul()

    def start_dir(self, attrs):
        self.start_ul(attrs)

    def end_dir(self):
        self.end_ul()

    def start_dl(self, attrs):
        """Open a definition list (no margin is pushed for the list)."""
        self.formatter.end_paragraph(1)
        self.list_stack.append(['dl', '', 0])

    def end_dl(self):
        """Close any open <DD> and then the definition list itself."""
        self.ddpop(1)
        if self.list_stack:
            del self.list_stack[-1]

    def do_dt(self, attrs):
        """Start a definition term, closing any open <DD> first."""
        self.ddpop()

    def do_dd(self, attrs):
        """Start a definition body, indented with a 'dd' margin."""
        self.ddpop()
        self.formatter.push_margin('dd')
        self.list_stack.append(['dd', '', 0])

    def ddpop(self, bl=0):
        """End the paragraph and close the innermost <DD> if one is open.

        bl is forwarded to formatter.end_paragraph().  The margin is
        popped only when the top entry of list_stack is a 'dd'.
        """
        self.formatter.end_paragraph(bl)
        if self.list_stack and self.list_stack[-1][0] == 'dd':
            del self.list_stack[-1]
            self.formatter.pop_margin()

    # --- Phrase Markup

    # Idiomatic Elements

    def start_cite(self, attrs): self.start_i(attrs)
    def end_cite(self): self.end_i()

    def start_code(self, attrs): self.start_tt(attrs)
    def end_code(self): self.end_tt()

    def start_em(self, attrs): self.start_i(attrs)
    def end_em(self): self.end_i()

    def start_kbd(self, attrs): self.start_tt(attrs)
    def end_kbd(self): self.end_tt()

    def start_samp(self, attrs): self.start_tt(attrs)
    def end_samp(self): self.end_tt()

    def start_strong(self, attrs): self.start_b(attrs)
    def end_strong(self): self.end_b()

    def start_var(self, attrs): self.start_i(attrs)
    def end_var(self): self.end_i()

    # Typographic Elements

    def start_i(self, attrs):
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_i(self):
        self.formatter.pop_font()

    def start_b(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))

    def end_b(self):
        self.formatter.pop_font()

    def start_tt(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))

    def end_tt(self):
        self.formatter.pop_font()

    def start_a(self, attrs):
        """Collect HREF, NAME and TYPE and hand them to anchor_bgn().

        Every attribute value is stripped of surrounding whitespace; TYPE
        is additionally lower-cased.  Missing attributes default to ''.
        """
        href = ''
        name = ''
        anchor_type = ''
        for attrname, value in attrs:
            value = value.strip()
            if attrname == 'href':
                href = value
            elif attrname == 'name':
                name = value
            elif attrname == 'type':
                anchor_type = value.lower()
        self.anchor_bgn(href, name, anchor_type)

    def end_a(self):
        self.anchor_end()

    # --- Line Break

    def do_br(self, attrs):
        self.formatter.add_line_break()

    # --- Horizontal Rule

    def do_hr(self, attrs):
        self.formatter.add_hor_rule()

    # --- Image

    def do_img(self, attrs):
        """Collect the <IMG> attributes and pass them to handle_image().

        Defaults: alt is '(image)', width and height are 0, and the
        remaining attributes are ''.  WIDTH and HEIGHT values that are not
        valid integers are ignored, leaving the default in place.
        """
        align = ''
        alt = '(image)'
        ismap = ''
        src = ''
        width = 0
        height = 0
        for attrname, value in attrs:
            if attrname == 'align':
                align = value
            elif attrname == 'alt':
                alt = value
            elif attrname == 'ismap':
                ismap = value
            elif attrname == 'src':
                src = value
            elif attrname == 'width':
                try:
                    width = int(value)
                except ValueError:
                    pass
            elif attrname == 'height':
                try:
                    height = int(value)
                except ValueError:
                    pass
        self.handle_image(src, alt, ismap, align, width, height)

    # --- Really Old Unofficial Deprecated Stuff

    def do_plaintext(self, attrs):
        self.start_pre(attrs)
        self.setnomoretags() # Tell SGML parser

    # --- Unhandled tags

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass
def test(args = None):
    """Command-line demo: parse an HTML file and render it as text.

    args is the argument list; when it is empty or None, sys.argv[1:] is
    used.  The list passed in is never modified.

    A leading '-s' selects silent mode, in which output is discarded by a
    formatter.NullFormatter.  Otherwise the document is rendered through
    formatter.AbstractFormatter wrapping formatter.DumbWriter.

    The next argument names the input file.  '-' reads standard input and
    the default is 'test.html'.  If the file cannot be opened, the error
    is written to standard output and sys.exit(1) is called.
    """
    import sys
    import formatter

    if not args:
        args = sys.argv[1:]
    else:
        # Copy so that stripping '-s' below cannot alter the caller's list
        # (and so that a tuple is accepted as well).
        args = list(args)

    silent = args and args[0] == '-s'
    if silent:
        del args[0]

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        data = sys.stdin.read()
    else:
        try:
            stream = open(filename, 'r')
        except IOError as msg:
            sys.stdout.write("%s : %s\n" % (filename, msg))
            sys.exit(1)
        # Close the file even when read() fails.
        try:
            data = stream.read()
        finally:
            stream.close()

    if silent:
        fmt = formatter.NullFormatter()
    else:
        fmt = formatter.AbstractFormatter(formatter.DumbWriter())

    parser = HTMLParser(fmt)
    parser.feed(data)
    parser.close()
# Run the command-line demo when this module is executed as a script.
if __name__ == '__main__':
    test()