Updated documentation for findCaller() to indicate that a 3-tuple is now returned...
[python.git] / Lib / sgmllib.py
blob3ab57c23071707b0a23c29e58f35afaa09306f37
1 """A parser for SGML, using the derived class as a static DTD."""
3 # XXX This only supports those SGML features used by HTML.
5 # XXX There should be a way to distinguish between PCDATA (parsed
6 # character data -- the normal case), RCDATA (replaceable character
7 # data -- only char and entity references and end tags are special)
8 # and CDATA (character data -- only end tags are special). RCDATA is
9 # not supported at all.
12 import markupbase
13 import re
# Names exported by "from sgmllib import *".
__all__ = ["SGMLParser", "SGMLParseError"]

# Regular expressions used for parsing

# Next character that can start markup; goahead() treats everything
# before it as plain character data.
interesting = re.compile('[&<]')
# Fallback tried by goahead() at a '&' or '<' that nothing else
# accepted: a possibly truncated reference, start tag, end tag or
# declaration.  If the match runs to the end of the buffer the
# construct is considered incomplete and is kept for later.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        '<([a-zA-Z][^<>]*|'
                        '/([a-zA-Z][^<>]*)?|'
                        '![^<>]*)?')

# Named entity reference; group 1 is the name.  The pattern also
# consumes the single character that terminates the name.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Decimal character reference; group 1 is the digits.  As above, one
# terminating non-digit is consumed.
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' followed by a letter, or the '<>' shorthand.
starttagopen = re.compile('<[>a-zA-Z]')
# Start of the SGML short tag form <tag/data/ ...
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
# ... and the complete form: group 1 is the tag, group 2 the data.
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# End of a processing instruction.
piclose = re.compile('>')
# Next angle bracket of either kind; used to find where a tag stops.
endbracket = re.compile('[<>]')
# Tag name.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: group 1 is the name, group 2 the optional "=value"
# part as a whole, group 3 the value itself (single-quoted,
# double-quoted, or a run of unquoted characters).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
class SGMLParseError(RuntimeError):
    """Error raised by the SGML parser for any kind of parse failure."""
44 # SGML parser base class -- find tags and call handler functions.
45 # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
46 # The dtd is defined by deriving a class which defines methods
47 # with special names to handle tags: start_foo and end_foo to handle
48 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
49 # (Tags are converted to lower case for this purpose.) The data
50 # between tags is passed to the parser by calling self.handle_data()
51 # with some data as argument (the data may be split up in arbitrary
52 # chunks). Entity references are passed by calling
53 # self.handle_entityref() with the entity reference as argument.
55 class SGMLParser(markupbase.ParserBase):
# Pattern for references inside attribute values -- derived classes
# may override.  Group 1 is an entity name, group 2 the digits of a
# decimal character reference (only one of the two is set), and
# group 3 the optional trailing ';'.
entity_or_charref = re.compile('&(?:'
  '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
  ')(;?)')
def __init__(self, verbose=0):
    """Create a parser in its initial state.

    verbose -- stored on the instance; when true, report_unbalanced()
               prints a diagnostic for unbalanced end tags.
    """
    self.verbose = verbose
    # All remaining state is established by reset().
    self.reset()
def reset(self):
    """Return the parser to its initial state.

    Any input that was fed but not yet processed is discarded.
    """
    self.rawdata = ''        # input received but not yet consumed
    self.stack = []          # tags opened through start_* handlers
    self.lasttag = '???'     # what the '<>' shorthand expands to
    self.nomoretags = 0
    self.literal = 0
    self.__starttag_text = None
    markupbase.ParserBase.reset(self)
def setnomoretags(self):
    """Treat all remaining input as literal data (CDATA) until EOF.

    Intended for derived classes only.
    """
    self.nomoretags = 1
    self.literal = 1
def setliteral(self, *args):
    """Switch the parser into literal (CDATA) mode.

    Any positional arguments are accepted and ignored.
    Intended for derived classes only.
    """
    self.literal = 1
def feed(self, data):
    """Hand more input text to the parser.

    May be called any number of times with chunks of any size
    (newlines included).  The text is appended to the pending
    buffer; the actual parsing happens in goahead().
    """
    self.rawdata += data
    self.goahead(0)
def close(self):
    """Process whatever input is still buffered, as if EOF was reached."""
    self.goahead(1)
def error(self, message):
    """Report a parse error by raising SGMLParseError(message)."""
    raise SGMLParseError(message)
108 # Internal -- handle data as far as reasonable. May leave state
109 # and data to be processed by a subsequent call. If 'end' is
110 # true, force handling all data as if followed by EOF marker.
def goahead(self, end):
    """Parse as much of self.rawdata as possible and call the handlers.

    Text between markup is passed to handle_data(); start tags, end
    tags, comments, processing instructions, declarations and
    references are handed to the corresponding parse_*/handle_*
    methods.  Input that cannot be processed yet (an incomplete
    construct at the end of the buffer) is left in self.rawdata.

    end -- if true, whatever remains after scanning is flushed through
           handle_data() instead of being kept.
    """
    rawdata = self.rawdata
    i = 0
    n = len(rawdata)
    while i < n:
        if self.nomoretags:
            # No markup is recognized any more: the rest is all data.
            self.handle_data(rawdata[i:n])
            i = n
            break
        match = interesting.search(rawdata, i)
        if match: j = match.start()
        else: j = n
        if i < j:
            self.handle_data(rawdata[i:j])
        i = j
        if i == n: break
        if rawdata[i] == '<':
            if starttagopen.match(rawdata, i):
                if self.literal:
                    # In literal mode a start tag is just text.
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                k = self.parse_starttag(i)
                if k < 0: break
                i = k
                continue
            if rawdata.startswith("</", i):
                k = self.parse_endtag(i)
                if k < 0: break
                i = k
                # Any end tag terminates literal mode.
                self.literal = 0
                continue
            if self.literal:
                if n > (i + 1):
                    self.handle_data("<")
                    i = i+1
                else:
                    # incomplete
                    break
                continue
            if rawdata.startswith("<!--", i):
                # Strictly speaking, a comment is --.*--
                # within a declaration tag <!...>.
                # This should be removed,
                # and comments handled only in parse_declaration.
                # NOTE(review): parse_comment is not defined in this
                # file; presumably inherited from markupbase.ParserBase.
                k = self.parse_comment(i)
                if k < 0: break
                i = k
                continue
            if rawdata.startswith("<?", i):
                k = self.parse_pi(i)
                if k < 0: break
                # parse_pi() returns a length, not an end index.
                i = i+k
                continue
            if rawdata.startswith("<!", i):
                # This is some sort of declaration; in "HTML as
                # deployed," this should only be the document type
                # declaration ("<!DOCTYPE html...>").
                # NOTE(review): parse_declaration is not defined in this
                # file; presumably inherited from markupbase.ParserBase.
                k = self.parse_declaration(i)
                if k < 0: break
                i = k
                continue
        elif rawdata[i] == '&':
            if self.literal:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            match = charref.match(rawdata, i)
            if match:
                name = match.group(1)
                self.handle_charref(name)
                i = match.end(0)
                # The pattern consumed one terminating character;
                # give it back unless it was the ';'.
                if rawdata[i-1] != ';': i = i-1
                continue
            match = entityref.match(rawdata, i)
            if match:
                name = match.group(1)
                self.handle_entityref(name)
                i = match.end(0)
                # Same one-character back-up as for charref above.
                if rawdata[i-1] != ';': i = i-1
                continue
        else:
            self.error('neither < nor & ??')
        # We get here only if incomplete matches but
        # nothing else
        match = incomplete.match(rawdata, i)
        if not match:
            self.handle_data(rawdata[i])
            i = i+1
            continue
        j = match.end(0)
        if j == n:
            break # Really incomplete
        self.handle_data(rawdata[i:j])
        i = j
    # end while
    if end and i < n:
        self.handle_data(rawdata[i:n])
        i = n
    # Keep the unprocessed tail for the next call.
    self.rawdata = rawdata[i:]
    # XXX if end: check for empty stack
# Extensions for the DOCTYPE scanner:
# NOTE(review): not referenced anywhere in this file; presumably read
# by the declaration parser in markupbase.ParserBase -- confirm.
_decl_otherchars = '='
216 # Internal -- parse processing instr, return length or -1 if not terminated
def parse_pi(self, i):
    """Parse the processing instruction that starts at index i.

    The text between '<?' and the next '>' is passed to handle_pi().
    Returns the length of the whole instruction (not an end index),
    or -1 if no closing '>' is in the buffer yet.
    """
    text = self.rawdata
    if text[i:i+2] != '<?':
        self.error('unexpected call to parse_pi()')
    closer = piclose.search(text, i + 2)
    if closer is None:
        return -1
    self.handle_pi(text[i+2:closer.start(0)])
    return closer.end(0) - i
def get_starttag_text(self):
    """Return the source text of the most recently parsed start tag.

    The value is None after reset() and while no start tag has been
    recorded.
    """
    return self.__starttag_text
# Internal -- handle starttag, return index just past the tag or -1 if not terminated
def parse_starttag(self, i):
    """Parse the start tag that begins at index i of self.rawdata.

    Handles the short form <tag/data/, the '<>' shorthand (repeat the
    last tag seen) and ordinary tags with attributes.  The tag's
    source text is recorded for get_starttag_text() and the tag is
    dispatched through finish_shorttag() or finish_starttag().

    Returns the index just past the tag, or -1 if the tag is not
    complete in the buffer yet.
    """
    self.__starttag_text = None
    begin = i
    text = self.rawdata

    if shorttagopen.match(text, i):
        # SGML shorthand: <tag/data/ == <tag>data</tag>
        # XXX Can data contain &... (entity or char refs)?
        # XXX Can data contain < or > (tag characters)?
        # XXX Can there be whitespace before the first /?
        short = shorttag.match(text, i)
        if short is None:
            return -1
        name, body = short.group(1, 2)
        self.__starttag_text = '<%s/' % name
        stop = short.end(0)
        self.finish_shorttag(name.lower(), body)
        self.__starttag_text = text[begin:short.end(1) + 1]
        return stop

    # XXX The following should skip matching quotes (' or ")
    # As a shortcut way to exit, this isn't so bad, but shouldn't
    # be used to locate the actual end of the start tag since the
    # < or > characters may be embedded in an attribute value.
    bracket = endbracket.search(text, i + 1)
    if bracket is None:
        return -1
    j = bracket.start(0)

    # Now parse the data between i+1 and j into a tag and attrs
    attrs = []
    if text[i:i+2] == '<>':
        # SGML shorthand: <> == <last open tag seen>
        k = j
        tag = self.lasttag
    else:
        found = tagfind.match(text, i + 1)
        if found is None:
            self.error('unexpected call to parse_starttag')
        k = found.end(0)
        tag = text[i+1:k].lower()
        self.lasttag = tag

    while k < j:
        attr = attrfind.match(text, k)
        if attr is None:
            break
        attrname, rest, attrvalue = attr.group(1, 2, 3)
        if rest:
            quote = attrvalue[:1]
            if quote in ("'", '"') and attrvalue[-1:] == quote:
                # strip quotes
                attrvalue = attrvalue[1:-1]
            attrvalue = self.entity_or_charref.sub(
                self._convert_ref, attrvalue)
        else:
            # A bare attribute name stands for itself.
            attrvalue = attrname
        attrs.append((attrname.lower(), attrvalue))
        k = attr.end(0)

    # Consume the closing '>'; a '<' belongs to the next construct.
    if text[j] == '>':
        j += 1
    self.__starttag_text = text[begin:j]
    self.finish_starttag(tag, attrs)
    return j
294 # Internal -- convert entity or character reference
def _convert_ref(self, match):
    """Return the replacement for one entity_or_charref match.

    Character references are converted with or without a trailing
    ';'; entity references only when the ';' is present.  If the
    conversion yields nothing, the reference text is returned instead.
    """
    entity, number, semicolon = match.groups()
    if number:
        return self.convert_charref(number) or \
               '&#%s%s' % (number, semicolon)
    if semicolon:
        return self.convert_entityref(entity) or \
               '&%s;' % entity
    return '&%s' % entity
305 # Internal -- parse endtag
def parse_endtag(self, i):
    """Parse the end tag ('</...') that begins at index i.

    The lower-cased, stripped tag name is passed to finish_endtag().
    Returns the index just past the tag, or -1 if no terminating
    angle bracket is in the buffer yet.
    """
    text = self.rawdata
    bracket = endbracket.search(text, i + 1)
    if bracket is None:
        return -1
    stop = bracket.start(0)
    name = text[i+2:stop].strip().lower()
    # Consume the closing '>'; a '<' belongs to the next construct.
    if text[stop] == '>':
        stop += 1
    self.finish_endtag(name)
    return stop
318 # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
def finish_shorttag(self, tag, data):
    """Process <tag/data/ exactly like <tag>data</tag>."""
    no_attrs = []
    self.finish_starttag(tag, no_attrs)
    self.handle_data(data)
    self.finish_endtag(tag)
324 # Internal -- finish processing of start tag
325 # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
def finish_starttag(self, tag, attrs):
    """Dispatch a start tag to the handler defined for it.

    Returns 1 if a start_<tag> method exists (the tag is pushed on
    self.stack), 0 if only a do_<tag> method exists, and -1 if
    neither exists (unknown_starttag() is called instead).
    """
    missing = object()

    balanced = getattr(self, 'start_' + tag, missing)
    if balanced is not missing:
        self.stack.append(tag)
        self.handle_starttag(tag, balanced, attrs)
        return 1

    standalone = getattr(self, 'do_' + tag, missing)
    if standalone is not missing:
        self.handle_starttag(tag, standalone, attrs)
        return 0

    self.unknown_starttag(tag, attrs)
    return -1
343 # Internal -- finish processing of end tag
def finish_endtag(self, tag):
    """Close open elements in response to an end tag.

    An empty tag ('</>') closes the innermost open element.  A named
    tag closes everything down to and including its innermost open
    occurrence.  A named tag that is not open closes nothing: it is
    reported through report_unbalanced() if an end_<tag> method
    exists, otherwise through unknown_endtag().
    """
    if tag:
        if tag not in self.stack:
            try:
                getattr(self, 'end_' + tag)
            except AttributeError:
                self.unknown_endtag(tag)
            else:
                self.report_unbalanced(tag)
            return
        # Position of the innermost (last) open occurrence of the tag.
        found = max(pos for pos, opened in enumerate(self.stack)
                    if opened == tag)
    else:
        found = len(self.stack) - 1
        if found < 0:
            self.unknown_endtag(tag)
            return

    # Unwind the stack, closing each element on the way.
    while len(self.stack) > found:
        innermost = self.stack[-1]
        method = getattr(self, 'end_' + innermost, None)
        if method:
            self.handle_endtag(innermost, method)
        else:
            self.unknown_endtag(innermost)
        self.stack.pop()
374 # Overridable -- handle start tag
def handle_starttag(self, tag, method, attrs):
    """Invoke the start-tag handler *method* with the attribute list.

    Overridable.  *tag* is not used by this default implementation.
    """
    method(attrs)
378 # Overridable -- handle end tag
def handle_endtag(self, tag, method):
    """Invoke the end-tag handler *method* without arguments.

    Overridable.  *tag* is not used by this default implementation.
    """
    method()
382 # Example -- report an unbalanced </...> tag.
383 def report_unbalanced(self, tag):
384 if self.verbose:
385 print '*** Unbalanced </' + tag + '>'
386 print '*** Stack:', self.stack
def convert_charref(self, name):
    """Convert a decimal character reference; may be overridden.

    *name* is the digit string of the reference.  Returns None when
    it is not a valid integer or lies outside 0..255; otherwise
    returns the result of convert_codepoint().
    """
    try:
        codepoint = int(name)
    except ValueError:
        return None
    if codepoint < 0 or codepoint > 255:
        return None
    return self.convert_codepoint(codepoint)
def convert_codepoint(self, codepoint):
    """Return the character for an integer code point, using chr()."""
    return chr(codepoint)
def handle_charref(self, name):
    """Handle a character reference; no need to override.

    The converted character is passed to handle_data(); a reference
    that cannot be converted is reported to unknown_charref().
    """
    text = self.convert_charref(name)
    if text is not None:
        self.handle_data(text)
    else:
        self.unknown_charref(name)
# Definition of entities -- derived classes may override
# Maps an entity name (without '&' and ';') to its replacement text;
# consulted by convert_entityref().
entitydefs = \
        {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
def convert_entityref(self, name):
    """Convert an entity reference to its replacement text.

    As an alternative to overriding this method, one can tailor the
    results by setting up the self.entitydefs mapping appropriately.

    Returns the replacement for *name*, or None if the entity is not
    defined in self.entitydefs.
    """
    table = self.entitydefs
    if name in table:
        return table[name]
    return None
def handle_entityref(self, name):
    """Handle an entity reference; no need to override.

    The replacement text from convert_entityref() is passed to
    handle_data(); an entity without a replacement is reported to
    unknown_entityref().
    """
    replacement = self.convert_entityref(name)
    if replacement is None:
        self.unknown_entityref(name)
    else:
        # Reuse the value already computed rather than converting a
        # second time, mirroring handle_charref().
        self.handle_data(replacement)
433 # Example -- handle data, should be overridden
def handle_data(self, data):
    """Receive a chunk of character data.

    Does nothing here; derived classes should override it.
    """
437 # Example -- handle comment, could be overridden
def handle_comment(self, data):
    """Receive the text of a comment.

    Does nothing here; derived classes may override it.
    """
441 # Example -- handle declaration, could be overridden
def handle_decl(self, decl):
    """Receive the text of a declaration.

    Does nothing here; derived classes may override it.
    """
445 # Example -- handle processing instruction, could be overridden
def handle_pi(self, data):
    """Receive the text of a processing instruction.

    Does nothing here; derived classes may override it.
    """
449 # To be overridden -- handlers for unknown objects
def unknown_starttag(self, tag, attrs):
    """Called for a start tag with no start_/do_ handler; does nothing."""
def unknown_endtag(self, tag):
    """Called for an end tag with no end_ handler; does nothing."""
def unknown_charref(self, ref):
    """Called for a character reference that cannot be converted; does nothing."""
def unknown_entityref(self, ref):
    """Called for an entity reference with no replacement; does nothing."""
456 class TestSGMLParser(SGMLParser):
458 def __init__(self, verbose=0):
459 self.testdata = ""
460 SGMLParser.__init__(self, verbose)
462 def handle_data(self, data):
463 self.testdata = self.testdata + data
464 if len(repr(self.testdata)) >= 70:
465 self.flush()
467 def flush(self):
468 data = self.testdata
469 if data:
470 self.testdata = ""
471 print 'data:', repr(data)
473 def handle_comment(self, data):
474 self.flush()
475 r = repr(data)
476 if len(r) > 68:
477 r = r[:32] + '...' + r[-32:]
478 print 'comment:', r
480 def unknown_starttag(self, tag, attrs):
481 self.flush()
482 if not attrs:
483 print 'start tag: <' + tag + '>'
484 else:
485 print 'start tag: <' + tag,
486 for name, value in attrs:
487 print name + '=' + '"' + value + '"',
488 print '>'
490 def unknown_endtag(self, tag):
491 self.flush()
492 print 'end tag: </' + tag + '>'
494 def unknown_entityref(self, ref):
495 self.flush()
496 print '*** unknown entity ref: &' + ref + ';'
498 def unknown_charref(self, ref):
499 self.flush()
500 print '*** unknown char ref: &#' + ref + ';'
502 def unknown_decl(self, data):
503 self.flush()
504 print '*** unknown decl: [' + data + ']'
506 def close(self):
507 SGMLParser.close(self)
508 self.flush()
511 def test(args = None):
512 import sys
514 if args is None:
515 args = sys.argv[1:]
517 if args and args[0] == '-s':
518 args = args[1:]
519 klass = SGMLParser
520 else:
521 klass = TestSGMLParser
523 if args:
524 file = args[0]
525 else:
526 file = 'test.html'
528 if file == '-':
529 f = sys.stdin
530 else:
531 try:
532 f = open(file, 'r')
533 except IOError, msg:
534 print file, ":", msg
535 sys.exit(1)
537 data = f.read()
538 if f is not sys.stdin:
539 f.close()
541 x = klass()
542 for c in data:
543 x.feed(c)
544 x.close()
# Run the self-test when this module is executed as a script.
if __name__ == '__main__':
    test()