I don't think we know of any tests that really leak anymore
[python.git] / Lib / sgmllib.py
blob3e85a910e049809a7a580bda779ce8c58af9e43d
1 """A parser for SGML, using the derived class as a static DTD."""
3 # XXX This only supports those SGML features used by HTML.
5 # XXX There should be a way to distinguish between PCDATA (parsed
6 # character data -- the normal case), RCDATA (replaceable character
7 # data -- only char and entity references and end tags are special)
8 # and CDATA (character data -- only end tags are special). RCDATA is
9 # not supported at all.
12 import markupbase
13 import re
# Names exported by "from sgmllib import *".
__all__ = ["SGMLParser", "SGMLParseError"]

# Regular expressions used for parsing

# The two characters that interrupt a run of plain character data.
interesting = re.compile('[&<]')
# A prefix of a reference, tag, end tag or declaration; goahead() uses it
# to decide whether the buffer ends in a construct that may still be
# completed by more input.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        '<([a-zA-Z][^<>]*|'
                           '/([a-zA-Z][^<>]*)?|'
                           '![^<>]*)?')

# "&name" followed by exactly one non-alphanumeric terminator character
# (group 1 is the name; the terminator is part of the match).
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# "&#digits" followed by exactly one non-digit terminator character
# (group 1 is the digits; the terminator is part of the match).
charref = re.compile('&#([0-9]+)[^0-9]')

# "<" followed by a letter, or the "<>" shorthand.
starttagopen = re.compile('<[>a-zA-Z]')
# Start of the "<tag/" short-tag form, and the complete "<tag/data/" form
# (group 1 is the tag name, group 2 the data).
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# Terminator searched for by parse_pi().
piclose = re.compile('>')
# Either bracket; searched for to find where a start or end tag stops.
endbracket = re.compile('[<>]')
# A tag name.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: group 1 is the name, group 2 the optional "= value" part,
# group 3 the value itself (single-quoted, double-quoted or unquoted).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
class SGMLParseError(RuntimeError):
    """Error type used to report every kind of parse failure."""
44 # SGML parser base class -- find tags and call handler functions.
45 # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
46 # The dtd is defined by deriving a class which defines methods
47 # with special names to handle tags: start_foo and end_foo to handle
48 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
49 # (Tags are converted to lower case for this purpose.) The data
50 # between tags is passed to the parser by calling self.handle_data()
51 # with some data as argument (the data may be split up in arbitrary
52 # chunks). Entity references are passed by calling
53 # self.handle_entityref() with the entity reference as argument.
class SGMLParser(markupbase.ParserBase):
    """SGML parser base class -- find tags and call handler functions.

    Usage: p = SGMLParser(); p.feed(data); ...; p.close().

    The DTD is defined by deriving a class whose specially named methods
    handle tags: start_foo and end_foo handle <foo> and </foo>, and do_foo
    handles a <foo> that has no matching end tag.  Tag names are converted
    to lower case before the lookup.  Text between tags is passed to
    handle_data(), possibly split into arbitrary chunks; entity and
    character references found in the text are passed to
    handle_entityref() and handle_charref().
    """

    def __init__(self, verbose=0):
        """Initialize and reset this instance.

        verbose -- when true, report_unbalanced() prints diagnostics.
        """
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.__starttag_text = None
        self.rawdata = ''           # input not yet consumed by goahead()
        self.stack = []             # open tags that have a start_* handler
        self.lasttag = '???'        # tag substituted for the "<>" shorthand
        self.nomoretags = 0
        self.literal = 0
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Intended for derived classes only.  Any arguments are ignored.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').  (This just saves the text,
        all the processing is done by goahead().)
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def error(self, message):
        """Raise SGMLParseError carrying *message*."""
        raise SGMLParseError(message)

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # Everything that remains is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            # Emit the plain text up to the next '&' or '<'.
            match = interesting.search(rawdata, i)
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # Start tags are not markup in literal mode.
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    # Any end tag leaves literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    # NOTE(review): parse_comment is not defined in this
                    # class; presumably inherited from
                    # markupbase.ParserBase.
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    # parse_pi() returns a length, not an end index.
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    # NOTE(review): parse_declaration is not defined in
                    # this class; presumably inherited from
                    # markupbase.ParserBase.
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern swallows one terminator character;
                    # give it back unless it is the ';' of the reference.
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            # At EOF whatever is left over is delivered as data.
            self.handle_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Extensions for the DOCTYPE scanner:
    _decl_otherchars = '='

    # Internal -- parse processing instr, return length or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    def get_starttag_text(self):
        """Return the text recorded for the most recent start tag, or None."""
        return self.__starttag_text

    # Internal -- replace entity and character references inside an
    # attribute value and return the resulting string.
    def _unescape_attrvalue(self, attrvalue):
        pieces = []
        pos = 0
        size = len(attrvalue)
        while pos < size:
            ent_match = entityref.match(attrvalue, pos)
            if (ent_match and ent_match.group(1) in self.entitydefs and
                attrvalue[ent_match.end(1)] == ';'):
                # only substitute entityrefs ending in ';' since
                # otherwise we may break <a href='?p=x&q=y'>
                # which is very common
                pieces.append(self.entitydefs[ent_match.group(1)])
                pos = ent_match.end(0)
                continue
            ch_match = charref.match(attrvalue, pos)
            if ch_match:
                try:
                    char = chr(int(ch_match.group(1)))
                except (ValueError, OverflowError):
                    # invalid character reference, don't substitute
                    pass
                else:
                    pieces.append(char)
                    pos = ch_match.end(0)
                    # charref matches one terminator character; keep it
                    # in the value unless it is the closing ';'.
                    if attrvalue[pos-1] != ';':
                        pos = pos-1
                    continue
            # all other cases
            pieces.append(attrvalue[pos])
            pos = pos+1
        return ''.join(pieces)

    # Internal -- handle starttag; return the index just past the tag,
    # or -1 if the tag is not terminated yet
    def parse_starttag(self, i):
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # A bare attribute name is its own value.
                attrvalue = attrname
            else:
                if (attrvalue[:1] == "'" == attrvalue[-1:] or
                    attrvalue[:1] == '"' == attrvalue[-1:]):
                    # strip quotes
                    attrvalue = attrvalue[1:-1]
                attrvalue = self._unescape_attrvalue(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    # Internal -- parse endtag; return the index just past the tag,
    # or -1 if the tag is not terminated yet
    def parse_endtag(self, i):
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
    def finish_shorttag(self, tag, data):
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    # Internal -- finish processing of start tag
    # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
    def finish_starttag(self, tag, attrs):
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            # Only tags with a start_* handler are tracked on the stack.
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    # Internal -- finish processing of end tag
    def finish_endtag(self, tag):
        if not tag:
            # Empty end tag "</>": close the innermost open element.
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                # Not open: unknown if there is no end_* handler at all,
                # otherwise a known tag closed out of order.
                try:
                    getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # Locate the innermost (last) occurrence of the tag.
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag:
                    found = i
        # Close every element from the top of the stack down to 'found'.
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        method()

    # Example -- report an unbalanced </...> tag.
    def report_unbalanced(self, tag):
        if self.verbose:
            print('*** Unbalanced </' + tag + '>')
            print(' '.join(['*** Stack:', str(self.stack)]))

    def handle_charref(self, name):
        """Handle character reference, no need to override.

        References that are not integers in the range 0..255 are passed
        to unknown_charref(); the others are delivered as data.
        """
        try:
            n = int(name)
        except ValueError:
            self.unknown_charref(name)
            return
        if not 0 <= n <= 255:
            self.unknown_charref(name)
            return
        self.handle_data(chr(n))

    # Definition of entities -- derived classes may override
    entitydefs = \
            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def handle_entityref(self, name):
        """Handle entity references.

        There should be no need to override this method; it can be
        tailored by setting up the self.entitydefs mapping appropriately.
        """
        table = self.entitydefs
        if name in table:
            self.handle_data(table[name])
        else:
            self.unknown_entityref(name)

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle declaration, could be overridden
    def handle_decl(self, decl):
        pass

    # Example -- handle processing instruction, could be overridden
    def handle_pi(self, data):
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass
class TestSGMLParser(SGMLParser):
    """Parser that prints a trace of the data, comments and unknown
    constructs it is given; used by the test() driver."""

    def __init__(self, verbose=0):
        # The buffer must exist before the base initializer runs.
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        # Accumulate text and print it once its repr() gets long.
        self.testdata += data
        if len(repr(self.testdata)) < 70:
            return
        self.flush()

    def flush(self):
        # Print and clear whatever text has been buffered so far.
        pending = self.testdata
        if not pending:
            return
        self.testdata = ""
        print(' '.join(['data:', repr(pending)]))

    def handle_comment(self, data):
        self.flush()
        shown = repr(data)
        if len(shown) > 68:
            # Keep only the head and tail of long comments.
            shown = shown[:32] + '...' + shown[-32:]
        print(' '.join(['comment:', shown]))

    def unknown_starttag(self, tag, attrs):
        self.flush()
        opening = 'start tag: <' + tag
        if not attrs:
            print(opening + '>')
            return
        # One line: tag, each name="value" pair and '>' separated by spaces.
        fields = [opening]
        for name, value in attrs:
            fields.append(name + '=' + '"' + value + '"')
        fields.append('>')
        print(' '.join(fields))

    def unknown_endtag(self, tag):
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        # Finish parsing, then print any text still buffered.
        SGMLParser.close(self)
        self.flush()
def test(args = None):
    """Parse a file and, by default, print a trace of what was found.

    args -- list of command-line style arguments; defaults to
            sys.argv[1:].  A leading '-s' selects the silent SGMLParser
            instead of TestSGMLParser.  The next argument names the
            input file ('-' means standard input); without one,
            'test.html' is read.

    Prints a message and exits with status 1 if the file cannot be
    opened.
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(filename, 'r')
        except IOError:
            # sys.exc_info() avoids version-specific "except ... as" syntax.
            msg = sys.exc_info()[1]
            print(' '.join([filename, ":", str(msg)]))
            sys.exit(1)
        # Close the file even if reading it fails.
        try:
            data = f.read()
        finally:
            f.close()

    x = klass()
    # Feed one character at a time to exercise incremental parsing.
    for c in data:
        x.feed(c)
    x.close()
# Run the test driver when this module is executed as a script.
if __name__ == '__main__':
    test()