# only order comparisons are removed in py3k #6119
# [python.git] / Lib / sgmllib.py
# blob 104b25f2a07b0e716c40328d1878f56e774d8169
"""A parser for SGML, using the derived class as a static DTD."""
# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).  RCDATA is
# not supported at all.
# Emit the "removed in Python 3.0" warning at import time.  stacklevel=2
# attributes the warning to the importing code rather than to this
# module; the helper is deleted afterwards so it does not linger in the
# module namespace.
from warnings import warnpy3k
warnpy3k("the sgmllib module has been removed in Python 3.0",
         stacklevel=2)
del warnpy3k
17 import markupbase
18 import re
# Names exported by ``from sgmllib import *``.
__all__ = ["SGMLParser", "SGMLParseError"]
# Regular expressions used for parsing

# Next character that needs special treatment: '&' or '<'.
interesting = re.compile('[&<]')
# A possibly unfinished reference, tag, end tag or declaration; used to
# decide whether to wait for more input.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        '<([a-zA-Z][^<>]*|'
                        '/([a-zA-Z][^<>]*)?|'
                        '![^<>]*)?')

# '&name' / '&#digits' followed by one terminating character; group 1
# is the name or the digits.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' followed by a letter, or the '<>' shorthand.
starttagopen = re.compile('<[>a-zA-Z]')
# SGML short tag '<tag/data/': opening part only, then the whole thing
# (group 1 is the tag name, group 2 the data).
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# End of a processing instruction.
piclose = re.compile('>')
# Either bracket; used to find where a start or end tag stops.
endbracket = re.compile('[<>]')
# Tag name.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: group 1 is the name, group 2 the optional '=value'
# part, group 3 the value (still including any surrounding quotes).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
class SGMLParseError(RuntimeError):
    """Exception raised for all parse errors."""
# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The DTD is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.)  The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks).  Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
class SGMLParser(markupbase.ParserBase):
    """Base class for SGML parsers: find tags and call handler methods.

    Derive a class that defines start_foo()/end_foo() to handle <foo>
    and </foo>, or do_foo() to handle <foo> by itself (tag names are
    lower-cased before the lookup).  Text is reported through
    handle_data(), possibly split into arbitrary chunks.
    """

    # Matches '&name' or '&#digits' with an optional trailing ';' inside
    # attribute values -- derived classes may override
    entity_or_charref = re.compile('&(?:'
                                   '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
                                   ')(;?)')

    def __init__(self, verbose=0):
        """Initialize and reset this instance."""
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance. Loses all unprocessed data."""
        self.__starttag_text = None
        self.rawdata = ''
        self.stack = []
        self.lasttag = '???'
        self.nomoretags = 0
        self.literal = 0
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Extra positional arguments are accepted and ignored.
        Intended for derived classes only.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\\n').  (This just saves the text,
        all the processing is done by goahead().)
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def error(self, message):
        """Raise SGMLParseError carrying *message*."""
        raise SGMLParseError(message)

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # Everything that is left is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            match = interesting.search(rawdata, i)
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    # parse_starttag() returns the index just past the
                    # tag, or -1 when the tag is not complete yet.
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    # Any end tag leaves literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    # (parse_comment is not defined in this class;
                    # presumably inherited from markupbase.ParserBase.)
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    # Unlike the other parse_* helpers used here,
                    # parse_pi() returns a length, not an index.
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern consumes one terminating character;
                    # give it back unless it was the ';'.
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break  # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        # Keep whatever could not be handled yet for the next call.
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Extensions for the DOCTYPE scanner:
    _decl_otherchars = '='

    # Internal -- parse processing instr, return length or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    def get_starttag_text(self):
        """Return the start-tag text recorded by parse_starttag(), or None."""
        return self.__starttag_text

    # Internal -- handle starttag, return the index just past the tag
    # or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        # As a shortcut way to exit, this isn't so bad, but shouldn't
        # be used to locate the actual end of the start tag since the
        # < or > characters may be embedded in an attribute value.
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # Attribute given without a value: use its own name.
                attrvalue = attrname
            else:
                if (attrvalue[:1] == "'" == attrvalue[-1:] or
                        attrvalue[:1] == '"' == attrvalue[-1:]):
                    # strip quotes
                    attrvalue = attrvalue[1:-1]
                attrvalue = self.entity_or_charref.sub(
                    self._convert_ref, attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    # Internal -- convert entity or character reference
    def _convert_ref(self, match):
        """Return the replacement text for one entity_or_charref match.

        References that cannot be converted are returned unchanged.
        "Cannot be converted" means the converter returned None, the
        same test handle_charref() and handle_entityref() use, so an
        entity defined as the empty string is expanded to ''.
        """
        name, number, semicolon = match.groups()
        if number:
            replacement = self.convert_charref(number)
            if replacement is None:
                return '&#%s%s' % (number, semicolon)
            return replacement
        if semicolon:
            replacement = self.convert_entityref(name)
            if replacement is None:
                return '&%s;' % name
            return replacement
        # Entity reference without the closing ';': left as is.
        return '&%s' % name

    # Internal -- parse endtag, return the index just past the tag
    # or -1 if not terminated
    def parse_endtag(self, i):
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
    def finish_shorttag(self, tag, data):
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    # Internal -- finish processing of start tag
    # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
    def finish_starttag(self, tag, attrs):
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            # Only tags with a start_ handler are tracked on the stack.
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    # Internal -- finish processing of end tag
    def finish_endtag(self, tag):
        if not tag:
            # Empty end tag: close the innermost open element.
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                # Only the existence of an end_ handler matters here.
                try:
                    getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # Locate the last (innermost) occurrence of the tag.
            found = len(self.stack)
            for index, open_tag in enumerate(self.stack):
                if open_tag == tag:
                    found = index
        # Close everything from the top of the stack down to 'found'.
        while len(self.stack) > found:
            tag = self.stack[-1]
            method = getattr(self, 'end_' + tag, None)
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        method()

    # Example -- report an unbalanced </...> tag.
    def report_unbalanced(self, tag):
        if self.verbose:
            # Single-argument print(...) emits the same text as the
            # print statement it replaces.
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack: %s' % (self.stack,))

    def convert_charref(self, name):
        """Convert character reference, may be overridden.

        Returns None when *name* is not an integer in the range 0..127.
        """
        try:
            n = int(name)
        except ValueError:
            return None
        if not 0 <= n <= 127:
            return None
        return self.convert_codepoint(n)

    def convert_codepoint(self, codepoint):
        """Return the character for *codepoint*."""
        return chr(codepoint)

    def handle_charref(self, name):
        """Handle character reference, no need to override."""
        replacement = self.convert_charref(name)
        if replacement is None:
            self.unknown_charref(name)
        else:
            self.handle_data(replacement)

    # Definition of entities -- derived classes may override
    entitydefs = \
        {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def convert_entityref(self, name):
        """Convert entity references.

        As an alternative to overriding this method; one can tailor the
        results by setting up the self.entitydefs mapping appropriately.
        Returns None when *name* is not in the mapping.
        """
        table = self.entitydefs
        if name in table:
            return table[name]
        return None

    def handle_entityref(self, name):
        """Handle entity references, no need to override."""
        replacement = self.convert_entityref(name)
        if replacement is None:
            self.unknown_entityref(name)
        else:
            self.handle_data(replacement)

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle declaration, could be overridden
    def handle_decl(self, decl):
        pass

    # Example -- handle processing instruction, could be overridden
    def handle_pi(self, data):
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass
class TestSGMLParser(SGMLParser):
    """Parser that prints what it sees; used by test() below.

    Character data is buffered in self.testdata and printed by flush();
    the output calls use the single-argument print(...) form, which
    produces the same text as the equivalent print statements.
    """

    def __init__(self, verbose=0):
        # testdata must exist before SGMLParser.__init__ runs.
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer *data*, flushing once its repr reaches 70 characters."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear any buffered character data."""
        data = self.testdata
        if data:
            self.testdata = ""
            print('data: ' + repr(data))

    def handle_comment(self, data):
        """Print the comment's repr, eliding the middle of long ones."""
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment: ' + r)

    def unknown_starttag(self, tag, attrs):
        """Print the start tag together with its attributes."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            # One line, pieces separated by single spaces (including
            # one before the closing '>').
            pieces = ['start tag: <' + tag]
            for name, value in attrs:
                pieces.append(name + '=' + '"' + value + '"')
            pieces.append('>')
            print(' '.join(pieces))

    def unknown_endtag(self, tag):
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        """Finish parsing, then print whatever data is still buffered."""
        SGMLParser.close(self)
        self.flush()
def test(args=None):
    """Parse a file one character at a time as a simple self-test.

    *args* defaults to sys.argv[1:].  A leading '-s' selects the plain
    SGMLParser instead of TestSGMLParser.  The next argument is the
    file to read ('test.html' when absent, '-' for standard input).
    If the file cannot be opened, the error is printed and the process
    exits with status 1.
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        f = sys.stdin
    else:
        try:
            f = open(filename, 'r')
        except IOError as msg:
            print('%s : %s' % (filename, msg))
            sys.exit(1)

    try:
        data = f.read()
    finally:
        # Close the file even when read() fails; never close stdin.
        if f is not sys.stdin:
            f.close()

    x = klass()
    # Feed one character per call to exercise incremental parsing.
    for c in data:
        x.feed(c)
    x.close()


if __name__ == '__main__':
    test()