"""A parser for SGML, using the derived class as a static DTD."""

# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).  RCDATA is
# not supported at all.
# Public names exported by ``from <module> import *``.
__all__ = ["SGMLParser", "SGMLParseError"]
# Regular expressions used for parsing

# Matches the next character that may start markup: '&' or '<'.
interesting = re.compile('[&<]')
20 incomplete
= re
.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
# '&name' followed by one non-alphanumeric character; group 1 is the name.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# '&#digits' followed by one non-digit character; group 1 is the digits.
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' immediately followed by '>' or a letter.
starttagopen = re.compile('<[>a-zA-Z]')
# '<name/' -- the opening of the short-tag form.
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
# '<name/data/' -- group 1 is the name, group 2 the data (no '/' inside).
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# A single '>' character.
piclose = re.compile('>')
# Either angle bracket.
endbracket = re.compile('[<>]')
# A tag name: a letter followed by letters, digits, '-', '_' or '.'.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: group 1 is the name, group 2 the optional '= value' part,
# group 3 the value itself (single-quoted, double-quoted, or an unquoted
# run of the listed characters; quotes are kept in the match).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
class SGMLParseError(RuntimeError):
    """Exception raised for all parse errors."""
# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The DTD is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.)  The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks).  Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
55 class SGMLParser(markupbase.ParserBase):
57 def __init__(self, verbose=0):
58 """Initialize and reset this instance."""
59 self.verbose = verbose
63 """Reset this instance. Loses all unprocessed data."""
64 self.__starttag_text = None
70 markupbase.ParserBase.reset(self)
def setnomoretags(self):
    """Enter literal mode (CDATA) till EOF.

    Intended for derived classes only.
    """
    # Both flags are set together: ``nomoretags`` and ``literal``.
    self.nomoretags = self.literal = 1
79 def setliteral(self, *args):
80 """Enter literal mode (CDATA).
82 Intended for derived classes only.
87 """Feed some data to the parser.
89 Call this as often as you want, with as little or as much text
90 as you want (may include '\n'). (This just saves the text,
91 all the processing is done by goahead().)
94 self.rawdata = self.rawdata + data
98 """Handle the remaining data."""
def error(self, message):
    """Report a parse error by raising SGMLParseError.

    message -- text passed to the exception constructor.

    Always raises; never returns.
    """
    raise SGMLParseError(message)
# Internal -- handle data as far as reasonable.  May leave state
# and data to be processed by a subsequent call.  If 'end' is
# true, force handling all data as if followed by EOF marker.
107 def goahead(self, end):
108 rawdata = self.rawdata
113 self.handle_data(rawdata[i:n])
116 match = interesting.search(rawdata, i)
117 if match: j = match.start()
120 self.handle_data(rawdata[i:j])
123 if rawdata[i] == '<':
124 if starttagopen.match(rawdata, i):
126 self.handle_data(rawdata[i])
129 k = self.parse_starttag(i)
133 if rawdata.startswith("</", i):
134 k = self.parse_endtag(i)
141 self.handle_data("<")
147 if rawdata.startswith("<!--", i):
148 # Strictly speaking, a comment is --.*--
149 # within a declaration tag <!...>.
150 # This should be removed,
151 # and comments handled only in parse_declaration.
152 k = self.parse_comment(i)
156 if rawdata.startswith("<?", i):
161 if rawdata.startswith("<!", i):
162 # This is some sort of declaration; in "HTML as
163 # deployed," this should only be the document type
164 # declaration ("<!DOCTYPE html...>").
165 k = self.parse_declaration(i)
169 elif rawdata[i] == '&':
171 self.handle_data(rawdata[i])
174 match = charref.match(rawdata, i)
176 name = match.group(1)
177 self.handle_charref(name)
179 if rawdata[i-1] != ';': i = i-1
181 match = entityref.match(rawdata, i)
183 name = match.group(1)
184 self.handle_entityref(name)
186 if rawdata[i-1] != ';': i = i-1
189 self.error('neither
< nor
& ??
')
190 # We get here only if incomplete matches but
192 match = incomplete.match(rawdata, i)
194 self.handle_data(rawdata[i])
199 break # Really incomplete
200 self.handle_data(rawdata[i:j])
204 self.handle_data(rawdata[i:n])
206 self.rawdata = rawdata[i:]
207 # XXX if end: check for empty stack
209 # Extensions for the DOCTYPE scanner:
210 _decl_otherchars = '='
212 # Internal -- parse processing instr, return length or -1 if not terminated
213 def parse_pi(self, i):
214 rawdata = self.rawdata
215 if rawdata[i:i+2] != '<?
':
216 self.error('unexpected call to
parse_pi()')
217 match = piclose.search(rawdata, i+2)
221 self.handle_pi(rawdata[i+2: j])
def get_starttag_text(self):
    """Return the stored start-tag text.

    The value is whatever was last assigned to the private
    ``__starttag_text`` attribute; it may be None.
    """
    return self.__starttag_text
228 # Internal -- handle starttag, return length or -1 if not terminated
229 def parse_starttag(self, i):
230 self.__starttag_text = None
232 rawdata = self.rawdata
233 if shorttagopen.match(rawdata, i):
234 # SGML shorthand: <tag/data/ == <tag>data</tag>
235 # XXX Can data contain &... (entity or char refs)?
236 # XXX Can data contain < or > (tag characters)?
237 # XXX Can there be whitespace before the first /?
238 match = shorttag.match(rawdata, i)
241 tag, data = match.group(1, 2)
242 self.__starttag_text = '<%s/' % tag
245 self.finish_shorttag(tag, data)
246 self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
248 # XXX The following should skip matching quotes (' or ")
249 match = endbracket.search(rawdata, i+1)
253 # Now parse the data between i+1 and j into a tag and attrs
255 if rawdata[i:i+2] == '<>':
256 # SGML shorthand: <> == <last open tag seen>
260 match = tagfind.match(rawdata, i+1)
262 self.error('unexpected call to parse_starttag')
264 tag = rawdata[i+1:k].lower()
267 match = attrfind.match(rawdata, k)
269 attrname, rest, attrvalue = match.group(1, 2, 3)
272 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
273 attrvalue[:1] == '"' == attrvalue[-1:]:
274 attrvalue = attrvalue[1:-1]
275 attrs.append((attrname.lower(), attrvalue))
277 if rawdata[j] == '>':
279 self.__starttag_text = rawdata[start_pos:j]
280 self.finish_starttag(tag, attrs)
283 # Internal -- parse endtag
284 def parse_endtag(self, i):
285 rawdata = self.rawdata
286 match = endbracket.search(rawdata, i+1)
290 tag = rawdata[i+2:j].strip().lower()
291 if rawdata[j] == '>':
293 self.finish_endtag(tag)
# Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
def finish_shorttag(self, tag, data):
    """Process the short-tag form as start tag, data, end tag.

    tag  -- the tag name.
    data -- the text found between the two slashes.

    The start tag is reported with an empty attribute list.
    """
    self.finish_starttag(tag, [])
    self.handle_data(data)
    self.finish_endtag(tag)
302 # Internal -- finish processing of start tag
303 # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
304 def finish_starttag(self, tag, attrs):
306 method = getattr(self, 'start_
' + tag)
307 except AttributeError:
309 method = getattr(self, 'do_
' + tag)
310 except AttributeError:
311 self.unknown_starttag(tag, attrs)
314 self.handle_starttag(tag, method, attrs)
317 self.stack.append(tag)
318 self.handle_starttag(tag, method, attrs)
321 # Internal -- finish processing of end tag
322 def finish_endtag(self, tag):
324 found = len(self.stack) - 1
326 self.unknown_endtag(tag)
329 if tag not in self.stack:
331 method = getattr(self, 'end_
' + tag)
332 except AttributeError:
333 self.unknown_endtag(tag)
335 self.report_unbalanced(tag)
337 found = len(self.stack)
338 for i in range(found):
339 if self.stack[i] == tag: found = i
340 while len(self.stack) > found:
343 method = getattr(self, 'end_
' + tag)
344 except AttributeError:
347 self.handle_endtag(tag, method)
349 self.unknown_endtag(tag)
352 # Overridable -- handle start tag
353 def handle_starttag(self, tag, method, attrs):
356 # Overridable -- handle end tag
357 def handle_endtag(self, tag, method):
360 # Example -- report an unbalanced </...> tag.
361 def report_unbalanced(self, tag):
363 print '*** Unbalanced
</' + tag + '>'
364 print '*** Stack
:', self.stack
366 def handle_charref(self, name):
367 """Handle character reference, no need to override."""
371 self.unknown_charref(name)
373 if not 0 <= n <= 255:
374 self.unknown_charref(name)
376 self.handle_data(chr(n))
378 # Definition of entities -- derived classes may override
380 {'lt
': '<', 'gt
': '>', 'amp
': '&', 'quot
': '"', 'apos': '\''}
382 def handle_entityref(self, name):
383 """Handle entity references.
385 There should be no need to override this method; it can be
386 tailored by setting up the self.entitydefs mapping appropriately.
388 table = self.entitydefs
390 self.handle_data(table[name])
392 self.unknown_entityref(name)
395 # Example -- handle data, should be overridden
396 def handle_data(self, data):
399 # Example -- handle comment, could be overridden
400 def handle_comment(self, data):
403 # Example -- handle declaration, could be overridden
404 def handle_decl(self, decl):
407 # Example -- handle processing instruction, could be overridden
408 def handle_pi(self, data):
# To be overridden -- handlers for unknown objects
def unknown_starttag(self, tag, attrs):
    """Hook for a start tag with no start_/do_ method; does nothing."""
    pass

def unknown_endtag(self, tag):
    """Hook for an end tag that has no handler; does nothing."""
    pass

def unknown_charref(self, ref):
    """Hook for a character reference that was not handled; does nothing."""
    pass

def unknown_entityref(self, ref):
    """Hook for an entity reference that was not handled; does nothing."""
    pass
418 class TestSGMLParser(SGMLParser):
420 def __init__(self, verbose=0):
422 SGMLParser.__init__(self, verbose)
424 def handle_data(self, data):
425 self.testdata = self.testdata + data
426 if len(repr(self.testdata)) >= 70:
433 print 'data:', repr(data)
435 def handle_comment(self, data):
439 r = r[:32] + '...' + r[-32:]
442 def unknown_starttag(self, tag, attrs):
445 print 'start tag: <' + tag + '>'
447 print 'start tag: <' + tag,
448 for name, value in attrs:
449 print name + '=' + '"' + value + '"',
452 def unknown_endtag(self, tag):
454 print 'end tag: </' + tag + '>'
456 def unknown_entityref(self, ref):
458 print '*** unknown entity ref: &' + ref + ';'
460 def unknown_charref(self, ref):
462 print '*** unknown char ref: &#' + ref + ';'
464 def unknown_decl(self, data):
466 print '*** unknown decl: [' + data + ']'
469 SGMLParser.close(self)
473 def test(args = None):
479 if args and args[0] == '-s':
483 klass = TestSGMLParser
500 if f is not sys.stdin:
509 if __name__ == '__main__':