1 """A parser for SGML, using the derived class as a static DTD."""
3 # XXX This only supports those SGML features used by HTML.
5 # XXX There should be a way to distinguish between PCDATA (parsed
6 # character data -- the normal case), RCDATA (replaceable character
7 # data -- only char and entity references and end tags are special)
8 # and CDATA (character data -- only end tags are special). RCDATA is
9 # not supported at all.
15 __all__
= ["SGMLParser", "SGMLParseError"]
17 # Regular expressions used for parsing
# Locates the next character that can begin markup: '&' or '<'.
interesting = re.compile('[&<]')
20 incomplete
= re
.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
# '&name' followed by one character that is not a letter or digit.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# '&#digits' followed by one non-digit character.
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' followed by '>' or a letter.
starttagopen = re.compile('<[>a-zA-Z]')
# Opening part of the '<tag/' shorthand.
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
# Whole '<tag/data/' shorthand: group 1 is the tag, group 2 the data.
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# The '>' that closes a processing instruction.
piclose = re.compile('>')
# Either angle bracket; used to find where a tag ends.
endbracket = re.compile('[<>]')
# A tag name.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: group 1 is the name, group 2 the optional '= value' part,
# group 3 the value itself (quotes, if any, are still included).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
class SGMLParseError(RuntimeError):
    """Raised for every kind of parse error."""
44 # SGML parser base class -- find tags and call handler functions.
45 # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
46 # The dtd is defined by deriving a class which defines methods
47 # with special names to handle tags: start_foo and end_foo to handle
48 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
49 # (Tags are converted to lower case for this purpose.) The data
50 # between tags is passed to the parser by calling self.handle_data()
51 # with some data as argument (the data may be split up in arbitrary
52 # chunks). Entity references are passed by calling
53 # self.handle_entityref() with the entity reference as argument.
55 class SGMLParser(markupbase.ParserBase):
57 def __init__(self, verbose=0):
58 """Initialize and reset this instance."""
59 self.verbose = verbose
63 """Reset this instance. Loses all unprocessed data."""
64 self.__starttag_text = None
70 markupbase.ParserBase.reset(self)
def setnomoretags(self):
    """Switch to literal (CDATA) mode until EOF.

    Meant to be called by derived classes only.
    """
    # Both flags receive the same value, assigned in this order.
    self.nomoretags = 1
    self.literal = 1
79 def setliteral(self, *args):
80 """Enter literal mode (CDATA).
82 Intended for derived classes only.
87 """Feed some data to the parser.
89 Call this as often as you want, with as little or as much text
90 as you want (may include '\n'). (This just saves the text,
91 all the processing is done by goahead().)
94 self.rawdata = self.rawdata + data
98 """Handle the remaining data."""
def error(self, message):
    """Signal a parse failure by raising SGMLParseError(message)."""
    failure = SGMLParseError(message)
    raise failure
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker.
107 def goahead(self, end):
108 rawdata = self.rawdata
113 self.handle_data(rawdata[i:n])
116 match = interesting.search(rawdata, i)
117 if match: j = match.start()
120 self.handle_data(rawdata[i:j])
123 if rawdata[i] == '<':
124 if starttagopen.match(rawdata, i):
126 self.handle_data(rawdata[i])
129 k = self.parse_starttag(i)
133 if rawdata.startswith("</", i):
134 k = self.parse_endtag(i)
141 self.handle_data("<")
147 if rawdata.startswith("<!--", i):
148 # Strictly speaking, a comment is --.*--
149 # within a declaration tag <!...>.
150 # This should be removed,
151 # and comments handled only in parse_declaration.
152 k = self.parse_comment(i)
156 if rawdata.startswith("<?", i):
161 if rawdata.startswith("<!", i):
162 # This is some sort of declaration; in "HTML as
163 # deployed," this should only be the document type
164 # declaration ("<!DOCTYPE html...>").
165 k = self.parse_declaration(i)
169 elif rawdata[i] == '&':
171 self.handle_data(rawdata[i])
174 match = charref.match(rawdata, i)
176 name = match.group(1)
177 self.handle_charref(name)
179 if rawdata[i-1] != ';': i = i-1
181 match = entityref.match(rawdata, i)
183 name = match.group(1)
184 self.handle_entityref(name)
186 if rawdata[i-1] != ';': i = i-1
189 self.error('neither
< nor
& ??
')
190 # We get here only if incomplete matches but
192 match = incomplete.match(rawdata, i)
194 self.handle_data(rawdata[i])
199 break # Really incomplete
200 self.handle_data(rawdata[i:j])
204 self.handle_data(rawdata[i:n])
206 self.rawdata = rawdata[i:]
207 # XXX if end: check for empty stack
209 # Extensions for the DOCTYPE scanner:
210 _decl_otherchars = '='
212 # Internal -- parse processing instr, return length or -1 if not terminated
213 def parse_pi(self, i):
214 rawdata = self.rawdata
215 if rawdata[i:i+2] != '<?
':
216 self.error('unexpected call to
parse_pi()')
217 match = piclose.search(rawdata, i+2)
221 self.handle_pi(rawdata[i+2: j])
def get_starttag_text(self):
    """Return the value held in the private start-tag text attribute."""
    text = self.__starttag_text
    return text
228 # Internal -- handle starttag, return length or -1 if not terminated
229 def parse_starttag(self, i):
230 self.__starttag_text = None
232 rawdata = self.rawdata
233 if shorttagopen.match(rawdata, i):
234 # SGML shorthand: <tag/data/ == <tag>data</tag>
235 # XXX Can data contain &... (entity or char refs)?
236 # XXX Can data contain < or > (tag characters)?
237 # XXX Can there be whitespace before the first /?
238 match = shorttag.match(rawdata, i)
241 tag, data = match.group(1, 2)
242 self.__starttag_text = '<%s/' % tag
245 self.finish_shorttag(tag, data)
246 self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
248 # XXX The following should skip matching quotes (' or ")
249 match = endbracket.search(rawdata, i+1)
253 # Now parse the data between i+1 and j into a tag and attrs
255 if rawdata[i:i+2] == '<>':
256 # SGML shorthand: <> == <last open tag seen>
260 match = tagfind.match(rawdata, i+1)
262 self.error('unexpected call to parse_starttag')
264 tag = rawdata[i+1:k].lower()
267 match = attrfind.match(rawdata, k)
269 attrname, rest, attrvalue = match.group(1, 2, 3)
273 if (attrvalue[:1] == "'" == attrvalue[-1:] or
274 attrvalue[:1] == '"' == attrvalue[-1:]):
276 attrvalue = attrvalue[1:-1]
279 while l < len(attrvalue):
280 av_match = entityref.match(attrvalue, l)
281 if (av_match and av_match.group(1) in self.entitydefs and
282 attrvalue[av_match.end(1)] == ';'):
283 # only substitute entityrefs ending in ';' since
284 # otherwise we may break <a href='?p=x&q=y'>
285 # which is very common
286 new_attrvalue += self.entitydefs[av_match.group(1)]
289 ch_match = charref.match(attrvalue, l)
292 char = chr(int(ch_match.group(1)))
293 new_attrvalue += char
297 # invalid character reference, don't substitute
300 new_attrvalue += attrvalue[l]
302 attrvalue = new_attrvalue
303 attrs.append((attrname.lower(), attrvalue))
305 if rawdata[j] == '>':
307 self.__starttag_text = rawdata[start_pos:j]
308 self.finish_starttag(tag, attrs)
311 # Internal -- parse endtag
312 def parse_endtag(self, i):
313 rawdata = self.rawdata
314 match = endbracket.search(rawdata, i+1)
318 tag = rawdata[i+2:j].strip().lower()
319 if rawdata[j] == '>':
321 self.finish_endtag(tag)
# Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
def finish_shorttag(self, tag, data):
    """Process the shorthand ``<tag/data/`` like ``<tag>data</tag>``.

    The start tag is finished with an empty attribute list, *data* is
    passed to handle_data(), and then the end tag is finished.
    """
    no_attrs = []
    self.finish_starttag(tag, no_attrs)
    self.handle_data(data)
    self.finish_endtag(tag)
330 # Internal -- finish processing of start tag
331 # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
332 def finish_starttag(self, tag, attrs):
334 method = getattr(self, 'start_' + tag)
335 except AttributeError:
337 method = getattr(self, 'do_' + tag)
338 except AttributeError:
339 self.unknown_starttag(tag, attrs)
342 self.handle_starttag(tag, method, attrs)
345 self.stack.append(tag)
346 self.handle_starttag(tag, method, attrs)
349 # Internal -- finish processing of end tag
350 def finish_endtag(self, tag):
352 found = len(self.stack) - 1
354 self.unknown_endtag(tag)
357 if tag not in self.stack:
359 method = getattr(self, 'end_' + tag)
360 except AttributeError:
361 self.unknown_endtag(tag)
363 self.report_unbalanced(tag)
365 found = len(self.stack)
366 for i in range(found):
367 if self.stack[i] == tag: found = i
368 while len(self.stack) > found:
371 method = getattr(self, 'end_' + tag)
372 except AttributeError:
375 self.handle_endtag(tag, method)
377 self.unknown_endtag(tag)
380 # Overridable -- handle start tag
381 def handle_starttag(self, tag, method, attrs):
384 # Overridable -- handle end tag
385 def handle_endtag(self, tag, method):
388 # Example -- report an unbalanced </...> tag.
389 def report_unbalanced(self, tag):
391 print '*** Unbalanced </' + tag + '>'
392 print '*** Stack:', self.stack
394 def handle_charref(self, name):
395 """Handle character reference, no need to override."""
399 self.unknown_charref(name)
401 if not 0 <= n <= 255:
402 self.unknown_charref(name)
404 self.handle_data(chr(n))
406 # Definition of entities -- derived classes may override
408 {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos
': '\''}
410 def handle_entityref(self, name):
411 """Handle entity references.
413 There should be no need to override this method; it can be
414 tailored by setting up the self.entitydefs mapping appropriately.
416 table = self.entitydefs
418 self.handle_data(table[name])
420 self.unknown_entityref(name)
423 # Example -- handle data, should be overridden
424 def handle_data(self, data):
427 # Example -- handle comment, could be overridden
428 def handle_comment(self, data):
431 # Example -- handle declaration, could be overridden
432 def handle_decl(self, decl):
435 # Example -- handle processing instruction, could be overridden
436 def handle_pi(self, data):
# To be overridden -- handlers for unknown objects
def unknown_starttag(self, tag, attrs):
    """Hook for a start tag with no start_/do_ method; no-op by default."""
    return None
def unknown_endtag(self, tag):
    """Hook called by finish_endtag() for unhandled end tags; no-op here."""
    return None
def unknown_charref(self, ref):
    """Hook called by handle_charref() for rejected refs; no-op here."""
    return None
def unknown_entityref(self, ref):
    """Hook called by handle_entityref() for unknown names; no-op here."""
    return None
446 class TestSGMLParser(SGMLParser):
448 def __init__(self, verbose=0):
450 SGMLParser.__init__(self, verbose)
452 def handle_data(self, data):
453 self.testdata = self.testdata + data
454 if len(repr(self.testdata)) >= 70:
461 print 'data
:', repr(data)
463 def handle_comment(self, data):
467 r = r[:32] + '...' + r[-32:]
470 def unknown_starttag(self, tag, attrs):
473 print 'start tag
: <' + tag + '>'
475 print 'start tag
: <' + tag,
476 for name, value in attrs:
477 print name + '=' + '"' + value + '"',
480 def unknown_endtag(self, tag):
482 print 'end tag
: </' + tag + '>'
484 def unknown_entityref(self, ref):
486 print '*** unknown entity ref
: &' + ref + ';'
488 def unknown_charref(self, ref):
490 print '*** unknown char ref
: &#' + ref + ';'
492 def unknown_decl(self
, data
):
494 print '*** unknown decl: [' + data
+ ']'
497 SGMLParser
.close(self
)
501 def test(args
= None):
507 if args
and args
[0] == '-s':
511 klass
= TestSGMLParser
528 if f
is not sys
.stdin
:
537 if __name__
== '__main__':