1 """A parser for SGML, using the derived class as a static DTD."""
3 # XXX This only supports those SGML features used by HTML.
5 # XXX There should be a way to distinguish between PCDATA (parsed
6 # character data -- the normal case), RCDATA (replaceable character
7 # data -- only char and entity references and end tags are special)
8 # and CDATA (character data -- only end tags are special). RCDATA is
9 # not supported at all.
15 __all__
= ["SGMLParser", "SGMLParseError"]
17 # Regular expressions used for parsing
# Matches the next character that may start markup: '&' (a reference)
# or '<' (a tag, declaration, comment or processing instruction).
interesting = re.compile('[&<]')
20 incomplete
= re
.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
# '&name' followed by one non-alphanumeric character; group 1 is the name.
entityref = re.compile(
    '&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')

# '&#digits' followed by one non-digit character; group 1 is the digits.
charref = re.compile(
    '&#([0-9]+)[^0-9]')

# '<' immediately followed by '>' or a letter.
starttagopen = re.compile(
    '<[>a-zA-Z]')

# Opening of the SGML short form: '<name/'.
shorttagopen = re.compile(
    '<[a-zA-Z][-.a-zA-Z0-9]*/')

# Complete SGML short form '<name/data/'; group 1 is the name and
# group 2 the data (which cannot contain '/').
shorttag = re.compile(
    '<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')

# The '>' searched for by parse_pi() after the leading '<?'.
piclose = re.compile(
    '>')

# Either angle bracket.
endbracket = re.compile(
    '[<>]')

# A tag name: a letter followed by letters, digits, '-', '_' or '.'.
tagfind = re.compile(
    '[a-zA-Z][-_.a-zA-Z0-9]*')

# One attribute, with optional leading whitespace.
#   group 1: attribute name
#   group 2: the whole '= value' part, or None when there is no value
#   group 3: the value -- single-quoted, double-quoted (quotes included)
#            or an unquoted run of the permitted characters
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
class SGMLParseError(RuntimeError):
    """Error type used by the parser to report every kind of parse failure."""

    pass
44 # SGML parser base class -- find tags and call handler functions.
45 # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
46 # The dtd is defined by deriving a class which defines methods
47 # with special names to handle tags: start_foo and end_foo to handle
48 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
49 # (Tags are converted to lower case for this purpose.) The data
50 # between tags is passed to the parser by calling self.handle_data()
51 # with some data as argument (the data may be split up in arbitrary
52 # chunks). Entity references are passed by calling
53 # self.handle_entityref() with the entity reference as argument.
55 class SGMLParser(markupbase.ParserBase):
56 # Definition of entities -- derived classes may override
57 entity_or_charref = re.compile('&(?
:'
58 '([a
-zA
-Z
][-.a
-zA
-Z0
-9]*)|
#([0-9]+)'
61 def __init__(self
, verbose
=0):
62 """Initialize and reset this instance."""
63 self
.verbose
= verbose
67 """Reset this instance. Loses all unprocessed data."""
68 self
.__starttag
_text
= None
74 markupbase
.ParserBase
.reset(self
)
def setnomoretags(self):
    """Switch to literal (CDATA) mode for the rest of the input, up to EOF.

    Meant to be called by derived classes only.
    """
    # Same order as the chained assignment: nomoretags first, then literal.
    self.nomoretags = 1
    self.literal = 1
83 def setliteral(self
, *args
):
84 """Enter literal mode (CDATA).
86 Intended for derived classes only.
91 """Feed some data to the parser.
93 Call this as often as you want, with as little or as much text
94 as you want (may include '\n'). (This just saves the text,
95 all the processing is done by goahead().)
98 self
.rawdata
= self
.rawdata
+ data
102 """Handle the remaining data."""
def error(self, message):
    """Signal a parse error by raising SGMLParseError carrying *message*."""
    failure = SGMLParseError(message)
    raise failure
108 # Internal -- handle data as far as reasonable. May leave state
109 # and data to be processed by a subsequent call. If 'end' is
110 # true, force handling all data as if followed by EOF marker.
111 def goahead(self
, end
):
112 rawdata
= self
.rawdata
117 self
.handle_data(rawdata
[i
:n
])
120 match
= interesting
.search(rawdata
, i
)
121 if match
: j
= match
.start()
124 self
.handle_data(rawdata
[i
:j
])
127 if rawdata
[i
] == '<':
128 if starttagopen
.match(rawdata
, i
):
130 self
.handle_data(rawdata
[i
])
133 k
= self
.parse_starttag(i
)
137 if rawdata
.startswith("</", i
):
138 k
= self
.parse_endtag(i
)
145 self
.handle_data("<")
151 if rawdata
.startswith("<!--", i
):
152 # Strictly speaking, a comment is --.*--
153 # within a declaration tag <!...>.
154 # This should be removed,
155 # and comments handled only in parse_declaration.
156 k
= self
.parse_comment(i
)
160 if rawdata
.startswith("<?", i
):
165 if rawdata
.startswith("<!", i
):
166 # This is some sort of declaration; in "HTML as
167 # deployed," this should only be the document type
168 # declaration ("<!DOCTYPE html...>").
169 k
= self
.parse_declaration(i
)
173 elif rawdata
[i
] == '&':
175 self
.handle_data(rawdata
[i
])
178 match
= charref
.match(rawdata
, i
)
180 name
= match
.group(1)
181 self
.handle_charref(name
)
183 if rawdata
[i
-1] != ';': i
= i
-1
185 match
= entityref
.match(rawdata
, i
)
187 name
= match
.group(1)
188 self
.handle_entityref(name
)
190 if rawdata
[i
-1] != ';': i
= i
-1
193 self
.error('neither < nor & ??')
194 # We get here only if incomplete matches but
196 match
= incomplete
.match(rawdata
, i
)
198 self
.handle_data(rawdata
[i
])
203 break # Really incomplete
204 self
.handle_data(rawdata
[i
:j
])
208 self
.handle_data(rawdata
[i
:n
])
210 self
.rawdata
= rawdata
[i
:]
211 # XXX if end: check for empty stack
213 # Extensions for the DOCTYPE scanner:
214 _decl_otherchars
= '='
216 # Internal -- parse processing instr, return length or -1 if not terminated
217 def parse_pi(self
, i
):
218 rawdata
= self
.rawdata
219 if rawdata
[i
:i
+2] != '<?':
220 self
.error('unexpected call to parse_pi()')
221 match
= piclose
.search(rawdata
, i
+2)
225 self
.handle_pi(rawdata
[i
+2: j
])
def get_starttag_text(self):
    """Return the start-tag text last stored by parse_starttag().

    The stored value is None when no start-tag text has been recorded.
    """
    starttag_text = self.__starttag_text
    return starttag_text
232 # Internal -- handle starttag, return length or -1 if not terminated
233 def parse_starttag(self
, i
):
234 self
.__starttag
_text
= None
236 rawdata
= self
.rawdata
237 if shorttagopen
.match(rawdata
, i
):
238 # SGML shorthand: <tag/data/ == <tag>data</tag>
239 # XXX Can data contain &... (entity or char refs)?
240 # XXX Can data contain < or > (tag characters)?
241 # XXX Can there be whitespace before the first /?
242 match
= shorttag
.match(rawdata
, i
)
245 tag
, data
= match
.group(1, 2)
246 self
.__starttag
_text
= '<%s/' % tag
249 self
.finish_shorttag(tag
, data
)
250 self
.__starttag
_text
= rawdata
[start_pos
:match
.end(1) + 1]
252 # XXX The following should skip matching quotes (' or ")
253 # As a shortcut way to exit, this isn't so bad, but shouldn't
254 # be used to locate the actual end of the start tag since the
255 # < or > characters may be embedded in an attribute value.
256 match
= endbracket
.search(rawdata
, i
+1)
260 # Now parse the data between i+1 and j into a tag and attrs
262 if rawdata
[i
:i
+2] == '<>':
263 # SGML shorthand: <> == <last open tag seen>
267 match
= tagfind
.match(rawdata
, i
+1)
269 self
.error('unexpected call to parse_starttag')
271 tag
= rawdata
[i
+1:k
].lower()
274 match
= attrfind
.match(rawdata
, k
)
276 attrname
, rest
, attrvalue
= match
.group(1, 2, 3)
280 if (attrvalue
[:1] == "'" == attrvalue
[-1:] or
281 attrvalue
[:1] == '"' == attrvalue
[-1:]):
283 attrvalue
= attrvalue
[1:-1]
284 attrvalue
= self
.entity_or_charref
.sub(
285 self
._convert
_ref
, attrvalue
)
286 attrs
.append((attrname
.lower(), attrvalue
))
288 if rawdata
[j
] == '>':
290 self
.__starttag
_text
= rawdata
[start_pos
:j
]
291 self
.finish_starttag(tag
, attrs
)
294 # Internal -- convert entity or character reference
295 def _convert_ref(self
, match
):
297 return self
.convert_charref(match
.group(2)) or \
298 '&#%s%s' % match
.groups()[1:]
300 return self
.convert_entityref(match
.group(1)) or \
301 '&%s;' % match
.group(1)
303 return '&%s' % match
.group(1)
305 # Internal -- parse endtag
306 def parse_endtag(self
, i
):
307 rawdata
= self
.rawdata
308 match
= endbracket
.search(rawdata
, i
+1)
312 tag
= rawdata
[i
+2:j
].strip().lower()
313 if rawdata
[j
] == '>':
315 self
.finish_endtag(tag
)
# Internal -- complete the SGML short form <tag/data/, which is handled
# exactly as if <tag>data</tag> had been written
def finish_shorttag(self, tag, data):
    # The short form cannot carry attributes, so the start tag gets none.
    no_attrs = []
    self.finish_starttag(tag, no_attrs)
    self.handle_data(data)
    self.finish_endtag(tag)
324 # Internal -- finish processing of start tag
325 # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
326 def finish_starttag(self
, tag
, attrs
):
328 method
= getattr(self
, 'start_' + tag
)
329 except AttributeError:
331 method
= getattr(self
, 'do_' + tag
)
332 except AttributeError:
333 self
.unknown_starttag(tag
, attrs
)
336 self
.handle_starttag(tag
, method
, attrs
)
339 self
.stack
.append(tag
)
340 self
.handle_starttag(tag
, method
, attrs
)
343 # Internal -- finish processing of end tag
344 def finish_endtag(self
, tag
):
346 found
= len(self
.stack
) - 1
348 self
.unknown_endtag(tag
)
351 if tag
not in self
.stack
:
353 method
= getattr(self
, 'end_' + tag
)
354 except AttributeError:
355 self
.unknown_endtag(tag
)
357 self
.report_unbalanced(tag
)
359 found
= len(self
.stack
)
360 for i
in range(found
):
361 if self
.stack
[i
] == tag
: found
= i
362 while len(self
.stack
) > found
:
365 method
= getattr(self
, 'end_' + tag
)
366 except AttributeError:
369 self
.handle_endtag(tag
, method
)
371 self
.unknown_endtag(tag
)
374 # Overridable -- handle start tag
375 def handle_starttag(self
, tag
, method
, attrs
):
378 # Overridable -- handle end tag
379 def handle_endtag(self
, tag
, method
):
382 # Example -- report an unbalanced </...> tag.
383 def report_unbalanced(self
, tag
):
385 print '*** Unbalanced </' + tag
+ '>'
386 print '*** Stack:', self
.stack
388 def convert_charref(self
, name
):
389 """Convert character reference, may be overridden."""
394 if not 0 <= n
<= 255:
396 return self
.convert_codepoint(n
)
def convert_codepoint(self, codepoint):
    """Return the one-character string chr() gives for *codepoint*."""
    character = chr(codepoint)
    return character
def handle_charref(self, name):
    """Handle character reference, no need to override.

    The reference is converted with convert_charref(); a successful
    conversion is delivered through handle_data(), while a result of
    None is reported through unknown_charref().
    """
    text = self.convert_charref(name)
    if text is not None:
        self.handle_data(text)
        return
    self.unknown_charref(name)
409 # Definition of entities -- derived classes may override
411 {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
413 def convert_entityref(self
, name
):
414 """Convert entity references.
416 As an alternative to overriding this method; one can tailor the
417 results by setting up the self.entitydefs mapping appropriately.
419 table
= self
.entitydefs
def handle_entityref(self, name):
    """Handle entity references, no need to override.

    *name* is the entity name without the surrounding '&' and ';'.
    It is converted with convert_entityref(); a successful conversion
    is delivered through handle_data(), while a result of None is
    reported through unknown_entityref().
    """
    replacement = self.convert_entityref(name)
    if replacement is None:
        self.unknown_entityref(name)
    else:
        # Reuse the value already computed instead of converting a second
        # time: an overridden convert_entityref() may be expensive or may
        # not return the same thing twice.  This also mirrors
        # handle_charref().
        self.handle_data(replacement)
433 # Example -- handle data, should be overridden
434 def handle_data(self
, data
):
437 # Example -- handle comment, could be overridden
438 def handle_comment(self
, data
):
441 # Example -- handle declaration, could be overridden
442 def handle_decl(self
, decl
):
445 # Example -- handle processing instruction, could be overridden
446 def handle_pi(self
, data
):
# To be overridden -- hooks for objects the parser has no handler for.
# Each default implementation does nothing and returns None.
def unknown_starttag(self, tag, attrs):
    """Hook for a start tag with no start_<tag> or do_<tag> method."""
    return None

def unknown_endtag(self, tag):
    """Hook for an end tag the parser cannot handle."""
    return None

def unknown_charref(self, ref):
    """Hook for a character reference that could not be converted."""
    return None

def unknown_entityref(self, ref):
    """Hook for an entity reference that could not be converted."""
    return None
456 class TestSGMLParser(SGMLParser
):
458 def __init__(self
, verbose
=0):
460 SGMLParser
.__init
__(self
, verbose
)
462 def handle_data(self
, data
):
463 self
.testdata
= self
.testdata
+ data
464 if len(repr(self
.testdata
)) >= 70:
471 print 'data:', repr(data
)
473 def handle_comment(self
, data
):
477 r
= r
[:32] + '...' + r
[-32:]
480 def unknown_starttag(self
, tag
, attrs
):
483 print 'start tag: <' + tag
+ '>'
485 print 'start tag: <' + tag
,
486 for name
, value
in attrs
:
487 print name
+ '=' + '"' + value
+ '"',
490 def unknown_endtag(self
, tag
):
492 print 'end tag: </' + tag
+ '>'
494 def unknown_entityref(self
, ref
):
496 print '*** unknown entity ref: &' + ref
+ ';'
498 def unknown_charref(self
, ref
):
500 print '*** unknown char ref: &#' + ref
+ ';'
502 def unknown_decl(self
, data
):
504 print '*** unknown decl: [' + data
+ ']'
507 SGMLParser
.close(self
)
511 def test(args
= None):
517 if args
and args
[0] == '-s':
521 klass
= TestSGMLParser
538 if f
is not sys
.stdin
:
547 if __name__
== '__main__':