1 """A parser for SGML, using the derived class as a static DTD."""
3 # XXX This only supports those SGML features used by HTML.
5 # XXX There should be a way to distinguish between PCDATA (parsed
6 # character data -- the normal case), RCDATA (replaceable character
7 # data -- only char and entity references and end tags are special)
8 # and CDATA (character data -- only end tags are special). RCDATA is
9 # not supported at all.
12 from warnings
import warnpy3k
13 warnpy3k("the sgmllib module has been removed in Python 3.0",
20 __all__
= ["SGMLParser", "SGMLParseError"]
22 # Regular expressions used for parsing
# Matches the next character that may start markup or a reference:
# either '&' or '<'.
interesting = re.compile('[&<]')
25 incomplete
= re
.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
# '&name' followed by one character that is not a letter or digit;
# group 1 is the entity name.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# '&#digits' followed by one non-digit character; group 1 is the digits.
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' followed by either '>' or a letter.
starttagopen = re.compile('<[>a-zA-Z]')
# '<name/' -- the opening of the SGML short-tag form.
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
# '<name/data/' -- group 1 is the tag name, group 2 the data (no '/').
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# The '>' searched for when closing a processing instruction.
piclose = re.compile('>')
# Either angle bracket.
endbracket = re.compile('[<>]')
# A tag name: a letter followed by letters, digits, '-', '_' or '.'.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: optional whitespace, the attribute name (group 1), then an
# optional '= value' part (group 2) whose value (group 3) is single-quoted,
# double-quoted, or an unquoted run of the listed characters.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
class SGMLParseError(RuntimeError):
    """Exception raised for all parse errors."""
49 # SGML parser base class -- find tags and call handler functions.
50 # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
51 # The dtd is defined by deriving a class which defines methods
52 # with special names to handle tags: start_foo and end_foo to handle
53 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
54 # (Tags are converted to lower case for this purpose.) The data
55 # between tags is passed to the parser by calling self.handle_data()
56 # with some data as argument (the data may be split up in arbitrary
57 # chunks). Entity references are passed by calling
58 # self.handle_entityref() with the entity reference as argument.
60 class SGMLParser(markupbase.ParserBase):
61 # Definition of entities -- derived classes may override
62 entity_or_charref = re.compile('&(?
:'
63 '([a
-zA
-Z
][-.a
-zA
-Z0
-9]*)|
#([0-9]+)'
66 def __init__(self
, verbose
=0):
67 """Initialize and reset this instance."""
68 self
.verbose
= verbose
72 """Reset this instance. Loses all unprocessed data."""
73 self
.__starttag
_text
= None
79 markupbase
.ParserBase
.reset(self
)
def setnomoretags(self):
    """Enter literal mode (CDATA) till EOF.

    Intended for derived classes only.

    Sets both the ``nomoretags`` and ``literal`` attributes of the
    instance to 1.
    """
    self.nomoretags = self.literal = 1
88 def setliteral(self
, *args
):
89 """Enter literal mode (CDATA).
91 Intended for derived classes only.
96 """Feed some data to the parser.
98 Call this as often as you want, with as little or as much text
99 as you want (may include '\n'). (This just saves the text,
100 all the processing is done by goahead().)
103 self
.rawdata
= self
.rawdata
+ data
107 """Handle the remaining data."""
def error(self, message):
    """Report a parse error by raising SGMLParseError.

    message -- text passed to the SGMLParseError constructor.

    Always raises; never returns.
    """
    raise SGMLParseError(message)
113 # Internal -- handle data as far as reasonable. May leave state
114 # and data to be processed by a subsequent call. If 'end' is
115 # true, force handling all data as if followed by EOF marker.
116 def goahead(self
, end
):
117 rawdata
= self
.rawdata
122 self
.handle_data(rawdata
[i
:n
])
125 match
= interesting
.search(rawdata
, i
)
126 if match
: j
= match
.start()
129 self
.handle_data(rawdata
[i
:j
])
132 if rawdata
[i
] == '<':
133 if starttagopen
.match(rawdata
, i
):
135 self
.handle_data(rawdata
[i
])
138 k
= self
.parse_starttag(i
)
142 if rawdata
.startswith("</", i
):
143 k
= self
.parse_endtag(i
)
150 self
.handle_data("<")
156 if rawdata
.startswith("<!--", i
):
157 # Strictly speaking, a comment is --.*--
158 # within a declaration tag <!...>.
159 # This should be removed,
160 # and comments handled only in parse_declaration.
161 k
= self
.parse_comment(i
)
165 if rawdata
.startswith("<?", i
):
170 if rawdata
.startswith("<!", i
):
171 # This is some sort of declaration; in "HTML as
172 # deployed," this should only be the document type
173 # declaration ("<!DOCTYPE html...>").
174 k
= self
.parse_declaration(i
)
178 elif rawdata
[i
] == '&':
180 self
.handle_data(rawdata
[i
])
183 match
= charref
.match(rawdata
, i
)
185 name
= match
.group(1)
186 self
.handle_charref(name
)
188 if rawdata
[i
-1] != ';': i
= i
-1
190 match
= entityref
.match(rawdata
, i
)
192 name
= match
.group(1)
193 self
.handle_entityref(name
)
195 if rawdata
[i
-1] != ';': i
= i
-1
198 self
.error('neither < nor & ??')
199 # We get here only if incomplete matches but
201 match
= incomplete
.match(rawdata
, i
)
203 self
.handle_data(rawdata
[i
])
208 break # Really incomplete
209 self
.handle_data(rawdata
[i
:j
])
213 self
.handle_data(rawdata
[i
:n
])
215 self
.rawdata
= rawdata
[i
:]
216 # XXX if end: check for empty stack
218 # Extensions for the DOCTYPE scanner:
219 _decl_otherchars
= '='
221 # Internal -- parse processing instr, return length or -1 if not terminated
222 def parse_pi(self
, i
):
223 rawdata
= self
.rawdata
224 if rawdata
[i
:i
+2] != '<?':
225 self
.error('unexpected call to parse_pi()')
226 match
= piclose
.search(rawdata
, i
+2)
230 self
.handle_pi(rawdata
[i
+2: j
])
def get_starttag_text(self):
    """Return the stored start-tag text.

    This is whatever was last assigned to the private
    ``__starttag_text`` attribute, which is None when no start-tag text
    has been recorded.
    """
    return self.__starttag_text
237 # Internal -- handle starttag, return length or -1 if not terminated
238 def parse_starttag(self
, i
):
239 self
.__starttag
_text
= None
241 rawdata
= self
.rawdata
242 if shorttagopen
.match(rawdata
, i
):
243 # SGML shorthand: <tag/data/ == <tag>data</tag>
244 # XXX Can data contain &... (entity or char refs)?
245 # XXX Can data contain < or > (tag characters)?
246 # XXX Can there be whitespace before the first /?
247 match
= shorttag
.match(rawdata
, i
)
250 tag
, data
= match
.group(1, 2)
251 self
.__starttag
_text
= '<%s/' % tag
254 self
.finish_shorttag(tag
, data
)
255 self
.__starttag
_text
= rawdata
[start_pos
:match
.end(1) + 1]
257 # XXX The following should skip matching quotes (' or ")
258 # As a shortcut way to exit, this isn't so bad, but shouldn't
259 # be used to locate the actual end of the start tag since the
260 # < or > characters may be embedded in an attribute value.
261 match
= endbracket
.search(rawdata
, i
+1)
265 # Now parse the data between i+1 and j into a tag and attrs
267 if rawdata
[i
:i
+2] == '<>':
268 # SGML shorthand: <> == <last open tag seen>
272 match
= tagfind
.match(rawdata
, i
+1)
274 self
.error('unexpected call to parse_starttag')
276 tag
= rawdata
[i
+1:k
].lower()
279 match
= attrfind
.match(rawdata
, k
)
281 attrname
, rest
, attrvalue
= match
.group(1, 2, 3)
285 if (attrvalue
[:1] == "'" == attrvalue
[-1:] or
286 attrvalue
[:1] == '"' == attrvalue
[-1:]):
288 attrvalue
= attrvalue
[1:-1]
289 attrvalue
= self
.entity_or_charref
.sub(
290 self
._convert
_ref
, attrvalue
)
291 attrs
.append((attrname
.lower(), attrvalue
))
293 if rawdata
[j
] == '>':
295 self
.__starttag
_text
= rawdata
[start_pos
:j
]
296 self
.finish_starttag(tag
, attrs
)
299 # Internal -- convert entity or character reference
300 def _convert_ref(self
, match
):
302 return self
.convert_charref(match
.group(2)) or \
303 '&#%s%s' % match
.groups()[1:]
305 return self
.convert_entityref(match
.group(1)) or \
306 '&%s;' % match
.group(1)
308 return '&%s' % match
.group(1)
310 # Internal -- parse endtag
311 def parse_endtag(self
, i
):
312 rawdata
= self
.rawdata
313 match
= endbracket
.search(rawdata
, i
+1)
317 tag
= rawdata
[i
+2:j
].strip().lower()
318 if rawdata
[j
] == '>':
320 self
.finish_endtag(tag
)
323 # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
def finish_shorttag(self, tag, data):
    """Finish parsing of <tag/data/ (same as <tag>data</tag>).

    tag  -- the tag name.
    data -- the text enclosed by the short tag.

    Dispatches, in order, a start tag with an empty attribute list, the
    data, and the matching end tag.
    """
    self.finish_starttag(tag, [])
    self.handle_data(data)
    self.finish_endtag(tag)
329 # Internal -- finish processing of start tag
330 # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
331 def finish_starttag(self
, tag
, attrs
):
333 method
= getattr(self
, 'start_' + tag
)
334 except AttributeError:
336 method
= getattr(self
, 'do_' + tag
)
337 except AttributeError:
338 self
.unknown_starttag(tag
, attrs
)
341 self
.handle_starttag(tag
, method
, attrs
)
344 self
.stack
.append(tag
)
345 self
.handle_starttag(tag
, method
, attrs
)
348 # Internal -- finish processing of end tag
349 def finish_endtag(self
, tag
):
351 found
= len(self
.stack
) - 1
353 self
.unknown_endtag(tag
)
356 if tag
not in self
.stack
:
358 method
= getattr(self
, 'end_' + tag
)
359 except AttributeError:
360 self
.unknown_endtag(tag
)
362 self
.report_unbalanced(tag
)
364 found
= len(self
.stack
)
365 for i
in range(found
):
366 if self
.stack
[i
] == tag
: found
= i
367 while len(self
.stack
) > found
:
370 method
= getattr(self
, 'end_' + tag
)
371 except AttributeError:
374 self
.handle_endtag(tag
, method
)
376 self
.unknown_endtag(tag
)
379 # Overridable -- handle start tag
380 def handle_starttag(self
, tag
, method
, attrs
):
383 # Overridable -- handle end tag
384 def handle_endtag(self
, tag
, method
):
387 # Example -- report an unbalanced </...> tag.
388 def report_unbalanced(self
, tag
):
390 print '*** Unbalanced </' + tag
+ '>'
391 print '*** Stack:', self
.stack
393 def convert_charref(self
, name
):
394 """Convert character reference, may be overridden."""
399 if not 0 <= n
<= 255:
401 return self
.convert_codepoint(n
)
def convert_codepoint(self, codepoint):
    """Return the character for the integer *codepoint*, using chr()."""
    return chr(codepoint)
406 def handle_charref(self
, name
):
407 """Handle character reference, no need to override."""
408 replacement
= self
.convert_charref(name
)
409 if replacement
is None:
410 self
.unknown_charref(name
)
412 self
.handle_data(replacement
)
414 # Definition of entities -- derived classes may override
416 {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
418 def convert_entityref(self
, name
):
419 """Convert entity references.
421 As an alternative to overriding this method; one can tailor the
422 results by setting up the self.entitydefs mapping appropriately.
424 table
= self
.entitydefs
430 def handle_entityref(self
, name
):
431 """Handle entity references, no need to override."""
432 replacement
= self
.convert_entityref(name
)
433 if replacement
is None:
434 self
.unknown_entityref(name
)
436 self
.handle_data(replacement
)
438 # Example -- handle data, should be overridden
439 def handle_data(self
, data
):
442 # Example -- handle comment, could be overridden
443 def handle_comment(self
, data
):
446 # Example -- handle declaration, could be overridden
447 def handle_decl(self
, decl
):
450 # Example -- handle processing instruction, could be overridden
451 def handle_pi(self
, data
):
454 # To be overridden -- handlers for unknown objects
def unknown_starttag(self, tag, attrs):
    """Handle an unknown start tag; does nothing here, to be overridden."""
    pass
def unknown_endtag(self, tag):
    """Handle an unknown end tag; does nothing here, to be overridden."""
    pass
def unknown_charref(self, ref):
    """Handle an unknown character reference; does nothing here, to be overridden."""
    pass
def unknown_entityref(self, ref):
    """Handle an unknown entity reference; does nothing here, to be overridden."""
    pass
461 class TestSGMLParser(SGMLParser
):
463 def __init__(self
, verbose
=0):
465 SGMLParser
.__init
__(self
, verbose
)
467 def handle_data(self
, data
):
468 self
.testdata
= self
.testdata
+ data
469 if len(repr(self
.testdata
)) >= 70:
476 print 'data:', repr(data
)
478 def handle_comment(self
, data
):
482 r
= r
[:32] + '...' + r
[-32:]
485 def unknown_starttag(self
, tag
, attrs
):
488 print 'start tag: <' + tag
+ '>'
490 print 'start tag: <' + tag
,
491 for name
, value
in attrs
:
492 print name
+ '=' + '"' + value
+ '"',
495 def unknown_endtag(self
, tag
):
497 print 'end tag: </' + tag
+ '>'
499 def unknown_entityref(self
, ref
):
501 print '*** unknown entity ref: &' + ref
+ ';'
503 def unknown_charref(self
, ref
):
505 print '*** unknown char ref: &#' + ref
+ ';'
507 def unknown_decl(self
, data
):
509 print '*** unknown decl: [' + data
+ ']'
512 SGMLParser
.close(self
)
516 def test(args
= None):
522 if args
and args
[0] == '-s':
526 klass
= TestSGMLParser
543 if f
is not sys
.stdin
:
552 if __name__
== '__main__':