Lib/sgmllib.py

   1 """A parser for SGML, using the derived class as a static DTD."""
   2
   3 # XXX This only supports those SGML features used by HTML.
   4
   5 # XXX There should be a way to distinguish between PCDATA (parsed
   6 # character data -- the normal case), RCDATA (replaceable character
   7 # data -- only char and entity references and end tags are special)
   8 # and CDATA (character data -- only end tags are special).  RCDATA is
   9 # not supported at all.
  10
  11
  12 import markupbase
  13 import re
  14
  15 __all__ = ["SGMLParser", "SGMLParseError"]
  16
  17 # Regular expressions used for parsing
  18
  19 interesting = re.compile('[&<]')
  20 incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
  21                            '<([a-zA-Z][^<>]*|'
  22                               '/([a-zA-Z][^<>]*)?|'
  23                               '![^<>]*)?')
  24
  25 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
  26 charref = re.compile('&#([0-9]+)[^0-9]')
  27
  28 starttagopen = re.compile('<[>a-zA-Z]')
  29 shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
  30 shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
  31 piclose = re.compile('>')
  32 endbracket = re.compile('[<>]')
  33 tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
  34 attrfind = re.compile(
  35     r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
  36     r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
  37
  38
  39 class SGMLParseError(RuntimeError):
  40     """Exception raised for all parse errors."""
  41     pass
  42
  43
  44 # SGML parser base class -- find tags and call handler functions.
  45 # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
  46 # The dtd is defined by deriving a class which defines methods
  47 # with special names to handle tags: start_foo and end_foo to handle
  48 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
  49 # (Tags are converted to lower case for this purpose.)  The data
  50 # between tags is passed to the parser by calling self.handle_data()
  51 # with some data as argument (the data may be split up in arbitrary
  52 # chunks).  Entity references are passed by calling
  53 # self.handle_entityref() with the entity reference as argument.
  54
  55 class SGMLParser(markupbase.ParserBase):
  56     # Definition of entities -- derived classes may override
  57     entity_or_charref = re.compile('&(?:'
  58       '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
  59       ')(;?)')
  60
  61     def __init__(self, verbose=0):
  62         """Initialize and reset this instance."""
  63         self.verbose = verbose
  64         self.reset()
  65
  66     def reset(self):
  67         """Reset this instance. Loses all unprocessed data."""
  68         self.__starttag_text = None
  69         self.rawdata = ''
  70         self.stack = []
  71         self.lasttag = '???'
  72         self.nomoretags = 0
  73         self.literal = 0
  74         markupbase.ParserBase.reset(self)
  75
  76     def setnomoretags(self):
  77         """Enter literal mode (CDATA) till EOF.
  78
  79         Intended for derived classes only.
  80         """
  81         self.nomoretags = self.literal = 1
  82
  83     def setliteral(self, *args):
  84         """Enter literal mode (CDATA).
  85
  86         Intended for derived classes only.
  87         """
  88         self.literal = 1
  89
  90     def feed(self, data):
  91         """Feed some data to the parser.
  92
  93         Call this as often as you want, with as little or as much text
  94         as you want (may include '\n').  (This just saves the text,
  95         all the processing is done by goahead().)
  96         """
  97
  98         self.rawdata = self.rawdata + data
  99         self.goahead(0)
 100
 101     def close(self):
 102         """Handle the remaining data."""
 103         self.goahead(1)
 104
 105     def error(self, message):
 106         raise SGMLParseError(message)
 107
 108     # Internal -- handle data as far as reasonable.  May leave state
 109     # and data to be processed by a subsequent call.  If 'end' is
 110     # true, force handling all data as if followed by EOF marker.
 111     def goahead(self, end):
 112         rawdata = self.rawdata
 113         i = 0
 114         n = len(rawdata)
 115         while i < n:
 116             if self.nomoretags:
 117                 self.handle_data(rawdata[i:n])
 118                 i = n
 119                 break
 120             match = interesting.search(rawdata, i)
 121             if match: j = match.start()
 122             else: j = n
 123             if i < j:
 124                 self.handle_data(rawdata[i:j])
 125             i = j
 126             if i == n: break
 127             if rawdata[i] == '<':
 128                 if starttagopen.match(rawdata, i):
 129                     if self.literal:
 130                         self.handle_data(rawdata[i])
 131                         i = i+1
 132                         continue
 133                     k = self.parse_starttag(i)
 134                     if k < 0: break
 135                     i = k
 136                     continue
 137                 if rawdata.startswith("</", i):
 138                     k = self.parse_endtag(i)
 139                     if k < 0: break
 140                     i = k
 141                     self.literal = 0
 142                     continue
 143                 if self.literal:
 144                     if n > (i + 1):
 145                         self.handle_data("<")
 146                         i = i+1
 147                     else:
 148                         # incomplete
 149                         break
 150                     continue
 151                 if rawdata.startswith("<!--", i):
 152                         # Strictly speaking, a comment is --.*--
 153                         # within a declaration tag <!...>.
 154                         # This should be removed,
 155                         # and comments handled only in parse_declaration.
 156                     k = self.parse_comment(i)
 157                     if k < 0: break
 158                     i = k
 159                     continue
 160                 if rawdata.startswith("<?", i):
 161                     k = self.parse_pi(i)
 162                     if k < 0: break
 163                     i = i+k
 164                     continue
 165                 if rawdata.startswith("<!", i):
 166                     # This is some sort of declaration; in "HTML as
 167                     # deployed," this should only be the document type
 168                     # declaration ("<!DOCTYPE html...>").
 169                     k = self.parse_declaration(i)
 170                     if k < 0: break
 171                     i = k
 172                     continue
 173             elif rawdata[i] == '&':
 174                 if self.literal:
 175                     self.handle_data(rawdata[i])
 176                     i = i+1
 177                     continue
 178                 match = charref.match(rawdata, i)
 179                 if match:
 180                     name = match.group(1)
 181                     self.handle_charref(name)
 182                     i = match.end(0)
 183                     if rawdata[i-1] != ';': i = i-1
 184                     continue
 185                 match = entityref.match(rawdata, i)
 186                 if match:
 187                     name = match.group(1)
 188                     self.handle_entityref(name)
 189                     i = match.end(0)
 190                     if rawdata[i-1] != ';': i = i-1
 191                     continue
 192             else:
 193                 self.error('neither < nor & ??')
 194             # We get here only if incomplete matches but
 195             # nothing else
 196             match = incomplete.match(rawdata, i)
 197             if not match:
 198                 self.handle_data(rawdata[i])
 199                 i = i+1
 200                 continue
 201             j = match.end(0)
 202             if j == n:
 203                 break # Really incomplete
 204             self.handle_data(rawdata[i:j])
 205             i = j
 206         # end while
 207         if end and i < n:
 208             self.handle_data(rawdata[i:n])
 209             i = n
 210         self.rawdata = rawdata[i:]
 211         # XXX if end: check for empty stack
 212
 213     # Extensions for the DOCTYPE scanner:
 214     _decl_otherchars = '='
 215
 216     # Internal -- parse processing instr, return length or -1 if not terminated
 217     def parse_pi(self, i):
 218         rawdata = self.rawdata
 219         if rawdata[i:i+2] != '<?':
 220             self.error('unexpected call to parse_pi()')
 221         match = piclose.search(rawdata, i+2)
 222         if not match:
 223             return -1
 224         j = match.start(0)
 225         self.handle_pi(rawdata[i+2: j])
 226         j = match.end(0)
 227         return j-i
 228
 229     def get_starttag_text(self):
 230         return self.__starttag_text
 231
 232     # Internal -- handle starttag, return length or -1 if not terminated
 233     def parse_starttag(self, i):
 234         self.__starttag_text = None
 235         start_pos = i
 236         rawdata = self.rawdata
 237         if shorttagopen.match(rawdata, i):
 238             # SGML shorthand: <tag/data/ == <tag>data</tag>
 239             # XXX Can data contain &... (entity or char refs)?
 240             # XXX Can data contain < or > (tag characters)?
 241             # XXX Can there be whitespace before the first /?
 242             match = shorttag.match(rawdata, i)
 243             if not match:
 244                 return -1
 245             tag, data = match.group(1, 2)
 246             self.__starttag_text = '<%s/' % tag
 247             tag = tag.lower()
 248             k = match.end(0)
 249             self.finish_shorttag(tag, data)
 250             self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
 251             return k
 252         # XXX The following should skip matching quotes (' or ")
 253         # As a shortcut way to exit, this isn't so bad, but shouldn't
 254         # be used to locate the actual end of the start tag since the
 255         # < or > characters may be embedded in an attribute value.
 256         match = endbracket.search(rawdata, i+1)
 257         if not match:
 258             return -1
 259         j = match.start(0)
 260         # Now parse the data between i+1 and j into a tag and attrs
 261         attrs = []
 262         if rawdata[i:i+2] == '<>':
 263             # SGML shorthand: <> == <last open tag seen>
 264             k = j
 265             tag = self.lasttag
 266         else:
 267             match = tagfind.match(rawdata, i+1)
 268             if not match:
 269                 self.error('unexpected call to parse_starttag')
 270             k = match.end(0)
 271             tag = rawdata[i+1:k].lower()
 272             self.lasttag = tag
 273         while k < j:
 274             match = attrfind.match(rawdata, k)
 275             if not match: break
 276             attrname, rest, attrvalue = match.group(1, 2, 3)
 277             if not rest:
 278                 attrvalue = attrname
 279             else:
 280                 if (attrvalue[:1] == "'" == attrvalue[-1:] or
 281                     attrvalue[:1] == '"' == attrvalue[-1:]):
 282                     # strip quotes
 283                     attrvalue = attrvalue[1:-1]
 284                 attrvalue = self.entity_or_charref.sub(
 285                     self._convert_ref, attrvalue)
 286             attrs.append((attrname.lower(), attrvalue))
 287             k = match.end(0)
 288         if rawdata[j] == '>':
 289             j = j+1
 290         self.__starttag_text = rawdata[start_pos:j]
 291         self.finish_starttag(tag, attrs)
 292         return j
 293
 294     # Internal -- convert entity or character reference
 295     def _convert_ref(self, match):
 296         if match.group(2):
 297             return self.convert_charref(match.group(2)) or \
 298                 '&#%s%s' % match.groups()[1:]
 299         elif match.group(3):
 300             return self.convert_entityref(match.group(1)) or \
 301                 '&%s;' % match.group(1)
 302         else:
 303             return '&%s' % match.group(1)
 304
 305     # Internal -- parse endtag
 306     def parse_endtag(self, i):
 307         rawdata = self.rawdata
 308         match = endbracket.search(rawdata, i+1)
 309         if not match:
 310             return -1
 311         j = match.start(0)
 312         tag = rawdata[i+2:j].strip().lower()
 313         if rawdata[j] == '>':
 314             j = j+1
 315         self.finish_endtag(tag)
 316         return j
 317
 318     # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
 319     def finish_shorttag(self, tag, data):
 320         self.finish_starttag(tag, [])
 321         self.handle_data(data)
 322         self.finish_endtag(tag)
 323
 324     # Internal -- finish processing of start tag
 325     # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
 326     def finish_starttag(self, tag, attrs):
 327         try:
 328             method = getattr(self, 'start_' + tag)
 329         except AttributeError:
 330             try:
 331                 method = getattr(self, 'do_' + tag)
 332             except AttributeError:
 333                 self.unknown_starttag(tag, attrs)
 334                 return -1
 335             else:
 336                 self.handle_starttag(tag, method, attrs)
 337                 return 0
 338         else:
 339             self.stack.append(tag)
 340             self.handle_starttag(tag, method, attrs)
 341             return 1
 342
 343     # Internal -- finish processing of end tag
 344     def finish_endtag(self, tag):
 345         if not tag:
 346             found = len(self.stack) - 1
 347             if found < 0:
 348                 self.unknown_endtag(tag)
 349                 return
 350         else:
 351             if tag not in self.stack:
 352                 try:
 353                     method = getattr(self, 'end_' + tag)
 354                 except AttributeError:
 355                     self.unknown_endtag(tag)
 356                 else:
 357                     self.report_unbalanced(tag)
 358                 return
 359             found = len(self.stack)
 360             for i in range(found):
 361                 if self.stack[i] == tag: found = i
 362         while len(self.stack) > found:
 363             tag = self.stack[-1]
 364             try:
 365                 method = getattr(self, 'end_' + tag)
 366             except AttributeError:
 367                 method = None
 368             if method:
 369                 self.handle_endtag(tag, method)
 370             else:
 371                 self.unknown_endtag(tag)
 372             del self.stack[-1]
 373
 374     # Overridable -- handle start tag
 375     def handle_starttag(self, tag, method, attrs):
 376         method(attrs)
 377
 378     # Overridable -- handle end tag
 379     def handle_endtag(self, tag, method):
 380         method()
 381
 382     # Example -- report an unbalanced </...> tag.
 383     def report_unbalanced(self, tag):
 384         if self.verbose:
 385             print '*** Unbalanced </' + tag + '>'
 386             print '*** Stack:', self.stack
 387
 388     def convert_charref(self, name):
 389         """Convert character reference, may be overridden."""
 390         try:
 391             n = int(name)
 392         except ValueError:
 393             return
 394         if not 0 <= n <= 255:
 395             return
 396         return self.convert_codepoint(n)
 397
 398     def convert_codepoint(self, codepoint):
 399         return chr(codepoint)
 400
 401     def handle_charref(self, name):
 402         """Handle character reference, no need to override."""
 403         replacement = self.convert_charref(name)
 404         if replacement is None:
 405             self.unknown_charref(name)
 406         else:
 407             self.handle_data(replacement)
 408
 409     # Definition of entities -- derived classes may override
 410     entitydefs = \
 411             {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
 412
 413     def convert_entityref(self, name):
 414         """Convert entity references.
 415
 416         As an alternative to overriding this method; one can tailor the
 417         results by setting up the self.entitydefs mapping appropriately.
 418         """
 419         table = self.entitydefs
 420         if name in table:
 421             return table[name]
 422         else:
 423             return
 424
 425     def handle_entityref(self, name):
 426         """Handle entity references, no need to override."""
 427         replacement = self.convert_entityref(name)
 428         if replacement is None:
 429             self.unknown_entityref(name)
 430         else:
 431             self.handle_data(self.convert_entityref(name))
 432
 433     # Example -- handle data, should be overridden
 434     def handle_data(self, data):
 435         pass
 436
 437     # Example -- handle comment, could be overridden
 438     def handle_comment(self, data):
 439         pass
 440
 441     # Example -- handle declaration, could be overridden
 442     def handle_decl(self, decl):
 443         pass
 444
 445     # Example -- handle processing instruction, could be overridden
 446     def handle_pi(self, data):
 447         pass
 448
 449     # To be overridden -- handlers for unknown objects
 450     def unknown_starttag(self, tag, attrs): pass
 451     def unknown_endtag(self, tag): pass
 452     def unknown_charref(self, ref): pass
 453     def unknown_entityref(self, ref): pass
 454
 455
 456 class TestSGMLParser(SGMLParser):
 457
 458     def __init__(self, verbose=0):
 459         self.testdata = ""
 460         SGMLParser.__init__(self, verbose)
 461
 462     def handle_data(self, data):
 463         self.testdata = self.testdata + data
 464         if len(repr(self.testdata)) >= 70:
 465             self.flush()
 466
 467     def flush(self):
 468         data = self.testdata
 469         if data:
 470             self.testdata = ""
 471             print 'data:', repr(data)
 472
 473     def handle_comment(self, data):
 474         self.flush()
 475         r = repr(data)
 476         if len(r) > 68:
 477             r = r[:32] + '...' + r[-32:]
 478         print 'comment:', r
 479
 480     def unknown_starttag(self, tag, attrs):
 481         self.flush()
 482         if not attrs:
 483             print 'start tag: <' + tag + '>'
 484         else:
 485             print 'start tag: <' + tag,
 486             for name, value in attrs:
 487                 print name + '=' + '"' + value + '"',
 488             print '>'
 489
 490     def unknown_endtag(self, tag):
 491         self.flush()
 492         print 'end tag: </' + tag + '>'
 493
 494     def unknown_entityref(self, ref):
 495         self.flush()
 496         print '*** unknown entity ref: &' + ref + ';'
 497
 498     def unknown_charref(self, ref):
 499         self.flush()
 500         print '*** unknown char ref: &#' + ref + ';'
 501
 502     def unknown_decl(self, data):
 503         self.flush()
 504         print '*** unknown decl: [' + data + ']'
 505
 506     def close(self):
 507         SGMLParser.close(self)
 508         self.flush()
 509
 510
 511 def test(args = None):
 512     import sys
 513
 514     if args is None:
 515         args = sys.argv[1:]
 516
 517     if args and args[0] == '-s':
 518         args = args[1:]
 519         klass = SGMLParser
 520     else:
 521         klass = TestSGMLParser
 522
 523     if args:
 524         file = args[0]
 525     else:
 526         file = 'test.html'
 527
 528     if file == '-':
 529         f = sys.stdin
 530     else:
 531         try:
 532             f = open(file, 'r')
 533         except IOError, msg:
 534             print file, ":", msg
 535             sys.exit(1)
 536
 537     data = f.read()
 538     if f is not sys.stdin:
 539         f.close()
 540
 541     x = klass()
 542     for c in data:
 543         x.feed(c)
 544     x.close()
 545
 546
 547 if __name__ == '__main__':
 548     test()