Lib/sgmllib.py

   1 """A parser for SGML, using the derived class as a static DTD."""
   2
   3 # XXX This only supports those SGML features used by HTML.
   4
   5 # XXX There should be a way to distinguish between PCDATA (parsed
   6 # character data -- the normal case), RCDATA (replaceable character
   7 # data -- only char and entity references and end tags are special)
   8 # and CDATA (character data -- only end tags are special).  RCDATA is
   9 # not supported at all.
  10
  11
  12 from warnings import warnpy3k
  13 warnpy3k("the sgmllib module has been removed in Python 3.0",
  14          stacklevel=2)
  15 del warnpy3k
  16
  17 import markupbase
  18 import re
  19
  20 __all__ = ["SGMLParser", "SGMLParseError"]
  21
# Regular expressions used for parsing

# The two characters that can start markup or a reference; goahead()
# searches for this to find the end of a run of plain data.
interesting = re.compile('[&<]')
# A (possibly partial) reference or tag opener.  goahead() uses it to tell
# "more input might complete this" apart from "this is just literal text".
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                           '<([a-zA-Z][^<>]*|'
                              '/([a-zA-Z][^<>]*)?|'
                              '![^<>]*)?')

# '&name' / '&#digits' followed by exactly one terminating character.  The
# terminator is part of the match, so goahead() steps back by one when it
# is not ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' followed by '>' or a letter: the beginning of a start tag.
starttagopen = re.compile('<[>a-zA-Z]')
# SGML short tag forms: '<name/' and the complete '<name/data/'
# (group 1 is the name, group 2 the data).
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# End of a processing instruction.
piclose = re.compile('>')
# Either bracket; used to locate the end of a start or end tag.
endbracket = re.compile('[<>]')
# A tag name.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: group 1 is the name, group 2 the optional '= value' part,
# group 3 the value itself (single-quoted, double-quoted or unquoted).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
  42
  43
  44 class SGMLParseError(RuntimeError):
  45     """Exception raised for all parse errors."""
  46     pass
  47
  48
  49 # SGML parser base class -- find tags and call handler functions.
  50 # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
  51 # The dtd is defined by deriving a class which defines methods
  52 # with special names to handle tags: start_foo and end_foo to handle
  53 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
  54 # (Tags are converted to lower case for this purpose.)  The data
  55 # between tags is passed to the parser by calling self.handle_data()
  56 # with some data as argument (the data may be split up in arbitrary
  57 # chunks).  Entity references are passed by calling
  58 # self.handle_entityref() with the entity reference as argument.
  59
  60 class SGMLParser(markupbase.ParserBase):
  61     # Definition of entities -- derived classes may override
  62     entity_or_charref = re.compile('&(?:'
  63       '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
  64       ')(;?)')
  65
  66     def __init__(self, verbose=0):
  67         """Initialize and reset this instance."""
  68         self.verbose = verbose
  69         self.reset()
  70
  71     def reset(self):
  72         """Reset this instance. Loses all unprocessed data."""
  73         self.__starttag_text = None
  74         self.rawdata = ''
  75         self.stack = []
  76         self.lasttag = '???'
  77         self.nomoretags = 0
  78         self.literal = 0
  79         markupbase.ParserBase.reset(self)
  80
  81     def setnomoretags(self):
  82         """Enter literal mode (CDATA) till EOF.
  83
  84         Intended for derived classes only.
  85         """
  86         self.nomoretags = self.literal = 1
  87
  88     def setliteral(self, *args):
  89         """Enter literal mode (CDATA).
  90
  91         Intended for derived classes only.
  92         """
  93         self.literal = 1
  94
  95     def feed(self, data):
  96         """Feed some data to the parser.
  97
  98         Call this as often as you want, with as little or as much text
  99         as you want (may include '\n').  (This just saves the text,
 100         all the processing is done by goahead().)
 101         """
 102
 103         self.rawdata = self.rawdata + data
 104         self.goahead(0)
 105
 106     def close(self):
 107         """Handle the remaining data."""
 108         self.goahead(1)
 109
 110     def error(self, message):
 111         raise SGMLParseError(message)
 112
 113     # Internal -- handle data as far as reasonable.  May leave state
 114     # and data to be processed by a subsequent call.  If 'end' is
 115     # true, force handling all data as if followed by EOF marker.
 116     def goahead(self, end):
 117         rawdata = self.rawdata
 118         i = 0
 119         n = len(rawdata)
 120         while i < n:
 121             if self.nomoretags:
 122                 self.handle_data(rawdata[i:n])
 123                 i = n
 124                 break
 125             match = interesting.search(rawdata, i)
 126             if match: j = match.start()
 127             else: j = n
 128             if i < j:
 129                 self.handle_data(rawdata[i:j])
 130             i = j
 131             if i == n: break
 132             if rawdata[i] == '<':
 133                 if starttagopen.match(rawdata, i):
 134                     if self.literal:
 135                         self.handle_data(rawdata[i])
 136                         i = i+1
 137                         continue
 138                     k = self.parse_starttag(i)
 139                     if k < 0: break
 140                     i = k
 141                     continue
 142                 if rawdata.startswith("</", i):
 143                     k = self.parse_endtag(i)
 144                     if k < 0: break
 145                     i = k
 146                     self.literal = 0
 147                     continue
 148                 if self.literal:
 149                     if n > (i + 1):
 150                         self.handle_data("<")
 151                         i = i+1
 152                     else:
 153                         # incomplete
 154                         break
 155                     continue
 156                 if rawdata.startswith("<!--", i):
 157                         # Strictly speaking, a comment is --.*--
 158                         # within a declaration tag <!...>.
 159                         # This should be removed,
 160                         # and comments handled only in parse_declaration.
 161                     k = self.parse_comment(i)
 162                     if k < 0: break
 163                     i = k
 164                     continue
 165                 if rawdata.startswith("<?", i):
 166                     k = self.parse_pi(i)
 167                     if k < 0: break
 168                     i = i+k
 169                     continue
 170                 if rawdata.startswith("<!", i):
 171                     # This is some sort of declaration; in "HTML as
 172                     # deployed," this should only be the document type
 173                     # declaration ("<!DOCTYPE html...>").
 174                     k = self.parse_declaration(i)
 175                     if k < 0: break
 176                     i = k
 177                     continue
 178             elif rawdata[i] == '&':
 179                 if self.literal:
 180                     self.handle_data(rawdata[i])
 181                     i = i+1
 182                     continue
 183                 match = charref.match(rawdata, i)
 184                 if match:
 185                     name = match.group(1)
 186                     self.handle_charref(name)
 187                     i = match.end(0)
 188                     if rawdata[i-1] != ';': i = i-1
 189                     continue
 190                 match = entityref.match(rawdata, i)
 191                 if match:
 192                     name = match.group(1)
 193                     self.handle_entityref(name)
 194                     i = match.end(0)
 195                     if rawdata[i-1] != ';': i = i-1
 196                     continue
 197             else:
 198                 self.error('neither < nor & ??')
 199             # We get here only if incomplete matches but
 200             # nothing else
 201             match = incomplete.match(rawdata, i)
 202             if not match:
 203                 self.handle_data(rawdata[i])
 204                 i = i+1
 205                 continue
 206             j = match.end(0)
 207             if j == n:
 208                 break # Really incomplete
 209             self.handle_data(rawdata[i:j])
 210             i = j
 211         # end while
 212         if end and i < n:
 213             self.handle_data(rawdata[i:n])
 214             i = n
 215         self.rawdata = rawdata[i:]
 216         # XXX if end: check for empty stack
 217
 218     # Extensions for the DOCTYPE scanner:
 219     _decl_otherchars = '='
 220
 221     # Internal -- parse processing instr, return length or -1 if not terminated
 222     def parse_pi(self, i):
 223         rawdata = self.rawdata
 224         if rawdata[i:i+2] != '<?':
 225             self.error('unexpected call to parse_pi()')
 226         match = piclose.search(rawdata, i+2)
 227         if not match:
 228             return -1
 229         j = match.start(0)
 230         self.handle_pi(rawdata[i+2: j])
 231         j = match.end(0)
 232         return j-i
 233
 234     def get_starttag_text(self):
 235         return self.__starttag_text
 236
 237     # Internal -- handle starttag, return length or -1 if not terminated
 238     def parse_starttag(self, i):
 239         self.__starttag_text = None
 240         start_pos = i
 241         rawdata = self.rawdata
 242         if shorttagopen.match(rawdata, i):
 243             # SGML shorthand: <tag/data/ == <tag>data</tag>
 244             # XXX Can data contain &... (entity or char refs)?
 245             # XXX Can data contain < or > (tag characters)?
 246             # XXX Can there be whitespace before the first /?
 247             match = shorttag.match(rawdata, i)
 248             if not match:
 249                 return -1
 250             tag, data = match.group(1, 2)
 251             self.__starttag_text = '<%s/' % tag
 252             tag = tag.lower()
 253             k = match.end(0)
 254             self.finish_shorttag(tag, data)
 255             self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
 256             return k
 257         # XXX The following should skip matching quotes (' or ")
 258         # As a shortcut way to exit, this isn't so bad, but shouldn't
 259         # be used to locate the actual end of the start tag since the
 260         # < or > characters may be embedded in an attribute value.
 261         match = endbracket.search(rawdata, i+1)
 262         if not match:
 263             return -1
 264         j = match.start(0)
 265         # Now parse the data between i+1 and j into a tag and attrs
 266         attrs = []
 267         if rawdata[i:i+2] == '<>':
 268             # SGML shorthand: <> == <last open tag seen>
 269             k = j
 270             tag = self.lasttag
 271         else:
 272             match = tagfind.match(rawdata, i+1)
 273             if not match:
 274                 self.error('unexpected call to parse_starttag')
 275             k = match.end(0)
 276             tag = rawdata[i+1:k].lower()
 277             self.lasttag = tag
 278         while k < j:
 279             match = attrfind.match(rawdata, k)
 280             if not match: break
 281             attrname, rest, attrvalue = match.group(1, 2, 3)
 282             if not rest:
 283                 attrvalue = attrname
 284             else:
 285                 if (attrvalue[:1] == "'" == attrvalue[-1:] or
 286                     attrvalue[:1] == '"' == attrvalue[-1:]):
 287                     # strip quotes
 288                     attrvalue = attrvalue[1:-1]
 289                 attrvalue = self.entity_or_charref.sub(
 290                     self._convert_ref, attrvalue)
 291             attrs.append((attrname.lower(), attrvalue))
 292             k = match.end(0)
 293         if rawdata[j] == '>':
 294             j = j+1
 295         self.__starttag_text = rawdata[start_pos:j]
 296         self.finish_starttag(tag, attrs)
 297         return j
 298
 299     # Internal -- convert entity or character reference
 300     def _convert_ref(self, match):
 301         if match.group(2):
 302             return self.convert_charref(match.group(2)) or \
 303                 '&#%s%s' % match.groups()[1:]
 304         elif match.group(3):
 305             return self.convert_entityref(match.group(1)) or \
 306                 '&%s;' % match.group(1)
 307         else:
 308             return '&%s' % match.group(1)
 309
 310     # Internal -- parse endtag
 311     def parse_endtag(self, i):
 312         rawdata = self.rawdata
 313         match = endbracket.search(rawdata, i+1)
 314         if not match:
 315             return -1
 316         j = match.start(0)
 317         tag = rawdata[i+2:j].strip().lower()
 318         if rawdata[j] == '>':
 319             j = j+1
 320         self.finish_endtag(tag)
 321         return j
 322
 323     # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
 324     def finish_shorttag(self, tag, data):
 325         self.finish_starttag(tag, [])
 326         self.handle_data(data)
 327         self.finish_endtag(tag)
 328
 329     # Internal -- finish processing of start tag
 330     # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
 331     def finish_starttag(self, tag, attrs):
 332         try:
 333             method = getattr(self, 'start_' + tag)
 334         except AttributeError:
 335             try:
 336                 method = getattr(self, 'do_' + tag)
 337             except AttributeError:
 338                 self.unknown_starttag(tag, attrs)
 339                 return -1
 340             else:
 341                 self.handle_starttag(tag, method, attrs)
 342                 return 0
 343         else:
 344             self.stack.append(tag)
 345             self.handle_starttag(tag, method, attrs)
 346             return 1
 347
 348     # Internal -- finish processing of end tag
 349     def finish_endtag(self, tag):
 350         if not tag:
 351             found = len(self.stack) - 1
 352             if found < 0:
 353                 self.unknown_endtag(tag)
 354                 return
 355         else:
 356             if tag not in self.stack:
 357                 try:
 358                     method = getattr(self, 'end_' + tag)
 359                 except AttributeError:
 360                     self.unknown_endtag(tag)
 361                 else:
 362                     self.report_unbalanced(tag)
 363                 return
 364             found = len(self.stack)
 365             for i in range(found):
 366                 if self.stack[i] == tag: found = i
 367         while len(self.stack) > found:
 368             tag = self.stack[-1]
 369             try:
 370                 method = getattr(self, 'end_' + tag)
 371             except AttributeError:
 372                 method = None
 373             if method:
 374                 self.handle_endtag(tag, method)
 375             else:
 376                 self.unknown_endtag(tag)
 377             del self.stack[-1]
 378
 379     # Overridable -- handle start tag
 380     def handle_starttag(self, tag, method, attrs):
 381         method(attrs)
 382
 383     # Overridable -- handle end tag
 384     def handle_endtag(self, tag, method):
 385         method()
 386
 387     # Example -- report an unbalanced </...> tag.
 388     def report_unbalanced(self, tag):
 389         if self.verbose:
 390             print '*** Unbalanced </' + tag + '>'
 391             print '*** Stack:', self.stack
 392
 393     def convert_charref(self, name):
 394         """Convert character reference, may be overridden."""
 395         try:
 396             n = int(name)
 397         except ValueError:
 398             return
 399         if not 0 <= n <= 255:
 400             return
 401         return self.convert_codepoint(n)
 402
 403     def convert_codepoint(self, codepoint):
 404         return chr(codepoint)
 405
 406     def handle_charref(self, name):
 407         """Handle character reference, no need to override."""
 408         replacement = self.convert_charref(name)
 409         if replacement is None:
 410             self.unknown_charref(name)
 411         else:
 412             self.handle_data(replacement)
 413
 414     # Definition of entities -- derived classes may override
 415     entitydefs = \
 416             {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
 417
 418     def convert_entityref(self, name):
 419         """Convert entity references.
 420
 421         As an alternative to overriding this method; one can tailor the
 422         results by setting up the self.entitydefs mapping appropriately.
 423         """
 424         table = self.entitydefs
 425         if name in table:
 426             return table[name]
 427         else:
 428             return
 429
 430     def handle_entityref(self, name):
 431         """Handle entity references, no need to override."""
 432         replacement = self.convert_entityref(name)
 433         if replacement is None:
 434             self.unknown_entityref(name)
 435         else:
 436             self.handle_data(replacement)
 437
 438     # Example -- handle data, should be overridden
 439     def handle_data(self, data):
 440         pass
 441
 442     # Example -- handle comment, could be overridden
 443     def handle_comment(self, data):
 444         pass
 445
 446     # Example -- handle declaration, could be overridden
 447     def handle_decl(self, decl):
 448         pass
 449
 450     # Example -- handle processing instruction, could be overridden
 451     def handle_pi(self, data):
 452         pass
 453
 454     # To be overridden -- handlers for unknown objects
 455     def unknown_starttag(self, tag, attrs): pass
 456     def unknown_endtag(self, tag): pass
 457     def unknown_charref(self, ref): pass
 458     def unknown_entityref(self, ref): pass
 459
 460
 461 class TestSGMLParser(SGMLParser):
 462
 463     def __init__(self, verbose=0):
 464         self.testdata = ""
 465         SGMLParser.__init__(self, verbose)
 466
 467     def handle_data(self, data):
 468         self.testdata = self.testdata + data
 469         if len(repr(self.testdata)) >= 70:
 470             self.flush()
 471
 472     def flush(self):
 473         data = self.testdata
 474         if data:
 475             self.testdata = ""
 476             print 'data:', repr(data)
 477
 478     def handle_comment(self, data):
 479         self.flush()
 480         r = repr(data)
 481         if len(r) > 68:
 482             r = r[:32] + '...' + r[-32:]
 483         print 'comment:', r
 484
 485     def unknown_starttag(self, tag, attrs):
 486         self.flush()
 487         if not attrs:
 488             print 'start tag: <' + tag + '>'
 489         else:
 490             print 'start tag: <' + tag,
 491             for name, value in attrs:
 492                 print name + '=' + '"' + value + '"',
 493             print '>'
 494
 495     def unknown_endtag(self, tag):
 496         self.flush()
 497         print 'end tag: </' + tag + '>'
 498
 499     def unknown_entityref(self, ref):
 500         self.flush()
 501         print '*** unknown entity ref: &' + ref + ';'
 502
 503     def unknown_charref(self, ref):
 504         self.flush()
 505         print '*** unknown char ref: &#' + ref + ';'
 506
 507     def unknown_decl(self, data):
 508         self.flush()
 509         print '*** unknown decl: [' + data + ']'
 510
 511     def close(self):
 512         SGMLParser.close(self)
 513         self.flush()
 514
 515
 516 def test(args = None):
 517     import sys
 518
 519     if args is None:
 520         args = sys.argv[1:]
 521
 522     if args and args[0] == '-s':
 523         args = args[1:]
 524         klass = SGMLParser
 525     else:
 526         klass = TestSGMLParser
 527
 528     if args:
 529         file = args[0]
 530     else:
 531         file = 'test.html'
 532
 533     if file == '-':
 534         f = sys.stdin
 535     else:
 536         try:
 537             f = open(file, 'r')
 538         except IOError, msg:
 539             print file, ":", msg
 540             sys.exit(1)
 541
 542     data = f.read()
 543     if f is not sys.stdin:
 544         f.close()
 545
 546     x = klass()
 547     for c in data:
 548         x.feed(c)
 549     x.close()
 550
 551
# Run the manual smoke test when this file is executed as a script.
if __name__ == '__main__':
    test()