Lib/sgmllib.py

   1 """A parser for SGML, using the derived class as a static DTD."""
   2
   3 # XXX This only supports those SGML features used by HTML.
   4
   5 # XXX There should be a way to distinguish between PCDATA (parsed
   6 # character data -- the normal case), RCDATA (replaceable character
   7 # data -- only char and entity references and end tags are special)
   8 # and CDATA (character data -- only end tags are special).  RCDATA is
   9 # not supported at all.
  10
  11
  12 import markupbase
  13 import re
  14
  15 __all__ = ["SGMLParser", "SGMLParseError"]
  16
  17 # Regular expressions used for parsing
  18
  19 interesting = re.compile('[&<]')
  20 incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
  21                            '<([a-zA-Z][^<>]*|'
  22                               '/([a-zA-Z][^<>]*)?|'
  23                               '![^<>]*)?')
  24
  25 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
  26 charref = re.compile('&#([0-9]+)[^0-9]')
  27
  28 starttagopen = re.compile('<[>a-zA-Z]')
  29 shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
  30 shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
  31 piclose = re.compile('>')
  32 endbracket = re.compile('[<>]')
  33 tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
  34 attrfind = re.compile(
  35     r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
  36     r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
  37
  38
  39 class SGMLParseError(RuntimeError):
  40     """Exception raised for all parse errors."""
  41     pass
  42
  43
  44 # SGML parser base class -- find tags and call handler functions.
  45 # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
  46 # The dtd is defined by deriving a class which defines methods
  47 # with special names to handle tags: start_foo and end_foo to handle
  48 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
  49 # (Tags are converted to lower case for this purpose.)  The data
  50 # between tags is passed to the parser by calling self.handle_data()
  51 # with some data as argument (the data may be split up in arbitrary
  52 # chunks).  Entity references are passed by calling
  53 # self.handle_entityref() with the entity reference as argument.
  54
  55 class SGMLParser(markupbase.ParserBase):
  56
  57     def __init__(self, verbose=0):
  58         """Initialize and reset this instance."""
  59         self.verbose = verbose
  60         self.reset()
  61
  62     def reset(self):
  63         """Reset this instance. Loses all unprocessed data."""
  64         self.__starttag_text = None
  65         self.rawdata = ''
  66         self.stack = []
  67         self.lasttag = '???'
  68         self.nomoretags = 0
  69         self.literal = 0
  70         markupbase.ParserBase.reset(self)
  71
  72     def setnomoretags(self):
  73         """Enter literal mode (CDATA) till EOF.
  74
  75         Intended for derived classes only.
  76         """
  77         self.nomoretags = self.literal = 1
  78
  79     def setliteral(self, *args):
  80         """Enter literal mode (CDATA).
  81
  82         Intended for derived classes only.
  83         """
  84         self.literal = 1
  85
  86     def feed(self, data):
  87         """Feed some data to the parser.
  88
  89         Call this as often as you want, with as little or as much text
  90         as you want (may include '\n').  (This just saves the text,
  91         all the processing is done by goahead().)
  92         """
  93
  94         self.rawdata = self.rawdata + data
  95         self.goahead(0)
  96
  97     def close(self):
  98         """Handle the remaining data."""
  99         self.goahead(1)
 100
 101     def error(self, message):
 102         raise SGMLParseError(message)
 103
 104     # Internal -- handle data as far as reasonable.  May leave state
 105     # and data to be processed by a subsequent call.  If 'end' is
 106     # true, force handling all data as if followed by EOF marker.
 107     def goahead(self, end):
 108         rawdata = self.rawdata
 109         i = 0
 110         n = len(rawdata)
 111         while i < n:
 112             if self.nomoretags:
 113                 self.handle_data(rawdata[i:n])
 114                 i = n
 115                 break
 116             match = interesting.search(rawdata, i)
 117             if match: j = match.start()
 118             else: j = n
 119             if i < j:
 120                 self.handle_data(rawdata[i:j])
 121             i = j
 122             if i == n: break
 123             if rawdata[i] == '<':
 124                 if starttagopen.match(rawdata, i):
 125                     if self.literal:
 126                         self.handle_data(rawdata[i])
 127                         i = i+1
 128                         continue
 129                     k = self.parse_starttag(i)
 130                     if k < 0: break
 131                     i = k
 132                     continue
 133                 if rawdata.startswith("</", i):
 134                     k = self.parse_endtag(i)
 135                     if k < 0: break
 136                     i = k
 137                     self.literal = 0
 138                     continue
 139                 if self.literal:
 140                     if n > (i + 1):
 141                         self.handle_data("<")
 142                         i = i+1
 143                     else:
 144                         # incomplete
 145                         break
 146                     continue
 147                 if rawdata.startswith("<!--", i):
 148                         # Strictly speaking, a comment is --.*--
 149                         # within a declaration tag <!...>.
 150                         # This should be removed,
 151                         # and comments handled only in parse_declaration.
 152                     k = self.parse_comment(i)
 153                     if k < 0: break
 154                     i = k
 155                     continue
 156                 if rawdata.startswith("<?", i):
 157                     k = self.parse_pi(i)
 158                     if k < 0: break
 159                     i = i+k
 160                     continue
 161                 if rawdata.startswith("<!", i):
 162                     # This is some sort of declaration; in "HTML as
 163                     # deployed," this should only be the document type
 164                     # declaration ("<!DOCTYPE html...>").
 165                     k = self.parse_declaration(i)
 166                     if k < 0: break
 167                     i = k
 168                     continue
 169             elif rawdata[i] == '&':
 170                 if self.literal:
 171                     self.handle_data(rawdata[i])
 172                     i = i+1
 173                     continue
 174                 match = charref.match(rawdata, i)
 175                 if match:
 176                     name = match.group(1)
 177                     self.handle_charref(name)
 178                     i = match.end(0)
 179                     if rawdata[i-1] != ';': i = i-1
 180                     continue
 181                 match = entityref.match(rawdata, i)
 182                 if match:
 183                     name = match.group(1)
 184                     self.handle_entityref(name)
 185                     i = match.end(0)
 186                     if rawdata[i-1] != ';': i = i-1
 187                     continue
 188             else:
 189                 self.error('neither < nor & ??')
 190             # We get here only if incomplete matches but
 191             # nothing else
 192             match = incomplete.match(rawdata, i)
 193             if not match:
 194                 self.handle_data(rawdata[i])
 195                 i = i+1
 196                 continue
 197             j = match.end(0)
 198             if j == n:
 199                 break # Really incomplete
 200             self.handle_data(rawdata[i:j])
 201             i = j
 202         # end while
 203         if end and i < n:
 204             self.handle_data(rawdata[i:n])
 205             i = n
 206         self.rawdata = rawdata[i:]
 207         # XXX if end: check for empty stack
 208
 209     # Extensions for the DOCTYPE scanner:
 210     _decl_otherchars = '='
 211
 212     # Internal -- parse processing instr, return length or -1 if not terminated
 213     def parse_pi(self, i):
 214         rawdata = self.rawdata
 215         if rawdata[i:i+2] != '<?':
 216             self.error('unexpected call to parse_pi()')
 217         match = piclose.search(rawdata, i+2)
 218         if not match:
 219             return -1
 220         j = match.start(0)
 221         self.handle_pi(rawdata[i+2: j])
 222         j = match.end(0)
 223         return j-i
 224
 225     def get_starttag_text(self):
 226         return self.__starttag_text
 227
 228     # Internal -- handle starttag, return length or -1 if not terminated
 229     def parse_starttag(self, i):
 230         self.__starttag_text = None
 231         start_pos = i
 232         rawdata = self.rawdata
 233         if shorttagopen.match(rawdata, i):
 234             # SGML shorthand: <tag/data/ == <tag>data</tag>
 235             # XXX Can data contain &... (entity or char refs)?
 236             # XXX Can data contain < or > (tag characters)?
 237             # XXX Can there be whitespace before the first /?
 238             match = shorttag.match(rawdata, i)
 239             if not match:
 240                 return -1
 241             tag, data = match.group(1, 2)
 242             self.__starttag_text = '<%s/' % tag
 243             tag = tag.lower()
 244             k = match.end(0)
 245             self.finish_shorttag(tag, data)
 246             self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
 247             return k
 248         # XXX The following should skip matching quotes (' or ")
 249         match = endbracket.search(rawdata, i+1)
 250         if not match:
 251             return -1
 252         j = match.start(0)
 253         # Now parse the data between i+1 and j into a tag and attrs
 254         attrs = []
 255         if rawdata[i:i+2] == '<>':
 256             # SGML shorthand: <> == <last open tag seen>
 257             k = j
 258             tag = self.lasttag
 259         else:
 260             match = tagfind.match(rawdata, i+1)
 261             if not match:
 262                 self.error('unexpected call to parse_starttag')
 263             k = match.end(0)
 264             tag = rawdata[i+1:k].lower()
 265             self.lasttag = tag
 266         while k < j:
 267             match = attrfind.match(rawdata, k)
 268             if not match: break
 269             attrname, rest, attrvalue = match.group(1, 2, 3)
 270             if not rest:
 271                 attrvalue = attrname
 272             else:
 273                 if (attrvalue[:1] == "'" == attrvalue[-1:] or
 274                     attrvalue[:1] == '"' == attrvalue[-1:]):
 275                     # strip quotes
 276                     attrvalue = attrvalue[1:-1]
 277                 l = 0
 278                 new_attrvalue = ''
 279                 while l < len(attrvalue):
 280                     av_match = entityref.match(attrvalue, l)
 281                     if (av_match and av_match.group(1) in self.entitydefs and
 282                         attrvalue[av_match.end(1)] == ';'):
 283                         # only substitute entityrefs ending in ';' since
 284                         # otherwise we may break <a href='?p=x&q=y'>
 285                         # which is very common
 286                         new_attrvalue += self.entitydefs[av_match.group(1)]
 287                         l = av_match.end(0)
 288                         continue
 289                     ch_match = charref.match(attrvalue, l)
 290                     if ch_match:
 291                         try:
 292                             char = chr(int(ch_match.group(1)))
 293                             new_attrvalue += char
 294                             l = ch_match.end(0)
 295                             continue
 296                         except ValueError:
 297                             # invalid character reference, don't substitute
 298                             pass
 299                     # all other cases
 300                     new_attrvalue += attrvalue[l]
 301                     l += 1
 302                 attrvalue = new_attrvalue
 303             attrs.append((attrname.lower(), attrvalue))
 304             k = match.end(0)
 305         if rawdata[j] == '>':
 306             j = j+1
 307         self.__starttag_text = rawdata[start_pos:j]
 308         self.finish_starttag(tag, attrs)
 309         return j
 310
 311     # Internal -- parse endtag
 312     def parse_endtag(self, i):
 313         rawdata = self.rawdata
 314         match = endbracket.search(rawdata, i+1)
 315         if not match:
 316             return -1
 317         j = match.start(0)
 318         tag = rawdata[i+2:j].strip().lower()
 319         if rawdata[j] == '>':
 320             j = j+1
 321         self.finish_endtag(tag)
 322         return j
 323
 324     # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
 325     def finish_shorttag(self, tag, data):
 326         self.finish_starttag(tag, [])
 327         self.handle_data(data)
 328         self.finish_endtag(tag)
 329
 330     # Internal -- finish processing of start tag
 331     # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
 332     def finish_starttag(self, tag, attrs):
 333         try:
 334             method = getattr(self, 'start_' + tag)
 335         except AttributeError:
 336             try:
 337                 method = getattr(self, 'do_' + tag)
 338             except AttributeError:
 339                 self.unknown_starttag(tag, attrs)
 340                 return -1
 341             else:
 342                 self.handle_starttag(tag, method, attrs)
 343                 return 0
 344         else:
 345             self.stack.append(tag)
 346             self.handle_starttag(tag, method, attrs)
 347             return 1
 348
 349     # Internal -- finish processing of end tag
 350     def finish_endtag(self, tag):
 351         if not tag:
 352             found = len(self.stack) - 1
 353             if found < 0:
 354                 self.unknown_endtag(tag)
 355                 return
 356         else:
 357             if tag not in self.stack:
 358                 try:
 359                     method = getattr(self, 'end_' + tag)
 360                 except AttributeError:
 361                     self.unknown_endtag(tag)
 362                 else:
 363                     self.report_unbalanced(tag)
 364                 return
 365             found = len(self.stack)
 366             for i in range(found):
 367                 if self.stack[i] == tag: found = i
 368         while len(self.stack) > found:
 369             tag = self.stack[-1]
 370             try:
 371                 method = getattr(self, 'end_' + tag)
 372             except AttributeError:
 373                 method = None
 374             if method:
 375                 self.handle_endtag(tag, method)
 376             else:
 377                 self.unknown_endtag(tag)
 378             del self.stack[-1]
 379
 380     # Overridable -- handle start tag
 381     def handle_starttag(self, tag, method, attrs):
 382         method(attrs)
 383
 384     # Overridable -- handle end tag
 385     def handle_endtag(self, tag, method):
 386         method()
 387
 388     # Example -- report an unbalanced </...> tag.
 389     def report_unbalanced(self, tag):
 390         if self.verbose:
 391             print '*** Unbalanced </' + tag + '>'
 392             print '*** Stack:', self.stack
 393
 394     def handle_charref(self, name):
 395         """Handle character reference, no need to override."""
 396         try:
 397             n = int(name)
 398         except ValueError:
 399             self.unknown_charref(name)
 400             return
 401         if not 0 <= n <= 255:
 402             self.unknown_charref(name)
 403             return
 404         self.handle_data(chr(n))
 405
 406     # Definition of entities -- derived classes may override
 407     entitydefs = \
 408             {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
 409
 410     def handle_entityref(self, name):
 411         """Handle entity references.
 412
 413         There should be no need to override this method; it can be
 414         tailored by setting up the self.entitydefs mapping appropriately.
 415         """
 416         table = self.entitydefs
 417         if name in table:
 418             self.handle_data(table[name])
 419         else:
 420             self.unknown_entityref(name)
 421             return
 422
 423     # Example -- handle data, should be overridden
 424     def handle_data(self, data):
 425         pass
 426
 427     # Example -- handle comment, could be overridden
 428     def handle_comment(self, data):
 429         pass
 430
 431     # Example -- handle declaration, could be overridden
 432     def handle_decl(self, decl):
 433         pass
 434
 435     # Example -- handle processing instruction, could be overridden
 436     def handle_pi(self, data):
 437         pass
 438
 439     # To be overridden -- handlers for unknown objects
 440     def unknown_starttag(self, tag, attrs): pass
 441     def unknown_endtag(self, tag): pass
 442     def unknown_charref(self, ref): pass
 443     def unknown_entityref(self, ref): pass
 444
 445
 446 class TestSGMLParser(SGMLParser):
 447
 448     def __init__(self, verbose=0):
 449         self.testdata = ""
 450         SGMLParser.__init__(self, verbose)
 451
 452     def handle_data(self, data):
 453         self.testdata = self.testdata + data
 454         if len(repr(self.testdata)) >= 70:
 455             self.flush()
 456
 457     def flush(self):
 458         data = self.testdata
 459         if data:
 460             self.testdata = ""
 461             print 'data:', repr(data)
 462
 463     def handle_comment(self, data):
 464         self.flush()
 465         r = repr(data)
 466         if len(r) > 68:
 467             r = r[:32] + '...' + r[-32:]
 468         print 'comment:', r
 469
 470     def unknown_starttag(self, tag, attrs):
 471         self.flush()
 472         if not attrs:
 473             print 'start tag: <' + tag + '>'
 474         else:
 475             print 'start tag: <' + tag,
 476             for name, value in attrs:
 477                 print name + '=' + '"' + value + '"',
 478             print '>'
 479
 480     def unknown_endtag(self, tag):
 481         self.flush()
 482         print 'end tag: </' + tag + '>'
 483
 484     def unknown_entityref(self, ref):
 485         self.flush()
 486         print '*** unknown entity ref: &' + ref + ';'
 487
 488     def unknown_charref(self, ref):
 489         self.flush()
 490         print '*** unknown char ref: &#' + ref + ';'
 491
 492     def unknown_decl(self, data):
 493         self.flush()
 494         print '*** unknown decl: [' + data + ']'
 495
 496     def close(self):
 497         SGMLParser.close(self)
 498         self.flush()
 499
 500
 501 def test(args = None):
 502     import sys
 503
 504     if args is None:
 505         args = sys.argv[1:]
 506
 507     if args and args[0] == '-s':
 508         args = args[1:]
 509         klass = SGMLParser
 510     else:
 511         klass = TestSGMLParser
 512
 513     if args:
 514         file = args[0]
 515     else:
 516         file = 'test.html'
 517
 518     if file == '-':
 519         f = sys.stdin
 520     else:
 521         try:
 522             f = open(file, 'r')
 523         except IOError, msg:
 524             print file, ":", msg
 525             sys.exit(1)
 526
 527     data = f.read()
 528     if f is not sys.stdin:
 529         f.close()
 530
 531     x = klass()
 532     for c in data:
 533         x.feed(c)
 534     x.close()
 535
 536
 537 if __name__ == '__main__':
 538     test()