Lib/sgmllib.py

   1 """A parser for SGML, using the derived class as a static DTD."""
   2
   3 # XXX This only supports those SGML features used by HTML.
   4
   5 # XXX There should be a way to distinguish between PCDATA (parsed
   6 # character data -- the normal case), RCDATA (replaceable character
   7 # data -- only char and entity references and end tags are special)
   8 # and CDATA (character data -- only end tags are special).  RCDATA is
   9 # not supported at all.
  10
  11
  12 import markupbase
  13 import re
  14
# Public names exported by "from sgmllib import *".
__all__ = ["SGMLParser", "SGMLParseError"]

# Regular expressions used for parsing

# Next character that can start markup: '&' (reference) or '<' (tag).
# goahead() passes everything before it to handle_data().
interesting = re.compile('[&<]')
# Longest prefix that could still grow into a reference or a tag once
# more input arrives.  goahead() falls back to it when nothing else
# matched: if the match reaches the end of the buffer the text is kept
# for the next feed(), otherwise it is emitted as plain data.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                           '<([a-zA-Z][^<>]*|'
                              '/([a-zA-Z][^<>]*)?|'
                              '![^<>]*)?')

# Entity reference "&name" plus ONE terminating character (anything that
# is not a letter or digit).  Group 1 is the name.  The caller un-consumes
# the terminator unless it is ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Decimal character reference "&#digits" plus ONE terminating non-digit.
# Group 1 is the digit string; the terminator is handled as for entityref.
charref = re.compile('&#([0-9]+)[^0-9]')

# Start of a start tag: '<' followed by a letter, or the "<>" shorthand.
starttagopen = re.compile('<[>a-zA-Z]')
# Start of the SGML short-tag form "<tag/".
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
# Complete short tag "<tag/data/"; group 1 is the tag, group 2 the data
# (which cannot contain '/').
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# Terminator of a processing instruction ("<? ... >").
piclose = re.compile('>')
# First '<' or '>' after a tag opener; marks where the tag text stops.
endbracket = re.compile('[<>]')
# Tag name at the start of a start tag.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute inside a start tag.  Group 1 is the attribute name,
# group 2 the optional "= value" part (None when the attribute has no
# value), group 3 the value itself: single-quoted, double-quoted
# (quotes included in the group) or a bare token.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
  37
  38
  39 class SGMLParseError(RuntimeError):
  40     """Exception raised for all parse errors."""
  41     pass
  42
  43
  44 # SGML parser base class -- find tags and call handler functions.
  45 # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
  46 # The dtd is defined by deriving a class which defines methods
  47 # with special names to handle tags: start_foo and end_foo to handle
  48 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
  49 # (Tags are converted to lower case for this purpose.)  The data
  50 # between tags is passed to the parser by calling self.handle_data()
  51 # with some data as argument (the data may be split up in arbitrary
  52 # chunks).  Entity references are passed by calling
  53 # self.handle_entityref() with the entity reference as argument.
  54
  55 class SGMLParser(markupbase.ParserBase):
  56
  57     def __init__(self, verbose=0):
  58         """Initialize and reset this instance."""
  59         self.verbose = verbose
  60         self.reset()
  61
  62     def reset(self):
  63         """Reset this instance. Loses all unprocessed data."""
  64         self.__starttag_text = None
  65         self.rawdata = ''
  66         self.stack = []
  67         self.lasttag = '???'
  68         self.nomoretags = 0
  69         self.literal = 0
  70         markupbase.ParserBase.reset(self)
  71
  72     def setnomoretags(self):
  73         """Enter literal mode (CDATA) till EOF.
  74
  75         Intended for derived classes only.
  76         """
  77         self.nomoretags = self.literal = 1
  78
  79     def setliteral(self, *args):
  80         """Enter literal mode (CDATA).
  81
  82         Intended for derived classes only.
  83         """
  84         self.literal = 1
  85
  86     def feed(self, data):
  87         """Feed some data to the parser.
  88
  89         Call this as often as you want, with as little or as much text
  90         as you want (may include '\n').  (This just saves the text,
  91         all the processing is done by goahead().)
  92         """
  93
  94         self.rawdata = self.rawdata + data
  95         self.goahead(0)
  96
  97     def close(self):
  98         """Handle the remaining data."""
  99         self.goahead(1)
 100
 101     def error(self, message):
 102         raise SGMLParseError(message)
 103
 104     # Internal -- handle data as far as reasonable.  May leave state
 105     # and data to be processed by a subsequent call.  If 'end' is
 106     # true, force handling all data as if followed by EOF marker.
 107     def goahead(self, end):
 108         rawdata = self.rawdata
 109         i = 0
 110         n = len(rawdata)
 111         while i < n:
 112             if self.nomoretags:
 113                 self.handle_data(rawdata[i:n])
 114                 i = n
 115                 break
 116             match = interesting.search(rawdata, i)
 117             if match: j = match.start()
 118             else: j = n
 119             if i < j:
 120                 self.handle_data(rawdata[i:j])
 121             i = j
 122             if i == n: break
 123             if rawdata[i] == '<':
 124                 if starttagopen.match(rawdata, i):
 125                     if self.literal:
 126                         self.handle_data(rawdata[i])
 127                         i = i+1
 128                         continue
 129                     k = self.parse_starttag(i)
 130                     if k < 0: break
 131                     i = k
 132                     continue
 133                 if rawdata.startswith("</", i):
 134                     k = self.parse_endtag(i)
 135                     if k < 0: break
 136                     i = k
 137                     self.literal = 0
 138                     continue
 139                 if self.literal:
 140                     if n > (i + 1):
 141                         self.handle_data("<")
 142                         i = i+1
 143                     else:
 144                         # incomplete
 145                         break
 146                     continue
 147                 if rawdata.startswith("<!--", i):
 148                         # Strictly speaking, a comment is --.*--
 149                         # within a declaration tag <!...>.
 150                         # This should be removed,
 151                         # and comments handled only in parse_declaration.
 152                     k = self.parse_comment(i)
 153                     if k < 0: break
 154                     i = k
 155                     continue
 156                 if rawdata.startswith("<?", i):
 157                     k = self.parse_pi(i)
 158                     if k < 0: break
 159                     i = i+k
 160                     continue
 161                 if rawdata.startswith("<!", i):
 162                     # This is some sort of declaration; in "HTML as
 163                     # deployed," this should only be the document type
 164                     # declaration ("<!DOCTYPE html...>").
 165                     k = self.parse_declaration(i)
 166                     if k < 0: break
 167                     i = k
 168                     continue
 169             elif rawdata[i] == '&':
 170                 if self.literal:
 171                     self.handle_data(rawdata[i])
 172                     i = i+1
 173                     continue
 174                 match = charref.match(rawdata, i)
 175                 if match:
 176                     name = match.group(1)
 177                     self.handle_charref(name)
 178                     i = match.end(0)
 179                     if rawdata[i-1] != ';': i = i-1
 180                     continue
 181                 match = entityref.match(rawdata, i)
 182                 if match:
 183                     name = match.group(1)
 184                     self.handle_entityref(name)
 185                     i = match.end(0)
 186                     if rawdata[i-1] != ';': i = i-1
 187                     continue
 188             else:
 189                 self.error('neither < nor & ??')
 190             # We get here only if incomplete matches but
 191             # nothing else
 192             match = incomplete.match(rawdata, i)
 193             if not match:
 194                 self.handle_data(rawdata[i])
 195                 i = i+1
 196                 continue
 197             j = match.end(0)
 198             if j == n:
 199                 break # Really incomplete
 200             self.handle_data(rawdata[i:j])
 201             i = j
 202         # end while
 203         if end and i < n:
 204             self.handle_data(rawdata[i:n])
 205             i = n
 206         self.rawdata = rawdata[i:]
 207         # XXX if end: check for empty stack
 208
 209     # Extensions for the DOCTYPE scanner:
 210     _decl_otherchars = '='
 211
 212     # Internal -- parse processing instr, return length or -1 if not terminated
 213     def parse_pi(self, i):
 214         rawdata = self.rawdata
 215         if rawdata[i:i+2] != '<?':
 216             self.error('unexpected call to parse_pi()')
 217         match = piclose.search(rawdata, i+2)
 218         if not match:
 219             return -1
 220         j = match.start(0)
 221         self.handle_pi(rawdata[i+2: j])
 222         j = match.end(0)
 223         return j-i
 224
 225     def get_starttag_text(self):
 226         return self.__starttag_text
 227
 228     # Internal -- handle starttag, return length or -1 if not terminated
 229     def parse_starttag(self, i):
 230         self.__starttag_text = None
 231         start_pos = i
 232         rawdata = self.rawdata
 233         if shorttagopen.match(rawdata, i):
 234             # SGML shorthand: <tag/data/ == <tag>data</tag>
 235             # XXX Can data contain &... (entity or char refs)?
 236             # XXX Can data contain < or > (tag characters)?
 237             # XXX Can there be whitespace before the first /?
 238             match = shorttag.match(rawdata, i)
 239             if not match:
 240                 return -1
 241             tag, data = match.group(1, 2)
 242             self.__starttag_text = '<%s/' % tag
 243             tag = tag.lower()
 244             k = match.end(0)
 245             self.finish_shorttag(tag, data)
 246             self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
 247             return k
 248         # XXX The following should skip matching quotes (' or ")
 249         match = endbracket.search(rawdata, i+1)
 250         if not match:
 251             return -1
 252         j = match.start(0)
 253         # Now parse the data between i+1 and j into a tag and attrs
 254         attrs = []
 255         if rawdata[i:i+2] == '<>':
 256             # SGML shorthand: <> == <last open tag seen>
 257             k = j
 258             tag = self.lasttag
 259         else:
 260             match = tagfind.match(rawdata, i+1)
 261             if not match:
 262                 self.error('unexpected call to parse_starttag')
 263             k = match.end(0)
 264             tag = rawdata[i+1:k].lower()
 265             self.lasttag = tag
 266         while k < j:
 267             match = attrfind.match(rawdata, k)
 268             if not match: break
 269             attrname, rest, attrvalue = match.group(1, 2, 3)
 270             if not rest:
 271                 attrvalue = attrname
 272             elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
 273                  attrvalue[:1] == '"' == attrvalue[-1:]:
 274                 attrvalue = attrvalue[1:-1]
 275             attrs.append((attrname.lower(), attrvalue))
 276             k = match.end(0)
 277         if rawdata[j] == '>':
 278             j = j+1
 279         self.__starttag_text = rawdata[start_pos:j]
 280         self.finish_starttag(tag, attrs)
 281         return j
 282
 283     # Internal -- parse endtag
 284     def parse_endtag(self, i):
 285         rawdata = self.rawdata
 286         match = endbracket.search(rawdata, i+1)
 287         if not match:
 288             return -1
 289         j = match.start(0)
 290         tag = rawdata[i+2:j].strip().lower()
 291         if rawdata[j] == '>':
 292             j = j+1
 293         self.finish_endtag(tag)
 294         return j
 295
 296     # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
 297     def finish_shorttag(self, tag, data):
 298         self.finish_starttag(tag, [])
 299         self.handle_data(data)
 300         self.finish_endtag(tag)
 301
 302     # Internal -- finish processing of start tag
 303     # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
 304     def finish_starttag(self, tag, attrs):
 305         try:
 306             method = getattr(self, 'start_' + tag)
 307         except AttributeError:
 308             try:
 309                 method = getattr(self, 'do_' + tag)
 310             except AttributeError:
 311                 self.unknown_starttag(tag, attrs)
 312                 return -1
 313             else:
 314                 self.handle_starttag(tag, method, attrs)
 315                 return 0
 316         else:
 317             self.stack.append(tag)
 318             self.handle_starttag(tag, method, attrs)
 319             return 1
 320
 321     # Internal -- finish processing of end tag
 322     def finish_endtag(self, tag):
 323         if not tag:
 324             found = len(self.stack) - 1
 325             if found < 0:
 326                 self.unknown_endtag(tag)
 327                 return
 328         else:
 329             if tag not in self.stack:
 330                 try:
 331                     method = getattr(self, 'end_' + tag)
 332                 except AttributeError:
 333                     self.unknown_endtag(tag)
 334                 else:
 335                     self.report_unbalanced(tag)
 336                 return
 337             found = len(self.stack)
 338             for i in range(found):
 339                 if self.stack[i] == tag: found = i
 340         while len(self.stack) > found:
 341             tag = self.stack[-1]
 342             try:
 343                 method = getattr(self, 'end_' + tag)
 344             except AttributeError:
 345                 method = None
 346             if method:
 347                 self.handle_endtag(tag, method)
 348             else:
 349                 self.unknown_endtag(tag)
 350             del self.stack[-1]
 351
 352     # Overridable -- handle start tag
 353     def handle_starttag(self, tag, method, attrs):
 354         method(attrs)
 355
 356     # Overridable -- handle end tag
 357     def handle_endtag(self, tag, method):
 358         method()
 359
 360     # Example -- report an unbalanced </...> tag.
 361     def report_unbalanced(self, tag):
 362         if self.verbose:
 363             print '*** Unbalanced </' + tag + '>'
 364             print '*** Stack:', self.stack
 365
 366     def handle_charref(self, name):
 367         """Handle character reference, no need to override."""
 368         try:
 369             n = int(name)
 370         except ValueError:
 371             self.unknown_charref(name)
 372             return
 373         if not 0 <= n <= 255:
 374             self.unknown_charref(name)
 375             return
 376         self.handle_data(chr(n))
 377
 378     # Definition of entities -- derived classes may override
 379     entitydefs = \
 380             {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
 381
 382     def handle_entityref(self, name):
 383         """Handle entity references.
 384
 385         There should be no need to override this method; it can be
 386         tailored by setting up the self.entitydefs mapping appropriately.
 387         """
 388         table = self.entitydefs
 389         if name in table:
 390             self.handle_data(table[name])
 391         else:
 392             self.unknown_entityref(name)
 393             return
 394
 395     # Example -- handle data, should be overridden
 396     def handle_data(self, data):
 397         pass
 398
 399     # Example -- handle comment, could be overridden
 400     def handle_comment(self, data):
 401         pass
 402
 403     # Example -- handle declaration, could be overridden
 404     def handle_decl(self, decl):
 405         pass
 406
 407     # Example -- handle processing instruction, could be overridden
 408     def handle_pi(self, data):
 409         pass
 410
 411     # To be overridden -- handlers for unknown objects
 412     def unknown_starttag(self, tag, attrs): pass
 413     def unknown_endtag(self, tag): pass
 414     def unknown_charref(self, ref): pass
 415     def unknown_entityref(self, ref): pass
 416
 417
 418 class TestSGMLParser(SGMLParser):
 419
 420     def __init__(self, verbose=0):
 421         self.testdata = ""
 422         SGMLParser.__init__(self, verbose)
 423
 424     def handle_data(self, data):
 425         self.testdata = self.testdata + data
 426         if len(repr(self.testdata)) >= 70:
 427             self.flush()
 428
 429     def flush(self):
 430         data = self.testdata
 431         if data:
 432             self.testdata = ""
 433             print 'data:', repr(data)
 434
 435     def handle_comment(self, data):
 436         self.flush()
 437         r = repr(data)
 438         if len(r) > 68:
 439             r = r[:32] + '...' + r[-32:]
 440         print 'comment:', r
 441
 442     def unknown_starttag(self, tag, attrs):
 443         self.flush()
 444         if not attrs:
 445             print 'start tag: <' + tag + '>'
 446         else:
 447             print 'start tag: <' + tag,
 448             for name, value in attrs:
 449                 print name + '=' + '"' + value + '"',
 450             print '>'
 451
 452     def unknown_endtag(self, tag):
 453         self.flush()
 454         print 'end tag: </' + tag + '>'
 455
 456     def unknown_entityref(self, ref):
 457         self.flush()
 458         print '*** unknown entity ref: &' + ref + ';'
 459
 460     def unknown_charref(self, ref):
 461         self.flush()
 462         print '*** unknown char ref: &#' + ref + ';'
 463
 464     def unknown_decl(self, data):
 465         self.flush()
 466         print '*** unknown decl: [' + data + ']'
 467
 468     def close(self):
 469         SGMLParser.close(self)
 470         self.flush()
 471
 472
 473 def test(args = None):
 474     import sys
 475
 476     if args is None:
 477         args = sys.argv[1:]
 478
 479     if args and args[0] == '-s':
 480         args = args[1:]
 481         klass = SGMLParser
 482     else:
 483         klass = TestSGMLParser
 484
 485     if args:
 486         file = args[0]
 487     else:
 488         file = 'test.html'
 489
 490     if file == '-':
 491         f = sys.stdin
 492     else:
 493         try:
 494             f = open(file, 'r')
 495         except IOError, msg:
 496             print file, ":", msg
 497             sys.exit(1)
 498
 499     data = f.read()
 500     if f is not sys.stdin:
 501         f.close()
 502
 503     x = klass()
 504     for c in data:
 505         x.feed(c)
 506     x.close()
 507
 508
# Run the command-line test driver when this file is executed as a
# script; test() then takes its arguments from sys.argv.
if __name__ == '__main__':
    test()