Lib/HTMLParser.py

   1 """A parser for HTML and XHTML."""
   2
   3 # This file is based on sgmllib.py, but the API is slightly different.
   4
   5 # XXX There should be a way to distinguish between PCDATA (parsed
   6 # character data -- the normal case), RCDATA (replaceable character
   7 # data -- only char and entity references and end tags are special)
   8 # and CDATA (character data -- only end tags are special).
   9
  10
  11 import markupbase
  12 import re
  13
  14 # Regular expressions used for parsing
  15
  16 interesting_normal = re.compile('[&<]')
  17 interesting_cdata = re.compile(r'<(/|\Z)')
  18 incomplete = re.compile('&[a-zA-Z#]')
  19
  20 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
  21 charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
  22
  23 starttagopen = re.compile('<[a-zA-Z]')
  24 piclose = re.compile('>')
  25 commentclose = re.compile(r'--\s*>')
  26 tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
  27 attrfind = re.compile(
  28     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
  29     r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
  30
  31 locatestarttagend = re.compile(r"""
  32   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  33   (?:\s+                             # whitespace before attribute name
  34     (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
  35       (?:\s*=\s*                     # value indicator
  36         (?:'[^']*'                   # LITA-enclosed value
  37           |\"[^\"]*\"                # LIT-enclosed value
  38           |[^'\">\s]+                # bare value
  39          )
  40        )?
  41      )
  42    )*
  43   \s*                                # trailing whitespace
  44 """, re.VERBOSE)
  45 endendtag = re.compile('>')
  46 endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
  47
  48
  49 class HTMLParseError(Exception):
  50     """Exception raised for all parse errors."""
  51
  52     def __init__(self, msg, position=(None, None)):
  53         assert msg
  54         self.msg = msg
  55         self.lineno = position[0]
  56         self.offset = position[1]
  57
  58     def __str__(self):
  59         result = self.msg
  60         if self.lineno is not None:
  61             result = result + ", at line %d" % self.lineno
  62         if self.offset is not None:
  63             result = result + ", column %d" % (self.offset + 1)
  64         return result
  65
  66
  67 class HTMLParser(markupbase.ParserBase):
  68     """Find tags and other markup and call handler functions.
  69
  70     Usage:
  71         p = HTMLParser()
  72         p.feed(data)
  73         ...
  74         p.close()
  75
  76     Start tags are handled by calling self.handle_starttag() or
  77     self.handle_startendtag(); end tags by self.handle_endtag().  The
  78     data between tags is passed from the parser to the derived class
  79     by calling self.handle_data() with the data as argument (the data
  80     may be split up in arbitrary chunks).  Entity references are
  81     passed by calling self.handle_entityref() with the entity
  82     reference as the argument.  Numeric character references are
  83     passed to self.handle_charref() with the string containing the
  84     reference as the argument.
  85     """
  86
  87     CDATA_CONTENT_ELEMENTS = ("script", "style")
  88
  89
    def __init__(self):
        """Initialize and reset this instance.

        All parser state is set up by reset(); nothing else is done here.
        """
        self.reset()
  93
    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        # Text fed to the parser that has not been consumed yet.
        self.rawdata = ''
        # Name of the most recent start tag; '???' until one has been parsed.
        self.lasttag = '???'
        # Pattern goahead() uses to find the next piece of markup; start in
        # normal (non-CDATA) mode.
        self.interesting = interesting_normal
        markupbase.ParserBase.reset(self)
 100
 101     def feed(self, data):
 102         """Feed data to the parser.
 103
 104         Call this as often as you want, with as little or as much text
 105         as you want (may include '\n').
 106         """
 107         self.rawdata = self.rawdata + data
 108         self.goahead(0)
 109
    def close(self):
        """Handle any buffered data.

        Runs goahead() with a true 'end' flag, i.e. the buffered text is
        processed as if it were followed by an end-of-file marker.
        """
        self.goahead(1)
 113
    def error(self, message):
        """Raise HTMLParseError(message) tagged with the current getpos()."""
        raise HTMLParseError(message, self.getpos())
 116
    # Source text of the most recently parsed start tag.  Class-level default
    # so get_starttag_text() works before any tag has been seen;
    # parse_starttag() sets the per-instance value.
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'.

        Returns None if no complete start tag has been parsed yet.
        """
        return self.__starttag_text
 122
    def set_cdata_mode(self):
        """Switch scanning to CDATA mode: only '</' interrupts the data."""
        self.interesting = interesting_cdata
 125
    def clear_cdata_mode(self):
        """Switch scanning back to normal mode: '&' and '<' are special."""
        self.interesting = interesting_normal
 128
 129     # Internal -- handle data as far as reasonable.  May leave state
 130     # and data to be processed by a subsequent call.  If 'end' is
 131     # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Parse as much of self.rawdata as possible.

        Repeatedly finds the next match of self.interesting, reports the
        text before it through handle_data(), and dispatches on what was
        found ('<...' markup, '&#...' character reference, '&...' entity
        reference).  Stops early when a construct is incomplete; the
        unconsumed tail is stored back in self.rawdata.

        If 'end' is true, an incomplete construct is reported through
        self.error() and any text still left after the loop is flushed
        as data.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i) # < or &
            if match:
                j = match.start()
            else:
                # nothing special left: everything up to the end is data
                j = n
            if i < j: self.handle_data(rawdata[i:j])
            # updatepos() is inherited; its return value is used as the
            # new scan index
            i = self.updatepos(i, j)
            if i == n: break
            startswith = rawdata.startswith
            if startswith('<', i):
                # Each parse_* call returns the index just past the
                # construct, or a negative value if it is incomplete.
                if starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    # parse_comment is not defined in this class
                    # (presumably inherited from markupbase.ParserBase)
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    # parse_declaration is not defined in this class
                    # (presumably inherited from markupbase.ParserBase)
                    k = self.parse_declaration(i)
                elif (i + 1) < n:
                    # '<' followed by something that starts no markup:
                    # report it as literal text
                    self.handle_data("<")
                    k = i + 1
                else:
                    # '<' is the last buffered character; wait for more
                    break
                if k < 0:
                    if end:
                        self.error("EOF in middle of construct")
                    break
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    # strip the leading '&#' and the one trailing character
                    # the pattern consumed
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # the trailing character is only part of the reference
                    # if it is ';'; otherwise back up so it is rescanned
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # same terminator handling as for character references
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    # lone '&' at the very end of the buffer; wait for more
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            # no more input will come: flush the remainder as plain data
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        # keep whatever was not consumed for the next call
        self.rawdata = rawdata[i:]
 210
 211     # Internal -- parse processing instr, return end or -1 if not terminated
 212     def parse_pi(self, i):
 213         rawdata = self.rawdata
 214         assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
 215         match = piclose.search(rawdata, i+2) # >
 216         if not match:
 217             return -1
 218         j = match.start()
 219         self.handle_pi(rawdata[i+2: j])
 220         j = match.end()
 221         return j
 222
 223     # Internal -- handle starttag, return end or -1 if not terminated
 224     def parse_starttag(self, i):
 225         self.__starttag_text = None
 226         endpos = self.check_for_whole_start_tag(i)
 227         if endpos < 0:
 228             return endpos
 229         rawdata = self.rawdata
 230         self.__starttag_text = rawdata[i:endpos]
 231
 232         # Now parse the data between i+1 and j into a tag and attrs
 233         attrs = []
 234         match = tagfind.match(rawdata, i+1)
 235         assert match, 'unexpected call to parse_starttag()'
 236         k = match.end()
 237         self.lasttag = tag = rawdata[i+1:k].lower()
 238
 239         while k < endpos:
 240             m = attrfind.match(rawdata, k)
 241             if not m:
 242                 break
 243             attrname, rest, attrvalue = m.group(1, 2, 3)
 244             if not rest:
 245                 attrvalue = None
 246             elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
 247                  attrvalue[:1] == '"' == attrvalue[-1:]:
 248                 attrvalue = attrvalue[1:-1]
 249                 attrvalue = self.unescape(attrvalue)
 250             attrs.append((attrname.lower(), attrvalue))
 251             k = m.end()
 252
 253         end = rawdata[k:endpos].strip()
 254         if end not in (">", "/>"):
 255             lineno, offset = self.getpos()
 256             if "\n" in self.__starttag_text:
 257                 lineno = lineno + self.__starttag_text.count("\n")
 258                 offset = len(self.__starttag_text) \
 259                          - self.__starttag_text.rfind("\n")
 260             else:
 261                 offset = offset + len(self.__starttag_text)
 262             self.error("junk characters in start tag: %r"
 263                        % (rawdata[k:endpos][:20],))
 264         if end.endswith('/>'):
 265             # XHTML-style empty tag: <span attr="value" />
 266             self.handle_startendtag(tag, attrs)
 267         else:
 268             self.handle_starttag(tag, attrs)
 269             if tag in self.CDATA_CONTENT_ELEMENTS:
 270                 self.set_cdata_mode()
 271         return endpos
 272
 273     # Internal -- check to see if we have a complete starttag; return end
 274     # or -1 if incomplete.
 275     def check_for_whole_start_tag(self, i):
 276         rawdata = self.rawdata
 277         m = locatestarttagend.match(rawdata, i)
 278         if m:
 279             j = m.end()
 280             next = rawdata[j:j+1]
 281             if next == ">":
 282                 return j + 1
 283             if next == "/":
 284                 if rawdata.startswith("/>", j):
 285                     return j + 2
 286                 if rawdata.startswith("/", j):
 287                     # buffer boundary
 288                     return -1
 289                 # else bogus input
 290                 self.updatepos(i, j + 1)
 291                 self.error("malformed empty start tag")
 292             if next == "":
 293                 # end of input
 294                 return -1
 295             if next in ("abcdefghijklmnopqrstuvwxyz=/"
 296                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
 297                 # end of input in or before attribute value, or we have the
 298                 # '/' from a '/>' ending
 299                 return -1
 300             self.updatepos(i, j)
 301             self.error("malformed start tag")
 302         raise AssertionError("we should not get here!")
 303
 304     # Internal -- parse endtag, return end or -1 if incomplete
 305     def parse_endtag(self, i):
 306         rawdata = self.rawdata
 307         assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
 308         match = endendtag.search(rawdata, i+1) # >
 309         if not match:
 310             return -1
 311         j = match.end()
 312         match = endtagfind.match(rawdata, i) # </ + tag + >
 313         if not match:
 314             self.error("bad end tag: %r" % (rawdata[i:j],))
 315         tag = match.group(1)
 316         self.handle_endtag(tag.lower())
 317         self.clear_cdata_mode()
 318         return j
 319
 320     # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Handle an XHTML-style empty tag such as '<br />'.

        The default treats it as a start tag immediately followed by the
        matching end tag.  'tag' and 'attrs' are as for handle_starttag().
        """
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)
 324
 325     # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Called for each start tag; does nothing by default.

        'tag' is the lowercased tag name.  'attrs' is a list of
        (name, value) pairs with lowercased names; value is None for an
        attribute written without '=value'.
        """
        pass
 328
 329     # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Called for each end tag with the lowercased tag name.

        Does nothing by default.
        """
        pass
 332
 333     # Overridable -- handle character reference
    def handle_charref(self, name):
        """Called for each numeric character reference in the data.

        'name' is the text after '&#', e.g. '65' or 'x41'.  Does nothing
        by default.
        """
        pass
 336
 337     # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Called for each entity reference in the data.

        'name' is the entity name without '&' and ';', e.g. 'amp'.  Does
        nothing by default.
        """
        pass
 340
 341     # Overridable -- handle data
    def handle_data(self, data):
        """Called with a run of character data; does nothing by default.

        One stretch of text may arrive split over several calls.
        """
        pass
 344
 345     # Overridable -- handle comment
    def handle_comment(self, data):
        """Hook for comments; does nothing by default.

        Not called from this class directly -- presumably invoked by the
        inherited parse_comment() with the comment text (verify against
        markupbase.ParserBase).
        """
        pass
 348
 349     # Overridable -- handle declaration
    def handle_decl(self, decl):
        """Hook for declarations; does nothing by default.

        Not called from this class directly -- presumably invoked by the
        inherited parse_declaration() with the declaration text (verify
        against markupbase.ParserBase).
        """
        pass
 352
 353     # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Called for each processing instruction; does nothing by default.

        'data' is the text between '<?' and the first following '>'.
        """
        pass
 356
    def unknown_decl(self, data):
        """Report an unrecognised declaration through self.error().

        Not called from this class directly -- presumably invoked by the
        inherited declaration parser (verify against
        markupbase.ParserBase).
        """
        self.error("unknown declaration: %r" % (data,))
 359
 360     # Internal -- helper to remove special character quoting
 361     entitydefs = None
 362     def unescape(self, s):
 363         if '&' not in s:
 364             return s
 365         def replaceEntities(s):
 366             s = s.groups()[0]
 367             if s[0] == "#":
 368                 s = s[1:]
 369                 if s[0] in ['x','X']:
 370                     c = int(s[1:], 16)
 371                 else:
 372                     c = int(s)
 373                 return unichr(c)
 374             else:
 375                 # Cannot use name2codepoint directly, because HTMLParser supports apos,
 376                 # which is not part of HTML 4
 377                 import htmlentitydefs
 378                 if HTMLParser.entitydefs is None:
 379                     entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
 380                     for k, v in htmlentitydefs.name2codepoint.iteritems():
 381                         entitydefs[k] = unichr(v)
 382                 try:
 383                     return self.entitydefs[s]
 384                 except KeyError:
 385                     return '&'+s+';'
 386
 387         return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)