Lib/markupbase.py

   1 """Shared support for scanning document type declarations in HTML and XHTML.
   2
   3 This module is used as a foundation for the HTMLParser and sgmllib
   4 modules (indirectly, for htmllib as well).  It has no documented
   5 public API and should not be used directly.
   6
   7 """
   8
import re

# Match a declaration name token (a letter followed by letters, digits,
# '-', '_' or '.') together with any whitespace that follows it.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
# Match one single- or double-quoted string literal together with any
# whitespace that follows it.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
# Locate the "-->" that closes a comment; whitespace is tolerated
# between the "--" and the ">".
_commentclose = re.compile(r'--\s*>')
# Locate the "]]>" that closes a standard marked section; whitespace is
# tolerated between the three characters.
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

# Locate the "]>" that closes an MS Office style marked section such as
# <![if ...]> or <![endif]>.
_msmarkedsectionclose = re.compile(r']\s*>')

# The patterns above are already compiled, so the module name is removed
# from this module's namespace.
del re
  22
  23
  24 class ParserBase:
  25     """Parser base class which provides some common support methods used
  26     by the SGML/HTML and XHTML parsers."""
  27
  28     def __init__(self):
  29         if self.__class__ is ParserBase:
  30             raise RuntimeError(
  31                 "markupbase.ParserBase must be subclassed")
  32
  33     def error(self, message):
  34         raise NotImplementedError(
  35             "subclasses of ParserBase must override error()")
  36
  37     def reset(self):
  38         self.lineno = 1
  39         self.offset = 0
  40
  41     def getpos(self):
  42         """Return current line number and offset."""
  43         return self.lineno, self.offset
  44
  45     # Internal -- update line number and offset.  This should be
  46     # called for each piece of data exactly once, in order -- in other
  47     # words the concatenation of all the input strings to this
  48     # function should be exactly the entire input.
  49     def updatepos(self, i, j):
  50         if i >= j:
  51             return j
  52         rawdata = self.rawdata
  53         nlines = rawdata.count("\n", i, j)
  54         if nlines:
  55             self.lineno = self.lineno + nlines
  56             pos = rawdata.rindex("\n", i, j) # Should not fail
  57             self.offset = j-(pos+1)
  58         else:
  59             self.offset = self.offset + j-i
  60         return j
  61
  62     _decl_otherchars = ''
  63
  64     # Internal -- parse declaration (for use by subclasses).
  65     def parse_declaration(self, i):
  66         # This is some sort of declaration; in "HTML as
  67         # deployed," this should only be the document type
  68         # declaration ("<!DOCTYPE html...>").
  69         # ISO 8879:1986, however, has more complex
  70         # declaration syntax for elements in <!...>, including:
  71         # --comment--
  72         # [marked section]
  73         # name in the following list: ENTITY, DOCTYPE, ELEMENT,
  74         # ATTLIST, NOTATION, SHORTREF, USEMAP,
  75         # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
  76         rawdata = self.rawdata
  77         j = i + 2
  78         assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
  79         if rawdata[j:j+1] in ("-", ""):
  80             # Start of comment followed by buffer boundary,
  81             # or just a buffer boundary.
  82             return -1
  83         # A simple, practical version could look like: ((name|stringlit) S*) + '>'
  84         n = len(rawdata)
  85         if rawdata[j:j+1] == '--': #comment
  86             # Locate --.*-- as the body of the comment
  87             return self.parse_comment(i)
  88         elif rawdata[j] == '[': #marked section
  89             # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
  90             # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
  91             # Note that this is extended by Microsoft Office "Save as Web" function
  92             # to include [if...] and [endif].
  93             return self.parse_marked_section(i)
  94         else: #all other declaration elements
  95             decltype, j = self._scan_name(j, i)
  96         if j < 0:
  97             return j
  98         if decltype == "doctype":
  99             self._decl_otherchars = ''
 100         while j < n:
 101             c = rawdata[j]
 102             if c == ">":
 103                 # end of declaration syntax
 104                 data = rawdata[i+2:j]
 105                 if decltype == "doctype":
 106                     self.handle_decl(data)
 107                 else:
 108                     self.unknown_decl(data)
 109                 return j + 1
 110             if c in "\"'":
 111                 m = _declstringlit_match(rawdata, j)
 112                 if not m:
 113                     return -1 # incomplete
 114                 j = m.end()
 115             elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
 116                 name, j = self._scan_name(j, i)
 117             elif c in self._decl_otherchars:
 118                 j = j + 1
 119             elif c == "[":
 120                 # this could be handled in a separate doctype parser
 121                 if decltype == "doctype":
 122                     j = self._parse_doctype_subset(j + 1, i)
 123                 elif decltype in ("attlist", "linktype", "link", "element"):
 124                     # must tolerate []'d groups in a content model in an element declaration
 125                     # also in data attribute specifications of attlist declaration
 126                     # also link type declaration subsets in linktype declarations
 127                     # also link attribute specification lists in link declarations
 128                     self.error("unsupported '[' char in %s declaration" % decltype)
 129                 else:
 130                     self.error("unexpected '[' char in declaration")
 131             else:
 132                 self.error(
 133                     "unexpected %r char in declaration" % rawdata[j])
 134             if j < 0:
 135                 return j
 136         return -1 # incomplete
 137
 138     # Internal -- parse a marked section
 139     # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
 140     def parse_marked_section( self, i, report=1 ):
 141         rawdata= self.rawdata
 142         assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
 143         sectName, j = self._scan_name( i+3, i )
 144         if j < 0:
 145             return j
 146         if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
 147             # look for standard ]]> ending
 148             match= _markedsectionclose.search(rawdata, i+3)
 149         elif sectName in ("if", "else", "endif"):
 150             # look for MS Office ]> ending
 151             match= _msmarkedsectionclose.search(rawdata, i+3)
 152         else:
 153             self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
 154         if not match:
 155             return -1
 156         if report:
 157             j = match.start(0)
 158             self.unknown_decl(rawdata[i+3: j])
 159         return match.end(0)
 160
 161     # Internal -- parse comment, return length or -1 if not terminated
 162     def parse_comment(self, i, report=1):
 163         rawdata = self.rawdata
 164         if rawdata[i:i+4] != '<!--':
 165             self.error('unexpected call to parse_comment()')
 166         match = _commentclose.search(rawdata, i+4)
 167         if not match:
 168             return -1
 169         if report:
 170             j = match.start(0)
 171             self.handle_comment(rawdata[i+4: j])
 172         return match.end(0)
 173
 174     # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
 175     # returning the index just past any whitespace following the trailing ']'.
 176     def _parse_doctype_subset(self, i, declstartpos):
 177         rawdata = self.rawdata
 178         n = len(rawdata)
 179         j = i
 180         while j < n:
 181             c = rawdata[j]
 182             if c == "<":
 183                 s = rawdata[j:j+2]
 184                 if s == "<":
 185                     # end of buffer; incomplete
 186                     return -1
 187                 if s != "<!":
 188                     self.updatepos(declstartpos, j + 1)
 189                     self.error("unexpected char in internal subset (in %r)" % s)
 190                 if (j + 2) == n:
 191                     # end of buffer; incomplete
 192                     return -1
 193                 if (j + 4) > n:
 194                     # end of buffer; incomplete
 195                     return -1
 196                 if rawdata[j:j+4] == "<!--":
 197                     j = self.parse_comment(j, report=0)
 198                     if j < 0:
 199                         return j
 200                     continue
 201                 name, j = self._scan_name(j + 2, declstartpos)
 202                 if j == -1:
 203                     return -1
 204                 if name not in ("attlist", "element", "entity", "notation"):
 205                     self.updatepos(declstartpos, j + 2)
 206                     self.error(
 207                         "unknown declaration %r in internal subset" % name)
 208                 # handle the individual names
 209                 meth = getattr(self, "_parse_doctype_" + name)
 210                 j = meth(j, declstartpos)
 211                 if j < 0:
 212                     return j
 213             elif c == "%":
 214                 # parameter entity reference
 215                 if (j + 1) == n:
 216                     # end of buffer; incomplete
 217                     return -1
 218                 s, j = self._scan_name(j + 1, declstartpos)
 219                 if j < 0:
 220                     return j
 221                 if rawdata[j] == ";":
 222                     j = j + 1
 223             elif c == "]":
 224                 j = j + 1
 225                 while j < n and rawdata[j].isspace():
 226                     j = j + 1
 227                 if j < n:
 228                     if rawdata[j] == ">":
 229                         return j
 230                     self.updatepos(declstartpos, j)
 231                     self.error("unexpected char after internal subset")
 232                 else:
 233                     return -1
 234             elif c.isspace():
 235                 j = j + 1
 236             else:
 237                 self.updatepos(declstartpos, j)
 238                 self.error("unexpected char %r in internal subset" % c)
 239         # end of buffer reached
 240         return -1
 241
 242     # Internal -- scan past <!ELEMENT declarations
 243     def _parse_doctype_element(self, i, declstartpos):
 244         name, j = self._scan_name(i, declstartpos)
 245         if j == -1:
 246             return -1
 247         # style content model; just skip until '>'
 248         rawdata = self.rawdata
 249         if '>' in rawdata[j:]:
 250             return rawdata.find(">", j) + 1
 251         return -1
 252
 253     # Internal -- scan past <!ATTLIST declarations
 254     def _parse_doctype_attlist(self, i, declstartpos):
 255         rawdata = self.rawdata
 256         name, j = self._scan_name(i, declstartpos)
 257         c = rawdata[j:j+1]
 258         if c == "":
 259             return -1
 260         if c == ">":
 261             return j + 1
 262         while 1:
 263             # scan a series of attribute descriptions; simplified:
 264             #   name type [value] [#constraint]
 265             name, j = self._scan_name(j, declstartpos)
 266             if j < 0:
 267                 return j
 268             c = rawdata[j:j+1]
 269             if c == "":
 270                 return -1
 271             if c == "(":
 272                 # an enumerated type; look for ')'
 273                 if ")" in rawdata[j:]:
 274                     j = rawdata.find(")", j) + 1
 275                 else:
 276                     return -1
 277                 while rawdata[j:j+1].isspace():
 278                     j = j + 1
 279                 if not rawdata[j:]:
 280                     # end of buffer, incomplete
 281                     return -1
 282             else:
 283                 name, j = self._scan_name(j, declstartpos)
 284             c = rawdata[j:j+1]
 285             if not c:
 286                 return -1
 287             if c in "'\"":
 288                 m = _declstringlit_match(rawdata, j)
 289                 if m:
 290                     j = m.end()
 291                 else:
 292                     return -1
 293                 c = rawdata[j:j+1]
 294                 if not c:
 295                     return -1
 296             if c == "#":
 297                 if rawdata[j:] == "#":
 298                     # end of buffer
 299                     return -1
 300                 name, j = self._scan_name(j + 1, declstartpos)
 301                 if j < 0:
 302                     return j
 303                 c = rawdata[j:j+1]
 304                 if not c:
 305                     return -1
 306             if c == '>':
 307                 # all done
 308                 return j + 1
 309
 310     # Internal -- scan past <!NOTATION declarations
 311     def _parse_doctype_notation(self, i, declstartpos):
 312         name, j = self._scan_name(i, declstartpos)
 313         if j < 0:
 314             return j
 315         rawdata = self.rawdata
 316         while 1:
 317             c = rawdata[j:j+1]
 318             if not c:
 319                 # end of buffer; incomplete
 320                 return -1
 321             if c == '>':
 322                 return j + 1
 323             if c in "'\"":
 324                 m = _declstringlit_match(rawdata, j)
 325                 if not m:
 326                     return -1
 327                 j = m.end()
 328             else:
 329                 name, j = self._scan_name(j, declstartpos)
 330                 if j < 0:
 331                     return j
 332
 333     # Internal -- scan past <!ENTITY declarations
 334     def _parse_doctype_entity(self, i, declstartpos):
 335         rawdata = self.rawdata
 336         if rawdata[i:i+1] == "%":
 337             j = i + 1
 338             while 1:
 339                 c = rawdata[j:j+1]
 340                 if not c:
 341                     return -1
 342                 if c.isspace():
 343                     j = j + 1
 344                 else:
 345                     break
 346         else:
 347             j = i
 348         name, j = self._scan_name(j, declstartpos)
 349         if j < 0:
 350             return j
 351         while 1:
 352             c = self.rawdata[j:j+1]
 353             if not c:
 354                 return -1
 355             if c in "'\"":
 356                 m = _declstringlit_match(rawdata, j)
 357                 if m:
 358                     j = m.end()
 359                 else:
 360                     return -1    # incomplete
 361             elif c == ">":
 362                 return j + 1
 363             else:
 364                 name, j = self._scan_name(j, declstartpos)
 365                 if j < 0:
 366                     return j
 367
 368     # Internal -- scan a name token and the new position and the token, or
 369     # return -1 if we've reached the end of the buffer.
 370     def _scan_name(self, i, declstartpos):
 371         rawdata = self.rawdata
 372         n = len(rawdata)
 373         if i == n:
 374             return None, -1
 375         m = _declname_match(rawdata, i)
 376         if m:
 377             s = m.group()
 378             name = s.strip()
 379             if (i + len(s)) == n:
 380                 return None, -1  # end of buffer
 381             return name.lower(), m.end()
 382         else:
 383             self.updatepos(declstartpos, i)
 384             self.error("expected name token at %r"
 385                        % rawdata[declstartpos:declstartpos+20])
 386
 387     # To be overridden -- handlers for unknown objects
 388     def unknown_decl(self, data):
 389         pass