Lib/_markupbase.py

   1 """Shared support for scanning document type declarations in HTML and XHTML.
   2
   3 This module is used as a foundation for the html.parser module.  It has no
   4 documented public API and should not be used directly.
   5
   6 """
   7
   8 import re
   9
# Bound .match of a pattern for a declaration name: a letter followed by
# letters, digits, '-', '_' or '.', plus any trailing whitespace.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
# Bound .match of a pattern for a single- or double-quoted string literal,
# plus any trailing whitespace.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
# Comment terminator: "--", optional whitespace, then ">".
_commentclose = re.compile(r'--\s*>')
# Standard marked-section terminator: "]", "]", ">" with optional
# whitespace between them.
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

# Marked-section terminator used for the "if"/"else"/"endif" keywords:
# a single "]", optional whitespace, then ">".
_msmarkedsectionclose = re.compile(r']\s*>')
  19
  20 del re
  21
  22
  23 class ParserBase:
  24     """Parser base class which provides some common support methods used
  25     by the SGML/HTML and XHTML parsers."""
  26
  27     def __init__(self):
  28         if self.__class__ is ParserBase:
  29             raise RuntimeError(
  30                 "_markupbase.ParserBase must be subclassed")
  31
  32     def error(self, message):
  33         raise NotImplementedError(
  34             "subclasses of ParserBase must override error()")
  35
  36     def reset(self):
  37         self.lineno = 1
  38         self.offset = 0
  39
  40     def getpos(self):
  41         """Return current line number and offset."""
  42         return self.lineno, self.offset
  43
  44     # Internal -- update line number and offset.  This should be
  45     # called for each piece of data exactly once, in order -- in other
  46     # words the concatenation of all the input strings to this
  47     # function should be exactly the entire input.
  48     def updatepos(self, i, j):
  49         if i >= j:
  50             return j
  51         rawdata = self.rawdata
  52         nlines = rawdata.count("\n", i, j)
  53         if nlines:
  54             self.lineno = self.lineno + nlines
  55             pos = rawdata.rindex("\n", i, j) # Should not fail
  56             self.offset = j-(pos+1)
  57         else:
  58             self.offset = self.offset + j-i
  59         return j
  60
    # Extra single characters that parse_declaration skips over inside a
    # declaration body.  Empty by default; parse_declaration also sets it to
    # '' on the instance when the declaration is a DOCTYPE.
    _decl_otherchars = ''
  62
  63     # Internal -- parse declaration (for use by subclasses).
  64     def parse_declaration(self, i):
  65         # This is some sort of declaration; in "HTML as
  66         # deployed," this should only be the document type
  67         # declaration ("<!DOCTYPE html...>").
  68         # ISO 8879:1986, however, has more complex
  69         # declaration syntax for elements in <!...>, including:
  70         # --comment--
  71         # [marked section]
  72         # name in the following list: ENTITY, DOCTYPE, ELEMENT,
  73         # ATTLIST, NOTATION, SHORTREF, USEMAP,
  74         # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
  75         rawdata = self.rawdata
  76         j = i + 2
  77         assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
  78         if rawdata[j:j+1] == ">":
  79             # the empty comment <!>
  80             return j + 1
  81         if rawdata[j:j+1] in ("-", ""):
  82             # Start of comment followed by buffer boundary,
  83             # or just a buffer boundary.
  84             return -1
  85         # A simple, practical version could look like: ((name|stringlit) S*) + '>'
  86         n = len(rawdata)
  87         if rawdata[j:j+2] == '--': #comment
  88             # Locate --.*-- as the body of the comment
  89             return self.parse_comment(i)
  90         elif rawdata[j] == '[': #marked section
  91             # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
  92             # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
  93             # Note that this is extended by Microsoft Office "Save as Web" function
  94             # to include [if...] and [endif].
  95             return self.parse_marked_section(i)
  96         else: #all other declaration elements
  97             decltype, j = self._scan_name(j, i)
  98         if j < 0:
  99             return j
 100         if decltype == "doctype":
 101             self._decl_otherchars = ''
 102         while j < n:
 103             c = rawdata[j]
 104             if c == ">":
 105                 # end of declaration syntax
 106                 data = rawdata[i+2:j]
 107                 if decltype == "doctype":
 108                     self.handle_decl(data)
 109                 else:
 110                     self.unknown_decl(data)
 111                 return j + 1
 112             if c in "\"'":
 113                 m = _declstringlit_match(rawdata, j)
 114                 if not m:
 115                     return -1 # incomplete
 116                 j = m.end()
 117             elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
 118                 name, j = self._scan_name(j, i)
 119             elif c in self._decl_otherchars:
 120                 j = j + 1
 121             elif c == "[":
 122                 # this could be handled in a separate doctype parser
 123                 if decltype == "doctype":
 124                     j = self._parse_doctype_subset(j + 1, i)
 125                 elif decltype in ("attlist", "linktype", "link", "element"):
 126                     # must tolerate []'d groups in a content model in an element declaration
 127                     # also in data attribute specifications of attlist declaration
 128                     # also link type declaration subsets in linktype declarations
 129                     # also link attribute specification lists in link declarations
 130                     self.error("unsupported '[' char in %s declaration" % decltype)
 131                 else:
 132                     self.error("unexpected '[' char in declaration")
 133             else:
 134                 self.error(
 135                     "unexpected %r char in declaration" % rawdata[j])
 136             if j < 0:
 137                 return j
 138         return -1 # incomplete
 139
 140     # Internal -- parse a marked section
 141     # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
 142     def parse_marked_section(self, i, report=1):
 143         rawdata= self.rawdata
 144         assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
 145         sectName, j = self._scan_name( i+3, i )
 146         if j < 0:
 147             return j
 148         if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
 149             # look for standard ]]> ending
 150             match= _markedsectionclose.search(rawdata, i+3)
 151         elif sectName in ("if", "else", "endif"):
 152             # look for MS Office ]> ending
 153             match= _msmarkedsectionclose.search(rawdata, i+3)
 154         else:
 155             self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
 156         if not match:
 157             return -1
 158         if report:
 159             j = match.start(0)
 160             self.unknown_decl(rawdata[i+3: j])
 161         return match.end(0)
 162
    # Internal -- parse comment, return the index just past it, or -1 if not terminated
 164     def parse_comment(self, i, report=1):
 165         rawdata = self.rawdata
 166         if rawdata[i:i+4] != '<!--':
 167             self.error('unexpected call to parse_comment()')
 168         match = _commentclose.search(rawdata, i+4)
 169         if not match:
 170             return -1
 171         if report:
 172             j = match.start(0)
 173             self.handle_comment(rawdata[i+4: j])
 174         return match.end(0)
 175
 176     # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
 177     # returning the index just past any whitespace following the trailing ']'.
 178     def _parse_doctype_subset(self, i, declstartpos):
 179         rawdata = self.rawdata
 180         n = len(rawdata)
 181         j = i
 182         while j < n:
 183             c = rawdata[j]
 184             if c == "<":
 185                 s = rawdata[j:j+2]
 186                 if s == "<":
 187                     # end of buffer; incomplete
 188                     return -1
 189                 if s != "<!":
 190                     self.updatepos(declstartpos, j + 1)
 191                     self.error("unexpected char in internal subset (in %r)" % s)
 192                 if (j + 2) == n:
 193                     # end of buffer; incomplete
 194                     return -1
 195                 if (j + 4) > n:
 196                     # end of buffer; incomplete
 197                     return -1
 198                 if rawdata[j:j+4] == "<!--":
 199                     j = self.parse_comment(j, report=0)
 200                     if j < 0:
 201                         return j
 202                     continue
 203                 name, j = self._scan_name(j + 2, declstartpos)
 204                 if j == -1:
 205                     return -1
 206                 if name not in ("attlist", "element", "entity", "notation"):
 207                     self.updatepos(declstartpos, j + 2)
 208                     self.error(
 209                         "unknown declaration %r in internal subset" % name)
 210                 # handle the individual names
 211                 meth = getattr(self, "_parse_doctype_" + name)
 212                 j = meth(j, declstartpos)
 213                 if j < 0:
 214                     return j
 215             elif c == "%":
 216                 # parameter entity reference
 217                 if (j + 1) == n:
 218                     # end of buffer; incomplete
 219                     return -1
 220                 s, j = self._scan_name(j + 1, declstartpos)
 221                 if j < 0:
 222                     return j
 223                 if rawdata[j] == ";":
 224                     j = j + 1
 225             elif c == "]":
 226                 j = j + 1
 227                 while j < n and rawdata[j].isspace():
 228                     j = j + 1
 229                 if j < n:
 230                     if rawdata[j] == ">":
 231                         return j
 232                     self.updatepos(declstartpos, j)
 233                     self.error("unexpected char after internal subset")
 234                 else:
 235                     return -1
 236             elif c.isspace():
 237                 j = j + 1
 238             else:
 239                 self.updatepos(declstartpos, j)
 240                 self.error("unexpected char %r in internal subset" % c)
 241         # end of buffer reached
 242         return -1
 243
 244     # Internal -- scan past <!ELEMENT declarations
 245     def _parse_doctype_element(self, i, declstartpos):
 246         name, j = self._scan_name(i, declstartpos)
 247         if j == -1:
 248             return -1
 249         # style content model; just skip until '>'
 250         rawdata = self.rawdata
 251         if '>' in rawdata[j:]:
 252             return rawdata.find(">", j) + 1
 253         return -1
 254
 255     # Internal -- scan past <!ATTLIST declarations
 256     def _parse_doctype_attlist(self, i, declstartpos):
 257         rawdata = self.rawdata
 258         name, j = self._scan_name(i, declstartpos)
 259         c = rawdata[j:j+1]
 260         if c == "":
 261             return -1
 262         if c == ">":
 263             return j + 1
 264         while 1:
 265             # scan a series of attribute descriptions; simplified:
 266             #   name type [value] [#constraint]
 267             name, j = self._scan_name(j, declstartpos)
 268             if j < 0:
 269                 return j
 270             c = rawdata[j:j+1]
 271             if c == "":
 272                 return -1
 273             if c == "(":
 274                 # an enumerated type; look for ')'
 275                 if ")" in rawdata[j:]:
 276                     j = rawdata.find(")", j) + 1
 277                 else:
 278                     return -1
 279                 while rawdata[j:j+1].isspace():
 280                     j = j + 1
 281                 if not rawdata[j:]:
 282                     # end of buffer, incomplete
 283                     return -1
 284             else:
 285                 name, j = self._scan_name(j, declstartpos)
 286             c = rawdata[j:j+1]
 287             if not c:
 288                 return -1
 289             if c in "'\"":
 290                 m = _declstringlit_match(rawdata, j)
 291                 if m:
 292                     j = m.end()
 293                 else:
 294                     return -1
 295                 c = rawdata[j:j+1]
 296                 if not c:
 297                     return -1
 298             if c == "#":
 299                 if rawdata[j:] == "#":
 300                     # end of buffer
 301                     return -1
 302                 name, j = self._scan_name(j + 1, declstartpos)
 303                 if j < 0:
 304                     return j
 305                 c = rawdata[j:j+1]
 306                 if not c:
 307                     return -1
 308             if c == '>':
 309                 # all done
 310                 return j + 1
 311
 312     # Internal -- scan past <!NOTATION declarations
 313     def _parse_doctype_notation(self, i, declstartpos):
 314         name, j = self._scan_name(i, declstartpos)
 315         if j < 0:
 316             return j
 317         rawdata = self.rawdata
 318         while 1:
 319             c = rawdata[j:j+1]
 320             if not c:
 321                 # end of buffer; incomplete
 322                 return -1
 323             if c == '>':
 324                 return j + 1
 325             if c in "'\"":
 326                 m = _declstringlit_match(rawdata, j)
 327                 if not m:
 328                     return -1
 329                 j = m.end()
 330             else:
 331                 name, j = self._scan_name(j, declstartpos)
 332                 if j < 0:
 333                     return j
 334
 335     # Internal -- scan past <!ENTITY declarations
 336     def _parse_doctype_entity(self, i, declstartpos):
 337         rawdata = self.rawdata
 338         if rawdata[i:i+1] == "%":
 339             j = i + 1
 340             while 1:
 341                 c = rawdata[j:j+1]
 342                 if not c:
 343                     return -1
 344                 if c.isspace():
 345                     j = j + 1
 346                 else:
 347                     break
 348         else:
 349             j = i
 350         name, j = self._scan_name(j, declstartpos)
 351         if j < 0:
 352             return j
 353         while 1:
 354             c = self.rawdata[j:j+1]
 355             if not c:
 356                 return -1
 357             if c in "'\"":
 358                 m = _declstringlit_match(rawdata, j)
 359                 if m:
 360                     j = m.end()
 361                 else:
 362                     return -1    # incomplete
 363             elif c == ">":
 364                 return j + 1
 365             else:
 366                 name, j = self._scan_name(j, declstartpos)
 367                 if j < 0:
 368                     return j
 369
    # Internal -- scan a name token; return the lowercased token and the
    # new position, or (None, -1) if we've reached the end of the buffer.
 372     def _scan_name(self, i, declstartpos):
 373         rawdata = self.rawdata
 374         n = len(rawdata)
 375         if i == n:
 376             return None, -1
 377         m = _declname_match(rawdata, i)
 378         if m:
 379             s = m.group()
 380             name = s.strip()
 381             if (i + len(s)) == n:
 382                 return None, -1  # end of buffer
 383             return name.lower(), m.end()
 384         else:
 385             self.updatepos(declstartpos, i)
 386             self.error("expected name token at %r"
 387                        % rawdata[declstartpos:declstartpos+20])
 388
 389     # To be overridden -- handlers for unknown objects
 390     def unknown_decl(self, data):
 391         pass