Lib/test/test_htmlparser.py

   1 """Tests for HTMLParser.py."""
   2
   3 import HTMLParser
   4 import pprint
   5 import unittest
   6 from test import test_support
   7
   8
   9 class EventCollector(HTMLParser.HTMLParser):
  10
  11     def __init__(self):
  12         self.events = []
  13         self.append = self.events.append
  14         HTMLParser.HTMLParser.__init__(self)
  15
  16     def get_events(self):
  17         # Normalize the list of events so that buffer artefacts don't
  18         # separate runs of contiguous characters.
  19         L = []
  20         prevtype = None
  21         for event in self.events:
  22             type = event[0]
  23             if type == prevtype == "data":
  24                 L[-1] = ("data", L[-1][1] + event[1])
  25             else:
  26                 L.append(event)
  27             prevtype = type
  28         self.events = L
  29         return L
  30
  31     # structure markup
  32
  33     def handle_starttag(self, tag, attrs):
  34         self.append(("starttag", tag, attrs))
  35
  36     def handle_startendtag(self, tag, attrs):
  37         self.append(("startendtag", tag, attrs))
  38
  39     def handle_endtag(self, tag):
  40         self.append(("endtag", tag))
  41
  42     # all other markup
  43
  44     def handle_comment(self, data):
  45         self.append(("comment", data))
  46
  47     def handle_charref(self, data):
  48         self.append(("charref", data))
  49
  50     def handle_data(self, data):
  51         self.append(("data", data))
  52
  53     def handle_decl(self, data):
  54         self.append(("decl", data))
  55
  56     def handle_entityref(self, data):
  57         self.append(("entityref", data))
  58
  59     def handle_pi(self, data):
  60         self.append(("pi", data))
  61
  62     def unknown_decl(self, decl):
  63         self.append(("unknown decl", decl))
  64
  65
  66 class EventCollectorExtra(EventCollector):
  67
  68     def handle_starttag(self, tag, attrs):
  69         EventCollector.handle_starttag(self, tag, attrs)
  70         self.append(("starttag_text", self.get_starttag_text()))
  71
  72
  73 class TestCaseBase(unittest.TestCase):
  74
  75     def _run_check(self, source, expected_events, collector=EventCollector):
  76         parser = collector()
  77         for s in source:
  78             parser.feed(s)
  79         parser.close()
  80         events = parser.get_events()
  81         if events != expected_events:
  82             self.fail("received events did not match expected events\n"
  83                       "Expected:\n" + pprint.pformat(expected_events) +
  84                       "\nReceived:\n" + pprint.pformat(events))
  85
  86     def _run_check_extra(self, source, events):
  87         self._run_check(source, events, EventCollectorExtra)
  88
  89     def _parse_error(self, source):
  90         def parse(source=source):
  91             parser = HTMLParser.HTMLParser()
  92             parser.feed(source)
  93             parser.close()
  94         self.assertRaises(HTMLParser.HTMLParseError, parse)
  95
  96
class HTMLParserTestCase(TestCaseBase):
    """Event-stream tests for HTMLParser.

    Most tests hand an input and the exact list of expected events to
    _run_check(); inputs that must be rejected go through _parse_error().
    """

    def test_processing_instruction_only(self):
        # "<?...>" is reported as one "pi" event; a "?" right before the
        # closing ">" stays part of the payload.
        self._run_check("<?processing instruction>", [
            ("pi", "processing instruction"),
            ])
        self._run_check("<?processing instruction ?>", [
            ("pi", "processing instruction ?"),
            ])

    def test_simple_html(self):
        # A small document mixing a declaration, tags, references and
        # comments.  Tag and attribute names are expected lower-cased,
        # attribute values keep their case, markup inside a comment is
        # returned verbatim, and "<!>" contributes no event.
        self._run_check("""
<!DOCTYPE html PUBLIC 'foo'>
<HTML>&entity;&#32;
<!--comment1a
-></foo><bar>&lt;<?pi?></foo<bar
comment1b-->
<Img sRc='Bar' isMAP>sample
text
&#x201C;
<!--comment2a-- --comment2b--><!>
</Html>
""", [
    ("data", "\n"),
    ("decl", "DOCTYPE html PUBLIC 'foo'"),
    ("data", "\n"),
    ("starttag", "html", []),
    ("entityref", "entity"),
    ("charref", "32"),
    ("data", "\n"),
    ("comment", "comment1a\n-></foo><bar>&lt;<?pi?></foo<bar\ncomment1b"),
    ("data", "\n"),
    ("starttag", "img", [("src", "Bar"), ("ismap", None)]),
    ("data", "sample\ntext\n"),
    ("charref", "x201C"),
    ("data", "\n"),
    ("comment", "comment2a-- --comment2b"),
    ("data", "\n"),
    ("endtag", "html"),
    ("data", "\n"),
    ])

    def test_unclosed_entityref(self):
        # An entity reference without the terminating ";" is still
        # reported as an "entityref" event.
        self._run_check("&entityref foo", [
            ("entityref", "entityref"),
            ("data", " foo"),
            ])

    def test_doctype_decl(self):
        # A DOCTYPE with an internal subset must come back as one "decl"
        # event whose payload is everything between "<!" and the final ">".
        inside = """\
DOCTYPE html [
  <!ELEMENT html - O EMPTY>
  <!ATTLIST html
      version CDATA #IMPLIED
      profile CDATA 'DublinCore'>
  <!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
  <!ENTITY myEntity 'internal parsed entity'>
  <!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
  <!ENTITY % paramEntity 'name|name|name'>
  %paramEntity;
  <!-- comment -->
]"""
        self._run_check("<!%s>" % inside, [
            ("decl", inside),
            ])

    def test_bad_nesting(self):
        # Strangely, this *is* supposed to test that overlapping
        # elements are allowed.  HTMLParser is more geared toward
        # lexing the input than parsing the structure.
        self._run_check("<a><b></a></b>", [
            ("starttag", "a", []),
            ("starttag", "b", []),
            ("endtag", "a"),
            ("endtag", "b"),
            ])

    def test_bare_ampersands(self):
        # "&" not followed by a reference name is plain character data.
        self._run_check("this text & contains & ampersands &", [
            ("data", "this text & contains & ampersands &"),
            ])

    def test_bare_pointy_brackets(self):
        # "<" followed by whitespace and a lone ">" are plain character data.
        self._run_check("this < text > contains < bare>pointy< brackets", [
            ("data", "this < text > contains < bare>pointy< brackets"),
            ])

    def test_attr_syntax(self):
        # Single-quoted, double-quoted, unquoted and valueless attributes,
        # with spaces, newlines or tabs around "=", all yield the same list.
        output = [
          ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
          ]
        self._run_check("""<a b='v' c="v" d=v e>""", output)
        self._run_check("""<a  b = 'v' c = "v" d = v e>""", output)
        self._run_check("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
        self._run_check("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)

    def test_attr_values(self):
        # Whitespace inside quoted values is preserved as written.
        self._run_check("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
                        [("starttag", "a", [("b", "xxx\n\txxx"),
                                            ("c", "yyy\t\nyyy"),
                                            ("d", "\txyz\n")])
                         ])
        # Empty quoted values are reported as "" (not None).
        self._run_check("""<a b='' c="">""", [
            ("starttag", "a", [("b", ""), ("c", "")]),
            ])
        # Regression test for SF patch #669683.
        self._run_check("<e a=rgb(1,2,3)>", [
            ("starttag", "e", [("a", "rgb(1,2,3)")]),
            ])
        # Regression test for SF bug #921657.
        self._run_check("<a href=mailto:xyz@example.com>", [
            ("starttag", "a", [("href", "mailto:xyz@example.com")]),
            ])

    def test_attr_entity_replacement(self):
        # Entity references inside an attribute value are replaced by the
        # characters they stand for.
        self._run_check("""<a b='&amp;&gt;&lt;&quot;&apos;'>""", [
            ("starttag", "a", [("b", "&><\"'")]),
            ])

    def test_attr_funky_names(self):
        # Attribute names may contain ".", ":" and "-".
        self._run_check("""<a a.b='v' c:d=v e-f=v>""", [
            ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
            ])

    def test_illegal_declarations(self):
        # This "<!spacer ...>" declaration must be rejected.
        self._parse_error('<!spacer type="block" height="25">')

    def test_starttag_end_boundary(self):
        # "<" or ">" inside a quoted attribute value must not end the tag.
        self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
        self._run_check("""<a b='>'>""", [("starttag", "a", [("b", ">")])])

    def test_buffer_artefacts(self):
        # Each list item is passed to feed() separately, so these inputs
        # split one construct at different positions; the events must not
        # depend on where the split falls.
        output = [("starttag", "a", [("b", "<")])]
        self._run_check(["<a b='<'>"], output)
        self._run_check(["<a ", "b='<'>"], output)
        self._run_check(["<a b", "='<'>"], output)
        self._run_check(["<a b=", "'<'>"], output)
        self._run_check(["<a b='<", "'>"], output)
        self._run_check(["<a b='<'", ">"], output)

        output = [("starttag", "a", [("b", ">")])]
        self._run_check(["<a b='>'>"], output)
        self._run_check(["<a ", "b='>'>"], output)
        self._run_check(["<a b", "='>'>"], output)
        self._run_check(["<a b=", "'>'>"], output)
        self._run_check(["<a b='>", "'>"], output)
        self._run_check(["<a b='>'", ">"], output)

        output = [("comment", "abc")]
        self._run_check(["", "<!--abc-->"], output)
        self._run_check(["<", "!--abc-->"], output)
        self._run_check(["<!", "--abc-->"], output)
        self._run_check(["<!-", "-abc-->"], output)
        self._run_check(["<!--", "abc-->"], output)
        self._run_check(["<!--a", "bc-->"], output)
        self._run_check(["<!--ab", "c-->"], output)
        self._run_check(["<!--abc", "-->"], output)
        self._run_check(["<!--abc-", "->"], output)
        self._run_check(["<!--abc--", ">"], output)
        self._run_check(["<!--abc-->", ""], output)

    def test_starttag_junk_chars(self):
        # Malformed or truncated start/end tags must be rejected.
        self._parse_error("</>")
        self._parse_error("</$>")
        self._parse_error("</")
        self._parse_error("</a")
        self._parse_error("<a<a>")
        self._parse_error("</a<a>")
        self._parse_error("<!")
        self._parse_error("<a $>")
        self._parse_error("<a")
        self._parse_error("<a foo='bar'")
        self._parse_error("<a foo='bar")
        self._parse_error("<a foo='>'")
        self._parse_error("<a foo='>")
        self._parse_error("<a foo=>")

    def test_declaration_junk_chars(self):
        # A stray "$" inside a DOCTYPE declaration must be rejected.
        self._parse_error("<!DOCTYPE foo $ >")

    def test_startendtag(self):
        # "<x/>" is reported as a single "startendtag" event, while an
        # explicit "<x></x>" pair gives separate start and end events.
        self._run_check("<p/>", [
            ("startendtag", "p", []),
            ])
        self._run_check("<p></p>", [
            ("starttag", "p", []),
            ("endtag", "p"),
            ])
        self._run_check("<p><img src='foo' /></p>", [
            ("starttag", "p", []),
            ("startendtag", "img", [("src", "foo")]),
            ("endtag", "p"),
            ])

    def test_get_starttag_text(self):
        # get_starttag_text() must return the tag exactly as written,
        # including its internal whitespace.
        s = """<foo:bar   \n   one="1"\ttwo=2   >"""
        self._run_check_extra(s, [
            ("starttag", "foo:bar", [("one", "1"), ("two", "2")]),
            ("starttag_text", s)])

    def test_cdata_content(self):
        # Between <script> and </script>, text that looks like a comment,
        # an entity reference or a start tag is reported as plain data.
        s = """<script> <!-- not a comment --> &not-an-entity-ref; </script>"""
        self._run_check(s, [
            ("starttag", "script", []),
            ("data", " <!-- not a comment --> &not-an-entity-ref; "),
            ("endtag", "script"),
            ])
        s = """<script> <not a='start tag'> </script>"""
        self._run_check(s, [
            ("starttag", "script", []),
            ("data", " <not a='start tag'> "),
            ("endtag", "script"),
            ])

    def test_entityrefs_in_attributes(self):
        # Known entity references and numeric character references in an
        # attribute value are replaced; an unknown entity is left verbatim.
        self._run_check("<html foo='&euro;&amp;&#97;&#x61;&unsupported;'>", [
                ("starttag", "html", [("foo", u"\u20AC&aa&unsupported;")])
                ])
 315
 316
 317 def test_main():
 318     test_support.run_unittest(HTMLParserTestCase)
 319
 320
 321 if __name__ == "__main__":
 322     test_main()