app/htmlsanitizer/BeautifulSoupTests.py

   1 # -*- coding: utf-8 -*-
   2 """Unit tests for Beautiful Soup.
   3
   4 These tests make sure the Beautiful Soup works as it should. If you
   5 find a bug in Beautiful Soup, the best way to express it is as a test
   6 case like this that fails."""
   7
   8 import unittest
   9 from BeautifulSoup import *
  10
  11 class SoupTest(unittest.TestCase):
  12
  13     def assertSoupEquals(self, toParse, rep=None, c=BeautifulSoup,
  14                          encoding=None):
  15         """Parse the given text and make sure its string rep is the other
  16         given text."""
  17         if rep == None:
  18             rep = toParse
  19         obj = c(toParse)
  20         if encoding is None:
  21             rep2 = obj.decode()
  22         else:
  23             rep2 = obj.encode(encoding)
  24         self.assertEqual(rep2, rep)
  25
  26 class FollowThatTag(SoupTest):
  27
  28     "Tests the various ways of fetching tags from a soup."
  29
  30     def setUp(self):
  31         ml = """
  32         <a id="x">1</a>
  33         <A id="a">2</a>
  34         <b id="b">3</a>
  35         <b href="foo" id="x">4</a>
  36         <ac width=100>4</ac>"""
  37         self.soup = BeautifulStoneSoup(ml)
  38
  39     def testFindAllByName(self):
  40         matching = self.soup('a')
  41         self.assertEqual(len(matching), 2)
  42         self.assertEqual(matching[0].name, 'a')
  43         self.assertEqual(matching, self.soup.findAll('a'))
  44         self.assertEqual(matching, self.soup.findAll(SoupStrainer('a')))
  45
  46     def testFindAllByAttribute(self):
  47         matching = self.soup.findAll(id='x')
  48         self.assertEqual(len(matching), 2)
  49         self.assertEqual(matching[0].name, 'a')
  50         self.assertEqual(matching[1].name, 'b')
  51
  52         matching2 = self.soup.findAll(attrs={'id' : 'x'})
  53         self.assertEqual(matching, matching2)
  54
  55         strainer = SoupStrainer(attrs={'id' : 'x'})
  56         self.assertEqual(matching, self.soup.findAll(strainer))
  57
  58         self.assertEqual(len(self.soup.findAll(id=None)), 1)
  59
  60         self.assertEqual(len(self.soup.findAll(width=100)), 1)
  61         self.assertEqual(len(self.soup.findAll(junk=None)), 5)
  62         self.assertEqual(len(self.soup.findAll(junk=[1, None])), 5)
  63
  64         self.assertEqual(len(self.soup.findAll(junk=re.compile('.*'))), 0)
  65         self.assertEqual(len(self.soup.findAll(junk=True)), 0)
  66
  67         self.assertEqual(len(self.soup.findAll(junk=True)), 0)
  68         self.assertEqual(len(self.soup.findAll(href=True)), 1)
  69
  70     def testFindallByClass(self):
  71         soup = BeautifulSoup('<a>Foo</a><a class="1">Bar</a>')
  72         self.assertEqual(soup.find('a', '1').string, "Bar")
  73
  74     def testFindAllByList(self):
  75         matching = self.soup(['a', 'ac'])
  76         self.assertEqual(len(matching), 3)
  77
  78     def testFindAllByHash(self):
  79         matching = self.soup({'a' : True, 'b' : True})
  80         self.assertEqual(len(matching), 4)
  81
  82     def testFindAllText(self):
  83         soup = BeautifulSoup("<html>\xbb</html>")
  84         self.assertEqual(soup.findAll(text=re.compile('.*')),
  85                          [u'\xbb'])
  86
  87     def testFindAllByRE(self):
  88         import re
  89         r = re.compile('a.*')
  90         self.assertEqual(len(self.soup(r)), 3)
  91
  92     def testFindAllByMethod(self):
  93         def matchTagWhereIDMatchesName(tag):
  94             return tag.name == tag.get('id')
  95
  96         matching = self.soup.findAll(matchTagWhereIDMatchesName)
  97         self.assertEqual(len(matching), 2)
  98         self.assertEqual(matching[0].name, 'a')
  99
 100     def testParents(self):
 101         soup = BeautifulSoup('<ul id="foo"></ul><ul id="foo"><ul><ul id="foo" a="b"><b>Blah')
 102         b = soup.b
 103         self.assertEquals(len(b.findParents('ul', {'id' : 'foo'})), 2)
 104         self.assertEquals(b.findParent('ul')['a'], 'b')
 105
 106     PROXIMITY_TEST = BeautifulSoup('<b id="1"><b id="2"><b id="3"><b id="4">')
 107
 108     def testNext(self):
 109         soup = self.PROXIMITY_TEST
 110         b = soup.find('b', {'id' : 2})
 111         self.assertEquals(b.findNext('b')['id'], '3')
 112         self.assertEquals(b.findNext('b')['id'], '3')
 113         self.assertEquals(len(b.findAllNext('b')), 2)
 114         self.assertEquals(len(b.findAllNext('b', {'id' : 4})), 1)
 115
 116     def testPrevious(self):
 117         soup = self.PROXIMITY_TEST
 118         b = soup.find('b', {'id' : 3})
 119         self.assertEquals(b.findPrevious('b')['id'], '2')
 120         self.assertEquals(b.findPrevious('b')['id'], '2')
 121         self.assertEquals(len(b.findAllPrevious('b')), 2)
 122         self.assertEquals(len(b.findAllPrevious('b', {'id' : 2})), 1)
 123
 124
 125     SIBLING_TEST = BeautifulSoup('<blockquote id="1"><blockquote id="1.1"></blockquote></blockquote><blockquote id="2"><blockquote id="2.1"></blockquote></blockquote><blockquote id="3"><blockquote id="3.1"></blockquote></blockquote><blockquote id="4">')
 126
 127     def testNextSibling(self):
 128         soup = self.SIBLING_TEST
 129         tag = 'blockquote'
 130         b = soup.find(tag, {'id' : 2})
 131         self.assertEquals(b.findNext(tag)['id'], '2.1')
 132         self.assertEquals(b.findNextSibling(tag)['id'], '3')
 133         self.assertEquals(b.findNextSibling(tag)['id'], '3')
 134         self.assertEquals(len(b.findNextSiblings(tag)), 2)
 135         self.assertEquals(len(b.findNextSiblings(tag, {'id' : 4})), 1)
 136
 137     def testPreviousSibling(self):
 138         soup = self.SIBLING_TEST
 139         tag = 'blockquote'
 140         b = soup.find(tag, {'id' : 3})
 141         self.assertEquals(b.findPrevious(tag)['id'], '2.1')
 142         self.assertEquals(b.findPreviousSibling(tag)['id'], '2')
 143         self.assertEquals(b.findPreviousSibling(tag)['id'], '2')
 144         self.assertEquals(len(b.findPreviousSiblings(tag)), 2)
 145         self.assertEquals(len(b.findPreviousSiblings(tag, id=1)), 1)
 146
 147     def testTextNavigation(self):
 148         soup = BeautifulSoup('Foo<b>Bar</b><i id="1"><b>Baz<br />Blee<hr id="1"/></b></i>Blargh')
 149         baz = soup.find(text='Baz')
 150         self.assertEquals(baz.findParent("i")['id'], '1')
 151         self.assertEquals(baz.findNext(text='Blee'), 'Blee')
 152         self.assertEquals(baz.findNextSibling(text='Blee'), 'Blee')
 153         self.assertEquals(baz.findNextSibling(text='Blargh'), None)
 154         self.assertEquals(baz.findNextSibling('hr')['id'], '1')
 155
 156 class SiblingRivalry(SoupTest):
 157     "Tests the nextSibling and previousSibling navigation."
 158
 159     def testSiblings(self):
 160         soup = BeautifulSoup("<ul><li>1<p>A</p>B<li>2<li>3</ul>")
 161         secondLI = soup.find('li').nextSibling
 162         self.assert_(secondLI.name == 'li' and secondLI.string == '2')
 163         self.assertEquals(soup.find(text='1').nextSibling.name, 'p')
 164         self.assertEquals(soup.find('p').nextSibling, 'B')
 165         self.assertEquals(soup.find('p').nextSibling.previousSibling.nextSibling, 'B')
 166
 167 class TagsAreObjectsToo(SoupTest):
 168     "Tests the various built-in functions of Tag objects."
 169
 170     def testLen(self):
 171         soup = BeautifulSoup("<top>1<b>2</b>3</top>")
 172         self.assertEquals(len(soup.top), 3)
 173
 174 class StringEmUp(SoupTest):
 175     "Tests the use of 'string' as an alias for a tag's only content."
 176
 177     def testString(self):
 178         s = BeautifulSoup("<b>foo</b>")
 179         self.assertEquals(s.b.string, 'foo')
 180
 181     def testLackOfString(self):
 182         s = BeautifulSoup("<b>f<i>e</i>o</b>")
 183         self.assert_(not s.b.string)
 184
 185 class ThatsMyLimit(SoupTest):
 186     "Tests the limit argument."
 187
 188     def testBasicLimits(self):
 189         s = BeautifulSoup('<br id="1" /><br id="1" /><br id="1" /><br id="1" />')
 190         self.assertEquals(len(s.findAll('br')), 4)
 191         self.assertEquals(len(s.findAll('br', limit=2)), 2)
 192         self.assertEquals(len(s('br', limit=2)), 2)
 193
 194 class OnlyTheLonely(SoupTest):
 195     "Tests the parseOnly argument to the constructor."
 196     def setUp(self):
 197         x = []
 198         for i in range(1,6):
 199             x.append('<a id="%s">' % i)
 200             for j in range(100,103):
 201                 x.append('<b id="%s.%s">Content %s.%s</b>' % (i,j, i,j))
 202             x.append('</a>')
 203         self.x = ''.join(x)
 204
 205     def testOnly(self):
 206         strainer = SoupStrainer("b")
 207         soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
 208         self.assertEquals(len(soup), 15)
 209
 210         strainer = SoupStrainer(id=re.compile("100.*"))
 211         soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
 212         self.assertEquals(len(soup), 5)
 213
 214         strainer = SoupStrainer(text=re.compile("10[01].*"))
 215         soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
 216         self.assertEquals(len(soup), 10)
 217
 218         strainer = SoupStrainer(text=lambda(x):x[8]=='3')
 219         soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
 220         self.assertEquals(len(soup), 3)
 221
 222 class PickleMeThis(SoupTest):
 223     "Testing features like pickle and deepcopy."
 224
 225     def setUp(self):
 226         self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
 227 "http://www.w3.org/TR/REC-html40/transitional.dtd">
 228 <html>
 229 <head>
 230 <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
 231 <title>Beautiful Soup: We called him Tortoise because he taught us.</title>
 232 <link rev="made" href="mailto:leonardr@segfault.org">
 233 <meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
 234 <meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
 235 <meta name="author" content="Leonard Richardson">
 236 </head>
 237 <body>
 238 <a href="foo">foo</a>
 239 <a href="foo"><b>bar</b></a>
 240 </body>
 241 </html>"""
 242
 243         self.soup = BeautifulSoup(self.page)
 244
 245     def testPickle(self):
 246         import pickle
 247         dumped = pickle.dumps(self.soup, 2)
 248         loaded = pickle.loads(dumped)
 249         self.assertEqual(loaded.__class__, BeautifulSoup)
 250         self.assertEqual(loaded.decode(), self.soup.decode())
 251
 252     def testDeepcopy(self):
 253         from copy import deepcopy
 254         deepcopy(BeautifulSoup("<a></a>"))
 255         copied = deepcopy(self.soup)
 256         self.assertEqual(copied.decode(), self.soup.decode())
 257
 258     def testUnicodePickle(self):
 259         import cPickle as pickle
 260         html = "<b>" + chr(0xc3) + "</b>"
 261         soup = BeautifulSoup(html)
 262         dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
 263         loaded = pickle.loads(dumped)
 264         self.assertEqual(loaded.decode(), soup.decode())
 265
 266
 267 class WriteOnlyCode(SoupTest):
 268     "Testing the modification of the tree."
 269
 270     def testModifyAttributes(self):
 271         soup = BeautifulSoup('<a id="1"></a>')
 272         soup.a['id'] = 2
 273         self.assertEqual(soup.decode(), '<a id="2"></a>')
 274         del(soup.a['id'])
 275         self.assertEqual(soup.decode(), '<a></a>')
 276         soup.a['id2'] = 'foo'
 277         self.assertEqual(soup.decode(), '<a id2="foo"></a>')
 278
 279     def testNewTagCreation(self):
 280         "Makes sure tags don't step on each others' toes."
 281         soup = BeautifulSoup()
 282         a = Tag(soup, 'a')
 283         ol = Tag(soup, 'ol')
 284         a['href'] = 'http://foo.com/'
 285         self.assertRaises(KeyError, lambda : ol['href'])
 286
 287     def testTagReplacement(self):
 288         # Make sure you can replace an element with itself.
 289         text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
 290         soup = BeautifulSoup(text)
 291         c = soup.c
 292         soup.c.replaceWith(c)
 293         self.assertEquals(soup.decode(), text)
 294
 295         # A very simple case
 296         soup = BeautifulSoup("<b>Argh!</b>")
 297         soup.find(text="Argh!").replaceWith("Hooray!")
 298         newText = soup.find(text="Hooray!")
 299         b = soup.b
 300         self.assertEqual(newText.previous, b)
 301         self.assertEqual(newText.parent, b)
 302         self.assertEqual(newText.previous.next, newText)
 303         self.assertEqual(newText.next, None)
 304
 305         # A more complex case
 306         soup = BeautifulSoup("<a><b>Argh!</b><c></c><d></d></a>")
 307         soup.b.insert(1, "Hooray!")
 308         newText = soup.find(text="Hooray!")
 309         self.assertEqual(newText.previous, "Argh!")
 310         self.assertEqual(newText.previous.next, newText)
 311
 312         self.assertEqual(newText.previousSibling, "Argh!")
 313         self.assertEqual(newText.previousSibling.nextSibling, newText)
 314
 315         self.assertEqual(newText.nextSibling, None)
 316         self.assertEqual(newText.next, soup.c)
 317
 318         text = "<html>There's <b>no</b> business like <b>show</b> business</html>"
 319         soup = BeautifulSoup(text)
 320         no, show = soup.findAll('b')
 321         show.replaceWith(no)
 322         self.assertEquals(soup.decode(), "<html>There's  business like <b>no</b> business</html>")
 323
 324         # Even more complex
 325         soup = BeautifulSoup("<a><b>Find</b><c>lady!</c><d></d></a>")
 326         tag = Tag(soup, 'magictag')
 327         tag.insert(0, "the")
 328         soup.a.insert(1, tag)
 329
 330         b = soup.b
 331         c = soup.c
 332         theText = tag.find(text=True)
 333         findText = b.find(text="Find")
 334
 335         self.assertEqual(findText.next, tag)
 336         self.assertEqual(tag.previous, findText)
 337         self.assertEqual(b.nextSibling, tag)
 338         self.assertEqual(tag.previousSibling, b)
 339         self.assertEqual(tag.nextSibling, c)
 340         self.assertEqual(c.previousSibling, tag)
 341
 342         self.assertEqual(theText.next, c)
 343         self.assertEqual(c.previous, theText)
 344
 345         # Aand... incredibly complex.
 346         soup = BeautifulSoup("""<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")
 347         f = soup.f
 348         a = soup.a
 349         c = soup.c
 350         e = soup.e
 351         weText = a.find(text="We")
 352         soup.b.replaceWith(soup.f)
 353         self.assertEqual(soup.decode(), "<a>We<f>refuse</f></a><e>to<g>service</g></e>")
 354
 355         self.assertEqual(f.previous, weText)
 356         self.assertEqual(weText.next, f)
 357         self.assertEqual(f.previousSibling, weText)
 358         self.assertEqual(f.nextSibling, None)
 359         self.assertEqual(weText.nextSibling, f)
 360
 361     def testAppend(self):
 362        doc = "<p>Don't leave me <b>here</b>.</p> <p>Don't leave me.</p>"
 363        soup = BeautifulSoup(doc)
 364        second_para = soup('p')[1]
 365        bold = soup.find('b')
 366        soup('p')[1].append(soup.find('b'))
 367        self.assertEqual(bold.parent, second_para)
 368        self.assertEqual(soup.decode(),
 369                         "<p>Don't leave me .</p> "
 370                         "<p>Don't leave me.<b>here</b></p>")
 371
 372     def testTagExtraction(self):
 373         # A very simple case
 374         text = '<html><div id="nav">Nav crap</div>Real content here.</html>'
 375         soup = BeautifulSoup(text)
 376         extracted = soup.find("div", id="nav").extract()
 377         self.assertEqual(soup.decode(), "<html>Real content here.</html>")
 378         self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>')
 379
 380         # A simple case, a more complex test.
 381         text = "<doc><a>1<b>2</b></a><a>i<b>ii</b></a><a>A<b>B</b></a></doc>"
 382         soup = BeautifulStoneSoup(text)
 383         doc = soup.doc
 384         numbers, roman, letters = soup("a")
 385
 386         self.assertEqual(roman.parent, doc)
 387         oldPrevious = roman.previous
 388         endOfThisTag = roman.nextSibling.previous
 389         self.assertEqual(oldPrevious, "2")
 390         self.assertEqual(roman.next, "i")
 391         self.assertEqual(endOfThisTag, "ii")
 392         self.assertEqual(roman.previousSibling, numbers)
 393         self.assertEqual(roman.nextSibling, letters)
 394
 395         roman.extract()
 396         self.assertEqual(roman.parent, None)
 397         self.assertEqual(roman.previous, None)
 398         self.assertEqual(roman.next, "i")
 399         self.assertEqual(letters.previous, '2')
 400         self.assertEqual(roman.previousSibling, None)
 401         self.assertEqual(roman.nextSibling, None)
 402         self.assertEqual(endOfThisTag.next, None)
 403         self.assertEqual(roman.b.contents[0].next, None)
 404         self.assertEqual(numbers.nextSibling, letters)
 405         self.assertEqual(letters.previousSibling, numbers)
 406         self.assertEqual(len(doc.contents), 2)
 407         self.assertEqual(doc.contents[0], numbers)
 408         self.assertEqual(doc.contents[1], letters)
 409
 410         # A more complex case.
 411         text = "<a>1<b>2<c>Hollywood, baby!</c></b></a>3"
 412         soup = BeautifulStoneSoup(text)
 413         one = soup.find(text="1")
 414         three = soup.find(text="3")
 415         toExtract = soup.b
 416         soup.b.extract()
 417         self.assertEqual(one.next, three)
 418         self.assertEqual(three.previous, one)
 419         self.assertEqual(one.parent.nextSibling, three)
 420         self.assertEqual(three.previousSibling, soup.a)
 421
 422 class TheManWithoutAttributes(SoupTest):
 423     "Test attribute access"
 424
 425     def testHasKey(self):
 426         text = "<foo attr='bar'>"
 427         self.assertTrue(BeautifulSoup(text).foo.has_key('attr'))
 428
 429 class QuoteMeOnThat(SoupTest):
 430     "Test quoting"
 431     def testQuotedAttributeValues(self):
 432         self.assertSoupEquals("<foo attr='bar'></foo>",
 433                               '<foo attr="bar"></foo>')
 434
 435         text = """<foo attr='bar "brawls" happen'>a</foo>"""
 436         soup = BeautifulSoup(text)
 437         self.assertEquals(soup.decode(), text)
 438
 439         soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
 440         newText = """<foo attr='Brawls happen at "Bob&squot;s Bar"'>a</foo>"""
 441         self.assertSoupEquals(soup.decode(), newText)
 442
 443         self.assertSoupEquals('<this is="really messed up & stuff">',
 444                               '<this is="really messed up &amp; stuff"></this>')
 445
 446
 447
 448 class YoureSoLiteral(SoupTest):
 449     "Test literal mode."
 450     def testLiteralMode(self):
 451         text = "<script>if (i<imgs.length)</script><b>Foo</b>"
 452         soup = BeautifulSoup(text)
 453         self.assertEqual(soup.script.contents[0], "if (i<imgs.length)")
 454         self.assertEqual(soup.b.contents[0], "Foo")
 455
 456     def testTextArea(self):
 457         text = "<textarea><b>This is an example of an HTML tag</b><&<&</textarea>"
 458         soup = BeautifulSoup(text)
 459         self.assertEqual(soup.textarea.contents[0],
 460                          "<b>This is an example of an HTML tag</b><&<&")
 461
 462 class OperatorOverload(SoupTest):
 463     "Our operators do it all! Call now!"
 464
 465     def testTagNameAsFind(self):
 466         "Tests that referencing a tag name as a member delegates to find()."
 467         soup = BeautifulSoup('<b id="1">foo<i>bar</i></b><b>Red herring</b>')
 468         self.assertEqual(soup.b.i, soup.find('b').find('i'))
 469         self.assertEqual(soup.b.i.string, 'bar')
 470         self.assertEqual(soup.b['id'], '1')
 471         self.assertEqual(soup.b.contents[0], 'foo')
 472         self.assert_(not soup.a)
 473
 474         #Test the .fooTag variant of .foo.
 475         self.assertEqual(soup.bTag.iTag.string, 'bar')
 476         self.assertEqual(soup.b.iTag.string, 'bar')
 477         self.assertEqual(soup.find('b').find('i'), soup.bTag.iTag)
 478
 479 class NestableEgg(SoupTest):
 480     """Here we test tag nesting. TEST THE NEST, DUDE! X-TREME!"""
 481
 482     def testParaInsideBlockquote(self):
 483         soup = BeautifulSoup('<blockquote><p><b>Foo</blockquote><p>Bar')
 484         self.assertEqual(soup.blockquote.p.b.string, 'Foo')
 485         self.assertEqual(soup.blockquote.b.string, 'Foo')
 486         self.assertEqual(soup.find('p', recursive=False).string, 'Bar')
 487
 488     def testNestedTables(self):
 489         text = """<table id="1"><tr><td>Here's another table:
 490         <table id="2"><tr><td>Juicy text</td></tr></table></td></tr></table>"""
 491         soup = BeautifulSoup(text)
 492         self.assertEquals(soup.table.table.td.string, 'Juicy text')
 493         self.assertEquals(len(soup.findAll('table')), 2)
 494         self.assertEquals(len(soup.table.findAll('table')), 1)
 495         self.assertEquals(soup.find('table', {'id' : 2}).parent.parent.parent.name,
 496                           'table')
 497
 498         text = "<table><tr><td><div><table>Foo</table></div></td></tr></table>"
 499         soup = BeautifulSoup(text)
 500         self.assertEquals(soup.table.tr.td.div.table.contents[0], "Foo")
 501
 502         text = """<table><thead><tr>Foo</tr></thead><tbody><tr>Bar</tr></tbody>
 503         <tfoot><tr>Baz</tr></tfoot></table>"""
 504         soup = BeautifulSoup(text)
 505         self.assertEquals(soup.table.thead.tr.contents[0], "Foo")
 506
 507     def testBadNestedTables(self):
 508         soup = BeautifulSoup("<table><tr><table><tr id='nested'>")
 509         self.assertEquals(soup.table.tr.table.tr['id'], 'nested')
 510
 511 class CleanupOnAisleFour(SoupTest):
 512     """Here we test cleanup of text that breaks HTMLParser or is just
 513     obnoxious."""
 514
 515     def testSelfClosingtag(self):
 516         self.assertEqual(BeautifulSoup("Foo<br/>Bar").find('br').decode(),
 517                          '<br />')
 518
 519         self.assertSoupEquals('<p>test1<br/>test2</p>',
 520                               '<p>test1<br />test2</p>')
 521
 522         text = '<p>test1<selfclosing>test2'
 523         soup = BeautifulStoneSoup(text)
 524         self.assertEqual(soup.decode(),
 525                          '<p>test1<selfclosing>test2</selfclosing></p>')
 526
 527         soup = BeautifulStoneSoup(text, selfClosingTags='selfclosing')
 528         self.assertEqual(soup.decode(),
 529                          '<p>test1<selfclosing />test2</p>')
 530
 531     def testSelfClosingTagOrNot(self):
 532         text = "<item><link>http://foo.com/</link></item>"
 533         self.assertEqual(BeautifulStoneSoup(text).decode(), text)
 534         self.assertEqual(BeautifulSoup(text).decode(),
 535                          '<item><link />http://foo.com/</item>')
 536
 537     def testBooleanAttributes(self):
 538         text = "<td nowrap>foo</td>"
 539         self.assertSoupEquals(text, text)
 540
 541     def testCData(self):
 542         xml = "<root>foo<![CDATA[foobar]]>bar</root>"
 543         self.assertSoupEquals(xml, xml)
 544         r = re.compile("foo.*bar")
 545         soup = BeautifulSoup(xml)
 546         self.assertEquals(soup.find(text=r).string, "foobar")
 547         self.assertEquals(soup.find(text=r).__class__, CData)
 548
 549     def testComments(self):
 550         xml = "foo<!--foobar-->baz"
 551         self.assertSoupEquals(xml)
 552         r = re.compile("foo.*bar")
 553         soup = BeautifulSoup(xml)
 554         self.assertEquals(soup.find(text=r).string, "foobar")
 555         self.assertEquals(soup.find(text="foobar").__class__, Comment)
 556
 557     def testDeclaration(self):
 558         xml = "foo<!DOCTYPE foobar>baz"
 559         self.assertSoupEquals(xml)
 560         r = re.compile(".*foo.*bar")
 561         soup = BeautifulSoup(xml)
 562         text = "DOCTYPE foobar"
 563         self.assertEquals(soup.find(text=r).string, text)
 564         self.assertEquals(soup.find(text=text).__class__, Declaration)
 565
 566         namespaced_doctype = ('<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">'
 567                               '<html>foo</html>')
 568         soup = BeautifulSoup(namespaced_doctype)
 569         self.assertEquals(soup.contents[0],
 570                           'DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"')
 571         self.assertEquals(soup.html.contents[0], 'foo')
 572
 573     def testEntityConversions(self):
 574         text = "&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;"
 575         soup = BeautifulStoneSoup(text)
 576         self.assertSoupEquals(text)
 577
 578         xmlEnt = BeautifulStoneSoup.XML_ENTITIES
 579         htmlEnt = BeautifulStoneSoup.HTML_ENTITIES
 580         xhtmlEnt = BeautifulStoneSoup.XHTML_ENTITIES
 581
 582         soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
 583         self.assertEquals(soup.decode(), "<<sacr&eacute; bleu!>>")
 584
 585         soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
 586         self.assertEquals(soup.decode(), "<<sacr&eacute; bleu!>>")
 587
 588         soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
 589         self.assertEquals(soup.decode(), u"<<sacr\xe9 bleu!>>")
 590
 591         # Make sure the "XML", "HTML", and "XHTML" settings work.
 592         text = "&lt;&trade;&apos;"
 593         soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
 594         self.assertEquals(soup.decode(), u"<&trade;'")
 595
 596         soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
 597         self.assertEquals(soup.decode(), u"<\u2122&apos;")
 598
 599         soup = BeautifulStoneSoup(text, convertEntities=xhtmlEnt)
 600         self.assertEquals(soup.decode(), u"<\u2122'")
 601
 602     def testNonBreakingSpaces(self):
 603         soup = BeautifulSoup("<a>&nbsp;&nbsp;</a>",
 604                              convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
 605         self.assertEquals(soup.decode(), u"<a>\xa0\xa0</a>")
 606
 607     def testWhitespaceInDeclaration(self):
 608         self.assertSoupEquals('<! DOCTYPE>', '<!DOCTYPE>')
 609
 610     def testJunkInDeclaration(self):
 611         self.assertSoupEquals('<! Foo = -8>a', '<!Foo = -8>a')
 612
 613     def testIncompleteDeclaration(self):
 614         self.assertSoupEquals('a<!b <p>c')
 615
 616     def testEntityReplacement(self):
 617         self.assertSoupEquals('<b>hello&nbsp;there</b>')
 618
 619     def testEntitiesInAttributeValues(self):
 620         self.assertSoupEquals('<x t="x&#241;">', '<x t="x\xc3\xb1"></x>',
 621                               encoding='utf-8')
 622         self.assertSoupEquals('<x t="x&#xf1;">', '<x t="x\xc3\xb1"></x>',
 623                               encoding='utf-8')
 624
 625         soup = BeautifulSoup('<x t="&gt;&trade;">',
 626                              convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
 627         self.assertEquals(soup.decode(), u'<x t="&gt;\u2122"></x>')
 628
 629         uri = "http://crummy.com?sacr&eacute;&amp;bleu"
 630         link = '<a href="%s"></a>' % uri
 631
 632         soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
 633         self.assertEquals(soup.decode(),
 634                           link.replace("&eacute;", u"\xe9"))
 635
 636         uri = "http://crummy.com?sacr&eacute;&bleu"
 637         link = '<a href="%s"></a>' % uri
 638         soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
 639         self.assertEquals(soup.a['href'],
 640                           uri.replace("&eacute;", u"\xe9"))
 641
 642     def testNakedAmpersands(self):
 643         html = {'convertEntities':BeautifulStoneSoup.HTML_ENTITIES}
 644         soup = BeautifulStoneSoup("AT&T ", **html)
 645         self.assertEquals(soup.decode(), 'AT&amp;T ')
 646
 647         nakedAmpersandInASentence = "AT&T was Ma Bell"
 648         soup = BeautifulStoneSoup(nakedAmpersandInASentence,**html)
 649         self.assertEquals(soup.decode(), \
 650                nakedAmpersandInASentence.replace('&','&amp;'))
 651
 652         invalidURL = '<a href="http://example.org?a=1&b=2;3">foo</a>'
 653         validURL = invalidURL.replace('&','&amp;')
 654         soup = BeautifulStoneSoup(invalidURL)
 655         self.assertEquals(soup.decode(), validURL)
 656
 657         soup = BeautifulStoneSoup(validURL)
 658         self.assertEquals(soup.decode(), validURL)
 659
 660
 661 class EncodeRed(SoupTest):
 662     """Tests encoding conversion, Unicode conversion, and Microsoft
 663     smart quote fixes."""
 664
 665     def testUnicodeDammitStandalone(self):
 666         markup = "<foo>\x92</foo>"
 667         dammit = UnicodeDammit(markup)
 668         self.assertEquals(dammit.unicode, "<foo>&#x2019;</foo>")
 669
 670         hebrew = "\xed\xe5\xec\xf9"
 671         dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
 672         self.assertEquals(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9')
 673         self.assertEquals(dammit.originalEncoding, 'iso-8859-8')
 674
 675     def testGarbageInGarbageOut(self):
 676         ascii = "<foo>a</foo>"
 677         asciiSoup = BeautifulStoneSoup(ascii)
 678         self.assertEquals(ascii, asciiSoup.decode())
 679
 680         unicodeData = u"<foo>\u00FC</foo>"
 681         utf8 = unicodeData.encode("utf-8")
 682         self.assertEquals(utf8, '<foo>\xc3\xbc</foo>')
 683
 684         unicodeSoup = BeautifulStoneSoup(unicodeData)
 685         self.assertEquals(unicodeData, unicodeSoup.decode())
 686         self.assertEquals(unicodeSoup.foo.string, u'\u00FC')
 687
 688         utf8Soup = BeautifulStoneSoup(utf8, fromEncoding='utf-8')
 689         self.assertEquals(utf8, utf8Soup.encode('utf-8'))
 690         self.assertEquals(utf8Soup.originalEncoding, "utf-8")
 691
 692         utf8Soup = BeautifulStoneSoup(unicodeData)
 693         self.assertEquals(utf8, utf8Soup.encode('utf-8'))
 694         self.assertEquals(utf8Soup.originalEncoding, None)
 695
 696
 697     def testHandleInvalidCodec(self):
 698         for bad_encoding in ['.utf8', '...', 'utF---16.!']:
 699             soup = BeautifulSoup(u"Räksmörgås".encode("utf-8"),
 700                                  fromEncoding=bad_encoding)
 701             self.assertEquals(soup.originalEncoding, 'utf-8')
 702
 703     def testUnicodeSearch(self):
 704         html = u'<html><body><h1>Räksmörgås</h1></body></html>'
 705         soup = BeautifulSoup(html)
 706         self.assertEqual(soup.find(text=u'Räksmörgås'),u'Räksmörgås')
 707
 708     def testRewrittenXMLHeader(self):
 709         euc_jp = '<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n'
 710         utf8 = "<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n"
 711         soup = BeautifulStoneSoup(euc_jp)
 712         if soup.originalEncoding != "euc-jp":
 713             raise Exception("Test failed when parsing euc-jp document. "
 714                             "If you're running Python >=2.4, or you have "
 715                             "cjkcodecs installed, this is a real problem. "
 716                             "Otherwise, ignore it.")
 717
 718         self.assertEquals(soup.originalEncoding, "euc-jp")
 719         self.assertEquals(soup.renderContents('utf-8'), utf8)
 720
 721         old_text = "<?xml encoding='windows-1252'><foo>\x92</foo>"
 722         new_text = "<?xml version='1.0' encoding='utf-8'?><foo>&rsquo;</foo>"
 723         self.assertSoupEquals(old_text, new_text)
 724
 725     def testRewrittenMetaTag(self):
 726         no_shift_jis_html = '''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>'''
 727         soup = BeautifulSoup(no_shift_jis_html)
 728
 729         # Beautiful Soup used to try to rewrite the meta tag even if the
 730         # meta tag got filtered out by the strainer. This test makes
 731         # sure that doesn't happen.
 732         strainer = SoupStrainer('pre')
 733         soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese=strainer)
 734         self.assertEquals(soup.contents[0].name, 'pre')
 735
 736         meta_tag = ('<meta content="text/html; charset=x-sjis" '
 737                     'http-equiv="Content-type" />')
 738         shift_jis_html = (
 739             '<html><head>\n%s\n'
 740             '<meta http-equiv="Content-language" content="ja" />'
 741             '</head><body><pre>\n'
 742             '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
 743             '\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
 744             '\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n'
 745             '</pre></body></html>') % meta_tag
 746         soup = BeautifulSoup(shift_jis_html)
 747         if soup.originalEncoding != "shift-jis":
 748             raise Exception("Test failed when parsing shift-jis document "
 749                             "with meta tag '%s'."
 750                             "If you're running Python >=2.4, or you have "
 751                             "cjkcodecs installed, this is a real problem. "
 752                             "Otherwise, ignore it." % meta_tag)
 753         self.assertEquals(soup.originalEncoding, "shift-jis")
 754
 755         content_type_tag = soup.meta['content']
 756         self.assertEquals(content_type_tag[content_type_tag.find('charset='):],
 757                           'charset=%SOUP-ENCODING%')
 758         content_type = str(soup.meta)
 759         index = content_type.find('charset=')
 760         self.assertEqual(content_type[index:index+len('charset=utf8')+1],
 761                          'charset=utf-8')
 762         content_type = soup.meta.encode('shift-jis')
 763         index = content_type.find('charset=')
 764         self.assertEqual(content_type[index:index+len('charset=shift-jis')],
 765                          'charset=shift-jis'.encode())
 766
 767         self.assertEquals(soup.encode('utf-8'), (
 768                 '<html><head>\n'
 769                 '<meta content="text/html; charset=utf-8" '
 770                 'http-equiv="Content-type" />\n'
 771                 '<meta http-equiv="Content-language" content="ja" />'
 772                 '</head><body><pre>\n'
 773                 '\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3'
 774                 '\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3'
 775                 '\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6'
 776                 '\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3'
 777                 '\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n'
 778                 '</pre></body></html>'))
 779         self.assertEquals(soup.encode("shift-jis"),
 780                           shift_jis_html.replace('x-sjis'.encode(),
 781                                                  'shift-jis'.encode()))
 782
 783         isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
 784         soup = BeautifulSoup(isolatin)
 785
 786         utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode())
 787         utf8 = utf8.replace("\xe9", "\xc3\xa9")
 788         self.assertSoupEquals(soup.encode("utf-8"), utf8, encoding='utf-8')
 789
 790     def testHebrew(self):
 791         iso_8859_8= '<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n'
 792         utf8 = '<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n'
 793         soup = BeautifulStoneSoup(iso_8859_8, fromEncoding="iso-8859-8")
 794         self.assertEquals(soup.encode('utf-8'), utf8)
 795
 796     def testSmartQuotesNotSoSmartAnymore(self):
 797         self.assertSoupEquals("\x91Foo\x92 <!--blah-->",
 798                               '&lsquo;Foo&rsquo; <!--blah-->')
 799
 800     def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self):
 801         smartQuotes = "Il a dit, \x8BSacr&eacute; bl&#101;u!\x9b"
 802         soup = BeautifulSoup(smartQuotes)
 803         self.assertEquals(soup.decode(),
 804                           'Il a dit, &lsaquo;Sacr&eacute; bl&#101;u!&rsaquo;')
 805         soup = BeautifulSoup(smartQuotes, convertEntities="html")
 806         self.assertEquals(soup.encode('utf-8'),
 807                           'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba')
 808
 809     def testDontSeeSmartQuotesWhereThereAreNone(self):
 810         utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
 811         self.assertSoupEquals(utf_8, encoding='utf-8')
 812
 813
 814 class Whitewash(SoupTest):
 815     """Test whitespace preservation."""
 816
 817     def testPreservedWhitespace(self):
 818         self.assertSoupEquals("<pre>   </pre>")
 819         self.assertSoupEquals("<pre> woo  </pre>")
 820
 821     def testCollapsedWhitespace(self):
 822         self.assertSoupEquals("<p>   </p>", "<p> </p>")
 823
 824
 825 if __name__ == '__main__':
 826     unittest.main()