Add BeautifulSoup Python HTML/XML parser to Melange repository.
[Melange.git] / app / htmlsanitizer / BeautifulSoupTests.py
blob416d978eea1f90d8d4f60b8fcda57648f0bf2a04
1 # -*- coding: utf-8 -*-
2 """Unit tests for Beautiful Soup.
These tests make sure that Beautiful Soup works as it should. If you
find a bug in Beautiful Soup, the best way to report it is with a test
case like these that fails."""
8 import unittest
9 from BeautifulSoup import *
class SoupTest(unittest.TestCase):
    """Base class for the test cases in this file.

    Adds assertSoupEquals(), a parse-then-render assertion helper.
    """

    def assertSoupEquals(self, toParse, rep=None, c=BeautifulSoup,
                         encoding=None):
        """Parse the given text and make sure its string rep is the other
        given text.

        toParse  -- markup handed to the parser.
        rep      -- expected rendering; defaults to toParse itself.
        c        -- callable used to build the tree from toParse.
        encoding -- when given, the tree is rendered with
                    obj.encode(encoding); otherwise with obj.decode().
        """
        # Identity check: None is a singleton, and "==" could be hijacked by
        # a custom __eq__ on whatever the caller passes as rep.
        if rep is None:
            rep = toParse
        obj = c(toParse)
        if encoding is None:
            rep2 = obj.decode()
        else:
            rep2 = obj.encode(encoding)
        self.assertEqual(rep2, rep)
class FollowThatTag(SoupTest):
    """Tests the various ways of fetching tags from a soup."""

    def setUp(self):
        # Deliberately sloppy markup: mismatched closing tags, an upper-case
        # tag name and an unquoted attribute value.
        ml = """
<a id="x">1</a>
<A id="a">2</a>
<b id="b">3</a>
<b href="foo" id="x">4</a>
<ac width=100>4</ac>"""
        self.soup = BeautifulStoneSoup(ml)

    def testFindAllByName(self):
        # Calling the soup, findAll(name) and findAll(SoupStrainer(name))
        # must all return the same result.
        matching = self.soup('a')
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')
        self.assertEqual(matching, self.soup.findAll('a'))
        self.assertEqual(matching, self.soup.findAll(SoupStrainer('a')))

    def testFindAllByAttribute(self):
        matching = self.soup.findAll(id='x')
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')
        self.assertEqual(matching[1].name, 'b')

        # The keyword form, the attrs dict and a SoupStrainer are equivalent.
        matching2 = self.soup.findAll(attrs={'id' : 'x'})
        self.assertEqual(matching, matching2)

        strainer = SoupStrainer(attrs={'id' : 'x'})
        self.assertEqual(matching, self.soup.findAll(strainer))

        self.assertEqual(len(self.soup.findAll(id=None)), 1)

        self.assertEqual(len(self.soup.findAll(width=100)), 1)
        self.assertEqual(len(self.soup.findAll(junk=None)), 5)
        self.assertEqual(len(self.soup.findAll(junk=[1, None])), 5)

        self.assertEqual(len(self.soup.findAll(junk=re.compile('.*'))), 0)
        self.assertEqual(len(self.soup.findAll(junk=True)), 0)

        self.assertEqual(len(self.soup.findAll(href=True)), 1)

    def testFindallByClass(self):
        soup = BeautifulSoup('<a>Foo</a><a class="1">Bar</a>')
        self.assertEqual(soup.find('a', '1').string, "Bar")

    def testFindAllByList(self):
        matching = self.soup(['a', 'ac'])
        self.assertEqual(len(matching), 3)

    def testFindAllByHash(self):
        matching = self.soup({'a' : True, 'b' : True})
        self.assertEqual(len(matching), 4)

    def testFindAllText(self):
        soup = BeautifulSoup("<html>\xbb</html>")
        self.assertEqual(soup.findAll(text=re.compile('.*')),
                         [u'\xbb'])

    def testFindAllByRE(self):
        import re
        r = re.compile('a.*')
        self.assertEqual(len(self.soup(r)), 3)

    def testFindAllByMethod(self):
        def matchTagWhereIDMatchesName(tag):
            return tag.name == tag.get('id')

        matching = self.soup.findAll(matchTagWhereIDMatchesName)
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')

    def testParents(self):
        soup = BeautifulSoup('<ul id="foo"></ul><ul id="foo"><ul><ul id="foo" a="b"><b>Blah')
        b = soup.b
        self.assertEqual(len(b.findParents('ul', {'id' : 'foo'})), 2)
        self.assertEqual(b.findParent('ul')['a'], 'b')

    # Parsed once when the class body is executed and shared by testNext
    # and testPrevious.
    PROXIMITY_TEST = BeautifulSoup('<b id="1"><b id="2"><b id="3"><b id="4">')

    def testNext(self):
        soup = self.PROXIMITY_TEST
        b = soup.find('b', {'id' : 2})
        self.assertEqual(b.findNext('b')['id'], '3')
        self.assertEqual(len(b.findAllNext('b')), 2)
        self.assertEqual(len(b.findAllNext('b', {'id' : 4})), 1)

    def testPrevious(self):
        soup = self.PROXIMITY_TEST
        b = soup.find('b', {'id' : 3})
        self.assertEqual(b.findPrevious('b')['id'], '2')
        self.assertEqual(len(b.findAllPrevious('b')), 2)
        self.assertEqual(len(b.findAllPrevious('b', {'id' : 2})), 1)

    # Parsed once when the class body is executed and shared by the two
    # sibling tests below.
    SIBLING_TEST = BeautifulSoup('<blockquote id="1"><blockquote id="1.1"></blockquote></blockquote><blockquote id="2"><blockquote id="2.1"></blockquote></blockquote><blockquote id="3"><blockquote id="3.1"></blockquote></blockquote><blockquote id="4">')

    def testNextSibling(self):
        soup = self.SIBLING_TEST
        tag = 'blockquote'
        b = soup.find(tag, {'id' : 2})
        # findNext descends into children; findNextSibling stays on one level.
        self.assertEqual(b.findNext(tag)['id'], '2.1')
        self.assertEqual(b.findNextSibling(tag)['id'], '3')
        self.assertEqual(len(b.findNextSiblings(tag)), 2)
        self.assertEqual(len(b.findNextSiblings(tag, {'id' : 4})), 1)

    def testPreviousSibling(self):
        soup = self.SIBLING_TEST
        tag = 'blockquote'
        b = soup.find(tag, {'id' : 3})
        self.assertEqual(b.findPrevious(tag)['id'], '2.1')
        self.assertEqual(b.findPreviousSibling(tag)['id'], '2')
        self.assertEqual(len(b.findPreviousSiblings(tag)), 2)
        self.assertEqual(len(b.findPreviousSiblings(tag, id=1)), 1)

    def testTextNavigation(self):
        soup = BeautifulSoup('Foo<b>Bar</b><i id="1"><b>Baz<br />Blee<hr id="1"/></b></i>Blargh')
        baz = soup.find(text='Baz')
        self.assertEqual(baz.findParent("i")['id'], '1')
        self.assertEqual(baz.findNext(text='Blee'), 'Blee')
        self.assertEqual(baz.findNextSibling(text='Blee'), 'Blee')
        # 'Blargh' is outside the enclosing <b>, so it is not a sibling.
        self.assertEqual(baz.findNextSibling(text='Blargh'), None)
        self.assertEqual(baz.findNextSibling('hr')['id'], '1')
class SiblingRivalry(SoupTest):
    """Tests the nextSibling and previousSibling navigation."""

    def testSiblings(self):
        soup = BeautifulSoup("<ul><li>1<p>A</p>B<li>2<li>3</ul>")
        secondLI = soup.find('li').nextSibling
        # Two separate assertions so a failure reports which half was wrong.
        self.assertEqual(secondLI.name, 'li')
        self.assertEqual(secondLI.string, '2')
        self.assertEqual(soup.find(text='1').nextSibling.name, 'p')
        self.assertEqual(soup.find('p').nextSibling, 'B')
        # Stepping forward, back, and forward again lands on the same node.
        self.assertEqual(soup.find('p').nextSibling.previousSibling.nextSibling, 'B')
class TagsAreObjectsToo(SoupTest):
    """Tests the various built-in functions of Tag objects."""

    def testLen(self):
        # <top> holds three children: text "1", the <b> tag and text "3".
        soup = BeautifulSoup("<top>1<b>2</b>3</top>")
        self.assertEqual(len(soup.top), 3)
class StringEmUp(SoupTest):
    """Tests the use of 'string' as an alias for a tag's only content."""

    def testString(self):
        s = BeautifulSoup("<b>foo</b>")
        self.assertEqual(s.b.string, 'foo')

    def testLackOfString(self):
        # <b> has more than one child here, so .string must be falsy.
        s = BeautifulSoup("<b>f<i>e</i>o</b>")
        self.assertFalse(s.b.string)
class ThatsMyLimit(SoupTest):
    """Tests the limit argument."""

    def testBasicLimits(self):
        s = BeautifulSoup('<br id="1" /><br id="1" /><br id="1" /><br id="1" />')
        self.assertEqual(len(s.findAll('br')), 4)
        # limit caps the result for both findAll() and the call shortcut.
        self.assertEqual(len(s.findAll('br', limit=2)), 2)
        self.assertEqual(len(s('br', limit=2)), 2)
class OnlyTheLonely(SoupTest):
    """Tests the parseOnly argument to the constructor."""

    def setUp(self):
        # Build five <a id="i"> tags, each wrapping three
        # <b id="i.j">Content i.j</b> tags for j in 100..102.
        x = []
        for i in range(1, 6):
            x.append('<a id="%s">' % i)
            for j in range(100, 103):
                x.append('<b id="%s.%s">Content %s.%s</b>' % (i, j, i, j))
            x.append('</a>')
        self.x = ''.join(x)

    def testOnly(self):
        # Strain by tag name: all fifteen <b> tags survive.
        strainer = SoupStrainer("b")
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEqual(len(soup), 15)

        # Strain by attribute regexp.
        strainer = SoupStrainer(id=re.compile("100.*"))
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEqual(len(soup), 5)

        # Strain by text regexp.
        strainer = SoupStrainer(text=re.compile("10[01].*"))
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEqual(len(soup), 10)

        # Strain by text predicate. Index 8 of "Content i.j" is the digit i.
        # A plain "lambda x" replaces the tuple-parameter form "lambda(x)",
        # which is a syntax error on Python 3.
        strainer = SoupStrainer(text=lambda x: x[8] == '3')
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEqual(len(soup), 3)
class PickleMeThis(SoupTest):
    """Testing features like pickle and deepcopy."""

    def setUp(self):
        # A small but complete HTML page, kept alongside its parsed form.
        self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/transitional.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
<link rev="made" href="mailto:leonardr@segfault.org">
<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
<meta name="author" content="Leonard Richardson">
</head>
<body>
<a href="foo">foo</a>
<a href="foo"><b>bar</b></a>
</body>
</html>"""

        self.soup = BeautifulSoup(self.page)

    def testPickle(self):
        # Round-trip through pickle protocol 2; class and rendering must match.
        import pickle
        restored = pickle.loads(pickle.dumps(self.soup, 2))
        self.assertEqual(restored.__class__, BeautifulSoup)
        self.assertEqual(restored.decode(), self.soup.decode())

    def testDeepcopy(self):
        from copy import deepcopy
        # Copying a trivial soup only has to succeed; the result is discarded.
        deepcopy(BeautifulSoup("<a></a>"))
        clone = deepcopy(self.soup)
        self.assertEqual(clone.decode(), self.soup.decode())

    def testUnicodePickle(self):
        # Same round trip with cPickle, on markup holding a non-ASCII char.
        import cPickle as pickle
        markup = "<b>" + chr(0xc3) + "</b>"
        original = BeautifulSoup(markup)
        serialized = pickle.dumps(original, pickle.HIGHEST_PROTOCOL)
        restored = pickle.loads(serialized)
        self.assertEqual(restored.decode(), original.decode())
class WriteOnlyCode(SoupTest):
    """Testing the modification of the tree."""

    def testModifyAttributes(self):
        soup = BeautifulSoup('<a id="1"></a>')
        # An integer value is rendered like the equivalent string.
        soup.a['id'] = 2
        self.assertEqual(soup.decode(), '<a id="2"></a>')
        del(soup.a['id'])
        self.assertEqual(soup.decode(), '<a></a>')
        soup.a['id2'] = 'foo'
        self.assertEqual(soup.decode(), '<a id2="foo"></a>')

    def testNewTagCreation(self):
        "Makes sure tags don't step on each others' toes."
        soup = BeautifulSoup()
        a = Tag(soup, 'a')
        ol = Tag(soup, 'ol')
        a['href'] = 'http://foo.com/'
        # Setting an attribute on one new tag must not leak into another.
        self.assertRaises(KeyError, lambda : ol['href'])

    def testTagReplacement(self):
        # Make sure you can replace an element with itself.
        text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
        soup = BeautifulSoup(text)
        c = soup.c
        soup.c.replaceWith(c)
        self.assertEqual(soup.decode(), text)

        # A very simple case
        soup = BeautifulSoup("<b>Argh!</b>")
        soup.find(text="Argh!").replaceWith("Hooray!")
        newText = soup.find(text="Hooray!")
        b = soup.b
        self.assertEqual(newText.previous, b)
        self.assertEqual(newText.parent, b)
        self.assertEqual(newText.previous.next, newText)
        self.assertEqual(newText.next, None)

        # A more complex case
        soup = BeautifulSoup("<a><b>Argh!</b><c></c><d></d></a>")
        soup.b.insert(1, "Hooray!")
        newText = soup.find(text="Hooray!")
        self.assertEqual(newText.previous, "Argh!")
        self.assertEqual(newText.previous.next, newText)

        self.assertEqual(newText.previousSibling, "Argh!")
        self.assertEqual(newText.previousSibling.nextSibling, newText)

        self.assertEqual(newText.nextSibling, None)
        self.assertEqual(newText.next, soup.c)

        # Replacing one tag with another that is already in the tree
        # moves the replacement.
        text = "<html>There's <b>no</b> business like <b>show</b> business</html>"
        soup = BeautifulSoup(text)
        no, show = soup.findAll('b')
        show.replaceWith(no)
        self.assertEqual(soup.decode(), "<html>There's business like <b>no</b> business</html>")

        # Even more complex
        soup = BeautifulSoup("<a><b>Find</b><c>lady!</c><d></d></a>")
        tag = Tag(soup, 'magictag')
        tag.insert(0, "the")
        soup.a.insert(1, tag)

        b = soup.b
        c = soup.c
        theText = tag.find(text=True)
        findText = b.find(text="Find")

        self.assertEqual(findText.next, tag)
        self.assertEqual(tag.previous, findText)
        self.assertEqual(b.nextSibling, tag)
        self.assertEqual(tag.previousSibling, b)
        self.assertEqual(tag.nextSibling, c)
        self.assertEqual(c.previousSibling, tag)

        self.assertEqual(theText.next, c)
        self.assertEqual(c.previous, theText)

        # Aand... incredibly complex.
        soup = BeautifulSoup("""<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")
        f = soup.f
        a = soup.a
        weText = a.find(text="We")
        soup.b.replaceWith(soup.f)
        self.assertEqual(soup.decode(), "<a>We<f>refuse</f></a><e>to<g>service</g></e>")

        self.assertEqual(f.previous, weText)
        self.assertEqual(weText.next, f)
        self.assertEqual(f.previousSibling, weText)
        self.assertEqual(f.nextSibling, None)
        self.assertEqual(weText.nextSibling, f)

    def testAppend(self):
        doc = "<p>Don't leave me <b>here</b>.</p> <p>Don't leave me.</p>"
        soup = BeautifulSoup(doc)
        second_para = soup('p')[1]
        bold = soup.find('b')
        # Appending a tag that already lives in the tree moves it.
        second_para.append(bold)
        self.assertEqual(bold.parent, second_para)
        self.assertEqual(soup.decode(),
                         "<p>Don't leave me .</p> "
                         "<p>Don't leave me.<b>here</b></p>")

    def testTagExtraction(self):
        # A very simple case
        text = '<html><div id="nav">Nav crap</div>Real content here.</html>'
        soup = BeautifulSoup(text)
        extracted = soup.find("div", id="nav").extract()
        self.assertEqual(soup.decode(), "<html>Real content here.</html>")
        self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>')

        # A simple case, a more complex test.
        text = "<doc><a>1<b>2</b></a><a>i<b>ii</b></a><a>A<b>B</b></a></doc>"
        soup = BeautifulStoneSoup(text)
        doc = soup.doc
        numbers, roman, letters = soup("a")

        # Links around the middle <a> before it is extracted.
        self.assertEqual(roman.parent, doc)
        oldPrevious = roman.previous
        endOfThisTag = roman.nextSibling.previous
        self.assertEqual(oldPrevious, "2")
        self.assertEqual(roman.next, "i")
        self.assertEqual(endOfThisTag, "ii")
        self.assertEqual(roman.previousSibling, numbers)
        self.assertEqual(roman.nextSibling, letters)

        # After extraction the subtree is detached and its old neighbours
        # are linked to each other.
        roman.extract()
        self.assertEqual(roman.parent, None)
        self.assertEqual(roman.previous, None)
        self.assertEqual(roman.next, "i")
        self.assertEqual(letters.previous, '2')
        self.assertEqual(roman.previousSibling, None)
        self.assertEqual(roman.nextSibling, None)
        self.assertEqual(endOfThisTag.next, None)
        self.assertEqual(roman.b.contents[0].next, None)
        self.assertEqual(numbers.nextSibling, letters)
        self.assertEqual(letters.previousSibling, numbers)
        self.assertEqual(len(doc.contents), 2)
        self.assertEqual(doc.contents[0], numbers)
        self.assertEqual(doc.contents[1], letters)

        # A more complex case.
        text = "<a>1<b>2<c>Hollywood, baby!</c></b></a>3"
        soup = BeautifulStoneSoup(text)
        one = soup.find(text="1")
        three = soup.find(text="3")
        soup.b.extract()
        self.assertEqual(one.next, three)
        self.assertEqual(three.previous, one)
        self.assertEqual(one.parent.nextSibling, three)
        self.assertEqual(three.previousSibling, soup.a)
class TheManWithoutAttributes(SoupTest):
    """Test attribute access"""

    def testHasKey(self):
        # has_key() must report an attribute that is present in the markup.
        foo_tag = BeautifulSoup("<foo attr='bar'>").foo
        self.assertTrue(foo_tag.has_key('attr'))
class QuoteMeOnThat(SoupTest):
    """Test quoting"""

    def testQuotedAttributeValues(self):
        # Single-quoted attribute values are re-rendered with double quotes.
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

        # A value containing double quotes keeps its single-quote delimiters.
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = BeautifulSoup(text)
        self.assertEqual(soup.decode(), text)

        # A value containing both quote styles: the expected rendering
        # replaces the embedded single quote with "&squot;".
        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        newText = """<foo attr='Brawls happen at "Bob&squot;s Bar"'>a</foo>"""
        self.assertSoupEquals(soup.decode(), newText)

        # A bare ampersand inside an attribute value is escaped on output.
        self.assertSoupEquals('<this is="really messed up & stuff">',
                              '<this is="really messed up &amp; stuff"></this>')
class YoureSoLiteral(SoupTest):
    """Test literal mode."""

    def testLiteralMode(self):
        # The "<" inside <script> must not be treated as the start of a tag.
        markup = "<script>if (i<imgs.length)</script><b>Foo</b>"
        parsed = BeautifulSoup(markup)
        script_body = parsed.script.contents[0]
        self.assertEqual(script_body, "if (i<imgs.length)")
        bold_body = parsed.b.contents[0]
        self.assertEqual(bold_body, "Foo")

    def testTextArea(self):
        # Everything inside <textarea> is kept as a single text node.
        markup = "<textarea><b>This is an example of an HTML tag</b><&<&</textarea>"
        parsed = BeautifulSoup(markup)
        first_child = parsed.textarea.contents[0]
        self.assertEqual(first_child,
                         "<b>This is an example of an HTML tag</b><&<&")
class OperatorOverload(SoupTest):
    """Our operators do it all! Call now!"""

    def testTagNameAsFind(self):
        "Tests that referencing a tag name as a member delegates to find()."
        soup = BeautifulSoup('<b id="1">foo<i>bar</i></b><b>Red herring</b>')
        self.assertEqual(soup.b.i, soup.find('b').find('i'))
        self.assertEqual(soup.b.i.string, 'bar')
        self.assertEqual(soup.b['id'], '1')
        self.assertEqual(soup.b.contents[0], 'foo')
        # There is no <a> tag in the markup, so the lookup must be falsy.
        self.assertFalse(soup.a)

        #Test the .fooTag variant of .foo.
        self.assertEqual(soup.bTag.iTag.string, 'bar')
        self.assertEqual(soup.b.iTag.string, 'bar')
        self.assertEqual(soup.find('b').find('i'), soup.bTag.iTag)
class NestableEgg(SoupTest):
    """Here we test tag nesting. TEST THE NEST, DUDE! X-TREME!"""

    def testParaInsideBlockquote(self):
        soup = BeautifulSoup('<blockquote><p><b>Foo</blockquote><p>Bar')
        self.assertEqual(soup.blockquote.p.b.string, 'Foo')
        self.assertEqual(soup.blockquote.b.string, 'Foo')
        # The second <p> must end up at the top level, not inside the
        # blockquote.
        self.assertEqual(soup.find('p', recursive=False).string, 'Bar')

    def testNestedTables(self):
        text = """<table id="1"><tr><td>Here's another table:
<table id="2"><tr><td>Juicy text</td></tr></table></td></tr></table>"""
        soup = BeautifulSoup(text)
        self.assertEqual(soup.table.table.td.string, 'Juicy text')
        self.assertEqual(len(soup.findAll('table')), 2)
        self.assertEqual(len(soup.table.findAll('table')), 1)
        # table#2 -> td -> tr -> table#1
        self.assertEqual(soup.find('table', {'id' : 2}).parent.parent.parent.name,
                         'table')

        text = "<table><tr><td><div><table>Foo</table></div></td></tr></table>"
        soup = BeautifulSoup(text)
        self.assertEqual(soup.table.tr.td.div.table.contents[0], "Foo")

        text = """<table><thead><tr>Foo</tr></thead><tbody><tr>Bar</tr></tbody>
<tfoot><tr>Baz</tr></tfoot></table>"""
        soup = BeautifulSoup(text)
        self.assertEqual(soup.table.thead.tr.contents[0], "Foo")

    def testBadNestedTables(self):
        soup = BeautifulSoup("<table><tr><table><tr id='nested'>")
        self.assertEqual(soup.table.tr.table.tr['id'], 'nested')
class CleanupOnAisleFour(SoupTest):
    """Here we test cleanup of text that breaks HTMLParser or is just
    obnoxious."""

    def testSelfClosingtag(self):
        self.assertEqual(BeautifulSoup("Foo<br/>Bar").find('br').decode(),
                         '<br />')

        self.assertSoupEquals('<p>test1<br/>test2</p>',
                              '<p>test1<br />test2</p>')

        # Without configuration the unknown tag is rendered as a container.
        text = '<p>test1<selfclosing>test2'
        soup = BeautifulStoneSoup(text)
        self.assertEqual(soup.decode(),
                         '<p>test1<selfclosing>test2</selfclosing></p>')

        # With selfClosingTags the same tag is rendered as self-closing.
        soup = BeautifulStoneSoup(text, selfClosingTags='selfclosing')
        self.assertEqual(soup.decode(),
                         '<p>test1<selfclosing />test2</p>')

    def testSelfClosingTagOrNot(self):
        # The expected renderings differ: BeautifulStoneSoup keeps <link> as
        # a container, BeautifulSoup renders it self-closing.
        text = "<item><link>http://foo.com/</link></item>"
        self.assertEqual(BeautifulStoneSoup(text).decode(), text)
        self.assertEqual(BeautifulSoup(text).decode(),
                         '<item><link />http://foo.com/</item>')

    def testBooleanAttributes(self):
        text = "<td nowrap>foo</td>"
        self.assertSoupEquals(text, text)

    def testCData(self):
        xml = "<root>foo<![CDATA[foobar]]>bar</root>"
        self.assertSoupEquals(xml, xml)
        r = re.compile("foo.*bar")
        soup = BeautifulSoup(xml)
        self.assertEqual(soup.find(text=r).string, "foobar")
        self.assertEqual(soup.find(text=r).__class__, CData)

    def testComments(self):
        xml = "foo<!--foobar-->baz"
        self.assertSoupEquals(xml)
        r = re.compile("foo.*bar")
        soup = BeautifulSoup(xml)
        self.assertEqual(soup.find(text=r).string, "foobar")
        self.assertEqual(soup.find(text="foobar").__class__, Comment)

    def testDeclaration(self):
        xml = "foo<!DOCTYPE foobar>baz"
        self.assertSoupEquals(xml)
        r = re.compile(".*foo.*bar")
        soup = BeautifulSoup(xml)
        text = "DOCTYPE foobar"
        self.assertEqual(soup.find(text=r).string, text)
        self.assertEqual(soup.find(text=text).__class__, Declaration)

        namespaced_doctype = ('<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">'
                              '<html>foo</html>')
        soup = BeautifulSoup(namespaced_doctype)
        self.assertEqual(soup.contents[0],
                         'DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"')
        self.assertEqual(soup.html.contents[0], 'foo')

    def testEntityConversions(self):
        text = "&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;"
        # The result is discarded; this call only exercises the parser.
        BeautifulStoneSoup(text)
        self.assertSoupEquals(text)

        xmlEnt = BeautifulStoneSoup.XML_ENTITIES
        htmlEnt = BeautifulStoneSoup.HTML_ENTITIES
        xhtmlEnt = BeautifulStoneSoup.XHTML_ENTITIES

        soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
        self.assertEqual(soup.decode(), "<<sacr&eacute; bleu!>>")

        soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
        self.assertEqual(soup.decode(), u"<<sacr\xe9 bleu!>>")

        # Make sure the "XML", "HTML", and "XHTML" settings work.
        text = "&lt;&trade;&apos;"
        soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
        self.assertEqual(soup.decode(), u"<&trade;'")

        soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
        self.assertEqual(soup.decode(), u"<\u2122&apos;")

        soup = BeautifulStoneSoup(text, convertEntities=xhtmlEnt)
        self.assertEqual(soup.decode(), u"<\u2122'")

    def testNonBreakingSpaces(self):
        soup = BeautifulSoup("<a>&nbsp;&nbsp;</a>",
                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        self.assertEqual(soup.decode(), u"<a>\xa0\xa0</a>")

    def testWhitespaceInDeclaration(self):
        self.assertSoupEquals('<! DOCTYPE>', '<!DOCTYPE>')

    def testJunkInDeclaration(self):
        self.assertSoupEquals('<! Foo = -8>a', '<!Foo = -8>a')

    def testIncompleteDeclaration(self):
        self.assertSoupEquals('a<!b <p>c')

    def testEntityReplacement(self):
        self.assertSoupEquals('<b>hello&nbsp;there</b>')

    def testEntitiesInAttributeValues(self):
        # Decimal and hexadecimal character references give the same output.
        self.assertSoupEquals('<x t="x&#241;">', '<x t="x\xc3\xb1"></x>',
                              encoding='utf-8')
        self.assertSoupEquals('<x t="x&#xf1;">', '<x t="x\xc3\xb1"></x>',
                              encoding='utf-8')

        soup = BeautifulSoup('<x t="&gt;&trade;">',
                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        self.assertEqual(soup.decode(), u'<x t="&gt;\u2122"></x>')

        uri = "http://crummy.com?sacr&eacute;&amp;bleu"
        link = '<a href="%s"></a>' % uri

        soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.assertEqual(soup.decode(),
                         link.replace("&eacute;", u"\xe9"))

        uri = "http://crummy.com?sacr&eacute;&bleu"
        link = '<a href="%s"></a>' % uri
        soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.assertEqual(soup.a['href'],
                         uri.replace("&eacute;", u"\xe9"))

    def testNakedAmpersands(self):
        html = {'convertEntities':BeautifulStoneSoup.HTML_ENTITIES}
        soup = BeautifulStoneSoup("AT&T ", **html)
        self.assertEqual(soup.decode(), 'AT&amp;T ')

        nakedAmpersandInASentence = "AT&T was Ma Bell"
        soup = BeautifulStoneSoup(nakedAmpersandInASentence,**html)
        self.assertEqual(soup.decode(),
                         nakedAmpersandInASentence.replace('&','&amp;'))

        invalidURL = '<a href="http://example.org?a=1&b=2;3">foo</a>'
        validURL = invalidURL.replace('&','&amp;')
        soup = BeautifulStoneSoup(invalidURL)
        self.assertEqual(soup.decode(), validURL)

        # An already-escaped ampersand must not be escaped a second time.
        soup = BeautifulStoneSoup(validURL)
        self.assertEqual(soup.decode(), validURL)
class EncodeRed(SoupTest):
    """Tests encoding conversion, Unicode conversion, and Microsoft
    smart quote fixes."""

    def testUnicodeDammitStandalone(self):
        markup = "<foo>\x92</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode, "<foo>&#x2019;</foo>")

        hebrew = "\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9')
        self.assertEqual(dammit.originalEncoding, 'iso-8859-8')

    def testGarbageInGarbageOut(self):
        # Named asciiData rather than "ascii" so the Python 3 builtin of
        # that name is not shadowed.
        asciiData = "<foo>a</foo>"
        asciiSoup = BeautifulStoneSoup(asciiData)
        self.assertEqual(asciiData, asciiSoup.decode())

        unicodeData = u"<foo>\u00FC</foo>"
        utf8 = unicodeData.encode("utf-8")
        self.assertEqual(utf8, '<foo>\xc3\xbc</foo>')

        unicodeSoup = BeautifulStoneSoup(unicodeData)
        self.assertEqual(unicodeData, unicodeSoup.decode())
        self.assertEqual(unicodeSoup.foo.string, u'\u00FC')

        utf8Soup = BeautifulStoneSoup(utf8, fromEncoding='utf-8')
        self.assertEqual(utf8, utf8Soup.encode('utf-8'))
        self.assertEqual(utf8Soup.originalEncoding, "utf-8")

        # Unicode input has no original encoding to report.
        utf8Soup = BeautifulStoneSoup(unicodeData)
        self.assertEqual(utf8, utf8Soup.encode('utf-8'))
        self.assertEqual(utf8Soup.originalEncoding, None)

    def testHandleInvalidCodec(self):
        # A bogus fromEncoding must not prevent detection of the real one.
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            soup = BeautifulSoup(u"Räksmörgås".encode("utf-8"),
                                 fromEncoding=bad_encoding)
            self.assertEqual(soup.originalEncoding, 'utf-8')

    def testUnicodeSearch(self):
        html = u'<html><body><h1>Räksmörgås</h1></body></html>'
        soup = BeautifulSoup(html)
        self.assertEqual(soup.find(text=u'Räksmörgås'),u'Räksmörgås')

    def testRewrittenXMLHeader(self):
        euc_jp = '<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n'
        utf8 = "<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n"
        soup = BeautifulStoneSoup(euc_jp)
        # Raised explicitly, ahead of the assertion below, so the failure
        # carries an explanation about codec availability.
        if soup.originalEncoding != "euc-jp":
            raise Exception("Test failed when parsing euc-jp document. "
                            "If you're running Python >=2.4, or you have "
                            "cjkcodecs installed, this is a real problem. "
                            "Otherwise, ignore it.")

        self.assertEqual(soup.originalEncoding, "euc-jp")
        self.assertEqual(soup.renderContents('utf-8'), utf8)

        old_text = "<?xml encoding='windows-1252'><foo>\x92</foo>"
        new_text = "<?xml version='1.0' encoding='utf-8'?><foo>&rsquo;</foo>"
        self.assertSoupEquals(old_text, new_text)

    def testRewrittenMetaTag(self):
        no_shift_jis_html = '''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>'''
        soup = BeautifulSoup(no_shift_jis_html)

        # Beautiful Soup used to try to rewrite the meta tag even if the
        # meta tag got filtered out by the strainer. This test makes
        # sure that doesn't happen.
        strainer = SoupStrainer('pre')
        soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese=strainer)
        self.assertEqual(soup.contents[0].name, 'pre')

        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type" />')
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja" />'
            '</head><body><pre>\n'
            '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            '\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            '\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n'
            '</pre></body></html>') % meta_tag
        soup = BeautifulSoup(shift_jis_html)
        # Raised explicitly, ahead of the assertion below, so the failure
        # carries an explanation about codec availability.
        if soup.originalEncoding != "shift-jis":
            raise Exception("Test failed when parsing shift-jis document "
                            "with meta tag '%s'. "
                            "If you're running Python >=2.4, or you have "
                            "cjkcodecs installed, this is a real problem. "
                            "Otherwise, ignore it." % meta_tag)
        self.assertEqual(soup.originalEncoding, "shift-jis")

        # In the tree the charset is a placeholder that is filled in when
        # the tag is rendered.
        content_type_tag = soup.meta['content']
        self.assertEqual(content_type_tag[content_type_tag.find('charset='):],
                         'charset=%SOUP-ENCODING%')
        content_type = str(soup.meta)
        index = content_type.find('charset=')
        self.assertEqual(content_type[index:index+len('charset=utf-8')],
                         'charset=utf-8')
        content_type = soup.meta.encode('shift-jis')
        index = content_type.find('charset=')
        self.assertEqual(content_type[index:index+len('charset=shift-jis')],
                         'charset=shift-jis'.encode())

        self.assertEqual(soup.encode('utf-8'), (
                '<html><head>\n'
                '<meta content="text/html; charset=utf-8" '
                'http-equiv="Content-type" />\n'
                '<meta http-equiv="Content-language" content="ja" />'
                '</head><body><pre>\n'
                '\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3'
                '\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3'
                '\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6'
                '\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3'
                '\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n'
                '</pre></body></html>'))
        self.assertEqual(soup.encode("shift-jis"),
                         shift_jis_html.replace('x-sjis'.encode(),
                                                'shift-jis'.encode()))

        isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
        soup = BeautifulSoup(isolatin)

        utf8 = isolatin.replace("ISO-Latin-1".encode(), "utf-8".encode())
        utf8 = utf8.replace("\xe9", "\xc3\xa9")
        self.assertSoupEquals(soup.encode("utf-8"), utf8, encoding='utf-8')

    def testHebrew(self):
        iso_8859_8= '<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n'
        utf8 = '<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n'
        soup = BeautifulStoneSoup(iso_8859_8, fromEncoding="iso-8859-8")
        self.assertEqual(soup.encode('utf-8'), utf8)

    def testSmartQuotesNotSoSmartAnymore(self):
        self.assertSoupEquals("\x91Foo\x92 <!--blah-->",
                              '&lsquo;Foo&rsquo; <!--blah-->')

    def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self):
        smartQuotes = "Il a dit, \x8BSacr&eacute; bl&#101;u!\x9b"
        soup = BeautifulSoup(smartQuotes)
        self.assertEqual(soup.decode(),
                         'Il a dit, &lsaquo;Sacr&eacute; bl&#101;u!&rsaquo;')
        soup = BeautifulSoup(smartQuotes, convertEntities="html")
        self.assertEqual(soup.encode('utf-8'),
                         'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba')

    def testDontSeeSmartQuotesWhereThereAreNone(self):
        utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        self.assertSoupEquals(utf_8, encoding='utf-8')
class Whitewash(SoupTest):
    """Test whitespace preservation."""

    def testPreservedWhitespace(self):
        # Each <pre> snippet must render exactly as it was written.
        for markup in ("<pre> </pre>", "<pre> woo </pre>"):
            self.assertSoupEquals(markup)

    def testCollapsedWhitespace(self):
        source = "<p> </p>"
        expected = "<p> </p>"
        self.assertSoupEquals(source, expected)
# Run every test case in this module when the file is executed as a script.
if "__main__" == __name__:
    unittest.main()