1 # -*- coding: utf-8 -*-
2 """Unit tests for Beautiful Soup.
4 These tests make sure the Beautiful Soup works as it should. If you
5 find a bug in Beautiful Soup, the best way to express it is as a test
6 case like this that fails."""
9 from BeautifulSoup
import *
11 class SoupTest(unittest
.TestCase
):
13 def assertSoupEquals(self
, toParse
, rep
=None, c
=BeautifulSoup
,
15 """Parse the given text and make sure its string rep is the other
23 rep2
= obj
.encode(encoding
)
24 self
.assertEqual(rep2
, rep
)
26 class FollowThatTag(SoupTest
):
28 "Tests the various ways of fetching tags from a soup."
35 <b href="foo" id="x">4</a>
36 <ac width=100>4</ac>"""
37 self
.soup
= BeautifulStoneSoup(ml
)
def testFindAllByName(self):
    """Calling the soup with a name agrees with findAll() and a strainer."""
    anchors = self.soup('a')
    self.assertEqual(len(anchors), 2)
    first = anchors[0]
    self.assertEqual(first.name, 'a')
    # The call shortcut is compared against both explicit lookups.
    by_name = self.soup.findAll('a')
    self.assertEqual(anchors, by_name)
    by_strainer = self.soup.findAll(SoupStrainer('a'))
    self.assertEqual(anchors, by_strainer)
def testFindAllByAttribute(self):
    """Exercise findAll() with attribute filters supplied several ways."""
    find_all = self.soup.findAll

    # Keyword form: the two id='x' tags come back as <a> then <b>.
    by_keyword = find_all(id='x')
    self.assertEqual(len(by_keyword), 2)
    self.assertEqual(by_keyword[0].name, 'a')
    self.assertEqual(by_keyword[1].name, 'b')

    # An attrs dictionary and a SoupStrainer give the same result.
    by_attrs = find_all(attrs={'id' : 'x'})
    self.assertEqual(by_keyword, by_attrs)

    id_strainer = SoupStrainer(attrs={'id' : 'x'})
    self.assertEqual(by_keyword, find_all(id_strainer))

    self.assertEqual(len(find_all(id=None)), 1)

    self.assertEqual(len(find_all(width=100)), 1)
    self.assertEqual(len(find_all(junk=None)), 5)
    self.assertEqual(len(find_all(junk=[1, None])), 5)

    self.assertEqual(len(find_all(junk=re.compile('.*'))), 0)
    self.assertEqual(len(find_all(junk=True)), 0)

    self.assertEqual(len(find_all(junk=True)), 0)
    self.assertEqual(len(find_all(href=True)), 1)
def testFindallByClass(self):
    """find('a', '1') returns the <a> tag whose string is "Bar"."""
    markup = '<a>Foo</a><a class="1">Bar</a>'
    parsed = BeautifulSoup(markup)
    match = parsed.find('a', '1')
    self.assertEqual(match.string, "Bar")
def testFindAllByList(self):
    """Calling the soup with a list of tag names yields three tags."""
    wanted_names = ['a', 'ac']
    found = self.soup(wanted_names)
    self.assertEqual(len(found), 3)
def testFindAllByHash(self):
    """Calling the soup with a dict keyed by tag name yields four tags."""
    wanted_names = {'a' : True, 'b' : True}
    found = self.soup(wanted_names)
    self.assertEqual(len(found), 4)
82 def testFindAllText(self
):
83 soup
= BeautifulSoup("<html>\xbb</html>")
84 self
.assertEqual(soup
.findAll(text
=re
.compile('.*')),
87 def testFindAllByRE(self
):
90 self
.assertEqual(len(self
.soup(r
)), 3)
def testFindAllByMethod(self):
    """findAll() accepts a callable and keeps the tags it approves."""
    def nameEqualsId(candidate):
        # True when the tag's name is the same as its 'id' attribute.
        return candidate.name == candidate.get('id')

    approved = self.soup.findAll(nameEqualsId)
    self.assertEqual(len(approved), 2)
    first = approved[0]
    self.assertEqual(first.name, 'a')
100 def testParents(self
):
101 soup
= BeautifulSoup('<ul id="foo"></ul><ul id="foo"><ul><ul id="foo" a="b"><b>Blah')
103 self
.assertEquals(len(b
.findParents('ul', {'id' : 'foo'})), 2)
104 self
.assertEquals(b
.findParent('ul')['a'], 'b')
# Shared fixture: four nested, unclosed <b> tags with ids 1 through 4.
PROXIMITY_TEST = BeautifulSoup(
    '<b id="1"><b id="2"><b id="3"><b id="4">'
)
109 soup
= self
.PROXIMITY_TEST
110 b
= soup
.find('b', {'id' : 2})
111 self
.assertEquals(b
.findNext('b')['id'], '3')
112 self
.assertEquals(b
.findNext('b')['id'], '3')
113 self
.assertEquals(len(b
.findAllNext('b')), 2)
114 self
.assertEquals(len(b
.findAllNext('b', {'id' : 4})), 1)
def testPrevious(self):
    """Check findPrevious()/findAllPrevious() starting at <b id="3">."""
    # assertEqual replaces the deprecated assertEquals alias, matching the
    # spelling used by the other tests in this file.
    soup = self.PROXIMITY_TEST
    b = soup.find('b', {'id' : 3})
    self.assertEqual(b.findPrevious('b')['id'], '2')
    self.assertEqual(b.findPrevious('b')['id'], '2')
    self.assertEqual(len(b.findAllPrevious('b')), 2)
    self.assertEqual(len(b.findAllPrevious('b', {'id' : 2})), 1)
# Shared fixture: blockquotes 1-3 each wrap a nested "N.1" blockquote;
# blockquote 4 is left unclosed.
SIBLING_TEST = BeautifulSoup(
    '<blockquote id="1"><blockquote id="1.1"></blockquote></blockquote>'
    '<blockquote id="2"><blockquote id="2.1"></blockquote></blockquote>'
    '<blockquote id="3"><blockquote id="3.1"></blockquote></blockquote>'
    '<blockquote id="4">'
)
127 def testNextSibling(self
):
128 soup
= self
.SIBLING_TEST
130 b
= soup
.find(tag
, {'id' : 2})
131 self
.assertEquals(b
.findNext(tag
)['id'], '2.1')
132 self
.assertEquals(b
.findNextSibling(tag
)['id'], '3')
133 self
.assertEquals(b
.findNextSibling(tag
)['id'], '3')
134 self
.assertEquals(len(b
.findNextSiblings(tag
)), 2)
135 self
.assertEquals(len(b
.findNextSiblings(tag
, {'id' : 4})), 1)
137 def testPreviousSibling(self
):
138 soup
= self
.SIBLING_TEST
140 b
= soup
.find(tag
, {'id' : 3})
141 self
.assertEquals(b
.findPrevious(tag
)['id'], '2.1')
142 self
.assertEquals(b
.findPreviousSibling(tag
)['id'], '2')
143 self
.assertEquals(b
.findPreviousSibling(tag
)['id'], '2')
144 self
.assertEquals(len(b
.findPreviousSiblings(tag
)), 2)
145 self
.assertEquals(len(b
.findPreviousSiblings(tag
, id=1)), 1)
def testTextNavigation(self):
    """Navigate from the text node 'Baz' to its parent, successors and siblings."""
    # assertEqual replaces the deprecated assertEquals alias.
    soup = BeautifulSoup('Foo<b>Bar</b><i id="1"><b>Baz<br />Blee<hr id="1"/></b></i>Blargh')
    baz = soup.find(text='Baz')
    self.assertEqual(baz.findParent("i")['id'], '1')
    self.assertEqual(baz.findNext(text='Blee'), 'Blee')
    self.assertEqual(baz.findNextSibling(text='Blee'), 'Blee')
    # 'Blargh' exists in the document but is not found as a sibling of 'Baz'.
    self.assertEqual(baz.findNextSibling(text='Blargh'), None)
    self.assertEqual(baz.findNextSibling('hr')['id'], '1')
class SiblingRivalry(SoupTest):
    "Tests the nextSibling and previousSibling navigation."

    def testSiblings(self):
        """Follow nextSibling/previousSibling links through a small list."""
        soup = BeautifulSoup("<ul><li>1<p>A</p>B<li>2<li>3</ul>")
        secondLI = soup.find('li').nextSibling
        # Two assertEqual calls replace the deprecated assert_(a and b) so
        # a failure reports which half of the condition did not hold.
        self.assertEqual(secondLI.name, 'li')
        self.assertEqual(secondLI.string, '2')
        self.assertEqual(soup.find(text='1').nextSibling.name, 'p')
        self.assertEqual(soup.find('p').nextSibling, 'B')
        # Stepping forward, back, and forward again lands on the same node.
        self.assertEqual(
            soup.find('p').nextSibling.previousSibling.nextSibling, 'B')
167 class TagsAreObjectsToo(SoupTest
):
168 "Tests the various built-in functions of Tag objects."
171 soup
= BeautifulSoup("<top>1<b>2</b>3</top>")
172 self
.assertEquals(len(soup
.top
), 3)
class StringEmUp(SoupTest):
    "Tests the use of 'string' as an alias for a tag's only content."

    def testString(self):
        """A tag with a single text child exposes it as .string."""
        s = BeautifulSoup("<b>foo</b>")
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(s.b.string, 'foo')

    def testLackOfString(self):
        """A tag with mixed text and tag children has a falsy .string."""
        s = BeautifulSoup("<b>f<i>e</i>o</b>")
        # assertFalse replaces the deprecated assert_(not ...) spelling.
        self.assertFalse(s.b.string)
class ThatsMyLimit(SoupTest):
    "Tests the limit argument."

    def testBasicLimits(self):
        """limit= caps the number of results from findAll() and from calling the soup."""
        s = BeautifulSoup('<br id="1" /><br id="1" /><br id="1" /><br id="1" />')
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(len(s.findAll('br')), 4)
        self.assertEqual(len(s.findAll('br', limit=2)), 2)
        self.assertEqual(len(s('br', limit=2)), 2)
194 class OnlyTheLonely(SoupTest
):
195 "Tests the parseOnly argument to the constructor."
199 x
.append('<a id="%s">' % i
)
200 for j
in range(100,103):
201 x
.append('<b id="%s.%s">Content %s.%s</b>' % (i
,j
, i
,j
))
206 strainer
= SoupStrainer("b")
207 soup
= BeautifulSoup(self
.x
, parseOnlyThese
=strainer
)
208 self
.assertEquals(len(soup
), 15)
210 strainer
= SoupStrainer(id=re
.compile("100.*"))
211 soup
= BeautifulSoup(self
.x
, parseOnlyThese
=strainer
)
212 self
.assertEquals(len(soup
), 5)
214 strainer
= SoupStrainer(text
=re
.compile("10[01].*"))
215 soup
= BeautifulSoup(self
.x
, parseOnlyThese
=strainer
)
216 self
.assertEquals(len(soup
), 10)
218 strainer
= SoupStrainer(text
=lambda(x
):x
[8]=='3')
219 soup
= BeautifulSoup(self
.x
, parseOnlyThese
=strainer
)
220 self
.assertEquals(len(soup
), 3)
222 class PickleMeThis(SoupTest
):
223 "Testing features like pickle and deepcopy."
226 self
.page
= """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
227 "http://www.w3.org/TR/REC-html40/transitional.dtd">
230 <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
231 <title>Beautiful Soup: We called him Tortoise because he taught us.</title>
232 <link rev="made" href="mailto:leonardr@segfault.org">
233 <meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
234 <meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
235 <meta name="author" content="Leonard Richardson">
238 <a href="foo">foo</a>
239 <a href="foo"><b>bar</b></a>
243 self
.soup
= BeautifulSoup(self
.page
)
245 def testPickle(self
):
247 dumped
= pickle
.dumps(self
.soup
, 2)
248 loaded
= pickle
.loads(dumped
)
249 self
.assertEqual(loaded
.__class
__, BeautifulSoup
)
250 self
.assertEqual(loaded
.decode(), self
.soup
.decode())
def testDeepcopy(self):
    """A deep copy of the soup decodes to the same markup as the original."""
    from copy import deepcopy
    # Copying a minimal document first; its result is deliberately unused.
    tiny = BeautifulSoup("<a></a>")
    deepcopy(tiny)
    duplicate = deepcopy(self.soup)
    duplicate_markup = duplicate.decode()
    self.assertEqual(duplicate_markup, self.soup.decode())
def testUnicodePickle(self):
    """A soup containing character 0xc3 survives a pickle round trip."""
    # cPickle only exists on Python 2; fall back to the plain pickle
    # module so the test does not die with ImportError elsewhere.
    try:
        import cPickle as pickle
    except ImportError:
        import pickle
    html = "<b>" + chr(0xc3) + "</b>"
    soup = BeautifulSoup(html)
    dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
    # Safe here: the pickled bytes were produced by this test itself.
    loaded = pickle.loads(dumped)
    self.assertEqual(loaded.decode(), soup.decode())
267 class WriteOnlyCode(SoupTest
):
268 "Testing the modification of the tree."
270 def testModifyAttributes(self
):
271 soup
= BeautifulSoup('<a id="1"></a>')
273 self
.assertEqual(soup
.decode(), '<a id="2"></a>')
275 self
.assertEqual(soup
.decode(), '<a></a>')
276 soup
.a
['id2'] = 'foo'
277 self
.assertEqual(soup
.decode(), '<a id2="foo"></a>')
279 def testNewTagCreation(self
):
280 "Makes sure tags don't step on each others' toes."
281 soup
= BeautifulSoup()
284 a
['href'] = 'http://foo.com/'
285 self
.assertRaises(KeyError, lambda : ol
['href'])
287 def testTagReplacement(self
):
288 # Make sure you can replace an element with itself.
289 text
= "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
290 soup
= BeautifulSoup(text
)
292 soup
.c
.replaceWith(c
)
293 self
.assertEquals(soup
.decode(), text
)
296 soup
= BeautifulSoup("<b>Argh!</b>")
297 soup
.find(text
="Argh!").replaceWith("Hooray!")
298 newText
= soup
.find(text
="Hooray!")
300 self
.assertEqual(newText
.previous
, b
)
301 self
.assertEqual(newText
.parent
, b
)
302 self
.assertEqual(newText
.previous
.next
, newText
)
303 self
.assertEqual(newText
.next
, None)
305 # A more complex case
306 soup
= BeautifulSoup("<a><b>Argh!</b><c></c><d></d></a>")
307 soup
.b
.insert(1, "Hooray!")
308 newText
= soup
.find(text
="Hooray!")
309 self
.assertEqual(newText
.previous
, "Argh!")
310 self
.assertEqual(newText
.previous
.next
, newText
)
312 self
.assertEqual(newText
.previousSibling
, "Argh!")
313 self
.assertEqual(newText
.previousSibling
.nextSibling
, newText
)
315 self
.assertEqual(newText
.nextSibling
, None)
316 self
.assertEqual(newText
.next
, soup
.c
)
318 text
= "<html>There's <b>no</b> business like <b>show</b> business</html>"
319 soup
= BeautifulSoup(text
)
320 no
, show
= soup
.findAll('b')
322 self
.assertEquals(soup
.decode(), "<html>There's business like <b>no</b> business</html>")
325 soup
= BeautifulSoup("<a><b>Find</b><c>lady!</c><d></d></a>")
326 tag
= Tag(soup
, 'magictag')
328 soup
.a
.insert(1, tag
)
332 theText
= tag
.find(text
=True)
333 findText
= b
.find(text
="Find")
335 self
.assertEqual(findText
.next
, tag
)
336 self
.assertEqual(tag
.previous
, findText
)
337 self
.assertEqual(b
.nextSibling
, tag
)
338 self
.assertEqual(tag
.previousSibling
, b
)
339 self
.assertEqual(tag
.nextSibling
, c
)
340 self
.assertEqual(c
.previousSibling
, tag
)
342 self
.assertEqual(theText
.next
, c
)
343 self
.assertEqual(c
.previous
, theText
)
345 # Aand... incredibly complex.
346 soup
= BeautifulSoup("""<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")
351 weText
= a
.find(text
="We")
352 soup
.b
.replaceWith(soup
.f
)
353 self
.assertEqual(soup
.decode(), "<a>We<f>refuse</f></a><e>to<g>service</g></e>")
355 self
.assertEqual(f
.previous
, weText
)
356 self
.assertEqual(weText
.next
, f
)
357 self
.assertEqual(f
.previousSibling
, weText
)
358 self
.assertEqual(f
.nextSibling
, None)
359 self
.assertEqual(weText
.nextSibling
, f
)
def testAppend(self):
    """Appending an existing tag to another paragraph moves it there."""
    markup = "<p>Don't leave me <b>here</b>.</p> <p>Don't leave me.</p>"
    soup = BeautifulSoup(markup)
    second_para = soup('p')[1]
    bold = soup.find('b')

    # Both nodes are looked up again for the append itself.
    destination = soup('p')[1]
    moved = soup.find('b')
    destination.append(moved)

    self.assertEqual(bold.parent, second_para)
    expected = ("<p>Don't leave me .</p> "
                "<p>Don't leave me.<b>here</b></p>")
    self.assertEqual(soup.decode(), expected)
372 def testTagExtraction(self
):
374 text
= '<html><div id="nav">Nav crap</div>Real content here.</html>'
375 soup
= BeautifulSoup(text
)
376 extracted
= soup
.find("div", id="nav").extract()
377 self
.assertEqual(soup
.decode(), "<html>Real content here.</html>")
378 self
.assertEqual(extracted
.decode(), '<div id="nav">Nav crap</div>')
380 # A simple case, a more complex test.
381 text
= "<doc><a>1<b>2</b></a><a>i<b>ii</b></a><a>A<b>B</b></a></doc>"
382 soup
= BeautifulStoneSoup(text
)
384 numbers
, roman
, letters
= soup("a")
386 self
.assertEqual(roman
.parent
, doc
)
387 oldPrevious
= roman
.previous
388 endOfThisTag
= roman
.nextSibling
.previous
389 self
.assertEqual(oldPrevious
, "2")
390 self
.assertEqual(roman
.next
, "i")
391 self
.assertEqual(endOfThisTag
, "ii")
392 self
.assertEqual(roman
.previousSibling
, numbers
)
393 self
.assertEqual(roman
.nextSibling
, letters
)
396 self
.assertEqual(roman
.parent
, None)
397 self
.assertEqual(roman
.previous
, None)
398 self
.assertEqual(roman
.next
, "i")
399 self
.assertEqual(letters
.previous
, '2')
400 self
.assertEqual(roman
.previousSibling
, None)
401 self
.assertEqual(roman
.nextSibling
, None)
402 self
.assertEqual(endOfThisTag
.next
, None)
403 self
.assertEqual(roman
.b
.contents
[0].next
, None)
404 self
.assertEqual(numbers
.nextSibling
, letters
)
405 self
.assertEqual(letters
.previousSibling
, numbers
)
406 self
.assertEqual(len(doc
.contents
), 2)
407 self
.assertEqual(doc
.contents
[0], numbers
)
408 self
.assertEqual(doc
.contents
[1], letters
)
410 # A more complex case.
411 text
= "<a>1<b>2<c>Hollywood, baby!</c></b></a>3"
412 soup
= BeautifulStoneSoup(text
)
413 one
= soup
.find(text
="1")
414 three
= soup
.find(text
="3")
417 self
.assertEqual(one
.next
, three
)
418 self
.assertEqual(three
.previous
, one
)
419 self
.assertEqual(one
.parent
.nextSibling
, three
)
420 self
.assertEqual(three
.previousSibling
, soup
.a
)
class TheManWithoutAttributes(SoupTest):
    "Test attribute access"

    def testHasKey(self):
        """has_key() reports an attribute that is present on the tag."""
        markup = "<foo attr='bar'>"
        foo_tag = BeautifulSoup(markup).foo
        self.assertTrue(foo_tag.has_key('attr'))
class QuoteMeOnThat(SoupTest):
    "Tests how attribute values are quoted when the soup is rendered."

    def testQuotedAttributeValues(self):
        """Check the quoting chosen for plain, double-quoted and mixed values."""
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

        # A value containing double quotes keeps its single-quote delimiters.
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = BeautifulSoup(text)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(soup.decode(), text)

        # A value holding both quote styles gets its apostrophe escaped.
        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        newText = """<foo attr='Brawls happen at "Bob&squot;s Bar"'>a</foo>"""
        self.assertSoupEquals(soup.decode(), newText)

        # NOTE(review): input and expected output both carry a bare '&'
        # here; the expected side may have lost an entity escape - confirm.
        self.assertSoupEquals('<this is="really messed up & stuff">',
                              '<this is="really messed up & stuff"></this>')
class YoureSoLiteral(SoupTest):
    "Tests tags whose contents are kept as literal text."

    def testLiteralMode(self):
        """A '<' inside <script> stays text and the following <b> still parses."""
        markup = "<script>if (i<imgs.length)</script><b>Foo</b>"
        parsed = BeautifulSoup(markup)
        script_body = parsed.script.contents[0]
        self.assertEqual(script_body, "if (i<imgs.length)")
        bold_body = parsed.b.contents[0]
        self.assertEqual(bold_body, "Foo")

    def testTextArea(self):
        """Markup inside <textarea> is kept as one literal text node."""
        markup = "<textarea><b>This is an example of an HTML tag</b><&<&</textarea>"
        parsed = BeautifulSoup(markup)
        area_body = parsed.textarea.contents[0]
        self.assertEqual(area_body,
                         "<b>This is an example of an HTML tag</b><&<&")
class OperatorOverload(SoupTest):
    "Our operators do it all! Call now!"

    def testTagNameAsFind(self):
        "Tests that referencing a tag name as a member delegates to find()."
        soup = BeautifulSoup('<b id="1">foo<i>bar</i></b><b>Red herring</b>')
        self.assertEqual(soup.b.i, soup.find('b').find('i'))
        self.assertEqual(soup.b.i.string, 'bar')
        self.assertEqual(soup.b['id'], '1')
        self.assertEqual(soup.b.contents[0], 'foo')
        # assertFalse replaces the deprecated assert_(not ...) spelling;
        # there is no <a> tag in this document.
        self.assertFalse(soup.a)

        #Test the .fooTag variant of .foo.
        self.assertEqual(soup.bTag.iTag.string, 'bar')
        self.assertEqual(soup.b.iTag.string, 'bar')
        self.assertEqual(soup.find('b').find('i'), soup.bTag.iTag)
479 class NestableEgg(SoupTest
):
480 """Here we test tag nesting. TEST THE NEST, DUDE! X-TREME!"""
def testParaInsideBlockquote(self):
    """A <p> opened inside <blockquote> stays nested; the later <p> does not."""
    parsed = BeautifulSoup('<blockquote><p><b>Foo</blockquote><p>Bar')
    quote = parsed.blockquote
    self.assertEqual(quote.p.b.string, 'Foo')
    self.assertEqual(quote.b.string, 'Foo')
    # A non-recursive find only sees the top-level paragraph.
    top_level_para = parsed.find('p', recursive=False)
    self.assertEqual(top_level_para.string, 'Bar')
488 def testNestedTables(self
):
489 text
= """<table id="1"><tr><td>Here's another table:
490 <table id="2"><tr><td>Juicy text</td></tr></table></td></tr></table>"""
491 soup
= BeautifulSoup(text
)
492 self
.assertEquals(soup
.table
.table
.td
.string
, 'Juicy text')
493 self
.assertEquals(len(soup
.findAll('table')), 2)
494 self
.assertEquals(len(soup
.table
.findAll('table')), 1)
495 self
.assertEquals(soup
.find('table', {'id' : 2}).parent
.parent
.parent
.name
,
498 text
= "<table><tr><td><div><table>Foo</table></div></td></tr></table>"
499 soup
= BeautifulSoup(text
)
500 self
.assertEquals(soup
.table
.tr
.td
.div
.table
.contents
[0], "Foo")
502 text
= """<table><thead><tr>Foo</tr></thead><tbody><tr>Bar</tr></tbody>
503 <tfoot><tr>Baz</tr></tfoot></table>"""
504 soup
= BeautifulSoup(text
)
505 self
.assertEquals(soup
.table
.thead
.tr
.contents
[0], "Foo")
def testBadNestedTables(self):
    """A <table> opened directly inside a <tr> is nested under that row."""
    soup = BeautifulSoup("<table><tr><table><tr id='nested'>")
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(soup.table.tr.table.tr['id'], 'nested')
511 class CleanupOnAisleFour(SoupTest
):
512 """Here we test cleanup of text that breaks HTMLParser or is just
515 def testSelfClosingtag(self
):
516 self
.assertEqual(BeautifulSoup("Foo<br/>Bar").find('br').decode(),
519 self
.assertSoupEquals('<p>test1<br/>test2</p>',
520 '<p>test1<br />test2</p>')
522 text
= '<p>test1<selfclosing>test2'
523 soup
= BeautifulStoneSoup(text
)
524 self
.assertEqual(soup
.decode(),
525 '<p>test1<selfclosing>test2</selfclosing></p>')
527 soup
= BeautifulStoneSoup(text
, selfClosingTags
='selfclosing')
528 self
.assertEqual(soup
.decode(),
529 '<p>test1<selfclosing />test2</p>')
def testSelfClosingTagOrNot(self):
    """<link> keeps its content in BeautifulStoneSoup but self-closes in BeautifulSoup."""
    markup = "<item><link>http://foo.com/</link></item>"
    as_xml = BeautifulStoneSoup(markup).decode()
    self.assertEqual(as_xml, markup)
    as_html = BeautifulSoup(markup).decode()
    self.assertEqual(as_html,
                     '<item><link />http://foo.com/</item>')
def testBooleanAttributes(self):
    """A valueless attribute such as nowrap round-trips unchanged."""
    markup = "<td nowrap>foo</td>"
    self.assertSoupEquals(markup, markup)
542 xml
= "<root>foo<![CDATA[foobar]]>bar</root>"
543 self
.assertSoupEquals(xml
, xml
)
544 r
= re
.compile("foo.*bar")
545 soup
= BeautifulSoup(xml
)
546 self
.assertEquals(soup
.find(text
=r
).string
, "foobar")
547 self
.assertEquals(soup
.find(text
=r
).__class
__, CData
)
def testComments(self):
    """A comment round-trips and is found as a Comment by text search."""
    xml = "foo<!--foobar-->baz"
    self.assertSoupEquals(xml)
    r = re.compile("foo.*bar")
    soup = BeautifulSoup(xml)
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(soup.find(text=r).string, "foobar")
    self.assertEqual(soup.find(text="foobar").__class__, Comment)
557 def testDeclaration(self
):
558 xml
= "foo<!DOCTYPE foobar>baz"
559 self
.assertSoupEquals(xml
)
560 r
= re
.compile(".*foo.*bar")
561 soup
= BeautifulSoup(xml
)
562 text
= "DOCTYPE foobar"
563 self
.assertEquals(soup
.find(text
=r
).string
, text
)
564 self
.assertEquals(soup
.find(text
=text
).__class
__, Declaration
)
566 namespaced_doctype
= ('<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">'
568 soup
= BeautifulSoup(namespaced_doctype
)
569 self
.assertEquals(soup
.contents
[0],
570 'DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"')
571 self
.assertEquals(soup
.html
.contents
[0], 'foo')
def testEntityConversions(self):
    """Render the same markup under each convertEntities setting."""
    # NOTE(review): the markup literals below contain no entity references
    # even though this test exercises convertEntities; they may have been
    # entity-decoded at some point - confirm against the upstream suite.
    # They are reproduced exactly as found; only the deprecated
    # assertEquals alias is replaced by assertEqual.
    text = "<<sacré bleu!>>"
    soup = BeautifulStoneSoup(text)
    self.assertSoupEquals(text)

    xmlEnt = BeautifulStoneSoup.XML_ENTITIES
    htmlEnt = BeautifulStoneSoup.HTML_ENTITIES
    xhtmlEnt = BeautifulStoneSoup.XHTML_ENTITIES

    soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
    self.assertEqual(soup.decode(), "<<sacré bleu!>>")

    soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
    self.assertEqual(soup.decode(), "<<sacré bleu!>>")

    soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
    self.assertEqual(soup.decode(), u"<<sacr\xe9 bleu!>>")

    # Make sure the "XML", "HTML", and "XHTML" settings work.
    text = "<™'"
    soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
    self.assertEqual(soup.decode(), u"<™'")

    soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
    self.assertEqual(soup.decode(), u"<\u2122'")

    soup = BeautifulStoneSoup(text, convertEntities=xhtmlEnt)
    self.assertEqual(soup.decode(), u"<\u2122'")
def testNonBreakingSpaces(self):
    """&nbsp; references become U+00A0 when HTML entities are converted."""
    # The input must carry entity references for convertEntities to have
    # anything to convert; the expected output is two U+00A0 characters.
    soup = BeautifulSoup("<a>&nbsp;&nbsp;</a>",
                         convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(soup.decode(), u"<a>\xa0\xa0</a>")
def testWhitespaceInDeclaration(self):
    """Whitespace after '<!' is dropped when the declaration is rendered."""
    source, rendered = '<! DOCTYPE>', '<!DOCTYPE>'
    self.assertSoupEquals(source, rendered)
def testJunkInDeclaration(self):
    """A junk declaration is rendered without the space after '<!'."""
    source, rendered = '<! Foo = -8>a', '<!Foo = -8>a'
    self.assertSoupEquals(source, rendered)
def testIncompleteDeclaration(self):
    """Markup with an unterminated '<!' declaration is passed to assertSoupEquals alone."""
    markup = 'a<!b <p>c'
    self.assertSoupEquals(markup)
616 def testEntityReplacement(self
):
617 self
.assertSoupEquals('<b>hello there</b>')
619 def testEntitiesInAttributeValues(self
):
620 self
.assertSoupEquals('<x t="xñ">', '<x t="x\xc3\xb1"></x>',
622 self
.assertSoupEquals('<x t="xñ">', '<x t="x\xc3\xb1"></x>',
625 soup
= BeautifulSoup('<x t=">™">',
626 convertEntities
=BeautifulStoneSoup
.HTML_ENTITIES
)
627 self
.assertEquals(soup
.decode(), u
'<x t=">\u2122"></x>')
629 uri
= "http://crummy.com?sacré&bleu"
630 link
= '<a href="%s"></a>' % uri
632 soup
= BeautifulSoup(link
, convertEntities
=BeautifulSoup
.HTML_ENTITIES
)
633 self
.assertEquals(soup
.decode(),
634 link
.replace("é", u
"\xe9"))
636 uri
= "http://crummy.com?sacré&bleu"
637 link
= '<a href="%s"></a>' % uri
638 soup
= BeautifulSoup(link
, convertEntities
=BeautifulSoup
.HTML_ENTITIES
)
639 self
.assertEquals(soup
.a
['href'],
640 uri
.replace("é", u
"\xe9"))
def testNakedAmpersands(self):
    """Bare ampersands are rendered as &amp;; existing &amp; is left alone."""
    # Fix: the expected values were built with replace('&', '&'), a no-op
    # that made validURL identical to invalidURL. The escaped form is
    # '&amp;'. assertEqual also replaces the deprecated assertEquals.
    html = {'convertEntities':BeautifulStoneSoup.HTML_ENTITIES}
    soup = BeautifulStoneSoup("AT&T ", **html)
    self.assertEqual(soup.decode(), 'AT&amp;T ')

    nakedAmpersandInASentence = "AT&T was Ma Bell"
    soup = BeautifulStoneSoup(nakedAmpersandInASentence,**html)
    self.assertEqual(soup.decode(),
                     nakedAmpersandInASentence.replace('&','&amp;'))

    invalidURL = '<a href="http://example.org?a=1&b=2;3">foo</a>'
    validURL = invalidURL.replace('&','&amp;')
    soup = BeautifulStoneSoup(invalidURL)
    self.assertEqual(soup.decode(), validURL)

    # Already-escaped markup must come back unchanged.
    soup = BeautifulStoneSoup(validURL)
    self.assertEqual(soup.decode(), validURL)
661 class EncodeRed(SoupTest
):
662 """Tests encoding conversion, Unicode conversion, and Microsoft
663 smart quote fixes."""
def testUnicodeDammitStandalone(self):
    """Use UnicodeDammit directly on smart-quote and Hebrew input."""
    # assertEqual replaces the deprecated assertEquals alias.
    markup = "<foo>\x92</foo>"
    dammit = UnicodeDammit(markup)
    self.assertEqual(dammit.unicode, "<foo>’</foo>")

    # With an explicit encoding hint the bytes decode to Hebrew letters
    # and the hint is reported as the original encoding.
    hebrew = "\xed\xe5\xec\xf9"
    dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
    self.assertEqual(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9')
    self.assertEqual(dammit.originalEncoding, 'iso-8859-8')
def testGarbageInGarbageOut(self):
    """ASCII, Unicode and UTF-8 input all render back to what went in."""
    # assertEqual replaces the deprecated assertEquals alias throughout.
    ascii = "<foo>a</foo>"
    asciiSoup = BeautifulStoneSoup(ascii)
    self.assertEqual(ascii, asciiSoup.decode())

    unicodeData = u"<foo>\u00FC</foo>"
    utf8 = unicodeData.encode("utf-8")
    self.assertEqual(utf8, '<foo>\xc3\xbc</foo>')

    unicodeSoup = BeautifulStoneSoup(unicodeData)
    self.assertEqual(unicodeData, unicodeSoup.decode())
    self.assertEqual(unicodeSoup.foo.string, u'\u00FC')

    # Encoded input with an explicit fromEncoding reports that encoding.
    utf8Soup = BeautifulStoneSoup(utf8, fromEncoding='utf-8')
    self.assertEqual(utf8, utf8Soup.encode('utf-8'))
    self.assertEqual(utf8Soup.originalEncoding, "utf-8")

    # Unicode input has no original encoding to report.
    utf8Soup = BeautifulStoneSoup(unicodeData)
    self.assertEqual(utf8, utf8Soup.encode('utf-8'))
    self.assertEqual(utf8Soup.originalEncoding, None)
def testHandleInvalidCodec(self):
    """A bogus fromEncoding is ignored and the data is detected as utf-8."""
    for bad_encoding in ['.utf8', '...', 'utF---16.!']:
        soup = BeautifulSoup(u"Räksmörgås".encode("utf-8"),
                             fromEncoding=bad_encoding)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(soup.originalEncoding, 'utf-8')
def testUnicodeSearch(self):
    """find(text=...) matches a text node containing non-ASCII characters."""
    markup = u'<html><body><h1>Räksmörgås</h1></body></html>'
    parsed = BeautifulSoup(markup)
    hit = parsed.find(text=u'Räksmörgås')
    self.assertEqual(hit, u'Räksmörgås')
def testRewrittenXMLHeader(self):
    """The XML declaration's encoding is rewritten to match the output encoding."""
    euc_jp = '<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n'
    utf8 = "<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n"
    soup = BeautifulStoneSoup(euc_jp)
    # Raised before the assertion so the report carries the explanation
    # about codec availability.
    if soup.originalEncoding != "euc-jp":
        raise Exception("Test failed when parsing euc-jp document. "
                        "If you're running Python >=2.4, or you have "
                        "cjkcodecs installed, this is a real problem. "
                        "Otherwise, ignore it.")

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(soup.originalEncoding, "euc-jp")
    self.assertEqual(soup.renderContents('utf-8'), utf8)

    old_text = "<?xml encoding='windows-1252'><foo>\x92</foo>"
    new_text = "<?xml version='1.0' encoding='utf-8'?><foo>’</foo>"
    self.assertSoupEquals(old_text, new_text)
725 def testRewrittenMetaTag(self
):
726 no_shift_jis_html
= '''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>'''
727 soup
= BeautifulSoup(no_shift_jis_html
)
729 # Beautiful Soup used to try to rewrite the meta tag even if the
730 # meta tag got filtered out by the strainer. This test makes
731 # sure that doesn't happen.
732 strainer
= SoupStrainer('pre')
733 soup
= BeautifulSoup(no_shift_jis_html
, parseOnlyThese
=strainer
)
734 self
.assertEquals(soup
.contents
[0].name
, 'pre')
736 meta_tag
= ('<meta content="text/html; charset=x-sjis" '
737 'http-equiv="Content-type" />')
740 '<meta http-equiv="Content-language" content="ja" />'
741 '</head><body><pre>\n'
742 '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
743 '\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
744 '\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n'
745 '</pre></body></html>') % meta_tag
746 soup
= BeautifulSoup(shift_jis_html
)
747 if soup
.originalEncoding
!= "shift-jis":
748 raise Exception("Test failed when parsing shift-jis document "
749 "with meta tag '%s'."
750 "If you're running Python >=2.4, or you have "
751 "cjkcodecs installed, this is a real problem. "
752 "Otherwise, ignore it." % meta_tag
)
753 self
.assertEquals(soup
.originalEncoding
, "shift-jis")
755 content_type_tag
= soup
.meta
['content']
756 self
.assertEquals(content_type_tag
[content_type_tag
.find('charset='):],
757 'charset=%SOUP-ENCODING%')
758 content_type
= str(soup
.meta
)
759 index
= content_type
.find('charset=')
760 self
.assertEqual(content_type
[index
:index
+len('charset=utf8')+1],
762 content_type
= soup
.meta
.encode('shift-jis')
763 index
= content_type
.find('charset=')
764 self
.assertEqual(content_type
[index
:index
+len('charset=shift-jis')],
765 'charset=shift-jis'.encode())
767 self
.assertEquals(soup
.encode('utf-8'), (
769 '<meta content="text/html; charset=utf-8" '
770 'http-equiv="Content-type" />\n'
771 '<meta http-equiv="Content-language" content="ja" />'
772 '</head><body><pre>\n'
773 '\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3'
774 '\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3'
775 '\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6'
776 '\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3'
777 '\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n'
778 '</pre></body></html>'))
779 self
.assertEquals(soup
.encode("shift-jis"),
780 shift_jis_html
.replace('x-sjis'.encode(),
781 'shift-jis'.encode()))
783 isolatin
= """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
784 soup
= BeautifulSoup(isolatin
)
786 utf8
= isolatin
.replace("ISO-Latin-1".encode(), "utf-8".encode())
787 utf8
= utf8
.replace("\xe9", "\xc3\xa9")
788 self
.assertSoupEquals(soup
.encode("utf-8"), utf8
, encoding
='utf-8')
def testHebrew(self):
    """An ISO 8859-8 document re-encodes to the expected UTF-8 bytes."""
    iso_8859_8 = '<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n'
    utf8 = '<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n'
    soup = BeautifulStoneSoup(iso_8859_8, fromEncoding="iso-8859-8")
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(soup.encode('utf-8'), utf8)
def testSmartQuotesNotSoSmartAnymore(self):
    """Bytes 0x91/0x92 around 'Foo' are rendered as the expected quote marks."""
    source = "\x91Foo\x92 <!--blah-->"
    rendered = '‘Foo’ <!--blah-->'
    self.assertSoupEquals(source, rendered)
def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self):
    """Render smart-quote bytes with and without convertEntities="html"."""
    smartQuotes = "Il a dit, \x8BSacré bleu!\x9b"
    soup = BeautifulSoup(smartQuotes)
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(soup.decode(),
                     'Il a dit, ‹Sacré bleu!›')
    soup = BeautifulSoup(smartQuotes, convertEntities="html")
    self.assertEqual(soup.encode('utf-8'),
                     'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba')
def testDontSeeSmartQuotesWhereThereAreNone(self):
    """Octal-escaped multi-byte input round-trips when encoded as utf-8."""
    multibyte_markup = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
    self.assertSoupEquals(multibyte_markup, encoding='utf-8')
class Whitewash(SoupTest):
    """Test whitespace preservation."""

    def testPreservedWhitespace(self):
        """Whitespace inside <pre> round-trips unchanged."""
        for markup in ("<pre> </pre>", "<pre> woo </pre>"):
            self.assertSoupEquals(markup)

    def testCollapsedWhitespace(self):
        """Whitespace-only <p> content renders as the expected paragraph."""
        source, rendered = "<p> </p>", "<p> </p>"
        self.assertSoupEquals(source, rendered)
825 if __name__
== '__main__':