Add BeautifulSoup Python HTML/XML parser to Melange repository.
[Melange.git] / app / htmlsanitizer / BeautifulSoup.py
blob34204e740249cb0c387559ebbdb8fab9fc2df005
1 """Beautiful Soup
2 Elixir and Tonic
3 "The Screen-Scraper's Friend"
4 http://www.crummy.com/software/BeautifulSoup/
6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7 tree representation. It provides methods and Pythonic idioms that make
8 it easy to navigate, search, and modify the tree.
10 A well-formed XML/HTML document yields a well-formed data
11 structure. An ill-formed XML/HTML document yields a correspondingly
12 ill-formed data structure. If your document is only locally
13 well-formed, you can use this library to find and process the
14 well-formed part of it.
16 Beautiful Soup works with Python 2.2 and up. It has no external
17 dependencies, but you'll have more success at converting data to UTF-8
18 if you also install these three packages:
20 * chardet, for auto-detecting character encodings
21 http://chardet.feedparser.org/
22 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
23 by stock Python.
24 http://cjkpython.i18n.org/
26 Beautiful Soup defines classes for two main parsing strategies:
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29 language that kind of looks like XML.
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32 or invalid. This class has web browser-like heuristics for
33 obtaining a sensible parse tree in the face of common HTML errors.
35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36 the encoding of an HTML or XML document, and converting it to
37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
39 For more than you ever wanted to know about Beautiful Soup, see the
40 documentation:
41 http://www.crummy.com/software/BeautifulSoup/documentation.html
43 Here, have some legalese:
45 Copyright (c) 2004-2009, Leonard Richardson
47 All rights reserved.
49 Redistribution and use in source and binary forms, with or without
50 modification, are permitted provided that the following conditions are
51 met:
53 * Redistributions of source code must retain the above copyright
54 notice, this list of conditions and the following disclaimer.
56 * Redistributions in binary form must reproduce the above
57 copyright notice, this list of conditions and the following
58 disclaimer in the documentation and/or other materials provided
59 with the distribution.
61 * Neither the name of the Beautiful Soup Consortium and All
62 Night Kosher Bakery nor the names of its contributors may be
63 used to endorse or promote products derived from this software
64 without specific prior written permission.
66 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
78 """
79 from __future__ import generators
81 __author__ = "Leonard Richardson (leonardr@segfault.org)"
82 __version__ = "3.1.0.1"
83 __copyright__ = "Copyright (c) 2004-2009 Leonard Richardson"
84 __license__ = "New-style BSD"
86 import codecs
87 import markupbase
88 import types
89 import re
90 from HTMLParser import HTMLParser, HTMLParseError
91 try:
92 from htmlentitydefs import name2codepoint
93 except ImportError:
94 name2codepoint = {}
95 try:
96 set
97 except NameError:
98 from sets import Set as set
100 #These hacks make Beautiful Soup able to parse XML with namespaces
101 markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
103 DEFAULT_OUTPUT_ENCODING = "utf-8"
105 # First, the classes that represent markup elements.
def sob(unicode, encoding):
    """Hands back the given Unicode string, or its encoded form.

    When `encoding` is None the string is returned untouched; otherwise
    the result of calling its encode() method with `encoding` is
    returned."""
    if encoding is not None:
        return unicode.encode(encoding)
    return unicode
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    Each element carries five links: `parent`, `previous` and `next`
    (the neighbouring elements in parse order) and `previousSibling`
    and `nextSibling` (the neighbours inside the parent's `contents`
    list)."""

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements.

        If the parent already has children, this element is linked as
        the next sibling of the parent's current last child. The
        element is NOT added to parent.contents here."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def _indexOfChild(self, element):
        """Returns the position of `element` itself in self.contents.

        list.index() and list.remove() compare with ==, which for tags
        means "same markup" and for strings "same text", so they can
        pick an equal sibling instead of the element asked for. This
        helper compares by identity. Raises ValueError if `element` is
        not one of this object's children."""
        position = 0
        for child in self.contents:
            if child is element:
                return position
            position = position + 1
        raise ValueError("element is not one of this object's children")

    def replaceWith(self, replaceWith):
        """Puts `replaceWith` where this element currently sits in its
        parent, and detaches this element from the tree.

        `replaceWith` may be one of this element's own siblings;
        insert() compensates for the index shift that extracting such a
        sibling causes, so no correction is made here."""
        oldParent = self.parent
        myIndex = oldParent._indexOfChild(self)
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def extract(self):
        """Destructively rips this element out of the tree.

        Returns the element itself, with its parent, sibling and
        previous links cleared. Its own children stay attached to it."""
        if self.parent:
            try:
                # Remove this very object, not merely an equal one.
                del self.parent.contents[self.parent._indexOfChild(self)]
            except ValueError:
                pass

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous:
            self.previous.next = nextElement
        if nextElement:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None
        return self

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts `newChild` into self.contents at index `position`.

        Plain strings are wrapped in a NavigableString first. A
        position past the end appends. If `newChild` already has a
        parent it is extracted from its old place before being linked
        in here."""
        if (isinstance(newChild, basestring)
            or isinstance(newChild, unicode)) \
            and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent is self:
                index = self._indexOfChild(newChild)
                if index < position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # The element that follows the new child in parse order is
            # the next sibling of the closest ancestor that has one.
            parent = self
            parentsNextSibling = None
            while not parentsNextSibling:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if not parent: # This is the last element in the document.
                    break
            if parentsNextSibling:
                newChildsLastElement.next = parentsNextSibling
            else:
                newChildsLastElement.next = None
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria, or None if no parent matches."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        # Keyword criteria are forwarded so that they are not silently
        # ignored.
        l = self.findParents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Calls `method` with a limit of 1 and returns its only result,
        or None if it found nothing."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        """Iterates over a generator looking for things that match.

        `name` may be a ready-made SoupStrainer; otherwise one is built
        from name, attrs, text and kwargs. Returns a ResultSet holding
        at most `limit` matches (no cap when limit is None or 0)."""

        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            # Build a SoupStrainer
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        for i in generator():
            # The navigation generators finish by yielding None; falsy
            # items are skipped.
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags. Each one follows a single link until
    #that link is falsy, and yields that final falsy value too.
    def nextGenerator(self):
        i = self
        while i:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        """Replaces the %SOUP-ENCODING% placeholder in `str` with the
        encoding name, using "utf-8" when no encoding is given."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.

        With an encoding, `s` is encoded (after str() conversion if it
        is not a string). Without one, byte strings and other objects
        are converted with unicode() and Unicode strings are returned
        unchanged."""
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
class NavigableString(unicode, PageElement):
    """A piece of document text: a Unicode string that also carries
    PageElement's navigation links."""

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if isinstance(value, unicode):
            return unicode.__new__(cls, value)
        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)

    def __getnewargs__(self):
        """Supplies the plain Unicode text as the argument to __new__
        when the object is unpickled."""
        return (unicode(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr == 'string':
            return self
        else:
            raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr))

    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns this text as a byte string in the given encoding."""
        # Encode the Unicode text directly. Going through
        # unicode.decode() first would force an implicit ASCII
        # round-trip, which fails on any non-ASCII text.
        return unicode.encode(self, encoding)

    def decodeGivenEventualEncoding(self, eventualEncoding):
        """Returns the text as it should appear in output markup.
        Plain text needs no wrapper, so the string itself is returned."""
        return self
class CData(NavigableString):
    """Text that came from a CDATA section."""

    def decodeGivenEventualEncoding(self, eventualEncoding):
        """Returns the text wrapped in CDATA section delimiters. The
        eventual encoding is not used."""
        pieces = [u'<![CDATA[', self, u']]>']
        return u''.join(pieces)
class ProcessingInstruction(NavigableString):
    """The text of a processing instruction, without its delimiters."""

    def decodeGivenEventualEncoding(self, eventualEncoding):
        """Returns the instruction wrapped in its delimiters, with any
        %SOUP-ENCODING% placeholder replaced by the eventual encoding."""
        body = self
        if u'%SOUP-ENCODING%' in body:
            body = self.substituteEncoding(body, eventualEncoding)
        return u''.join([u'<?', body, u'?>'])
class Comment(NavigableString):
    """The text of a markup comment, without its delimiters."""

    def decodeGivenEventualEncoding(self, eventualEncoding):
        """Returns the text wrapped in comment delimiters. The eventual
        encoding is not used."""
        pieces = [u'<!--', self, u'-->']
        return u''.join(pieces)
class Declaration(NavigableString):
    """The text of a markup declaration, without its delimiters."""

    def decodeGivenEventualEncoding(self, eventualEncoding):
        """Returns the text wrapped in declaration delimiters. The
        eventual encoding is not used."""
        pieces = [u'<!', self, u'>']
        return u''.join(pieces)
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents.

    `attrs` is a list of (name, value) pairs, `contents` the list of
    child elements. A dictionary view of the attributes is built
    lazily in `attrMap`."""

    def _invert(h):
        "Cheap function to invert a hash."
        i = {}
        for k,v in h.items():
            i[v] = k
        return i

    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
                                      "quot" : '"',
                                      "amp" : "&",
                                      "lt" : "<",
                                      "gt" : ">" }

    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)

    def _convertEntities(self, match):
        """Used in a call to re.sub to replace HTML, XML, and numeric
        entities with the appropriate Unicode characters. If HTML
        entities are being converted, any unrecognized entities are
        escaped."""
        x = match.group(1)
        if self.convertHTMLEntities and x in name2codepoint:
            return unichr(name2codepoint[x])
        elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
            if self.convertXMLEntities:
                return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
            else:
                return u'&%s;' % x
        elif len(x) > 0 and x[0] == '#':
            # Handle numeric entities
            if len(x) > 1 and x[1] == 'x':
                return unichr(int(x[2:], 16))
            else:
                return unichr(int(x[1:]))

        elif self.escapeUnrecognizedEntities:
            return u'&amp;%s;' % x
        else:
            return u'&%s;' % x

    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        """Basic constructor.

        `parser` supplies the self-closing rule for `name` and the
        entity conversion settings. `attrs` is a list of (name, value)
        pairs; entities in the values are converted according to those
        settings."""

        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.name = name
        if attrs is None:
            attrs = []
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False
        self.convertHTMLEntities = parser.convertHTMLEntities
        self.convertXMLEntities = parser.convertXMLEntities
        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

        def convert(kval):
            "Converts HTML, XML and numeric entities in the attribute value."
            k, val = kval
            if val is None:
                return kval
            return (k, re.sub(r"&(#\d+|#x[0-9a-fA-F]+|\w+);",
                              self._convertEntities, val))
        self.attrs = [convert(kval) for kval in self.attrs]

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def has_key(self, key):
        "Tells whether the tag has a 'key' attribute."
        return key in self._getAttrMap()

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        "Tells whether x is one of the tag's direct children."
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag."""
        self._getAttrMap()
        self.attrMap[key] = value
        found = False
        for i in range(0, len(self.attrs)):
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))
        self._getAttrMap()[key] = value

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        # Bad HTML can define the same attribute multiple times, so
        # every occurrence goes. The list is rebuilt in place rather
        # than calling remove() while iterating over it, which skips
        # the entry following each removal.
        self.attrs[:] = [item for item in self.attrs if item[0] != key]
        self._getAttrMap()
        if key in self.attrMap:
            del self.attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        findAll() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return self.findAll(*args, **kwargs)

    def __getattr__(self, tag):
        """tag.foo and tag.fooTag both return the first 'foo' tag found
        beneath this tag (None if there is none). Names starting with
        a double underscore raise AttributeError instead."""
        #print "Getattr %s.%s" % (self.__class__, tag)
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
            return self.find(tag[:-3])
        elif tag.find('__') != 0:
            return self.find(tag)
        raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag))

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        return self.decode(eventualEncoding=encoding)

    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                           + r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           + ")")

    def _sub_entity(self, x):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"

    def __unicode__(self):
        return self.decode()

    def __str__(self):
        return self.encode()

    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
               prettyPrint=False, indentLevel=0):
        """Renders this tag and its contents, encoded in `encoding`."""
        return self.decode(prettyPrint, indentLevel, encoding).encode(encoding)

    def decode(self, prettyPrint=False, indentLevel=0,
               eventualEncoding=DEFAULT_OUTPUT_ENCODING):
        """Returns a string representation of this tag and its contents.

        `eventualEncoding` is the encoding the caller intends to apply
        to the result; it is only used to fill in %SOUP-ENCODING%
        placeholders. A hidden tag renders as its contents alone."""

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isString(val):
                    if (self.containsSubstitutions
                        and eventualEncoding is not None
                        and '%SOUP-ENCODING%' in val):
                        val = self.substituteEncoding(val, eventualEncoding)

                    # The attribute value either:
                    #
                    # * Contains no embedded double quotes or single quotes.
                    #   No problem: we enclose it in double quotes.
                    # * Contains embedded single quotes. No problem:
                    #   double quotes work here too.
                    # * Contains embedded double quotes. No problem:
                    #   we enclose it in single quotes.
                    # * Embeds both single _and_ double quotes. This
                    #   can't happen naturally, but it can happen if
                    #   you modify an attribute value after parsing
                    #   the document. Now we have a bit of a
                    #   problem. We solve it by enclosing the
                    #   attribute in single quotes, and escaping any
                    #   embedded single quotes to XML entities.
                    if '"' in val:
                        fmt = "%s='%s'"
                        if "'" in val:
                            # TODO: replace with apos when
                            # appropriate.
                            # NOTE(review): "&squot;" is not a defined
                            # XML or HTML entity; kept as-is because
                            # changing it alters the rendered output.
                            val = val.replace("'", "&squot;")

                    # Now we're okay w/r/t quotes. But the attribute
                    # value might also contain angle brackets, or
                    # ampersands that aren't part of entities. We need
                    # to escape those to XML entities too.
                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
                if val is None:
                    # Handle boolean attributes.
                    decoded = key
                else:
                    decoded = fmt % (key, val)
                attrs.append(decoded)
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % self.name

        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            # `space` is only defined, and only used, when pretty-printing.
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.decodeContents(prettyPrint, indentContents,
                                       eventualEncoding)
        if self.hidden:
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (self.name, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s

    def decompose(self):
        """Recursively destroys the contents of this tree."""
        # Work on a copy: extracting children mutates self.contents.
        contents = [i for i in self.contents]
        for i in contents:
            if isinstance(i, Tag):
                i.decompose()
            else:
                i.extract()
        self.extract()

    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag pretty-printed, encoded in `encoding`."""
        return self.encode(encoding, True)

    def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag, encoded in `encoding`."""
        # The target encoding is passed down as the eventual encoding,
        # as encode() does, so %SOUP-ENCODING% placeholders name the
        # encoding actually used.
        return self.decodeContents(prettyPrint, indentLevel,
                                   encoding).encode(encoding)

    def decodeContents(self, prettyPrint=False, indentLevel=0,
                       eventualEncoding=DEFAULT_OUTPUT_ENCODING):
        """Renders the contents of this tag as a string.

        `eventualEncoding` is only passed on to the children so they
        can fill in encoding placeholders; no encoding is applied
        here."""
        s=[]
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.decodeGivenEventualEncoding(eventualEncoding)
            elif isinstance(c, Tag):
                s.append(c.decode(prettyPrint, indentLevel, eventualEncoding))
            if text and prettyPrint:
                text = text.strip()
            if text:
                if prettyPrint:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint:
                    s.append("\n")
        return ''.join(s)

    #Soup methods

    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria."""
        r = None
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findChild = find

    def findAll(self, name=None, attrs={}, recursive=True, text=None,
                limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria.  You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
    findChildren = findAll

    # Pre-3.x compatibility methods. Will go away in 4.0.
    first = find
    fetch = findAll

    def fetchText(self, text=None, recursive=True, limit=None):
        return self.findAll(text=text, recursive=recursive, limit=limit)

    def firstText(self, text=None, recursive=True):
        return self.find(text=text, recursive=recursive)

    # 3.x compatibility methods. Will go away in 4.0.
    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents: undecoded when `encoding` is None,
        otherwise encoded in `encoding`."""
        if encoding is None:
            return self.decodeContents(prettyPrint, indentLevel, encoding)
        else:
            return self.encodeContents(encoding, prettyPrint, indentLevel)


    #Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized."""
        # Look in the instance dictionary directly. A plain
        # getattr(self, 'attrMap') falls through to __getattr__, which
        # searches the tree for a child tag named 'attrMap'. An empty
        # map is rebuilt on every call.
        if not self.__dict__.get('attrMap'):
            self.attrMap = {}
            for (key, value) in self.attrs:
                self.attrMap[key] = value
        return self.attrMap

    #Generator methods
    def recursiveChildGenerator(self):
        """Yields every element beneath this tag, in parse order."""
        if not len(self.contents):
            return
        # Stop at the first element that lies outside this tag.
        stopNode = self._lastRecursiveChild().next
        current = self.contents[0]
        while current is not stopNode:
            yield current
            current = current.next

    def childGenerator(self):
        """Yields this tag's children by following nextSibling from
        the first child until a falsy link or element is reached."""
        if not len(self.contents):
            return
        current = self.contents[0]
        while current:
            yield current
            current = current.nextSibling
840 # Next, a couple classes to represent queries and their results.
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text)."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Stores the match criteria.

        A string passed as `attrs` is treated as a value to match
        against the 'class' attribute. Keyword arguments are merged
        into a copy of `attrs` as further attribute criteria."""
        self.name = name
        if isString(attrs):
            kwargs['class'] = attrs
            attrs = None
        if kwargs:
            if attrs:
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        self.attrs = attrs
        self.text = text

    def __str__(self):
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs={}):
        """Matches a tag against the name and attribute criteria.

        `markupName` is either a Tag or a tag name accompanied by
        `markupAttrs`. Returns the Tag (or the name) on a match and
        None otherwise."""
        found = None
        markup = None
        if isinstance(markupName, Tag):
            markup = markupName
            markupAttrs = markup
        callFunctionWithTagData = callable(self.name) \
                                and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                markupAttrMap = None
                # self.attrs is None when attrs=None was passed with
                # no keyword criteria; treat that as "no criteria".
                for attr, matchAgainst in (self.attrs or {}).items():
                    if not markupAttrMap:
                        if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                        else:
                            markupAttrMap = {}
                            for k,v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        """Matches a Tag, a string, or a list of elements against the
        criteria. Returns the matching element, or None. Raises
        Exception for any other kind of object."""
        #print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if isList(markup) and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isString(markup):
            if self._matches(markup, self.text):
                found = markup
        else:
            raise Exception("I don't know how to match against a %s" \
                  % markup.__class__)
        return found

    def _matches(self, markup, matchAgainst):
        """Tests one value (a tag, tag name, attribute value or text)
        against one criterion: True, a callable, a regular expression
        object, a list, a mapping, or a plain value compared by
        equality."""
        #print "Matching %s against %s" % (markup, matchAgainst)
        result = False
        if matchAgainst is True:
            # True means "any value at all".
            result = markup != None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup is not None and not isString(markup):
                markup = unicode(markup)
            #Now we know that chunk is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif (isList(matchAgainst)
                  and (markup is not None or not isString(matchAgainst))):
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # It's a mapping: the mapping is asked about the
                # markup, not the other way round.
                result = matchAgainst.has_key(markup)
            elif matchAgainst and isString(markup):
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        """Creates an empty result list and records `source`, the
        object that produced it."""
        # Initialize this list itself, not a throwaway one.
        list.__init__(self)
        self.source = source
968 # Now, some helper functions.
def isList(l):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is listlike.

    Anything that has __iter__ and is not stringlike counts, as does
    anything whose exact type is list or tuple."""
    if hasattr(l, '__iter__') and not isString(l):
        return True
    return type(l) in (types.ListType, types.TupleType)
def isString(s):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is stringlike."""
    try:
        if isinstance(s, unicode):
            return True
        return isinstance(s, basestring)
    except NameError:
        # One of the names above is not defined by this interpreter;
        # fall back to the plain string type.
        return isinstance(s, str)
def buildTagMap(default, *args):
    """Merge any number of maps, lists, or scalars into one dictionary.

    Maps are merged as-is; every item of a list, and every scalar, is
    mapped to `default`. Used to build the SELF_CLOSING_TAGS,
    NESTABLE_TAGS, and NESTING_RESET_TAGS maps out of lists and partial
    maps."""
    tagMap = {}
    for portion in args:
        if hasattr(portion, 'items'):
            #It's a map. Merge it.
            tagMap.update(portion.items())
        elif isList(portion) and not isString(portion):
            #It's a list. Map each item to the default.
            tagMap.update(dict.fromkeys(portion, default))
        else:
            #It's a scalar. Map it to the default.
            tagMap[portion] = default
    return tagMap
1003 # Now, the parser classes.
class HTMLParserBuilder(HTMLParser):
    """Bridges the standard library HTMLParser to a soup object.

    Each parser event (start tag, end tag, text, comment, entity, ...)
    is translated into a call on `self.soup`, which builds the tree."""

    def __init__(self, soup):
        HTMLParser.__init__(self)
        self.soup = soup

    # We inherit feed() and reset().

    def handle_starttag(self, name, attrs):
        # META tags get special treatment because they may declare the
        # document's character set.
        if name == 'meta':
            self.soup.extractCharsetFromMeta(attrs)
        else:
            self.soup.unknown_starttag(name, attrs)

    def handle_endtag(self, name):
        self.soup.unknown_endtag(name)

    def handle_data(self, content):
        self.soup.handle_data(content)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        # Flush pending text first so `text` ends up in a node of its own.
        self.soup.endData()
        self.handle_data(text)
        self.soup.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        """Handle character references as data.

        `ref` is the text between '&#' and ';': decimal digits, or
        'x'/'X' followed by hexadecimal digits. When the soup converts
        entities the reference becomes the corresponding character;
        otherwise, or when the reference does not denote a valid code
        point, it is passed through as the literal '&#...;' text."""
        literal = '&#%s;' % ref
        if self.soup.convertEntities:
            try:
                if ref[:1] in ('x', 'X'):
                    codepoint = int(ref[1:], 16)
                else:
                    codepoint = int(ref)
                data = unichr(codepoint)
            except (ValueError, OverflowError):
                # Malformed or out-of-range reference: keep it verbatim
                # rather than aborting the whole parse.
                data = literal
        else:
            data = literal
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.soup.convertHTMLEntities:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass

        if not data and self.soup.convertXMLEntities:
            data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.soup.convertHTMLEntities and \
            not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
            # TODO: We've got a problem here. We're told this is
            # an entity reference, but it's not an XML entity
            # reference or an HTML entity reference. Nonetheless,
            # the logical thing to do is to pass it through as an
            # unrecognized entity reference.
            #
            # Except: when the input is "&carol;" this function
            # will be called with input "carol". When the input is
            # "AT&T", this function will be called with input
            # "T". We have no way of knowing whether a semicolon
            # was present originally, so we don't know whether
            # this is an unknown entity or just a misplaced
            # ampersand.
            #
            # The more common case is a misplaced ampersand, so I
            # escape the ampersand and omit the trailing semicolon.
            data = "&amp;%s" % ref
        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object.

        `i` is the index in self.rawdata where the declaration starts;
        the index at which parsing should resume is returned."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # Unterminated section: everything left is CDATA.
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            j = k+3
            self._toStringSubclass(data, CData)
        else:
            try:
                j = HTMLParser.parse_declaration(self, i)
            except HTMLParseError:
                # The declaration is malformed: hand the remainder of
                # the buffer over as plain text.
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j
class BeautifulStoneSoup(Tag):

    """The basic parser and search code. The only thing this parser
    knows about tag behavior is the following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    Use this class for XML or made-up markup languages, or when
    BeautifulSoup makes an assumption counter to what you were
    expecting."""

    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}
    PRESERVE_WHITESPACE_TAGS = []

    # (regexp, replacement function) pairs applied to the markup before
    # it is handed to the parser; see __init__.
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    ROOT_TAG_NAME = u'[document]'

    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"
    # TODO: This only exists for backwards-compatibility
    ALL_ENTITIES = XHTML_ENTITIES

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None, isHTML=False,
                 builder=HTMLParserBuilder):
        """Set this object up as the 'root tag' and feed the given
        markup (a string or a file-like object) to the underlying
        parser.

        HTMLParser copes with most bad HTML, and the BeautifulSoup
        class has some tricks for HTML that kills HTMLParser, but
        Beautiful Soup can still choke or lose data if the document
        uses self-closing tags or declarations incorrectly.

        By default the input is sanitized with regexes, which avoids
        the vast majority of these problems. If they don't apply to
        you, pass False for markupMassage to get better performance.

        The default massage fixes the two most common instances of
        invalid HTML that choke HTMLParser:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        Pass your own list of (RE object, replace method) tuples as
        markupMassage to scrub the input the way you want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        # Set the rules for how we'll deal with the entities we
        # encounter
        if not self.convertEntities:
            self.convertXMLEntities = False
            self.convertHTMLEntities = False
            self.escapeUnrecognizedEntities = False
        else:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
            if convertEntities == self.HTML_ENTITIES:
                (self.convertXMLEntities, self.convertHTMLEntities,
                 self.escapeUnrecognizedEntities) = (False, True, True)
            elif convertEntities == self.XHTML_ENTITIES:
                (self.convertXMLEntities, self.convertHTMLEntities,
                 self.escapeUnrecognizedEntities) = (True, True, False)
            elif convertEntities == self.XML_ENTITIES:
                (self.convertXMLEntities, self.convertHTMLEntities,
                 self.escapeUnrecognizedEntities) = (True, False, False)

        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        self.builder = builder(self)
        self.reset()

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed(isHTML=isHTML)
        except StopParsing:
            pass
        self.markup = None                 # The markup can now be GCed.
        self.builder = None                # So can the builder.

    def _feed(self, inDocumentEncoding=None, isHTML=False):
        """Convert self.markup to Unicode, massage it, and run it
        through the builder, closing whatever is left open at the end."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            encodingsToTry = [self.fromEncoding, inDocumentEncoding]
            dammit = UnicodeDammit(markup, encodingsToTry,
                                   smartQuotesTo=self.smartQuotesTo,
                                   isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if markup and self.markupMassage:
            if not isList(self.markupMassage):
                self.markupMassage = self.MARKUP_MASSAGE
            for pattern, replacement in self.markupMassage:
                markup = pattern.sub(replacement, markup)
            # TODO: We get rid of markupMassage so that the
            # soup object can be deepcopied later on. Some
            # Python installations can't copy regexes. If anyone
            # was relying on the existence of markupMassage, this
            # might cause problems.
            del(self.markupMassage)
        self.builder.reset()

        self.builder.feed(markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        return (name in self.SELF_CLOSING_TAGS
                or name in self.instanceSelfClosingTags)

    def reset(self):
        """Re-initialize this object as an empty root tag and clear all
        parsing state."""
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        """Remove the innermost tag from the stack and return the tag
        that is current afterwards."""
        self.tagStack.pop()
        # Tags with just one string-owning child get the child as a
        # 'string' property, so that soup.tag.string is shorthand for
        # soup.tag.contents[0]
        closing = self.currentTag
        if len(closing.contents) == 1 and \
           isinstance(closing.contents[0], NavigableString):
            closing.string = closing.contents[0]

        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Append `tag` to the current tag's contents and make it the
        current tag."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Turn the text collected so far into one `containerClass`
        node attached to the current tag."""
        if not self.currentData:
            return
        currentData = u''.join(self.currentData)
        if currentData.translate(self.STRIP_ASCII_SPACES) == '':
            openTagNames = set([tag.name for tag in self.tagStack])
            if not openTagNames.intersection(self.PRESERVE_WHITESPACE_TAGS):
                # All-whitespace text outside whitespace-preserving
                # tags collapses to one newline or one space.
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
        self.currentData = []
        if self.parseOnlyThese and len(self.tagStack) <= 1 and \
               (not self.parseOnlyThese.text or \
                not self.parseOnlyThese.search(currentData)):
            return
        node = containerClass(currentData)
        node.setup(self.currentTag, self.previous)
        if self.previous:
            self.previous.next = node
        self.previous = node
        self.currentTag.contents.append(node)

    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            return

        numPops = 0
        # Search from the innermost tag outwards; index 0 (the root) is
        # never considered.
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        mostRecentTag = None
        for _ in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers is not None
        isResetNesting = name in self.RESET_NESTING_TAGS
        popTo = None
        inclusive = True
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if not isNestable and (not p or p.name == name):
                #Non-nestable tags get popped to the top or to their
                #last occurance.
                popTo = name
                break
            if isNestable:
                resetsNesting = p.name in nestingResetTriggers
            else:
                resetsNesting = (isResetNesting
                                 and p.name in self.RESET_NESTING_TAGS)
            if resetsNesting:
                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
            # Kept from the original; p is rebound at the top of the
            # next iteration, so this assignment is never read.
            p = p.parent
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        """Handle a start tag; returns the new Tag, or None when the tag
        was turned into text or filtered out."""
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        """Handle an end tag, closing every tag it encloses."""
        #print "End tag %s" % name
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        """Buffer a chunk of text until endData() is called."""
        self.currentData.append(data)

    def extractCharsetFromMeta(self, attrs):
        """Treat a META tag like any other start tag."""
        self.unknown_starttag('meta', attrs)
class BeautifulSoup(BeautifulStoneSoup):

    """A parser that knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurance of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurance
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ['br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base'])

    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center']

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see
    # extractCharsetFromMeta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)

    def __init__(self, *args, **kwargs):
        """Accepts the same arguments as BeautifulStoneSoup, except
        that smartQuotesTo defaults to HTML entities and isHTML is
        always forced to True."""
        kwargs.setdefault('smartQuotesTo', self.HTML_ENTITIES)
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    def extractCharsetFromMeta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        for index, (key, value) in enumerate(attrs):
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = index

        charsetMatch = None
        if httpEquiv and contentType: # It's an interesting meta tag.
            charsetMatch = self.CHARSET_RE.search(contentType)
        if charsetMatch:
            if (self.declaredHTMLEncoding is not None or
                self.originalEncoding == self.fromEncoding):
                # An HTML encoding was sniffed while converting
                # the document to Unicode, or an HTML encoding was
                # sniffed during a previous pass through the
                # document, or an encoding was specified
                # explicitly and it worked. Rewrite the meta tag.
                newAttr = self.CHARSET_RE.sub(
                    lambda m: m.group(1) + "%SOUP-ENCODING%", contentType)
                attrName = attrs[contentTypeIndex][0]
                attrs[contentTypeIndex] = (attrName, newAttr)
                tagNeedsEncodingSubstitution = True
            else:
                # This is our first pass through the document.
                # Go through it again with the encoding information.
                newCharset = charsetMatch.group(3)
                if newCharset and newCharset != self.originalEncoding:
                    self.declaredHTMLEncoding = newCharset
                    self._feed(self.declaredHTMLEncoding)
                    raise StopParsing
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
class StopParsing(Exception):
    """Raised to abandon the parse pass currently in progress (for
    instance after the document has been re-fed with a newly discovered
    encoding); BeautifulStoneSoup.__init__ catches and ignores it."""
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """A BeautifulSoup variant that treats more tags as nestable.

    The BeautifulSoup class is oriented towards skipping over common
    HTML errors like unclosed tags, but sometimes it makes errors of
    its own. Consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # Inline tags that are additionally treated as nestable.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big']

    # Block tags that are additionally treated as nestable.
    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    # Everything BeautifulSoup nests, plus the two lists above.
    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
class MinimalSoup(BeautifulSoup):
    """A parser for HTML that contains pathologically bad markup.

    It makes no assumptions about tag nesting, but it does know which
    tags are self-closing, that <script> tags contain Javascript and
    should not be parsed, that META tags may contain encoding
    information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # No tag is considered nestable.
    NESTABLE_TAGS = {}

    # NOTE(review): 'noscript' is passed as buildTagMap's `default`
    # argument with nothing to map, so this evaluates to an empty map --
    # confirm whether buildTagMap(None, 'noscript') was intended.
    RESET_NESTING_TAGS = buildTagMap('noscript')
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Copy a closing single-string tag into its parent as an
        attribute (unless the parent already has an attribute of that
        name), then pop as usual.

        Returns whatever BeautifulStoneSoup.popTag returns, so callers
        such as _popToTag see the same result as with the base class."""
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            # Makes sure parent.attrMap exists before it is consulted.
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        return BeautifulStoneSoup.popTag(self)
1684 #Enterprise class names! It has come to our attention that some people
1685 #think the names of the Beautiful Soup parser classes are too silly
1686 #and "unprofessional" for use in enterprise screen-scraping. We feel
1687 #your pain! For such-minded folk, the Beautiful Soup Consortium And
1688 #All-Night Kosher Bakery recommends renaming this file to
1689 #"RobustParser.py" (or, in cases of extreme enterprisiness,
1690 #"RobustParserBeanInterface.class") and using the following
1691 #enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup."""


class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup."""


class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup."""


class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly alias for MinimalSoup."""


class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP."""
1703 ######################################################
1705 # Bonus library: Unicode, Dammit
1707 # This class forces XML data into a standard format (usually to UTF-8
1708 # or Unicode). It is heavily based on code from Mark Pilgrim's
1709 # Universal Feed Parser. It does not rewrite the XML or HTML to
# reflect a new encoding: that happens in HTMLParserBuilder.handle_pi
# (XML) and BeautifulSoup.extractCharsetFromMeta (HTML).
1713 # Autodetects character encodings.
1714 # Download from http://chardet.feedparser.org/
1715 try:
1716 import chardet
1717 # import chardet.constants
1718 # chardet.constants._debug = 1
1719 except ImportError:
1720 chardet = None
1722 # cjkcodecs and iconv_codec make Python know about more character encodings.
1723 # Both are available from http://cjkpython.i18n.org/
1724 # They're built in if you use Python 2.4.
1725 try:
1726 import cjkcodecs.aliases
1727 except ImportError:
1728 pass
1729 try:
1730 import iconv_codec
1731 except ImportError:
1732 pass
1734 class UnicodeDammit:
1735 """A class for detecting the encoding of a *ML document and
1736 converting it to a Unicode string. If the source encoding is
1737 windows-1252, can replace MS smart quotes with their HTML or XML
1738 equivalents."""
1740 # This dictionary maps commonly seen values for "charset" in HTML
1741 # meta tags to the corresponding Python codec names. It only covers
1742 # values that aren't in Python's aliases and can't be determined
1743 # by the heuristics in find_codec.
1744 CHARSET_ALIASES = { "macintosh" : "mac-roman",
1745 "x-sjis" : "shift-jis" }
def __init__(self, markup, overrideEncodings=[],
             smartQuotesTo='xml', isHTML=False):
    """Convert `markup` to Unicode, trying candidate encodings in order.

    The order is: `overrideEncodings`, the encoding declared in the
    document, the encoding sniffed from its first bytes, chardet's
    guess (when chardet is installed), and finally utf-8 and
    windows-1252. The result is stored in `self.unicode` (None if every
    attempt failed) and the codec that worked in
    `self.originalEncoding`."""
    self.declaredHTMLEncoding = None
    (self.markup, documentEncoding,
     sniffedEncoding) = self._detectEncoding(markup, isHTML)
    self.smartQuotesTo = smartQuotesTo
    self.triedEncodings = []
    if markup == '' or isinstance(markup, unicode):
        # Nothing to decode.
        self.originalEncoding = None
        self.unicode = unicode(markup)
        return

    converted = None
    candidates = list(overrideEncodings) + [documentEncoding,
                                            sniffedEncoding]
    for candidate in candidates:
        converted = self._convertFrom(candidate)
        if converted:
            break

    # If no luck and we have auto-detection library, try that:
    if not converted and chardet and not isinstance(self.markup, unicode):
        guess = chardet.detect(self.markup)['encoding']
        converted = self._convertFrom(guess)

    # As a last resort, try utf-8 and windows-1252:
    if not converted:
        for fallback in ("utf-8", "windows-1252"):
            converted = self._convertFrom(fallback)
            if converted:
                break

    self.unicode = converted
    if not converted:
        self.originalEncoding = None
1781 def _subMSChar(self, match):
1782 """Changes a MS smart quote character to an XML or HTML
1783 entity."""
1784 orig = match.group(1)
1785 sub = self.MS_CHARS.get(orig)
1786 if type(sub) == types.TupleType:
1787 if self.smartQuotesTo == 'xml':
1788 sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
1789 else:
1790 sub = '&'.encode() + sub[0].encode() + ';'.encode()
1791 else:
1792 sub = sub.encode()
1793 return sub
1795 def _convertFrom(self, proposed):
1796 proposed = self.find_codec(proposed)
1797 if not proposed or proposed in self.triedEncodings:
1798 return None
1799 self.triedEncodings.append(proposed)
1800 markup = self.markup
1802 # Convert smart quotes to HTML if coming from an encoding
1803 # that might have them.
1804 if self.smartQuotesTo and proposed.lower() in("windows-1252",
1805 "iso-8859-1",
1806 "iso-8859-2"):
1807 smart_quotes_re = "([\x80-\x9f])"
1808 smart_quotes_compiled = re.compile(smart_quotes_re)
1809 markup = smart_quotes_compiled.sub(self._subMSChar, markup)
1811 try:
1812 # print "Trying to convert document to %s" % proposed
1813 u = self._toUnicode(markup, proposed)
1814 self.markup = u
1815 self.originalEncoding = proposed
1816 except Exception, e:
1817 # print "That didn't work!"
1818 # print e
1819 return None
1820 #print "Correct encoding: %s" % proposed
1821 return self.markup
1823 def _toUnicode(self, data, encoding):
1824 '''Given a string and its encoding, decodes the string into Unicode.
1825 %encoding is a string recognized by encodings.aliases'''
1827 # strip Byte Order Mark (if present)
1828 if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
1829 and (data[2:4] != '\x00\x00'):
1830 encoding = 'utf-16be'
1831 data = data[2:]
1832 elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
1833 and (data[2:4] != '\x00\x00'):
1834 encoding = 'utf-16le'
1835 data = data[2:]
1836 elif data[:3] == '\xef\xbb\xbf':
1837 encoding = 'utf-8'
1838 data = data[3:]
1839 elif data[:4] == '\x00\x00\xfe\xff':
1840 encoding = 'utf-32be'
1841 data = data[4:]
1842 elif data[:4] == '\xff\xfe\x00\x00':
1843 encoding = 'utf-32le'
1844 data = data[4:]
1845 newdata = unicode(data, encoding)
1846 return newdata
1848 def _detectEncoding(self, xml_data, isHTML=False):
1849 """Given a document, tries to detect its XML encoding."""
1850 xml_encoding = sniffed_xml_encoding = None
1851 try:
1852 if xml_data[:4] == '\x4c\x6f\xa7\x94':
1853 # EBCDIC
1854 xml_data = self._ebcdic_to_ascii(xml_data)
1855 elif xml_data[:4] == '\x00\x3c\x00\x3f':
1856 # UTF-16BE
1857 sniffed_xml_encoding = 'utf-16be'
1858 xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
1859 elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
1860 and (xml_data[2:4] != '\x00\x00'):
1861 # UTF-16BE with BOM
1862 sniffed_xml_encoding = 'utf-16be'
1863 xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
1864 elif xml_data[:4] == '\x3c\x00\x3f\x00':
1865 # UTF-16LE
1866 sniffed_xml_encoding = 'utf-16le'
1867 xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
1868 elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
1869 (xml_data[2:4] != '\x00\x00'):
1870 # UTF-16LE with BOM
1871 sniffed_xml_encoding = 'utf-16le'
1872 xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
1873 elif xml_data[:4] == '\x00\x00\x00\x3c':
1874 # UTF-32BE
1875 sniffed_xml_encoding = 'utf-32be'
1876 xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
1877 elif xml_data[:4] == '\x3c\x00\x00\x00':
1878 # UTF-32LE
1879 sniffed_xml_encoding = 'utf-32le'
1880 xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
1881 elif xml_data[:4] == '\x00\x00\xfe\xff':
1882 # UTF-32BE with BOM
1883 sniffed_xml_encoding = 'utf-32be'
1884 xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
1885 elif xml_data[:4] == '\xff\xfe\x00\x00':
1886 # UTF-32LE with BOM
1887 sniffed_xml_encoding = 'utf-32le'
1888 xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
1889 elif xml_data[:3] == '\xef\xbb\xbf':
1890 # UTF-8 with BOM
1891 sniffed_xml_encoding = 'utf-8'
1892 xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
1893 else:
1894 sniffed_xml_encoding = 'ascii'
1895 pass
1896 except:
1897 xml_encoding_match = None
1898 xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode()
1899 xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
1900 if not xml_encoding_match and isHTML:
1901 meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
1902 regexp = re.compile(meta_re, re.I)
1903 xml_encoding_match = regexp.search(xml_data)
1904 if xml_encoding_match is not None:
1905 xml_encoding = xml_encoding_match.groups()[0].decode(
1906 'ascii').lower()
1907 if isHTML:
1908 self.declaredHTMLEncoding = xml_encoding
1909 if sniffed_xml_encoding and \
1910 (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
1911 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
1912 'utf-16', 'utf-32', 'utf_16', 'utf_32',
1913 'utf16', 'u16')):
1914 xml_encoding = sniffed_xml_encoding
1915 return xml_data, xml_encoding, sniffed_xml_encoding
def find_codec(self, charset):
    """Return a codec name that Python knows for `charset`.

    Tries, in order, the CHARSET_ALIASES entry for `charset` (or
    `charset` itself when it has no alias), `charset` with its hyphens
    removed, and `charset` with its hyphens turned into underscores.
    The first candidate accepted by _codec is returned; if none is,
    `charset` is handed back unchanged."""
    known = self._codec(self.CHARSET_ALIASES.get(charset, charset))
    if known:
        return known
    if charset:
        for hyphen_substitute in ("", "_"):
            known = self._codec(charset.replace("-", hyphen_substitute))
            if known:
                return known
    return charset
1924 def _codec(self, charset):
1925 if not charset: return charset
1926 codec = None
1927 try:
1928 codecs.lookup(charset)
1929 codec = charset
1930 except (LookupError, ValueError):
1931 pass
1932 return codec
1934 EBCDIC_TO_ASCII_MAP = None
1935 def _ebcdic_to_ascii(self, s):
1936 c = self.__class__
1937 if not c.EBCDIC_TO_ASCII_MAP:
1938 emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1939 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1940 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1941 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1942 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1943 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1944 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1945 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1946 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1947 201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1948 206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1949 211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1950 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1951 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1952 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1953 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1954 250,251,252,253,254,255)
1955 import string
1956 c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
1957 ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
1958 return s.translate(c.EBCDIC_TO_ASCII_MAP)
# Replacements for the bytes 0x80-0x9f as Windows-1252 defines them.
# A tuple value is (entity name, hexadecimal Unicode code point) for the
# character; a plain string value is a literal stand-in for a byte that
# Windows-1252 leaves undefined. Presumably consumed by the smart-quote
# conversion elsewhere in this class -- verify against the caller.
MS_CHARS = { '\x80' : ('euro', '20AC'),
             '\x81' : ' ',
             '\x82' : ('sbquo', '201A'),
             '\x83' : ('fnof', '192'),
             '\x84' : ('bdquo', '201E'),
             '\x85' : ('hellip', '2026'),
             '\x86' : ('dagger', '2020'),
             '\x87' : ('Dagger', '2021'),
             '\x88' : ('circ', '2C6'),
             '\x89' : ('permil', '2030'),
             '\x8A' : ('Scaron', '160'),
             '\x8B' : ('lsaquo', '2039'),
             '\x8C' : ('OElig', '152'),
             '\x8D' : '?',
             '\x8E' : ('#x17D', '17D'),
             '\x8F' : '?',
             '\x90' : '?',
             '\x91' : ('lsquo', '2018'),
             '\x92' : ('rsquo', '2019'),
             '\x93' : ('ldquo', '201C'),
             '\x94' : ('rdquo', '201D'),
             '\x95' : ('bull', '2022'),
             '\x96' : ('ndash', '2013'),
             '\x97' : ('mdash', '2014'),
             '\x98' : ('tilde', '2DC'),
             '\x99' : ('trade', '2122'),
             '\x9a' : ('scaron', '161'),
             '\x9b' : ('rsaquo', '203A'),
             '\x9c' : ('oelig', '153'),
             '\x9d' : '?',
             '\x9e' : ('#x17E', '17E'),
             # 0x9f is LATIN CAPITAL LETTER Y WITH DIAERESIS, U+0178.
             '\x9f' : ('Yuml', '178'),}
1993 #######################################################################
#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    import sys
    # Parse whatever arrives on standard input and write the
    # pretty-printed document to standard output.
    print(BeautifulSoup(sys.stdin).prettify())