1 """Universal feed parser
3 Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
5 Visit http://feedparser.org/ for the latest version
6 Visit http://feedparser.org/docs/ for the latest documentation
8 Required: Python 2.1 or later
9 Recommended: Python 2.3 or later
10 Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
11 """

__version__ = "4.1" # + "$Revision$"[11:15] + "-cvs"
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE."""
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
                    "John Beimler <http://john.beimler.org/>",
                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
                    "Aaron Swartz <http://aaronsw.com/>",
                    "Kevin Marks <http://epeus.blogspot.com/>"]
_debug = 0

# HTTP "User-Agent" header to send to servers when downloading feeds.
# If you are embedding feedparser in a larger application, you should
# change this to your application name and URL.
USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__

# HTTP "Accept" header to send to servers when downloading feeds. If you don't
# want to send an Accept header, set this to None.
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"

# List of preferred XML parsers, by SAX driver name. These will be tried first,
# but if they're not installed, Python will keep searching through its own list
# of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]

# If you want feedparser to automatically run HTML markup through HTML Tidy, set
# this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
# or utidylib <http://utidylib.berlios.de/>.
TIDY_MARKUP = 0

# List of Python interfaces for HTML Tidy, in order of preference. Only useful
# if TIDY_MARKUP = 1
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]

# ---------- required modules (should come with any Python distribution) ----------
import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
try:
    from cStringIO import StringIO as _StringIO
except:
    from StringIO import StringIO as _StringIO

# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------

# gzip is included with most Python distributions, but may not be available if you compiled your own
try:
    import gzip
except:
    gzip = None
try:
    import zlib
except:
    zlib = None

# If a real XML parser is available, feedparser will attempt to use it. feedparser has
# been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the
# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
try:
    import xml.sax
    xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
    from xml.sax.saxutils import escape as _xmlescape
    _XML_AVAILABLE = 1
except:
    _XML_AVAILABLE = 0
    def _xmlescape(data):
        data = data.replace('&', '&amp;')
        data = data.replace('>', '&gt;')
        data = data.replace('<', '&lt;')
        return data

# base64 support for Atom feeds that contain embedded binary data
try:
    import base64, binascii
except:
    base64 = binascii = None

# cjkcodecs and iconv_codec provide support for more character encodings.
# Both are available from http://cjkpython.i18n.org/
try:
    import cjkcodecs.aliases
except:
    pass
try:
    import iconv_codec
except:
    pass

# chardet library auto-detects character encodings
# Download from http://chardet.feedparser.org/
try:
    import chardet
    if _debug:
        import chardet.constants
        chardet.constants._debug = 1
except:
    chardet = None

# ---------- don't touch these ----------
class ThingsNobodyCaresAboutButMe(Exception): pass
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
class UndeclaredNamespace(Exception): pass
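
# The next three assignments relax sgmllib's lexing patterns for the loose
# (non-XML) parsing path: tag names may contain ':' so namespaced tags like
# <dc:date> are recognized, any '<!' is treated as a declaration, and
# character references may be hexadecimal (e.g. &#x26;), all of which
# sgmllib's stock patterns are stricter about.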
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
sgmllib.special = re.compile('<!')
sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')

SUPPORTED_VERSIONS = {'': 'unknown',
                      'rss090': 'RSS 0.90',
                      'rss091n': 'RSS 0.91 (Netscape)',
                      'rss091u': 'RSS 0.91 (Userland)',
                      'rss092': 'RSS 0.92',
                      'rss093': 'RSS 0.93',
                      'rss094': 'RSS 0.94',
                      'rss20': 'RSS 2.0',
                      'rss10': 'RSS 1.0',
                      'rss': 'RSS (unknown version)',
                      'atom01': 'Atom 0.1',
                      'atom02': 'Atom 0.2',
                      'atom03': 'Atom 0.3',
                      'atom10': 'Atom 1.0',
                      'atom': 'Atom (unknown version)',
                      'cdf': 'CDF',
                      'hotrss': 'Hot RSS'
                      }

try:
    UserDict = dict
except NameError:
    # Python 2.1 does not have dict
    from UserDict import UserDict
    def dict(aList):
        rc = {}
        for k, v in aList:
            rc[k] = v
        return rc

class FeedParserDict(UserDict):
    keymap = {'channel': 'feed',
              'items': 'entries',
              'guid': 'id',
              'date': 'updated',
              'date_parsed': 'updated_parsed',
              'description': ['subtitle', 'summary'],
              'url': ['href'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}
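    # The keymap above lets legacy keys alias current ones: reading
    # d['modified'] returns the value stored under 'updated', and writing
    # d['url'] stores the value under 'href' (illustrative keys from the map).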
    def __getitem__(self, key):
        if key == 'category':
            return UserDict.__getitem__(self, 'tags')[0]['term']
        if key == 'categories':
            return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
        realkey = self.keymap.get(key, key)
        if type(realkey) == types.ListType:
            for k in realkey:
                if UserDict.has_key(self, k):
                    return UserDict.__getitem__(self, k)
        if UserDict.has_key(self, key):
            return UserDict.__getitem__(self, key)
        return UserDict.__getitem__(self, realkey)

    def __setitem__(self, key, value):
        for k in self.keymap.keys():
            if key == k:
                key = self.keymap[k]
                if type(key) == types.ListType:
                    key = key[0]
        return UserDict.__setitem__(self, key, value)

    def get(self, key, default=None):
        if self.has_key(key):
            return self[key]
        else:
            return default

    def setdefault(self, key, value):
        if not self.has_key(key):
            self[key] = value
        return self[key]

    def has_key(self, key):
        try:
            return hasattr(self, key) or UserDict.has_key(self, key)
        except AttributeError:
            return False

    def __getattr__(self, key):
        try:
            return self.__dict__[key]
        except KeyError:
            pass
        try:
            assert not key.startswith('_')
            return self.__getitem__(key)
        except:
            raise AttributeError, "object has no attribute '%s'" % key

    def __setattr__(self, key, value):
        if key.startswith('_') or key == 'data':
            self.__dict__[key] = value
        else:
            return self.__setitem__(key, value)

    def __contains__(self, key):
        return self.has_key(key)
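
# Zope cannot securely use a dict subclass with custom attribute access, so
# calling zopeCompatibilityHack() replaces FeedParserDict with a factory that
# returns plain dictionaries, at the cost of the key aliasing above.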
def zopeCompatibilityHack():
    global FeedParserDict
    del FeedParserDict
    def FeedParserDict(aDict=None):
        rc = {}
        if aDict:
            rc.update(aDict)
        return rc
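
# _ebcdic_to_ascii translates EBCDIC-encoded feeds byte-for-byte to ASCII
# using the 256-entry table below, built once and cached; e.g. EBCDIC 0x81
# maps to 97, ASCII 'a'.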
_ebcdic_to_ascii_map = None
def _ebcdic_to_ascii(s):
    global _ebcdic_to_ascii_map
    if not _ebcdic_to_ascii_map:
        emap = (
            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
            )
        import string
        _ebcdic_to_ascii_map = string.maketrans( \
            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
    return s.translate(_ebcdic_to_ascii_map)

_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
def _urljoin(base, uri):
    uri = _urifixer.sub(r'\1\3', uri)
    return urlparse.urljoin(base, uri)
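
# _urifixer collapses stray slashes after the scheme before joining, so a
# malformed URI like 'http:////example.com/feed' is repaired to
# 'http://example.com/feed' (illustrative value).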

class _FeedParserMixin:
    namespaces = {'': '',
                  'http://backend.userland.com/rss': '',
                  'http://blogs.law.harvard.edu/tech/rss': '',
                  'http://purl.org/rss/1.0/': '',
                  'http://my.netscape.com/rdf/simple/0.9/': '',
                  'http://example.com/newformat#': '',
                  'http://example.com/necho': '',
                  'http://purl.org/echo/': '',
                  'uri/of/echo/namespace#': '',
                  'http://purl.org/pie/': '',
                  'http://purl.org/atom/ns#': '',
                  'http://www.w3.org/2005/Atom': '',
                  'http://purl.org/rss/1.0/modules/rss091#': '',

                  'http://webns.net/mvcb/': 'admin',
                  'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
                  'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
                  'http://media.tangent.org/rss/1.0/': 'audio',
                  'http://backend.userland.com/blogChannelModule': 'blogChannel',
                  'http://web.resource.org/cc/': 'cc',
                  'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
                  'http://purl.org/rss/1.0/modules/company': 'co',
                  'http://purl.org/rss/1.0/modules/content/': 'content',
                  'http://my.theinfo.org/changed/1.0/rss/': 'cp',
                  'http://purl.org/dc/elements/1.1/': 'dc',
                  'http://purl.org/dc/terms/': 'dcterms',
                  'http://purl.org/rss/1.0/modules/email/': 'email',
                  'http://purl.org/rss/1.0/modules/event/': 'ev',
                  'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
                  'http://freshmeat.net/rss/fm/': 'fm',
                  'http://xmlns.com/foaf/0.1/': 'foaf',
                  'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
                  'http://postneo.com/icbm/': 'icbm',
                  'http://purl.org/rss/1.0/modules/image/': 'image',
                  'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
                  'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
                  'http://purl.org/rss/1.0/modules/link/': 'l',
                  'http://search.yahoo.com/mrss': 'media',
                  'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
                  'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
                  'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
                  'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
                  'http://purl.org/rss/1.0/modules/reference/': 'ref',
                  'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
                  'http://purl.org/rss/1.0/modules/search/': 'search',
                  'http://purl.org/rss/1.0/modules/slash/': 'slash',
                  'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
                  'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
                  'http://hacks.benhammersley.com/rss/streaming/': 'str',
                  'http://purl.org/rss/1.0/modules/subscription/': 'sub',
                  'http://purl.org/rss/1.0/modules/syndication/': 'sy',
                  'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
                  'http://purl.org/rss/1.0/modules/threading/': 'thr',
                  'http://purl.org/rss/1.0/modules/textinput/': 'ti',
                  'http://madskills.com/public/xml/rss/module/trackback/': 'trackback',
                  'http://wellformedweb.org/commentAPI/': 'wfw',
                  'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
                  'http://www.w3.org/1999/xhtml': 'xhtml',
                  'http://www.w3.org/XML/1998/namespace': 'xml',
                  'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf'
                  }
    _matchnamespaces = {}

    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
    can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    html_types = ['text/html', 'application/xhtml+xml']

    def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
        if _debug: sys.stderr.write('initializing FeedParser\n')
        if not self._matchnamespaces:
            for k, v in self.namespaces.items():
                self._matchnamespaces[k.lower()] = v
        self.feeddata = FeedParserDict() # feed-level data
        self.encoding = encoding # character encoding
        self.entries = [] # list of entry-level data
        self.version = '' # feed type/version, see SUPPORTED_VERSIONS
        self.namespacesInUse = {} # dictionary of namespaces defined by the feed

        # the following are used internally to track state;
        # this is really out of control and should be refactored
        self.infeed = 0
        self.inentry = 0
        self.incontent = 0
        self.intextinput = 0
        self.inimage = 0
        self.inauthor = 0
        self.incontributor = 0
        self.inpublisher = 0
        self.insource = 0
        self.sourcedata = FeedParserDict()
        self.contentparams = FeedParserDict()
        self._summaryKey = None
        self.namespacemap = {}
        self.elementstack = []
        self.basestack = []
        self.langstack = []
        self.baseuri = baseuri or ''
        self.lang = baselang or None
        if baselang:
            self.feeddata['language'] = baselang

    def unknown_starttag(self, tag, attrs):
        if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
        # normalize attrs
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]

        # track xml:base and xml:lang
        attrsD = dict(attrs)
        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
        self.baseuri = _urljoin(self.baseuri, baseuri)
        lang = attrsD.get('xml:lang', attrsD.get('lang'))
        if lang == '':
            # xml:lang could be explicitly set to '', we need to capture that
            lang = None
        elif lang is None:
            # if no xml:lang is specified, use parent lang
            lang = self.lang
        if lang:
            if tag in ('feed', 'rss', 'rdf:RDF'):
                self.feeddata['language'] = lang
        self.lang = lang
        self.basestack.append(self.baseuri)
        self.langstack.append(lang)

        # track namespaces
        for prefix, uri in attrs:
            if prefix.startswith('xmlns:'):
                self.trackNamespace(prefix[6:], uri)
            elif prefix == 'xmlns':
                self.trackNamespace(None, uri)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            # Note: probably shouldn't simply recreate localname here, but
            # our namespace handling isn't actually 100% correct in cases where
            # the feed redefines the default namespace (which is actually
            # the usual case for inline content, thanks Sam), so here we
            # cheat and just reconstruct the element based on localname
            # because that compensates for the bugs in our namespace handling.
            # This will horribly munge inline content with non-empty qnames,
            # but nobody actually does that, so I'm not fixing it.
            tag = tag.split(':')[-1]
            return self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0)

        # match namespaces
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # special hack for better tracking of empty textinput/image elements in illformed feeds
        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
            self.intextinput = 0
        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
            self.inimage = 0

        # call special handler (if defined) or default handler
        methodname = '_start_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            return method(attrsD)
        except AttributeError:
            return self.push(prefix + suffix, 1)

    def unknown_endtag(self, tag):
        if _debug: sys.stderr.write('end %s\n' % tag)
        # match namespaces
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # call special handler (if defined) or default handler
        methodname = '_end_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            method()
        except AttributeError:
            self.pop(prefix + suffix)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            tag = tag.split(':')[-1]
            self.handle_data('</%s>' % tag, escape=0)

        # track xml:base and xml:lang going out of scope
        if self.basestack:
            self.basestack.pop()
            if self.basestack and self.basestack[-1]:
                self.baseuri = self.basestack[-1]
        if self.langstack:
            self.langstack.pop()
            if self.langstack: # and (self.langstack[-1] is not None):
                self.lang = self.langstack[-1]

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        if not self.elementstack: return
        ref = ref.lower()
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            text = '&#%s;' % ref
        else:
            if ref[0] == 'x':
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self.elementstack[-1][2].append(text)

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self.elementstack: return
        if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        else:
            # entity resolution graciously donated by Aaron Swartz
            def name2cp(k):
                import htmlentitydefs
                if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
                    return htmlentitydefs.name2codepoint[k]
                k = htmlentitydefs.entitydefs[k]
                if k.startswith('&#') and k.endswith(';'):
                    return int(k[2:-1]) # not in latin-1
                return ord(k)
            try: name2cp(ref)
            except KeyError: text = '&%s;' % ref
            else: text = unichr(name2cp(ref)).encode('utf-8')
        self.elementstack[-1][2].append(text)

    def handle_data(self, text, escape=1):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        if not self.elementstack: return
        if escape and self.contentparams.get('type') == 'application/xhtml+xml':
            text = _xmlescape(text)
        self.elementstack[-1][2].append(text)

    def handle_comment(self, text):
        # called for each comment, e.g. <!-- insert message here -->
        pass

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        pass

    def handle_decl(self, text):
        pass

    def parse_declaration(self, i):
        # override internal declaration handler to handle CDATA blocks
        if _debug: sys.stderr.write('entering parse_declaration\n')
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1: k = len(self.rawdata)
            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
            return k+3
        else:
            k = self.rawdata.find('>', i)
            return k+1

    def mapContentType(self, contentType):
        contentType = contentType.lower()
        if contentType == 'text':
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        return contentType
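
    # e.g. an Atom type="html" attribute becomes 'text/html' and type="xhtml"
    # becomes 'application/xhtml+xml'; full MIME types pass through unchanged.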

    def trackNamespace(self, prefix, uri):
        loweruri = uri.lower()
        if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
            self.version = 'rss090'
        if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
            self.version = 'rss10'
        if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
            self.version = 'atom10'
        if loweruri.find('backend.userland.com/rss') <> -1:
            # match any backend.userland.com namespace
            uri = 'http://backend.userland.com/rss'
            loweruri = uri
        if self._matchnamespaces.has_key(loweruri):
            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
        else:
            self.namespacesInUse[prefix or ''] = uri

    def resolveURI(self, uri):
        return _urljoin(self.baseuri or '', uri)

    def decodeEntities(self, element, data):
        return data

    def push(self, element, expectingText):
        self.elementstack.append([element, expectingText, []])

    def pop(self, element, stripWhitespace=1):
        if not self.elementstack: return
        if self.elementstack[-1][0] != element: return

        element, expectingText, pieces = self.elementstack.pop()
        output = ''.join(pieces)
        if stripWhitespace:
            output = output.strip()
        if not expectingText: return output

        # decode base64 content
        if base64 and self.contentparams.get('base64', 0):
            try:
                output = base64.decodestring(output)
            except binascii.Error:
                pass
            except binascii.Incomplete:
                pass

        # resolve relative URIs
        if (element in self.can_be_relative_uri) and output:
            output = self.resolveURI(output)

        # decode entities within embedded markup
        if not self.contentparams.get('base64', 0):
            output = self.decodeEntities(element, output)

        # remove temporary cruft from contentparams
        try:
            del self.contentparams['mode']
        except KeyError:
            pass
        try:
            del self.contentparams['base64']
        except KeyError:
            pass

        # resolve relative URIs within embedded markup
        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
            if element in self.can_contain_relative_uris:
                output = _resolveRelativeURIs(output, self.baseuri, self.encoding)

        # sanitize embedded markup
        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
            if element in self.can_contain_dangerous_markup:
                output = _sanitizeHTML(output, self.encoding)

        if self.encoding and type(output) != type(u''):
            try:
                output = unicode(output, self.encoding)
            except:
                pass

        # categories/tags/keywords/whatever are handled in _end_category
        if element == 'category':
            return output

        # store output in appropriate place(s)
        if self.inentry and not self.insource:
            if element == 'content':
                self.entries[-1].setdefault(element, [])
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element].append(contentparams)
            elif element == 'link':
                self.entries[-1][element] = output
                if output:
                    self.entries[-1]['links'][-1]['href'] = output
            else:
                if element == 'description':
                    element = 'summary'
                self.entries[-1][element] = output
                if self.incontent:
                    contentparams = copy.deepcopy(self.contentparams)
                    contentparams['value'] = output
                    self.entries[-1][element + '_detail'] = contentparams
        elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage):
            context = self._getContext()
            if element == 'description':
                element = 'subtitle'
            context[element] = output
            if element == 'link':
                context['links'][-1]['href'] = output
            elif self.incontent:
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                context[element + '_detail'] = contentparams
        return output

    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
        self.incontent += 1
        self.contentparams = FeedParserDict({
            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
            'language': self.lang,
            'base': self.baseuri})
        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
        self.push(tag, expectingText)

    def popContent(self, tag):
        value = self.pop(tag)
        self.incontent -= 1
        self.contentparams.clear()
        return value

    def _mapToStandardPrefix(self, name):
        colonpos = name.find(':')
        if colonpos <> -1:
            prefix = name[:colonpos]
            suffix = name[colonpos+1:]
            prefix = self.namespacemap.get(prefix, prefix)
            name = prefix + ':' + suffix
        return name

    def _getAttribute(self, attrsD, name):
        return attrsD.get(self._mapToStandardPrefix(name))

    def _isBase64(self, attrsD, contentparams):
        if attrsD.get('mode', '') == 'base64':
            return 1
        if self.contentparams['type'].startswith('text/'):
            return 0
        if self.contentparams['type'].endswith('+xml'):
            return 0
        if self.contentparams['type'].endswith('/xml'):
            return 0
        return 1
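
    # i.e. content is treated as base64 when mode="base64" is declared, or
    # when its declared type is a non-text, non-XML MIME type such as
    # 'image/png' (illustrative type).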

    def _itsAnHrefDamnIt(self, attrsD):
        href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
        if href:
            try:
                del attrsD['url']
            except KeyError:
                pass
            try:
                del attrsD['uri']
            except KeyError:
                pass
            attrsD['href'] = href
        return attrsD

    def _save(self, key, value):
        context = self._getContext()
        context.setdefault(key, value)

    def _start_rss(self, attrsD):
        versionmap = {'0.91': 'rss091u',
                      '0.92': 'rss092',
                      '0.93': 'rss093',
                      '0.94': 'rss094'}
        if not self.version:
            attr_version = attrsD.get('version', '')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            elif attr_version.startswith('2.'):
                self.version = 'rss20'
            else:
                self.version = 'rss'

    def _start_dlhottitles(self, attrsD):
        self.version = 'hotrss'

    def _start_channel(self, attrsD):
        self.infeed = 1
        self._cdf_common(attrsD)
    _start_feedinfo = _start_channel

    def _cdf_common(self, attrsD):
        if attrsD.has_key('lastmod'):
            self._start_modified({})
            self.elementstack[-1][-1] = attrsD['lastmod']
            self._end_modified()
        if attrsD.has_key('href'):
            self._start_link({})
            self.elementstack[-1][-1] = attrsD['href']
            self._end_link()

    def _start_feed(self, attrsD):
        self.infeed = 1
        versionmap = {'0.1': 'atom01',
                      '0.2': 'atom02',
                      '0.3': 'atom03'}
        if not self.version:
            attr_version = attrsD.get('version')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            else:
                self.version = 'atom'

    def _end_channel(self):
        self.infeed = 0
    _end_feed = _end_channel

    def _start_image(self, attrsD):
        self.inimage = 1
        self.push('image', 0)
        context = self._getContext()
        context.setdefault('image', FeedParserDict())

    def _end_image(self):
        self.pop('image')
        self.inimage = 0

    def _start_textinput(self, attrsD):
        self.intextinput = 1
        self.push('textinput', 0)
        context = self._getContext()
        context.setdefault('textinput', FeedParserDict())
    _start_textInput = _start_textinput

    def _end_textinput(self):
        self.pop('textinput')
        self.intextinput = 0
    _end_textInput = _end_textinput

    def _start_author(self, attrsD):
        self.inauthor = 1
        self.push('author', 1)
    _start_managingeditor = _start_author
    _start_dc_author = _start_author
    _start_dc_creator = _start_author
    _start_itunes_author = _start_author

    def _end_author(self):
        self.pop('author')
        self.inauthor = 0
        self._sync_author_detail()
    _end_managingeditor = _end_author
    _end_dc_author = _end_author
    _end_dc_creator = _end_author
    _end_itunes_author = _end_author

    def _start_itunes_owner(self, attrsD):
        self.inpublisher = 1
        self.push('publisher', 0)

    def _end_itunes_owner(self):
        self.pop('publisher')
        self.inpublisher = 0
        self._sync_author_detail('publisher')

    def _start_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('contributor', 0)

    def _end_contributor(self):
        self.pop('contributor')
        self.incontributor = 0

    def _start_dc_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('name', 0)

    def _end_dc_contributor(self):
        self._end_name()
        self.incontributor = 0

    def _start_name(self, attrsD):
        self.push('name', 0)
    _start_itunes_name = _start_name

    def _end_name(self):
        value = self.pop('name')
        if self.inpublisher:
            self._save_author('name', value, 'publisher')
        elif self.inauthor:
            self._save_author('name', value)
        elif self.incontributor:
            self._save_contributor('name', value)
        elif self.intextinput:
            context = self._getContext()
            context['textinput']['name'] = value
    _end_itunes_name = _end_name

    def _start_width(self, attrsD):
        self.push('width', 0)

    def _end_width(self):
        value = self.pop('width')
        try:
            value = int(value)
        except:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['image']['width'] = value

    def _start_height(self, attrsD):
        self.push('height', 0)

    def _end_height(self):
        value = self.pop('height')
        try:
            value = int(value)
        except:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['image']['height'] = value

    def _start_url(self, attrsD):
        self.push('href', 1)
    _start_homepage = _start_url
    _start_uri = _start_url

    def _end_url(self):
        value = self.pop('href')
        if self.inauthor:
            self._save_author('href', value)
        elif self.incontributor:
            self._save_contributor('href', value)
        elif self.inimage:
            context = self._getContext()
            context['image']['href'] = value
        elif self.intextinput:
            context = self._getContext()
            context['textinput']['link'] = value
    _end_homepage = _end_url
    _end_uri = _end_url

    def _start_email(self, attrsD):
        self.push('email', 0)
    _start_itunes_email = _start_email

    def _end_email(self):
        value = self.pop('email')
        if self.inpublisher:
            self._save_author('email', value, 'publisher')
        elif self.inauthor:
            self._save_author('email', value)
        elif self.incontributor:
            self._save_contributor('email', value)
    _end_itunes_email = _end_email

    def _getContext(self):
        if self.insource:
            context = self.sourcedata
        elif self.inentry:
            context = self.entries[-1]
        else:
            context = self.feeddata
        return context

    def _save_author(self, key, value, prefix='author'):
        context = self._getContext()
        context.setdefault(prefix + '_detail', FeedParserDict())
        context[prefix + '_detail'][key] = value
        self._sync_author_detail()

    def _save_contributor(self, key, value):
        context = self._getContext()
        context.setdefault('contributors', [FeedParserDict()])
        context['contributors'][-1][key] = value

    def _sync_author_detail(self, key='author'):
        context = self._getContext()
        detail = context.get('%s_detail' % key)
        if detail:
            name = detail.get('name')
            email = detail.get('email')
            if name and email:
                context[key] = '%s (%s)' % (name, email)
            elif name:
                context[key] = name
            elif email:
                context[key] = email
        else:
            author = context.get(key)
            if not author: return
            emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author)
            if not emailmatch: return
            email = emailmatch.group(0)
            # probably a better way to do the following, but it passes all the tests
            author = author.replace(email, '')
            author = author.replace('()', '')
            author = author.strip()
            if author and (author[0] == '('):
                author = author[1:]
            if author and (author[-1] == ')'):
                author = author[:-1]
            author = author.strip()
            context.setdefault('%s_detail' % key, FeedParserDict())
            context['%s_detail' % key]['name'] = author
            context['%s_detail' % key]['email'] = email
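
    # The synchronization runs both ways: 'John Doe (john@example.com)' is
    # split into separate name/email details, and separate name/email details
    # are recombined into that display form (address is illustrative).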

    def _start_subtitle(self, attrsD):
        self.pushContent('subtitle', attrsD, 'text/plain', 1)
    _start_tagline = _start_subtitle
    _start_itunes_subtitle = _start_subtitle

    def _end_subtitle(self):
        self.popContent('subtitle')
    _end_tagline = _end_subtitle
    _end_itunes_subtitle = _end_subtitle

    def _start_rights(self, attrsD):
        self.pushContent('rights', attrsD, 'text/plain', 1)
    _start_dc_rights = _start_rights
    _start_copyright = _start_rights

    def _end_rights(self):
        self.popContent('rights')
    _end_dc_rights = _end_rights
    _end_copyright = _end_rights

    def _start_item(self, attrsD):
        self.entries.append(FeedParserDict())
        self.push('item', 0)
        self.inentry = 1
        self.guidislink = 0
        id = self._getAttribute(attrsD, 'rdf:about')
        if id:
            context = self._getContext()
            context['id'] = id
        self._cdf_common(attrsD)
    _start_entry = _start_item
    _start_product = _start_item

    def _end_item(self):
        self.pop('item')
        self.inentry = 0
    _end_entry = _end_item

    def _start_dc_language(self, attrsD):
        self.push('language', 1)
    _start_language = _start_dc_language

    def _end_dc_language(self):
        self.lang = self.pop('language')
    _end_language = _end_dc_language

    def _start_dc_publisher(self, attrsD):
        self.push('publisher', 1)
    _start_webmaster = _start_dc_publisher

    def _end_dc_publisher(self):
        self.pop('publisher')
        self._sync_author_detail('publisher')
    _end_webmaster = _end_dc_publisher

    def _start_published(self, attrsD):
        self.push('published', 1)
    _start_dcterms_issued = _start_published
    _start_issued = _start_published

    def _end_published(self):
        value = self.pop('published')
        self._save('published_parsed', _parse_date(value))
    _end_dcterms_issued = _end_published
    _end_issued = _end_published

    def _start_updated(self, attrsD):
        self.push('updated', 1)
    _start_modified = _start_updated
    _start_dcterms_modified = _start_updated
    _start_pubdate = _start_updated
    _start_dc_date = _start_updated

    def _end_updated(self):
        value = self.pop('updated')
        parsed_value = _parse_date(value)
        self._save('updated_parsed', parsed_value)
    _end_modified = _end_updated
    _end_dcterms_modified = _end_updated
    _end_pubdate = _end_updated
    _end_dc_date = _end_updated

    def _start_created(self, attrsD):
        self.push('created', 1)
    _start_dcterms_created = _start_created

    def _end_created(self):
        value = self.pop('created')
        self._save('created_parsed', _parse_date(value))
    _end_dcterms_created = _end_created

    def _start_expirationdate(self, attrsD):
        self.push('expired', 1)

    def _end_expirationdate(self):
        self._save('expired_parsed', _parse_date(self.pop('expired')))

    def _start_cc_license(self, attrsD):
        self.push('license', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('license')

    def _start_creativecommons_license(self, attrsD):
        self.push('license', 1)

    def _end_creativecommons_license(self):
        self.pop('license')

    def _addTag(self, term, scheme, label):
        context = self._getContext()
        tags = context.setdefault('tags', [])
        if (not term) and (not scheme) and (not label): return
        value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
        if value not in tags:
            tags.append(FeedParserDict({'term': term, 'scheme': scheme, 'label': label}))

    def _start_category(self, attrsD):
        if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
        term = attrsD.get('term')
        scheme = attrsD.get('scheme', attrsD.get('domain'))
        label = attrsD.get('label')
        self._addTag(term, scheme, label)
        self.push('category', 1)
    _start_dc_subject = _start_category
    _start_keywords = _start_category

    def _end_itunes_keywords(self):
        for term in self.pop('itunes_keywords').split():
            self._addTag(term, 'http://www.itunes.com/', None)

    def _start_itunes_category(self, attrsD):
        self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
        self.push('category', 1)

    def _end_category(self):
        value = self.pop('category')
        if not value: return
        context = self._getContext()
        tags = context['tags']
        if value and len(tags) and not tags[-1]['term']:
            tags[-1]['term'] = value
        else:
            self._addTag(value, None, None)
    _end_dc_subject = _end_category
    _end_keywords = _end_category
    _end_itunes_category = _end_category

    def _start_cloud(self, attrsD):
        self._getContext()['cloud'] = FeedParserDict(attrsD)

    def _start_link(self, attrsD):
        attrsD.setdefault('rel', 'alternate')
        attrsD.setdefault('type', 'text/html')
        attrsD = self._itsAnHrefDamnIt(attrsD)
        if attrsD.has_key('href'):
            attrsD['href'] = self.resolveURI(attrsD['href'])
        expectingText = self.infeed or self.inentry or self.insource
        context = self._getContext()
        context.setdefault('links', [])
        context['links'].append(FeedParserDict(attrsD))
        if attrsD['rel'] == 'enclosure':
            self._start_enclosure(attrsD)
        if attrsD.has_key('href'):
            expectingText = 0
            if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
                context['link'] = attrsD['href']
        else:
            self.push('link', expectingText)
    _start_producturl = _start_link

    def _end_link(self):
        value = self.pop('link')
        context = self._getContext()
        if self.intextinput:
            context['textinput']['link'] = value
        if self.inimage:
            context['image']['link'] = value
    _end_producturl = _end_link

    def _start_guid(self, attrsD):
        self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
        self.push('id', 1)

    def _end_guid(self):
        value = self.pop('id')
        self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
        if self.guidislink:
            # guid acts as link, but only if 'ispermalink' is not present or is 'true',
            # and only if the item doesn't already have a link element
            self._save('link', value)

    def _start_title(self, attrsD):
        self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
    _start_dc_title = _start_title
    _start_media_title = _start_title

    def _end_title(self):
        value = self.popContent('title')
        context = self._getContext()
        if self.intextinput:
            context['textinput']['title'] = value
        elif self.inimage:
            context['image']['title'] = value
    _end_dc_title = _end_title
    _end_media_title = _end_title

    def _start_description(self, attrsD):
        context = self._getContext()
        if context.has_key('summary'):
            self._summaryKey = 'content'
            self._start_content(attrsD)
        else:
            self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)

    def _start_abstract(self, attrsD):
        self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)

    def _end_description(self):
        if self._summaryKey == 'content':
            self._end_content()
        else:
            value = self.popContent('description')
            context = self._getContext()
            if self.intextinput:
                context['textinput']['description'] = value
            elif self.inimage:
                context['image']['description'] = value
        self._summaryKey = None
    _end_abstract = _end_description

    def _start_info(self, attrsD):
        self.pushContent('info', attrsD, 'text/plain', 1)
    _start_feedburner_browserfriendly = _start_info

    def _end_info(self):
        self.popContent('info')
    _end_feedburner_browserfriendly = _end_info

    def _start_generator(self, attrsD):
        if attrsD:
            attrsD = self._itsAnHrefDamnIt(attrsD)
            if attrsD.has_key('href'):
                attrsD['href'] = self.resolveURI(attrsD['href'])
        self._getContext()['generator_detail'] = FeedParserDict(attrsD)
        self.push('generator', 1)

    def _end_generator(self):
        value = self.pop('generator')
        context = self._getContext()
        if context.has_key('generator_detail'):
            context['generator_detail']['name'] = value

    def _start_admin_generatoragent(self, attrsD):
        self.push('generator', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('generator')
        self._getContext()['generator_detail'] = FeedParserDict({'href': value})

    def _start_admin_errorreportsto(self, attrsD):
        self.push('errorreportsto', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('errorreportsto')

    def _start_summary(self, attrsD):
        context = self._getContext()
        if context.has_key('summary'):
            self._summaryKey = 'content'
            self._start_content(attrsD)
        else:
            self._summaryKey = 'summary'
            self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
    _start_itunes_summary = _start_summary

    def _end_summary(self):
        if self._summaryKey == 'content':
            self._end_content()
        else:
            self.popContent(self._summaryKey or 'summary')
        self._summaryKey = None
    _end_itunes_summary = _end_summary

    def _start_enclosure(self, attrsD):
        attrsD = self._itsAnHrefDamnIt(attrsD)
        self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD))
        href = attrsD.get('href')
        if href:
            context = self._getContext()
            if not context.get('id'):
                context['id'] = href

    def _start_source(self, attrsD):
        self.insource = 1

    def _end_source(self):
        self.insource = 0
        self._getContext()['source'] = copy.deepcopy(self.sourcedata)
        self.sourcedata.clear()

    def _start_content(self, attrsD):
        self.pushContent('content', attrsD, 'text/plain', 1)
        src = attrsD.get('src')
        if src:
            self.contentparams['src'] = src
        self.push('content', 1)

    def _start_prodlink(self, attrsD):
        self.pushContent('content', attrsD, 'text/html', 1)

    def _start_body(self, attrsD):
        self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
    _start_xhtml_body = _start_body

    def _start_content_encoded(self, attrsD):
        self.pushContent('content', attrsD, 'text/html', 1)
    _start_fullitem = _start_content_encoded

    def _end_content(self):
        copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
        value = self.popContent('content')
        if copyToDescription:
            self._save('description', value)
    _end_body = _end_content
    _end_xhtml_body = _end_content
    _end_content_encoded = _end_content
    _end_fullitem = _end_content
    _end_prodlink = _end_content

    def _start_itunes_image(self, attrsD):
        self.push('itunes_image', 0)
        self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
    _start_itunes_link = _start_itunes_image

    def _end_itunes_block(self):
        value = self.pop('itunes_block', 0)
        self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0

    def _end_itunes_explicit(self):
        value = self.pop('itunes_explicit', 0)
        self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0

if _XML_AVAILABLE:
    class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
        def __init__(self, baseuri, baselang, encoding):
            if _debug: sys.stderr.write('trying StrictFeedParser\n')
            xml.sax.handler.ContentHandler.__init__(self)
            _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
            self.bozo = 0
            self.exc = None

        def startPrefixMapping(self, prefix, uri):
            self.trackNamespace(prefix, uri)

        def startElementNS(self, name, qname, attrs):
            namespace, localname = name
            lowernamespace = str(namespace or '').lower()
            if lowernamespace.find('backend.userland.com/rss') <> -1:
                # match any backend.userland.com namespace
                namespace = 'http://backend.userland.com/rss'
                lowernamespace = namespace
            if qname and qname.find(':') > 0:
                givenprefix = qname.split(':')[0]
            else:
                givenprefix = None
            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
            if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
                raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
            if prefix:
                localname = prefix + ':' + localname
            localname = str(localname).lower()
            if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))

            # qname implementation is horribly broken in Python 2.1 (it
            # doesn't report any), and slightly broken in Python 2.2 (it
            # doesn't report the xml: namespace). So we match up namespaces
            # with a known list first, and then possibly override them with
            # the qnames the SAX parser gives us (if indeed it gives us any
            # at all). Thanks to MatejC for helping me test this and
            # tirelessly telling me that it didn't work yet.
            attrsD = {}
            for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
                lowernamespace = (namespace or '').lower()
                prefix = self._matchnamespaces.get(lowernamespace, '')
                if prefix:
                    attrlocalname = prefix + ':' + attrlocalname
                attrsD[str(attrlocalname).lower()] = attrvalue
            for qname in attrs.getQNames():
                attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
            self.unknown_starttag(localname, attrsD.items())

        def characters(self, text):
            self.handle_data(text)

        def endElementNS(self, name, qname):
            namespace, localname = name
            lowernamespace = str(namespace or '').lower()
            if qname and qname.find(':') > 0:
                givenprefix = qname.split(':')[0]
            else:
                givenprefix = ''
            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
            if prefix:
                localname = prefix + ':' + localname
            localname = str(localname).lower()
            self.unknown_endtag(localname)

        def error(self, exc):
            self.bozo = 1
            self.exc = exc

        def fatalError(self, exc):
            self.error(exc)
            raise exc

class _BaseHTMLProcessor(sgmllib.SGMLParser):
    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
      'img', 'input', 'isindex', 'link', 'meta', 'param']

    def __init__(self, encoding):
        self.encoding = encoding
        if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
        sgmllib.SGMLParser.__init__(self)

    def reset(self):
        self.pieces = []
        sgmllib.SGMLParser.reset(self)

    def _shorttag_replace(self, match):
        tag = match.group(1)
        if tag in self.elements_no_end_tag:
            return '<' + tag + ' />'
        else:
            return '<' + tag + '></' + tag + '>'

    def feed(self, data):
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
        #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
        data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)
        data = data.replace('&#39;', "'")
        data = data.replace('&#34;', '"')
        if self.encoding and type(data) == type(u''):
            data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)

    def normalize_attrs(self, attrs):
        # utility method to be called by descendants
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        return attrs

    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # attrs is a list of (attr, value) tuples
        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
        uattrs = []
        # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
        for key, value in attrs:
            if type(value) != type(u''):
                value = unicode(value, self.encoding)
            uattrs.append((unicode(key, self.encoding), value))
        strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
        if tag in self.elements_no_end_tag:
            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
        else:
            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())

    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be 'pre'
        # Reconstruct the original end tag.
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%(tag)s>" % locals())

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        # Reconstruct the original character reference.
        self.pieces.append('&#%(ref)s;' % locals())

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        # Reconstruct the original entity reference.
        self.pieces.append('&%(ref)s;' % locals())

    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
        self.pieces.append(text)

    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Reconstruct the original comment.
        self.pieces.append('<!--%(text)s-->' % locals())

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self.pieces.append('<?%(text)s>' % locals())

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        self.pieces.append('<!%(text)s>' % locals())

    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
    def _scan_name(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1 # end of buffer
            return name.lower(), m.end()
        else:
            self.handle_data(rawdata)
            # self.updatepos(declstartpos, i)
            return None, -1

    def output(self):
        '''Return processed HTML as a single string'''
        return ''.join([str(p) for p in self.pieces])

class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    def __init__(self, baseuri, baselang, encoding):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)

    def decodeEntities(self, element, data):
        data = data.replace('&#60;', '&lt;')
        data = data.replace('&#x3c;', '&lt;')
        data = data.replace('&#62;', '&gt;')
        data = data.replace('&#x3e;', '&gt;')
        data = data.replace('&#38;', '&amp;')
        data = data.replace('&#x26;', '&amp;')
        data = data.replace('&#34;', '&quot;')
        data = data.replace('&#x22;', '&quot;')
        data = data.replace('&#39;', '&apos;')
        data = data.replace('&#x27;', '&apos;')
        if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            data = data.replace('&lt;', '<')
            data = data.replace('&gt;', '>')
            data = data.replace('&amp;', '&')
            data = data.replace('&quot;', '"')
            data = data.replace('&apos;', "'")
        return data
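
    # e.g. '&#60;p&#62;' is normalized to '&lt;p&gt;' first; if the declared
    # content type marks the value as escaped markup (non-XML), the named
    # entities are then decoded all the way to '<p>' (illustrative input).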

class _RelativeURIResolver(_BaseHTMLProcessor):
    relative_uris = [('a', 'href'),
                     ('applet', 'codebase'),
                     ('area', 'href'),
                     ('blockquote', 'cite'),
                     ('body', 'background'),
                     ('del', 'cite'),
                     ('form', 'action'),
                     ('frame', 'longdesc'),
                     ('frame', 'src'),
                     ('iframe', 'longdesc'),
                     ('iframe', 'src'),
                     ('head', 'profile'),
                     ('img', 'longdesc'),
                     ('img', 'src'),
                     ('img', 'usemap'),
                     ('input', 'src'),
                     ('input', 'usemap'),
                     ('ins', 'cite'),
                     ('link', 'href'),
                     ('object', 'classid'),
                     ('object', 'codebase'),
                     ('object', 'data'),
                     ('object', 'usemap'),
                     ('q', 'cite'),
                     ('script', 'src')]

    def __init__(self, baseuri, encoding):
        _BaseHTMLProcessor.__init__(self, encoding)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        return _urljoin(self.baseuri, uri)

    def unknown_starttag(self, tag, attrs):
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

def _resolveRelativeURIs(htmlSource, baseURI, encoding):
    if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
    p = _RelativeURIResolver(baseURI, encoding)
    p.feed(htmlSource)
    return p.output()
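
# e.g. with baseURI 'http://example.com/blog/', markup such as
# '<a href="post.html">' comes back as
# '<a href="http://example.com/blog/post.html">' (illustrative values).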
1596 class _HTMLSanitizer(_BaseHTMLProcessor):
1597 acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
1598 'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
1599 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
1600 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
1601 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
1602 'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
1603 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
1604 'thead', 'tr', 'tt', 'u', 'ul', 'var']
1606 acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
1607 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
1608 'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
1609 'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
1610 'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
1611 'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
1612 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
1613 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
1614 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
1615 'usemap', 'valign', 'value', 'vspace', 'width']
1617 unacceptable_elements_with_end_tag = ['script', 'applet']
1619 def reset(self):
1620 _BaseHTMLProcessor.reset(self)
1621 self.unacceptablestack = 0
1623 def unknown_starttag(self, tag, attrs):
1624 if not tag in self.acceptable_elements:
1625 if tag in self.unacceptable_elements_with_end_tag:
1626 self.unacceptablestack += 1
1627 return
1628 attrs = self.normalize_attrs(attrs)
1629 attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
1630 _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
1632 def unknown_endtag(self, tag):
1633 if not tag in self.acceptable_elements:
1634 if tag in self.unacceptable_elements_with_end_tag:
1635 self.unacceptablestack -= 1
1636 return
1637 _BaseHTMLProcessor.unknown_endtag(self, tag)
1639 def handle_pi(self, text):
1640 pass
1642 def handle_decl(self, text):
1643 pass
1645 def handle_data(self, text):
1646 if not self.unacceptablestack:
1647 _BaseHTMLProcessor.handle_data(self, text)
1649 def _sanitizeHTML(htmlSource, encoding):
1650 p = _HTMLSanitizer(encoding)
1651 p.feed(htmlSource)
1652 data = p.output()
1653 if TIDY_MARKUP:
1654 # loop through list of preferred Tidy interfaces looking for one that's installed,
1655 # then set up a common _tidy function to wrap the interface-specific API.
1656 _tidy = None
1657 for tidy_interface in PREFERRED_TIDY_INTERFACES:
1658 try:
1659 if tidy_interface == "uTidy":
1660 from tidy import parseString as _utidy
1661 def _tidy(data, **kwargs):
1662 return str(_utidy(data, **kwargs))
1663 break
1664 elif tidy_interface == "mxTidy":
1665 from mx.Tidy import Tidy as _mxtidy
1666 def _tidy(data, **kwargs):
1667 nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
1668 return data
1669 break
1670 except:
1671 pass
1672 if _tidy:
1673 utf8 = type(data) == type(u'')
1674 if utf8:
1675 data = data.encode('utf-8')
1676 data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
1677 if utf8:
1678 data = unicode(data, 'utf-8')
1679 if data.count('<body'):
1680 data = data.split('<body', 1)[1]
1681 if data.count('>'):
1682 data = data.split('>', 1)[1]
1683 if data.count('</body'):
1684 data = data.split('</body', 1)[0]
1685 data = data.strip().replace('\r\n', '\n')
1686 return data
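# An illustrative call, not used by the module itself, assuming TIDY_MARKUP
# is left at 0: unacceptable attributes are silently dropped, and script and
# applet elements disappear along with everything inside them.
def _example_sanitizeHTML():
    return _sanitizeHTML('<p onclick="evil()">hi<script>evil()</script></p>', 'utf-8')
    # returns '<p>hi</p>'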
1688 class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
1689 def http_error_default(self, req, fp, code, msg, headers):
1690 if ((code / 100) == 3) and (code != 304):
1691 return self.http_error_302(req, fp, code, msg, headers)
1692 infourl = urllib.addinfourl(fp, headers, req.get_full_url())
1693 infourl.status = code
1694 return infourl
1696 def http_error_302(self, req, fp, code, msg, headers):
1697 if headers.dict.has_key('location'):
1698 infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
1699 else:
1700 infourl = urllib.addinfourl(fp, headers, req.get_full_url())
1701 if not hasattr(infourl, 'status'):
1702 infourl.status = code
1703 return infourl
1705 def http_error_301(self, req, fp, code, msg, headers):
1706 if headers.dict.has_key('location'):
1707 infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
1708 else:
1709 infourl = urllib.addinfourl(fp, headers, req.get_full_url())
1710 if not hasattr(infourl, 'status'):
1711 infourl.status = code
1712 return infourl
1714 http_error_300 = http_error_302
1715 http_error_303 = http_error_302
1716 http_error_307 = http_error_302
1718 def http_error_401(self, req, fp, code, msg, headers):
1719 # Check if
1720 # - server requires digest auth, AND
1721 # - we tried (unsuccessfully) with basic auth, AND
1722 # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions)
1723 # If all conditions hold, parse authentication information
1724 # out of the Authorization header we sent the first time
1725 # (for the username and password) and the WWW-Authenticate
1726 # header the server sent back (for the realm) and retry
1727 # the request with the appropriate digest auth headers instead.
1728 # This evil genius hack has been brought to you by Aaron Swartz.
1729 host = urlparse.urlparse(req.get_full_url())[1]
1730 try:
1731 assert sys.version.split()[0] >= '2.3.3'
1732 assert base64 != None
1733 user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':')
1734 realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
1735 self.add_password(realm, host, user, passw)
1736 retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
1737 self.reset_retry_count()
1738 return retry
1739 except:
1740 return self.http_error_default(req, fp, code, msg, headers)
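# A stand-alone sketch of how this handler is wired up (the URL is made up;
# _open_resource below does this for real).  A plain 200 response never
# passes through the error callbacks above, so the status attribute may be
# absent on success.
def _example_feed_url_handler():
    opener = urllib2.build_opener(_FeedURLHandler())
    f = opener.open(urllib2.Request('http://example.org/feed.xml'))
    return getattr(f, 'status', 200), f.geturl()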
1742 def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
1743 """URL, filename, or string --> stream
1745 This function lets you define parsers that take any input source
1746 (URL, pathname to local or network file, or actual data as a string)
1747 and deal with it in a uniform manner. Returned object is guaranteed
1748 to have all the basic stdio read methods (read, readline, readlines).
1749 Just .close() the object when you're done with it.
1751 If the etag argument is supplied, it will be used as the value of an
1752 If-None-Match request header.
1754 If the modified argument is supplied, it must be a tuple of 9 integers
1755 as returned by gmtime() in the standard Python time module. This MUST
1756 be in GMT (Greenwich Mean Time). The formatted date/time will be used
1757 as the value of an If-Modified-Since request header.
1759 If the agent argument is supplied, it will be used as the value of a
1760 User-Agent request header.
1762 If the referrer argument is supplied, it will be used as the value of a
1763 Referer[sic] request header.
1765 If handlers is supplied, it is a list of handlers used to build a
1766 urllib2 opener.
1767 """
1769 if hasattr(url_file_stream_or_string, 'read'):
1770 return url_file_stream_or_string
1772 if url_file_stream_or_string == '-':
1773 return sys.stdin
1775 if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
1776 if not agent:
1777 agent = USER_AGENT
1778 # test for inline user:password for basic auth
1779 auth = None
1780 if base64:
1781 urltype, rest = urllib.splittype(url_file_stream_or_string)
1782 realhost, rest = urllib.splithost(rest)
1783 if realhost:
1784 user_passwd, realhost = urllib.splituser(realhost)
1785 if user_passwd:
1786 url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
1787 auth = base64.encodestring(user_passwd).strip()
1788 # try to open with urllib2 (to use optional headers)
1789 request = urllib2.Request(url_file_stream_or_string)
1790 request.add_header('User-Agent', agent)
1791 if etag:
1792 request.add_header('If-None-Match', etag)
1793 if modified:
1794 # format into an RFC 1123-compliant timestamp. We can't use
1795 # time.strftime() since the %a and %b directives can be affected
1796 # by the current locale, but RFC 2616 states that dates must be
1797 # in English.
1798 short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
1799 months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
1800 request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
1801 if referrer:
1802 request.add_header('Referer', referrer)
1803 if gzip and zlib:
1804 request.add_header('Accept-encoding', 'gzip, deflate')
1805 elif gzip:
1806 request.add_header('Accept-encoding', 'gzip')
1807 elif zlib:
1808 request.add_header('Accept-encoding', 'deflate')
1809 else:
1810 request.add_header('Accept-encoding', '')
1811 if auth:
1812 request.add_header('Authorization', 'Basic %s' % auth)
1813 if ACCEPT_HEADER:
1814 request.add_header('Accept', ACCEPT_HEADER)
1815 request.add_header('A-IM', 'feed') # RFC 3229 support
1816 opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers))
1817 opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
1818 try:
1819 return opener.open(request)
1820 finally:
1821 opener.close() # JohnD
1823 # try to open with native open function (if url_file_stream_or_string is a filename)
1824 try:
1825 return open(url_file_stream_or_string)
1826 except:
1827 pass
1829 # treat url_file_stream_or_string as string
1830 return _StringIO(str(url_file_stream_or_string))
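# A quick sketch of the input polymorphism described in the docstring, not
# used by the module itself: raw feed data falls through the URL and
# filename branches and comes back wrapped in a StringIO.
def _example_open_resource():
    f = _open_resource('<rss version="2.0"/>', None, None, USER_AGENT, None, [])
    data = f.read()
    f.close()
    return data
    # returns '<rss version="2.0"/>'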
1832 _date_handlers = []
1833 def registerDateHandler(func):
1834 '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
1835 _date_handlers.insert(0, func)
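# A sketch of runtime extension through registerDateHandler.  The dotted
# day.month.year format below is made up for illustration; it is not one of
# the formats handled by the built-in parsers that follow.
def _example_registerDateHandler():
    def _parse_date_dotted(dateString):
        '''Parse dates like 05.01.2004 into a 9-tuple in GMT'''
        m = re.match(r'(\d{2})\.(\d{2})\.(\d{4})$', dateString)
        if not m: return
        day, month, year = map(int, m.groups())
        return time.gmtime(time.mktime((year, month, day, 0, 0, 0, 0, 0, 0)))
    registerDateHandler(_parse_date_dotted)
    return _parse_date('05.01.2004')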
1837 # ISO-8601 date parsing routines written by Fazal Majid.
1838 # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
1839 # parser is beyond the scope of feedparser and would be a worthwhile addition
1840 # to the Python library.
1841 # A single regular expression cannot parse ISO 8601 date formats into groups
1842 # as the standard is highly irregular (for instance is 030104 2003-01-04 or
1843 # 0301-04-01), so we use templates instead.
1844 # Please note the order in templates is significant because we need a
1845 # greedy match.
1846 _iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
1847 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
1848 '-YY-?MM', '-OOO', '-YY',
1849 '--MM-?DD', '--MM',
1850 '---DD',
1851 'CC', '']
1852 _iso8601_re = [
1853 tmpl.replace(
1854 'YYYY', r'(?P<year>\d{4})').replace(
1855 'YY', r'(?P<year>\d\d)').replace(
1856 'MM', r'(?P<month>[01]\d)').replace(
1857 'DD', r'(?P<day>[0123]\d)').replace(
1858 'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
1859 'CC', r'(?P<century>\d\d$)')
1860 + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
1861 + r'(:(?P<second>\d{2}))?'
1862 + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
1863 for tmpl in _iso8601_tmpl]
1864 del tmpl
1865 _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
1866 del regex
1867 def _parse_date_iso8601(dateString):
1868 '''Parse a variety of ISO-8601-compatible formats like 20040105'''
1869 m = None
1870 for _iso8601_match in _iso8601_matches:
1871 m = _iso8601_match(dateString)
1872 if m: break
1873 if not m: return
1874 if m.span() == (0, 0): return
1875 params = m.groupdict()
1876 ordinal = params.get('ordinal', 0)
1877 if ordinal:
1878 ordinal = int(ordinal)
1879 else:
1880 ordinal = 0
1881 year = params.get('year', '--')
1882 if not year or year == '--':
1883 year = time.gmtime()[0]
1884 elif len(year) == 2:
1885 # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
1886 year = 100 * int(time.gmtime()[0] / 100) + int(year)
1887 else:
1888 year = int(year)
1889 month = params.get('month', '-')
1890 if not month or month == '-':
1891 # ordinals are NOT normalized by mktime, we simulate them
1892 # by setting month=1, day=ordinal
1893 if ordinal:
1894 month = 1
1895 else:
1896 month = time.gmtime()[1]
1897 month = int(month)
1898 day = params.get('day', 0)
1899 if not day:
1900 # see above
1901 if ordinal:
1902 day = ordinal
1903 elif params.get('century', 0) or \
1904 params.get('year', 0) or params.get('month', 0):
1905 day = 1
1906 else:
1907 day = time.gmtime()[2]
1908 else:
1909 day = int(day)
1910 # special case of the century - is the first year of the 21st century
1911 # 2000 or 2001 ? The debate goes on...
1912 if 'century' in params.keys():
1913 year = (int(params['century']) - 1) * 100 + 1
1914 # in ISO 8601 most fields are optional
1915 for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
1916 if not params.get(field, None):
1917 params[field] = 0
1918 hour = int(params.get('hour', 0))
1919 minute = int(params.get('minute', 0))
1920 second = int(params.get('second', 0))
1921 # weekday is normalized by mktime(), we can ignore it
1922 weekday = 0
1923 # daylight savings is complex, but not needed for feedparser's purposes
1924 # as time zones, if specified, include mention of whether it is active
1925 # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
1926 # most implementations have DST bugs
1927 daylight_savings_flag = 0
1928 tm = [year, month, day, hour, minute, second, weekday,
1929 ordinal, daylight_savings_flag]
1930 # ISO 8601 time zone adjustments
1931 tz = params.get('tz')
1932 if tz and tz != 'Z':
1933 if tz[0] == '-':
1934 tm[3] += int(params.get('tzhour', 0))
1935 tm[4] += int(params.get('tzmin', 0))
1936 elif tz[0] == '+':
1937 tm[3] -= int(params.get('tzhour', 0))
1938 tm[4] -= int(params.get('tzmin', 0))
1939 else:
1940 return None
1941 # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
1942 # which is guaranteed to normalize d/m/y/h/m/s.
1943 # Many implementations have bugs, but we'll pretend they don't.
1944 return time.localtime(time.mktime(tm))
1945 registerDateHandler(_parse_date_iso8601)
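# A couple of accepted spellings, for illustration (not used by the module
# itself); both calls return the same kind of 9-tuple via time.localtime.
def _example_parse_date_iso8601():
    return (_parse_date_iso8601('2004-01-05T12:30:00Z'),
            _parse_date_iso8601('20040105'))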
1947 # 8-bit date handling routines written by ytrewq1.
1948 _korean_year = u'\ub144' # b3e2 in euc-kr
1949 _korean_month = u'\uc6d4' # bff9 in euc-kr
1950 _korean_day = u'\uc77c' # c0cf in euc-kr
1951 _korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr
1952 _korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr
1954 _korean_onblog_date_re = \
1955 re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
1956 (_korean_year, _korean_month, _korean_day))
1957 _korean_nate_date_re = \
1958 re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
1959 (_korean_am, _korean_pm))
1960 def _parse_date_onblog(dateString):
1961 '''Parse a string according to the OnBlog 8-bit date format'''
1962 m = _korean_onblog_date_re.match(dateString)
1963 if not m: return
1964 w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
1965 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
1966 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
1967 'zonediff': '+09:00'}
1968 if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
1969 return _parse_date_w3dtf(w3dtfdate)
1970 registerDateHandler(_parse_date_onblog)
1972 def _parse_date_nate(dateString):
1973 '''Parse a string according to the Nate 8-bit date format'''
1974 m = _korean_nate_date_re.match(dateString)
1975 if not m: return
1976 hour = int(m.group(5))
1977 ampm = m.group(4)
1978 if (ampm == _korean_pm):
1979 hour += 12
1980 hour = str(hour)
1981 if len(hour) == 1:
1982 hour = '0' + hour
1983 w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
1984 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
1985 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
1986 'zonediff': '+09:00'}
1987 if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
1988 return _parse_date_w3dtf(w3dtfdate)
1989 registerDateHandler(_parse_date_nate)
1991 _mssql_date_re = \
1992 re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
1993 def _parse_date_mssql(dateString):
1994 '''Parse a string according to the MS SQL date format'''
1995 m = _mssql_date_re.match(dateString)
1996 if not m: return
1997 w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
1998 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
1999 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
2000 'zonediff': '+09:00'}
2001 if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
2002 return _parse_date_w3dtf(w3dtfdate)
2003 registerDateHandler(_parse_date_mssql)
2005 # Unicode strings for Greek date strings
2006 _greek_months = \
2007 {
2008 u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7
2009 u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7
2010 u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7
2011 u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7
2012 u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7
2013 u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7
2014 u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7
2015 u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7
2016 u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
2017 u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7
2018 u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
2019 u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7
2020 u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7
2021 u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7
2022 u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7
2023 u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7
2024 u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7
2025 u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7
2026 u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7
2027 }
2029 _greek_wdays = \
2030 {
2031 u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
2032 u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
2033 u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
2034 u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
2035 u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
2036 u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
2037 u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
2038 }
2040 _greek_date_format_re = \
2041 re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
2043 def _parse_date_greek(dateString):
2044 '''Parse a string according to a Greek 8-bit date format.'''
2045 m = _greek_date_format_re.match(dateString)
2046 if not m: return
2047 try:
2048 wday = _greek_wdays[m.group(1)]
2049 month = _greek_months[m.group(3)]
2050 except:
2051 return
2052 rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
2053 {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
2054 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
2055 'zonediff': m.group(8)}
2056 if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
2057 return _parse_date_rfc822(rfc822date)
2058 registerDateHandler(_parse_date_greek)
2060 # Unicode strings for Hungarian date strings
2061 _hungarian_months = \
2062 {
2063 u'janu\u00e1r': u'01', # e1 in iso-8859-2
2064 u'febru\u00e1ri': u'02', # e1 in iso-8859-2
2065 u'm\u00e1rcius': u'03', # e1 in iso-8859-2
2066 u'\u00e1prilis': u'04', # e1 in iso-8859-2
2067 u'm\u00e1jus': u'05', # e1 in iso-8859-2
2068 u'j\u00fanius': u'06', # fa in iso-8859-2
2069 u'j\u00falius': u'07', # fa in iso-8859-2
2070 u'augusztus': u'08',
2071 u'szeptember': u'09',
2072 u'okt\u00f3ber': u'10', # f3 in iso-8859-2
2073 u'november': u'11',
2074 u'december': u'12',
2075 }
2077 _hungarian_date_format_re = \
2078 re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
2080 def _parse_date_hungarian(dateString):
2081 '''Parse a string according to a Hungarian 8-bit date format.'''
2082 m = _hungarian_date_format_re.match(dateString)
2083 if not m: return
2084 try:
2085 month = _hungarian_months[m.group(2)]
2086 day = m.group(3)
2087 if len(day) == 1:
2088 day = '0' + day
2089 hour = m.group(4)
2090 if len(hour) == 1:
2091 hour = '0' + hour
2092 except:
2093 return
2094 w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
2095 {'year': m.group(1), 'month': month, 'day': day,\
2096 'hour': hour, 'minute': m.group(5),\
2097 'zonediff': m.group(6)}
2098 if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
2099 return _parse_date_w3dtf(w3dtfdate)
2100 registerDateHandler(_parse_date_hungarian)
2102 # W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
2103 # Drake and licensed under the Python license. Removed all range checking
2104 # for month, day, hour, minute, and second, since mktime will normalize
2105 # these later
2106 def _parse_date_w3dtf(dateString):
2107 def __extract_date(m):
2108 year = int(m.group('year'))
2109 if year < 100:
2110 year = 100 * int(time.gmtime()[0] / 100) + int(year)
2111 if year < 1000:
2112 return 0, 0, 0
2113 julian = m.group('julian')
2114 if julian:
2115 julian = int(julian)
2116 month = julian / 30 + 1
2117 day = julian % 30 + 1
2118 jday = None
2119 while jday != julian:
2120 t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
2121 jday = time.gmtime(t)[-2]
2122 diff = abs(jday - julian)
2123 if jday > julian:
2124 if diff < day:
2125 day = day - diff
2126 else:
2127 month = month - 1
2128 day = 31
2129 elif jday < julian:
2130 if day + diff < 28:
2131 day = day + diff
2132 else:
2133 month = month + 1
2134 return year, month, day
2135 month = m.group('month')
2136 day = 1
2137 if month is None:
2138 month = 1
2139 else:
2140 month = int(month)
2141 day = m.group('day')
2142 if day:
2143 day = int(day)
2144 else:
2145 day = 1
2146 return year, month, day
2148 def __extract_time(m):
2149 if not m:
2150 return 0, 0, 0
2151 hours = m.group('hours')
2152 if not hours:
2153 return 0, 0, 0
2154 hours = int(hours)
2155 minutes = int(m.group('minutes'))
2156 seconds = m.group('seconds')
2157 if seconds:
2158 seconds = int(seconds)
2159 else:
2160 seconds = 0
2161 return hours, minutes, seconds
2163 def __extract_tzd(m):
2164 '''Return the Time Zone Designator as an offset in seconds from UTC.'''
2165 if not m:
2166 return 0
2167 tzd = m.group('tzd')
2168 if not tzd:
2169 return 0
2170 if tzd == 'Z':
2171 return 0
2172 hours = int(m.group('tzdhours'))
2173 minutes = m.group('tzdminutes')
2174 if minutes:
2175 minutes = int(minutes)
2176 else:
2177 minutes = 0
2178 offset = (hours*60 + minutes) * 60
2179 if tzd[0] == '+':
2180 return -offset
2181 return offset
2183 __date_re = ('(?P<year>\d\d\d\d)'
2184 '(?:(?P<dsep>-|)'
2185 '(?:(?P<julian>\d\d\d)'
2186 '|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?')
2187 __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
2188 __tzd_rx = re.compile(__tzd_re)
2189 __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
2190 '(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?'
2191 + __tzd_re)
2192 __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
2193 __datetime_rx = re.compile(__datetime_re)
2194 m = __datetime_rx.match(dateString)
2195 if (m is None) or (m.group() != dateString): return
2196 gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
2197 if gmt[0] == 0: return
2198 return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
2199 registerDateHandler(_parse_date_w3dtf)
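# An illustrative property of the parser above, not used by the module
# itself: the same instant written with a 'Z' designator and with a numeric
# offset normalizes to the same GMT 9-tuple.
def _example_parse_date_w3dtf():
    a = _parse_date_w3dtf('2003-12-31T10:14:55Z')
    b = _parse_date_w3dtf('2003-12-31T02:14:55-08:00')
    return a == b  # True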
2201 def _parse_date_rfc822(dateString):
2202 '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
2203 data = dateString.split()
2204 if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
2205 del data[0]
2206 if len(data) == 4:
2207 s = data[3]
2208 i = s.find('+')
2209 if i > 0:
2210 data[3:] = [s[:i], s[i+1:]]
2211 else:
2212 data.append('')
2213 dateString = " ".join(data)
2214 if len(data) < 5:
2215 dateString += ' 00:00:00 GMT'
2216 tm = rfc822.parsedate_tz(dateString)
2217 if tm:
2218 return time.gmtime(rfc822.mktime_tz(tm))
2219 # rfc822.py defines several time zones, but we define some extra ones.
2220 # 'ET' is equivalent to 'EST', etc.
2221 _additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
2222 rfc822._timezones.update(_additional_timezones)
2223 registerDateHandler(_parse_date_rfc822)
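# One illustrative call, not used by the module itself; 'ET' resolves
# through the extra timezone table registered above.
def _example_parse_date_rfc822():
    return _parse_date_rfc822('Thu, 01 Jan 2004 19:48:21 ET')
    # equivalent to 00:48:21 GMT on 02 Jan 2004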
2225 def _parse_date(dateString):
2226 '''Parses a variety of date formats into a 9-tuple in GMT'''
2227 for handler in _date_handlers:
2228 try:
2229 date9tuple = handler(dateString)
2230 if not date9tuple: continue
2231 if len(date9tuple) != 9:
2232 if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
2233 raise ValueError
2234 map(int, date9tuple)
2235 return date9tuple
2236 except Exception, e:
2237 if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
2238 pass
2239 return None
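# _parse_date is the single dispatch point for all handlers registered
# above; a sketch of the round trip (not used by the module itself):
def _example_parse_date():
    return _parse_date('Sun, 04 Jan 2004 00:00:00 GMT')[:6]
    # returns (2004, 1, 4, 0, 0, 0)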
2241 def _getCharacterEncoding(http_headers, xml_data):
2242 '''Get the character encoding of the XML document
2244 http_headers is a dictionary
2245 xml_data is a raw string (not Unicode)
2247 This is so much trickier than it sounds, it's not even funny.
2248 According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
2249 is application/xml, application/*+xml,
2250 application/xml-external-parsed-entity, or application/xml-dtd,
2251 the encoding given in the charset parameter of the HTTP Content-Type
2252 takes precedence over the encoding given in the XML prefix within the
2253 document, and defaults to 'utf-8' if neither are specified. But, if
2254 the HTTP Content-Type is text/xml, text/*+xml, or
2255 text/xml-external-parsed-entity, the encoding given in the XML prefix
2256 within the document is ALWAYS IGNORED and only the encoding given in
2257 the charset parameter of the HTTP Content-Type header should be
2258 respected, and it defaults to 'us-ascii' if not specified.
2260 Furthermore, discussion on the atom-syntax mailing list with the
2261 author of RFC 3023 leads me to the conclusion that any document
2262 served with a Content-Type of text/* and no charset parameter
2263 must be treated as us-ascii. (We now do this.) And also that it
2264 must always be flagged as non-well-formed. (We now do this too.)
2266 If Content-Type is unspecified (input was local file or non-HTTP source)
2267 or unrecognized (server just got it totally wrong), then go by the
2268 encoding given in the XML prefix of the document and default to
2269 'iso-8859-1' as per the HTTP specification (RFC 2616).
2271 Then, assuming we didn't find a character encoding in the HTTP headers
2272 (and the HTTP Content-type allowed us to look in the body), we need
2273 to sniff the first few bytes of the XML data and try to determine
2274 whether the encoding is ASCII-compatible. Section F of the XML
2275 specification shows the way here:
2276 http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
2278 If the sniffed encoding is not ASCII-compatible, we need to make it
2279 ASCII compatible so that we can sniff further into the XML declaration
2280 to find the encoding attribute, which will tell us the true encoding.
2282 Of course, none of this guarantees that we will be able to parse the
2283 feed in the declared character encoding (assuming it was declared
2284 correctly, which many are not). CJKCodecs and iconv_codec help a lot;
2285 you should definitely install them if you can.
2286 http://cjkpython.i18n.org/
2287 '''
2289 def _parseHTTPContentType(content_type):
2290 '''takes HTTP Content-Type header and returns (content type, charset)
2292 If no charset is specified, returns (content type, '')
2293 If no content type is specified, returns ('', '')
2294 Both return parameters are guaranteed to be lowercase strings
2295 '''
2296 content_type = content_type or ''
2297 content_type, params = cgi.parse_header(content_type)
2298 return content_type, params.get('charset', '').replace("'", '')
2300 sniffed_xml_encoding = ''
2301 xml_encoding = ''
2302 true_encoding = ''
2303 http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))
2304 # Must sniff for non-ASCII-compatible character encodings before
2305 # searching for XML declaration. This heuristic is defined in
2306 # section F of the XML specification:
2307 # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
2308 try:
2309 if xml_data[:4] == '\x4c\x6f\xa7\x94':
2310 # EBCDIC
2311 xml_data = _ebcdic_to_ascii(xml_data)
2312 elif xml_data[:4] == '\x00\x3c\x00\x3f':
2313 # UTF-16BE
2314 sniffed_xml_encoding = 'utf-16be'
2315 xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
2316 elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
2317 # UTF-16BE with BOM
2318 sniffed_xml_encoding = 'utf-16be'
2319 xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
2320 elif xml_data[:4] == '\x3c\x00\x3f\x00':
2321 # UTF-16LE
2322 sniffed_xml_encoding = 'utf-16le'
2323 xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
2324 elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
2325 # UTF-16LE with BOM
2326 sniffed_xml_encoding = 'utf-16le'
2327 xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
2328 elif xml_data[:4] == '\x00\x00\x00\x3c':
2329 # UTF-32BE
2330 sniffed_xml_encoding = 'utf-32be'
2331 xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
2332 elif xml_data[:4] == '\x3c\x00\x00\x00':
2333 # UTF-32LE
2334 sniffed_xml_encoding = 'utf-32le'
2335 xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
2336 elif xml_data[:4] == '\x00\x00\xfe\xff':
2337 # UTF-32BE with BOM
2338 sniffed_xml_encoding = 'utf-32be'
2339 xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
2340 elif xml_data[:4] == '\xff\xfe\x00\x00':
2341 # UTF-32LE with BOM
2342 sniffed_xml_encoding = 'utf-32le'
2343 xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
2344 elif xml_data[:3] == '\xef\xbb\xbf':
2345 # UTF-8 with BOM
2346 sniffed_xml_encoding = 'utf-8'
2347 xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
2348 else:
2349 # ASCII-compatible
2350 pass
2351 xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
2352 except:
2353 xml_encoding_match = None
2354 if xml_encoding_match:
2355 xml_encoding = xml_encoding_match.groups()[0].lower()
2356 if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
2357 xml_encoding = sniffed_xml_encoding
2358 acceptable_content_type = 0
2359 application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
2360 text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
2361 if (http_content_type in application_content_types) or \
2362 (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
2363 acceptable_content_type = 1
2364 true_encoding = http_encoding or xml_encoding or 'utf-8'
2365 elif (http_content_type in text_content_types) or \
2366 (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'):
2367 acceptable_content_type = 1
2368 true_encoding = http_encoding or 'us-ascii'
2369 elif http_content_type.startswith('text/'):
2370 true_encoding = http_encoding or 'us-ascii'
2371 elif http_headers and (not http_headers.has_key('content-type')):
2372 true_encoding = xml_encoding or 'iso-8859-1'
2373 else:
2374 true_encoding = xml_encoding or 'utf-8'
2375 return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
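# A sketch of the RFC 3023 precedence described in the docstring (headers
# and data are made up, and the function is not called this way anywhere in
# the module): for application/* media types the charset parameter beats
# the encoding in the XML declaration.
def _example_getCharacterEncoding():
    headers = {'content-type': 'application/xml; charset=iso-8859-2'}
    data = "<?xml version='1.0' encoding='utf-8'?><feed/>"
    return _getCharacterEncoding(headers, data)[0]
    # returns 'iso-8859-2'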
2377 def _toUTF8(data, encoding):
2378 '''Changes an XML data stream on the fly to specify a new encoding
2380 data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
2381 encoding is a string recognized by encodings.aliases
2382 '''
2383 if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
2384 # strip Byte Order Mark (if present)
2385 if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
2386 if _debug:
2387 sys.stderr.write('stripping BOM\n')
2388 if encoding != 'utf-16be':
2389 sys.stderr.write('trying utf-16be instead\n')
2390 encoding = 'utf-16be'
2391 data = data[2:]
2392 elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
2393 if _debug:
2394 sys.stderr.write('stripping BOM\n')
2395 if encoding != 'utf-16le':
2396 sys.stderr.write('trying utf-16le instead\n')
2397 encoding = 'utf-16le'
2398 data = data[2:]
2399 elif data[:3] == '\xef\xbb\xbf':
2400 if _debug:
2401 sys.stderr.write('stripping BOM\n')
2402 if encoding != 'utf-8':
2403 sys.stderr.write('trying utf-8 instead\n')
2404 encoding = 'utf-8'
2405 data = data[3:]
2406 elif data[:4] == '\x00\x00\xfe\xff':
2407 if _debug:
2408 sys.stderr.write('stripping BOM\n')
2409 if encoding != 'utf-32be':
2410 sys.stderr.write('trying utf-32be instead\n')
2411 encoding = 'utf-32be'
2412 data = data[4:]
2413 elif data[:4] == '\xff\xfe\x00\x00':
2414 if _debug:
2415 sys.stderr.write('stripping BOM\n')
2416 if encoding != 'utf-32le':
2417 sys.stderr.write('trying utf-32le instead\n')
2418 encoding = 'utf-32le'
2419 data = data[4:]
2420 newdata = unicode(data, encoding)
2421 if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
2422 declmatch = re.compile('^<\?xml[^>]*?>')
2423 newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
2424 if declmatch.search(newdata):
2425 newdata = declmatch.sub(newdecl, newdata)
2426 else:
2427 newdata = newdecl + u'\n' + newdata
2428 return newdata.encode('utf-8')
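# A sketch with made-up input: an iso-8859-1 document is re-encoded as
# UTF-8 and its declaration rewritten to match.
def _example_toUTF8():
    data = "<?xml version='1.0' encoding='iso-8859-1'?><a>caf\xe9</a>"
    return _toUTF8(data, 'iso-8859-1')
    # returns "<?xml version='1.0' encoding='utf-8'?><a>caf\xc3\xa9</a>"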
2430 def _stripDoctype(data):
2431 '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
2433 rss_version may be 'rss091n' or None
2434 stripped_data is the same XML document, minus the DOCTYPE
2435 '''
2436 entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
2437 data = entity_pattern.sub('', data)
2438 doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
2439 doctype_results = doctype_pattern.findall(data)
2440 doctype = doctype_results and doctype_results[0] or ''
2441 if doctype.lower().count('netscape'):
2442 version = 'rss091n'
2443 else:
2444 version = None
2445 data = doctype_pattern.sub('', data)
2446 return version, data
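# A sketch with made-up input: a Netscape RSS 0.91 DOCTYPE is detected and
# stripped.
def _example_stripDoctype():
    data = '<!DOCTYPE rss PUBLIC "-//Netscape Communications//DTD RSS 0.91//EN"><rss/>'
    return _stripDoctype(data)
    # returns ('rss091n', '<rss/>')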
2448 def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
2449 '''Parse a feed from a URL, file, stream, or string'''
2450 result = FeedParserDict()
2451 result['feed'] = FeedParserDict()
2452 result['entries'] = []
2453 if _XML_AVAILABLE:
2454 result['bozo'] = 0
2455 if type(handlers) == types.InstanceType:
2456 handlers = [handlers]
2457 try:
2458 f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
2459 data = f.read()
2460 except Exception, e:
2461 result['bozo'] = 1
2462 result['bozo_exception'] = e
2463 data = ''
2464 f = None
2466 # if feed is gzip-compressed, decompress it
2467 if f and data and hasattr(f, 'headers'):
2468 if gzip and f.headers.get('content-encoding', '') == 'gzip':
2469 try:
2470 data = gzip.GzipFile(fileobj=_StringIO(data)).read()
2471 except Exception, e:
2472 # Some feeds claim to be gzipped but they're not, so
2473 # we get garbage. Ideally, we should re-request the
2474 # feed without the 'Accept-encoding: gzip' header,
2475 # but we don't.
2476 result['bozo'] = 1
2477 result['bozo_exception'] = e
2478 data = ''
2479 elif zlib and f.headers.get('content-encoding', '') == 'deflate':
2480 try:
2481 data = zlib.decompress(data, -zlib.MAX_WBITS)
2482 except Exception, e:
2483 result['bozo'] = 1
2484 result['bozo_exception'] = e
2485 data = ''
2487 # save HTTP headers
2488 if hasattr(f, 'info'):
2489 info = f.info()
2490 result['etag'] = info.getheader('ETag')
2491 last_modified = info.getheader('Last-Modified')
2492 if last_modified:
2493 result['modified'] = _parse_date(last_modified)
2494 if hasattr(f, 'url'):
2495 result['href'] = f.url
2496 result['status'] = 200
2497 if hasattr(f, 'status'):
2498 result['status'] = f.status
2499 if hasattr(f, 'headers'):
2500 result['headers'] = f.headers.dict
2501 if hasattr(f, 'close'):
2502 f.close()
2504 # there are four encodings to keep track of:
2505 # - http_encoding is the encoding declared in the Content-Type HTTP header
2506 # - xml_encoding is the encoding declared in the <?xml declaration
2507 # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
2508 # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
2509 http_headers = result.get('headers', {})
2510 result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
2511 _getCharacterEncoding(http_headers, data)
2512 if http_headers and (not acceptable_content_type):
2513 if http_headers.has_key('content-type'):
2514 bozo_message = '%s is not an XML media type' % http_headers['content-type']
2515 else:
2516 bozo_message = 'no Content-type specified'
2517 result['bozo'] = 1
2518 result['bozo_exception'] = NonXMLContentType(bozo_message)
2520 result['version'], data = _stripDoctype(data)
2522 baseuri = http_headers.get('content-location', result.get('href'))
2523 baselang = http_headers.get('content-language', None)
2525 # if server sent 304, we're done
2526 if result.get('status', 0) == 304:
2527 result['version'] = ''
2528 result['debug_message'] = 'The feed has not changed since you last checked, ' + \
2529 'so the server sent no data. This is a feature, not a bug!'
2530 return result
2532 # if there was a problem downloading, we're done
2533 if not data:
2534 return result
2536 # determine character encoding
2537 use_strict_parser = 0
2538 known_encoding = 0
2539 tried_encodings = []
2540 # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
2541 for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
2542 if not proposed_encoding: continue
2543 if proposed_encoding in tried_encodings: continue
2544 tried_encodings.append(proposed_encoding)
2545 try:
2546 data = _toUTF8(data, proposed_encoding)
2547 known_encoding = use_strict_parser = 1
2548 break
2549 except:
2550 pass
2551 # if no luck and we have auto-detection library, try that
2552 if (not known_encoding) and chardet:
2553 try:
2554 proposed_encoding = chardet.detect(data)['encoding']
2555 if proposed_encoding and (proposed_encoding not in tried_encodings):
2556 tried_encodings.append(proposed_encoding)
2557 data = _toUTF8(data, proposed_encoding)
2558 known_encoding = use_strict_parser = 1
2559 except:
2560 pass
2561 # if still no luck and we haven't tried utf-8 yet, try that
2562 if (not known_encoding) and ('utf-8' not in tried_encodings):
2563 try:
2564 proposed_encoding = 'utf-8'
2565 tried_encodings.append(proposed_encoding)
2566 data = _toUTF8(data, proposed_encoding)
2567 known_encoding = use_strict_parser = 1
2568 except:
2569 pass
2570 # if still no luck and we haven't tried windows-1252 yet, try that
2571 if (not known_encoding) and ('windows-1252' not in tried_encodings):
2572 try:
2573 proposed_encoding = 'windows-1252'
2574 tried_encodings.append(proposed_encoding)
2575 data = _toUTF8(data, proposed_encoding)
2576 known_encoding = use_strict_parser = 1
2577 except:
2578 pass
2579 # if still no luck, give up
2580 if not known_encoding:
2581 result['bozo'] = 1
2582 result['bozo_exception'] = CharacterEncodingUnknown( \
2583 'document encoding unknown, I tried ' + \
2584 '%s, %s, utf-8, and windows-1252 but nothing worked' % \
2585 (result['encoding'], xml_encoding))
2586 result['encoding'] = ''
2587 elif proposed_encoding != result['encoding']:
2588 result['bozo'] = 1
2589 result['bozo_exception'] = CharacterEncodingOverride( \
2590 'document declared as %s, but parsed as %s' % \
2591 (result['encoding'], proposed_encoding))
2592 result['encoding'] = proposed_encoding
2594 if not _XML_AVAILABLE:
2595 use_strict_parser = 0
2596 if use_strict_parser:
2597 # initialize the SAX parser
2598 feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
2599 saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
2600 saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
2601 saxparser.setContentHandler(feedparser)
2602 saxparser.setErrorHandler(feedparser)
2603 source = xml.sax.xmlreader.InputSource()
2604 source.setByteStream(_StringIO(data))
2605 if hasattr(saxparser, '_ns_stack'):
2606 # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
2607 # PyXML doesn't have this problem, and it doesn't have _ns_stack either
2608 saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
2609 try:
2610 saxparser.parse(source)
2611 except Exception, e:
2612 if _debug:
2613 import traceback
2614 traceback.print_stack()
2615 traceback.print_exc()
2616 sys.stderr.write('xml parsing failed\n')
2617 result['bozo'] = 1
2618 result['bozo_exception'] = feedparser.exc or e
2619 use_strict_parser = 0
2620 if not use_strict_parser:
2621 feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '')
2622 feedparser.feed(data)
2623 result['feed'] = feedparser.feeddata
2624 result['entries'] = feedparser.entries
2625 result['version'] = result['version'] or feedparser.version
2626 result['namespaces'] = feedparser.namespacesInUse
2627 return result
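# Typical top-level usage, echoing the docstring above (the URL is made up;
# pass etag/modified from a previous result to get conditional GET):
def _example_parse():
    d = parse('http://example.org/feed.xml')
    if d.get('status') == 304:
        return None  # unchanged since last fetch; see result['debug_message']
    return d['feed'].get('title'), len(d['entries']), d.get('bozo', 0)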
2629 if __name__ == '__main__':
2630 if not sys.argv[1:]:
2631 print __doc__
2632 sys.exit(0)
2633 else:
2634 urls = sys.argv[1:]
2635 zopeCompatibilityHack()
2636 from pprint import pprint
2637 for url in urls:
2638 print url
2639 print
2640 result = parse(url)
2641 pprint(result)
2642 print
2644 #REVISION HISTORY
2645 #1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
2646 # added Simon Fell's test suite
2647 #1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
2648 #2.0 - 10/19/2002
2649 # JD - use inchannel to watch out for image and textinput elements which can
2650 # also contain title, link, and description elements
2651 # JD - check for isPermaLink='false' attribute on guid elements
2652 # JD - replaced openAnything with open_resource supporting ETag and
2653 # If-Modified-Since request headers
2654 # JD - parse now accepts etag, modified, agent, and referrer optional
2655 # arguments
2656 # JD - modified parse to return a dictionary instead of a tuple so that any
2657 # etag or modified information can be returned and cached by the caller
2658 #2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
2659 # because of etag/modified, return the old etag/modified to the caller to
2660 # indicate why nothing is being returned
2661 #2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise it's
2662 # useless. Fixes the problem JD was addressing by adding it.
2663 #2.1 - 11/14/2002 - MAP - added gzip support
2664 #2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
2665 # start_admingeneratoragent is an example of how to handle elements with
2666 # only attributes, no content.
2667 #2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
2668 # also, make sure we send the User-Agent even if urllib2 isn't available.
2669 # Match any variation of backend.userland.com/rss namespace.
2670 #2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
2671 #2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
2672 # snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed
2673 # project name
2674 #2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
2675 # removed unnecessary urllib code -- urllib2 should always be available anyway;
2676 # return actual url, status, and full HTTP headers (as result['url'],
2677 # result['status'], and result['headers']) if parsing a remote feed over HTTP --
2678 # this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
2679 # added the latest namespace-of-the-week for RSS 2.0
2680 #2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
2681 # User-Agent (otherwise urllib2 sends two, which confuses some servers)
2682 #2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
2683 # inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
2684 #2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
2685 # textInput, and also to return the character encoding (if specified)
2686 #2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
2687 # nested divs within content (JohnD); fixed missing sys import (JohanS);
2688 # fixed regular expression to capture XML character encoding (Andrei);
2689 # added support for Atom 0.3-style links; fixed bug with textInput tracking;
2690 # added support for cloud (MartijnP); added support for multiple
2691 # category/dc:subject (MartijnP); normalize content model: 'description' gets
2692 # description (which can come from description, summary, or full content if no
2693 # description), 'content' gets dict of base/language/type/value (which can come
2694 # from content:encoded, xhtml:body, content, or fullitem);
2695 # fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
2696 # tracking; fixed bug tracking unknown tags; fixed bug tracking content when
2697 # <content> element is not in default namespace (like Pocketsoap feed);
2698 # resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
2699 # wfw:commentRSS; resolve relative URLs within embedded HTML markup in
2700 # description, xhtml:body, content, content:encoded, title, subtitle,
2701 # summary, info, tagline, and copyright; added support for pingback and
2702 # trackback namespaces
2703 #2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
2704 # namespaces, as opposed to 2.6 when I said I did but didn't really;
2705 # sanitize HTML markup within some elements; added mxTidy support (if
2706 # installed) to tidy HTML markup within some elements; fixed indentation
2707 # bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
2708 # (FazalM); universal date parsing and normalization (FazalM): 'created', 'modified',
2709 # 'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
2710 # 'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
2711 # and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
2712 #2.7.1 - 1/9/2004 - MAP - fixed bug handling &quot; and &apos;. fixed memory
2713 # leak not closing url opener (JohnD); added dc:publisher support (MarekK);
2714 # added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
2715 #2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in
2716 # encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
2717 # fixed relative URI processing for guid (skadz); added ICBM support; added
2718 # base64 support
2719 #2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
2720 # blogspot.com sites); added _debug variable
2721 #2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
2722 #3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
2723 # added several new supported namespaces; fixed bug tracking naked markup in
2724 # description; added support for enclosure; added support for source; re-added
2725 # support for cloud which got dropped somehow; added support for expirationDate
2726 #3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
2727 # xml:base URI, one for documents that don't define one explicitly and one for
2728 # documents that define an outer and an inner xml:base that goes out of scope
2729 # before the end of the document
2730 #3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
2731 #3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version']
2732 # will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
2733 # added support for creativeCommons:license and cc:license; added support for
2734 # full Atom content model in title, tagline, info, copyright, summary; fixed bug
2735 # with gzip encoding (not always telling server we support it when we do)
2736 #3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
2737 # (dictionary of 'name', 'url', 'email'); map author to author_detail if author
2738 # contains name + email address
2739 #3.0b8 - 1/28/2004 - MAP - added support for contributor
2740 #3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
2741 # support for summary
2742 #3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
2743 # xml.util.iso8601
2744 #3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
2745 # dangerous markup; fiddled with decodeEntities (not right); liberalized
2746 # date parsing even further
2747 #3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
2748 # added support to Atom 0.2 subtitle; added support for Atom content model
2749 # in copyright; better sanitizing of dangerous HTML elements with end tags
2750 # (script, frameset)
2751 #3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
2752 # etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
2753 #3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
2754 # Python 2.1
2755 #3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
2756 # fixed bug capturing author and contributor URL; fixed bug resolving relative
2757 # links in author and contributor URL; fixed bug resolving relative links in
2758 # generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
2759 # namespace tests, and included them permanently in the test suite with his
2760 # permission; fixed namespace handling under Python 2.1
2761 #3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
2762 #3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
2763 #3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
2764 # use libxml2 (if available)
2765 #3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
2766 # name was in parentheses; removed ultra-problematic mxTidy support; patch to
2767 # workaround crash in PyXML/expat when encountering invalid entities
2768 # (MarkMoraes); support for textinput/textInput
2769 #3.0b20 - 4/7/2004 - MAP - added CDF support
2770 #3.0b21 - 4/14/2004 - MAP - added Hot RSS support
2771 #3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
2772 # results dict; changed results dict to allow getting values with results.key
2773 # as well as results[key]; work around embedded illformed HTML with half
2774 # a DOCTYPE; work around malformed Content-Type header; if character encoding
2775 # is wrong, try several common ones before falling back to regexes (if this
2776 # works, bozo_exception is set to CharacterEncodingOverride); fixed character
2777 # encoding issues in BaseHTMLProcessor by tracking encoding and converting
2778 # from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
2779 # convert each value in results to Unicode (if possible), even if using
2780 # regex-based parsing
2781 #3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain
2782 # high-bit characters in attributes in embedded HTML in description (thanks
2783 # Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in
2784 # FeedParserDict; tweaked FeedParserDict.has_key to return True if asking
2785 # about a mapped key
2786 #3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and
2787 # results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could
2788 # cause the same encoding to be tried twice (even if it failed the first time);
2789 # fixed DOCTYPE stripping when DOCTYPE contained entity declarations;
2790 # better textinput and image tracking in illformed RSS 1.0 feeds
2791 #3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed
2792 # my blink tag tests
2793 #3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that
2794 # failed to parse utf-16 encoded feeds; made source into a FeedParserDict;
2795 # duplicate admin:generatorAgent/@rdf:resource in generator_detail.url;
2796 # added support for image; refactored parse() fallback logic to try other
2797 # encodings if SAX parsing fails (previously it would only try other encodings
2798 # if re-encoding failed); remove unichr madness in normalize_attrs now that
2799 # we're properly tracking encoding in and out of BaseHTMLProcessor; set
2800 # feed.language from root-level xml:lang; set entry.id from rdf:about;
2801 # send Accept header
2802 #3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between
2803 # iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are
2804 # windows-1252); fixed regression that could cause the same encoding to be
2805 # tried twice (even if it failed the first time)
2806 #3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types;
2807 # recover from malformed content-type header parameter with no equals sign
2808 # ('text/xml; charset:iso-8859-1')
2809 #3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities
2810 # to Unicode equivalents in illformed feeds (aaronsw); added and
2811 # passed tests for converting character entities to Unicode equivalents
2812 # in illformed feeds (aaronsw); test for valid parsers when setting
2813 # XML_AVAILABLE; make version and encoding available when server returns
2814 # a 304; add handlers parameter to pass arbitrary urllib2 handlers (like
2815 # digest auth or proxy support); add code to parse username/password
2816 # out of url and send as basic authentication; expose downloading-related
2817 # exceptions in bozo_exception (aaronsw); added __contains__ method to
2818 # FeedParserDict (aaronsw); added publisher_detail (aaronsw)
2819 #3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always
2820 # convert feed to UTF-8 before passing to XML parser; completely revamped
2821 # logic for determining character encoding and attempting XML parsing
2822 # (much faster); increased default timeout to 20 seconds; test for presence
2823 # of Location header on redirects; added tests for many alternate character
2824 # encodings; support various EBCDIC encodings; support UTF-16BE and
2825 # UTF-16LE with or without a BOM; support UTF-8 with a BOM; support
2826 # UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no
2827 # XML parsers are available; added support for 'Content-encoding: deflate';
2828 # send blank 'Accept-encoding: ' header if neither gzip nor zlib modules
2829 # are available
2830 #3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure
2831 # problem tracking xml:base and xml:lang if element declares it, child
2832 # doesn't, first grandchild redeclares it, and second grandchild doesn't;
2833 # refactored date parsing; defined public registerDateHandler so callers
2834 # can add support for additional date formats at runtime; added support
2835 # for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added
2836 # zopeCompatibilityHack() which turns FeedParserDict into a regular
2837 # dictionary, required for Zope compatibility, and also makes command-
2838 # line debugging easier because pprint module formats real dictionaries
2839 # better than dictionary-like objects; added NonXMLContentType exception,
2840 # which is stored in bozo_exception when a feed is served with a non-XML
2841 # media type such as 'text/plain'; respect Content-Language as default
2842 # language if no xml:lang is present; cloud dict is now FeedParserDict;
2843 # generator dict is now FeedParserDict; better tracking of xml:lang,
2844 # including support for xml:lang='' to unset the current language;
2845 # recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default
2846 # namespace; don't overwrite final status on redirects (scenarios:
2847 # redirecting to a URL that returns 304, redirecting to a URL that
2848 # redirects to another URL with a different type of redirect); add
2849 # support for HTTP 303 redirects
2850 #4.0 - MAP - support for relative URIs in xml:base attribute; fixed
2851 # encoding issue with mxTidy (phopkins); preliminary support for RFC 3229;
2852 # support for Atom 1.0; support for iTunes extensions; new 'tags' for
2853 # categories/keywords/etc. as array of dict
2854 # {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0
2855 # terminology; parse RFC 822-style dates with no time; lots of other
2856 # bug fixes
2857 #4.1 - MAP - removed socket timeout; added support for chardet library