straw/feedparser.py

   1 #!/usr/bin/env python
   2 """Universal feed parser
   3
   4 Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
   5
   6 Visit http://feedparser.org/ for the latest version
   7 Visit http://feedparser.org/docs/ for the latest documentation
   8
   9 Required: Python 2.1 or later
  10 Recommended: Python 2.3 or later
  11 Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
  12 """
  13
  14 __version__ = "4.1"# + "$Revision: 1.92 $"[11:15] + "-cvs"
  15 __license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
  16
  17 Redistribution and use in source and binary forms, with or without modification,
  18 are permitted provided that the following conditions are met:
  19
  20 * Redistributions of source code must retain the above copyright notice,
  21   this list of conditions and the following disclaimer.
  22 * Redistributions in binary form must reproduce the above copyright notice,
  23   this list of conditions and the following disclaimer in the documentation
  24   and/or other materials provided with the distribution.
  25
  26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
  27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  36 POSSIBILITY OF SUCH DAMAGE."""
  37 __author__ = "Mark Pilgrim <http://diveintomark.org/>"
  38 __contributors__ = ["Jason Diamond <http://injektilo.org/>",
  39                     "John Beimler <http://john.beimler.org/>",
  40                     "Fazal Majid <http://www.majid.info/mylos/weblog/>",
  41                     "Aaron Swartz <http://aaronsw.com/>",
  42                     "Kevin Marks <http://epeus.blogspot.com/>"]
  43 _debug = 0
  44
  45 # HTTP "User-Agent" header to send to servers when downloading feeds.
  46 # If you are embedding feedparser in a larger application, you should
  47 # change this to your application name and URL.
  48 USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__
  49
  50 # HTTP "Accept" header to send to servers when downloading feeds.  If you don't
  51 # want to send an Accept header, set this to None.
  52 ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
  53
  54 # List of preferred XML parsers, by SAX driver name.  These will be tried first,
  55 # but if they're not installed, Python will keep searching through its own list
  56 # of pre-installed parsers until it finds one that supports everything we need.
  57 PREFERRED_XML_PARSERS = ["drv_libxml2"]
  58
  59 # If you want feedparser to automatically run HTML markup through HTML Tidy, set
  60 # this to 1.  Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
  61 # or utidylib <http://utidylib.berlios.de/>.
  62 TIDY_MARKUP = 0
  63
  64 # List of Python interfaces for HTML Tidy, in order of preference.  Only useful
  65 # if TIDY_MARKUP = 1
  66 PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
  67
  68 # ---------- required modules (should come with any Python distribution) ----------
  69 import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
  70 try:
  71     from cStringIO import StringIO as _StringIO
  72 except:
  73     from StringIO import StringIO as _StringIO
  74
  75 # ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
  76
  77 # gzip is included with most Python distributions, but may not be available if you compiled your own
  78 try:
  79     import gzip
  80 except:
  81     gzip = None
  82 try:
  83     import zlib
  84 except:
  85     zlib = None
  86
  87 # If a real XML parser is available, feedparser will attempt to use it.  feedparser has
  88 # been tested with the built-in SAX parser, PyXML, and libxml2.  On platforms where the
  89 # Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
  90 # versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
  91 try:
  92     import xml.sax
  93     xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
  94     from xml.sax.saxutils import escape as _xmlescape
  95     _XML_AVAILABLE = 1
  96 except:
  97     _XML_AVAILABLE = 0
  98     def _xmlescape(data):
  99         data = data.replace('&', '&amp;')
 100         data = data.replace('>', '&gt;')
 101         data = data.replace('<', '&lt;')
 102         return data
 103
 104 # base64 support for Atom feeds that contain embedded binary data
 105 try:
 106     import base64, binascii
 107 except:
 108     base64 = binascii = None
 109
 110 # cjkcodecs and iconv_codec provide support for more character encodings.
 111 # Both are available from http://cjkpython.i18n.org/
 112 try:
 113     import cjkcodecs.aliases
 114 except:
 115     pass
 116 try:
 117     import iconv_codec
 118 except:
 119     pass
 120
 121 # chardet library auto-detects character encodings
 122 # Download from http://chardet.feedparser.org/
 123 try:
 124     import chardet
 125     if _debug:
 126         import chardet.constants
 127         chardet.constants._debug = 1
 128 except:
 129     chardet = None
 130
 131 # ---------- don't touch these ----------
 132 class ThingsNobodyCaresAboutButMe(Exception): pass
 133 class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
 134 class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
 135 class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
 136 class UndeclaredNamespace(Exception): pass
 137
 138 sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
 139 sgmllib.special = re.compile('<!')
 140 sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')
 141
 142 SUPPORTED_VERSIONS = {'': 'unknown',
 143                       'rss090': 'RSS 0.90',
 144                       'rss091n': 'RSS 0.91 (Netscape)',
 145                       'rss091u': 'RSS 0.91 (Userland)',
 146                       'rss092': 'RSS 0.92',
 147                       'rss093': 'RSS 0.93',
 148                       'rss094': 'RSS 0.94',
 149                       'rss20': 'RSS 2.0',
 150                       'rss10': 'RSS 1.0',
 151                       'rss': 'RSS (unknown version)',
 152                       'atom01': 'Atom 0.1',
 153                       'atom02': 'Atom 0.2',
 154                       'atom03': 'Atom 0.3',
 155                       'atom10': 'Atom 1.0',
 156                       'atom': 'Atom (unknown version)',
 157                       'cdf': 'CDF',
 158                       'hotrss': 'Hot RSS'
 159                       }
 160
 161 try:
 162     UserDict = dict
 163 except NameError:
 164     # Python 2.1 does not have dict
 165     from UserDict import UserDict
 166     def dict(aList):
 167         rc = {}
 168         for k, v in aList:
 169             rc[k] = v
 170         return rc
 171
 172 class FeedParserDict(UserDict):
 173     keymap = {'channel': 'feed',
 174               'items': 'entries',
 175               'guid': 'id',
 176               'date': 'updated',
 177               'date_parsed': 'updated_parsed',
 178               'description': ['subtitle', 'summary'],
 179               'url': ['href'],
 180               'modified': 'updated',
 181               'modified_parsed': 'updated_parsed',
 182               'issued': 'published',
 183               'issued_parsed': 'published_parsed',
 184               'copyright': 'rights',
 185               'copyright_detail': 'rights_detail',
 186               'tagline': 'subtitle',
 187               'tagline_detail': 'subtitle_detail'}
 188     def __getitem__(self, key):
 189         if key == 'category':
 190             return UserDict.__getitem__(self, 'tags')[0]['term']
 191         if key == 'categories':
 192             return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
 193         realkey = self.keymap.get(key, key)
 194         if type(realkey) == types.ListType:
 195             for k in realkey:
 196                 if UserDict.has_key(self, k):
 197                     return UserDict.__getitem__(self, k)
 198         if UserDict.has_key(self, key):
 199             return UserDict.__getitem__(self, key)
 200         return UserDict.__getitem__(self, realkey)
 201
 202     def __setitem__(self, key, value):
 203         for k in self.keymap.keys():
 204             if key == k:
 205                 key = self.keymap[k]
 206                 if type(key) == types.ListType:
 207                     key = key[0]
 208         return UserDict.__setitem__(self, key, value)
 209
 210     def get(self, key, default=None):
 211         if self.has_key(key):
 212             return self[key]
 213         else:
 214             return default
 215
 216     def setdefault(self, key, value):
 217         if not self.has_key(key):
 218             self[key] = value
 219         return self[key]
 220
 221     def has_key(self, key):
 222         try:
 223             return hasattr(self, key) or UserDict.has_key(self, key)
 224         except AttributeError:
 225             return False
 226
 227     def __getattr__(self, key):
 228         try:
 229             return self.__dict__[key]
 230         except KeyError:
 231             pass
 232         try:
 233             assert not key.startswith('_')
 234             return self.__getitem__(key)
 235         except:
 236             raise AttributeError, "object has no attribute '%s'" % key
 237
 238     def __setattr__(self, key, value):
 239         if key.startswith('_') or key == 'data':
 240             self.__dict__[key] = value
 241         else:
 242             return self.__setitem__(key, value)
 243
 244     def __contains__(self, key):
 245         return self.has_key(key)
 246
 247 def zopeCompatibilityHack():
 248     global FeedParserDict
 249     del FeedParserDict
 250     def FeedParserDict(aDict=None):
 251         rc = {}
 252         if aDict:
 253             rc.update(aDict)
 254         return rc
 255
 256 _ebcdic_to_ascii_map = None
 257 def _ebcdic_to_ascii(s):
 258     global _ebcdic_to_ascii_map
 259     if not _ebcdic_to_ascii_map:
 260         emap = (
 261             0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
 262             16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
 263             128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
 264             144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
 265             32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
 266             38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
 267             45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
 268             186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
 269             195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
 270             202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
 271             209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
 272             216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
 273             123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
 274             125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
 275             92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
 276             48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
 277             )
 278         import string
 279         _ebcdic_to_ascii_map = string.maketrans( \
 280             ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
 281     return s.translate(_ebcdic_to_ascii_map)
 282
 283 _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
 284 def _urljoin(base, uri):
 285     uri = _urifixer.sub(r'\1\3', uri)
 286     return urlparse.urljoin(base, uri)
 287
 288 class _FeedParserMixin:
 289     namespaces = {'': '',
 290                   'http://backend.userland.com/rss': '',
 291                   'http://blogs.law.harvard.edu/tech/rss': '',
 292                   'http://purl.org/rss/1.0/': '',
 293                   'http://my.netscape.com/rdf/simple/0.9/': '',
 294                   'http://example.com/newformat#': '',
 295                   'http://example.com/necho': '',
 296                   'http://purl.org/echo/': '',
 297                   'uri/of/echo/namespace#': '',
 298                   'http://purl.org/pie/': '',
 299                   'http://purl.org/atom/ns#': '',
 300                   'http://www.w3.org/2005/Atom': '',
 301                   'http://purl.org/rss/1.0/modules/rss091#': '',
 302
 303                   'http://webns.net/mvcb/':                               'admin',
 304                   'http://purl.org/rss/1.0/modules/aggregation/':         'ag',
 305                   'http://purl.org/rss/1.0/modules/annotate/':            'annotate',
 306                   'http://media.tangent.org/rss/1.0/':                    'audio',
 307                   'http://backend.userland.com/blogChannelModule':        'blogChannel',
 308                   'http://web.resource.org/cc/':                          'cc',
 309                   'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
 310                   'http://purl.org/rss/1.0/modules/company':              'co',
 311                   'http://purl.org/rss/1.0/modules/content/':             'content',
 312                   'http://my.theinfo.org/changed/1.0/rss/':               'cp',
 313                   'http://purl.org/dc/elements/1.1/':                     'dc',
 314                   'http://purl.org/dc/terms/':                            'dcterms',
 315                   'http://purl.org/rss/1.0/modules/email/':               'email',
 316                   'http://purl.org/rss/1.0/modules/event/':               'ev',
 317                   'http://rssnamespace.org/feedburner/ext/1.0':           'feedburner',
 318                   'http://freshmeat.net/rss/fm/':                         'fm',
 319                   'http://xmlns.com/foaf/0.1/':                           'foaf',
 320                   'http://www.w3.org/2003/01/geo/wgs84_pos#':             'geo',
 321                   'http://postneo.com/icbm/':                             'icbm',
 322                   'http://purl.org/rss/1.0/modules/image/':               'image',
 323                   'http://www.itunes.com/DTDs/PodCast-1.0.dtd':           'itunes',
 324                   'http://example.com/DTDs/PodCast-1.0.dtd':              'itunes',
 325                   'http://purl.org/rss/1.0/modules/link/':                'l',
 326                   'http://search.yahoo.com/mrss':                         'media',
 327                   'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
 328                   'http://prismstandard.org/namespaces/1.2/basic/':       'prism',
 329                   'http://www.w3.org/1999/02/22-rdf-syntax-ns#':          'rdf',
 330                   'http://www.w3.org/2000/01/rdf-schema#':                'rdfs',
 331                   'http://purl.org/rss/1.0/modules/reference/':           'ref',
 332                   'http://purl.org/rss/1.0/modules/richequiv/':           'reqv',
 333                   'http://purl.org/rss/1.0/modules/search/':              'search',
 334                   'http://purl.org/rss/1.0/modules/slash/':               'slash',
 335                   'http://schemas.xmlsoap.org/soap/envelope/':            'soap',
 336                   'http://purl.org/rss/1.0/modules/servicestatus/':       'ss',
 337                   'http://hacks.benhammersley.com/rss/streaming/':        'str',
 338                   'http://purl.org/rss/1.0/modules/subscription/':        'sub',
 339                   'http://purl.org/rss/1.0/modules/syndication/':         'sy',
 340                   'http://purl.org/rss/1.0/modules/taxonomy/':            'taxo',
 341                   'http://purl.org/rss/1.0/modules/threading/':           'thr',
 342                   'http://purl.org/rss/1.0/modules/textinput/':           'ti',
 343                   'http://madskills.com/public/xml/rss/module/trackback/':'trackback',
 344                   'http://wellformedweb.org/commentAPI/':                 'wfw',
 345                   'http://purl.org/rss/1.0/modules/wiki/':                'wiki',
 346                   'http://www.w3.org/1999/xhtml':                         'xhtml',
 347                   'http://www.w3.org/XML/1998/namespace':                 'xml',
 348                   'http://schemas.pocketsoap.com/rss/myDescModule/':      'szf'
 349 }
 350     _matchnamespaces = {}
 351
 352     can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
 353     can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
 354     can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
 355     html_types = ['text/html', 'application/xhtml+xml']
 356
 357     def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
 358         if _debug: sys.stderr.write('initializing FeedParser\n')
 359         if not self._matchnamespaces:
 360             for k, v in self.namespaces.items():
 361                 self._matchnamespaces[k.lower()] = v
 362         self.feeddata = FeedParserDict() # feed-level data
 363         self.encoding = encoding # character encoding
 364         self.entries = [] # list of entry-level data
 365         self.version = '' # feed type/version, see SUPPORTED_VERSIONS
 366         self.namespacesInUse = {} # dictionary of namespaces defined by the feed
 367
 368         # the following are used internally to track state;
 369         # this is really out of control and should be refactored
 370         self.infeed = 0
 371         self.inentry = 0
 372         self.incontent = 0
 373         self.intextinput = 0
 374         self.inimage = 0
 375         self.inauthor = 0
 376         self.incontributor = 0
 377         self.inpublisher = 0
 378         self.insource = 0
 379         self.sourcedata = FeedParserDict()
 380         self.contentparams = FeedParserDict()
 381         self._summaryKey = None
 382         self.namespacemap = {}
 383         self.elementstack = []
 384         self.basestack = []
 385         self.langstack = []
 386         self.baseuri = baseuri or ''
 387         self.lang = baselang or None
 388         if baselang:
 389             self.feeddata['language'] = baselang
 390
 391     def unknown_starttag(self, tag, attrs):
 392         if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
 393         # normalize attrs
 394         attrs = [(k.lower(), v) for k, v in attrs]
 395         attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
 396
 397         # track xml:base and xml:lang
 398         attrsD = dict(attrs)
 399         baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
 400         self.baseuri = _urljoin(self.baseuri, baseuri)
 401         lang = attrsD.get('xml:lang', attrsD.get('lang'))
 402         if lang == '':
 403             # xml:lang could be explicitly set to '', we need to capture that
 404             lang = None
 405         elif lang is None:
 406             # if no xml:lang is specified, use parent lang
 407             lang = self.lang
 408         if lang:
 409             if tag in ('feed', 'rss', 'rdf:RDF'):
 410                 self.feeddata['language'] = lang
 411         self.lang = lang
 412         self.basestack.append(self.baseuri)
 413         self.langstack.append(lang)
 414
 415         # track namespaces
 416         for prefix, uri in attrs:
 417             if prefix.startswith('xmlns:'):
 418                 self.trackNamespace(prefix[6:], uri)
 419             elif prefix == 'xmlns':
 420                 self.trackNamespace(None, uri)
 421
 422         # track inline content
 423         if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
 424             # element declared itself as escaped markup, but it isn't really
 425             self.contentparams['type'] = 'application/xhtml+xml'
 426         if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
 427             # Note: probably shouldn't simply recreate localname here, but
 428             # our namespace handling isn't actually 100% correct in cases where
 429             # the feed redefines the default namespace (which is actually
 430             # the usual case for inline content, thanks Sam), so here we
 431             # cheat and just reconstruct the element based on localname
 432             # because that compensates for the bugs in our namespace handling.
 433             # This will horribly munge inline content with non-empty qnames,
 434             # but nobody actually does that, so I'm not fixing it.
 435             tag = tag.split(':')[-1]
 436             return self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0)
 437
 438         # match namespaces
 439         if tag.find(':') <> -1:
 440             prefix, suffix = tag.split(':', 1)
 441         else:
 442             prefix, suffix = '', tag
 443         prefix = self.namespacemap.get(prefix, prefix)
 444         if prefix:
 445             prefix = prefix + '_'
 446
 447         # special hack for better tracking of empty textinput/image elements in illformed feeds
 448         if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
 449             self.intextinput = 0
 450         if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
 451             self.inimage = 0
 452
 453         # call special handler (if defined) or default handler
 454         methodname = '_start_' + prefix + suffix
 455         try:
 456             method = getattr(self, methodname)
 457             return method(attrsD)
 458         except AttributeError:
 459             return self.push(prefix + suffix, 1)
 460
 461     def unknown_endtag(self, tag):
 462         if _debug: sys.stderr.write('end %s\n' % tag)
 463         # match namespaces
 464         if tag.find(':') <> -1:
 465             prefix, suffix = tag.split(':', 1)
 466         else:
 467             prefix, suffix = '', tag
 468         prefix = self.namespacemap.get(prefix, prefix)
 469         if prefix:
 470             prefix = prefix + '_'
 471
 472         # call special handler (if defined) or default handler
 473         methodname = '_end_' + prefix + suffix
 474         try:
 475             method = getattr(self, methodname)
 476             method()
 477         except AttributeError:
 478             self.pop(prefix + suffix)
 479
 480         # track inline content
 481         if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
 482             # element declared itself as escaped markup, but it isn't really
 483             self.contentparams['type'] = 'application/xhtml+xml'
 484         if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
 485             tag = tag.split(':')[-1]
 486             self.handle_data('</%s>' % tag, escape=0)
 487
 488         # track xml:base and xml:lang going out of scope
 489         if self.basestack:
 490             self.basestack.pop()
 491             if self.basestack and self.basestack[-1]:
 492                 self.baseuri = self.basestack[-1]
 493         if self.langstack:
 494             self.langstack.pop()
 495             if self.langstack: # and (self.langstack[-1] is not None):
 496                 self.lang = self.langstack[-1]
 497
 498     def handle_charref(self, ref):
 499         # called for each character reference, e.g. for '&#160;', ref will be '160'
 500         if not self.elementstack: return
 501         ref = ref.lower()
 502         if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
 503             text = '&#%s;' % ref
 504         else:
 505             if ref[0] == 'x':
 506                 c = int(ref[1:], 16)
 507             else:
 508                 c = int(ref)
 509             text = unichr(c).encode('utf-8')
 510         self.elementstack[-1][2].append(text)
 511
 512     def handle_entityref(self, ref):
 513         # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
 514         if not self.elementstack: return
 515         if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
 516         if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
 517             text = '&%s;' % ref
 518         else:
 519             # entity resolution graciously donated by Aaron Swartz
 520             def name2cp(k):
 521                 import htmlentitydefs
 522                 if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
 523                     return htmlentitydefs.name2codepoint[k]
 524                 k = htmlentitydefs.entitydefs[k]
 525                 if k.startswith('&#') and k.endswith(';'):
 526                     return int(k[2:-1]) # not in latin-1
 527                 return ord(k)
 528             try: name2cp(ref)
 529             except KeyError: text = '&%s;' % ref
 530             else: text = unichr(name2cp(ref)).encode('utf-8')
 531         self.elementstack[-1][2].append(text)
 532
 533     def handle_data(self, text, escape=1):
 534         # called for each block of plain text, i.e. outside of any tag and
 535         # not containing any character or entity references
 536         if not self.elementstack: return
 537         if escape and self.contentparams.get('type') == 'application/xhtml+xml':
 538             text = _xmlescape(text)
 539         self.elementstack[-1][2].append(text)
 540
 541     def handle_comment(self, text):
 542         # called for each comment, e.g. <!-- insert message here -->
 543         pass
 544
 545     def handle_pi(self, text):
 546         # called for each processing instruction, e.g. <?instruction>
 547         pass
 548
 549     def handle_decl(self, text):
 550         pass
 551
 552     def parse_declaration(self, i):
 553         # override internal declaration handler to handle CDATA blocks
 554         if _debug: sys.stderr.write('entering parse_declaration\n')
 555         if self.rawdata[i:i+9] == '<![CDATA[':
 556             k = self.rawdata.find(']]>', i)
 557             if k == -1: k = len(self.rawdata)
 558             self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
 559             return k+3
 560         else:
 561             k = self.rawdata.find('>', i)
 562             return k+1
 563
 564     def mapContentType(self, contentType):
 565         contentType = contentType.lower()
 566         if contentType == 'text':
 567             contentType = 'text/plain'
 568         elif contentType == 'html':
 569             contentType = 'text/html'
 570         elif contentType == 'xhtml':
 571             contentType = 'application/xhtml+xml'
 572         return contentType
 573
 574     def trackNamespace(self, prefix, uri):
 575         loweruri = uri.lower()
 576         if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
 577             self.version = 'rss090'
 578         if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
 579             self.version = 'rss10'
 580         if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
 581             self.version = 'atom10'
 582         if loweruri.find('backend.userland.com/rss') <> -1:
 583             # match any backend.userland.com namespace
 584             uri = 'http://backend.userland.com/rss'
 585             loweruri = uri
 586         if self._matchnamespaces.has_key(loweruri):
 587             self.namespacemap[prefix] = self._matchnamespaces[loweruri]
 588             self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
 589         else:
 590             self.namespacesInUse[prefix or ''] = uri
 591
 592     def resolveURI(self, uri):
 593         return _urljoin(self.baseuri or '', uri)
 594
 595     def decodeEntities(self, element, data):
 596         return data
 597
 598     def push(self, element, expectingText):
 599         self.elementstack.append([element, expectingText, []])
 600
 601     def pop(self, element, stripWhitespace=1):
 602         if not self.elementstack: return
 603         if self.elementstack[-1][0] != element: return
 604
 605         element, expectingText, pieces = self.elementstack.pop()
 606         output = ''.join(pieces)
 607         if stripWhitespace:
 608             output = output.strip()
 609         if not expectingText: return output
 610
 611         # decode base64 content
 612         if base64 and self.contentparams.get('base64', 0):
 613             try:
 614                 output = base64.decodestring(output)
 615             except binascii.Error:
 616                 pass
 617             except binascii.Incomplete:
 618                 pass
 619
 620         # resolve relative URIs
 621         if (element in self.can_be_relative_uri) and output:
 622             output = self.resolveURI(output)
 623
 624         # decode entities within embedded markup
 625         if not self.contentparams.get('base64', 0):
 626             output = self.decodeEntities(element, output)
 627
 628         # remove temporary cruft from contentparams
 629         try:
 630             del self.contentparams['mode']
 631         except KeyError:
 632             pass
 633         try:
 634             del self.contentparams['base64']
 635         except KeyError:
 636             pass
 637
 638         # resolve relative URIs within embedded markup
 639         if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
 640             if element in self.can_contain_relative_uris:
 641                 output = _resolveRelativeURIs(output, self.baseuri, self.encoding)
 642
 643         # sanitize embedded markup
 644         if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
 645             if element in self.can_contain_dangerous_markup:
 646                 output = _sanitizeHTML(output, self.encoding)
 647
 648         if self.encoding and type(output) != type(u''):
 649             try:
 650                 output = unicode(output, self.encoding)
 651             except:
 652                 pass
 653
 654         # categories/tags/keywords/whatever are handled in _end_category
 655         if element == 'category':
 656             return output
 657
 658         # store output in appropriate place(s)
 659         if self.inentry and not self.insource:
 660             if element == 'content':
 661                 self.entries[-1].setdefault(element, [])
 662                 contentparams = copy.deepcopy(self.contentparams)
 663                 contentparams['value'] = output
 664                 self.entries[-1][element].append(contentparams)
 665             elif element == 'link':
 666                 self.entries[-1][element] = output
 667                 if output:
 668                     self.entries[-1]['links'][-1]['href'] = output
 669             else:
 670                 if element == 'description':
 671                     element = 'summary'
 672                 self.entries[-1][element] = output
 673                 if self.incontent:
 674                     contentparams = copy.deepcopy(self.contentparams)
 675                     contentparams['value'] = output
 676                     self.entries[-1][element + '_detail'] = contentparams
 677         elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage):
 678             context = self._getContext()
 679             if element == 'description':
 680                 element = 'subtitle'
 681             context[element] = output
 682             if element == 'link':
 683                 context['links'][-1]['href'] = output
 684             elif self.incontent:
 685                 contentparams = copy.deepcopy(self.contentparams)
 686                 contentparams['value'] = output
 687                 context[element + '_detail'] = contentparams
 688         return output
 689
 690     def pushContent(self, tag, attrsD, defaultContentType, expectingText):
 691         self.incontent += 1
 692         self.contentparams = FeedParserDict({
 693             'type': self.mapContentType(attrsD.get('type', defaultContentType)),
 694             'language': self.lang,
 695             'base': self.baseuri})
 696         self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
 697         self.push(tag, expectingText)
 698
 699     def popContent(self, tag):
 700         value = self.pop(tag)
 701         self.incontent -= 1
 702         self.contentparams.clear()
 703         return value
 704
 705     def _mapToStandardPrefix(self, name):
 706         colonpos = name.find(':')
 707         if colonpos <> -1:
 708             prefix = name[:colonpos]
 709             suffix = name[colonpos+1:]
 710             prefix = self.namespacemap.get(prefix, prefix)
 711             name = prefix + ':' + suffix
 712         return name
 713
 714     def _getAttribute(self, attrsD, name):
 715         return attrsD.get(self._mapToStandardPrefix(name))
 716
 717     def _isBase64(self, attrsD, contentparams):
 718         if attrsD.get('mode', '') == 'base64':
 719             return 1
 720         if self.contentparams['type'].startswith('text/'):
 721             return 0
 722         if self.contentparams['type'].endswith('+xml'):
 723             return 0
 724         if self.contentparams['type'].endswith('/xml'):
 725             return 0
 726         return 1
 727
 728     def _itsAnHrefDamnIt(self, attrsD):
 729         href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
 730         if href:
 731             try:
 732                 del attrsD['url']
 733             except KeyError:
 734                 pass
 735             try:
 736                 del attrsD['uri']
 737             except KeyError:
 738                 pass
 739             attrsD['href'] = href
 740         return attrsD
 741
 742     def _save(self, key, value):
 743         context = self._getContext()
 744         context.setdefault(key, value)
 745
 746     def _start_rss(self, attrsD):
 747         versionmap = {'0.91': 'rss091u',
 748                       '0.92': 'rss092',
 749                       '0.93': 'rss093',
 750                       '0.94': 'rss094'}
 751         if not self.version:
 752             attr_version = attrsD.get('version', '')
 753             version = versionmap.get(attr_version)
 754             if version:
 755                 self.version = version
 756             elif attr_version.startswith('2.'):
 757                 self.version = 'rss20'
 758             else:
 759                 self.version = 'rss'
 760
 761     def _start_dlhottitles(self, attrsD):
 762         self.version = 'hotrss'
 763
 764     def _start_channel(self, attrsD):
 765         self.infeed = 1
 766         self._cdf_common(attrsD)
 767     _start_feedinfo = _start_channel
 768
 769     def _cdf_common(self, attrsD):
 770         if attrsD.has_key('lastmod'):
 771             self._start_modified({})
 772             self.elementstack[-1][-1] = attrsD['lastmod']
 773             self._end_modified()
 774         if attrsD.has_key('href'):
 775             self._start_link({})
 776             self.elementstack[-1][-1] = attrsD['href']
 777             self._end_link()
 778
 779     def _start_feed(self, attrsD):
 780         self.infeed = 1
 781         versionmap = {'0.1': 'atom01',
 782                       '0.2': 'atom02',
 783                       '0.3': 'atom03'}
 784         if not self.version:
 785             attr_version = attrsD.get('version')
 786             version = versionmap.get(attr_version)
 787             if version:
 788                 self.version = version
 789             else:
 790                 self.version = 'atom'
 791
 792     def _end_channel(self):
 793         self.infeed = 0
 794     _end_feed = _end_channel
 795
 796     def _start_image(self, attrsD):
 797         self.inimage = 1
 798         self.push('image', 0)
 799         context = self._getContext()
 800         context.setdefault('image', FeedParserDict())
 801
 802     def _end_image(self):
 803         self.pop('image')
 804         self.inimage = 0
 805
 806     def _start_textinput(self, attrsD):
 807         self.intextinput = 1
 808         self.push('textinput', 0)
 809         context = self._getContext()
 810         context.setdefault('textinput', FeedParserDict())
 811     _start_textInput = _start_textinput
 812
 813     def _end_textinput(self):
 814         self.pop('textinput')
 815         self.intextinput = 0
 816     _end_textInput = _end_textinput
 817
 818     def _start_author(self, attrsD):
 819         self.inauthor = 1
 820         self.push('author', 1)
 821     _start_managingeditor = _start_author
 822     _start_dc_author = _start_author
 823     _start_dc_creator = _start_author
 824     _start_itunes_author = _start_author
 825
 826     def _end_author(self):
 827         self.pop('author')
 828         self.inauthor = 0
 829         self._sync_author_detail()
 830     _end_managingeditor = _end_author
 831     _end_dc_author = _end_author
 832     _end_dc_creator = _end_author
 833     _end_itunes_author = _end_author
 834
 835     def _start_itunes_owner(self, attrsD):
 836         self.inpublisher = 1
 837         self.push('publisher', 0)
 838
 839     def _end_itunes_owner(self):
 840         self.pop('publisher')
 841         self.inpublisher = 0
 842         self._sync_author_detail('publisher')
 843
 844     def _start_contributor(self, attrsD):
 845         self.incontributor = 1
 846         context = self._getContext()
 847         context.setdefault('contributors', [])
 848         context['contributors'].append(FeedParserDict())
 849         self.push('contributor', 0)
 850
 851     def _end_contributor(self):
 852         self.pop('contributor')
 853         self.incontributor = 0
 854
 855     def _start_dc_contributor(self, attrsD):
 856         self.incontributor = 1
 857         context = self._getContext()
 858         context.setdefault('contributors', [])
 859         context['contributors'].append(FeedParserDict())
 860         self.push('name', 0)
 861
 862     def _end_dc_contributor(self):
 863         self._end_name()
 864         self.incontributor = 0
 865
 866     def _start_name(self, attrsD):
 867         self.push('name', 0)
 868     _start_itunes_name = _start_name
 869
 870     def _end_name(self):
 871         value = self.pop('name')
 872         if self.inpublisher:
 873             self._save_author('name', value, 'publisher')
 874         elif self.inauthor:
 875             self._save_author('name', value)
 876         elif self.incontributor:
 877             self._save_contributor('name', value)
 878         elif self.intextinput:
 879             context = self._getContext()
 880             context['textinput']['name'] = value
 881     _end_itunes_name = _end_name
 882
 883     def _start_width(self, attrsD):
 884         self.push('width', 0)
 885
 886     def _end_width(self):
 887         value = self.pop('width')
 888         try:
 889             value = int(value)
 890         except:
 891             value = 0
 892         if self.inimage:
 893             context = self._getContext()
 894             context['image']['width'] = value
 895
 896     def _start_height(self, attrsD):
 897         self.push('height', 0)
 898
 899     def _end_height(self):
 900         value = self.pop('height')
 901         try:
 902             value = int(value)
 903         except:
 904             value = 0
 905         if self.inimage:
 906             context = self._getContext()
 907             context['image']['height'] = value
 908
 909     def _start_url(self, attrsD):
 910         self.push('href', 1)
 911     _start_homepage = _start_url
 912     _start_uri = _start_url
 913
 914     def _end_url(self):
 915         value = self.pop('href')
 916         if self.inauthor:
 917             self._save_author('href', value)
 918         elif self.incontributor:
 919             self._save_contributor('href', value)
 920         elif self.inimage:
 921             context = self._getContext()
 922             context['image']['href'] = value
 923         elif self.intextinput:
 924             context = self._getContext()
 925             context['textinput']['link'] = value
 926     _end_homepage = _end_url
 927     _end_uri = _end_url
 928
 929     def _start_email(self, attrsD):
 930         self.push('email', 0)
 931     _start_itunes_email = _start_email
 932
 933     def _end_email(self):
 934         value = self.pop('email')
 935         if self.inpublisher:
 936             self._save_author('email', value, 'publisher')
 937         elif self.inauthor:
 938             self._save_author('email', value)
 939         elif self.incontributor:
 940             self._save_contributor('email', value)
 941     _end_itunes_email = _end_email
 942
 943     def _getContext(self):
 944         if self.insource:
 945             context = self.sourcedata
 946         elif self.inentry:
 947             context = self.entries[-1]
 948         else:
 949             context = self.feeddata
 950         return context
 951
 952     def _save_author(self, key, value, prefix='author'):
 953         context = self._getContext()
 954         context.setdefault(prefix + '_detail', FeedParserDict())
 955         context[prefix + '_detail'][key] = value
 956         self._sync_author_detail()
 957
 958     def _save_contributor(self, key, value):
 959         context = self._getContext()
 960         context.setdefault('contributors', [FeedParserDict()])
 961         context['contributors'][-1][key] = value
 962
 963     def _sync_author_detail(self, key='author'):
 964         context = self._getContext()
 965         detail = context.get('%s_detail' % key)
 966         if detail:
 967             name = detail.get('name')
 968             email = detail.get('email')
 969             if name and email:
 970                 context[key] = '%s (%s)' % (name, email)
 971             elif name:
 972                 context[key] = name
 973             elif email:
 974                 context[key] = email
 975         else:
 976             author = context.get(key)
 977             if not author: return
 978             emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author)
 979             if not emailmatch: return
 980             email = emailmatch.group(0)
 981             # probably a better way to do the following, but it passes all the tests
 982             author = author.replace(email, '')
 983             author = author.replace('()', '')
 984             author = author.strip()
 985             if author and (author[0] == '('):
 986                 author = author[1:]
 987             if author and (author[-1] == ')'):
 988                 author = author[:-1]
 989             author = author.strip()
 990             context.setdefault('%s_detail' % key, FeedParserDict())
 991             context['%s_detail' % key]['name'] = author
 992             context['%s_detail' % key]['email'] = email
 993
 994     def _start_subtitle(self, attrsD):
 995         self.pushContent('subtitle', attrsD, 'text/plain', 1)
 996     _start_tagline = _start_subtitle
 997     _start_itunes_subtitle = _start_subtitle
 998
 999     def _end_subtitle(self):
1000         self.popContent('subtitle')
1001     _end_tagline = _end_subtitle
1002     _end_itunes_subtitle = _end_subtitle
1003
1004     def _start_rights(self, attrsD):
1005         self.pushContent('rights', attrsD, 'text/plain', 1)
1006     _start_dc_rights = _start_rights
1007     _start_copyright = _start_rights
1008
1009     def _end_rights(self):
1010         self.popContent('rights')
1011     _end_dc_rights = _end_rights
1012     _end_copyright = _end_rights
1013
1014     def _start_item(self, attrsD):
1015         self.entries.append(FeedParserDict())
1016         self.push('item', 0)
1017         self.inentry = 1
1018         self.guidislink = 0
1019         id = self._getAttribute(attrsD, 'rdf:about')
1020         if id:
1021             context = self._getContext()
1022             context['id'] = id
1023         self._cdf_common(attrsD)
1024     _start_entry = _start_item
1025     _start_product = _start_item
1026
1027     def _end_item(self):
1028         self.pop('item')
1029         self.inentry = 0
1030     _end_entry = _end_item
1031
1032     def _start_dc_language(self, attrsD):
1033         self.push('language', 1)
1034     _start_language = _start_dc_language
1035
1036     def _end_dc_language(self):
1037         self.lang = self.pop('language')
1038     _end_language = _end_dc_language
1039
1040     def _start_dc_publisher(self, attrsD):
1041         self.push('publisher', 1)
1042     _start_webmaster = _start_dc_publisher
1043
1044     def _end_dc_publisher(self):
1045         self.pop('publisher')
1046         self._sync_author_detail('publisher')
1047     _end_webmaster = _end_dc_publisher
1048
1049     def _start_published(self, attrsD):
1050         self.push('published', 1)
1051     _start_dcterms_issued = _start_published
1052     _start_issued = _start_published
1053
1054     def _end_published(self):
1055         value = self.pop('published')
1056         self._save('published_parsed', _parse_date(value))
1057     _end_dcterms_issued = _end_published
1058     _end_issued = _end_published
1059
1060     def _start_updated(self, attrsD):
1061         self.push('updated', 1)
1062     _start_modified = _start_updated
1063     _start_dcterms_modified = _start_updated
1064     _start_pubdate = _start_updated
1065     _start_dc_date = _start_updated
1066
1067     def _end_updated(self):
1068         value = self.pop('updated')
1069         parsed_value = _parse_date(value)
1070         self._save('updated_parsed', parsed_value)
1071     _end_modified = _end_updated
1072     _end_dcterms_modified = _end_updated
1073     _end_pubdate = _end_updated
1074     _end_dc_date = _end_updated
1075
1076     def _start_created(self, attrsD):
1077         self.push('created', 1)
1078     _start_dcterms_created = _start_created
1079
1080     def _end_created(self):
1081         value = self.pop('created')
1082         self._save('created_parsed', _parse_date(value))
1083     _end_dcterms_created = _end_created
1084
1085     def _start_expirationdate(self, attrsD):
1086         self.push('expired', 1)
1087
1088     def _end_expirationdate(self):
1089         self._save('expired_parsed', _parse_date(self.pop('expired')))
1090
1091     def _start_cc_license(self, attrsD):
1092         self.push('license', 1)
1093         value = self._getAttribute(attrsD, 'rdf:resource')
1094         if value:
1095             self.elementstack[-1][2].append(value)
1096         self.pop('license')
1097
1098     def _start_creativecommons_license(self, attrsD):
1099         self.push('license', 1)
1100
1101     def _end_creativecommons_license(self):
1102         self.pop('license')
1103
1104     def _addTag(self, term, scheme, label):
1105         context = self._getContext()
1106         tags = context.setdefault('tags', [])
1107         if (not term) and (not scheme) and (not label): return
1108         value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
1109         if value not in tags:
1110             tags.append(FeedParserDict({'term': term, 'scheme': scheme, 'label': label}))
1111
1112     def _start_category(self, attrsD):
1113         if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
1114         term = attrsD.get('term')
1115         scheme = attrsD.get('scheme', attrsD.get('domain'))
1116         label = attrsD.get('label')
1117         self._addTag(term, scheme, label)
1118         self.push('category', 1)
1119     _start_dc_subject = _start_category
1120     _start_keywords = _start_category
1121
1122     def _end_itunes_keywords(self):
1123         for term in self.pop('itunes_keywords').split():
1124             self._addTag(term, 'http://www.itunes.com/', None)
1125
1126     def _start_itunes_category(self, attrsD):
1127         self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
1128         self.push('category', 1)
1129
1130     def _end_category(self):
1131         value = self.pop('category')
1132         if not value: return
1133         context = self._getContext()
1134         tags = context['tags']
1135         if value and len(tags) and not tags[-1]['term']:
1136             tags[-1]['term'] = value
1137         else:
1138             self._addTag(value, None, None)
1139     _end_dc_subject = _end_category
1140     _end_keywords = _end_category
1141     _end_itunes_category = _end_category
1142
1143     def _start_cloud(self, attrsD):
1144         self._getContext()['cloud'] = FeedParserDict(attrsD)
1145
1146     def _start_link(self, attrsD):
1147         attrsD.setdefault('rel', 'alternate')
1148         attrsD.setdefault('type', 'text/html')
1149         attrsD = self._itsAnHrefDamnIt(attrsD)
1150         if attrsD.has_key('href'):
1151             attrsD['href'] = self.resolveURI(attrsD['href'])
1152         expectingText = self.infeed or self.inentry or self.insource
1153         context = self._getContext()
1154         context.setdefault('links', [])
1155         context['links'].append(FeedParserDict(attrsD))
1156         if attrsD['rel'] == 'enclosure':
1157             self._start_enclosure(attrsD)
1158         if attrsD.has_key('href'):
1159             expectingText = 0
1160             if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
1161                 context['link'] = attrsD['href']
1162         else:
1163             self.push('link', expectingText)
1164     _start_producturl = _start_link
1165
1166     def _end_link(self):
1167         value = self.pop('link')
1168         context = self._getContext()
1169         if self.intextinput:
1170             context['textinput']['link'] = value
1171         if self.inimage:
1172             context['image']['link'] = value
1173     _end_producturl = _end_link
1174
1175     def _start_guid(self, attrsD):
1176         self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
1177         self.push('id', 1)
1178
1179     def _end_guid(self):
1180         value = self.pop('id')
1181         self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
1182         if self.guidislink:
1183             # guid acts as link, but only if 'ispermalink' is not present or is 'true',
1184             # and only if the item doesn't already have a link element
1185             self._save('link', value)
1186
1187     def _start_title(self, attrsD):
1188         self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
1189     _start_dc_title = _start_title
1190     _start_media_title = _start_title
1191
1192     def _end_title(self):
1193         value = self.popContent('title')
1194         context = self._getContext()
1195         if self.intextinput:
1196             context['textinput']['title'] = value
1197         elif self.inimage:
1198             context['image']['title'] = value
1199     _end_dc_title = _end_title
1200     _end_media_title = _end_title
1201
1202     def _start_description(self, attrsD):
1203         context = self._getContext()
1204         if context.has_key('summary'):
1205             self._summaryKey = 'content'
1206             self._start_content(attrsD)
1207         else:
1208             self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)
1209
1210     def _start_abstract(self, attrsD):
1211         self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
1212
1213     def _end_description(self):
1214         if self._summaryKey == 'content':
1215             self._end_content()
1216         else:
1217             value = self.popContent('description')
1218             context = self._getContext()
1219             if self.intextinput:
1220                 context['textinput']['description'] = value
1221             elif self.inimage:
1222                 context['image']['description'] = value
1223         self._summaryKey = None
1224     _end_abstract = _end_description
1225
1226     def _start_info(self, attrsD):
1227         self.pushContent('info', attrsD, 'text/plain', 1)
1228     _start_feedburner_browserfriendly = _start_info
1229
1230     def _end_info(self):
1231         self.popContent('info')
1232     _end_feedburner_browserfriendly = _end_info
1233
1234     def _start_generator(self, attrsD):
1235         if attrsD:
1236             attrsD = self._itsAnHrefDamnIt(attrsD)
1237             if attrsD.has_key('href'):
1238                 attrsD['href'] = self.resolveURI(attrsD['href'])
1239         self._getContext()['generator_detail'] = FeedParserDict(attrsD)
1240         self.push('generator', 1)
1241
1242     def _end_generator(self):
1243         value = self.pop('generator')
1244         context = self._getContext()
1245         if context.has_key('generator_detail'):
1246             context['generator_detail']['name'] = value
1247
1248     def _start_admin_generatoragent(self, attrsD):
1249         self.push('generator', 1)
1250         value = self._getAttribute(attrsD, 'rdf:resource')
1251         if value:
1252             self.elementstack[-1][2].append(value)
1253         self.pop('generator')
1254         self._getContext()['generator_detail'] = FeedParserDict({'href': value})
1255
1256     def _start_admin_errorreportsto(self, attrsD):
1257         self.push('errorreportsto', 1)
1258         value = self._getAttribute(attrsD, 'rdf:resource')
1259         if value:
1260             self.elementstack[-1][2].append(value)
1261         self.pop('errorreportsto')
1262
1263     def _start_summary(self, attrsD):
1264         context = self._getContext()
1265         if context.has_key('summary'):
1266             self._summaryKey = 'content'
1267             self._start_content(attrsD)
1268         else:
1269             self._summaryKey = 'summary'
1270             self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
1271     _start_itunes_summary = _start_summary
1272
1273     def _end_summary(self):
1274         if self._summaryKey == 'content':
1275             self._end_content()
1276         else:
1277             self.popContent(self._summaryKey or 'summary')
1278         self._summaryKey = None
1279     _end_itunes_summary = _end_summary
1280
1281     def _start_enclosure(self, attrsD):
1282         attrsD = self._itsAnHrefDamnIt(attrsD)
1283         self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD))
1284         href = attrsD.get('href')
1285         if href:
1286             context = self._getContext()
1287             if not context.get('id'):
1288                 context['id'] = href
1289
1290     def _start_source(self, attrsD):
1291         self.insource = 1
1292
1293     def _end_source(self):
1294         self.insource = 0
1295         self._getContext()['source'] = copy.deepcopy(self.sourcedata)
1296         self.sourcedata.clear()
1297
1298     def _start_content(self, attrsD):
1299         self.pushContent('content', attrsD, 'text/plain', 1)
1300         src = attrsD.get('src')
1301         if src:
1302             self.contentparams['src'] = src
1303         self.push('content', 1)
1304
1305     def _start_prodlink(self, attrsD):
1306         self.pushContent('content', attrsD, 'text/html', 1)
1307
1308     def _start_body(self, attrsD):
1309         self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
1310     _start_xhtml_body = _start_body
1311
1312     def _start_content_encoded(self, attrsD):
1313         self.pushContent('content', attrsD, 'text/html', 1)
1314     _start_fullitem = _start_content_encoded
1315
1316     def _end_content(self):
1317         copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
1318         value = self.popContent('content')
1319         if copyToDescription:
1320             self._save('description', value)
1321     _end_body = _end_content
1322     _end_xhtml_body = _end_content
1323     _end_content_encoded = _end_content
1324     _end_fullitem = _end_content
1325     _end_prodlink = _end_content
1326
1327     def _start_itunes_image(self, attrsD):
1328         self.push('itunes_image', 0)
1329         self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
1330     _start_itunes_link = _start_itunes_image
1331
1332     def _end_itunes_block(self):
1333         value = self.pop('itunes_block', 0)
1334         self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
1335
1336     def _end_itunes_explicit(self):
1337         value = self.pop('itunes_explicit', 0)
1338         self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0
1339
1340 if _XML_AVAILABLE:
1341     class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
1342         def __init__(self, baseuri, baselang, encoding):
1343             if _debug: sys.stderr.write('trying StrictFeedParser\n')
1344             xml.sax.handler.ContentHandler.__init__(self)
1345             _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
1346             self.bozo = 0
1347             self.exc = None
1348
1349         def startPrefixMapping(self, prefix, uri):
1350             self.trackNamespace(prefix, uri)
1351
1352         def startElementNS(self, name, qname, attrs):
1353             namespace, localname = name
1354             lowernamespace = str(namespace or '').lower()
1355             if lowernamespace.find('backend.userland.com/rss') <> -1:
1356                 # match any backend.userland.com namespace
1357                 namespace = 'http://backend.userland.com/rss'
1358                 lowernamespace = namespace
1359             if qname and qname.find(':') > 0:
1360                 givenprefix = qname.split(':')[0]
1361             else:
1362                 givenprefix = None
1363             prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
1364             if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
1365                     raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
1366             if prefix:
1367                 localname = prefix + ':' + localname
1368             localname = str(localname).lower()
1369             if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))
1370
1371             # qname implementation is horribly broken in Python 2.1 (it
1372             # doesn't report any), and slightly broken in Python 2.2 (it
1373             # doesn't report the xml: namespace). So we match up namespaces
1374             # with a known list first, and then possibly override them with
1375             # the qnames the SAX parser gives us (if indeed it gives us any
1376             # at all).  Thanks to MatejC for helping me test this and
1377             # tirelessly telling me that it didn't work yet.
1378             attrsD = {}
1379             for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
1380                 lowernamespace = (namespace or '').lower()
1381                 prefix = self._matchnamespaces.get(lowernamespace, '')
1382                 if prefix:
1383                     attrlocalname = prefix + ':' + attrlocalname
1384                 attrsD[str(attrlocalname).lower()] = attrvalue
1385             for qname in attrs.getQNames():
1386                 attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
1387             self.unknown_starttag(localname, attrsD.items())
1388
1389         def characters(self, text):
1390             self.handle_data(text)
1391
1392         def endElementNS(self, name, qname):
1393             namespace, localname = name
1394             lowernamespace = str(namespace or '').lower()
1395             if qname and qname.find(':') > 0:
1396                 givenprefix = qname.split(':')[0]
1397             else:
1398                 givenprefix = ''
1399             prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
1400             if prefix:
1401                 localname = prefix + ':' + localname
1402             localname = str(localname).lower()
1403             self.unknown_endtag(localname)
1404
1405         def error(self, exc):
1406             self.bozo = 1
1407             self.exc = exc
1408
1409         def fatalError(self, exc):
1410             self.error(exc)
1411             raise exc
1412
1413 class _BaseHTMLProcessor(sgmllib.SGMLParser):
1414     elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
1415       'img', 'input', 'isindex', 'link', 'meta', 'param']
1416
1417     def __init__(self, encoding):
1418         self.encoding = encoding
1419         if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
1420         sgmllib.SGMLParser.__init__(self)
1421
1422     def reset(self):
1423         self.pieces = []
1424         sgmllib.SGMLParser.reset(self)
1425
1426     def _shorttag_replace(self, match):
1427         tag = match.group(1)
1428         if tag in self.elements_no_end_tag:
1429             return '<' + tag + ' />'
1430         else:
1431             return '<' + tag + '></' + tag + '>'
1432
1433     def feed(self, data):
1434         data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
1435         #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
1436         data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)
1437         data = data.replace('&#39;', "'")
1438         data = data.replace('&#34;', '"')
1439         if self.encoding and type(data) == type(u''):
1440             data = data.encode(self.encoding)
1441         sgmllib.SGMLParser.feed(self, data)
1442
1443     def normalize_attrs(self, attrs):
1444         # utility method to be called by descendants
1445         attrs = [(k.lower(), v) for k, v in attrs]
1446         attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
1447         return attrs
1448
1449     def unknown_starttag(self, tag, attrs):
1450         # called for each start tag
1451         # attrs is a list of (attr, value) tuples
1452         # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
1453         if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
1454         uattrs = []
1455         # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
1456         for key, value in attrs:
1457             if type(value) != type(u''):
1458                 value = unicode(value, self.encoding)
1459             uattrs.append((unicode(key, self.encoding), value))
1460         strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
1461         if tag in self.elements_no_end_tag:
1462             self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
1463         else:
1464             self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
1465
1466     def unknown_endtag(self, tag):
1467         # called for each end tag, e.g. for </pre>, tag will be 'pre'
1468         # Reconstruct the original end tag.
1469         if tag not in self.elements_no_end_tag:
1470             self.pieces.append("</%(tag)s>" % locals())
1471
1472     def handle_charref(self, ref):
1473         # called for each character reference, e.g. for '&#160;', ref will be '160'
1474         # Reconstruct the original character reference.
1475         self.pieces.append('&#%(ref)s;' % locals())
1476
1477     def handle_entityref(self, ref):
1478         # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
1479         # Reconstruct the original entity reference.
1480         self.pieces.append('&%(ref)s;' % locals())
1481
1482     def handle_data(self, text):
1483         # called for each block of plain text, i.e. outside of any tag and
1484         # not containing any character or entity references
1485         # Store the original text verbatim.
1486         if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
1487         self.pieces.append(text)
1488
1489     def handle_comment(self, text):
1490         # called for each HTML comment, e.g. <!-- insert Javascript code here -->
1491         # Reconstruct the original comment.
1492         self.pieces.append('<!--%(text)s-->' % locals())
1493
1494     def handle_pi(self, text):
1495         # called for each processing instruction, e.g. <?instruction>
1496         # Reconstruct original processing instruction.
1497         self.pieces.append('<?%(text)s>' % locals())
1498
1499     def handle_decl(self, text):
1500         # called for the DOCTYPE, if present, e.g.
1501         # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
1502         #     "http://www.w3.org/TR/html4/loose.dtd">
1503         # Reconstruct original DOCTYPE
1504         self.pieces.append('<!%(text)s>' % locals())
1505
1506     _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
1507     def _scan_name(self, i, declstartpos):
1508         rawdata = self.rawdata
1509         n = len(rawdata)
1510         if i == n:
1511             return None, -1
1512         m = self._new_declname_match(rawdata, i)
1513         if m:
1514             s = m.group()
1515             name = s.strip()
1516             if (i + len(s)) == n:
1517                 return None, -1  # end of buffer
1518             return name.lower(), m.end()
1519         else:
1520             self.handle_data(rawdata)
1521 #            self.updatepos(declstartpos, i)
1522             return None, -1
1523
1524     def output(self):
1525         '''Return processed HTML as a single string'''
1526         return ''.join([str(p) for p in self.pieces])
1527
1528 class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
1529     def __init__(self, baseuri, baselang, encoding):
1530         sgmllib.SGMLParser.__init__(self)
1531         _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
1532
1533     def decodeEntities(self, element, data):
1534         data = data.replace('&#60;', '&lt;')
1535         data = data.replace('&#x3c;', '&lt;')
1536         data = data.replace('&#62;', '&gt;')
1537         data = data.replace('&#x3e;', '&gt;')
1538         data = data.replace('&#38;', '&amp;')
1539         data = data.replace('&#x26;', '&amp;')
1540         data = data.replace('&#34;', '&quot;')
1541         data = data.replace('&#x22;', '&quot;')
1542         data = data.replace('&#39;', '&apos;')
1543         data = data.replace('&#x27;', '&apos;')
1544         if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
1545             data = data.replace('&lt;', '<')
1546             data = data.replace('&gt;', '>')
1547             data = data.replace('&amp;', '&')
1548             data = data.replace('&quot;', '"')
1549             data = data.replace('&apos;', "'")
1550         return data
1551
1552 class _RelativeURIResolver(_BaseHTMLProcessor):
1553     relative_uris = [('a', 'href'),
1554                      ('applet', 'codebase'),
1555                      ('area', 'href'),
1556                      ('blockquote', 'cite'),
1557                      ('body', 'background'),
1558                      ('del', 'cite'),
1559                      ('form', 'action'),
1560                      ('frame', 'longdesc'),
1561                      ('frame', 'src'),
1562                      ('iframe', 'longdesc'),
1563                      ('iframe', 'src'),
1564                      ('head', 'profile'),
1565                      ('img', 'longdesc'),
1566                      ('img', 'src'),
1567                      ('img', 'usemap'),
1568                      ('input', 'src'),
1569                      ('input', 'usemap'),
1570                      ('ins', 'cite'),
1571                      ('link', 'href'),
1572                      ('object', 'classid'),
1573                      ('object', 'codebase'),
1574                      ('object', 'data'),
1575                      ('object', 'usemap'),
1576                      ('q', 'cite'),
1577                      ('script', 'src')]
1578
1579     def __init__(self, baseuri, encoding):
1580         _BaseHTMLProcessor.__init__(self, encoding)
1581         self.baseuri = baseuri
1582
1583     def resolveURI(self, uri):
1584         return _urljoin(self.baseuri, uri)
1585
1586     def unknown_starttag(self, tag, attrs):
1587         attrs = self.normalize_attrs(attrs)
1588         attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
1589         _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
1590
1591 def _resolveRelativeURIs(htmlSource, baseURI, encoding):
1592     if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
1593     p = _RelativeURIResolver(baseURI, encoding)
1594     p.feed(htmlSource)
1595     return p.output()
1596
1597 class _HTMLSanitizer(_BaseHTMLProcessor):
1598     acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
1599       'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
1600       'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
1601       'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
1602       'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
1603       'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
1604       'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
1605       'thead', 'tr', 'tt', 'u', 'ul', 'var']
1606
1607     acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
1608       'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
1609       'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
1610       'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
1611       'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
1612       'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
1613       'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
1614       'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
1615       'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
1616       'usemap', 'valign', 'value', 'vspace', 'width']
1617
1618     unacceptable_elements_with_end_tag = ['script', 'applet']
1619
1620     def reset(self):
1621         _BaseHTMLProcessor.reset(self)
1622         self.unacceptablestack = 0
1623
1624     def unknown_starttag(self, tag, attrs):
1625         if not tag in self.acceptable_elements:
1626             if tag in self.unacceptable_elements_with_end_tag:
1627                 self.unacceptablestack += 1
1628             return
1629         attrs = self.normalize_attrs(attrs)
1630         attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
1631         _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
1632
1633     def unknown_endtag(self, tag):
1634         if not tag in self.acceptable_elements:
1635             if tag in self.unacceptable_elements_with_end_tag:
1636                 self.unacceptablestack -= 1
1637             return
1638         _BaseHTMLProcessor.unknown_endtag(self, tag)
1639
1640     def handle_pi(self, text):
1641         pass
1642
1643     def handle_decl(self, text):
1644         pass
1645
1646     def handle_data(self, text):
1647         if not self.unacceptablestack:
1648             _BaseHTMLProcessor.handle_data(self, text)
1649
1650 def _sanitizeHTML(htmlSource, encoding):
1651     p = _HTMLSanitizer(encoding)
1652     p.feed(htmlSource)
1653     data = p.output()
1654     if TIDY_MARKUP:
1655         # loop through list of preferred Tidy interfaces looking for one that's installed,
1656         # then set up a common _tidy function to wrap the interface-specific API.
1657         _tidy = None
1658         for tidy_interface in PREFERRED_TIDY_INTERFACES:
1659             try:
1660                 if tidy_interface == "uTidy":
1661                     from tidy import parseString as _utidy
1662                     def _tidy(data, **kwargs):
1663                         return str(_utidy(data, **kwargs))
1664                     break
1665                 elif tidy_interface == "mxTidy":
1666                     from mx.Tidy import Tidy as _mxtidy
1667                     def _tidy(data, **kwargs):
1668                         nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
1669                         return data
1670                     break
1671             except:
1672                 pass
1673         if _tidy:
1674             utf8 = type(data) == type(u'')
1675             if utf8:
1676                 data = data.encode('utf-8')
1677             data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
1678             if utf8:
1679                 data = unicode(data, 'utf-8')
1680             if data.count('<body'):
1681                 data = data.split('<body', 1)[1]
1682                 if data.count('>'):
1683                     data = data.split('>', 1)[1]
1684             if data.count('</body'):
1685                 data = data.split('</body', 1)[0]
1686     data = data.strip().replace('\r\n', '\n')
1687     return data
1688
1689 class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
1690     def http_error_default(self, req, fp, code, msg, headers):
1691         if ((code / 100) == 3) and (code != 304):
1692             return self.http_error_302(req, fp, code, msg, headers)
1693         infourl = urllib.addinfourl(fp, headers, req.get_full_url())
1694         infourl.status = code
1695         return infourl
1696
1697     def http_error_302(self, req, fp, code, msg, headers):
1698         if headers.dict.has_key('location'):
1699             infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
1700         else:
1701             infourl = urllib.addinfourl(fp, headers, req.get_full_url())
1702         if not hasattr(infourl, 'status'):
1703             infourl.status = code
1704         return infourl
1705
1706     def http_error_301(self, req, fp, code, msg, headers):
1707         if headers.dict.has_key('location'):
1708             infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
1709         else:
1710             infourl = urllib.addinfourl(fp, headers, req.get_full_url())
1711         if not hasattr(infourl, 'status'):
1712             infourl.status = code
1713         return infourl
1714
1715     http_error_300 = http_error_302
1716     http_error_303 = http_error_302
1717     http_error_307 = http_error_302
1718
1719     def http_error_401(self, req, fp, code, msg, headers):
1720         # Check if
1721         # - server requires digest auth, AND
1722         # - we tried (unsuccessfully) with basic auth, AND
1723         # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions)
1724         # If all conditions hold, parse authentication information
1725         # out of the Authorization header we sent the first time
1726         # (for the username and password) and the WWW-Authenticate
1727         # header the server sent back (for the realm) and retry
1728         # the request with the appropriate digest auth headers instead.
1729         # This evil genius hack has been brought to you by Aaron Swartz.
1730         host = urlparse.urlparse(req.get_full_url())[1]
1731         try:
1732             assert sys.version.split()[0] >= '2.3.3'
1733             assert base64 != None
1734             user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':')
1735             realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
1736             self.add_password(realm, host, user, passw)
1737             retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
1738             self.reset_retry_count()
1739             return retry
1740         except:
1741             return self.http_error_default(req, fp, code, msg, headers)
1742
1743 def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
1744     """URL, filename, or string --> stream
1745
1746     This function lets you define parsers that take any input source
1747     (URL, pathname to local or network file, or actual data as a string)
1748     and deal with it in a uniform manner.  Returned object is guaranteed
1749     to have all the basic stdio read methods (read, readline, readlines).
1750     Just .close() the object when you're done with it.
1751
1752     If the etag argument is supplied, it will be used as the value of an
1753     If-None-Match request header.
1754
1755     If the modified argument is supplied, it must be a tuple of 9 integers
1756     as returned by gmtime() in the standard Python time module. This MUST
1757     be in GMT (Greenwich Mean Time). The formatted date/time will be used
1758     as the value of an If-Modified-Since request header.
1759
1760     If the agent argument is supplied, it will be used as the value of a
1761     User-Agent request header.
1762
1763     If the referrer argument is supplied, it will be used as the value of a
1764     Referer[sic] request header.
1765
1766     If handlers is supplied, it is a list of handlers used to build a
1767     urllib2 opener.
1768     """
1769
1770     if hasattr(url_file_stream_or_string, 'read'):
1771         return url_file_stream_or_string
1772
1773     if url_file_stream_or_string == '-':
1774         return sys.stdin
1775
1776     if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
1777         if not agent:
1778             agent = USER_AGENT
1779         # test for inline user:password for basic auth
1780         auth = None
1781         if base64:
1782             urltype, rest = urllib.splittype(url_file_stream_or_string)
1783             realhost, rest = urllib.splithost(rest)
1784             if realhost:
1785                 user_passwd, realhost = urllib.splituser(realhost)
1786                 if user_passwd:
1787                     url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
1788                     auth = base64.encodestring(user_passwd).strip()
1789         # try to open with urllib2 (to use optional headers)
1790         request = urllib2.Request(url_file_stream_or_string)
1791         request.add_header('User-Agent', agent)
1792         if etag:
1793             request.add_header('If-None-Match', etag)
1794         if modified:
1795             # format into an RFC 1123-compliant timestamp. We can't use
1796             # time.strftime() since the %a and %b directives can be affected
1797             # by the current locale, but RFC 2616 states that dates must be
1798             # in English.
1799             short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
1800             months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
1801             request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
1802         if referrer:
1803             request.add_header('Referer', referrer)
1804         if gzip and zlib:
1805             request.add_header('Accept-encoding', 'gzip, deflate')
1806         elif gzip:
1807             request.add_header('Accept-encoding', 'gzip')
1808         elif zlib:
1809             request.add_header('Accept-encoding', 'deflate')
1810         else:
1811             request.add_header('Accept-encoding', '')
1812         if auth:
1813             request.add_header('Authorization', 'Basic %s' % auth)
1814         if ACCEPT_HEADER:
1815             request.add_header('Accept', ACCEPT_HEADER)
1816         request.add_header('A-IM', 'feed') # RFC 3229 support
1817         opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers))
1818         opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
1819         try:
1820             return opener.open(request)
1821         finally:
1822             opener.close() # JohnD
1823
1824     # try to open with native open function (if url_file_stream_or_string is a filename)
1825     try:
1826         return open(url_file_stream_or_string)
1827     except:
1828         pass
1829
1830     # treat url_file_stream_or_string as string
1831     return _StringIO(str(url_file_stream_or_string))
1832
1833 _date_handlers = []
1834 def registerDateHandler(func):
1835     '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
1836     _date_handlers.insert(0, func)
1837
1838 # ISO-8601 date parsing routines written by Fazal Majid.
1839 # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
1840 # parser is beyond the scope of feedparser and would be a worthwhile addition
1841 # to the Python library.
1842 # A single regular expression cannot parse ISO 8601 date formats into groups
1843 # as the standard is highly irregular (for instance is 030104 2003-01-04 or
1844 # 0301-04-01), so we use templates instead.
1845 # Please note the order in templates is significant because we need a
1846 # greedy match.
1847 _iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
1848                 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
1849                 '-YY-?MM', '-OOO', '-YY',
1850                 '--MM-?DD', '--MM',
1851                 '---DD',
1852                 'CC', '']
1853 _iso8601_re = [
1854     tmpl.replace(
1855     'YYYY', r'(?P<year>\d{4})').replace(
1856     'YY', r'(?P<year>\d\d)').replace(
1857     'MM', r'(?P<month>[01]\d)').replace(
1858     'DD', r'(?P<day>[0123]\d)').replace(
1859     'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
1860     'CC', r'(?P<century>\d\d$)')
1861     + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
1862     + r'(:(?P<second>\d{2}))?'
1863     + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
1864     for tmpl in _iso8601_tmpl]
1865 del tmpl
1866 _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
1867 del regex
1868 def _parse_date_iso8601(dateString):
1869     '''Parse a variety of ISO-8601-compatible formats like 20040105'''
1870     m = None
1871     for _iso8601_match in _iso8601_matches:
1872         m = _iso8601_match(dateString)
1873         if m: break
1874     if not m: return
1875     if m.span() == (0, 0): return
1876     params = m.groupdict()
1877     ordinal = params.get('ordinal', 0)
1878     if ordinal:
1879         ordinal = int(ordinal)
1880     else:
1881         ordinal = 0
1882     year = params.get('year', '--')
1883     if not year or year == '--':
1884         year = time.gmtime()[0]
1885     elif len(year) == 2:
1886         # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
1887         year = 100 * int(time.gmtime()[0] / 100) + int(year)
1888     else:
1889         year = int(year)
1890     month = params.get('month', '-')
1891     if not month or month == '-':
1892         # ordinals are NOT normalized by mktime, we simulate them
1893         # by setting month=1, day=ordinal
1894         if ordinal:
1895             month = 1
1896         else:
1897             month = time.gmtime()[1]
1898     month = int(month)
1899     day = params.get('day', 0)
1900     if not day:
1901         # see above
1902         if ordinal:
1903             day = ordinal
1904         elif params.get('century', 0) or \
1905                  params.get('year', 0) or params.get('month', 0):
1906             day = 1
1907         else:
1908             day = time.gmtime()[2]
1909     else:
1910         day = int(day)
1911     # special case of the century - is the first year of the 21st century
1912     # 2000 or 2001 ? The debate goes on...
1913     if 'century' in params.keys():
1914         year = (int(params['century']) - 1) * 100 + 1
1915     # in ISO 8601 most fields are optional
1916     for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
1917         if not params.get(field, None):
1918             params[field] = 0
1919     hour = int(params.get('hour', 0))
1920     minute = int(params.get('minute', 0))
1921     second = int(params.get('second', 0))
1922     # weekday is normalized by mktime(), we can ignore it
1923     weekday = 0
1924     # daylight savings is complex, but not needed for feedparser's purposes
1925     # as time zones, if specified, include mention of whether it is active
1926     # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
1927     # and most implementations have DST bugs
1928     daylight_savings_flag = 0
1929     tm = [year, month, day, hour, minute, second, weekday,
1930           ordinal, daylight_savings_flag]
1931     # ISO 8601 time zone adjustments
1932     tz = params.get('tz')
1933     if tz and tz != 'Z':
1934         if tz[0] == '-':
1935             tm[3] += int(params.get('tzhour', 0))
1936             tm[4] += int(params.get('tzmin', 0))
1937         elif tz[0] == '+':
1938             tm[3] -= int(params.get('tzhour', 0))
1939             tm[4] -= int(params.get('tzmin', 0))
1940         else:
1941             return None
1942     # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
1943     # which is guaranteed to normalize d/m/y/h/m/s.
1944     # Many implementations have bugs, but we'll pretend they don't.
1945     return time.localtime(time.mktime(tm))
1946 registerDateHandler(_parse_date_iso8601)
1947
1948 # 8-bit date handling routines written by ytrewq1.
1949 _korean_year  = u'\ub144' # b3e2 in euc-kr
1950 _korean_month = u'\uc6d4' # bff9 in euc-kr
1951 _korean_day   = u'\uc77c' # c0cf in euc-kr
1952 _korean_am    = u'\uc624\uc804' # bfc0 c0fc in euc-kr
1953 _korean_pm    = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr
1954
1955 _korean_onblog_date_re = \
1956     re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
1957                (_korean_year, _korean_month, _korean_day))
1958 _korean_nate_date_re = \
1959     re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
1960                (_korean_am, _korean_pm))
1961 def _parse_date_onblog(dateString):
1962     '''Parse a string according to the OnBlog 8-bit date format'''
1963     m = _korean_onblog_date_re.match(dateString)
1964     if not m: return
1965     w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
1966                 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
1967                  'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
1968                  'zonediff': '+09:00'}
1969     if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
1970     return _parse_date_w3dtf(w3dtfdate)
1971 registerDateHandler(_parse_date_onblog)
1972
1973 def _parse_date_nate(dateString):
1974     '''Parse a string according to the Nate 8-bit date format'''
1975     m = _korean_nate_date_re.match(dateString)
1976     if not m: return
1977     hour = int(m.group(5))
1978     ampm = m.group(4)
1979     if (ampm == _korean_pm):
1980         hour += 12
1981     hour = str(hour)
1982     if len(hour) == 1:
1983         hour = '0' + hour
1984     w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
1985                 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
1986                  'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
1987                  'zonediff': '+09:00'}
1988     if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
1989     return _parse_date_w3dtf(w3dtfdate)
1990 registerDateHandler(_parse_date_nate)
1991
1992 _mssql_date_re = \
1993     re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
1994 def _parse_date_mssql(dateString):
1995     '''Parse a string according to the MS SQL date format'''
1996     m = _mssql_date_re.match(dateString)
1997     if not m: return
1998     w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
1999                 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
2000                  'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
2001                  'zonediff': '+09:00'}
2002     if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
2003     return _parse_date_w3dtf(w3dtfdate)
2004 registerDateHandler(_parse_date_mssql)
2005
2006 # Unicode strings for Greek date strings
2007 _greek_months = \
2008   { \
2009    u'\u0399\u03b1\u03bd': u'Jan',       # c9e1ed in iso-8859-7
2010    u'\u03a6\u03b5\u03b2': u'Feb',       # d6e5e2 in iso-8859-7
2011    u'\u039c\u03ac\u03ce': u'Mar',       # ccdcfe in iso-8859-7
2012    u'\u039c\u03b1\u03ce': u'Mar',       # cce1fe in iso-8859-7
2013    u'\u0391\u03c0\u03c1': u'Apr',       # c1f0f1 in iso-8859-7
2014    u'\u039c\u03ac\u03b9': u'May',       # ccdce9 in iso-8859-7
2015    u'\u039c\u03b1\u03ca': u'May',       # cce1fa in iso-8859-7
2016    u'\u039c\u03b1\u03b9': u'May',       # cce1e9 in iso-8859-7
2017    u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
2018    u'\u0399\u03bf\u03bd': u'Jun',       # c9efed in iso-8859-7
2019    u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
2020    u'\u0399\u03bf\u03bb': u'Jul',       # c9f9eb in iso-8859-7
2021    u'\u0391\u03cd\u03b3': u'Aug',       # c1fde3 in iso-8859-7
2022    u'\u0391\u03c5\u03b3': u'Aug',       # c1f5e3 in iso-8859-7
2023    u'\u03a3\u03b5\u03c0': u'Sep',       # d3e5f0 in iso-8859-7
2024    u'\u039f\u03ba\u03c4': u'Oct',       # cfeaf4 in iso-8859-7
2025    u'\u039d\u03bf\u03ad': u'Nov',       # cdefdd in iso-8859-7
2026    u'\u039d\u03bf\u03b5': u'Nov',       # cdefe5 in iso-8859-7
2027    u'\u0394\u03b5\u03ba': u'Dec',       # c4e5ea in iso-8859-7
2028   }
2029
2030 _greek_wdays = \
2031   { \
2032    u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
2033    u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
2034    u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
2035    u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
2036    u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
2037    u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
2038    u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
2039   }
2040
2041 _greek_date_format_re = \
2042     re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
2043
2044 def _parse_date_greek(dateString):
2045     '''Parse a string according to a Greek 8-bit date format.'''
2046     m = _greek_date_format_re.match(dateString)
2047     if not m: return
2048     try:
2049         wday = _greek_wdays[m.group(1)]
2050         month = _greek_months[m.group(3)]
2051     except:
2052         return
2053     rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
2054                  {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
2055                   'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
2056                   'zonediff': m.group(8)}
2057     if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
2058     return _parse_date_rfc822(rfc822date)
2059 registerDateHandler(_parse_date_greek)
2060
2061 # Unicode strings for Hungarian date strings
2062 _hungarian_months = \
2063   { \
2064     u'janu\u00e1r':   u'01',  # e1 in iso-8859-2
2065     u'febru\u00e1ri': u'02',  # e1 in iso-8859-2
2066     u'm\u00e1rcius':  u'03',  # e1 in iso-8859-2
2067     u'\u00e1prilis':  u'04',  # e1 in iso-8859-2
2068     u'm\u00e1ujus':   u'05',  # e1 in iso-8859-2
2069     u'j\u00fanius':   u'06',  # fa in iso-8859-2
2070     u'j\u00falius':   u'07',  # fa in iso-8859-2
2071     u'augusztus':     u'08',
2072     u'szeptember':    u'09',
2073     u'okt\u00f3ber':  u'10',  # f3 in iso-8859-2
2074     u'november':      u'11',
2075     u'december':      u'12',
2076   }
2077
2078 _hungarian_date_format_re = \
2079   re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
2080
2081 def _parse_date_hungarian(dateString):
2082     '''Parse a string according to a Hungarian 8-bit date format.'''
2083     m = _hungarian_date_format_re.match(dateString)
2084     if not m: return
2085     try:
2086         month = _hungarian_months[m.group(2)]
2087         day = m.group(3)
2088         if len(day) == 1:
2089             day = '0' + day
2090         hour = m.group(4)
2091         if len(hour) == 1:
2092             hour = '0' + hour
2093     except:
2094         return
2095     w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
2096                 {'year': m.group(1), 'month': month, 'day': day,\
2097                  'hour': hour, 'minute': m.group(5),\
2098                  'zonediff': m.group(6)}
2099     if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
2100     return _parse_date_w3dtf(w3dtfdate)
2101 registerDateHandler(_parse_date_hungarian)
2102
2103 # W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
2104 # Drake and licensed under the Python license.  Removed all range checking
2105 # for month, day, hour, minute, and second, since mktime will normalize
2106 # these later
2107 def _parse_date_w3dtf(dateString):
2108     def __extract_date(m):
2109         year = int(m.group('year'))
2110         if year < 100:
2111             year = 100 * int(time.gmtime()[0] / 100) + int(year)
2112         if year < 1000:
2113             return 0, 0, 0
2114         julian = m.group('julian')
2115         if julian:
2116             julian = int(julian)
2117             month = julian / 30 + 1
2118             day = julian % 30 + 1
2119             jday = None
2120             while jday != julian:
2121                 t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
2122                 jday = time.gmtime(t)[-2]
2123                 diff = abs(jday - julian)
2124                 if jday > julian:
2125                     if diff < day:
2126                         day = day - diff
2127                     else:
2128                         month = month - 1
2129                         day = 31
2130                 elif jday < julian:
2131                     if day + diff < 28:
2132                        day = day + diff
2133                     else:
2134                         month = month + 1
2135             return year, month, day
2136         month = m.group('month')
2137         day = 1
2138         if month is None:
2139             month = 1
2140         else:
2141             month = int(month)
2142             day = m.group('day')
2143             if day:
2144                 day = int(day)
2145             else:
2146                 day = 1
2147         return year, month, day
2148
2149     def __extract_time(m):
2150         if not m:
2151             return 0, 0, 0
2152         hours = m.group('hours')
2153         if not hours:
2154             return 0, 0, 0
2155         hours = int(hours)
2156         minutes = int(m.group('minutes'))
2157         seconds = m.group('seconds')
2158         if seconds:
2159             seconds = int(seconds)
2160         else:
2161             seconds = 0
2162         return hours, minutes, seconds
2163
2164     def __extract_tzd(m):
2165         '''Return the Time Zone Designator as an offset in seconds from UTC.'''
2166         if not m:
2167             return 0
2168         tzd = m.group('tzd')
2169         if not tzd:
2170             return 0
2171         if tzd == 'Z':
2172             return 0
2173         hours = int(m.group('tzdhours'))
2174         minutes = m.group('tzdminutes')
2175         if minutes:
2176             minutes = int(minutes)
2177         else:
2178             minutes = 0
2179         offset = (hours*60 + minutes) * 60
2180         if tzd[0] == '+':
2181             return -offset
2182         return offset
2183
2184     __date_re = ('(?P<year>\d\d\d\d)'
2185                  '(?:(?P<dsep>-|)'
2186                  '(?:(?P<julian>\d\d\d)'
2187                  '|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?')
2188     __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
2189     __tzd_rx = re.compile(__tzd_re)
2190     __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
2191                  '(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?'
2192                  + __tzd_re)
2193     __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
2194     __datetime_rx = re.compile(__datetime_re)
2195     m = __datetime_rx.match(dateString)
2196     if (m is None) or (m.group() != dateString): return
2197     gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
2198     if gmt[0] == 0: return
2199     return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
2200 registerDateHandler(_parse_date_w3dtf)
2201
2202 def _parse_date_rfc822(dateString):
2203     '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
2204     data = dateString.split()
2205     if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
2206         del data[0]
2207     if len(data) == 4:
2208         s = data[3]
2209         i = s.find('+')
2210         if i > 0:
2211             data[3:] = [s[:i], s[i+1:]]
2212         else:
2213             data.append('')
2214         dateString = " ".join(data)
2215     if len(data) < 5:
2216         dateString += ' 00:00:00 GMT'
2217     tm = rfc822.parsedate_tz(dateString)
2218     if tm:
2219         return time.gmtime(rfc822.mktime_tz(tm))
2220 # rfc822.py defines several time zones, but we define some extra ones.
2221 # 'ET' is equivalent to 'EST', etc.
2222 _additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
2223 rfc822._timezones.update(_additional_timezones)
2224 registerDateHandler(_parse_date_rfc822)
2225
2226 def _parse_date(dateString):
2227     '''Parses a variety of date formats into a 9-tuple in GMT'''
2228     for handler in _date_handlers:
2229         try:
2230             date9tuple = handler(dateString)
2231             if not date9tuple: continue
2232             if len(date9tuple) != 9:
2233                 if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
2234                 raise ValueError
2235             map(int, date9tuple)
2236             return date9tuple
2237         except Exception, e:
2238             if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
2239             pass
2240     return None
2241
2242 def _getCharacterEncoding(http_headers, xml_data):
2243     '''Get the character encoding of the XML document
2244
2245     http_headers is a dictionary
2246     xml_data is a raw string (not Unicode)
2247
2248     This is so much trickier than it sounds, it's not even funny.
2249     According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
2250     is application/xml, application/*+xml,
2251     application/xml-external-parsed-entity, or application/xml-dtd,
2252     the encoding given in the charset parameter of the HTTP Content-Type
2253     takes precedence over the encoding given in the XML prefix within the
2254     document, and defaults to 'utf-8' if neither are specified.  But, if
2255     the HTTP Content-Type is text/xml, text/*+xml, or
2256     text/xml-external-parsed-entity, the encoding given in the XML prefix
2257     within the document is ALWAYS IGNORED and only the encoding given in
2258     the charset parameter of the HTTP Content-Type header should be
2259     respected, and it defaults to 'us-ascii' if not specified.
2260
2261     Furthermore, discussion on the atom-syntax mailing list with the
2262     author of RFC 3023 leads me to the conclusion that any document
2263     served with a Content-Type of text/* and no charset parameter
2264     must be treated as us-ascii.  (We now do this.)  And also that it
2265     must always be flagged as non-well-formed.  (We now do this too.)
2266
2267     If Content-Type is unspecified (input was local file or non-HTTP source)
2268     or unrecognized (server just got it totally wrong), then go by the
2269     encoding given in the XML prefix of the document and default to
2270     'iso-8859-1' as per the HTTP specification (RFC 2616).
2271
2272     Then, assuming we didn't find a character encoding in the HTTP headers
2273     (and the HTTP Content-type allowed us to look in the body), we need
2274     to sniff the first few bytes of the XML data and try to determine
2275     whether the encoding is ASCII-compatible.  Section F of the XML
2276     specification shows the way here:
2277     http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
2278
2279     If the sniffed encoding is not ASCII-compatible, we need to make it
2280     ASCII compatible so that we can sniff further into the XML declaration
2281     to find the encoding attribute, which will tell us the true encoding.
2282
2283     Of course, none of this guarantees that we will be able to parse the
2284     feed in the declared character encoding (assuming it was declared
2285     correctly, which many are not).  CJKCodecs and iconv_codec help a lot;
2286     you should definitely install them if you can.
2287     http://cjkpython.i18n.org/
2288     '''
2289
2290     def _parseHTTPContentType(content_type):
2291         '''takes HTTP Content-Type header and returns (content type, charset)
2292
2293         If no charset is specified, returns (content type, '')
2294         If no content type is specified, returns ('', '')
2295         Both return parameters are guaranteed to be lowercase strings
2296         '''
2297         content_type = content_type or ''
2298         content_type, params = cgi.parse_header(content_type)
2299         return content_type, params.get('charset', '').replace("'", '')
2300
2301     sniffed_xml_encoding = ''
2302     xml_encoding = ''
2303     true_encoding = ''
2304     http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))
2305     # Must sniff for non-ASCII-compatible character encodings before
2306     # searching for XML declaration.  This heuristic is defined in
2307     # section F of the XML specification:
2308     # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
2309     try:
2310         if xml_data[:4] == '\x4c\x6f\xa7\x94':
2311             # EBCDIC
2312             xml_data = _ebcdic_to_ascii(xml_data)
2313         elif xml_data[:4] == '\x00\x3c\x00\x3f':
2314             # UTF-16BE
2315             sniffed_xml_encoding = 'utf-16be'
2316             xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
2317         elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
2318             # UTF-16BE with BOM
2319             sniffed_xml_encoding = 'utf-16be'
2320             xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
2321         elif xml_data[:4] == '\x3c\x00\x3f\x00':
2322             # UTF-16LE
2323             sniffed_xml_encoding = 'utf-16le'
2324             xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
2325         elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
2326             # UTF-16LE with BOM
2327             sniffed_xml_encoding = 'utf-16le'
2328             xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
2329         elif xml_data[:4] == '\x00\x00\x00\x3c':
2330             # UTF-32BE
2331             sniffed_xml_encoding = 'utf-32be'
2332             xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
2333         elif xml_data[:4] == '\x3c\x00\x00\x00':
2334             # UTF-32LE
2335             sniffed_xml_encoding = 'utf-32le'
2336             xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
2337         elif xml_data[:4] == '\x00\x00\xfe\xff':
2338             # UTF-32BE with BOM
2339             sniffed_xml_encoding = 'utf-32be'
2340             xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
2341         elif xml_data[:4] == '\xff\xfe\x00\x00':
2342             # UTF-32LE with BOM
2343             sniffed_xml_encoding = 'utf-32le'
2344             xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
2345         elif xml_data[:3] == '\xef\xbb\xbf':
2346             # UTF-8 with BOM
2347             sniffed_xml_encoding = 'utf-8'
2348             xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
2349         else:
2350             # ASCII-compatible
2351             pass
2352         xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
2353     except:
2354         xml_encoding_match = None
2355     if xml_encoding_match:
2356         xml_encoding = xml_encoding_match.groups()[0].lower()
2357         if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
2358             xml_encoding = sniffed_xml_encoding
2359     acceptable_content_type = 0
2360     application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
2361     text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
2362     if (http_content_type in application_content_types) or \
2363        (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
2364         acceptable_content_type = 1
2365         true_encoding = http_encoding or xml_encoding or 'utf-8'
2366     elif (http_content_type in text_content_types) or \
2367          (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'):
2368         acceptable_content_type = 1
2369         true_encoding = http_encoding or 'us-ascii'
2370     elif http_content_type.startswith('text/'):
2371         true_encoding = http_encoding or 'us-ascii'
2372     elif http_headers and (not http_headers.has_key('content-type')):
2373         true_encoding = xml_encoding or 'iso-8859-1'
2374     else:
2375         true_encoding = xml_encoding or 'utf-8'
2376     return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
2377
2378 def _toUTF8(data, encoding):
2379     '''Changes an XML data stream on the fly to specify a new encoding
2380
2381     data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
2382     encoding is a string recognized by encodings.aliases
2383     '''
2384     if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
2385     # strip Byte Order Mark (if present)
2386     if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
2387         if _debug:
2388             sys.stderr.write('stripping BOM\n')
2389             if encoding != 'utf-16be':
2390                 sys.stderr.write('trying utf-16be instead\n')
2391         encoding = 'utf-16be'
2392         data = data[2:]
2393     elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
2394         if _debug:
2395             sys.stderr.write('stripping BOM\n')
2396             if encoding != 'utf-16le':
2397                 sys.stderr.write('trying utf-16le instead\n')
2398         encoding = 'utf-16le'
2399         data = data[2:]
2400     elif data[:3] == '\xef\xbb\xbf':
2401         if _debug:
2402             sys.stderr.write('stripping BOM\n')
2403             if encoding != 'utf-8':
2404                 sys.stderr.write('trying utf-8 instead\n')
2405         encoding = 'utf-8'
2406         data = data[3:]
2407     elif data[:4] == '\x00\x00\xfe\xff':
2408         if _debug:
2409             sys.stderr.write('stripping BOM\n')
2410             if encoding != 'utf-32be':
2411                 sys.stderr.write('trying utf-32be instead\n')
2412         encoding = 'utf-32be'
2413         data = data[4:]
2414     elif data[:4] == '\xff\xfe\x00\x00':
2415         if _debug:
2416             sys.stderr.write('stripping BOM\n')
2417             if encoding != 'utf-32le':
2418                 sys.stderr.write('trying utf-32le instead\n')
2419         encoding = 'utf-32le'
2420         data = data[4:]
2421     newdata = unicode(data, encoding)
2422     if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
2423     declmatch = re.compile('^<\?xml[^>]*?>')
2424     newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
2425     if declmatch.search(newdata):
2426         newdata = declmatch.sub(newdecl, newdata)
2427     else:
2428         newdata = newdecl + u'\n' + newdata
2429     return newdata.encode('utf-8')
2430
2431 def _stripDoctype(data):
2432     '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
2433
2434     rss_version may be 'rss091n' or None
2435     stripped_data is the same XML document, minus the DOCTYPE
2436     '''
2437     entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
2438     data = entity_pattern.sub('', data)
2439     doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
2440     doctype_results = doctype_pattern.findall(data)
2441     doctype = doctype_results and doctype_results[0] or ''
2442     if doctype.lower().count('netscape'):
2443         version = 'rss091n'
2444     else:
2445         version = None
2446     data = doctype_pattern.sub('', data)
2447     return version, data
2448
2449 def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
2450     '''Parse a feed from a URL, file, stream, or string'''
2451     result = FeedParserDict()
2452     result['feed'] = FeedParserDict()
2453     result['entries'] = []
2454     if _XML_AVAILABLE:
2455         result['bozo'] = 0
2456     if type(handlers) == types.InstanceType:
2457         handlers = [handlers]
2458     try:
2459         f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
2460         data = f.read()
2461     except Exception, e:
2462         result['bozo'] = 1
2463         result['bozo_exception'] = e
2464         data = ''
2465         f = None
2466
2467     # if feed is gzip-compressed, decompress it
2468     if f and data and hasattr(f, 'headers'):
2469         if gzip and f.headers.get('content-encoding', '') == 'gzip':
2470             try:
2471                 data = gzip.GzipFile(fileobj=_StringIO(data)).read()
2472             except Exception, e:
2473                 # Some feeds claim to be gzipped but they're not, so
2474                 # we get garbage.  Ideally, we should re-request the
2475                 # feed without the 'Accept-encoding: gzip' header,
2476                 # but we don't.
2477                 result['bozo'] = 1
2478                 result['bozo_exception'] = e
2479                 data = ''
2480         elif zlib and f.headers.get('content-encoding', '') == 'deflate':
2481             try:
2482                 data = zlib.decompress(data, -zlib.MAX_WBITS)
2483             except Exception, e:
2484                 result['bozo'] = 1
2485                 result['bozo_exception'] = e
2486                 data = ''
2487
2488     # save HTTP headers
2489     if hasattr(f, 'info'):
2490         info = f.info()
2491         result['etag'] = info.getheader('ETag')
2492         last_modified = info.getheader('Last-Modified')
2493         if last_modified:
2494             result['modified'] = _parse_date(last_modified)
2495     if hasattr(f, 'url'):
2496         result['href'] = f.url
2497         result['status'] = 200
2498     if hasattr(f, 'status'):
2499         result['status'] = f.status
2500     if hasattr(f, 'headers'):
2501         result['headers'] = f.headers.dict
2502     if hasattr(f, 'close'):
2503         f.close()
2504
2505     # there are four encodings to keep track of:
2506     # - http_encoding is the encoding declared in the Content-Type HTTP header
2507     # - xml_encoding is the encoding declared in the <?xml declaration
2508     # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
2509     # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
2510     http_headers = result.get('headers', {})
2511     result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
2512         _getCharacterEncoding(http_headers, data)
2513     if http_headers and (not acceptable_content_type):
2514         if http_headers.has_key('content-type'):
2515             bozo_message = '%s is not an XML media type' % http_headers['content-type']
2516         else:
2517             bozo_message = 'no Content-type specified'
2518         result['bozo'] = 1
2519         result['bozo_exception'] = NonXMLContentType(bozo_message)
2520
2521     result['version'], data = _stripDoctype(data)
2522
2523     baseuri = http_headers.get('content-location', result.get('href'))
2524     baselang = http_headers.get('content-language', None)
2525
2526     # if server sent 304, we're done
2527     if result.get('status', 0) == 304:
2528         result['version'] = ''
2529         result['debug_message'] = 'The feed has not changed since you last checked, ' + \
2530             'so the server sent no data.  This is a feature, not a bug!'
2531         return result
2532
2533     # if there was a problem downloading, we're done
2534     if not data:
2535         return result
2536
2537     # determine character encoding
2538     use_strict_parser = 0
2539     known_encoding = 0
2540     tried_encodings = []
2541     # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
2542     for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
2543         if not proposed_encoding: continue
2544         if proposed_encoding in tried_encodings: continue
2545         tried_encodings.append(proposed_encoding)
2546         try:
2547             data = _toUTF8(data, proposed_encoding)
2548             known_encoding = use_strict_parser = 1
2549             break
2550         except:
2551             pass
2552     # if no luck and we have auto-detection library, try that
2553     if (not known_encoding) and chardet:
2554         try:
2555             proposed_encoding = chardet.detect(data)['encoding']
2556             if proposed_encoding and (proposed_encoding not in tried_encodings):
2557                 tried_encodings.append(proposed_encoding)
2558                 data = _toUTF8(data, proposed_encoding)
2559                 known_encoding = use_strict_parser = 1
2560         except:
2561             pass
2562     # if still no luck and we haven't tried utf-8 yet, try that
2563     if (not known_encoding) and ('utf-8' not in tried_encodings):
2564         try:
2565             proposed_encoding = 'utf-8'
2566             tried_encodings.append(proposed_encoding)
2567             data = _toUTF8(data, proposed_encoding)
2568             known_encoding = use_strict_parser = 1
2569         except:
2570             pass
2571     # if still no luck and we haven't tried windows-1252 yet, try that
2572     if (not known_encoding) and ('windows-1252' not in tried_encodings):
2573         try:
2574             proposed_encoding = 'windows-1252'
2575             tried_encodings.append(proposed_encoding)
2576             data = _toUTF8(data, proposed_encoding)
2577             known_encoding = use_strict_parser = 1
2578         except:
2579             pass
2580     # if still no luck, give up
2581     if not known_encoding:
2582         result['bozo'] = 1
2583         result['bozo_exception'] = CharacterEncodingUnknown( \
2584             'document encoding unknown, I tried ' + \
2585             '%s, %s, utf-8, and windows-1252 but nothing worked' % \
2586             (result['encoding'], xml_encoding))
2587         result['encoding'] = ''
2588     elif proposed_encoding != result['encoding']:
2589         result['bozo'] = 1
2590         result['bozo_exception'] = CharacterEncodingOverride( \
2591             'documented declared as %s, but parsed as %s' % \
2592             (result['encoding'], proposed_encoding))
2593         result['encoding'] = proposed_encoding
2594
2595     if not _XML_AVAILABLE:
2596         use_strict_parser = 0
2597     if use_strict_parser:
2598         # initialize the SAX parser
2599         feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
2600         saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
2601         saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
2602         saxparser.setContentHandler(feedparser)
2603         saxparser.setErrorHandler(feedparser)
2604         source = xml.sax.xmlreader.InputSource()
2605         source.setByteStream(_StringIO(data))
2606         if hasattr(saxparser, '_ns_stack'):
2607             # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
2608             # PyXML doesn't have this problem, and it doesn't have _ns_stack either
2609             saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
2610         try:
2611             saxparser.parse(source)
2612         except Exception, e:
2613             if _debug:
2614                 import traceback
2615                 traceback.print_stack()
2616                 traceback.print_exc()
2617                 sys.stderr.write('xml parsing failed\n')
2618             result['bozo'] = 1
2619             result['bozo_exception'] = feedparser.exc or e
2620             use_strict_parser = 0
2621     if not use_strict_parser:
2622         feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '')
2623         feedparser.feed(data)
2624     result['feed'] = feedparser.feeddata
2625     result['entries'] = feedparser.entries
2626     result['version'] = result['version'] or feedparser.version
2627     result['namespaces'] = feedparser.namespacesInUse
2628     return result
2629
2630 if __name__ == '__main__':
2631     if not sys.argv[1:]:
2632         print __doc__
2633         sys.exit(0)
2634     else:
2635         urls = sys.argv[1:]
2636     zopeCompatibilityHack()
2637     from pprint import pprint
2638     for url in urls:
2639         print url
2640         print
2641         result = parse(url)
2642         pprint(result)
2643         print
2644
2645 #REVISION HISTORY
2646 #1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
2647 #  added Simon Fell's test suite
2648 #1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
2649 #2.0 - 10/19/2002
2650 #  JD - use inchannel to watch out for image and textinput elements which can
2651 #  also contain title, link, and description elements
2652 #  JD - check for isPermaLink='false' attribute on guid elements
2653 #  JD - replaced openAnything with open_resource supporting ETag and
2654 #  If-Modified-Since request headers
2655 #  JD - parse now accepts etag, modified, agent, and referrer optional
2656 #  arguments
2657 #  JD - modified parse to return a dictionary instead of a tuple so that any
2658 #  etag or modified information can be returned and cached by the caller
2659 #2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
2660 #  because of etag/modified, return the old etag/modified to the caller to
2661 #  indicate why nothing is being returned
2662 #2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise its
2663 #  useless.  Fixes the problem JD was addressing by adding it.
2664 #2.1 - 11/14/2002 - MAP - added gzip support
2665 #2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
2666 #  start_admingeneratoragent is an example of how to handle elements with
2667 #  only attributes, no content.
2668 #2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
2669 #  also, make sure we send the User-Agent even if urllib2 isn't available.
2670 #  Match any variation of backend.userland.com/rss namespace.
2671 #2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
2672 #2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
2673 #  snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed
2674 #  project name
2675 #2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
2676 #  removed unnecessary urllib code -- urllib2 should always be available anyway;
2677 #  return actual url, status, and full HTTP headers (as result['url'],
2678 #  result['status'], and result['headers']) if parsing a remote feed over HTTP --
2679 #  this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
2680 #  added the latest namespace-of-the-week for RSS 2.0
2681 #2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
2682 #  User-Agent (otherwise urllib2 sends two, which confuses some servers)
2683 #2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
2684 #  inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
2685 #2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
2686 #  textInput, and also to return the character encoding (if specified)
2687 #2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
2688 #  nested divs within content (JohnD); fixed missing sys import (JohanS);
2689 #  fixed regular expression to capture XML character encoding (Andrei);
2690 #  added support for Atom 0.3-style links; fixed bug with textInput tracking;
2691 #  added support for cloud (MartijnP); added support for multiple
2692 #  category/dc:subject (MartijnP); normalize content model: 'description' gets
2693 #  description (which can come from description, summary, or full content if no
2694 #  description), 'content' gets dict of base/language/type/value (which can come
2695 #  from content:encoded, xhtml:body, content, or fullitem);
2696 #  fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
2697 #  tracking; fixed bug tracking unknown tags; fixed bug tracking content when
2698 #  <content> element is not in default namespace (like Pocketsoap feed);
2699 #  resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
2700 #  wfw:commentRSS; resolve relative URLs within embedded HTML markup in
2701 #  description, xhtml:body, content, content:encoded, title, subtitle,
2702 #  summary, info, tagline, and copyright; added support for pingback and
2703 #  trackback namespaces
2704 #2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
2705 #  namespaces, as opposed to 2.6 when I said I did but didn't really;
2706 #  sanitize HTML markup within some elements; added mxTidy support (if
2707 #  installed) to tidy HTML markup within some elements; fixed indentation
2708 #  bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
2709 #  (FazalM); universal date parsing and normalization (FazalM): 'created', modified',
2710 #  'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
2711 #  'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
2712 #  and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
2713 #2.7.1 - 1/9/2004 - MAP - fixed bug handling &quot; and &apos;.  fixed memory
2714 #  leak not closing url opener (JohnD); added dc:publisher support (MarekK);
2715 #  added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
2716 #2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in
2717 #  encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
2718 #  fixed relative URI processing for guid (skadz); added ICBM support; added
2719 #  base64 support
2720 #2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
2721 #  blogspot.com sites); added _debug variable
2722 #2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
2723 #3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
2724 #  added several new supported namespaces; fixed bug tracking naked markup in
2725 #  description; added support for enclosure; added support for source; re-added
2726 #  support for cloud which got dropped somehow; added support for expirationDate
2727 #3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
2728 #  xml:base URI, one for documents that don't define one explicitly and one for
2729 #  documents that define an outer and an inner xml:base that goes out of scope
2730 #  before the end of the document
2731 #3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
2732 #3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version']
2733 #  will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
2734 #  added support for creativeCommons:license and cc:license; added support for
2735 #  full Atom content model in title, tagline, info, copyright, summary; fixed bug
2736 #  with gzip encoding (not always telling server we support it when we do)
2737 #3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
2738 #  (dictionary of 'name', 'url', 'email'); map author to author_detail if author
2739 #  contains name + email address
2740 #3.0b8 - 1/28/2004 - MAP - added support for contributor
2741 #3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
2742 #  support for summary
2743 #3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
2744 #  xml.util.iso8601
2745 #3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
2746 #  dangerous markup; fiddled with decodeEntities (not right); liberalized
2747 #  date parsing even further
2748 #3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
2749 #  added support to Atom 0.2 subtitle; added support for Atom content model
2750 #  in copyright; better sanitizing of dangerous HTML elements with end tags
2751 #  (script, frameset)
2752 #3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
2753 #  etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
2754 #3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
2755 #  Python 2.1
2756 #3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
2757 #  fixed bug capturing author and contributor URL; fixed bug resolving relative
2758 #  links in author and contributor URL; fixed bug resolvin relative links in
2759 #  generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
2760 #  namespace tests, and included them permanently in the test suite with his
2761 #  permission; fixed namespace handling under Python 2.1
2762 #3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
2763 #3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
2764 #3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
2765 #  use libxml2 (if available)
2766 #3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
2767 #  name was in parentheses; removed ultra-problematic mxTidy support; patch to
2768 #  workaround crash in PyXML/expat when encountering invalid entities
2769 #  (MarkMoraes); support for textinput/textInput
2770 #3.0b20 - 4/7/2004 - MAP - added CDF support
2771 #3.0b21 - 4/14/2004 - MAP - added Hot RSS support
2772 #3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
2773 #  results dict; changed results dict to allow getting values with results.key
2774 #  as well as results[key]; work around embedded illformed HTML with half
2775 #  a DOCTYPE; work around malformed Content-Type header; if character encoding
2776 #  is wrong, try several common ones before falling back to regexes (if this
2777 #  works, bozo_exception is set to CharacterEncodingOverride); fixed character
2778 #  encoding issues in BaseHTMLProcessor by tracking encoding and converting
2779 #  from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
2780 #  convert each value in results to Unicode (if possible), even if using
2781 #  regex-based parsing
2782 #3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain
2783 #  high-bit characters in attributes in embedded HTML in description (thanks
2784 #  Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in
2785 #  FeedParserDict; tweaked FeedParserDict.has_key to return True if asking
2786 #  about a mapped key
2787 #3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and
2788 #  results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could
2789 #  cause the same encoding to be tried twice (even if it failed the first time);
2790 #  fixed DOCTYPE stripping when DOCTYPE contained entity declarations;
2791 #  better textinput and image tracking in illformed RSS 1.0 feeds
2792 #3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed
2793 #  my blink tag tests
2794 #3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that
2795 #  failed to parse utf-16 encoded feeds; made source into a FeedParserDict;
2796 #  duplicate admin:generatorAgent/@rdf:resource in generator_detail.url;
2797 #  added support for image; refactored parse() fallback logic to try other
2798 #  encodings if SAX parsing fails (previously it would only try other encodings
2799 #  if re-encoding failed); remove unichr madness in normalize_attrs now that
2800 #  we're properly tracking encoding in and out of BaseHTMLProcessor; set
2801 #  feed.language from root-level xml:lang; set entry.id from rdf:about;
2802 #  send Accept header
2803 #3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between
2804 #  iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are
2805 #  windows-1252); fixed regression that could cause the same encoding to be
2806 #  tried twice (even if it failed the first time)
2807 #3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types;
2808 #  recover from malformed content-type header parameter with no equals sign
2809 #  ('text/xml; charset:iso-8859-1')
2810 #3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities
2811 #  to Unicode equivalents in illformed feeds (aaronsw); added and
2812 #  passed tests for converting character entities to Unicode equivalents
2813 #  in illformed feeds (aaronsw); test for valid parsers when setting
2814 #  XML_AVAILABLE; make version and encoding available when server returns
2815 #  a 304; add handlers parameter to pass arbitrary urllib2 handlers (like
2816 #  digest auth or proxy support); add code to parse username/password
2817 #  out of url and send as basic authentication; expose downloading-related
2818 #  exceptions in bozo_exception (aaronsw); added __contains__ method to
2819 #  FeedParserDict (aaronsw); added publisher_detail (aaronsw)
2820 #3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always
2821 #  convert feed to UTF-8 before passing to XML parser; completely revamped
2822 #  logic for determining character encoding and attempting XML parsing
2823 #  (much faster); increased default timeout to 20 seconds; test for presence
2824 #  of Location header on redirects; added tests for many alternate character
2825 #  encodings; support various EBCDIC encodings; support UTF-16BE and
2826 #  UTF16-LE with or without a BOM; support UTF-8 with a BOM; support
2827 #  UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no
2828 #  XML parsers are available; added support for 'Content-encoding: deflate';
2829 #  send blank 'Accept-encoding: ' header if neither gzip nor zlib modules
2830 #  are available
2831 #3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure
2832 #  problem tracking xml:base and xml:lang if element declares it, child
2833 #  doesn't, first grandchild redeclares it, and second grandchild doesn't;
2834 #  refactored date parsing; defined public registerDateHandler so callers
2835 #  can add support for additional date formats at runtime; added support
2836 #  for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added
2837 #  zopeCompatibilityHack() which turns FeedParserDict into a regular
2838 #  dictionary, required for Zope compatibility, and also makes command-
2839 #  line debugging easier because pprint module formats real dictionaries
2840 #  better than dictionary-like objects; added NonXMLContentType exception,
2841 #  which is stored in bozo_exception when a feed is served with a non-XML
2842 #  media type such as 'text/plain'; respect Content-Language as default
2843 #  language if not xml:lang is present; cloud dict is now FeedParserDict;
2844 #  generator dict is now FeedParserDict; better tracking of xml:lang,
2845 #  including support for xml:lang='' to unset the current language;
2846 #  recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default
2847 #  namespace; don't overwrite final status on redirects (scenarios:
2848 #  redirecting to a URL that returns 304, redirecting to a URL that
2849 #  redirects to another URL with a different type of redirect); add
2850 #  support for HTTP 303 redirects
2851 #4.0 - MAP - support for relative URIs in xml:base attribute; fixed
2852 #  encoding issue with mxTidy (phopkins); preliminary support for RFC 3229;
2853 #  support for Atom 1.0; support for iTunes extensions; new 'tags' for
2854 #  categories/keywords/etc. as array of dict
2855 #  {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0
2856 #  terminology; parse RFC 822-style dates with no time; lots of other
2857 #  bug fixes
2858 #4.1 - MAP - removed socket timeout; added support for chardet library