1 """Universal feed parser
3 Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
5 Visit http://feedparser.org/ for the latest version
6 Visit http://feedparser.org/docs/ for the latest documentation
8 Required: Python 2.1 or later
9 Recommended: Python 2.3 or later
10 Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
11 """

__version__ = "4.1" # + "$Revision$"[11:15] + "-cvs"
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE."""
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
                    "John Beimler <http://john.beimler.org/>",
                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
                    "Aaron Swartz <http://aaronsw.com/>",
                    "Kevin Marks <http://epeus.blogspot.com/>"]
_debug = 0

# HTTP "User-Agent" header to send to servers when downloading feeds.
# If you are embedding feedparser in a larger application, you should
# change this to your application name and URL.
USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__

# HTTP "Accept" header to send to servers when downloading feeds. If you don't
# want to send an Accept header, set this to None.
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"

# List of preferred XML parsers, by SAX driver name. These will be tried first,
# but if they're not installed, Python will keep searching through its own list
# of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]

# If you want feedparser to automatically run HTML markup through HTML Tidy, set
# this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
# or utidylib <http://utidylib.berlios.de/>.
TIDY_MARKUP = 0

# List of Python interfaces for HTML Tidy, in order of preference. Only useful
# if TIDY_MARKUP = 1
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]

# ---------- required modules (should come with any Python distribution) ----------
import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
try:
    from cStringIO import StringIO as _StringIO
except:
    from StringIO import StringIO as _StringIO

# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------

# gzip is included with most Python distributions, but may not be available if you compiled your own
try:
    import gzip
except:
    gzip = None
try:
    import zlib
except:
    zlib = None

# If a real XML parser is available, feedparser will attempt to use it. feedparser has
# been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the
# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
try:
    import xml.sax
    xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
    from xml.sax.saxutils import escape as _xmlescape
    _XML_AVAILABLE = 1
except:
    _XML_AVAILABLE = 0
    def _xmlescape(data):
        data = data.replace('&', '&amp;')
        data = data.replace('>', '&gt;')
        data = data.replace('<', '&lt;')
        return data

# base64 support for Atom feeds that contain embedded binary data
try:
    import base64, binascii
except:
    base64 = binascii = None

# cjkcodecs and iconv_codec provide support for more character encodings.
# Both are available from http://cjkpython.i18n.org/
try:
    import cjkcodecs.aliases
except:
    pass
try:
    import iconv_codec
except:
    pass

# chardet library auto-detects character encodings
# Download from http://chardet.feedparser.org/
try:
    import chardet
    if _debug:
        import chardet.constants
        chardet.constants._debug = 1
except:
    chardet = None

# ---------- don't touch these ----------
class ThingsNobodyCaresAboutButMe(Exception): pass
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
class UndeclaredNamespace(Exception): pass
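
# The next three assignments relax sgmllib's lexing patterns for the loose
# (non-XML) parsing path: tag names may contain ':' so namespaced tags like
# <dc:date> are recognized, any '<!' is treated as a declaration, and
# character references may be hexadecimal (e.g. &#x26;), all of which
# sgmllib's stock patterns are stricter about.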
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
sgmllib.special = re.compile('<!')
sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')

SUPPORTED_VERSIONS = {'': 'unknown',
                      'rss090': 'RSS 0.90',
                      'rss091n': 'RSS 0.91 (Netscape)',
                      'rss091u': 'RSS 0.91 (Userland)',
                      'rss092': 'RSS 0.92',
                      'rss093': 'RSS 0.93',
                      'rss094': 'RSS 0.94',
                      'rss20': 'RSS 2.0',
                      'rss10': 'RSS 1.0',
                      'rss': 'RSS (unknown version)',
                      'atom01': 'Atom 0.1',
                      'atom02': 'Atom 0.2',
                      'atom03': 'Atom 0.3',
                      'atom10': 'Atom 1.0',
                      'atom': 'Atom (unknown version)',
                      'cdf': 'CDF',
                      'hotrss': 'Hot RSS'
                      }

try:
    UserDict = dict
except NameError:
    # Python 2.1 does not have dict
    from UserDict import UserDict
    def dict(aList):
        rc = {}
        for k, v in aList:
            rc[k] = v
        return rc

class FeedParserDict(UserDict):
    keymap = {'channel': 'feed',
              'items': 'entries',
              'guid': 'id',
              'date': 'updated',
              'date_parsed': 'updated_parsed',
              'description': ['subtitle', 'summary'],
              'url': ['href'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}
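    # The keymap above lets legacy keys alias current ones: reading
    # d['modified'] returns the value stored under 'updated', and writing
    # d['url'] stores the value under 'href' (illustrative keys from the map).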
    def __getitem__(self, key):
        if key == 'category':
            return UserDict.__getitem__(self, 'tags')[0]['term']
        if key == 'categories':
            return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
        realkey = self.keymap.get(key, key)
        if type(realkey) == types.ListType:
            for k in realkey:
                if UserDict.has_key(self, k):
                    return UserDict.__getitem__(self, k)
        if UserDict.has_key(self, key):
            return UserDict.__getitem__(self, key)
        return UserDict.__getitem__(self, realkey)

    def __setitem__(self, key, value):
        for k in self.keymap.keys():
            if key == k:
                key = self.keymap[k]
                if type(key) == types.ListType:
                    key = key[0]
        return UserDict.__setitem__(self, key, value)

    def get(self, key, default=None):
        if self.has_key(key):
            return self[key]
        else:
            return default

    def setdefault(self, key, value):
        if not self.has_key(key):
            self[key] = value
        return self[key]

    def has_key(self, key):
        try:
            return hasattr(self, key) or UserDict.has_key(self, key)
        except AttributeError:
            return False

    def __getattr__(self, key):
        try:
            return self.__dict__[key]
        except KeyError:
            pass
        try:
            assert not key.startswith('_')
            return self.__getitem__(key)
        except:
            raise AttributeError, "object has no attribute '%s'" % key

    def __setattr__(self, key, value):
        if key.startswith('_') or key == 'data':
            self.__dict__[key] = value
        else:
            return self.__setitem__(key, value)

    def __contains__(self, key):
        return self.has_key(key)
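
# Zope cannot securely use a dict subclass with custom attribute access, so
# calling zopeCompatibilityHack() replaces FeedParserDict with a factory that
# returns plain dictionaries, at the cost of the key aliasing above.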
def zopeCompatibilityHack():
    global FeedParserDict
    del FeedParserDict
    def FeedParserDict(aDict=None):
        rc = {}
        if aDict:
            rc.update(aDict)
        return rc
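
# _ebcdic_to_ascii translates EBCDIC-encoded feeds byte-for-byte to ASCII
# using the 256-entry table below, built once and cached; e.g. EBCDIC 0x81
# maps to 97, ASCII 'a'.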
_ebcdic_to_ascii_map = None
def _ebcdic_to_ascii(s):
    global _ebcdic_to_ascii_map
    if not _ebcdic_to_ascii_map:
        emap = (
            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
            )
        import string
        _ebcdic_to_ascii_map = string.maketrans( \
            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
    return s.translate(_ebcdic_to_ascii_map)

_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
def _urljoin(base, uri):
    uri = _urifixer.sub(r'\1\3', uri)
    return urlparse.urljoin(base, uri)
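
# _urifixer collapses stray slashes after the scheme before joining, so a
# malformed URI like 'http:////example.com/feed' is repaired to
# 'http://example.com/feed' (illustrative value).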

class _FeedParserMixin:
    namespaces = {'': '',
                  'http://backend.userland.com/rss': '',
                  'http://blogs.law.harvard.edu/tech/rss': '',
                  'http://purl.org/rss/1.0/': '',
                  'http://my.netscape.com/rdf/simple/0.9/': '',
                  'http://example.com/newformat#': '',
                  'http://example.com/necho': '',
                  'http://purl.org/echo/': '',
                  'uri/of/echo/namespace#': '',
                  'http://purl.org/pie/': '',
                  'http://purl.org/atom/ns#': '',
                  'http://www.w3.org/2005/Atom': '',
                  'http://purl.org/rss/1.0/modules/rss091#': '',

                  'http://webns.net/mvcb/': 'admin',
                  'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
                  'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
                  'http://media.tangent.org/rss/1.0/': 'audio',
                  'http://backend.userland.com/blogChannelModule': 'blogChannel',
                  'http://web.resource.org/cc/': 'cc',
                  'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
                  'http://purl.org/rss/1.0/modules/company': 'co',
                  'http://purl.org/rss/1.0/modules/content/': 'content',
                  'http://my.theinfo.org/changed/1.0/rss/': 'cp',
                  'http://purl.org/dc/elements/1.1/': 'dc',
                  'http://purl.org/dc/terms/': 'dcterms',
                  'http://purl.org/rss/1.0/modules/email/': 'email',
                  'http://purl.org/rss/1.0/modules/event/': 'ev',
                  'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
                  'http://freshmeat.net/rss/fm/': 'fm',
                  'http://xmlns.com/foaf/0.1/': 'foaf',
                  'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
                  'http://postneo.com/icbm/': 'icbm',
                  'http://purl.org/rss/1.0/modules/image/': 'image',
                  'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
                  'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
                  'http://purl.org/rss/1.0/modules/link/': 'l',
                  'http://search.yahoo.com/mrss': 'media',
                  'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
                  'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
                  'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
                  'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
                  'http://purl.org/rss/1.0/modules/reference/': 'ref',
                  'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
                  'http://purl.org/rss/1.0/modules/search/': 'search',
                  'http://purl.org/rss/1.0/modules/slash/': 'slash',
                  'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
                  'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
                  'http://hacks.benhammersley.com/rss/streaming/': 'str',
                  'http://purl.org/rss/1.0/modules/subscription/': 'sub',
                  'http://purl.org/rss/1.0/modules/syndication/': 'sy',
                  'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
                  'http://purl.org/rss/1.0/modules/threading/': 'thr',
                  'http://purl.org/rss/1.0/modules/textinput/': 'ti',
                  'http://madskills.com/public/xml/rss/module/trackback/': 'trackback',
                  'http://wellformedweb.org/commentAPI/': 'wfw',
                  'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
                  'http://www.w3.org/1999/xhtml': 'xhtml',
                  'http://www.w3.org/XML/1998/namespace': 'xml',
                  'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf'
                  }
    _matchnamespaces = {}

    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
    can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    html_types = ['text/html', 'application/xhtml+xml']

    def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
        if _debug: sys.stderr.write('initializing FeedParser\n')
        if not self._matchnamespaces:
            for k, v in self.namespaces.items():
                self._matchnamespaces[k.lower()] = v
        self.feeddata = FeedParserDict() # feed-level data
        self.encoding = encoding # character encoding
        self.entries = [] # list of entry-level data
        self.version = '' # feed type/version, see SUPPORTED_VERSIONS
        self.namespacesInUse = {} # dictionary of namespaces defined by the feed

        # the following are used internally to track state;
        # this is really out of control and should be refactored
        self.infeed = 0
        self.inentry = 0
        self.incontent = 0
        self.intextinput = 0
        self.inimage = 0
        self.inauthor = 0
        self.incontributor = 0
        self.inpublisher = 0
        self.insource = 0
        self.sourcedata = FeedParserDict()
        self.contentparams = FeedParserDict()
        self._summaryKey = None
        self.namespacemap = {}
        self.elementstack = []
        self.basestack = []
        self.langstack = []
        self.baseuri = baseuri or ''
        self.lang = baselang or None
        if baselang:
            self.feeddata['language'] = baselang

    def unknown_starttag(self, tag, attrs):
        if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
        # normalize attrs
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]

        # track xml:base and xml:lang
        attrsD = dict(attrs)
        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
        self.baseuri = _urljoin(self.baseuri, baseuri)
        lang = attrsD.get('xml:lang', attrsD.get('lang'))
        if lang == '':
            # xml:lang could be explicitly set to '', we need to capture that
            lang = None
        elif lang is None:
            # if no xml:lang is specified, use parent lang
            lang = self.lang
        if lang:
            if tag in ('feed', 'rss', 'rdf:RDF'):
                self.feeddata['language'] = lang
        self.lang = lang
        self.basestack.append(self.baseuri)
        self.langstack.append(lang)

        # track namespaces
        for prefix, uri in attrs:
            if prefix.startswith('xmlns:'):
                self.trackNamespace(prefix[6:], uri)
            elif prefix == 'xmlns':
                self.trackNamespace(None, uri)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            # Note: probably shouldn't simply recreate localname here, but
            # our namespace handling isn't actually 100% correct in cases where
            # the feed redefines the default namespace (which is actually
            # the usual case for inline content, thanks Sam), so here we
            # cheat and just reconstruct the element based on localname
            # because that compensates for the bugs in our namespace handling.
            # This will horribly munge inline content with non-empty qnames,
            # but nobody actually does that, so I'm not fixing it.
            tag = tag.split(':')[-1]
            return self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0)

        # match namespaces
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # special hack for better tracking of empty textinput/image elements in illformed feeds
        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
            self.intextinput = 0
        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
            self.inimage = 0

        # call special handler (if defined) or default handler
        methodname = '_start_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            return method(attrsD)
        except AttributeError:
            return self.push(prefix + suffix, 1)

    def unknown_endtag(self, tag):
        if _debug: sys.stderr.write('end %s\n' % tag)
        # match namespaces
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # call special handler (if defined) or default handler
        methodname = '_end_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            method()
        except AttributeError:
            self.pop(prefix + suffix)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            tag = tag.split(':')[-1]
            self.handle_data('</%s>' % tag, escape=0)

        # track xml:base and xml:lang going out of scope
        if self.basestack:
            self.basestack.pop()
            if self.basestack and self.basestack[-1]:
                self.baseuri = self.basestack[-1]
        if self.langstack:
            self.langstack.pop()
            if self.langstack: # and (self.langstack[-1] is not None):
                self.lang = self.langstack[-1]

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        if not self.elementstack: return
        ref = ref.lower()
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            text = '&#%s;' % ref
        else:
            if ref[0] == 'x':
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self.elementstack[-1][2].append(text)

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self.elementstack: return
        if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        else:
            # entity resolution graciously donated by Aaron Swartz
            def name2cp(k):
                import htmlentitydefs
                if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
                    return htmlentitydefs.name2codepoint[k]
                k = htmlentitydefs.entitydefs[k]
                if k.startswith('&#') and k.endswith(';'):
                    return int(k[2:-1]) # not in latin-1
                return ord(k)
            try: name2cp(ref)
            except KeyError: text = '&%s;' % ref
            else: text = unichr(name2cp(ref)).encode('utf-8')
        self.elementstack[-1][2].append(text)

    def handle_data(self, text, escape=1):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        if not self.elementstack: return
        if escape and self.contentparams.get('type') == 'application/xhtml+xml':
            text = _xmlescape(text)
        self.elementstack[-1][2].append(text)

    def handle_comment(self, text):
        # called for each comment, e.g. <!-- insert message here -->
        pass

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        pass

    def handle_decl(self, text):
        pass

    def parse_declaration(self, i):
        # override internal declaration handler to handle CDATA blocks
        if _debug: sys.stderr.write('entering parse_declaration\n')
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1: k = len(self.rawdata)
            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
            return k+3
        else:
            k = self.rawdata.find('>', i)
            return k+1

    def mapContentType(self, contentType):
        contentType = contentType.lower()
        if contentType == 'text':
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        return contentType
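
    # e.g. an Atom type="html" attribute becomes 'text/html' and type="xhtml"
    # becomes 'application/xhtml+xml'; full MIME types pass through unchanged.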

    def trackNamespace(self, prefix, uri):
        loweruri = uri.lower()
        if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
            self.version = 'rss090'
        if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
            self.version = 'rss10'
        if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
            self.version = 'atom10'
        if loweruri.find('backend.userland.com/rss') <> -1:
            # match any backend.userland.com namespace
            uri = 'http://backend.userland.com/rss'
            loweruri = uri
        if self._matchnamespaces.has_key(loweruri):
            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
        else:
            self.namespacesInUse[prefix or ''] = uri

    def resolveURI(self, uri):
        return _urljoin(self.baseuri or '', uri)

    def decodeEntities(self, element, data):
        return data

    def push(self, element, expectingText):
        self.elementstack.append([element, expectingText, []])

    def pop(self, element, stripWhitespace=1):
        if not self.elementstack: return
        if self.elementstack[-1][0] != element: return

        element, expectingText, pieces = self.elementstack.pop()
        output = ''.join(pieces)
        if stripWhitespace:
            output = output.strip()
        if not expectingText: return output

        # decode base64 content
        if base64 and self.contentparams.get('base64', 0):
            try:
                output = base64.decodestring(output)
            except binascii.Error:
                pass
            except binascii.Incomplete:
                pass

        # resolve relative URIs
        if (element in self.can_be_relative_uri) and output:
            output = self.resolveURI(output)

        # decode entities within embedded markup
        if not self.contentparams.get('base64', 0):
            output = self.decodeEntities(element, output)

        # remove temporary cruft from contentparams
        try:
            del self.contentparams['mode']
        except KeyError:
            pass
        try:
            del self.contentparams['base64']
        except KeyError:
            pass

        # resolve relative URIs within embedded markup
        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
            if element in self.can_contain_relative_uris:
                output = _resolveRelativeURIs(output, self.baseuri, self.encoding)

        # sanitize embedded markup
        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
            if element in self.can_contain_dangerous_markup:
                output = _sanitizeHTML(output, self.encoding)

        if self.encoding and type(output) != type(u''):
            try:
                output = unicode(output, self.encoding)
            except:
                pass

        # categories/tags/keywords/whatever are handled in _end_category
        if element == 'category':
            return output

        # store output in appropriate place(s)
        if self.inentry and not self.insource:
            if element == 'content':
                self.entries[-1].setdefault(element, [])
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element].append(contentparams)
            elif element == 'link':
                self.entries[-1][element] = output
                if output:
                    self.entries[-1]['links'][-1]['href'] = output
            else:
                if element == 'description':
                    element = 'summary'
                self.entries[-1][element] = output
                if self.incontent:
                    contentparams = copy.deepcopy(self.contentparams)
                    contentparams['value'] = output
                    self.entries[-1][element + '_detail'] = contentparams
        elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage):
            context = self._getContext()
            if element == 'description':
                element = 'subtitle'
            context[element] = output
            if element == 'link':
                context['links'][-1]['href'] = output
            elif self.incontent:
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                context[element + '_detail'] = contentparams
        return output

    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
        self.incontent += 1
        self.contentparams = FeedParserDict({
            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
            'language': self.lang,
            'base': self.baseuri})
        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
        self.push(tag, expectingText)

    def popContent(self, tag):
        value = self.pop(tag)
        self.incontent -= 1
        self.contentparams.clear()
        return value

    def _mapToStandardPrefix(self, name):
        colonpos = name.find(':')
        if colonpos <> -1:
            prefix = name[:colonpos]
            suffix = name[colonpos+1:]
            prefix = self.namespacemap.get(prefix, prefix)
            name = prefix + ':' + suffix
        return name

    def _getAttribute(self, attrsD, name):
        return attrsD.get(self._mapToStandardPrefix(name))

    def _isBase64(self, attrsD, contentparams):
        if attrsD.get('mode', '') == 'base64':
            return 1
        if self.contentparams['type'].startswith('text/'):
            return 0
        if self.contentparams['type'].endswith('+xml'):
            return 0
        if self.contentparams['type'].endswith('/xml'):
            return 0
        return 1
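
    # i.e. content is treated as base64 when mode="base64" is declared, or
    # when its declared type is a non-text, non-XML MIME type such as
    # 'image/png' (illustrative type).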

    def _itsAnHrefDamnIt(self, attrsD):
        href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
        if href:
            try:
                del attrsD['url']
            except KeyError:
                pass
            try:
                del attrsD['uri']
            except KeyError:
                pass
            attrsD['href'] = href
        return attrsD

    def _save(self, key, value):
        context = self._getContext()
        context.setdefault(key, value)

    def _start_rss(self, attrsD):
        versionmap = {'0.91': 'rss091u',
                      '0.92': 'rss092',
                      '0.93': 'rss093',
                      '0.94': 'rss094'}
        if not self.version:
            attr_version = attrsD.get('version', '')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            elif attr_version.startswith('2.'):
                self.version = 'rss20'
            else:
                self.version = 'rss'

    def _start_dlhottitles(self, attrsD):
        self.version = 'hotrss'

    def _start_channel(self, attrsD):
        self.infeed = 1
        self._cdf_common(attrsD)
    _start_feedinfo = _start_channel

    def _cdf_common(self, attrsD):
        if attrsD.has_key('lastmod'):
            self._start_modified({})
            self.elementstack[-1][-1] = attrsD['lastmod']
            self._end_modified()
        if attrsD.has_key('href'):
            self._start_link({})
            self.elementstack[-1][-1] = attrsD['href']
            self._end_link()

    def _start_feed(self, attrsD):
        self.infeed = 1
        versionmap = {'0.1': 'atom01',
                      '0.2': 'atom02',
                      '0.3': 'atom03'}
        if not self.version:
            attr_version = attrsD.get('version')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            else:
                self.version = 'atom'

    def _end_channel(self):
        self.infeed = 0
    _end_feed = _end_channel

    def _start_image(self, attrsD):
        self.inimage = 1
        self.push('image', 0)
        context = self._getContext()
        context.setdefault('image', FeedParserDict())

    def _end_image(self):
        self.pop('image')
        self.inimage = 0

    def _start_textinput(self, attrsD):
        self.intextinput = 1
        self.push('textinput', 0)
        context = self._getContext()
        context.setdefault('textinput', FeedParserDict())
    _start_textInput = _start_textinput

    def _end_textinput(self):
        self.pop('textinput')
        self.intextinput = 0
    _end_textInput = _end_textinput

    def _start_author(self, attrsD):
        self.inauthor = 1
        self.push('author', 1)
    _start_managingeditor = _start_author
    _start_dc_author = _start_author
    _start_dc_creator = _start_author
    _start_itunes_author = _start_author

    def _end_author(self):
        self.pop('author')
        self.inauthor = 0
        self._sync_author_detail()
    _end_managingeditor = _end_author
    _end_dc_author = _end_author
    _end_dc_creator = _end_author
    _end_itunes_author = _end_author

    def _start_itunes_owner(self, attrsD):
        self.inpublisher = 1
        self.push('publisher', 0)

    def _end_itunes_owner(self):
        self.pop('publisher')
        self.inpublisher = 0
        self._sync_author_detail('publisher')

    def _start_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('contributor', 0)

    def _end_contributor(self):
        self.pop('contributor')
        self.incontributor = 0

    def _start_dc_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('name', 0)

    def _end_dc_contributor(self):
        self._end_name()
        self.incontributor = 0

    def _start_name(self, attrsD):
        self.push('name', 0)
    _start_itunes_name = _start_name

    def _end_name(self):
        value = self.pop('name')
        if self.inpublisher:
            self._save_author('name', value, 'publisher')
        elif self.inauthor:
            self._save_author('name', value)
        elif self.incontributor:
            self._save_contributor('name', value)
        elif self.intextinput:
            context = self._getContext()
            context['textinput']['name'] = value
    _end_itunes_name = _end_name

    def _start_width(self, attrsD):
        self.push('width', 0)

    def _end_width(self):
        value = self.pop('width')
        try:
            value = int(value)
        except:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['image']['width'] = value

    def _start_height(self, attrsD):
        self.push('height', 0)

    def _end_height(self):
        value = self.pop('height')
        try:
            value = int(value)
        except:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['image']['height'] = value

    def _start_url(self, attrsD):
        self.push('href', 1)
    _start_homepage = _start_url
    _start_uri = _start_url

    def _end_url(self):
        value = self.pop('href')
        if self.inauthor:
            self._save_author('href', value)
        elif self.incontributor:
            self._save_contributor('href', value)
        elif self.inimage:
            context = self._getContext()
            context['image']['href'] = value
        elif self.intextinput:
            context = self._getContext()
            context['textinput']['link'] = value
    _end_homepage = _end_url
    _end_uri = _end_url

    def _start_email(self, attrsD):
        self.push('email', 0)
    _start_itunes_email = _start_email

    def _end_email(self):
        value = self.pop('email')
        if self.inpublisher:
            self._save_author('email', value, 'publisher')
        elif self.inauthor:
            self._save_author('email', value)
        elif self.incontributor:
            self._save_contributor('email', value)
    _end_itunes_email = _end_email

    def _getContext(self):
        if self.insource:
            context = self.sourcedata
        elif self.inentry:
            context = self.entries[-1]
        else:
            context = self.feeddata
        return context

    def _save_author(self, key, value, prefix='author'):
        context = self._getContext()
        context.setdefault(prefix + '_detail', FeedParserDict())
        context[prefix + '_detail'][key] = value
        self._sync_author_detail()

    def _save_contributor(self, key, value):
        context = self._getContext()
        context.setdefault('contributors', [FeedParserDict()])
        context['contributors'][-1][key] = value

    def _sync_author_detail(self, key='author'):
        context = self._getContext()
        detail = context.get('%s_detail' % key)
        if detail:
            name = detail.get('name')
            email = detail.get('email')
            if name and email:
                context[key] = '%s (%s)' % (name, email)
            elif name:
                context[key] = name
            elif email:
                context[key] = email
        else:
            author = context.get(key)
            if not author: return
            emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author)
            if not emailmatch: return
            email = emailmatch.group(0)
            # probably a better way to do the following, but it passes all the tests
            author = author.replace(email, '')
            author = author.replace('()', '')
            author = author.strip()
            if author and (author[0] == '('):
                author = author[1:]
            if author and (author[-1] == ')'):
                author = author[:-1]
            author = author.strip()
            context.setdefault('%s_detail' % key, FeedParserDict())
            context['%s_detail' % key]['name'] = author
            context['%s_detail' % key]['email'] = email
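
    # The synchronization runs both ways: 'John Doe (john@example.com)' is
    # split into separate name/email details, and separate name/email details
    # are recombined into that display form (address is illustrative).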

    def _start_subtitle(self, attrsD):
        self.pushContent('subtitle', attrsD, 'text/plain', 1)
    _start_tagline = _start_subtitle
    _start_itunes_subtitle = _start_subtitle

    def _end_subtitle(self):
        self.popContent('subtitle')
    _end_tagline = _end_subtitle
    _end_itunes_subtitle = _end_subtitle

    def _start_rights(self, attrsD):
        self.pushContent('rights', attrsD, 'text/plain', 1)
    _start_dc_rights = _start_rights
    _start_copyright = _start_rights

    def _end_rights(self):
        self.popContent('rights')
    _end_dc_rights = _end_rights
    _end_copyright = _end_rights

    def _start_item(self, attrsD):
        self.entries.append(FeedParserDict())
        self.push('item', 0)
        self.inentry = 1
        self.guidislink = 0
        id = self._getAttribute(attrsD, 'rdf:about')
        if id:
            context = self._getContext()
            context['id'] = id
        self._cdf_common(attrsD)
    _start_entry = _start_item
    _start_product = _start_item

    def _end_item(self):
        self.pop('item')
        self.inentry = 0
    _end_entry = _end_item

    def _start_dc_language(self, attrsD):
        self.push('language', 1)
    _start_language = _start_dc_language

    def _end_dc_language(self):
        self.lang = self.pop('language')
    _end_language = _end_dc_language

    def _start_dc_publisher(self, attrsD):
        self.push('publisher', 1)
    _start_webmaster = _start_dc_publisher

    def _end_dc_publisher(self):
        self.pop('publisher')
        self._sync_author_detail('publisher')
    _end_webmaster = _end_dc_publisher

    def _start_published(self, attrsD):
        self.push('published', 1)
    _start_dcterms_issued = _start_published
    _start_issued = _start_published

    def _end_published(self):
        value = self.pop('published')
        self._save('published_parsed', _parse_date(value))
    _end_dcterms_issued = _end_published
    _end_issued = _end_published

    def _start_updated(self, attrsD):
        self.push('updated', 1)
    _start_modified = _start_updated
    _start_dcterms_modified = _start_updated
    _start_pubdate = _start_updated
    _start_dc_date = _start_updated

    def _end_updated(self):
        value = self.pop('updated')
        parsed_value = _parse_date(value)
        self._save('updated_parsed', parsed_value)
    _end_modified = _end_updated
    _end_dcterms_modified = _end_updated
    _end_pubdate = _end_updated
    _end_dc_date = _end_updated

    def _start_created(self, attrsD):
        self.push('created', 1)
    _start_dcterms_created = _start_created

    def _end_created(self):
        value = self.pop('created')
        self._save('created_parsed', _parse_date(value))
    _end_dcterms_created = _end_created

    def _start_expirationdate(self, attrsD):
        self.push('expired', 1)

    def _end_expirationdate(self):
        self._save('expired_parsed', _parse_date(self.pop('expired')))

    def _start_cc_license(self, attrsD):
        self.push('license', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('license')

    def _start_creativecommons_license(self, attrsD):
        self.push('license', 1)

    def _end_creativecommons_license(self):
        self.pop('license')

    def _addTag(self, term, scheme, label):
        context = self._getContext()
        tags = context.setdefault('tags', [])
        if (not term) and (not scheme) and (not label): return
        value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
        if value not in tags:
            tags.append(FeedParserDict({'term': term, 'scheme': scheme, 'label': label}))

    def _start_category(self, attrsD):
        if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
        term = attrsD.get('term')
        scheme = attrsD.get('scheme', attrsD.get('domain'))
        label = attrsD.get('label')
        self._addTag(term, scheme, label)
        self.push('category', 1)
    _start_dc_subject = _start_category
    _start_keywords = _start_category

    def _end_itunes_keywords(self):
        for term in self.pop('itunes_keywords').split():
            self._addTag(term, 'http://www.itunes.com/', None)

    def _start_itunes_category(self, attrsD):
        self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
        self.push('category', 1)

    def _end_category(self):
        value = self.pop('category')
        if not value: return
        context = self._getContext()
        tags = context['tags']
        if value and len(tags) and not tags[-1]['term']:
            tags[-1]['term'] = value
        else:
            self._addTag(value, None, None)
    _end_dc_subject = _end_category
    _end_keywords = _end_category
    _end_itunes_category = _end_category

    def _start_cloud(self, attrsD):
        self._getContext()['cloud'] = FeedParserDict(attrsD)

    def _start_link(self, attrsD):
        attrsD.setdefault('rel', 'alternate')
        attrsD.setdefault('type', 'text/html')
        attrsD = self._itsAnHrefDamnIt(attrsD)
        if attrsD.has_key('href'):
            attrsD['href'] = self.resolveURI(attrsD['href'])
        expectingText = self.infeed or self.inentry or self.insource
        context = self._getContext()
        context.setdefault('links', [])
        context['links'].append(FeedParserDict(attrsD))
        if attrsD['rel'] == 'enclosure':
            self._start_enclosure(attrsD)
        if attrsD.has_key('href'):
            expectingText = 0
            if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
                context['link'] = attrsD['href']
        else:
            self.push('link', expectingText)
    _start_producturl = _start_link

    def _end_link(self):
        value = self.pop('link')
        context = self._getContext()
        if self.intextinput:
            context['textinput']['link'] = value
        if self.inimage:
            context['image']['link'] = value
    _end_producturl = _end_link

    def _start_guid(self, attrsD):
        self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
        self.push('id', 1)

    def _end_guid(self):
        value = self.pop('id')
        self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
        if self.guidislink:
            # guid acts as link, but only if 'ispermalink' is not present or is 'true',
            # and only if the item doesn't already have a link element
            self._save('link', value)

    def _start_title(self, attrsD):
        self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
    _start_dc_title = _start_title
    _start_media_title = _start_title

    def _end_title(self):
        value = self.popContent('title')
        context = self._getContext()
        if self.intextinput:
            context['textinput']['title'] = value
        elif self.inimage:
            context['image']['title'] = value
    _end_dc_title = _end_title
    _end_media_title = _end_title

    def _start_description(self, attrsD):
        context = self._getContext()
        if context.has_key('summary'):
            self._summaryKey = 'content'
            self._start_content(attrsD)
        else:
            self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)

    def _start_abstract(self, attrsD):
        self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)

    def _end_description(self):
        if self._summaryKey == 'content':
            self._end_content()
        else:
            value = self.popContent('description')
            context = self._getContext()
            if self.intextinput:
                context['textinput']['description'] = value
            elif self.inimage:
                context['image']['description'] = value
        self._summaryKey = None
    _end_abstract = _end_description

    def _start_info(self, attrsD):
        self.pushContent('info', attrsD, 'text/plain', 1)
    _start_feedburner_browserfriendly = _start_info

    def _end_info(self):
        self.popContent('info')
    _end_feedburner_browserfriendly = _end_info

    def _start_generator(self, attrsD):
        if attrsD:
            attrsD = self._itsAnHrefDamnIt(attrsD)
            if attrsD.has_key('href'):
                attrsD['href'] = self.resolveURI(attrsD['href'])
        self._getContext()['generator_detail'] = FeedParserDict(attrsD)
        self.push('generator', 1)

    def _end_generator(self):
        value = self.pop('generator')
        context = self._getContext()
        if context.has_key('generator_detail'):
            context['generator_detail']['name'] = value

    def _start_admin_generatoragent(self, attrsD):
        self.push('generator', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('generator')
        self._getContext()['generator_detail'] = FeedParserDict({'href': value})

    def _start_admin_errorreportsto(self, attrsD):
        self.push('errorreportsto', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('errorreportsto')

    def _start_summary(self, attrsD):
        context = self._getContext()
        if context.has_key('summary'):
            self._summaryKey = 'content'
            self._start_content(attrsD)
        else:
            self._summaryKey = 'summary'
            self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
    _start_itunes_summary = _start_summary

    def _end_summary(self):
        if self._summaryKey == 'content':
            self._end_content()
        else:
            self.popContent(self._summaryKey or 'summary')
        self._summaryKey = None
    _end_itunes_summary = _end_summary

    def _start_enclosure(self, attrsD):
        attrsD = self._itsAnHrefDamnIt(attrsD)
        self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD))
        href = attrsD.get('href')
        if href:
            context = self._getContext()
            if not context.get('id'):
                context['id'] = href

    def _start_source(self, attrsD):
        self.insource = 1

    def _end_source(self):
        self.insource = 0
        self._getContext()['source'] = copy.deepcopy(self.sourcedata)
        self.sourcedata.clear()

    def _start_content(self, attrsD):
        self.pushContent('content', attrsD, 'text/plain', 1)
        src = attrsD.get('src')
        if src:
            self.contentparams['src'] = src
        self.push('content', 1)

    def _start_prodlink(self, attrsD):
        self.pushContent('content', attrsD, 'text/html', 1)

    def _start_body(self, attrsD):
        self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
    _start_xhtml_body = _start_body

    def _start_content_encoded(self, attrsD):
        self.pushContent('content', attrsD, 'text/html', 1)
    _start_fullitem = _start_content_encoded

    def _end_content(self):
        copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
        value = self.popContent('content')
        if copyToDescription:
            self._save('description', value)
    _end_body = _end_content
    _end_xhtml_body = _end_content
    _end_content_encoded = _end_content
    _end_fullitem = _end_content
    _end_prodlink = _end_content

    def _start_itunes_image(self, attrsD):
        self.push('itunes_image', 0)
        self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
    _start_itunes_link = _start_itunes_image

    def _end_itunes_block(self):
        value = self.pop('itunes_block', 0)
        self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0

    def _end_itunes_explicit(self):
        value = self.pop('itunes_explicit', 0)
        self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0

if _XML_AVAILABLE:
    class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
        def __init__(self, baseuri, baselang, encoding):
            if _debug: sys.stderr.write('trying StrictFeedParser\n')
            xml.sax.handler.ContentHandler.__init__(self)
            _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
            self.bozo = 0
            self.exc = None

        def startPrefixMapping(self, prefix, uri):
            self.trackNamespace(prefix, uri)

        def startElementNS(self, name, qname, attrs):
            namespace, localname = name
            lowernamespace = str(namespace or '').lower()
            if lowernamespace.find('backend.userland.com/rss') <> -1:
                # match any backend.userland.com namespace
                namespace = 'http://backend.userland.com/rss'
                lowernamespace = namespace
            if qname and qname.find(':') > 0:
                givenprefix = qname.split(':')[0]
            else:
                givenprefix = None
            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
            if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
                raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
            if prefix:
                localname = prefix + ':' + localname
            localname = str(localname).lower()
            if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))

            # qname implementation is horribly broken in Python 2.1 (it
            # doesn't report any), and slightly broken in Python 2.2 (it
            # doesn't report the xml: namespace). So we match up namespaces
            # with a known list first, and then possibly override them with
            # the qnames the SAX parser gives us (if indeed it gives us any
            # at all). Thanks to MatejC for helping me test this and
            # tirelessly telling me that it didn't work yet.
            attrsD = {}
            for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
                lowernamespace = (namespace or '').lower()
                prefix = self._matchnamespaces.get(lowernamespace, '')
                if prefix:
                    attrlocalname = prefix + ':' + attrlocalname
                attrsD[str(attrlocalname).lower()] = attrvalue
            for qname in attrs.getQNames():
                attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
            self.unknown_starttag(localname, attrsD.items())

        def characters(self, text):
            self.handle_data(text)

        def endElementNS(self, name, qname):
            namespace, localname = name
            lowernamespace = str(namespace or '').lower()
            if qname and qname.find(':') > 0:
                givenprefix = qname.split(':')[0]
            else:
                givenprefix = ''
            prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
            if prefix:
                localname = prefix + ':' + localname
            localname = str(localname).lower()
            self.unknown_endtag(localname)

        def error(self, exc):
            self.bozo = 1
            self.exc = exc

        def fatalError(self, exc):
            self.error(exc)
            raise exc

class _BaseHTMLProcessor(sgmllib.SGMLParser):
    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
      'img', 'input', 'isindex', 'link', 'meta', 'param']

    def __init__(self, encoding):
        self.encoding = encoding
        if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
        sgmllib.SGMLParser.__init__(self)

    def reset(self):
        self.pieces = []
        sgmllib.SGMLParser.reset(self)

    def _shorttag_replace(self, match):
        tag = match.group(1)
        if tag in self.elements_no_end_tag:
            return '<' + tag + ' />'
        else:
            return '<' + tag + '></' + tag + '>'

    def feed(self, data):
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
        #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
        data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)
        data = data.replace('&#39;', "'")
        data = data.replace('&#34;', '"')
        if self.encoding and type(data) == type(u''):
            data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)

    def normalize_attrs(self, attrs):
        # utility method to be called by descendants
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        return attrs

    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # attrs is a list of (attr, value) tuples
        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
        uattrs = []
        # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
        for key, value in attrs:
            if type(value) != type(u''):
                value = unicode(value, self.encoding)
            uattrs.append((unicode(key, self.encoding), value))
        strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
        if tag in self.elements_no_end_tag:
            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
        else:
            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())

    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be 'pre'
        # Reconstruct the original end tag.
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%(tag)s>" % locals())

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        # Reconstruct the original character reference.
        self.pieces.append('&#%(ref)s;' % locals())

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        # Reconstruct the original entity reference.
        self.pieces.append('&%(ref)s;' % locals())

    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
        self.pieces.append(text)

    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Reconstruct the original comment.
        self.pieces.append('<!--%(text)s-->' % locals())

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self.pieces.append('<?%(text)s>' % locals())

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        self.pieces.append('<!%(text)s>' % locals())

    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
    def _scan_name(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1 # end of buffer
            return name.lower(), m.end()
        else:
            self.handle_data(rawdata)
            # self.updatepos(declstartpos, i)
            return None, -1

    def output(self):
        '''Return processed HTML as a single string'''
        return ''.join([str(p) for p in self.pieces])

class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    def __init__(self, baseuri, baselang, encoding):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)

    def decodeEntities(self, element, data):
        data = data.replace('&#60;', '&lt;')
        data = data.replace('&#x3c;', '&lt;')
        data = data.replace('&#62;', '&gt;')
        data = data.replace('&#x3e;', '&gt;')
        data = data.replace('&#38;', '&amp;')
        data = data.replace('&#x26;', '&amp;')
        data = data.replace('&#34;', '&quot;')
        data = data.replace('&#x22;', '&quot;')
        data = data.replace('&#39;', '&apos;')
        data = data.replace('&#x27;', '&apos;')
        if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            data = data.replace('&lt;', '<')
            data = data.replace('&gt;', '>')
            data = data.replace('&amp;', '&')
            data = data.replace('&quot;', '"')
            data = data.replace('&apos;', "'")
        return data
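
    # e.g. '&#60;p&#62;' is normalized to '&lt;p&gt;' first; if the declared
    # content type marks the value as escaped markup (non-XML), the named
    # entities are then decoded all the way to '<p>' (illustrative input).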

class _RelativeURIResolver(_BaseHTMLProcessor):
    relative_uris = [('a', 'href'),
                     ('applet', 'codebase'),
                     ('area', 'href'),
                     ('blockquote', 'cite'),
                     ('body', 'background'),
                     ('del', 'cite'),
                     ('form', 'action'),
                     ('frame', 'longdesc'),
                     ('frame', 'src'),
                     ('iframe', 'longdesc'),
                     ('iframe', 'src'),
                     ('head', 'profile'),
                     ('img', 'longdesc'),
                     ('img', 'src'),
                     ('img', 'usemap'),
                     ('input', 'src'),
                     ('input', 'usemap'),
                     ('ins', 'cite'),
                     ('link', 'href'),
                     ('object', 'classid'),
                     ('object', 'codebase'),
                     ('object', 'data'),
                     ('object', 'usemap'),
                     ('q', 'cite'),
                     ('script', 'src')]

    def __init__(self, baseuri, encoding):
        _BaseHTMLProcessor.__init__(self, encoding)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        return _urljoin(self.baseuri, uri)

    def unknown_starttag(self, tag, attrs):
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

def _resolveRelativeURIs(htmlSource, baseURI, encoding):
    if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
    p = _RelativeURIResolver(baseURI, encoding)
    p.feed(htmlSource)
    return p.output()
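
# e.g. with baseURI 'http://example.com/blog/', markup such as
# '<a href="post.html">' comes back as
# '<a href="http://example.com/blog/post.html">' (illustrative values).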
1596 class _HTMLSanitizer(_BaseHTMLProcessor):
1597 acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
1598 'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
1599 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
1600 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
1601 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
1602 'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
1603 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
1604 'thead', 'tr', 'tt', 'u', 'ul', 'var']
1606 acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
1607 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
1608 'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
1609 'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
1610 'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
1611 'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
1612 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
1613 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
1614 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
1615 'usemap', 'valign', 'value', 'vspace', 'width']
1617 unacceptable_elements_with_end_tag = ['script', 'applet']
1619 def reset(self):
1620 _BaseHTMLProcessor.reset(self)
1621 self.unacceptablestack = 0
1623 def unknown_starttag(self, tag, attrs):
1624 if not tag in self.acceptable_elements:
1625 if tag in self.unacceptable_elements_with_end_tag:
1626 self.unacceptablestack += 1
1627 return
1628 attrs = self.normalize_attrs(attrs)
1629 attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
1630 _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
1632 def unknown_endtag(self, tag):
1633 if not tag in self.acceptable_elements:
1634 if tag in self.unacceptable_elements_with_end_tag:
1635 self.unacceptablestack -= 1
1636 return
1637 _BaseHTMLProcessor.unknown_endtag(self, tag)
1639 def handle_pi(self, text):
1640 pass
1642 def handle_decl(self, text):
1643 pass
1645 def handle_data(self, text):
1646 if not self.unacceptablestack:
1647 _BaseHTMLProcessor.handle_data(self, text)
1649 def _sanitizeHTML(htmlSource, encoding):
1650 p = _HTMLSanitizer(encoding)
1651 p.feed(htmlSource)
1652 data = p.output()
1653 if TIDY_MARKUP:
1654 # loop through list of preferred Tidy interfaces looking for one that's installed,
1655 # then set up a common _tidy function to wrap the interface-specific API.
1656 _tidy = None
1657 for tidy_interface in PREFERRED_TIDY_INTERFACES:
1658 try:
1659 if tidy_interface == "uTidy":
1660 from tidy import parseString as _utidy
1661 def _tidy(data, **kwargs):
1662 return str(_utidy(data, **kwargs))
1663 break
1664 elif tidy_interface == "mxTidy":
1665 from mx.Tidy import Tidy as _mxtidy
1666 def _tidy(data, **kwargs):
1667 nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
1668 return data
1669 break
1670 except:
1671 pass
1672 if _tidy:
1673 utf8 = type(data) == type(u'')
1674 if utf8:
1675 data = data.encode('utf-8')
1676 data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
1677 if utf8:
1678 data = unicode(data, 'utf-8')
1679 if data.count('<body'):
1680 data = data.split('<body', 1)[1]
1681 if data.count('>'):
1682 data = data.split('>', 1)[1]
1683 if data.count('</body'):
1684 data = data.split('</body', 1)[0]
1685 data = data.strip().replace('\r\n', '\n')
1686 return data
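# An illustrative call, not used by the module itself, assuming TIDY_MARKUP
# is left at 0: unacceptable attributes are silently dropped, and script and
# applet elements disappear along with everything inside them.
def _example_sanitizeHTML():
    return _sanitizeHTML('<p onclick="evil()">hi<script>evil()</script></p>', 'utf-8')
    # returns '<p>hi</p>'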
1688 class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
1689 def http_error_default(self, req, fp, code, msg, headers):
1690 if ((code / 100) == 3) and (code != 304):
1691 return self.http_error_302(req, fp, code, msg, headers)
1692 infourl = urllib.addinfourl(fp, headers, req.get_full_url())
1693 infourl.status = code
1694 return infourl
1696 def http_error_302(self, req, fp, code, msg, headers):
1697 if headers.dict.has_key('location'):
1698 infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
1699 else:
1700 infourl = urllib.addinfourl(fp, headers, req.get_full_url())
1701 if not hasattr(infourl, 'status'):
1702 infourl.status = code
1703 return infourl
1705 def http_error_301(self, req, fp, code, msg, headers):
1706 if headers.dict.has_key('location'):
1707 infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
1708 else:
1709 infourl = urllib.addinfourl(fp, headers, req.get_full_url())
1710 if not hasattr(infourl, 'status'):
1711 infourl.status = code
1712 return infourl
1714 http_error_300 = http_error_302
1715 http_error_303 = http_error_302
1716 http_error_307 = http_error_302
1718 def http_error_401(self, req, fp, code, msg, headers):
1719 # Check if
1720 # - server requires digest auth, AND
1721 # - we tried (unsuccessfully) with basic auth, AND
1722 # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions)
1723 # If all conditions hold, parse authentication information
1724 # out of the Authorization header we sent the first time
1725 # (for the username and password) and the WWW-Authenticate
1726 # header the server sent back (for the realm) and retry
1727 # the request with the appropriate digest auth headers instead.
1728 # This evil genius hack has been brought to you by Aaron Swartz.
1729 host = urlparse.urlparse(req.get_full_url())[1]
1730 try:
1731 assert sys.version.split()[0] >= '2.3.3'
1732 assert base64 != None
1733 user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':')
1734 realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
1735 self.add_password(realm, host, user, passw)
1736 retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
1737 self.reset_retry_count()
1738 return retry
1739 except:
1740 return self.http_error_default(req, fp, code, msg, headers)
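# A stand-alone sketch of how this handler is wired up (the URL is made up;
# _open_resource below does this for real).  A plain 200 response never
# passes through the error callbacks above, so the status attribute may be
# absent on success.
def _example_feed_url_handler():
    opener = urllib2.build_opener(_FeedURLHandler())
    f = opener.open(urllib2.Request('http://example.org/feed.xml'))
    return getattr(f, 'status', 200), f.geturl()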
1742 def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
1743 """URL, filename, or string --> stream
1745 This function lets you define parsers that take any input source
1746 (URL, pathname to local or network file, or actual data as a string)
1747 and deal with it in a uniform manner. Returned object is guaranteed
1748 to have all the basic stdio read methods (read, readline, readlines).
1749 Just .close() the object when you're done with it.
1751 If the etag argument is supplied, it will be used as the value of an
1752 If-None-Match request header.
1754 If the modified argument is supplied, it must be a tuple of 9 integers
1755 as returned by gmtime() in the standard Python time module. This MUST
1756 be in GMT (Greenwich Mean Time). The formatted date/time will be used
1757 as the value of an If-Modified-Since request header.
1759 If the agent argument is supplied, it will be used as the value of a
1760 User-Agent request header.
1762 If the referrer argument is supplied, it will be used as the value of a
1763 Referer[sic] request header.
1765 If handlers is supplied, it is a list of handlers used to build a
1766 urllib2 opener.
1767 """
1769 if hasattr(url_file_stream_or_string, 'read'):
1770 return url_file_stream_or_string
1772 if url_file_stream_or_string == '-':
1773 return sys.stdin
1775 if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
1776 if not agent:
1777 agent = USER_AGENT
1778 # test for inline user:password for basic auth
1779 auth = None
1780 if base64:
1781 urltype, rest = urllib.splittype(url_file_stream_or_string)
1782 realhost, rest = urllib.splithost(rest)
1783 if realhost:
1784 user_passwd, realhost = urllib.splituser(realhost)
1785 if user_passwd:
1786 url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
1787 auth = base64.encodestring(user_passwd).strip()
1788 # try to open with urllib2 (to use optional headers)
1789 request = urllib2.Request(url_file_stream_or_string)
1790 request.add_header('User-Agent', agent)
1791 if etag:
1792 request.add_header('If-None-Match', etag)
1793 if modified:
1794 # format into an RFC 1123-compliant timestamp. We can't use
1795 # time.strftime() since the %a and %b directives can be affected
1796 # by the current locale, but RFC 2616 states that dates must be
1797 # in English.
1798 short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
1799 months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
1800 request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
1801 if referrer:
1802 request.add_header('Referer', referrer)
1803 if gzip and zlib:
1804 request.add_header('Accept-encoding', 'gzip, deflate')
1805 elif gzip:
1806 request.add_header('Accept-encoding', 'gzip')
1807 elif zlib:
1808 request.add_header('Accept-encoding', 'deflate')
1809 else:
1810 request.add_header('Accept-encoding', '')
1811 if auth:
1812 request.add_header('Authorization', 'Basic %s' % auth)
1813 if ACCEPT_HEADER:
1814 request.add_header('Accept', ACCEPT_HEADER)
1815 request.add_header('A-IM', 'feed') # RFC 3229 support
1816 opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers))
1817 opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
1818 try:
1819 return opener.open(request)
1820 finally:
1821 opener.close() # JohnD
1823 # try to open with native open function (if url_file_stream_or_string is a filename)
1824 try:
1825 return open(url_file_stream_or_string)
1826 except:
1827 pass
1829 # treat url_file_stream_or_string as string
1830 return _StringIO(str(url_file_stream_or_string))
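# A quick sketch of the input polymorphism described in the docstring, not
# used by the module itself: raw feed data falls through the URL and
# filename branches and comes back wrapped in a StringIO.
def _example_open_resource():
    f = _open_resource('<rss version="2.0"/>', None, None, USER_AGENT, None, [])
    data = f.read()
    f.close()
    return data
    # returns '<rss version="2.0"/>'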
1832 _date_handlers = []
1833 def registerDateHandler(func):
1834 '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
1835 _date_handlers.insert(0, func)
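# A sketch of runtime extension through registerDateHandler.  The dotted
# day.month.year format below is made up for illustration; it is not one of
# the formats handled by the built-in parsers that follow.
def _example_registerDateHandler():
    def _parse_date_dotted(dateString):
        '''Parse dates like 05.01.2004 into a 9-tuple in GMT'''
        m = re.match(r'(\d{2})\.(\d{2})\.(\d{4})$', dateString)
        if not m: return
        day, month, year = map(int, m.groups())
        return time.gmtime(time.mktime((year, month, day, 0, 0, 0, 0, 0, 0)))
    registerDateHandler(_parse_date_dotted)
    return _parse_date('05.01.2004')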
1837 # ISO-8601 date parsing routines written by Fazal Majid.
1838 # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
1839 # parser is beyond the scope of feedparser and would be a worthwhile addition
1840 # to the Python library.
1841 # A single regular expression cannot parse ISO 8601 date formats into groups
1842 # as the standard is highly irregular (for instance is 030104 2003-01-04 or
1843 # 0301-04-01), so we use templates instead.
1844 # Please note the order in templates is significant because we need a
1845 # greedy match.
1846 _iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
1847 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
1848 '-YY-?MM', '-OOO', '-YY',
1849 '--MM-?DD', '--MM',
1850 '---DD',
1851 'CC', '']
1852 _iso8601_re = [
1853 tmpl.replace(
1854 'YYYY', r'(?P<year>\d{4})').replace(
1855 'YY', r'(?P<year>\d\d)').replace(
1856 'MM', r'(?P<month>[01]\d)').replace(
1857 'DD', r'(?P<day>[0123]\d)').replace(
1858 'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
1859 'CC', r'(?P<century>\d\d$)')
1860 + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
1861 + r'(:(?P<second>\d{2}))?'
1862 + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
1863 for tmpl in _iso8601_tmpl]
1864 del tmpl
1865 _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
1866 del regex
1867 def _parse_date_iso8601(dateString):
1868 '''Parse a variety of ISO-8601-compatible formats like 20040105'''
1869 m = None
1870 for _iso8601_match in _iso8601_matches:
1871 m = _iso8601_match(dateString)
1872 if m: break
1873 if not m: return
1874 if m.span() == (0, 0): return
1875 params = m.groupdict()
1876 ordinal = params.get('ordinal', 0)
1877 if ordinal:
1878 ordinal = int(ordinal)
1879 else:
1880 ordinal = 0
1881 year = params.get('year', '--')
1882 if not year or year == '--':
1883 year = time.gmtime()[0]
1884 elif len(year) == 2:
1885 # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
1886 year = 100 * int(time.gmtime()[0] / 100) + int(year)
1887 else:
1888 year = int(year)
1889 month = params.get('month', '-')
1890 if not month or month == '-':
1891 # ordinals are NOT normalized by mktime, we simulate them
1892 # by setting month=1, day=ordinal
1893 if ordinal:
1894 month = 1
1895 else:
1896 month = time.gmtime()[1]
1897 month = int(month)
1898 day = params.get('day', 0)
1899 if not day:
1900 # see above
1901 if ordinal:
1902 day = ordinal
1903 elif params.get('century', 0) or \
1904 params.get('year', 0) or params.get('month', 0):
1905 day = 1
1906 else:
1907 day = time.gmtime()[2]
1908 else:
1909 day = int(day)
1910 # special case of the century - is the first year of the 21st century
1911 # 2000 or 2001 ? The debate goes on...
1912 if 'century' in params.keys():
1913 year = (int(params['century']) - 1) * 100 + 1
1914 # in ISO 8601 most fields are optional
1915 for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
1916 if not params.get(field, None):
1917 params[field] = 0
1918 hour = int(params.get('hour', 0))
1919 minute = int(params.get('minute', 0))
1920 second = int(params.get('second', 0))
1921 # weekday is normalized by mktime(), we can ignore it
1922 weekday = 0
1923 # daylight savings is complex, but not needed for feedparser's purposes
1924 # as time zones, if specified, include mention of whether it is active
1925 # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
1926 # most implementations have DST bugs
1927 daylight_savings_flag = 0
1928 tm = [year, month, day, hour, minute, second, weekday,
1929 ordinal, daylight_savings_flag]
1930 # ISO 8601 time zone adjustments
1931 tz = params.get('tz')
1932 if tz and tz != 'Z':
1933 if tz[0] == '-':
1934 tm[3] += int(params.get('tzhour', 0))
1935 tm[4] += int(params.get('tzmin', 0))
1936 elif tz[0] == '+':
1937 tm[3] -= int(params.get('tzhour', 0))
1938 tm[4] -= int(params.get('tzmin', 0))
1939 else:
1940 return None
1941 # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
1942 # which is guaranteed to normalize d/m/y/h/m/s.
1943 # Many implementations have bugs, but we'll pretend they don't.
1944 return time.localtime(time.mktime(tm))
1945 registerDateHandler(_parse_date_iso8601)
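# A couple of accepted spellings, for illustration (not used by the module
# itself); both calls return the same kind of 9-tuple via time.localtime.
def _example_parse_date_iso8601():
    return (_parse_date_iso8601('2004-01-05T12:30:00Z'),
            _parse_date_iso8601('20040105'))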
1947 # 8-bit date handling routines written by ytrewq1.
1948 _korean_year = u'\ub144' # b3e2 in euc-kr
1949 _korean_month = u'\uc6d4' # bff9 in euc-kr
1950 _korean_day = u'\uc77c' # c0cf in euc-kr
1951 _korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr
1952 _korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr
1954 _korean_onblog_date_re = \
1955 re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
1956 (_korean_year, _korean_month, _korean_day))
1957 _korean_nate_date_re = \
1958 re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
1959 (_korean_am, _korean_pm))
1960 def _parse_date_onblog(dateString):
1961 '''Parse a string according to the OnBlog 8-bit date format'''
1962 m = _korean_onblog_date_re.match(dateString)
1963 if not m: return
1964 w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
1965 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
1966 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
1967 'zonediff': '+09:00'}
1968 if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
1969 return _parse_date_w3dtf(w3dtfdate)
1970 registerDateHandler(_parse_date_onblog)
1972 def _parse_date_nate(dateString):
1973 '''Parse a string according to the Nate 8-bit date format'''
1974 m = _korean_nate_date_re.match(dateString)
1975 if not m: return
1976 hour = int(m.group(5))
1977 ampm = m.group(4)
1978 if (ampm == _korean_pm):
1979 hour += 12
1980 hour = str(hour)
1981 if len(hour) == 1:
1982 hour = '0' + hour
1983 w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
1984 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
1985 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
1986 'zonediff': '+09:00'}
1987 if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
1988 return _parse_date_w3dtf(w3dtfdate)
1989 registerDateHandler(_parse_date_nate)
1991 _mssql_date_re = \
1992 re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
1993 def _parse_date_mssql(dateString):
1994 '''Parse a string according to the MS SQL date format'''
1995 m = _mssql_date_re.match(dateString)
1996 if not m: return
1997 w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
1998 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
1999 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
2000 'zonediff': '+09:00'}
2001 if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
2002 return _parse_date_w3dtf(w3dtfdate)
2003 registerDateHandler(_parse_date_mssql)
2005 # Unicode strings for Greek date strings
2006 _greek_months = \
2007 {
2008 u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7
2009 u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7
2010 u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7
2011 u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7
2012 u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7
2013 u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7
2014 u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7
2015 u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7
2016 u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
2017 u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7
2018 u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
2019 u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7
2020 u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7
2021 u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7
2022 u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7
2023 u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7
2024 u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7
2025 u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7
2026 u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7
2027 }
2029 _greek_wdays = \
2030 {
2031 u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
2032 u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
2033 u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
2034 u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
2035 u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
2036 u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
2037 u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
2038 }
2040 _greek_date_format_re = \
2041 re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
2043 def _parse_date_greek(dateString):
2044 '''Parse a string according to a Greek 8-bit date format.'''
2045 m = _greek_date_format_re.match(dateString)
2046 if not m: return
2047 try:
2048 wday = _greek_wdays[m.group(1)]
2049 month = _greek_months[m.group(3)]
2050 except:
2051 return
2052 rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
2053 {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
2054 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
2055 'zonediff': m.group(8)}
2056 if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
2057 return _parse_date_rfc822(rfc822date)
2058 registerDateHandler(_parse_date_greek)
2060 # Unicode strings for Hungarian date strings
2061 _hungarian_months = \
2062 {
2063 u'janu\u00e1r': u'01', # e1 in iso-8859-2
2064 u'febru\u00e1ri': u'02', # e1 in iso-8859-2
2065 u'm\u00e1rcius': u'03', # e1 in iso-8859-2
2066 u'\u00e1prilis': u'04', # e1 in iso-8859-2
2067 u'm\u00e1jus': u'05', # e1 in iso-8859-2
2068 u'j\u00fanius': u'06', # fa in iso-8859-2
2069 u'j\u00falius': u'07', # fa in iso-8859-2
2070 u'augusztus': u'08',
2071 u'szeptember': u'09',
2072 u'okt\u00f3ber': u'10', # f3 in iso-8859-2
2073 u'november': u'11',
2074 u'december': u'12',
2075 }
2077 _hungarian_date_format_re = \
2078 re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
2080 def _parse_date_hungarian(dateString):
2081 '''Parse a string according to a Hungarian 8-bit date format.'''
2082 m = _hungarian_date_format_re.match(dateString)
2083 if not m: return
2084 try:
2085 month = _hungarian_months[m.group(2)]
2086 day = m.group(3)
2087 if len(day) == 1:
2088 day = '0' + day
2089 hour = m.group(4)
2090 if len(hour) == 1:
2091 hour = '0' + hour
2092 except:
2093 return
2094 w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
2095 {'year': m.group(1), 'month': month, 'day': day,\
2096 'hour': hour, 'minute': m.group(5),\
2097 'zonediff': m.group(6)}
2098 if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
2099 return _parse_date_w3dtf(w3dtfdate)
2100 registerDateHandler(_parse_date_hungarian)
2102 # W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
2103 # Drake and licensed under the Python license. Removed all range checking
2104 # for month, day, hour, minute, and second, since mktime will normalize
2105 # these later
2106 def _parse_date_w3dtf(dateString):
2107 def __extract_date(m):
2108 year = int(m.group('year'))
2109 if year < 100:
2110 year = 100 * int(time.gmtime()[0] / 100) + int(year)
2111 if year < 1000:
2112 return 0, 0, 0
2113 julian = m.group('julian')
2114 if julian:
2115 julian = int(julian)
2116 month = julian / 30 + 1
2117 day = julian % 30 + 1
2118 jday = None
2119 while jday != julian:
2120 t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
2121 jday = time.gmtime(t)[-2]
2122 diff = abs(jday - julian)
2123 if jday > julian:
2124 if diff < day:
2125 day = day - diff
2126 else:
2127 month = month - 1
2128 day = 31
2129 elif jday < julian:
2130 if day + diff < 28:
2131 day = day + diff
2132 else:
2133 month = month + 1
2134 return year, month, day
2135 month = m.group('month')
2136 day = 1
2137 if month is None:
2138 month = 1
2139 else:
2140 month = int(month)
2141 day = m.group('day')
2142 if day:
2143 day = int(day)
2144 else:
2145 day = 1
2146 return year, month, day
2148 def __extract_time(m):
2149 if not m:
2150 return 0, 0, 0
2151 hours = m.group('hours')
2152 if not hours:
2153 return 0, 0, 0
2154 hours = int(hours)
2155 minutes = int(m.group('minutes'))
2156 seconds = m.group('seconds')
2157 if seconds:
2158 seconds = int(seconds)
2159 else:
2160 seconds = 0
2161 return hours, minutes, seconds
2163 def __extract_tzd(m):
2164 '''Return the Time Zone Designator as an offset in seconds from UTC.'''
2165 if not m:
2166 return 0
2167 tzd = m.group('tzd')
2168 if not tzd:
2169 return 0
2170 if tzd == 'Z':
2171 return 0
2172 hours = int(m.group('tzdhours'))
2173 minutes = m.group('tzdminutes')
2174 if minutes:
2175 minutes = int(minutes)
2176 else:
2177 minutes = 0
2178 offset = (hours*60 + minutes) * 60
2179 if tzd[0] == '+':
2180 return -offset
2181 return offset
2183 __date_re = ('(?P<year>\d\d\d\d)'
2184 '(?:(?P<dsep>-|)'
2185 '(?:(?P<julian>\d\d\d)'
2186 '|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?')
2187 __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
2188 __tzd_rx = re.compile(__tzd_re)
2189 __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
2190 '(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?'
2191 + __tzd_re)
2192 __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
2193 __datetime_rx = re.compile(__datetime_re)
2194 m = __datetime_rx.match(dateString)
2195 if (m is None) or (m.group() != dateString): return
2196 gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
2197 if gmt[0] == 0: return
2198 return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
2199 registerDateHandler(_parse_date_w3dtf)
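# An illustrative property of the parser above, not used by the module
# itself: the same instant written with a 'Z' designator and with a numeric
# offset normalizes to the same GMT 9-tuple.
def _example_parse_date_w3dtf():
    a = _parse_date_w3dtf('2003-12-31T10:14:55Z')
    b = _parse_date_w3dtf('2003-12-31T02:14:55-08:00')
    return a == b  # True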
2201 def _parse_date_rfc822(dateString):
2202 '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
2203 data = dateString.split()
2204 if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
2205 del data[0]
2206 if len(data) == 4:
2207 s = data[3]
2208 i = s.find('+')
2209 if i > 0:
2210 data[3:] = [s[:i], s[i+1:]]
2211 else:
2212 data.append('')
2213 dateString = " ".join(data)
2214 if len(data) < 5:
2215 dateString += ' 00:00:00 GMT'
2216 tm = rfc822.parsedate_tz(dateString)
2217 if tm:
2218 return time.gmtime(rfc822.mktime_tz(tm))
2219 # rfc822.py defines several time zones, but we define some extra ones.
2220 # 'ET' is equivalent to 'EST', etc.
2221 _additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
2222 rfc822._timezones.update(_additional_timezones)
2223 registerDateHandler(_parse_date_rfc822)
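# One illustrative call, not used by the module itself; 'ET' resolves
# through the extra timezone table registered above.
def _example_parse_date_rfc822():
    return _parse_date_rfc822('Thu, 01 Jan 2004 19:48:21 ET')
    # equivalent to 00:48:21 GMT on 02 Jan 2004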
2225 def _parse_date(dateString):
2226 '''Parses a variety of date formats into a 9-tuple in GMT'''
2227 for handler in _date_handlers:
2228 try:
2229 date9tuple = handler(dateString)
2230 if not date9tuple: continue
2231 if len(date9tuple) != 9:
2232 if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
2233 raise ValueError
2234 map(int, date9tuple)
2235 return date9tuple
2236 except Exception, e:
2237 if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
2238 pass
2239 return None
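# _parse_date is the single dispatch point for all handlers registered
# above; a sketch of the round trip (not used by the module itself):
def _example_parse_date():
    return _parse_date('Sun, 04 Jan 2004 00:00:00 GMT')[:6]
    # returns (2004, 1, 4, 0, 0, 0)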
2241 def _getCharacterEncoding(http_headers, xml_data):
2242 '''Get the character encoding of the XML document
2244 http_headers is a dictionary
2245 xml_data is a raw string (not Unicode)
2247 This is so much trickier than it sounds, it's not even funny.
2248 According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
2249 is application/xml, application/*+xml,
2250 application/xml-external-parsed-entity, or application/xml-dtd,
2251 the encoding given in the charset parameter of the HTTP Content-Type
2252 takes precedence over the encoding given in the XML prefix within the
2253 document, and defaults to 'utf-8' if neither are specified. But, if
2254 the HTTP Content-Type is text/xml, text/*+xml, or
2255 text/xml-external-parsed-entity, the encoding given in the XML prefix
2256 within the document is ALWAYS IGNORED and only the encoding given in
2257 the charset parameter of the HTTP Content-Type header should be
2258 respected, and it defaults to 'us-ascii' if not specified.
2260 Furthermore, discussion on the atom-syntax mailing list with the
2261 author of RFC 3023 leads me to the conclusion that any document
2262 served with a Content-Type of text/* and no charset parameter
2263 must be treated as us-ascii. (We now do this.) And also that it
2264 must always be flagged as non-well-formed. (We now do this too.)
2266 If Content-Type is unspecified (input was local file or non-HTTP source)
2267 or unrecognized (server just got it totally wrong), then go by the
2268 encoding given in the XML prefix of the document and default to
2269 'iso-8859-1' as per the HTTP specification (RFC 2616).
2271 Then, assuming we didn't find a character encoding in the HTTP headers
2272 (and the HTTP Content-type allowed us to look in the body), we need
2273 to sniff the first few bytes of the XML data and try to determine
2274 whether the encoding is ASCII-compatible. Section F of the XML
2275 specification shows the way here:
2276 http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
2278 If the sniffed encoding is not ASCII-compatible, we need to make it
2279 ASCII compatible so that we can sniff further into the XML declaration
2280 to find the encoding attribute, which will tell us the true encoding.
2282 Of course, none of this guarantees that we will be able to parse the
2283 feed in the declared character encoding (assuming it was declared
2284 correctly, which many are not). CJKCodecs and iconv_codec help a lot;
2285 you should definitely install them if you can.
2286 http://cjkpython.i18n.org/
2287 '''
2289 def _parseHTTPContentType(content_type):
2290 '''takes HTTP Content-Type header and returns (content type, charset)
2292 If no charset is specified, returns (content type, '')
2293 If no content type is specified, returns ('', '')
2294 Both return parameters are guaranteed to be lowercase strings
2295 '''
2296 content_type = content_type or ''
2297 content_type, params = cgi.parse_header(content_type)
2298 return content_type, params.get('charset', '').replace("'", '')
2300 sniffed_xml_encoding = ''
2301 xml_encoding = ''
2302 true_encoding = ''
2303 http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))
2304 # Must sniff for non-ASCII-compatible character encodings before
2305 # searching for XML declaration. This heuristic is defined in
2306 # section F of the XML specification:
2307 # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
2308 try:
2309 if xml_data[:4] == '\x4c\x6f\xa7\x94':
2310 # EBCDIC
2311 xml_data = _ebcdic_to_ascii(xml_data)
2312 elif xml_data[:4] == '\x00\x3c\x00\x3f':
2313 # UTF-16BE
2314 sniffed_xml_encoding = 'utf-16be'
2315 xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
2316 elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
2317 # UTF-16BE with BOM
2318 sniffed_xml_encoding = 'utf-16be'
2319 xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
2320 elif xml_data[:4] == '\x3c\x00\x3f\x00':
2321 # UTF-16LE
2322 sniffed_xml_encoding = 'utf-16le'
2323 xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
2324 elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
2325 # UTF-16LE with BOM
2326 sniffed_xml_encoding = 'utf-16le'
2327 xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
2328 elif xml_data[:4] == '\x00\x00\x00\x3c':
2329 # UTF-32BE
2330 sniffed_xml_encoding = 'utf-32be'
2331 xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
2332 elif xml_data[:4] == '\x3c\x00\x00\x00':
2333 # UTF-32LE
2334 sniffed_xml_encoding = 'utf-32le'
2335 xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
2336 elif xml_data[:4] == '\x00\x00\xfe\xff':
2337 # UTF-32BE with BOM
2338 sniffed_xml_encoding = 'utf-32be'
2339 xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
2340 elif xml_data[:4] == '\xff\xfe\x00\x00':
2341 # UTF-32LE with BOM
2342 sniffed_xml_encoding = 'utf-32le'
2343 xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
2344 elif xml_data[:3] == '\xef\xbb\xbf':
2345 # UTF-8 with BOM
2346 sniffed_xml_encoding = 'utf-8'
2347 xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
2348 else:
2349 # ASCII-compatible
2350 pass
2351 xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
2352 except:
2353 xml_encoding_match = None
2354 if xml_encoding_match:
2355 xml_encoding = xml_encoding_match.groups()[0].lower()
2356 if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
2357 xml_encoding = sniffed_xml_encoding
2358 acceptable_content_type = 0
2359 application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
2360 text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
2361 if (http_content_type in application_content_types) or \
2362 (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
2363 acceptable_content_type = 1
2364 true_encoding = http_encoding or xml_encoding or 'utf-8'
2365 elif (http_content_type in text_content_types) or \
2366 (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'):
2367 acceptable_content_type = 1
2368 true_encoding = http_encoding or 'us-ascii'
2369 elif http_content_type.startswith('text/'):
2370 true_encoding = http_encoding or 'us-ascii'
2371 elif http_headers and (not http_headers.has_key('content-type')):
2372 true_encoding = xml_encoding or 'iso-8859-1'
2373 else:
2374 true_encoding = xml_encoding or 'utf-8'
2375 return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
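# A sketch of the RFC 3023 precedence described in the docstring (headers
# and data are made up, and the function is not called this way anywhere in
# the module): for application/* media types the charset parameter beats
# the encoding in the XML declaration.
def _example_getCharacterEncoding():
    headers = {'content-type': 'application/xml; charset=iso-8859-2'}
    data = "<?xml version='1.0' encoding='utf-8'?><feed/>"
    return _getCharacterEncoding(headers, data)[0]
    # returns 'iso-8859-2'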
2377 def _toUTF8(data, encoding):
2378 '''Changes an XML data stream on the fly to specify a new encoding
2380 data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
2381 encoding is a string recognized by encodings.aliases
2382 '''
2383 if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
2384 # strip Byte Order Mark (if present)
2385 if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
2386 if _debug:
2387 sys.stderr.write('stripping BOM\n')
2388 if encoding != 'utf-16be':
2389 sys.stderr.write('trying utf-16be instead\n')
2390 encoding = 'utf-16be'
2391 data = data[2:]
2392 elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
2393 if _debug:
2394 sys.stderr.write('stripping BOM\n')
2395 if encoding != 'utf-16le':
2396 sys.stderr.write('trying utf-16le instead\n')
2397 encoding = 'utf-16le'
2398 data = data[2:]
2399 elif data[:3] == '\xef\xbb\xbf':
2400 if _debug:
2401 sys.stderr.write('stripping BOM\n')
2402 if encoding != 'utf-8':
2403 sys.stderr.write('trying utf-8 instead\n')
2404 encoding = 'utf-8'
2405 data = data[3:]
2406 elif data[:4] == '\x00\x00\xfe\xff':
2407 if _debug:
2408 sys.stderr.write('stripping BOM\n')
2409 if encoding != 'utf-32be':
2410 sys.stderr.write('trying utf-32be instead\n')
2411 encoding = 'utf-32be'
2412 data = data[4:]
2413 elif data[:4] == '\xff\xfe\x00\x00':
2414 if _debug:
2415 sys.stderr.write('stripping BOM\n')
2416 if encoding != 'utf-32le':
2417 sys.stderr.write('trying utf-32le instead\n')
2418 encoding = 'utf-32le'
2419 data = data[4:]
2420 newdata = unicode(data, encoding)
2421 if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
2422 declmatch = re.compile('^<\?xml[^>]*?>')
2423 newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
2424 if declmatch.search(newdata):
2425 newdata = declmatch.sub(newdecl, newdata)
2426 else:
2427 newdata = newdecl + u'\n' + newdata
2428 return newdata.encode('utf-8')
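# A sketch with made-up input: an iso-8859-1 document is re-encoded as
# UTF-8 and its declaration rewritten to match.
def _example_toUTF8():
    data = "<?xml version='1.0' encoding='iso-8859-1'?><a>caf\xe9</a>"
    return _toUTF8(data, 'iso-8859-1')
    # returns "<?xml version='1.0' encoding='utf-8'?><a>caf\xc3\xa9</a>"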
2430 def _stripDoctype(data):
2431 '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
2433 rss_version may be 'rss091n' or None
2434 stripped_data is the same XML document, minus the DOCTYPE
2435 '''
2436 entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
2437 data = entity_pattern.sub('', data)
2438 doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
2439 doctype_results = doctype_pattern.findall(data)
2440 doctype = doctype_results and doctype_results[0] or ''
2441 if doctype.lower().count('netscape'):
2442 version = 'rss091n'
2443 else:
2444 version = None
2445 data = doctype_pattern.sub('', data)
2446 return version, data
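# A sketch with made-up input: a Netscape RSS 0.91 DOCTYPE is detected and
# stripped.
def _example_stripDoctype():
    data = '<!DOCTYPE rss PUBLIC "-//Netscape Communications//DTD RSS 0.91//EN"><rss/>'
    return _stripDoctype(data)
    # returns ('rss091n', '<rss/>')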
2448 def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
2449 '''Parse a feed from a URL, file, stream, or string'''
2450 result = FeedParserDict()
2451 result['feed'] = FeedParserDict()
2452 result['entries'] = []
2453 if _XML_AVAILABLE:
2454 result['bozo'] = 0
2455 if type(handlers) == types.InstanceType:
2456 handlers = [handlers]
2457 try:
2458 f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
2459 data = f.read()
2460 except Exception, e:
2461 result['bozo'] = 1
2462 result['bozo_exception'] = e
2463 data = ''
2464 f = None
2466 # if feed is gzip-compressed, decompress it
2467 if f and data and hasattr(f, 'headers'):
2468 if gzip and f.headers.get('content-encoding', '') == 'gzip':
2469 try:
2470 data = gzip.GzipFile(fileobj=_StringIO(data)).read()
2471 except Exception, e:
2472 # Some feeds claim to be gzipped but they're not, so
2473 # we get garbage. Ideally, we should re-request the
2474 # feed without the 'Accept-encoding: gzip' header,
2475 # but we don't.
2476 result['bozo'] = 1
2477 result['bozo_exception'] = e
2478 data = ''
2479 elif zlib and f.headers.get('content-encoding', '') == 'deflate':
2480 try:
2481 data = zlib.decompress(data, -zlib.MAX_WBITS)
2482 except Exception, e:
2483 result['bozo'] = 1
2484 result['bozo_exception'] = e
2485 data = ''
2487 # save HTTP headers
2488 if hasattr(f, 'info'):
2489 info = f.info()
2490 result['etag'] = info.getheader('ETag')
2491 last_modified = info.getheader('Last-Modified')
2492 if last_modified:
2493 result['modified'] = _parse_date(last_modified)
2494 if hasattr(f, 'url'):
2495 result['href'] = f.url
2496 result['status'] = 200
2497 if hasattr(f, 'status'):
2498 result['status'] = f.status
2499 if hasattr(f, 'headers'):
2500 result['headers'] = f.headers.dict
2501 if hasattr(f, 'close'):
2502 f.close()
2504 # there are four encodings to keep track of:
2505 # - http_encoding is the encoding declared in the Content-Type HTTP header
2506 # - xml_encoding is the encoding declared in the <?xml declaration
2507 # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
2508 # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
2509 http_headers = result.get('headers', {})
2510 result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
2511 _getCharacterEncoding(http_headers, data)
2512 if http_headers and (not acceptable_content_type):
2513 if http_headers.has_key('content-type'):
2514 bozo_message = '%s is not an XML media type' % http_headers['content-type']
2515 else:
2516 bozo_message = 'no Content-type specified'
2517 result['bozo'] = 1
2518 result['bozo_exception'] = NonXMLContentType(bozo_message)
2520 result['version'], data = _stripDoctype(data)
2522 baseuri = http_headers.get('content-location', result.get('href'))
2523 baselang = http_headers.get('content-language', None)
2525 # if server sent 304, we're done
2526 if result.get('status', 0) == 304:
2527 result['version'] = ''
2528 result['debug_message'] = 'The feed has not changed since you last checked, ' + \
2529 'so the server sent no data. This is a feature, not a bug!'
2530 return result
2532 # if there was a problem downloading, we're done
2533 if not data:
2534 return result
2536 # determine character encoding
2537 use_strict_parser = 0
2538 known_encoding = 0
2539 tried_encodings = []
2540 # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
2541 for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
2542 if not proposed_encoding: continue
2543 if proposed_encoding in tried_encodings: continue
2544 tried_encodings.append(proposed_encoding)
2545 try:
2546 data = _toUTF8(data, proposed_encoding)
2547 known_encoding = use_strict_parser = 1
2548 break
2549 except:
2550 pass
2551 # if no luck and we have auto-detection library, try that
2552 if (not known_encoding) and chardet:
2553 try:
2554 proposed_encoding = chardet.detect(data)['encoding']
2555 if proposed_encoding and (proposed_encoding not in tried_encodings):
2556 tried_encodings.append(proposed_encoding)
2557 data = _toUTF8(data, proposed_encoding)
2558 known_encoding = use_strict_parser = 1
2559 except:
2560 pass
2561 # if still no luck and we haven't tried utf-8 yet, try that
2562 if (not known_encoding) and ('utf-8' not in tried_encodings):
2563 try:
2564 proposed_encoding = 'utf-8'
2565 tried_encodings.append(proposed_encoding)
2566 data = _toUTF8(data, proposed_encoding)
2567 known_encoding = use_strict_parser = 1
2568 except:
2569 pass
2570 # if still no luck and we haven't tried windows-1252 yet, try that
2571 if (not known_encoding) and ('windows-1252' not in tried_encodings):
2572 try:
2573 proposed_encoding = 'windows-1252'
2574 tried_encodings.append(proposed_encoding)
2575 data = _toUTF8(data, proposed_encoding)
2576 known_encoding = use_strict_parser = 1
2577 except:
2578 pass
2579 # if still no luck, give up
2580 if not known_encoding:
2581 result['bozo'] = 1
2582 result['bozo_exception'] = CharacterEncodingUnknown( \
2583 'document encoding unknown, I tried ' + \
2584 '%s, %s, utf-8, and windows-1252 but nothing worked' % \
2585 (result['encoding'], xml_encoding))
2586 result['encoding'] = ''
2587 elif proposed_encoding != result['encoding']:
2588 result['bozo'] = 1
2589 result['bozo_exception'] = CharacterEncodingOverride( \
2590 'document declared as %s, but parsed as %s' % \
2591 (result['encoding'], proposed_encoding))
2592 result['encoding'] = proposed_encoding
2594 if not _XML_AVAILABLE:
2595 use_strict_parser = 0
2596 if use_strict_parser:
2597 # initialize the SAX parser
2598 feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
2599 saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
2600 saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
2601 saxparser.setContentHandler(feedparser)
2602 saxparser.setErrorHandler(feedparser)
2603 source = xml.sax.xmlreader.InputSource()
2604 source.setByteStream(_StringIO(data))
2605 if hasattr(saxparser, '_ns_stack'):
2606 # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
2607 # PyXML doesn't have this problem, and it doesn't have _ns_stack either
2608 saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
2609 try:
2610 saxparser.parse(source)
2611 except Exception, e:
2612 if _debug:
2613 import traceback
2614 traceback.print_stack()
2615 traceback.print_exc()
2616 sys.stderr.write('xml parsing failed\n')
2617 result['bozo'] = 1
2618 result['bozo_exception'] = feedparser.exc or e
2619 use_strict_parser = 0
2620 if not use_strict_parser:
2621 feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '')
2622 feedparser.feed(data)
2623 result['feed'] = feedparser.feeddata
2624 result['entries'] = feedparser.entries
2625 result['version'] = result['version'] or feedparser.version
2626 result['namespaces'] = feedparser.namespacesInUse
2627 return result
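# Typical top-level usage, echoing the docstring above (the URL is made up;
# pass etag/modified from a previous result to get conditional GET):
def _example_parse():
    d = parse('http://example.org/feed.xml')
    if d.get('status') == 304:
        return None  # unchanged since last fetch; see result['debug_message']
    return d['feed'].get('title'), len(d['entries']), d.get('bozo', 0)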
2629 if __name__ == '__main__':
2630 if not sys.argv[1:]:
2631 print __doc__
2632 sys.exit(0)
2633 else:
2634 urls = sys.argv[1:]
2635 zopeCompatibilityHack()
2636 from pprint import pprint
2637 for url in urls:
2638 print url
2639 print
2640 result = parse(url)
2641 pprint(result)
2642 print
2644 #REVISION HISTORY
2645 #1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
2646 # added Simon Fell's test suite
2647 #1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
2648 #2.0 - 10/19/2002
2649 # JD - use inchannel to watch out for image and textinput elements which can
2650 # also contain title, link, and description elements
2651 # JD - check for isPermaLink='false' attribute on guid elements
2652 # JD - replaced openAnything with open_resource supporting ETag and
2653 # If-Modified-Since request headers
2654 # JD - parse now accepts etag, modified, agent, and referrer optional
2655 # arguments
2656 # JD - modified parse to return a dictionary instead of a tuple so that any
2657 # etag or modified information can be returned and cached by the caller
2658 #2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
2659 # because of etag/modified, return the old etag/modified to the caller to
2660 # indicate why nothing is being returned
2661 #2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise it's
2662 # useless. Fixes the problem JD was addressing by adding it.
2663 #2.1 - 11/14/2002 - MAP - added gzip support
2664 #2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
2665 # start_admingeneratoragent is an example of how to handle elements with
2666 # only attributes, no content.
2667 #2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
2668 # also, make sure we send the User-Agent even if urllib2 isn't available.
2669 # Match any variation of backend.userland.com/rss namespace.
2670 #2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
2671 #2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
2672 # snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed
2673 # project name
2674 #2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
2675 # removed unnecessary urllib code -- urllib2 should always be available anyway;
2676 # return actual url, status, and full HTTP headers (as result['url'],
2677 # result['status'], and result['headers']) if parsing a remote feed over HTTP --
2678 # this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
2679 # added the latest namespace-of-the-week for RSS 2.0
2680 #2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
2681 # User-Agent (otherwise urllib2 sends two, which confuses some servers)
2682 #2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
2683 # inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
2684 #2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
2685 # textInput, and also to return the character encoding (if specified)
2686 #2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
2687 # nested divs within content (JohnD); fixed missing sys import (JohanS);
2688 # fixed regular expression to capture XML character encoding (Andrei);
2689 # added support for Atom 0.3-style links; fixed bug with textInput tracking;
2690 # added support for cloud (MartijnP); added support for multiple
2691 # category/dc:subject (MartijnP); normalize content model: 'description' gets
2692 # description (which can come from description, summary, or full content if no
2693 # description), 'content' gets dict of base/language/type/value (which can come
2694 # from content:encoded, xhtml:body, content, or fullitem);
2695 # fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
2696 # tracking; fixed bug tracking unknown tags; fixed bug tracking content when
2697 # <content> element is not in default namespace (like Pocketsoap feed);
2698 # resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
2699 # wfw:commentRSS; resolve relative URLs within embedded HTML markup in
2700 # description, xhtml:body, content, content:encoded, title, subtitle,
2701 # summary, info, tagline, and copyright; added support for pingback and
2702 # trackback namespaces
2703 #2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
2704 # namespaces, as opposed to 2.6 when I said I did but didn't really;
2705 # sanitize HTML markup within some elements; added mxTidy support (if
2706 # installed) to tidy HTML markup within some elements; fixed indentation
2707 # bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
2708 # (FazalM); universal date parsing and normalization (FazalM): 'created', 'modified',
2709 # 'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
2710 # 'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
2711 # and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
2712 #2.7.1 - 1/9/2004 - MAP - fixed bug handling &quot; and &apos;. fixed memory
2713 # leak not closing url opener (JohnD); added dc:publisher support (MarekK);
2714 # added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
2715 #2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in
2716 # encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
2717 # fixed relative URI processing for guid (skadz); added ICBM support; added
2718 # base64 support
2719 #2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
2720 # blogspot.com sites); added _debug variable
2721 #2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
2722 #3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
2723 # added several new supported namespaces; fixed bug tracking naked markup in
2724 # description; added support for enclosure; added support for source; re-added
2725 # support for cloud which got dropped somehow; added support for expirationDate
2726 #3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
2727 # xml:base URI, one for documents that don't define one explicitly and one for
2728 # documents that define an outer and an inner xml:base that goes out of scope
2729 # before the end of the document
2730 #3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
2731 #3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version']
2732 # will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
2733 # added support for creativeCommons:license and cc:license; added support for
2734 # full Atom content model in title, tagline, info, copyright, summary; fixed bug
2735 # with gzip encoding (not always telling server we support it when we do)
2736 #3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
2737 # (dictionary of 'name', 'url', 'email'); map author to author_detail if author
2738 # contains name + email address
2739 #3.0b8 - 1/28/2004 - MAP - added support for contributor
2740 #3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
2741 # support for summary
2742 #3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
2743 # xml.util.iso8601
2744 #3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
2745 # dangerous markup; fiddled with decodeEntities (not right); liberalized
2746 # date parsing even further
2747 #3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
2748 # added support to Atom 0.2 subtitle; added support for Atom content model
2749 # in copyright; better sanitizing of dangerous HTML elements with end tags
2750 # (script, frameset)
2751 #3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
2752 # etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
2753 #3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
2754 # Python 2.1
2755 #3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
2756 # fixed bug capturing author and contributor URL; fixed bug resolving relative
2757 # links in author and contributor URL; fixed bug resolving relative links in
2758 # generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
2759 # namespace tests, and included them permanently in the test suite with his
2760 # permission; fixed namespace handling under Python 2.1
2761 #3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
2762 #3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
2763 #3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
2764 # use libxml2 (if available)
2765 #3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
2766 # name was in parentheses; removed ultra-problematic mxTidy support; patch to
2767 # workaround crash in PyXML/expat when encountering invalid entities
2768 # (MarkMoraes); support for textinput/textInput
2769 #3.0b20 - 4/7/2004 - MAP - added CDF support
2770 #3.0b21 - 4/14/2004 - MAP - added Hot RSS support
2771 #3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
2772 # results dict; changed results dict to allow getting values with results.key
2773 # as well as results[key]; work around embedded illformed HTML with half
2774 # a DOCTYPE; work around malformed Content-Type header; if character encoding
2775 # is wrong, try several common ones before falling back to regexes (if this
2776 # works, bozo_exception is set to CharacterEncodingOverride); fixed character
2777 # encoding issues in BaseHTMLProcessor by tracking encoding and converting
2778 # from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
2779 # convert each value in results to Unicode (if possible), even if using
2780 # regex-based parsing
2781 #3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain
2782 # high-bit characters in attributes in embedded HTML in description (thanks
2783 # Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in
2784 # FeedParserDict; tweaked FeedParserDict.has_key to return True if asking
2785 # about a mapped key
2786 #3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and
2787 # results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could
2788 # cause the same encoding to be tried twice (even if it failed the first time);
2789 # fixed DOCTYPE stripping when DOCTYPE contained entity declarations;
2790 # better textinput and image tracking in illformed RSS 1.0 feeds
2791 #3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed
2792 # my blink tag tests
2793 #3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that
2794 # failed to parse utf-16 encoded feeds; made source into a FeedParserDict;
2795 # duplicate admin:generatorAgent/@rdf:resource in generator_detail.url;
2796 # added support for image; refactored parse() fallback logic to try other
2797 # encodings if SAX parsing fails (previously it would only try other encodings
2798 # if re-encoding failed); remove unichr madness in normalize_attrs now that
2799 # we're properly tracking encoding in and out of BaseHTMLProcessor; set
2800 # feed.language from root-level xml:lang; set entry.id from rdf:about;
2801 # send Accept header
2802 #3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between
2803 # iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are
2804 # windows-1252); fixed regression that could cause the same encoding to be
2805 # tried twice (even if it failed the first time)
2806 #3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types;
2807 # recover from malformed content-type header parameter with no equals sign
2808 # ('text/xml; charset:iso-8859-1')
2809 #3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities
2810 # to Unicode equivalents in illformed feeds (aaronsw); added and
2811 # passed tests for converting character entities to Unicode equivalents
2812 # in illformed feeds (aaronsw); test for valid parsers when setting
2813 # XML_AVAILABLE; make version and encoding available when server returns
2814 # a 304; add handlers parameter to pass arbitrary urllib2 handlers (like
2815 # digest auth or proxy support); add code to parse username/password
2816 # out of url and send as basic authentication; expose downloading-related
2817 # exceptions in bozo_exception (aaronsw); added __contains__ method to
2818 # FeedParserDict (aaronsw); added publisher_detail (aaronsw)
2819 #3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always
2820 # convert feed to UTF-8 before passing to XML parser; completely revamped
2821 # logic for determining character encoding and attempting XML parsing
2822 # (much faster); increased default timeout to 20 seconds; test for presence
2823 # of Location header on redirects; added tests for many alternate character
2824 # encodings; support various EBCDIC encodings; support UTF-16BE and
2825 # UTF-16LE with or without a BOM; support UTF-8 with a BOM; support
2826 # UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no
2827 # XML parsers are available; added support for 'Content-encoding: deflate';
2828 # send blank 'Accept-encoding: ' header if neither gzip nor zlib modules
2829 # are available
2830 #3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure
2831 # problem tracking xml:base and xml:lang if element declares it, child
2832 # doesn't, first grandchild redeclares it, and second grandchild doesn't;
2833 # refactored date parsing; defined public registerDateHandler so callers
2834 # can add support for additional date formats at runtime; added support
2835 # for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added
2836 # zopeCompatibilityHack() which turns FeedParserDict into a regular
2837 # dictionary, required for Zope compatibility, and also makes command-
2838 # line debugging easier because pprint module formats real dictionaries
2839 # better than dictionary-like objects; added NonXMLContentType exception,
2840 # which is stored in bozo_exception when a feed is served with a non-XML
2841 # media type such as 'text/plain'; respect Content-Language as default
2842 # language if no xml:lang is present; cloud dict is now FeedParserDict;
2843 # generator dict is now FeedParserDict; better tracking of xml:lang,
2844 # including support for xml:lang='' to unset the current language;
2845 # recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default
2846 # namespace; don't overwrite final status on redirects (scenarios:
2847 # redirecting to a URL that returns 304, redirecting to a URL that
2848 # redirects to another URL with a different type of redirect); add
2849 # support for HTTP 303 redirects
2850 #4.0 - MAP - support for relative URIs in xml:base attribute; fixed
2851 # encoding issue with mxTidy (phopkins); preliminary support for RFC 3229;
2852 # support for Atom 1.0; support for iTunes extensions; new 'tags' for
2853 # categories/keywords/etc. as array of dict
2854 # {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0
2855 # terminology; parse RFC 822-style dates with no time; lots of other
2856 # bug fixes
2857 #4.1 - MAP - removed socket timeout; added support for chardet library