mygpo/feedcore.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # gPodder - A media aggregator and podcast client
   4 # Copyright (c) 2005-2009 Thomas Perl and the gPodder Team
   5 #
   6 # gPodder is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 3 of the License, or
   9 # (at your option) any later version.
  10 #
  11 # gPodder is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License
  17 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18 #
  19
  20 #
  21 # Generic feed fetching module for aggregators
  22 # Thomas Perl <thpinfo.com>; 2009-06-11
  23 #
  24
  25 import feedparser
  26
  27 import urllib
  28 import urlparse
  29 import urllib2
  30
  31 def patch_feedparser():
  32     """Monkey-patch the Universal Feed Parser"""
  33     # Detect the 'plain' content type as 'text/plain'
  34     # http://code.google.com/p/feedparser/issues/detail?id=80
  35     def mapContentType2(self, contentType):
  36         contentType = contentType.lower()
  37         if contentType == 'text' or contentType == 'plain':
  38             contentType = 'text/plain'
  39         elif contentType == 'html':
  40             contentType = 'text/html'
  41         elif contentType == 'xhtml':
  42             contentType = 'application/xhtml+xml'
  43         return contentType
  44
  45     try:
  46         if feedparser._FeedParserMixin().mapContentType('plain') == 'plain':
  47             feedparser._FeedParserMixin.mapContentType = mapContentType2
  48     except:
  49         pass
  50
  51     # Fix parsing of Media RSS with feedparser, as described here:
  52     #   http://code.google.com/p/feedparser/issues/detail?id=100#c4
  53     def _start_media_content(self, attrsD):
  54         context = self._getContext()
  55         context.setdefault('media_content', [])
  56         context['media_content'].append(attrsD)
  57
  58     try:
  59         feedparser._FeedParserMixin._start_media_content = _start_media_content
  60     except:
  61         pass
  62
  63     # Fix problem with the EA.com official podcast
  64     # https://bugs.gpodder.org/show_bug.cgi?id=588
  65     if '*/*' not in feedparser.ACCEPT_HEADER.split(','):
  66         feedparser.ACCEPT_HEADER += ',*/*'
  67
  68 patch_feedparser()
  69
  70
  71 class ExceptionWithData(Exception):
  72     """Base exception with additional payload"""
  73     def __init__(self, data):
  74         Exception.__init__(self)
  75         self.data = data
  76
  77     def __str__(self):
  78         return '%s: %s' % (self.__class__.__name__, str(self.data))
  79
  80
  81 # Temporary errors
  82 class Offline(Exception): pass
  83 class BadRequest(Exception): pass
  84 class InternalServerError(Exception): pass
  85 class WifiLogin(ExceptionWithData): pass
  86
  87 # Fatal errors
  88 class Unsubscribe(Exception): pass
  89 class NotFound(Exception): pass
  90 class InvalidFeed(Exception): pass
  91 class UnknownStatusCode(ExceptionWithData): pass
  92
  93 # Authentication error
  94 class AuthenticationRequired(Exception): pass
  95
  96 # Successful parsing of the feed
  97 class UpdatedFeed(ExceptionWithData): pass
  98 class NewLocation(ExceptionWithData): pass
  99 class NotModified(ExceptionWithData): pass
 100
 101
 102
 103 class Fetcher(object):
 104     # Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
 105     FEED_TYPES = ('application/rss+xml',
 106                   'application/atom+xml',
 107                   'application/rdf+xml',
 108                   'application/xml',
 109                   'text/xml')
 110
 111     def __init__(self, user_agent):
 112         self.user_agent = user_agent
 113
 114     def _get_handlers(self):
 115         """Provide additional urllib2 handler objects
 116
 117         Subclasses can override this method to inject urllib2
 118         handler objects into the feedparser.parse() call to
 119         extent the functionalty of this Fetcher (for proxies, ..)
 120         """
 121         return []
 122
 123     def _resolve_url(self, url):
 124         """Provide additional ways of resolving an URL
 125
 126         Subclasses can override this method to provide more
 127         ways of resolving a given URL to a feed URL. If the
 128         Fetcher is in "autodiscovery" mode, it will try this
 129         method as a last resort for coming up with a feed URL.
 130         """
 131         return None
 132
 133     def _autodiscover_feed(self, feed):
 134         try:
 135             # First, try all <link> elements if available
 136             for link in feed.feed.get('links', ()):
 137                 is_feed = link.get('type', '') in self.FEED_TYPES
 138                 is_alternate = link.get('rel', '') == 'alternate'
 139                 url = link.get('href', None)
 140
 141                 if url and is_feed and is_alternate:
 142                     try:
 143                         self._parse_feed(url, None, None, False)
 144                     except UpdatedFeed, updated:
 145                         raise
 146                     except Exception:
 147                         pass
 148
 149             # Second, try to resolve the URL
 150             url = self._resolve_url(feed.href)
 151             if url:
 152                 self._parse_feed(url, None, None, False)
 153         except UpdatedFeed, updated:
 154             raise NewLocation(updated.data.href)
 155         except Exception, e:
 156             pass
 157
 158     def _check_offline(self, feed):
 159         if not hasattr(feed, 'headers'):
 160             raise Offline()
 161
 162     def _check_wifi_login_page(self, feed):
 163         html_page = 'text/html' in feed.headers.get('content-type', '')
 164         if not feed.version and feed.status == 302 and html_page:
 165             raise WifiLogin(feed.href)
 166
 167     def _check_valid_feed(self, feed):
 168         if feed is None:
 169             raise InvalidFeed('feed is None')
 170
 171         if not hasattr(feed, 'status'):
 172             raise InvalidFeed('feed has no status code')
 173
 174         if not feed.version and feed.status != 304 and feed.status != 401:
 175             raise InvalidFeed('unknown feed type')
 176
 177     def _normalize_status(self, status):
 178         # Based on Mark Pilgrim's "Atom aggregator behaviour" article
 179         if status in (200, 301, 302, 304, 400, 401, 403, 404, 410, 500):
 180             return status
 181         elif status >= 200 and status < 300:
 182             return 200
 183         elif status >= 300 and status < 400:
 184             return 302
 185         elif status >= 400 and status < 500:
 186             return 400
 187         elif status >= 500 and status < 600:
 188             return 500
 189         else:
 190             return status
 191
 192     def _check_statuscode(self, feed):
 193         status = self._normalize_status(feed.status)
 194         if status == 200:
 195             raise UpdatedFeed(feed)
 196         elif status == 301:
 197             raise NewLocation(feed.href)
 198         elif status == 302:
 199             raise UpdatedFeed(feed)
 200         elif status == 304:
 201             raise NotModified(feed)
 202         elif status == 400:
 203             raise BadRequest('bad request')
 204         elif status == 401:
 205             raise AuthenticationRequired('authentication required')
 206         elif status == 403:
 207             raise Unsubscribe('forbidden')
 208         elif status == 404:
 209             raise NotFound('not found')
 210         elif status == 410:
 211             raise Unsubscribe('resource is gone')
 212         elif status == 500:
 213             raise InternalServerError('internal server error')
 214         else:
 215             raise UnknownStatusCode(status)
 216
 217     def _parse_feed(self, url, etag, modified, autodiscovery=True):
 218         """Parse the feed and raise the result."""
 219         feed = feedparser.parse(url,
 220                 agent=self.user_agent,
 221                 modified=modified,
 222                 etag=etag,
 223                 handlers=self._get_handlers())
 224
 225         self._check_offline(feed)
 226         self._check_wifi_login_page(feed)
 227
 228         if feed.status != 304 and not feed.version and autodiscovery:
 229             self._autodiscover_feed(feed)
 230
 231         self._check_valid_feed(feed)
 232         self._check_statuscode(feed)
 233
 234     def fetch(self, url, etag=None, modified=None):
 235         """Download a feed, with optional etag an modified values
 236
 237         This method will always raise an exception that tells
 238         the calling code the result of the fetch operation. See
 239         the code for the feedcore module for all the possible
 240         exception types.
 241         """
 242         self._parse_feed(url, etag, modified)
 243