rename/move filters, validate_doc_update
[mygpo.git] / mygpo / data / feedcore.py
blobb8cdb563c00e6db0ffdfb192009ceb7abe7f2e4b
1 # -*- coding: utf-8 -*-
3 # gPodder - A media aggregator and podcast client
4 # Copyright (c) 2005-2009 Thomas Perl and the gPodder Team
6 # gPodder is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 3 of the License, or
9 # (at your option) any later version.
11 # gPodder is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 # Generic feed fetching module for aggregators
22 # Thomas Perl <thpinfo.com>; 2009-06-11
25 import urllib
26 import urlparse
27 import urllib2
29 import feedparser
def patch_feedparser():
    """Monkey-patch the Universal Feed Parser.

    Applies three fixes to the globally-imported feedparser module:

    1. Detect the bare 'plain'/'text' content types as 'text/plain'
       (http://code.google.com/p/feedparser/issues/detail?id=80)
    2. Collect Media RSS <media:content> attributes into the context's
       'media_content' list
       (http://code.google.com/p/feedparser/issues/detail?id=100#c4)
    3. Add '*/*' to the Accept header, fixing the EA.com official podcast
       (https://bugs.gpodder.org/show_bug.cgi?id=588)
    """
    def mapContentType2(self, contentType):
        # Detect the 'plain' content type as 'text/plain'
        contentType = contentType.lower()
        if contentType in ('text', 'plain'):
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        return contentType

    try:
        # Only install the patch if the stock parser is still broken
        if feedparser._FeedParserMixin().mapContentType('plain') == 'plain':
            feedparser._FeedParserMixin.mapContentType = mapContentType2
    except Exception:
        # Best-effort: feedparser internals may differ between versions.
        # (Was a bare "except:"; narrowed so Ctrl-C etc. still propagate.)
        pass

    def _start_media_content(self, attrsD):
        # Fix parsing of Media RSS with feedparser
        context = self._getContext()
        context.setdefault('media_content', [])
        context['media_content'].append(attrsD)

    try:
        feedparser._FeedParserMixin._start_media_content = _start_media_content
    except Exception:
        # Narrowed from a bare "except:" -- see note above
        pass

    # Fix problem with the EA.com official podcast
    # https://bugs.gpodder.org/show_bug.cgi?id=588
    if '*/*' not in feedparser.ACCEPT_HEADER.split(','):
        feedparser.ACCEPT_HEADER += ',*/*'

patch_feedparser()
class ExceptionWithData(Exception):
    """Base exception with additional payload"""

    def __init__(self, data):
        Exception.__init__(self)
        # Payload carried along with the exception (feed object, URL, ...)
        self.data = data

    def __str__(self):
        # Render as "<ClassName>: <payload>" for readable logging
        name = self.__class__.__name__
        payload = str(self.data)
        return '%s: %s' % (name, payload)
# Temporary errors

class Offline(Exception):
    """The fetch produced no HTTP headers (we appear to be offline)."""


class BadRequest(Exception):
    """The server reported a 4xx-class client error."""


class InternalServerError(Exception):
    """The server reported a 5xx-class error."""


class WifiLogin(ExceptionWithData):
    """A captive-portal login page was detected; data is its URL."""


# Fatal errors

class Unsubscribe(Exception):
    """The feed is forbidden or gone and should not be retried."""


class NotFound(Exception):
    """The feed URL returned HTTP 404."""


class InvalidFeed(Exception):
    """The fetched document is not a recognizable feed."""


class UnknownStatusCode(ExceptionWithData):
    """An unexpected HTTP status code; data is the status code."""


# Authentication error

class AuthenticationRequired(Exception):
    """The feed requires credentials (HTTP 401)."""


# Successful parsing of the feed

class UpdatedFeed(ExceptionWithData):
    """The feed was fetched and parsed; data is the parsed feed."""


class NewLocation(ExceptionWithData):
    """The feed moved permanently; data is the new URL."""


class NotModified(ExceptionWithData):
    """The feed is unchanged (HTTP 304); data is the parse result."""
class Fetcher(object):
    """Fetch and parse a feed URL, reporting the outcome via exceptions.

    The fetch() entry point always raises: UpdatedFeed, NewLocation or
    NotModified on (qualified) success, or one of the error exceptions
    defined in this module otherwise.
    """

    # Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
    FEED_TYPES = ('application/rss+xml',
                  'application/atom+xml',
                  'application/rdf+xml',
                  'application/xml',
                  'text/xml')

    def __init__(self, user_agent):
        # User-Agent string passed to feedparser for every request
        self.user_agent = user_agent

    def _get_handlers(self):
        """Provide additional urllib2 handler objects

        Subclasses can override this method to inject urllib2
        handler objects into the feedparser.parse() call to
        extend the functionality of this Fetcher (for proxies, ..)
        """
        return []

    def _resolve_url(self, url):
        """Provide additional ways of resolving an URL

        Subclasses can override this method to provide more
        ways of resolving a given URL to a feed URL. If the
        Fetcher is in "autodiscovery" mode, it will try this
        method as a last resort for coming up with a feed URL.
        """
        return None

    def _autodiscover_feed(self, feed):
        """Try to discover a real feed URL for a non-feed document.

        Best-effort: raises NewLocation with the discovered feed's URL
        if a candidate parses successfully, otherwise returns silently.
        """
        try:
            # First, try all <link> elements if available
            for link in feed.feed.get('links', ()):
                is_feed = link.get('type', '') in self.FEED_TYPES
                is_alternate = link.get('rel', '') == 'alternate'
                url = link.get('href', None)

                if url and is_feed and is_alternate:
                    try:
                        self._parse_feed(url, None, None, False)
                    except UpdatedFeed:
                        # A candidate worked -- handled by the outer clause
                        raise
                    except Exception:
                        # This candidate failed; try the next <link>
                        pass

            # Second, try to resolve the URL
            url = self._resolve_url(feed.href)
            if url:
                self._parse_feed(url, None, None, False)
        except UpdatedFeed as updated:
            # Report the discovered feed's URL as the feed's new location
            raise NewLocation(updated.data.href)
        except Exception:
            # Autodiscovery is best-effort; fall back to the normal checks
            pass

    def _check_offline(self, feed):
        """Raise Offline when the fetch produced no HTTP headers."""
        if not hasattr(feed, 'headers'):
            raise Offline()

    def _check_wifi_login_page(self, feed):
        """Raise WifiLogin when a captive portal served an HTML redirect."""
        html_page = 'text/html' in feed.headers.get('content-type', '')
        if not feed.version and feed.status == 302 and html_page:
            raise WifiLogin(feed.href)

    def _check_valid_feed(self, feed):
        """Raise InvalidFeed unless the result looks like a parseable feed."""
        if feed is None:
            raise InvalidFeed('feed is None')

        if not hasattr(feed, 'status'):
            raise InvalidFeed('feed has no status code')

        # 304 (not modified) and 401 (auth required) carry no feed body,
        # so a missing feed version is acceptable for those status codes
        if not feed.version and feed.status != 304 and feed.status != 401:
            raise InvalidFeed('unknown feed type')

    def _normalize_status(self, status):
        """Map any HTTP status code to a canonical, handled one.

        Based on Mark Pilgrim's "Atom aggregator behaviour" article.
        Explicitly handled codes pass through unchanged; other codes
        collapse to a representative of their class (2xx->200,
        3xx->302, 4xx->400, 5xx->500); anything else is returned as-is.
        """
        if status in (200, 301, 302, 304, 400, 401, 403, 404, 410, 500):
            return status
        elif 200 <= status < 300:
            return 200
        elif 300 <= status < 400:
            return 302
        elif 400 <= status < 500:
            return 400
        elif 500 <= status < 600:
            return 500
        else:
            return status

    def _check_statuscode(self, feed):
        """Translate the feed's HTTP status into the matching exception."""
        status = self._normalize_status(feed.status)
        if status == 200:
            raise UpdatedFeed(feed)
        elif status == 301:
            raise NewLocation(feed.href)
        elif status == 302:
            # Temporary redirect: treat like success, keep the old URL
            raise UpdatedFeed(feed)
        elif status == 304:
            raise NotModified(feed)
        elif status == 400:
            raise BadRequest('bad request')
        elif status == 401:
            raise AuthenticationRequired('authentication required')
        elif status == 403:
            raise Unsubscribe('forbidden')
        elif status == 404:
            raise NotFound('not found')
        elif status == 410:
            raise Unsubscribe('resource is gone')
        elif status == 500:
            raise InternalServerError('internal server error')
        else:
            raise UnknownStatusCode(status)

    def _parse_feed(self, url, etag, modified, autodiscovery=True):
        """Parse the feed and raise the result."""
        feed = feedparser.parse(url,
                agent=self.user_agent,
                modified=modified,
                etag=etag,
                handlers=self._get_handlers())

        self._check_offline(feed)
        self._check_wifi_login_page(feed)

        # If this doesn't look like a feed, try to discover the real one
        if feed.status != 304 and not feed.version and autodiscovery:
            self._autodiscover_feed(feed)

        self._check_valid_feed(feed)
        self._check_statuscode(feed)

    def fetch(self, url, etag=None, modified=None):
        """Download a feed, with optional etag and modified values

        This method will always raise an exception that tells
        the calling code the result of the fetch operation. See
        the code for the feedcore module for all the possible
        exception types.
        """
        self._parse_feed(url, etag, modified)