commit to CouchDB only when necessary
[mygpo.git] / mygpo / feedcore.py
blob866553e5abb4f277df1ad6f80536308ecc124fe9
1 # -*- coding: utf-8 -*-
3 # gPodder - A media aggregator and podcast client
4 # Copyright (c) 2005-2009 Thomas Perl and the gPodder Team
6 # gPodder is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 3 of the License, or
9 # (at your option) any later version.
11 # gPodder is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 # Generic feed fetching module for aggregators
22 # Thomas Perl <thpinfo.com>; 2009-06-11
25 import feedparser
27 import urllib
28 import urlparse
29 import urllib2
def patch_feedparser():
    """Monkey-patch the Universal Feed Parser

    Applies three independent, best-effort fixes to the imported
    feedparser module:

    * map the bare content types 'text', 'plain', 'html' and 'xhtml'
      to their full MIME types,
    * collect the attributes of Media RSS <media:content> elements in
      a 'media_content' list of the current context,
    * append '*/*' to feedparser.ACCEPT_HEADER if it is missing.

    Each fix is skipped silently if the installed feedparser does not
    expose the attribute that would be patched.
    """
    # Detect the 'plain' content type as 'text/plain'
    # http://code.google.com/p/feedparser/issues/detail?id=80
    def mapContentType2(self, contentType):
        contentType = contentType.lower()
        if contentType == 'text' or contentType == 'plain':
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        return contentType

    try:
        # Only replace the method if 'plain' is still returned unchanged
        if feedparser._FeedParserMixin().mapContentType('plain') == 'plain':
            feedparser._FeedParserMixin.mapContentType = mapContentType2
    except Exception:
        # Best-effort: leave feedparser untouched if it looks different.
        # "except Exception" (not a bare except) so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        pass

    # Fix parsing of Media RSS with feedparser, as described here:
    # http://code.google.com/p/feedparser/issues/detail?id=100#c4
    def _start_media_content(self, attrsD):
        context = self._getContext()
        context.setdefault('media_content', [])
        context['media_content'].append(attrsD)

    try:
        feedparser._FeedParserMixin._start_media_content = _start_media_content
    except Exception:
        pass

    # Fix problem with the EA.com official podcast
    # https://bugs.gpodder.org/show_bug.cgi?id=588
    # Guarded like the two patches above, so that a feedparser without
    # ACCEPT_HEADER does not make importing this module fail.
    accept_header = getattr(feedparser, 'ACCEPT_HEADER', None)
    if accept_header is not None and '*/*' not in accept_header.split(','):
        feedparser.ACCEPT_HEADER = accept_header + ',*/*'
# Apply the feedparser patches once, when this module is imported
patch_feedparser()
class ExceptionWithData(Exception):
    """Base exception with additional payload

    The payload passed to the constructor is stored in the ``data``
    attribute and is included in the string representation as
    "<ClassName>: <data>".
    """
    def __init__(self, data):
        # Hand the payload to the base class as well: this populates
        # ``args``, which copy/pickle use to re-create the exception.
        # With empty ``args`` the re-creation would call the constructor
        # without the required ``data`` argument and fail.
        Exception.__init__(self, data)
        self.data = data

    def __str__(self):
        return '%s: %s' % (self.__class__.__name__, str(self.data))
# Temporary errors
class Offline(Exception):
    """The parsed result has no 'headers' attribute."""


class BadRequest(Exception):
    """The normalized HTTP status is 400."""


class InternalServerError(Exception):
    """The normalized HTTP status is 500."""


class WifiLogin(ExceptionWithData):
    """A 302 redirect led to an HTML page that is not a feed.

    ``data`` is the ``href`` of the parsed result.
    """


# Fatal errors
class Unsubscribe(Exception):
    """The HTTP status is 403 (forbidden) or 410 (gone)."""


class NotFound(Exception):
    """The HTTP status is 404."""


class InvalidFeed(Exception):
    """The result is None, has no status code or is of unknown type."""


class UnknownStatusCode(ExceptionWithData):
    """The status code is not handled; ``data`` is that status code."""


# Authentication error
class AuthenticationRequired(Exception):
    """The HTTP status is 401."""


# Successful parsing of the feed
class UpdatedFeed(ExceptionWithData):
    """The feed was parsed (status 200 or 302); ``data`` is the result."""


class NewLocation(ExceptionWithData):
    """The feed lives at another URL (status 301 or autodiscovery).

    ``data`` is the new URL.
    """


class NotModified(ExceptionWithData):
    """The HTTP status is 304; ``data`` is the parsed result."""
class Fetcher(object):
    """Fetch a feed with feedparser and report the outcome by raising.

    The result of a fetch is never returned: fetch() raises one of
    the exception classes of this module, either a "success"
    exception carrying data (UpdatedFeed, NotModified, NewLocation)
    or an error exception (Offline, NotFound, InvalidFeed, ...).
    """

    # Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
    FEED_TYPES = ('application/rss+xml',
                  'application/atom+xml',
                  'application/rdf+xml',
                  'application/xml',
                  'text/xml')

    def __init__(self, user_agent):
        """Store the value passed to feedparser.parse() as ``agent``."""
        self.user_agent = user_agent

    def _get_handlers(self):
        """Provide additional urllib2 handler objects

        Subclasses can override this method to inject urllib2
        handler objects into the feedparser.parse() call to
        extend the functionality of this Fetcher (for proxies, ..)
        """
        return []

    def _resolve_url(self, url):
        """Provide additional ways of resolving an URL

        Subclasses can override this method to provide more
        ways of resolving a given URL to a feed URL. If the
        Fetcher is in "autodiscovery" mode, it will try this
        method as a last resort for coming up with a feed URL.
        """
        return None

    def _autodiscover_feed(self, feed):
        """Look for a working feed URL behind a non-feed response.

        Candidates are the <link rel="alternate"> elements with a
        type listed in FEED_TYPES, followed by the result of
        _resolve_url(). Raises NewLocation with the URL of the first
        candidate that parses as an updated feed; returns None when
        no candidate works (all other errors are ignored).
        """
        try:
            # First, try all <link> elements if available
            for link in feed.feed.get('links', ()):
                is_feed = link.get('type', '') in self.FEED_TYPES
                is_alternate = link.get('rel', '') == 'alternate'
                url = link.get('href', None)

                if url and is_feed and is_alternate:
                    try:
                        # Autodiscovery is disabled here to avoid recursion
                        self._parse_feed(url, None, None, False)
                    except UpdatedFeed:
                        # A working feed: let the outer handler convert it
                        raise
                    except Exception:
                        # This candidate failed, continue with the next
                        pass

            # Second, try to resolve the URL
            url = self._resolve_url(feed.href)
            if url:
                self._parse_feed(url, None, None, False)
        except UpdatedFeed as updated:
            raise NewLocation(updated.data.href)
        except Exception:
            # Best-effort only; the caller goes on with its own checks
            pass

    def _check_offline(self, feed):
        """Raise Offline if the result has no 'headers' attribute."""
        if not hasattr(feed, 'headers'):
            raise Offline()

    def _check_wifi_login_page(self, feed):
        """Raise WifiLogin for a 302 redirect to a non-feed HTML page."""
        html_page = 'text/html' in feed.headers.get('content-type', '')
        # The status may be missing at this point (it is only validated
        # later in _check_valid_feed), so do not access it directly
        status = getattr(feed, 'status', None)
        if not feed.version and status == 302 and html_page:
            raise WifiLogin(feed.href)

    def _check_valid_feed(self, feed):
        """Raise InvalidFeed if the result cannot be used as a feed.

        A result without a detected feed version is only accepted for
        the status codes 304 and 401.
        """
        if feed is None:
            raise InvalidFeed('feed is None')

        if not hasattr(feed, 'status'):
            raise InvalidFeed('feed has no status code')

        if not feed.version and feed.status != 304 and feed.status != 401:
            raise InvalidFeed('unknown feed type')

    def _normalize_status(self, status):
        """Map a HTTP status code to one of the explicitly handled codes.

        Explicitly handled codes are returned unchanged; any other
        2xx/3xx/4xx/5xx code is mapped to 200/302/400/500. Codes
        outside of 200..599 are returned unchanged.
        """
        # Based on Mark Pilgrim's "Atom aggregator behaviour" article
        if status in (200, 301, 302, 304, 400, 401, 403, 404, 410, 500):
            return status
        elif status >= 200 and status < 300:
            return 200
        elif status >= 300 and status < 400:
            return 302
        elif status >= 400 and status < 500:
            return 400
        elif status >= 500 and status < 600:
            return 500
        else:
            return status

    def _check_statuscode(self, feed):
        """Raise the exception matching the (normalized) status code.

        This method never returns normally.
        """
        status = self._normalize_status(feed.status)
        if status == 200:
            raise UpdatedFeed(feed)
        elif status == 301:
            raise NewLocation(feed.href)
        elif status == 302:
            raise UpdatedFeed(feed)
        elif status == 304:
            raise NotModified(feed)
        elif status == 400:
            raise BadRequest('bad request')
        elif status == 401:
            raise AuthenticationRequired('authentication required')
        elif status == 403:
            raise Unsubscribe('forbidden')
        elif status == 404:
            raise NotFound('not found')
        elif status == 410:
            raise Unsubscribe('resource is gone')
        elif status == 500:
            raise InternalServerError('internal server error')
        else:
            raise UnknownStatusCode(status)

    def _parse_feed(self, url, etag, modified, autodiscovery=True):
        """Parse the feed and raise the result.

        url, etag and modified are passed on to feedparser.parse().
        If autodiscovery is true and the response is not a feed,
        _autodiscover_feed() is tried before the result is validated.
        """
        feed = feedparser.parse(url,
                agent=self.user_agent,
                modified=modified,
                etag=etag,
                handlers=self._get_handlers())

        self._check_offline(feed)
        self._check_wifi_login_page(feed)

        # A missing status must not fail with an AttributeError here;
        # _check_valid_feed() below reports it as InvalidFeed
        status = getattr(feed, 'status', None)
        if status != 304 and not feed.version and autodiscovery:
            self._autodiscover_feed(feed)

        self._check_valid_feed(feed)
        self._check_statuscode(feed)

    def fetch(self, url, etag=None, modified=None):
        """Download a feed, with optional etag and modified values

        This method will always raise an exception that tells
        the calling code the result of the fetch operation. See
        the code for the feedcore module for all the possible
        exception types.
        """
        self._parse_feed(url, etag, modified)