rename/move filters, validate_doc_update
[mygpo.git] / mygpo / data / feedcore.py
blobb8cdb563c00e6db0ffdfb192009ceb7abe7f2e4b
1 # -*- coding: utf-8 -*-
3 # gPodder - A media aggregator and podcast client
4 # Copyright (c) 2005-2009 Thomas Perl and the gPodder Team
6 # gPodder is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 3 of the License, or
9 # (at your option) any later version.
11 # gPodder is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 # Generic feed fetching module for aggregators
22 # Thomas Perl <thpinfo.com>; 2009-06-11
25 import urllib
26 import urlparse
27 import urllib2
29 import feedparser
def patch_feedparser():
    """Monkey-patch the Universal Feed Parser.

    Applies three fixes to the globally-imported feedparser module:

    1. Detect the bare 'plain'/'text' content types as 'text/plain'
       (http://code.google.com/p/feedparser/issues/detail?id=80)
    2. Collect Media RSS <media:content> attributes into the context's
       'media_content' list
       (http://code.google.com/p/feedparser/issues/detail?id=100#c4)
    3. Add '*/*' to the Accept header, fixing the EA.com official podcast
       (https://bugs.gpodder.org/show_bug.cgi?id=588)
    """
    def mapContentType2(self, contentType):
        # Detect the 'plain' content type as 'text/plain'
        contentType = contentType.lower()
        if contentType in ('text', 'plain'):
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        return contentType

    try:
        # Only install the patch if the stock parser is still broken
        if feedparser._FeedParserMixin().mapContentType('plain') == 'plain':
            feedparser._FeedParserMixin.mapContentType = mapContentType2
    except Exception:
        # Best-effort: feedparser internals may differ between versions.
        # (Was a bare "except:"; narrowed so Ctrl-C etc. still propagate.)
        pass

    def _start_media_content(self, attrsD):
        # Fix parsing of Media RSS with feedparser
        context = self._getContext()
        context.setdefault('media_content', [])
        context['media_content'].append(attrsD)

    try:
        feedparser._FeedParserMixin._start_media_content = _start_media_content
    except Exception:
        # Narrowed from a bare "except:" -- see note above
        pass

    # Fix problem with the EA.com official podcast
    # https://bugs.gpodder.org/show_bug.cgi?id=588
    if '*/*' not in feedparser.ACCEPT_HEADER.split(','):
        feedparser.ACCEPT_HEADER += ',*/*'

patch_feedparser()
class ExceptionWithData(Exception):
    """Base exception with additional payload"""

    def __init__(self, data):
        Exception.__init__(self)
        # Payload carried along with the exception (feed object, URL, ...)
        self.data = data

    def __str__(self):
        # Render as "<ClassName>: <payload>" for readable logging
        name = self.__class__.__name__
        payload = str(self.data)
        return '%s: %s' % (name, payload)
# Temporary errors

class Offline(Exception):
    """The fetch produced no HTTP headers (we appear to be offline)."""


class BadRequest(Exception):
    """The server reported a 4xx-class client error."""


class InternalServerError(Exception):
    """The server reported a 5xx-class error."""


class WifiLogin(ExceptionWithData):
    """A captive-portal login page was detected; data is its URL."""


# Fatal errors

class Unsubscribe(Exception):
    """The feed is forbidden or gone and should not be retried."""


class NotFound(Exception):
    """The feed URL returned HTTP 404."""


class InvalidFeed(Exception):
    """The fetched document is not a recognizable feed."""


class UnknownStatusCode(ExceptionWithData):
    """An unexpected HTTP status code; data is the status code."""


# Authentication error

class AuthenticationRequired(Exception):
    """The feed requires credentials (HTTP 401)."""


# Successful parsing of the feed

class UpdatedFeed(ExceptionWithData):
    """The feed was fetched and parsed; data is the parsed feed."""


class NewLocation(ExceptionWithData):
    """The feed moved permanently; data is the new URL."""


class NotModified(ExceptionWithData):
    """The feed is unchanged (HTTP 304); data is the parse result."""
class Fetcher(object):
    """Fetch and parse a feed URL, reporting the outcome via exceptions.

    The fetch() entry point always raises: UpdatedFeed, NewLocation or
    NotModified on (qualified) success, or one of the error exceptions
    defined in this module otherwise.
    """

    # Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
    FEED_TYPES = ('application/rss+xml',
                  'application/atom+xml',
                  'application/rdf+xml',
                  'application/xml',
                  'text/xml')

    def __init__(self, user_agent):
        # User-Agent string passed to feedparser for every request
        self.user_agent = user_agent

    def _get_handlers(self):
        """Provide additional urllib2 handler objects

        Subclasses can override this method to inject urllib2
        handler objects into the feedparser.parse() call to
        extend the functionality of this Fetcher (for proxies, ..)
        """
        return []

    def _resolve_url(self, url):
        """Provide additional ways of resolving an URL

        Subclasses can override this method to provide more
        ways of resolving a given URL to a feed URL. If the
        Fetcher is in "autodiscovery" mode, it will try this
        method as a last resort for coming up with a feed URL.
        """
        return None

    def _autodiscover_feed(self, feed):
        """Try to discover a real feed URL for a non-feed document.

        Best-effort: raises NewLocation with the discovered feed's URL
        if a candidate parses successfully, otherwise returns silently.
        """
        try:
            # First, try all <link> elements if available
            for link in feed.feed.get('links', ()):
                is_feed = link.get('type', '') in self.FEED_TYPES
                is_alternate = link.get('rel', '') == 'alternate'
                url = link.get('href', None)

                if url and is_feed and is_alternate:
                    try:
                        self._parse_feed(url, None, None, False)
                    except UpdatedFeed:
                        # A candidate worked -- handled by the outer clause
                        raise
                    except Exception:
                        # This candidate failed; try the next <link>
                        pass

            # Second, try to resolve the URL
            url = self._resolve_url(feed.href)
            if url:
                self._parse_feed(url, None, None, False)
        except UpdatedFeed as updated:
            # Report the discovered feed's URL as the feed's new location
            raise NewLocation(updated.data.href)
        except Exception:
            # Autodiscovery is best-effort; fall back to the normal checks
            pass

    def _check_offline(self, feed):
        """Raise Offline when the fetch produced no HTTP headers."""
        if not hasattr(feed, 'headers'):
            raise Offline()

    def _check_wifi_login_page(self, feed):
        """Raise WifiLogin when a captive portal served an HTML redirect."""
        html_page = 'text/html' in feed.headers.get('content-type', '')
        if not feed.version and feed.status == 302 and html_page:
            raise WifiLogin(feed.href)

    def _check_valid_feed(self, feed):
        """Raise InvalidFeed unless the result looks like a parseable feed."""
        if feed is None:
            raise InvalidFeed('feed is None')

        if not hasattr(feed, 'status'):
            raise InvalidFeed('feed has no status code')

        # 304 (not modified) and 401 (auth required) carry no feed body,
        # so a missing feed version is acceptable for those status codes
        if not feed.version and feed.status != 304 and feed.status != 401:
            raise InvalidFeed('unknown feed type')

    def _normalize_status(self, status):
        """Map any HTTP status code to a canonical, handled one.

        Based on Mark Pilgrim's "Atom aggregator behaviour" article.
        Explicitly handled codes pass through unchanged; other codes
        collapse to a representative of their class (2xx->200,
        3xx->302, 4xx->400, 5xx->500); anything else is returned as-is.
        """
        if status in (200, 301, 302, 304, 400, 401, 403, 404, 410, 500):
            return status
        elif 200 <= status < 300:
            return 200
        elif 300 <= status < 400:
            return 302
        elif 400 <= status < 500:
            return 400
        elif 500 <= status < 600:
            return 500
        else:
            return status

    def _check_statuscode(self, feed):
        """Translate the feed's HTTP status into the matching exception."""
        status = self._normalize_status(feed.status)
        if status == 200:
            raise UpdatedFeed(feed)
        elif status == 301:
            raise NewLocation(feed.href)
        elif status == 302:
            # Temporary redirect: treat like success, keep the old URL
            raise UpdatedFeed(feed)
        elif status == 304:
            raise NotModified(feed)
        elif status == 400:
            raise BadRequest('bad request')
        elif status == 401:
            raise AuthenticationRequired('authentication required')
        elif status == 403:
            raise Unsubscribe('forbidden')
        elif status == 404:
            raise NotFound('not found')
        elif status == 410:
            raise Unsubscribe('resource is gone')
        elif status == 500:
            raise InternalServerError('internal server error')
        else:
            raise UnknownStatusCode(status)

    def _parse_feed(self, url, etag, modified, autodiscovery=True):
        """Parse the feed and raise the result."""
        feed = feedparser.parse(url,
                agent=self.user_agent,
                modified=modified,
                etag=etag,
                handlers=self._get_handlers())

        self._check_offline(feed)
        self._check_wifi_login_page(feed)

        # If this doesn't look like a feed, try to discover the real one
        if feed.status != 304 and not feed.version and autodiscovery:
            self._autodiscover_feed(feed)

        self._check_valid_feed(feed)
        self._check_statuscode(feed)

    def fetch(self, url, etag=None, modified=None):
        """Download a feed, with optional etag and modified values

        This method will always raise an exception that tells
        the calling code the result of the fetch operation. See
        the code for the feedcore module for all the possible
        exception types.
        """
        self._parse_feed(url, etag, modified)