# -*- coding: utf-8 -*-
#
# gPodder - A media aggregator and podcast client
# Copyright (c) 2005-2009 Thomas Perl and the gPodder Team
#
# gPodder is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# gPodder is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

#
# Generic feed fetching module for aggregators
# Thomas Perl <thpinfo.com>; 2009-06-11
#

def patch_feedparser():
    """Monkey-patch the Universal Feed Parser

    Three independent workarounds are applied to the ``feedparser``
    module object:

    * ``_FeedParserMixin.mapContentType`` is replaced, but only when the
      installed parser still maps 'plain' to 'plain'.
    * A ``_start_media_content`` handler is installed that collects the
      attributes of Media RSS content elements in the current context.
    * ``*/*`` is appended to ``feedparser.ACCEPT_HEADER`` if missing.

    NOTE(review): this needs a module-level ``import feedparser`` and a
    caller; neither is visible in this listing -- confirm both exist.
    """
    # Detect the 'plain' content type as 'text/plain'
    # http://code.google.com/p/feedparser/issues/detail?id=80
    def mapContentType2(self, contentType):
        """Expand short content type names to full MIME types."""
        short_names = {
            'text': 'text/plain',
            'plain': 'text/plain',
            'html': 'text/html',
            'xhtml': 'application/xhtml+xml',
        }
        contentType = contentType.lower()
        # Anything that is not a known short name is passed through
        return short_names.get(contentType, contentType)

    try:
        # Only patch parsers that still show the upstream bug
        if feedparser._FeedParserMixin().mapContentType('plain') == 'plain':
            feedparser._FeedParserMixin.mapContentType = mapContentType2
    except AttributeError:
        # _FeedParserMixin is private API; leave unknown versions untouched
        pass

    # Fix parsing of Media RSS with feedparser, as described here:
    #   http://code.google.com/p/feedparser/issues/detail?id=100#c4
    def _start_media_content(self, attrsD):
        """Append the element's attribute dict to context['media_content']."""
        context = self._getContext()
        context.setdefault('media_content', [])
        context['media_content'].append(attrsD)

    try:
        feedparser._FeedParserMixin._start_media_content = _start_media_content
    except AttributeError:
        # Same reasoning as above: private API may be absent
        pass

    # Fix problem with the EA.com official podcast
    # https://bugs.gpodder.org/show_bug.cgi?id=588
    if '*/*' not in feedparser.ACCEPT_HEADER.split(','):
        feedparser.ACCEPT_HEADER += ',*/*'


class ExceptionWithData(Exception):
    """Base exception with additional payload

    The payload passed to the constructor is available as ``self.data``.
    """

    def __init__(self, data):
        # Hand the payload to the base class as well, so that ``args`` is
        # populated and the exception can be copied/pickled (an exception
        # with a required argument but empty ``args`` cannot be rebuilt).
        Exception.__init__(self, data)
        self.data = data

    def __str__(self):
        """Return '<ClassName>: <payload>'."""
        return '%s: %s' % (self.__class__.__name__, str(self.data))


class Offline(Exception):
    """Signals an offline condition.

    NOTE(review): presumably raised by Fetcher._check_offline when the
    parse result has no 'headers' attribute -- confirm against the caller.
    """
class BadRequest(Exception):
    """Raised by Fetcher._check_statuscode for a 'bad request' response."""
class InternalServerError(Exception):
    """Raised by Fetcher._check_statuscode for an 'internal server error'."""
class WifiLogin(ExceptionWithData):
    """Raised when the response looks like a login page, not a feed.

    Fetcher._check_wifi_login_page raises this with the feed's ``href``
    as payload.
    """
class Unsubscribe(Exception):
    """Raised by Fetcher._check_statuscode ('forbidden', 'resource is gone')."""
class NotFound(Exception):
    """Raised by Fetcher._check_statuscode for a 'not found' response."""
class InvalidFeed(Exception):
    """Raised by Fetcher._check_valid_feed when the result is not a usable feed."""
class UnknownStatusCode(ExceptionWithData):
    """Raised by Fetcher._check_statuscode for an unhandled status.

    The payload is the (normalized) status code.
    """
# Authentication error
class AuthenticationRequired(Exception):
    """Raised by Fetcher._check_statuscode: 'authentication required'."""
# Successful parsing of the feed
class UpdatedFeed(ExceptionWithData):
    """Result: the feed was fetched; the payload is the parsed feed object."""
class NewLocation(ExceptionWithData):
    """Result: the feed lives at another URL; the payload is that URL (href)."""
class NotModified(ExceptionWithData):
    """Result: nothing changed; the payload is the parse result object."""


class Fetcher(object):
    """Download and parse a feed, reporting the outcome as an exception.

    fetch() never returns a value: the result of the operation is
    always communicated by raising one of this module's exceptions
    (UpdatedFeed, NotModified, NewLocation, InvalidFeed, ...).

    Subclasses can hook into networking via _get_handlers() and into
    feed URL discovery via _resolve_url().

    NOTE(review): _parse_feed() uses the module-level name ``feedparser``;
    its import is not visible in this listing -- confirm it exists.
    """

    # Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
    # NOTE(review): the last two entries were lost in the damaged listing
    # and are restored from context -- confirm against the upstream file.
    FEED_TYPES = ('application/rss+xml',
                  'application/atom+xml',
                  'application/rdf+xml',
                  'application/xml',
                  'text/xml')

    def __init__(self, user_agent):
        """Store the User-Agent string that is passed to feedparser."""
        self.user_agent = user_agent

    def _get_handlers(self):
        """Provide additional urllib2 handler objects

        Subclasses can override this method to inject urllib2
        handler objects into the feedparser.parse() call to
        extend the functionality of this Fetcher (for proxies, ..)
        """
        return []

    def _resolve_url(self, url):
        """Provide additional ways of resolving an URL

        Subclasses can override this method to provide more
        ways of resolving a given URL to a feed URL. If the
        Fetcher is in "autodiscovery" mode, it will try this
        method as a last resort for coming up with a feed URL.

        The default implementation resolves nothing (returns None).
        """
        return None

    def _autodiscover_feed(self, feed):
        """Try to find a real feed behind a non-feed response.

        Raises NewLocation with the URL of the first candidate that
        parses as an updated feed. Every other outcome (including
        errors) is deliberately ignored, so that the caller can go on
        validating the original response.
        """
        try:
            # First, try all <link> elements if available
            for link in feed.feed.get('links', ()):
                is_feed = link.get('type', '') in self.FEED_TYPES
                is_alternate = link.get('rel', '') == 'alternate'
                url = link.get('href', None)

                if url and is_feed and is_alternate:
                    try:
                        # _parse_feed() always raises its result
                        self._parse_feed(url, None, None, False)
                    except UpdatedFeed:
                        # Success: handled by the outer handler below
                        raise
                    except Exception:
                        # This candidate did not work, try the next one
                        pass

            # Second, try to resolve the URL
            url = self._resolve_url(feed.href)
            if url:
                self._parse_feed(url, None, None, False)
        except UpdatedFeed as updated:
            raise NewLocation(updated.data.href)
        except Exception:
            # Autodiscovery is best-effort only
            pass

    def _check_offline(self, feed):
        """Raise Offline if the parse result carries no 'headers'."""
        if not hasattr(feed, 'headers'):
            raise Offline()

    def _check_wifi_login_page(self, feed):
        """Raise WifiLogin for an HTML page delivered via a 302 redirect."""
        html_page = 'text/html' in feed.headers.get('content-type', '')
        # getattr: a missing status is reported later by _check_valid_feed
        status = getattr(feed, 'status', None)
        if not feed.version and status == 302 and html_page:
            raise WifiLogin(feed.href)

    def _check_valid_feed(self, feed):
        """Raise InvalidFeed unless the result is usable.

        A result without a detected feed version is only accepted for
        the status codes 304 and 401.
        """
        if feed is None:
            raise InvalidFeed('feed is None')

        if not hasattr(feed, 'status'):
            raise InvalidFeed('feed has no status code')

        if not feed.version and feed.status not in (304, 401):
            raise InvalidFeed('unknown feed type')

    def _normalize_status(self, status):
        """Map a status code onto one that _check_statuscode() handles.

        Known codes are returned as-is, other codes in the range
        200-599 are folded into a representative of their class and
        everything else is returned unchanged.
        """
        # Based on Mark Pilgrim's "Atom aggregator behaviour" article
        # NOTE(review): the fallback values were lost in the damaged
        # listing and are restored from context -- confirm 302 and 400.
        if status in (200, 301, 302, 304, 400, 401, 403, 404, 410, 500):
            return status
        elif 200 <= status < 300:
            return 200
        elif 300 <= status < 400:
            return 302
        elif 400 <= status < 500:
            return 400
        elif 500 <= status < 600:
            return 500
        else:
            return status

    def _check_statuscode(self, feed):
        """Raise the result exception that matches the feed's status."""
        status = self._normalize_status(feed.status)
        if status == 200:
            raise UpdatedFeed(feed)
        elif status == 301:
            raise NewLocation(feed.href)
        elif status == 302:
            raise UpdatedFeed(feed)
        elif status == 304:
            raise NotModified(feed)
        elif status == 400:
            raise BadRequest('bad request')
        elif status == 401:
            raise AuthenticationRequired('authentication required')
        elif status == 403:
            raise Unsubscribe('forbidden')
        elif status == 404:
            raise NotFound('not found')
        elif status == 410:
            raise Unsubscribe('resource is gone')
        elif status == 500:
            raise InternalServerError('internal server error')
        else:
            raise UnknownStatusCode(status)

    def _parse_feed(self, url, etag, modified, autodiscovery=True):
        """Parse the feed and raise the result."""
        feed = feedparser.parse(url,
                                agent=self.user_agent,
                                modified=modified,
                                etag=etag,
                                handlers=self._get_handlers())

        self._check_offline(feed)
        self._check_wifi_login_page(feed)

        # getattr: a result without status must reach _check_valid_feed()
        # (which reports it as InvalidFeed) instead of failing here.
        status = getattr(feed, 'status', None)
        if status != 304 and not feed.version and autodiscovery:
            self._autodiscover_feed(feed)

        self._check_valid_feed(feed)
        self._check_statuscode(feed)

    def fetch(self, url, etag=None, modified=None):
        """Download a feed, with optional etag and modified values

        This method will always raise an exception that tells
        the calling code the result of the fetch operation. See
        the code for the feedcore module for all the possible
        exception types.
        """
        self._parse_feed(url, etag, modified)