# -*- coding: utf-8 -*-
#
# gPodder - A media aggregator and podcast client
# Copyright (c) 2005-2009 Thomas Perl and the gPodder Team
#
# gPodder is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# gPodder is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Generic feed fetching module for aggregators
# Thomas Perl <thpinfo.com>; 2009-06-11
def patch_feedparser():
    """Monkey-patch the Universal Feed Parser

    Applies three independent fixes to the module-level ``feedparser``:

    * map the short content types 'text'/'plain', 'html' and 'xhtml'
      to full MIME types (only if the installed feedparser still
      leaves 'plain' unmapped),
    * collect Media RSS <media:content> attributes in a
      'media_content' list on the current parsing context,
    * make sure '*/*' is part of the Accept header sent by feedparser.
    """
    # Detect the 'plain' content type as 'text/plain'
    # http://code.google.com/p/feedparser/issues/detail?id=80
    def mapContentType2(self, contentType):
        contentType = contentType.lower()
        if contentType in ('text', 'plain'):
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        # Anything else is passed through (lower-cased)
        return contentType

    # NOTE(review): the guards around the two private-API patches were
    # lost in the reviewed copy of this file and have been restored as
    # best-effort handlers -- confirm against upstream.
    try:
        # Probe the installed implementation; patch only if it is broken
        if feedparser._FeedParserMixin().mapContentType('plain') == 'plain':
            feedparser._FeedParserMixin.mapContentType = mapContentType2
    except Exception:
        # _FeedParserMixin is private API; leave feedparser unpatched
        pass

    # Fix parsing of Media RSS with feedparser, as described here:
    #   http://code.google.com/p/feedparser/issues/detail?id=100#c4
    def _start_media_content(self, attrsD):
        context = self._getContext()
        context.setdefault('media_content', [])
        context['media_content'].append(attrsD)

    try:
        feedparser._FeedParserMixin._start_media_content = _start_media_content
    except Exception:
        pass

    # Fix problem with the EA.com official podcast
    # https://bugs.gpodder.org/show_bug.cgi?id=588
    if '*/*' not in feedparser.ACCEPT_HEADER.split(','):
        feedparser.ACCEPT_HEADER += ',*/*'
class ExceptionWithData(Exception):
    """Base exception with additional payload

    The payload passed to the constructor is kept in the ``data``
    attribute and shown by str() together with the name of the
    concrete exception class.
    """
    def __init__(self, data):
        Exception.__init__(self)
        self.data = data

    def __str__(self):
        return '%s: %s' % (self.__class__.__name__, str(self.data))
class Offline(Exception):
    """The feed could not be fetched (no usable response)."""


class BadRequest(Exception):
    """The request was answered as a bad request."""


class InternalServerError(Exception):
    """The server reported an internal server error."""


class WifiLogin(ExceptionWithData):
    """A redirect to an HTML page was received instead of a feed.

    The payload is the URL that was ended up at (the result's href).
    """


class Unsubscribe(Exception):
    """Access is forbidden or the resource is gone."""


class NotFound(Exception):
    """The requested resource was not found."""


class InvalidFeed(Exception):
    """The result is missing, has no status code or is no known feed type."""


class UnknownStatusCode(ExceptionWithData):
    """The status code is not handled; the payload is that status code."""


# Authentication error
class AuthenticationRequired(Exception):
    """The server requires authentication for this feed."""


# Successful parsing of the feed
class UpdatedFeed(ExceptionWithData):
    """The feed was fetched and parsed; the payload is the parsed feed."""


class NewLocation(ExceptionWithData):
    """The feed lives at a different URL; the payload is that URL."""


class NotModified(ExceptionWithData):
    """The feed has not changed; the payload is the parser result."""
class Fetcher(object):
    """Fetch a feed with feedparser and report the result as an exception

    The outcome of a fetch (updated feed, new location, not modified
    or one of the error conditions) is signalled by raising one of the
    exception classes of this module; see fetch().

    Subclasses can override _get_handlers() and _resolve_url() to
    customize the behaviour.
    """

    # Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
    # NOTE(review): only the first three entries survived in the reviewed
    # copy of this file; the two generic XML types were restored -- confirm.
    FEED_TYPES = ('application/rss+xml',
                  'application/atom+xml',
                  'application/rdf+xml',
                  'application/xml',
                  'text/xml')

    def __init__(self, user_agent):
        """Remember the User-Agent string that is passed to feedparser"""
        self.user_agent = user_agent

    def _get_handlers(self):
        """Provide additional urllib2 handler objects

        Subclasses can override this method to inject urllib2
        handler objects into the feedparser.parse() call to
        extend the functionality of this Fetcher (for proxies, ..)
        """
        return []

    def _resolve_url(self, url):
        """Provide additional ways of resolving an URL

        Subclasses can override this method to provide more
        ways of resolving a given URL to a feed URL. If the
        Fetcher is in "autodiscovery" mode, it will try this
        method as a last resort for coming up with a feed URL.

        The default implementation resolves nothing (returns None).
        """
        return None

    def _autodiscover_feed(self, feed):
        """Look for the real feed behind a result that is not a feed

        Raises NewLocation with the URL of the first candidate that
        parses as an updated feed. Returns normally if no candidate
        works, so that the caller can go on with its regular checks.
        """
        # NOTE(review): the try/except lines of this method were lost in
        # the reviewed copy of this file; the structure below is a
        # reconstruction -- confirm against upstream.
        try:
            # First, try all <link> elements if available
            for link in feed.feed.get('links', ()):
                is_feed = link.get('type', '') in self.FEED_TYPES
                is_alternate = link.get('rel', '') == 'alternate'
                url = link.get('href', None)

                if url and is_feed and is_alternate:
                    try:
                        # autodiscovery=False: never recurse from here
                        self._parse_feed(url, None, None, False)
                    except UpdatedFeed:
                        # Found a feed - converted to NewLocation below
                        raise
                    except Exception:
                        # Every other outcome just disqualifies this link
                        pass

            # Second, try to resolve the URL
            url = self._resolve_url(feed.href)
            if url:
                self._parse_feed(url, None, None, False)
        except UpdatedFeed as updated:
            raise NewLocation(updated.data.href)
        except Exception:
            # Autodiscovery is best-effort; the caller reports the
            # problem with the original result
            pass

    def _check_offline(self, feed):
        """Raise Offline if the result carries no 'headers' attribute"""
        if not hasattr(feed, 'headers'):
            raise Offline()

    def _check_wifi_login_page(self, feed):
        """Raise WifiLogin for a 302 redirect to an HTML, non-feed page"""
        html_page = 'text/html' in feed.headers.get('content-type', '')
        if not feed.version and feed.status == 302 and html_page:
            raise WifiLogin(feed.href)

    def _check_valid_feed(self, feed):
        """Raise InvalidFeed if the result cannot be used as a feed

        A result without detected feed version is still accepted for
        the status codes 304 and 401, which are reported later on by
        _check_statuscode().
        """
        if feed is None:
            raise InvalidFeed('feed is None')

        if not hasattr(feed, 'status'):
            raise InvalidFeed('feed has no status code')

        if not feed.version and feed.status != 304 and feed.status != 401:
            raise InvalidFeed('unknown feed type')

    def _normalize_status(self, status):
        """Map a status code onto one of the codes handled by this class

        Codes that are handled directly are returned unchanged, other
        codes fall back to the generic code of their class (2xx -> 200,
        3xx -> 302, 4xx -> 400, 5xx -> 500). Codes outside of the range
        200..599 are returned unchanged.
        """
        # Based on Mark Pilgrim's "Atom aggregator behaviour" article
        if status in (200, 301, 302, 304, 400, 401, 403, 404, 410, 500):
            return status
        elif 200 <= status < 300:
            return 200
        elif 300 <= status < 400:
            return 302
        elif 400 <= status < 500:
            return 400
        elif 500 <= status < 600:
            return 500
        else:
            return status

    def _check_statuscode(self, feed):
        """Raise the exception that corresponds to the feed's status code

        This method never returns normally.
        """
        status = self._normalize_status(feed.status)
        if status == 200:
            raise UpdatedFeed(feed)
        elif status == 301:
            raise NewLocation(feed.href)
        elif status == 302:
            raise UpdatedFeed(feed)
        elif status == 304:
            raise NotModified(feed)
        elif status == 400:
            raise BadRequest('bad request')
        elif status == 401:
            raise AuthenticationRequired('authentication required')
        elif status == 403:
            raise Unsubscribe('forbidden')
        elif status == 404:
            raise NotFound('not found')
        elif status == 410:
            raise Unsubscribe('resource is gone')
        elif status == 500:
            raise InternalServerError('internal server error')
        else:
            raise UnknownStatusCode(status)

    def _parse_feed(self, url, etag, modified, autodiscovery=True):
        """Parse the feed and raise the result."""
        feed = feedparser.parse(url,
                agent=self.user_agent,
                modified=modified,
                etag=etag,
                handlers=self._get_handlers())

        self._check_offline(feed)
        self._check_wifi_login_page(feed)

        # Not a feed (and not just "not modified"): look for the real one
        if feed.status != 304 and not feed.version and autodiscovery:
            self._autodiscover_feed(feed)

        self._check_valid_feed(feed)
        self._check_statuscode(feed)

    def fetch(self, url, etag=None, modified=None):
        """Download a feed, with optional etag and modified values

        This method will always raise an exception that tells
        the calling code the result of the fetch operation. See
        the code for the feedcore module for all the possible
        exception types.
        """
        self._parse_feed(url, etag, modified)