1 # -*- coding: utf-8 -*-
3 # python-feedcache (customized by Thomas Perl for use in gPodder)
5 # Copyright 2007 Doug Hellmann.
10 # Permission to use, copy, modify, and distribute this software and
11 # its documentation for any purpose and without fee is hereby
12 # granted, provided that the above copyright notice appear in all
13 # copies and that both that copyright notice and this permission
14 # notice appear in supporting documentation, and that the name of Doug
15 # Hellmann not be used in advertising or publicity pertaining to
16 # distribution of the software without specific, written prior
19 # DOUG HELLMANN DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
20 # INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN
21 # NO EVENT SHALL DOUG HELLMANN BE LIABLE FOR ANY SPECIAL, INDIRECT OR
22 # CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
23 # OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
24 # NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
25 # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
34 from gpodder
.liblogger
import log
def patch_feedparser():
    """Fix a bug in feedparser 4.1

    This replaces the mapContentType method of the
    _FeedParserMixin class to correctly detect the
    "plain" content type as "text/plain".

    See also:
    http://code.google.com/p/feedparser/issues/detail?id=80

    Added by Thomas Perl for gPodder 2007-12-29

    The patch is only installed when the running feedparser still shows
    the faulty behaviour (i.e. it maps 'plain' to 'plain'). Any failure
    while probing or patching is logged and otherwise ignored, so a
    changed feedparser API never prevents the module from loading.
    """
    def mapContentType2(self, contentType):
        """Expand feedparser's short content type names to MIME types.

        Unknown values are returned lower-cased but otherwise unchanged.
        """
        contentType = contentType.lower()
        if contentType == 'text' or contentType == 'plain':
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        return contentType

    try:
        # Probe the installed version first: only patch when it is broken.
        if feedparser._FeedParserMixin().mapContentType('plain') == 'plain':
            log('Patching feedparser module... (mapContentType bugfix)')
            feedparser._FeedParserMixin.mapContentType = mapContentType2
    except Exception:
        # Best effort only - private feedparser internals may have changed.
        log('Warning: feedparser unpatched - might be broken!')
# NOTE(review): the class header line was missing from the reviewed
# extract; the name "Cache" follows upstream python-feedcache - confirm.
class Cache:
    """A class to wrap Mark Pilgrim's Universal Feed Parser module
    (http://www.feedparser.org) so that parameters can be used to
    cache the feed results locally instead of fetching the feed every
    time it is requested. Uses both etag and modified times for
    caching.
    """

    # Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
    SUPPORTED_FEED_TYPES = ('application/rss+xml', 'application/atom+xml',
                            'application/rdf+xml', 'application/xml', 'text/xml')

    def __init__(self, timeToLiveSeconds=3600):
        """
        Arguments:

          timeToLiveSeconds=3600 -- The length of time content should
          live in the cache before an update is attempted. The value
          is only stored on the instance (as time_to_live) here.
        """
        self.time_to_live = timeToLiveSeconds
        self.user_agent = gpodder.user_agent

    def fetch(self, url, old_channel=None):
        """
        Returns an (updated, feed) tuple for the feed at the specified
        URL. If the feed hasn't updated since the last run, updated
        will be False. If it has been updated, updated will be True.

        If updated is False you have to use the old channel which you
        passed to this function. The feed value is None when the URL
        turned out to be a webpage and no alternate feed link could be
        discovered; otherwise it is the feedparser result.

        Arguments:

          url -- The feed (or webpage) URL to download.

          old_channel=None -- Previously stored channel; its etag and
          last_modified attributes are used for a conditional request.
        """
        if old_channel is not None:
            etag = old_channel.etag
            modified = feedparser._parse_date(old_channel.last_modified)
        else:
            etag = None
            modified = None

        # We know we need to fetch, so go ahead and do it.
        parsed_result = feedparser.parse(url,
                                         agent=self.user_agent,
                                         modified=modified,
                                         etag=etag)

        content_type = parsed_result.headers.get('content-type', '').lower()
        # TODO: Also detect OPML feeds and other content types here
        if parsed_result.version == '':
            log('%s looks like a webpage - trying feed autodiscovery (%s).', url, content_type, sender=self)
            if not hasattr(parsed_result.feed, 'links'):
                return (False, None)
            try:
                found_alternate_feed = False
                for link in parsed_result.feed.links:
                    if hasattr(link, 'type') and hasattr(link, 'href') and hasattr(link, 'rel'):
                        if link.type in self.SUPPORTED_FEED_TYPES and link.rel == 'alternate':
                            log('Found alternate feed link: %s', link.href, sender=self)
                            parsed_result = feedparser.parse(link.href,
                                                             agent=self.user_agent,
                                                             modified=modified,
                                                             etag=etag)
                            found_alternate_feed = True
                            # The first usable alternate link wins.
                            break

                # We have not found a valid feed - abort here!
                if not found_alternate_feed:
                    return (False, None)
            except Exception:
                # Autodiscovery is best effort; continue with whatever
                # result we currently hold.
                log('Error while trying to get feed URL from webpage', sender=self, traceback=True)

        updated = False
        status = parsed_result.get('status', None)

        if status == 304:
            # No new data, based on the etag or modified values.
            # We need to update the modified time in the
            # storage, though, so we know that what we have
            # stored is up to date.
            log('Using cached feed: %s', url, sender=self)
        elif status in (200, 301, 302, 307):
            updated = True
            # There is new content, so store it unless there was an error.
            # Store it regardless of errors when we don't have anything yet
            error = parsed_result.get('bozo_exception')
            if error:
                log('Warning: %s (%s)', url, str(error), sender=self)
                # Keep only the message text of the exception object.
                parsed_result['bozo_exception'] = str(error)
        else:
            log('Strange status code: %s (%s)', url, status, sender=self)

        return (updated, parsed_result)