1 # -*- coding: utf-8 -*-
3 # python-feedcache (customized by Thomas Perl for use in gPodder)
5 # Copyright 2007 Doug Hellmann.
10 # Permission to use, copy, modify, and distribute this software and
11 # its documentation for any purpose and without fee is hereby
12 # granted, provided that the above copyright notice appear in all
13 # copies and that both that copyright notice and this permission
14 # notice appear in supporting documentation, and that the name of Doug
15 # Hellmann not be used in advertising or publicity pertaining to
16 # distribution of the software without specific, written prior
19 # DOUG HELLMANN DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
20 # INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN
21 # NO EVENT SHALL DOUG HELLMANN BE LIABLE FOR ANY SPECIAL, INDIRECT OR
22 # CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
23 # OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
24 # NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
25 # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35 from gpodder
import resolver
36 from gpodder
.liblogger
import log
def patch_feedparser():
    """Work around a content-type mapping bug in feedparser 4.1.

    Replaces the mapContentType method of feedparser's
    _FeedParserMixin class so that the short "plain" content type is
    mapped to "text/plain" (next to "text", "html" and "xhtml").
    The replacement is only installed when the stock method is seen
    to hand "plain" back unchanged.

    See also:
    http://code.google.com/p/feedparser/issues/detail?id=80

    Added by Thomas Perl for gPodder 2007-12-29
    """
    def mapContentType2(self, contentType):
        """Return the full MIME type for a short content type name."""
        contentType = contentType.lower()
        if contentType in ('text', 'plain'):
            return 'text/plain'
        if contentType == 'html':
            return 'text/html'
        if contentType == 'xhtml':
            return 'application/xhtml+xml'
        # Anything else is passed through (lower-cased) unchanged.
        return contentType

    # NOTE(review): the guard around the probe was reconstructed from a
    # garbled listing - confirm against the upstream file.
    try:
        # Probe the stock implementation; patch only if it shows the bug.
        if feedparser._FeedParserMixin().mapContentType('plain') == 'plain':
            log('Patching feedparser module... (mapContentType bugfix)')
            feedparser._FeedParserMixin.mapContentType = mapContentType2
    except Exception:
        # Probing or patching failed; report it instead of breaking import.
        log('Warning: feedparser unpatched - might be broken!')
    """A class to wrap Mark Pilgrim's Universal Feed Parser module
    (http://www.feedparser.org) so that parameters can be used to
    cache the feed results locally instead of fetching the feed every
    time it is requested. Uses both etag and modified times for
    caching.
    """
# Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
# A link found during feed autodiscovery is only followed when its
# "type" is one of these MIME types.
SUPPORTED_FEED_TYPES = (
    'application/rss+xml',
    'application/atom+xml',
    'application/rdf+xml',
    'application/xml',
    'text/xml',
)
def __init__(self, timeToLiveSeconds=3600):
    """
    Arguments:

      timeToLiveSeconds=3600 -- The length of time content should
      live in the cache before an update is attempted. Stored as
      self.time_to_live.

    The user agent string is taken from gpodder.user_agent and kept
    as self.user_agent; fetch() passes it to feedparser as "agent".
    """
    self.time_to_live = timeToLiveSeconds
    self.user_agent = gpodder.user_agent
def fetch(self, url, old_channel=None):
    """Fetch and parse the feed at the specified URL.

    Arguments:

      url -- Address of the feed, or of a web page that links to one.

      old_channel -- Previously fetched channel, or None. When given,
      its etag and last_modified attributes are handed to feedparser
      so that an unchanged feed can be detected.

    Returns an (updated, feed) tuple. updated is True only when the
    request was answered with status 200, 301, 302 or 307; feed is
    then the freshly parsed result.

    If updated is False, keep using the old channel which you passed
    to this function. feed is None when the URL looked like a web
    page and no feed could be discovered for it; otherwise it is the
    parsed result of the request (status 304 or an unexpected status).
    """
    # NOTE(review): control flow reconstructed from a garbled listing
    # with missing lines - confirm against the upstream file.

    # Without an old channel there is nothing to validate against.
    etag = None
    modified = None
    if old_channel is not None:
        etag = old_channel.etag
        modified = feedparser._parse_date(old_channel.last_modified)

    # We know we need to fetch, so go ahead and do it.
    parsed_result = feedparser.parse(url,
                                     agent=self.user_agent,
                                     modified=modified,
                                     etag=etag)

    content_type = parsed_result.headers.get('content-type', '').lower()
    # TODO: Also detect OPML feeds and other content types here
    if parsed_result.version == '':
        log('%s looks like a webpage - trying feed autodiscovery (%s).', url, content_type, sender=self)
        if not hasattr(parsed_result.feed, 'links'):
            return (False, None)

        try:
            found_alternate_feed = False
            for link in parsed_result.feed.links:
                if not (hasattr(link, 'type') and hasattr(link, 'href')
                        and hasattr(link, 'rel')):
                    continue
                if link.type in self.SUPPORTED_FEED_TYPES and link.rel == 'alternate':
                    log('Found alternate feed link: %s', link.href, sender=self)
                    parsed_result = feedparser.parse(link.href,
                                                     agent=self.user_agent,
                                                     modified=modified,
                                                     etag=etag)
                    found_alternate_feed = True
                    # The first matching link wins.
                    break

            # YouTube etc feed lookup (after the normal link lookup in case
            # they provide a standard feed discovery mechanism in the future).
            if not found_alternate_feed:
                real_url = resolver.get_real_channel_url(url)

                # Only refetch when the resolver produced a different URL.
                if real_url and real_url != url:
                    parsed_result = feedparser.parse(real_url,
                                                     agent=self.user_agent,
                                                     modified=modified,
                                                     etag=etag)
                    found_alternate_feed = True

            # We have not found a valid feed - abort here!
            if not found_alternate_feed:
                return (False, None)
        except Exception:
            # Discovery is best-effort: log the failure and carry on with
            # whatever parsed_result currently holds.
            log('Error while trying to get feed URL from webpage', sender=self, traceback=True)

    updated = False
    status = parsed_result.get('status', None)

    if status == 304:
        # No new data, based on the etag or modified values.
        log('Using cached feed: %s', url, sender=self)
    elif status in (200, 301, 302, 307):
        updated = True

        # There is new content. A parse error does not discard it; the
        # exception object is replaced by its message.
        error = parsed_result.get('bozo_exception')
        if error:
            log('Warning: %s (%s)', url, str(error), sender=self)
            parsed_result['bozo_exception'] = str(error)
    else:
        log('Strange status code: %s (%s)', url, status, sender=self)

    return (updated, parsed_result)