Correct HTML feed auto-detection (bug 215)
[gpodder.git] / src / gpodder / cache.py
blob70c881bb3c0848bd18edc77a4f1e78f90bb0d986
1 # -*- coding: utf-8 -*-
2 #
3 # python-feedcache (customized by Thomas Perl for use in gPodder)
5 # Copyright 2007 Doug Hellmann.
8 # All Rights Reserved
10 # Permission to use, copy, modify, and distribute this software and
11 # its documentation for any purpose and without fee is hereby
12 # granted, provided that the above copyright notice appear in all
13 # copies and that both that copyright notice and this permission
14 # notice appear in supporting documentation, and that the name of Doug
15 # Hellmann not be used in advertising or publicity pertaining to
16 # distribution of the software without specific, written prior
17 # permission.
19 # DOUG HELLMANN DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
20 # INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN
21 # NO EVENT SHALL DOUG HELLMANN BE LIABLE FOR ANY SPECIAL, INDIRECT OR
22 # CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
23 # OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
24 # NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
25 # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
29 import feedparser
31 import time
32 import gpodder
34 from gpodder.liblogger import log
def patch_feedparser():
    """Fix a bug in feedparser 4.1.

    This replaces the mapContentType method of the
    _FeedParserMixin class to correctly detect the
    "plain" content type as "text/plain".

    See also:
    http://code.google.com/p/feedparser/issues/detail?id=80

    Added by Thomas Perl for gPodder 2007-12-29
    """
    def mapContentType2(self, contentType):
        # Normalize short-hand content type names to full MIME types
        contentType = contentType.lower()
        if contentType in ('text', 'plain'):
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        return contentType

    try:
        # Only monkey-patch when the installed feedparser actually
        # exhibits the bug (i.e. 'plain' is returned unmapped).
        if feedparser._FeedParserMixin().mapContentType('plain') == 'plain':
            log('Patching feedparser module... (mapContentType bugfix)')
            feedparser._FeedParserMixin.mapContentType = mapContentType2
    except Exception:
        # Narrowed from a bare "except:" so that SystemExit and
        # KeyboardInterrupt are not silently swallowed at import time.
        log('Warning: feedparser unpatched - might be broken!')

patch_feedparser()
class Cache:
    """A class to wrap Mark Pilgrim's Universal Feed Parser module
    (http://www.feedparser.org) so that parameters can be used to
    cache the feed results locally instead of fetching the feed every
    time it is requested. Uses both etag and modified times for
    caching.
    """

    # Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
    SUPPORTED_FEED_TYPES = ('application/rss+xml', 'application/atom+xml',
                            'application/rdf+xml', 'application/xml',
                            'text/xml')

    def __init__(self, timeToLiveSeconds=3600):
        """
        Arguments:

          timeToLiveSeconds=3600 -- The length of time content should
          live in the cache before an update is attempted.
        """
        # NOTE(review): the old docstring documented a "storage"
        # argument that does not exist and a default of 300; both
        # have been corrected to match the actual signature.
        self.time_to_live = timeToLiveSeconds
        self.user_agent = gpodder.user_agent

    def fetch(self, url, old_channel=None):
        """
        Returns an (updated, feed) tuple for the feed at the specified
        URL. If the feed hasn't updated since the last run, updated
        will be False. If it has been updated, updated will be True.

        If updated is False, the feed value is None and you have to use
        the old channel which you passed to this function.
        """
        if old_channel is not None:
            # Reuse the conditional-GET data from the previous fetch
            etag = old_channel.etag
            modified = feedparser._parse_date(old_channel.last_modified)
        else:
            etag = None
            modified = None

        # We know we need to fetch, so go ahead and do it.
        parsed_result = feedparser.parse(url,
                                         agent=self.user_agent,
                                         modified=modified,
                                         etag=etag)

        # "headers" is only present for HTTP(S) fetches, so guard the
        # lookup instead of assuming the attribute exists.
        content_type = parsed_result.get('headers', {}).get('content-type', '').lower()
        # TODO: Also detect OPML feeds and other content types here
        if parsed_result.version == '':
            # Not a recognized feed format - probably an HTML page;
            # try feed autodiscovery via <link rel="alternate"> tags.
            log('%s looks like a webpage - trying feed autodiscovery (%s).', url, content_type, sender=self)
            if not hasattr(parsed_result.feed, 'links'):
                return (False, None)
            try:
                found_alternate_feed = False
                for link in parsed_result.feed.links:
                    if hasattr(link, 'type') and hasattr(link, 'href') and hasattr(link, 'rel'):
                        if link.type in self.SUPPORTED_FEED_TYPES and link.rel == 'alternate':
                            log('Found alternate feed link: %s', link.href, sender=self)
                            parsed_result = feedparser.parse(link.href,
                                                             agent=self.user_agent,
                                                             modified=modified,
                                                             etag=etag)
                            found_alternate_feed = True
                            break

                # We have not found a valid feed - abort here!
                if not found_alternate_feed:
                    return (False, None)
            except Exception:
                # Narrowed from a bare "except:"; autodiscovery stays
                # best-effort and falls through to the status handling.
                log('Error while trying to get feed URL from webpage', sender=self, traceback=True)

        updated = False
        status = parsed_result.get('status', None)

        if status == 304:
            # No new data, based on the etag or modified values.
            # We need to update the modified time in the
            # storage, though, so we know that what we have
            # stored is up to date.
            log('Using cached feed: %s', url, sender=self)
        elif status in (200, 301, 302, 307):
            updated = True
            # There is new content, so store it unless there was an error.
            # Store it regardless of errors when we don't have anything yet
            error = parsed_result.get('bozo_exception')
            if error:
                log('Warning: %s (%s)', url, str(error), sender=self)
                # Store a string representation - exception objects
                # may not be picklable/serializable by the caller.
                parsed_result['bozo_exception'] = str(error)
        else:
            log('Strange status code: %s (%s)', url, status, sender=self)

        return (updated, parsed_result)