# -*- coding: utf-8 -*-
#
# python-feedcache (customized by Thomas Perl for use in gPodder)
#
# Copyright 2007 Doug Hellmann.
#
# All Rights Reserved
#
# Permission to use, copy, modify, and distribute this software and
# its documentation for any purpose and without fee is hereby
# granted, provided that the above copyright notice appear in all
# copies and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of Doug
# Hellmann not be used in advertising or publicity pertaining to
# distribution of the software without specific, written prior
# permission.
#
# DOUG HELLMANN DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
# INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN
# NO EVENT SHALL DOUG HELLMANN BE LIABLE FOR ANY SPECIAL, INDIRECT OR
# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
import feedparser

import re
import time
import gpodder

from gpodder import resolver
from gpodder.liblogger import log
def patch_feedparser():
    """Fix a bug in feedparser 4.1

    This replaces the mapContentType method of the
    _FeedParserMixin class to correctly detect the
    "plain" content type as "text/plain".

    See also:
    http://code.google.com/p/feedparser/issues/detail?id=80

    Added by Thomas Perl for gPodder 2007-12-29
    """
    def mapContentType2(self, contentType):
        contentType = contentType.lower()
        if contentType == 'text' or contentType == 'plain':
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        return contentType

    try:
        if feedparser._FeedParserMixin().mapContentType('plain') == 'plain':
            log('Patching feedparser module... (mapContentType bugfix)')
            feedparser._FeedParserMixin.mapContentType = mapContentType2
    except Exception:
        log('Warning: feedparser unpatched - might be broken!')

patch_feedparser()
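
# Illustrative check (a sketch, not part of the original module): with an
# unpatched feedparser 4.1, mapContentType('plain') returns 'plain'; after
# patch_feedparser() has run, the same call yields the full MIME type:
#
#     >>> feedparser._FeedParserMixin().mapContentType('plain')
#     'text/plain'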
class Cache:
    """A class to wrap Mark Pilgrim's Universal Feed Parser module
    (http://www.feedparser.org) so that parameters can be used to
    cache the feed results locally instead of fetching the feed every
    time it is requested. Uses both etag and modified times for
    caching.
    """

    # Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
    SUPPORTED_FEED_TYPES = ('application/rss+xml', 'application/atom+xml',
                            'application/rdf+xml', 'application/xml',
                            'text/xml')
    def __init__(self, timeToLiveSeconds=3600):
        """
        Arguments:

        timeToLiveSeconds=3600 -- The length of time content should
        live in the cache before an update is attempted.
        """
        self.time_to_live = timeToLiveSeconds
        self.user_agent = gpodder.user_agent
    def fetch(self, url, old_channel=None):
        """
        Returns an (updated, feed) tuple for the feed at the specified
        URL. If the feed hasn't updated since the last run, updated
        will be False. If it has been updated, updated will be True.

        If updated is False, the feed value is None and you have to use
        the old channel which you passed to this function.
        """
        if old_channel is not None:
            etag = old_channel.etag
            modified = feedparser._parse_date(old_channel.last_modified)
        else:
            etag = None
            modified = None

        # We know we need to fetch, so go ahead and do it.
        parsed_result = feedparser.parse(url,
                                         agent=self.user_agent,
                                         modified=modified,
                                         etag=etag)
        content_type = parsed_result.headers.get('content-type', '').lower()
        # TODO: Also detect OPML feeds and other content types here
        if parsed_result.version == '':
            log('%s looks like a webpage - trying feed autodiscovery (%s).', url, content_type, sender=self)
            if not hasattr(parsed_result.feed, 'links'):
                return (False, None)
            try:
                found_alternate_feed = False
                for link in parsed_result.feed.links:
                    if hasattr(link, 'type') and hasattr(link, 'href') and hasattr(link, 'rel'):
                        if link.type in self.SUPPORTED_FEED_TYPES and link.rel == 'alternate':
                            log('Found alternate feed link: %s', link.href, sender=self)
                            parsed_result = feedparser.parse(link.href,
                                                             agent=self.user_agent,
                                                             modified=modified,
                                                             etag=etag)
                            found_alternate_feed = True
                            break

                # YouTube etc. feed lookup (after the normal link lookup, in case
                # they provide a standard feed discovery mechanism in the future).
                if not found_alternate_feed:
                    next = resolver.get_real_channel_url(url)
                    if next is not None:
                        parsed_result = feedparser.parse(next, agent=self.user_agent, modified=modified, etag=etag)
                        found_alternate_feed = True

                # We have not found a valid feed - abort here!
                if not found_alternate_feed:
                    return (False, None)
            except Exception:
                log('Error while trying to get feed URL from webpage', sender=self, traceback=True)
        updated = False
        status = parsed_result.get('status', None)

        if status == 304:
            # No new data, based on the etag or modified values.
            # We need to update the modified time in the
            # storage, though, so we know that what we have
            # stored is up to date.
            log('Using cached feed: %s', url, sender=self)
        elif status in (200, 301, 302, 307):
            # log('===============')
            # log('[%s]', url)
            # log('LM old: %s', old_channel.last_modified)
            # log('LM new: %s', parsed_result.headers.get('last-modified'))
            # log('=======')
            # log('ET old: %s', old_channel.etag)
            # log('ET new: %s', parsed_result.headers.get('etag'))
            # log('===============')
            updated = True
            # There is new content, so store it unless there was an error.
            # Store it regardless of errors when we don't have anything yet
            error = parsed_result.get('bozo_exception')
            if error:
                log('Warning: %s (%s)', url, str(error), sender=self)
                parsed_result['bozo_exception'] = str(error)
        else:
            log('Strange status code: %s (%s)', url, status, sender=self)

        return (updated, parsed_result)
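
# Minimal usage sketch (illustrative only; the feed URL below is a
# placeholder, and old_channel is omitted on the first fetch, so no
# etag/If-Modified-Since headers are sent; subsequent fetches would pass a
# channel object exposing .etag and .last_modified as fetch() expects):
if __name__ == '__main__':
    cache = Cache(timeToLiveSeconds=3600)
    updated, feed = cache.fetch('http://example.com/podcast.xml')
    if updated and feed is not None:
        for entry in feed.entries:
            log('Found episode: %s', entry.get('title', '(no title)'))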