Make Wifi auth page detection a bit more liberal
[gpodder.git] / src / gpodder / cache.py
# -*- coding: utf-8 -*-
#
# python-feedcache (customized by Thomas Perl for use in gPodder)
#
# Copyright 2007 Doug Hellmann.
#
# All Rights Reserved
#
# Permission to use, copy, modify, and distribute this software and
# its documentation for any purpose and without fee is hereby
# granted, provided that the above copyright notice appear in all
# copies and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of Doug
# Hellmann not be used in advertising or publicity pertaining to
# distribution of the software without specific, written prior
# permission.
#
# DOUG HELLMANN DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
# INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN
# NO EVENT SHALL DOUG HELLMANN BE LIABLE FOR ANY SPECIAL, INDIRECT OR
# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

import feedparser

import re
import time
import urllib
import urlparse
import urllib2

import gpodder
from gpodder import resolver
from gpodder.liblogger import log

_ = gpodder.gettext

def patch_feedparser():
    """Fix a bug in feedparser 4.1

    This replaces the mapContentType method of the
    _FeedParserMixin class to correctly detect the
    "plain" content type as "text/plain".

    See also:
    http://code.google.com/p/feedparser/issues/detail?id=80

    Added by Thomas Perl for gPodder 2007-12-29
    """
    def mapContentType2(self, contentType):
        contentType = contentType.lower()
        if contentType == 'text' or contentType == 'plain':
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        return contentType

    try:
        if feedparser._FeedParserMixin().mapContentType('plain') == 'plain':
            log('Patching feedparser module... (mapContentType bugfix)')
            feedparser._FeedParserMixin.mapContentType = mapContentType2
    except:
        log('Warning: feedparser unpatched - might be broken!')

patch_feedparser()
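
# A quick sanity check of the patched behavior (an illustrative doctest-style
# sketch, not executed at import time; the output is the same whether the
# patch was applied or feedparser already ships the fix):
#
#   >>> feedparser._FeedParserMixin().mapContentType('plain')
#   'text/plain'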


class Cache:
    """A class to wrap Mark Pilgrim's Universal Feed Parser module
    (http://www.feedparser.org) so that parameters can be used to
    cache the feed results locally instead of fetching the feed every
    time it is requested. Uses both etag and modified times for
    caching.
    """

    # Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
    SUPPORTED_FEED_TYPES = ('application/rss+xml', 'application/atom+xml',
            'application/rdf+xml', 'application/xml', 'text/xml')

    def __init__(self, timeToLiveSeconds=3600):
        """
        Arguments:

        timeToLiveSeconds=3600 -- The length of time content should
        live in the cache before an update is attempted.
        """
        self.time_to_live = timeToLiveSeconds
        self.user_agent = gpodder.user_agent
        return

    def fetch(self, url, old_channel=None, use_proxies=False,
            http_proxy=None, ftp_proxy=None):
        """
        Returns an (updated, feed) tuple for the feed at the specified
        URL. If the feed hasn't been updated since the last run, updated
        will be False. If it has been updated, updated will be True.

        If updated is False, the feed value is None and you have to use
        the old channel which you passed to this function.

        If use_proxies is set to True, the cache generates a ProxyHandler
        from the http_proxy and ftp_proxy variables.
        """
        if old_channel is not None:
            etag = old_channel.etag
            modified = feedparser._parse_date(old_channel.last_modified)
        else:
            etag = None
            modified = None
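
        # feedparser uses these values for a conditional GET: the etag is sent
        # as If-None-Match and the modified timestamp as If-Modified-Since, so
        # an unchanged feed comes back as HTTP 304 without a body.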

        original_url = url

        # If we have a username or password, rebuild the URL with them included
        # Note: using a HTTPBasicAuthHandler would be a pain because we need to
        # know the realm. It can be done, but I think this method will work fine
        if old_channel is not None and (
                old_channel.username or old_channel.password):
            username = urllib.quote(old_channel.username)
            password = urllib.quote(old_channel.password)
            auth_string = ':'.join((username, password))
            url_parts = list(urlparse.urlsplit(url))
            url_parts[1] = '@'.join((auth_string, url_parts[1]))
            url = urlparse.urlunsplit(url_parts)
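            # For illustration (hypothetical values, not real credentials):
            # 'http://example.com/feed.xml' with username 'alice' and password
            # 's3cret' becomes 'http://alice:s3cret@example.com/feed.xml'.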

        handlers = []
        if use_proxies:
            # Add a ProxyHandler for fetching data via a proxy server
            proxies = {}
            if http_proxy:
                proxies['http'] = http_proxy
                log('Using proxy for HTTP: %s', http_proxy, sender=self)
            if ftp_proxy:
                proxies['ftp'] = ftp_proxy
                log('Using proxy for FTP: %s', ftp_proxy, sender=self)
            handlers.append(urllib2.ProxyHandler(proxies))
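            # The proxies dict maps URL schemes to proxy URLs, for example
            # (hypothetical): {'http': 'http://proxy.example.org:8080'}.
            # urllib2.ProxyHandler routes matching requests through that proxy.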

        # We know we need to fetch, so go ahead and do it.
        parsed_result = feedparser.parse(url,
                agent=self.user_agent,
                modified=modified,
                etag=etag,
                handlers=handlers)

        # Sometimes, the status code is not set (ugly feed?)
        status = parsed_result.get('status', None)

        # 304: Not Modified
        if status == 304:
            log('Not Modified: %s', url, sender=self)
            return (False, None)

        if status == 401:
            log('HTTP authentication required: %s', original_url, sender=self)
            return (False, parsed_result)

        if not hasattr(parsed_result, 'headers'):
            log('The requested object does not have a "headers" attribute.', sender=self)
            return (False, None)

        content_type = parsed_result.headers.get('content-type', '').lower()
        # TODO: Also detect OPML feeds and other content types here
        if parsed_result.version == '':
            log('%s looks like a webpage - trying feed autodiscovery.', url, sender=self)

            if not hasattr(parsed_result.feed, 'links'):
                return (False, None)
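
            # Feed autodiscovery looks for HTML link elements such as this
            # (a hypothetical example):
            #   <link rel="alternate" type="application/rss+xml"
            #         href="http://example.com/feed.xml">
            # feedparser exposes these as entries in parsed_result.feed.links.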
            try:
                found_alternate_feed = False
                for link in parsed_result.feed.links:
                    if hasattr(link, 'type') and hasattr(link, 'href') and hasattr(link, 'rel'):
                        if link.type in self.SUPPORTED_FEED_TYPES and link.rel == 'alternate':
                            log('Found alternate feed link: %s', link.href, sender=self)
                            parsed_result = feedparser.parse(link.href,
                                    agent=self.user_agent,
                                    modified=modified,
                                    etag=etag)
                            found_alternate_feed = True
                            break

                # YouTube etc feed lookup (after the normal link lookup in case
                # they provide a standard feed discovery mechanism in the future).
                if not found_alternate_feed:
                    next_url = resolver.get_real_channel_url(url)

                    if next_url is not None:
                        parsed_result = feedparser.parse(next_url, agent=self.user_agent, modified=modified, etag=etag)
                        found_alternate_feed = True

                # We have not found a valid feed - abort here!
                if not found_alternate_feed:
                    return (False, None)
            except:
                log('Error while trying to get feed URL from webpage', sender=self, traceback=True)

        updated = False
        status = parsed_result.get('status', None)

        if status == 304:
            # No new data, based on the etag or modified values.
            # We need to update the modified time in the
            # storage, though, so we know that what we have
            # stored is up to date.
            log('Using cached feed: %s', url, sender=self)
        elif status in (200, 301, 302, 307):
            # log('===============')
            # log('[%s]', url)
            # log('LM old: %s', old_channel.last_modified)
            # log('LM new: %s', parsed_result.headers.get('last-modified'))
            # log('=======')
            # log('ET old: %s', old_channel.etag)
            # log('ET new: %s', parsed_result.headers.get('etag'))
            # log('===============')
            updated = True
            # There is new content, so store it unless there was an error.
            # Store it regardless of errors when we don't have anything yet
            error = parsed_result.get('bozo_exception')

            # Detect HTTP authentication pages (e.g. Wifi hotspot login pages
            # that redirect the request and serve HTML instead of the feed)
            if isinstance(error, feedparser.NonXMLContentType) and \
                    status == 302 and hasattr(parsed_result, 'headers') and \
                    parsed_result.headers.get('content-type', '').startswith('text/html'):
                log('Warning: Looks like a Wifi authentication page: %s', url, sender=self)
                log('Acting as if the feed was not updated (FIXME!)', sender=self)
                return (True, None)

            if error:
                log('Warning: %s (%s)', url, str(error), sender=self)
                parsed_result['bozo_exception'] = str(error)
        else:
            log('Strange status code: %s (%s)', url, status, sender=self)

        return (updated, parsed_result)
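

if __name__ == '__main__':
    # Minimal usage sketch (the feed URL is a placeholder, not a real feed;
    # without an old_channel, no etag/modified values are sent):
    cache = Cache(timeToLiveSeconds=3600)
    updated, result = cache.fetch('http://example.com/feed.xml')
    if updated and result is not None:
        print 'Fetched feed with %d entries' % len(result.entries)
    else:
        print 'Feed not updated or not available'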