Make Wifi auth page detection a bit more liberal
[gpodder.git] / src / gpodder / cache.py
# -*- coding: utf-8 -*-
#
# python-feedcache (customized by Thomas Perl for use in gPodder)
#
# Copyright 2007 Doug Hellmann.
#
# All Rights Reserved
#
# Permission to use, copy, modify, and distribute this software and
# its documentation for any purpose and without fee is hereby
# granted, provided that the above copyright notice appear in all
# copies and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of Doug
# Hellmann not be used in advertising or publicity pertaining to
# distribution of the software without specific, written prior
# permission.
#
# DOUG HELLMANN DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
# INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN
# NO EVENT SHALL DOUG HELLMANN BE LIABLE FOR ANY SPECIAL, INDIRECT OR
# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

import feedparser

import re
import time
import urllib
import urlparse
import urllib2

import gpodder
from gpodder import resolver
from gpodder.liblogger import log

_ = gpodder.gettext

def patch_feedparser():
    """Fix a bug in feedparser 4.1

    This replaces the mapContentType method of the
    _FeedParserMixin class to correctly detect the
    "plain" content type as "text/plain".

    See also:
    http://code.google.com/p/feedparser/issues/detail?id=80

    Added by Thomas Perl for gPodder 2007-12-29
    """
    def mapContentType2(self, contentType):
        contentType = contentType.lower()
        if contentType == 'text' or contentType == 'plain':
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        return contentType

    try:
        if feedparser._FeedParserMixin().mapContentType('plain') == 'plain':
            log('Patching feedparser module... (mapContentType bugfix)')
            feedparser._FeedParserMixin.mapContentType = mapContentType2
    except:
        log('Warning: feedparser unpatched - might be broken!')

patch_feedparser()
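
# A quick sanity check of the patched behavior (an illustrative doctest-style
# sketch, not executed at import time; the output is the same whether the
# patch was applied or feedparser already ships the fix):
#
#   >>> feedparser._FeedParserMixin().mapContentType('plain')
#   'text/plain'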


class Cache:
    """A class to wrap Mark Pilgrim's Universal Feed Parser module
    (http://www.feedparser.org) so that parameters can be used to
    cache the feed results locally instead of fetching the feed every
    time it is requested. Uses both etag and modified times for
    caching.
    """

    # Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
    SUPPORTED_FEED_TYPES = ('application/rss+xml', 'application/atom+xml',
            'application/rdf+xml', 'application/xml', 'text/xml')

    def __init__(self, timeToLiveSeconds=3600):
        """
        Arguments:

        timeToLiveSeconds=3600 -- The length of time content should
        live in the cache before an update is attempted.
        """
        self.time_to_live = timeToLiveSeconds
        self.user_agent = gpodder.user_agent
        return

    def fetch(self, url, old_channel=None, use_proxies=False,
            http_proxy=None, ftp_proxy=None):
        """
        Returns an (updated, feed) tuple for the feed at the specified
        URL. If the feed hasn't been updated since the last run, updated
        will be False. If it has been updated, updated will be True.

        If updated is False, the feed value is None and you have to use
        the old channel which you passed to this function.

        If use_proxies is set to True, the cache generates a ProxyHandler
        from the http_proxy and ftp_proxy variables.
        """
        if old_channel is not None:
            etag = old_channel.etag
            modified = feedparser._parse_date(old_channel.last_modified)
        else:
            etag = None
            modified = None
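
        # feedparser uses these values for a conditional GET: the etag is sent
        # as If-None-Match and the modified timestamp as If-Modified-Since, so
        # an unchanged feed comes back as HTTP 304 without a body.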

        original_url = url

        # If we have a username or password, rebuild the URL with them included
        # Note: using a HTTPBasicAuthHandler would be a pain because we need to
        # know the realm. It can be done, but I think this method will work fine
        if old_channel is not None and (
                old_channel.username or old_channel.password):
            username = urllib.quote(old_channel.username)
            password = urllib.quote(old_channel.password)
            auth_string = ':'.join((username, password))
            url_parts = list(urlparse.urlsplit(url))
            url_parts[1] = '@'.join((auth_string, url_parts[1]))
            url = urlparse.urlunsplit(url_parts)
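            # For illustration (hypothetical values, not real credentials):
            # 'http://example.com/feed.xml' with username 'alice' and password
            # 's3cret' becomes 'http://alice:s3cret@example.com/feed.xml'.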

        handlers = []
        if use_proxies:
            # Add a ProxyHandler for fetching data via a proxy server
            proxies = {}
            if http_proxy:
                proxies['http'] = http_proxy
                log('Using proxy for HTTP: %s', http_proxy, sender=self)
            if ftp_proxy:
                proxies['ftp'] = ftp_proxy
                log('Using proxy for FTP: %s', ftp_proxy, sender=self)
            handlers.append(urllib2.ProxyHandler(proxies))
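            # The proxies dict maps URL schemes to proxy URLs, for example
            # (hypothetical): {'http': 'http://proxy.example.org:8080'}.
            # urllib2.ProxyHandler routes matching requests through that proxy.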

        # We know we need to fetch, so go ahead and do it.
        parsed_result = feedparser.parse(url,
                agent=self.user_agent,
                modified=modified,
                etag=etag,
                handlers=handlers)

        # Sometimes, the status code is not set (ugly feed?)
        status = parsed_result.get('status', None)

        # 304: Not Modified
        if status == 304:
            log('Not Modified: %s', url, sender=self)
            return (False, None)

        if status == 401:
            log('HTTP authentication required: %s', original_url, sender=self)
            return (False, parsed_result)

        if not hasattr(parsed_result, 'headers'):
            log('The requested object does not have a "headers" attribute.', sender=self)
            return (False, None)

        content_type = parsed_result.headers.get('content-type', '').lower()
        # TODO: Also detect OPML feeds and other content types here
        if parsed_result.version == '':
            log('%s looks like a webpage - trying feed autodiscovery.', url, sender=self)

            if not hasattr(parsed_result.feed, 'links'):
                return (False, None)
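
            # Feed autodiscovery looks for HTML link elements such as this
            # (a hypothetical example):
            #   <link rel="alternate" type="application/rss+xml"
            #         href="http://example.com/feed.xml">
            # feedparser exposes these as entries in parsed_result.feed.links.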
            try:
                found_alternate_feed = False
                for link in parsed_result.feed.links:
                    if hasattr(link, 'type') and hasattr(link, 'href') and hasattr(link, 'rel'):
                        if link.type in self.SUPPORTED_FEED_TYPES and link.rel == 'alternate':
                            log('Found alternate feed link: %s', link.href, sender=self)
                            parsed_result = feedparser.parse(link.href,
                                    agent=self.user_agent,
                                    modified=modified,
                                    etag=etag)
                            found_alternate_feed = True
                            break

                # YouTube etc feed lookup (after the normal link lookup in case
                # they provide a standard feed discovery mechanism in the future).
                if not found_alternate_feed:
                    next_url = resolver.get_real_channel_url(url)

                    if next_url is not None:
                        parsed_result = feedparser.parse(next_url, agent=self.user_agent, modified=modified, etag=etag)
                        found_alternate_feed = True

                # We have not found a valid feed - abort here!
                if not found_alternate_feed:
                    return (False, None)
            except:
                log('Error while trying to get feed URL from webpage', sender=self, traceback=True)

        updated = False
        status = parsed_result.get('status', None)

        if status == 304:
            # No new data, based on the etag or modified values.
            # We need to update the modified time in the
            # storage, though, so we know that what we have
            # stored is up to date.
            log('Using cached feed: %s', url, sender=self)
        elif status in (200, 301, 302, 307):
            # log('===============')
            # log('[%s]', url)
            # log('LM old: %s', old_channel.last_modified)
            # log('LM new: %s', parsed_result.headers.get('last-modified'))
            # log('=======')
            # log('ET old: %s', old_channel.etag)
            # log('ET new: %s', parsed_result.headers.get('etag'))
            # log('===============')
            updated = True
            # There is new content, so store it unless there was an error.
            # Store it regardless of errors when we don't have anything yet
            error = parsed_result.get('bozo_exception')

            # Detect HTTP authentication pages (e.g. Wifi hotspot login pages
            # that redirect the request and serve HTML instead of the feed)
            if isinstance(error, feedparser.NonXMLContentType) and \
                    status == 302 and hasattr(parsed_result, 'headers') and \
                    parsed_result.headers.get('content-type', '').startswith('text/html'):
                log('Warning: Looks like a Wifi authentication page: %s', url, sender=self)
                log('Acting as if the feed was not updated (FIXME!)', sender=self)
                return (True, None)

            if error:
                log('Warning: %s (%s)', url, str(error), sender=self)
                parsed_result['bozo_exception'] = str(error)
        else:
            log('Strange status code: %s (%s)', url, status, sender=self)

        return (updated, parsed_result)
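

if __name__ == '__main__':
    # Minimal usage sketch (the feed URL is a placeholder, not a real feed;
    # without an old_channel, no etag/modified values are sent):
    cache = Cache(timeToLiveSeconds=3600)
    updated, result = cache.fetch('http://example.com/feed.xml')
    if updated and result is not None:
        print 'Fetched feed with %d entries' % len(result.entries)
    else:
        print 'Feed not updated or not available'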