Correct HTML feed auto-detection (bug 215)
[gpodder.git] / src / gpodder / cache.py
blob70c881bb3c0848bd18edc77a4f1e78f90bb0d986
1 # -*- coding: utf-8 -*-
2 #
3 # python-feedcache (customized by Thomas Perl for use in gPodder)
5 # Copyright 2007 Doug Hellmann.
8 # All Rights Reserved
10 # Permission to use, copy, modify, and distribute this software and
11 # its documentation for any purpose and without fee is hereby
12 # granted, provided that the above copyright notice appear in all
13 # copies and that both that copyright notice and this permission
14 # notice appear in supporting documentation, and that the name of Doug
15 # Hellmann not be used in advertising or publicity pertaining to
16 # distribution of the software without specific, written prior
17 # permission.
19 # DOUG HELLMANN DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
20 # INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN
21 # NO EVENT SHALL DOUG HELLMANN BE LIABLE FOR ANY SPECIAL, INDIRECT OR
22 # CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
23 # OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
24 # NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
25 # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
29 import feedparser
31 import time
32 import gpodder
34 from gpodder.liblogger import log
def patch_feedparser():
    """Fix a bug in feedparser 4.1.

    This replaces the mapContentType method of the
    _FeedParserMixin class to correctly detect the
    "plain" content type as "text/plain".

    See also:
    http://code.google.com/p/feedparser/issues/detail?id=80

    Added by Thomas Perl for gPodder 2007-12-29
    """
    def mapContentType2(self, contentType):
        # Normalize short-hand content type names to full MIME types
        contentType = contentType.lower()
        if contentType in ('text', 'plain'):
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        return contentType

    try:
        # Only monkey-patch when the installed feedparser actually
        # exhibits the bug (i.e. 'plain' is returned unmapped).
        if feedparser._FeedParserMixin().mapContentType('plain') == 'plain':
            log('Patching feedparser module... (mapContentType bugfix)')
            feedparser._FeedParserMixin.mapContentType = mapContentType2
    except Exception:
        # Narrowed from a bare "except:" so that SystemExit and
        # KeyboardInterrupt are not silently swallowed at import time.
        log('Warning: feedparser unpatched - might be broken!')

patch_feedparser()
class Cache:
    """A class to wrap Mark Pilgrim's Universal Feed Parser module
    (http://www.feedparser.org) so that parameters can be used to
    cache the feed results locally instead of fetching the feed every
    time it is requested. Uses both etag and modified times for
    caching.
    """

    # Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
    SUPPORTED_FEED_TYPES = ('application/rss+xml', 'application/atom+xml',
                            'application/rdf+xml', 'application/xml',
                            'text/xml')

    def __init__(self, timeToLiveSeconds=3600):
        """
        Arguments:

          timeToLiveSeconds=3600 -- The length of time content should
          live in the cache before an update is attempted.
        """
        # NOTE(review): the old docstring documented a "storage"
        # argument that does not exist and a default of 300; both
        # have been corrected to match the actual signature.
        self.time_to_live = timeToLiveSeconds
        self.user_agent = gpodder.user_agent

    def fetch(self, url, old_channel=None):
        """
        Returns an (updated, feed) tuple for the feed at the specified
        URL. If the feed hasn't updated since the last run, updated
        will be False. If it has been updated, updated will be True.

        If updated is False, the feed value is None and you have to use
        the old channel which you passed to this function.
        """
        if old_channel is not None:
            # Reuse the conditional-GET data from the previous fetch
            etag = old_channel.etag
            modified = feedparser._parse_date(old_channel.last_modified)
        else:
            etag = None
            modified = None

        # We know we need to fetch, so go ahead and do it.
        parsed_result = feedparser.parse(url,
                                         agent=self.user_agent,
                                         modified=modified,
                                         etag=etag)

        # "headers" is only present for HTTP(S) fetches, so guard the
        # lookup instead of assuming the attribute exists.
        content_type = parsed_result.get('headers', {}).get('content-type', '').lower()
        # TODO: Also detect OPML feeds and other content types here
        if parsed_result.version == '':
            # Not a recognized feed format - probably an HTML page;
            # try feed autodiscovery via <link rel="alternate"> tags.
            log('%s looks like a webpage - trying feed autodiscovery (%s).', url, content_type, sender=self)
            if not hasattr(parsed_result.feed, 'links'):
                return (False, None)
            try:
                found_alternate_feed = False
                for link in parsed_result.feed.links:
                    if hasattr(link, 'type') and hasattr(link, 'href') and hasattr(link, 'rel'):
                        if link.type in self.SUPPORTED_FEED_TYPES and link.rel == 'alternate':
                            log('Found alternate feed link: %s', link.href, sender=self)
                            parsed_result = feedparser.parse(link.href,
                                                             agent=self.user_agent,
                                                             modified=modified,
                                                             etag=etag)
                            found_alternate_feed = True
                            break

                # We have not found a valid feed - abort here!
                if not found_alternate_feed:
                    return (False, None)
            except Exception:
                # Narrowed from a bare "except:"; autodiscovery stays
                # best-effort and falls through to the status handling.
                log('Error while trying to get feed URL from webpage', sender=self, traceback=True)

        updated = False
        status = parsed_result.get('status', None)

        if status == 304:
            # No new data, based on the etag or modified values.
            # We need to update the modified time in the
            # storage, though, so we know that what we have
            # stored is up to date.
            log('Using cached feed: %s', url, sender=self)
        elif status in (200, 301, 302, 307):
            updated = True
            # There is new content, so store it unless there was an error.
            # Store it regardless of errors when we don't have anything yet
            error = parsed_result.get('bozo_exception')
            if error:
                log('Warning: %s (%s)', url, str(error), sender=self)
                # Store a string representation - exception objects
                # may not be picklable/serializable by the caller.
                parsed_result['bozo_exception'] = str(error)
        else:
            log('Strange status code: %s (%s)', url, status, sender=self)

        return (updated, parsed_result)