src/gpodder/cache.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # python-feedcache (customized by Thomas Perl for use in gPodder)
   4 #
   5 # Copyright 2007 Doug Hellmann.
   6 #
   7 #
   8 #                         All Rights Reserved
   9 #
  10 # Permission to use, copy, modify, and distribute this software and
  11 # its documentation for any purpose and without fee is hereby
  12 # granted, provided that the above copyright notice appear in all
  13 # copies and that both that copyright notice and this permission
  14 # notice appear in supporting documentation, and that the name of Doug
  15 # Hellmann not be used in advertising or publicity pertaining to
  16 # distribution of the software without specific, written prior
  17 # permission.
  18 #
  19 # DOUG HELLMANN DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
  20 # INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN
  21 # NO EVENT SHALL DOUG HELLMANN BE LIABLE FOR ANY SPECIAL, INDIRECT OR
  22 # CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
  23 # OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
  24 # NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
  25 # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  26 #
  27
  28
  29 import feedparser
  30
  31 import re
  32 import time
  33 import gpodder
  34
  35 from gpodder import resolver
  36 from gpodder.liblogger import log
  37
  38
  39 def patch_feedparser():
  40     """Fix a bug in feedparser 4.1
  41     This replaces the mapContentType method of the
  42     _FeedParserMixin class to correctly detect the
  43     "plain" content type as "text/plain".
  44
  45     See also:
  46     http://code.google.com/p/feedparser/issues/detail?id=80
  47
  48     Added by Thomas Perl for gPodder 2007-12-29
  49     """
  50     def mapContentType2(self, contentType):
  51         contentType = contentType.lower()
  52         if contentType == 'text' or contentType == 'plain':
  53             contentType = 'text/plain'
  54         elif contentType == 'html':
  55             contentType = 'text/html'
  56         elif contentType == 'xhtml':
  57             contentType = 'application/xhtml+xml'
  58         return contentType
  59
  60     try:
  61         if feedparser._FeedParserMixin().mapContentType('plain') == 'plain':
  62             log('Patching feedparser module... (mapContentType bugfix)')
  63             feedparser._FeedParserMixin.mapContentType = mapContentType2
  64     except:
  65         log('Warning: feedparser unpatched - might be broken!')
  66
# Runs at import time, so the mapContentType fix (if needed) is in
# place before the Cache class below calls feedparser.parse().
patch_feedparser()
  68
  69
  70 class Cache:
  71     """A class to wrap Mark Pilgrim's Universal Feed Parser module
  72     (http://www.feedparser.org) so that parameters can be used to
  73     cache the feed results locally instead of fetching the feed every
  74     time it is requested. Uses both etag and modified times for
  75     caching.
  76     """
  77
  78     # Supported types, see http://feedvalidator.org/docs/warning/EncodingMismatch.html
  79     SUPPORTED_FEED_TYPES = ('application/rss+xml', 'application/atom+xml',
  80             'application/rdf+xml', 'application/xml', 'text/xml')
  81
  82     def __init__(self, timeToLiveSeconds=3600):
  83         """
  84         Arguments:
  85
  86           storage -- Backing store for the cache.  It should follow
  87           the dictionary API, with URLs used as keys.  It should
  88           persist data.
  89
  90           timeToLiveSeconds=300 -- The length of time content should
  91           live in the cache before an update is attempted.
  92         """
  93         self.time_to_live = timeToLiveSeconds
  94         self.user_agent = gpodder.user_agent
  95         return
  96
  97     def fetch(self, url, old_channel=None):
  98         """
  99         Returns an (updated, feed) tuple for the feed at the specified
 100         URL. If the feed hasn't updated since the last run, updated
 101         will be False. If it has been updated, updated will be True.
 102
 103         If updated is False, the feed value is None and you have to use
 104         the old channel which you passed to this function.
 105         """
 106
 107         if old_channel is not None:
 108             etag = old_channel.etag
 109             modified = feedparser._parse_date(old_channel.last_modified)
 110         else:
 111             etag = None
 112             modified = None
 113
 114         # We know we need to fetch, so go ahead and do it.
 115         parsed_result = feedparser.parse(url,
 116                                          agent=self.user_agent,
 117                                          modified=modified,
 118                                          etag=etag,
 119                                          )
 120
 121         content_type = parsed_result.headers.get('content-type', '').lower()
 122         # TODO: Also detect OPML feeds and other content types here
 123         if parsed_result.version == '':
 124             log('%s looks like a webpage - trying feed autodiscovery (%s).', url, content_type, sender=self)
 125             if not hasattr(parsed_result.feed, 'links'):
 126                 return (False, None)
 127             try:
 128                 found_alternate_feed = False
 129                 for link in parsed_result.feed.links:
 130                     if hasattr(link, 'type') and hasattr(link, 'href') and hasattr(link, 'rel'):
 131                         if link.type in self.SUPPORTED_FEED_TYPES and link.rel == 'alternate':
 132                             log('Found alternate feed link: %s', link.href, sender=self)
 133                             parsed_result = feedparser.parse(link.href,
 134                                                              agent=self.user_agent,
 135                                                              modified=modified,
 136                                                              etag=etag,
 137                                                              )
 138                             found_alternate_feed = True
 139                             break
 140
 141                 # YouTube etc feed lookup (after the normal link lookup in case
 142                 # they provide a standard feed discovery mechanism in the future).
 143                 if not found_alternate_feed:
 144                     next = resolver.get_real_channel_url(url)
 145
 146                     if next is not None:
 147                         parsed_result = feedparser.parse(next, agent=self.user_agent, modified=modified, etag=etag)
 148                         found_alternate_feed = True
 149
 150                 # We have not found a valid feed - abort here!
 151                 if not found_alternate_feed:
 152                     return (False, None)
 153             except:
 154                 log('Error while trying to get feed URL from webpage', sender=self, traceback=True)
 155
 156         updated = False
 157         status = parsed_result.get('status', None)
 158
 159         if status == 304:
 160             # No new data, based on the etag or modified values.
 161             # We need to update the modified time in the
 162             # storage, though, so we know that what we have
 163             # stored is up to date.
 164             log('Using cached feed: %s', url, sender=self)
 165         elif status in (200, 301, 302, 307):
 166             # log('===============')
 167             # log('[%s]', url)
 168             # log('LM old: %s', old_channel.last_modified)
 169             # log('LM new: %s', parsed_result.headers.get('last-modified'))
 170             # log('=======')
 171             # log('ET old: %s', old_channel.etag)
 172             # log('ET new: %s', parsed_result.headers.get('etag'))
 173             # log('===============')
 174             updated = True
 175             # There is new content, so store it unless there was an error.
 176             # Store it regardless of errors when we don't have anything yet
 177             error = parsed_result.get('bozo_exception')
 178             if error:
 179                 log('Warning: %s (%s)', url, str(error), sender=self)
 180                 parsed_result['bozo_exception'] = str(error)
 181         else:
 182             log('Strange status code: %s (%s)', url, status, sender=self)
 183
 184         return (updated, parsed_result)
 185