don't treat c:... as URL on Windows
[jhbuild.git] / jhbuild / utils / httpcache.py
blob 17a6fb76f8b52b9dca3e195b970d9ef75deb3c25
# jhbuild - a build script for GNOME 1.x and 2.x
# Copyright (C) 2001-2006 James Henstridge
#
#   httpcache.py: a simple HTTP cache
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
'''Very simple-minded class that can be used to maintain a cache of files
downloaded from web servers.  It is designed to reduce load on web servers,
and draws ideas from feedparser.py.  Strategies include:
    - If a resource has been checked in the last 6 hours, consider it current.
    - Support gzip transfer encoding.
    - Send If-Modified-Since and If-None-Match headers when validating a
      resource to reduce downloads when the file has not changed.
    - Honour Expires headers returned by the server.  If no expiry time is
      given, it defaults to 6 hours.
'''
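
# Sketch of the on-disk layout this module maintains (see read_cache() and
# write_cache() below).  Downloaded files are stored next to an index.xml in
# the cache directory, e.g. ~/.cache/jhbuild/:
#
#   <cache>
#     <entry uri="http://example.org/foo.modules" local="foo.modules"
#            modified="..." etag="..." expires="2006-01-01T00:00:00Z"/>
#   </cache>
#
# The 'modified' and 'etag' attributes are optional; 'expires' uses the ISO
# format produced by _format_isotime().  The URI and file name shown here are
# purely illustrative.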
import os
import sys
import urllib2
import urlparse
import time
import rfc822
import StringIO
try:
    import gzip
except ImportError:
    gzip = None

try:
    import xml.dom.minidom
except ImportError:
    raise SystemExit, _('Python xml packages are required but could not be found')
def _parse_isotime(string):
    if string[-1] != 'Z':
        return time.mktime(time.strptime(string, '%Y-%m-%dT%H:%M:%S'))
    tm = time.strptime(string, '%Y-%m-%dT%H:%M:%SZ')
    return time.mktime(tm[:8] + (0,)) - time.timezone

def _format_isotime(tm):
    return time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(tm))
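
# Illustrative examples of the helpers above (UTC, purely for orientation):
#   _format_isotime(0)                      -> '1970-01-01T00:00:00Z'
#   _parse_isotime('1970-01-01T00:00:00Z')  -> 0.0
# Timestamps without the trailing 'Z' are interpreted as local time instead.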
def _parse_date(date):
    if not date:
        # defensive: tolerate a missing header (e.g. no Expires at all)
        return 0
    tm = rfc822.parsedate_tz(date)
    if tm:
        return rfc822.mktime_tz(tm)
    return 0
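
# Illustrative example: _parse_date('Thu, 01 Jan 1970 00:00:00 GMT') returns
# 0.0; a missing or unparsable date falls back to 0, i.e. already expired.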
class CacheEntry:
    def __init__(self, uri, local, modified, etag, expires=0):
        self.uri = uri
        self.local = local
        self.modified = modified
        self.etag = etag
        self.expires = expires
class Cache:
    try:
        cachedir = os.path.join(os.environ['XDG_CACHE_HOME'], 'jhbuild')
    except KeyError:
        cachedir = os.path.join(os.environ['HOME'], '.cache', 'jhbuild')

    # default to a 6 hour expiry time.
    default_age = 6 * 60 * 60

    def __init__(self, cachedir=None):
        if cachedir:
            self.cachedir = cachedir
        if not os.path.exists(self.cachedir):
            os.makedirs(self.cachedir)
        self.entries = {}
    def read_cache(self):
        self.entries = {}
        cindex = os.path.join(self.cachedir, 'index.xml')
        try:
            document = xml.dom.minidom.parse(cindex)
        except:
            return # treat like an empty cache
        if document.documentElement.nodeName != 'cache':
            document.unlink()
            return # doesn't look like a cache

        for node in document.documentElement.childNodes:
            if node.nodeType != node.ELEMENT_NODE: continue
            if node.nodeName != 'entry': continue
            uri = node.getAttribute('uri')
            local = str(node.getAttribute('local'))
            if node.hasAttribute('modified'):
                modified = node.getAttribute('modified')
            else:
                modified = None
            if node.hasAttribute('etag'):
                etag = node.getAttribute('etag')
            else:
                etag = None
            expires = _parse_isotime(node.getAttribute('expires'))
            # only add to cache list if file actually exists.
            if os.path.exists(os.path.join(self.cachedir, local)):
                self.entries[uri] = CacheEntry(uri, local, modified,
                                               etag, expires)
        document.unlink()
    def write_cache(self):
        cindex = os.path.join(self.cachedir, 'index.xml')

        document = xml.dom.minidom.Document()
        document.appendChild(document.createElement('cache'))
        node = document.createTextNode('\n')
        document.documentElement.appendChild(node)
        for uri in self.entries.keys():
            entry = self.entries[uri]
            node = document.createElement('entry')
            node.setAttribute('uri', entry.uri)
            node.setAttribute('local', entry.local)
            if entry.modified:
                node.setAttribute('modified', entry.modified)
            if entry.etag:
                node.setAttribute('etag', entry.etag)
            node.setAttribute('expires', _format_isotime(entry.expires))
            document.documentElement.appendChild(node)

            node = document.createTextNode('\n')
            document.documentElement.appendChild(node)

        document.writexml(open(cindex, 'w'))
        document.unlink()
    def _make_filename(self, uri):
        '''picks a unique name for a new entry in the cache.
        Very simplistic.'''
        # get the basename from the URI
        parts = urlparse.urlparse(uri, allow_fragments=False)
        base = parts[2].split('/')[-1]
        if not base: base = 'index.html'

        is_unique = False
        while not is_unique:
            is_unique = True
            for entry in self.entries.values():
                if entry.local == base:
                    is_unique = False
                    break
            if not is_unique:
                base = base + '-'
        return base
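
    # Illustrative example of the naming scheme above: a URI ending in
    # "/foo.modules" is stored as "foo.modules"; if another cache entry already
    # uses that name, dashes are appended until it is unique ("foo.modules-",
    # "foo.modules--", ...).  The name "foo.modules" is only an example.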
    def load(self, uri, nonetwork=False, age=None):
        '''Downloads the file associated with the URI, and returns a local
        file name for contents.'''
        # pass file URIs straight through -- no need to cache them
        parts = urlparse.urlparse(uri)
        if parts[0] in ('', 'file'):
            return parts[2]
        if sys.platform.startswith('win') and len(uri) > 1 and uri[1] == ':':
            # on Windows, paths like c:... are local files, not URLs
            return uri
        now = time.time()

        # is the file cached and not expired?
        self.read_cache()
        entry = self.entries.get(uri)
        if entry and (age != 0 or nonetwork):
            if (nonetwork or now <= entry.expires):
                return os.path.join(self.cachedir, entry.local)

        if nonetwork:
            raise RuntimeError(_('file not in cache, but not allowed to check network'))
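
        # No cached copy, or it may be stale: fetch it, sending
        # If-Modified-Since / If-None-Match validators when a previous entry
        # exists so the server can answer 304 Not Modified instead of
        # resending the body.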
        request = urllib2.Request(uri)
        if gzip:
            request.add_header('Accept-encoding', 'gzip')
        if entry:
            if entry.modified:
                request.add_header('If-Modified-Since', entry.modified)
            if entry.etag:
                request.add_header('If-None-Match', entry.etag)
        try:
            response = urllib2.urlopen(request)

            # get data, and gunzip it if it is encoded
            data = response.read()
            if gzip and response.headers.get('Content-Encoding', '') == 'gzip':
                try:
                    data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
                except:
                    data = ''

            expires = response.headers.get('Expires')

            # add new content to cache
            entry = CacheEntry(uri, self._make_filename(uri),
                               response.headers.get('Last-Modified'),
                               response.headers.get('ETag'))
            filename = os.path.join(self.cachedir, entry.local)
            open(filename, 'wb').write(data)
        except urllib2.HTTPError, e:
            if e.code == 304: # not modified; update validated
                expires = e.hdrs.get('Expires')
                filename = os.path.join(self.cachedir, entry.local)
            else:
                raise

        # set expiry date
        entry.expires = _parse_date(expires)
        if entry.expires <= now: # ignore expiry times that have already passed
            if age is None:
                age = self.default_age
            entry.expires = now + age

        # save cache
        self.entries[uri] = entry
        self.write_cache()
        return filename
_cache = None
def load(uri, nonetwork=False, age=None):
    '''Downloads the file associated with the URI, and returns a local
    file name for contents.'''
    global _cache
    if not _cache: _cache = Cache()
    return _cache.load(uri, nonetwork=nonetwork, age=age)
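
# A minimal usage sketch, not part of the original module: run the file
# directly to pull each URI named on the command line through the cache and
# print the resulting local file name.  The URIs are whatever the caller
# supplies; only the module-level load() above is assumed.
if __name__ == '__main__':
    for _uri in sys.argv[1:]:
        try:
            print '%s -> %s' % (_uri, load(_uri))
        except Exception, exc:
            print '%s: %s' % (_uri, exc)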