indexed-cached-zipserve/memcache_zipserve.py

   1 #!/usr/bin/env python
   2 #
   3 # Copyright 2008 Google Inc.
   4
   5 # Licensed under the Apache License, Version 2.0 (the "License");
   6 # you may not use this file except in compliance with the License.
   7 # You may obtain a copy of the License at
   8 #
   9 #   http://www.apache.org/licenses/LICENSE-2.0
  10 #
  11 # Unless required by applicable law or agreed to in writing, software
  12 # distributed under the License is distributed on an "AS IS" BASIS,
  13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 # See the License for the specific language governing permissions and
  15 # limitations under the License.
  16 #
  17
  18 """A class to serve pages from zip files and use memcache for performance.
  19
  20 This contains a class and a function to create an anonymous instance of the
  21 class to serve HTTP GET requests. Memcache is used to increase response speed
  22 and lower processing cycles used in serving. Credit to Guido van Rossum and
  23 his implementation of zipserve which served as a reference as I wrote this.
  24
  25   MemcachedZipHandler: Class that serves request
  26   create_handler: method to create instance of MemcachedZipHandler
  27 """
  28
  29 __author__ = 'j.c@google.com (Justin Mattson)'
  30
  31 import email.Utils
  32 import logging
  33 import mimetypes
  34 import time
  35 import zipfile
  36
  37 from google.appengine.api import memcache
  38 from google.appengine.ext import webapp
  39 from google.appengine.ext.webapp import util
  40
  41
  42 def create_handler(zip_files, max_age=None, public=None):
  43   """Factory method to create a MemcachedZipHandler instance.
  44
  45   Args:
  46     zip_files: A list of file names, or a list of lists of file name, first
  47         member of file mappings. See MemcachedZipHandler documentation for
  48         more information about using the list of lists format
  49     max_age: The maximum client-side cache lifetime
  50     public: Whether this should be declared public in the client-side cache
  51   Returns:
  52     A MemcachedZipHandler wrapped in a pretty, anonymous bow for use with App
  53     Engine
  54
  55   Raises:
  56     ValueError: if the zip_files argument is not a list
  57   """
  58   # verify argument integrity. If the argument is passed in list format,
  59   # convert it to list of lists format
  60
  61   if zip_files and type(zip_files).__name__ == 'list':
  62     num_items = len(zip_files)
  63     while num_items > 0:
  64       if type(zip_files[num_items - 1]).__name__ != 'list':
  65         zip_files[num_items - 1] = [zip_files[num_items-1]]
  66       num_items -= 1
  67   else:
  68     raise ValueError('File name arguments must be a list')
  69
  70   class HandlerWrapper(MemcachedZipHandler):
  71     """Simple wrapper for an instance of MemcachedZipHandler.
  72
  73     I'm still not sure why this is needed
  74     """
  75
  76     def get(self, name):
  77       self.zipfilenames = zip_files
  78       self.TrueGet(name)
  79       if max_age is not None:
  80         MAX_AGE = max_age
  81       if public is not None:
  82         PUBLIC = public
  83
  84   return HandlerWrapper
  85
  86
  87 class MemcachedZipHandler(webapp.RequestHandler):
  88   """Handles get requests for a given URL.
  89
  90   Serves a GET request from a series of zip files. As files are served they are
  91   put into memcache, which is much faster than retreiving them from the zip
  92   source file again. It also uses considerably fewer CPU cycles.
  93   """
  94   zipfile_cache = {}                # class cache of source zip files
  95   MAX_AGE = 600                     # max client-side cache lifetime
  96   PUBLIC = True                     # public cache setting
  97   CACHE_PREFIX = "cache://"         # memcache key prefix for actual URLs
  98   NEG_CACHE_PREFIX = "noncache://"  # memcache key prefix for non-existant URL
  99
 100   def TrueGet(self, name):
 101     """The top-level entry point to serving requests.
 102
 103     Called 'True' get because it does the work when called from the wrapper
 104     class' get method
 105
 106     Args:
 107       name: URL requested
 108
 109     Returns:
 110       None
 111     """
 112     name = self.PreprocessUrl(name)
 113
 114     # see if we have the page in the memcache
 115     resp_data = self.GetFromCache(name)
 116     if resp_data is None:
 117       logging.info('Cache miss for %s', name)
 118       resp_data = self.GetFromNegativeCache(name)
 119       if resp_data is None:
 120         resp_data = self.GetFromStore(name)
 121
 122         # IF we have the file, put it in the memcache
 123         # ELSE put it in the negative cache
 124         if resp_data is not None:
 125           self.StoreOrUpdateInCache(name, resp_data)
 126         else:
 127           logging.info('Adding %s to negative cache, serving 404', name)
 128           self.StoreInNegativeCache(name)
 129           self.Write404Error()
 130           return
 131       else:
 132         self.Write404Error()
 133         return
 134
 135     content_type, encoding = mimetypes.guess_type(name)
 136     if content_type:
 137       self.response.headers['Content-Type'] = content_type
 138     self.SetCachingHeaders()
 139     self.response.out.write(resp_data)
 140
 141   def PreprocessUrl(self, name):
 142     """Any preprocessing work on the URL when it comes it.
 143
 144     Put any work related to interpretting the incoming URL here. For example,
 145     this is used to redirect requests for a directory to the index.html file
 146     in that directory. Subclasses should override this method to do different
 147     preprocessing.
 148
 149     Args:
 150       name: The incoming URL
 151
 152     Returns:
 153       The processed URL
 154     """
 155     if name[len(name) - 1:] == '/':
 156       return "%s%s" % (name, 'index.html')
 157     else:
 158       return name
 159
 160   def GetFromStore(self, file_path):
 161     """Retrieve file from zip files.
 162
 163     Get the file from the source, it must not have been in the memcache. If
 164     possible, we'll use the zip file index to quickly locate where the file
 165     should be found. (See MapToFileArchive documentation for assumptions about
 166     file ordering.) If we don't have an index or don't find the file where the
 167     index says we should, look through all the zip files to find it.
 168
 169     Args:
 170       file_path: the file that we're looking for
 171
 172     Returns:
 173       The contents of the requested file
 174     """
 175     resp_data = None
 176     file_itr = iter(self.zipfilenames)
 177
 178     # check the index, if we have one, to see what archive the file is in
 179     archive_name = self.MapFileToArchive(file_path)
 180     if not archive_name:
 181       archive_name = file_itr.next()[0]
 182
 183     while resp_data is None and archive_name:
 184       zip_archive = self.LoadZipFile(archive_name)
 185       if zip_archive:
 186
 187         # we expect some lookups will fail, and that's okay, 404s will deal
 188         # with that
 189         try:
 190           resp_data = zip_archive.read(file_path)
 191         except (KeyError, RuntimeError), err:
 192           # no op
 193           x = False
 194         if resp_data is not None:
 195           logging.info('%s read from %s', file_path, archive_name)
 196
 197       try:
 198         archive_name = file_itr.next()[0]
 199       except (StopIteration), err:
 200         archive_name = False
 201
 202     return resp_data
 203
 204   def LoadZipFile(self, zipfilename):
 205     """Convenience method to load zip file.
 206
 207     Just a convenience method to load the zip file from the data store. This is
 208     useful if we ever want to change data stores and also as a means of
 209     dependency injection for testing. This method will look at our file cache
 210     first, and then load and cache the file if there's a cache miss
 211
 212     Args:
 213       zipfilename: the name of the zip file to load
 214
 215     Returns:
 216       The zip file requested, or None if there is an I/O error
 217     """
 218     zip_archive = None
 219     zip_archive = self.zipfile_cache.get(zipfilename)
 220     if zip_archive is None:
 221       try:
 222         zip_archive = zipfile.ZipFile(zipfilename)
 223         self.zipfile_cache[zipfilename] = zip_archive
 224       except (IOError, RuntimeError), err:
 225         logging.error('Can\'t open zipfile %s, cause: %s' % (zipfilename,
 226                                                              err))
 227     return zip_archive
 228
 229   def MapFileToArchive(self, file_path):
 230     """Given a file name, determine what archive it should be in.
 231
 232     This method makes two critical assumptions.
 233     (1) The zip files passed as an argument to the handler, if concatenated
 234         in that same order, would result in a total ordering
 235         of all the files. See (2) for ordering type.
 236     (2) Upper case letters before lower case letters. The traversal of a
 237         directory tree is depth first. A parent directory's files are added
 238         before the files of any child directories
 239
 240     Args:
 241       file_path: the file to be mapped to an archive
 242
 243     Returns:
 244       The name of the archive where we expect the file to be
 245     """
 246     num_archives = len(self.zipfilenames)
 247     while num_archives > 0:
 248       target = self.zipfilenames[num_archives - 1]
 249       if len(target) > 1:
 250         if self.CompareFilenames(target[1], file_path) >= 0:
 251           return target[0]
 252       num_archives -= 1
 253
 254     return None
 255
 256   def CompareFilenames(self, file1, file2):
 257     """Determines whether file1 is lexigraphically 'before' file2.
 258
 259     WARNING: This method assumes that paths are output in a depth-first,
 260     with parent directories' files stored before childs'
 261
 262     We say that file1 is lexigraphically before file2 if the last non-matching
 263     path segment of file1 is alphabetically before file2.
 264
 265     Args:
 266       file1: the first file path
 267       file2: the second file path
 268
 269     Returns:
 270       A positive number if file1 is before file2
 271       A negative number if file2 is before file1
 272       0 if filenames are the same
 273     """
 274     f1_segments = file1.split('/')
 275     f2_segments = file2.split('/')
 276
 277     segment_ptr = 0
 278     while (segment_ptr < len(f1_segments) and
 279            segment_ptr < len(f2_segments) and
 280            f1_segments[segment_ptr] == f2_segments[segment_ptr]):
 281       segment_ptr += 1
 282
 283     if len(f1_segments) == len(f2_segments):
 284
 285       # we fell off the end, the paths much be the same
 286       if segment_ptr == len(f1_segments):
 287         return 0
 288
 289       # we didn't fall of the end, compare the segments where they differ
 290       if f1_segments[segment_ptr] < f2_segments[segment_ptr]:
 291         return 1
 292       elif f1_segments[segment_ptr] > f2_segments[segment_ptr]:
 293         return -1
 294       else:
 295         return 0
 296
 297       # the number of segments differs, we either mismatched comparing
 298       # directories, or comparing a file to a directory
 299     else:
 300
 301       # IF we were looking at the last segment of one of the paths,
 302       # the one with fewer segments is first because files come before
 303       # directories
 304       # ELSE we just need to compare directory names
 305       if (segment_ptr + 1 == len(f1_segments) or
 306           segment_ptr + 1 == len(f2_segments)):
 307         return len(f2_segments) - len(f1_segments)
 308       else:
 309         if f1_segments[segment_ptr] < f2_segments[segment_ptr]:
 310           return 1
 311         elif f1_segments[segment_ptr] > f2_segments[segment_ptr]:
 312           return -1
 313         else:
 314           return 0
 315
 316   def SetCachingHeaders(self):
 317     """Set caching headers for the request."""
 318     max_age = self.MAX_AGE
 319     self.response.headers['Expires'] = email.Utils.formatdate(
 320         time.time() + max_age, usegmt=True)
 321     cache_control = []
 322     if self.PUBLIC:
 323       cache_control.append('public')
 324     cache_control.append('max-age=%d' % max_age)
 325     self.response.headers['Cache-Control'] = ', '.join(cache_control)
 326
 327   def GetFromCache(self, filename):
 328     """Get file from memcache, if available.
 329
 330     Args:
 331       filename: The URL of the file to return
 332
 333     Returns:
 334       The content of the file
 335     """
 336     return memcache.get("%s%s" % (self.CACHE_PREFIX, filename))
 337
 338   def StoreOrUpdateInCache(self, filename, data):
 339     """Store data in the cache.
 340
 341     Store a piece of data in the memcache. Memcache has a maximum item size of
 342     1*10^6 bytes. If the data is too large, fail, but log the failure. Future
 343     work will consider compressing the data before storing or chunking it
 344
 345     Args:
 346       filename: the name of the file to store
 347       data: the data of the file
 348
 349     Returns:
 350       None
 351     """
 352     try:
 353       if not memcache.add("%s%s" % (self.CACHE_PREFIX, filename), data):
 354         memcache.replace("%s%s" % (self.CACHE_PREFIX, filename), data)
 355     except (ValueError), err:
 356       logging.warning("Data size too large to cache\n%s" % err)
 357
 358   def Write404Error(self):
 359     """Ouptut a simple 404 response."""
 360     self.error(404)
 361     self.response.out.write('Error 404, file not found')
 362
 363   def StoreInNegativeCache(self, filename):
 364     """If a non-existant URL is accessed, cache this result as well.
 365
 366     Future work should consider setting a maximum negative cache size to
 367     prevent it from from negatively impacting the real cache.
 368
 369     Args:
 370       filename: URL to add ot negative cache
 371
 372     Returns:
 373       None
 374     """
 375     memcache.add("%s%s" % (self.NEG_CACHE_PREFIX, filename), -1)
 376
 377   def GetFromNegativeCache(self, filename):
 378     """Retrieve from negative cache.
 379
 380     Args:
 381       filename: URL to retreive
 382
 383     Returns:
 384       The file contents if present in the negative cache.
 385     """
 386     return memcache.get("%s%s" % (self.NEG_CACHE_PREFIX, filename))
 387
 388
 389 def main():
 390   application = webapp.WSGIApplication([('/([^/]+)/(.*)',
 391                                          MemcachedZipHandler)])
 392   util.run_wsgi_app(application)
 393
 394
 395 if __name__ == '__main__':
 396   main()