use 'latest' library versions for maximum lifespan as an example
[gae-samples.git] / indexed-cached-zipserve / memcache_zipserve.py
blob b15d70e4be5a2d78c2f4ed4394fa4333381dca06
1 #!/usr/bin/env python
3 # Copyright 2008 Google Inc.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
18 """A class to serve pages from zip files and use memcache for performance.
20 This contains a class and a function to create an anonymous instance of the
21 class to serve HTTP GET requests. Memcache is used to increase response speed
22 and lower processing cycles used in serving. Credit to Guido van Rossum and
23 his implementation of zipserve which served as a reference as I wrote this.
25 MemcachedZipHandler: Class that serves request
26 create_handler: method to create instance of MemcachedZipHandler
27 """
29 __author__ = 'j.c@google.com (Justin Mattson)'
31 import email.Utils
32 import logging
33 import mimetypes
34 import time
35 import zipfile
37 from google.appengine.api import memcache
38 from google.appengine.ext import webapp
39 from google.appengine.ext.webapp import util
def create_handler(zip_files, max_age=None, public=None):
  """Factory method to create a MemcachedZipHandler bound to a set of archives.

  Args:
    zip_files: A non-empty list. Each entry is either a zip file name or a list
      whose first member is a zip file name (see
      MemcachedZipHandler.MapFileToArchive for how an optional second member
      is used). Bare entries are wrapped into one-element lists; the caller's
      list is not modified.
    max_age: The maximum client-side cache lifetime, or None to keep the
      handler's default MAX_AGE
    public: Whether this should be declared public in the client-side cache,
      or None to keep the handler's default PUBLIC

  Returns:
    A MemcachedZipHandler subclass (not an instance) suitable for use as an
    App Engine webapp request handler.

  Raises:
    ValueError: if the zip_files argument is not a non-empty list
  """
  if not zip_files or not isinstance(zip_files, list):
    raise ValueError('File name arguments must be a list')

  # Normalise to the list-of-lists format on a copy, so the list the caller
  # handed us is left exactly as it was.
  archives = [entry if isinstance(entry, list) else [entry]
              for entry in zip_files]

  class HandlerWrapper(MemcachedZipHandler):
    """MemcachedZipHandler subclass bound to the archives passed to the factory.

    webapp instantiates handler classes itself, so the configuration is
    captured here through the enclosing scope rather than a constructor.
    """

    # Override the caching defaults as class attributes so that
    # SetCachingHeaders (which reads self.MAX_AGE / self.PUBLIC) sees them.
    # Assigning them as locals inside get() would have no effect.
    if max_age is not None:
      MAX_AGE = max_age
    if public is not None:
      PUBLIC = public

    def get(self, name):
      self.zipfilenames = archives
      self.TrueGet(name)

  return HandlerWrapper
class MemcachedZipHandler(webapp.RequestHandler):
  """Handles GET requests for a given URL.

  Serves a GET request from a series of zip files. As files are served they are
  put into memcache, which is much faster than retrieving them from the zip
  source file again. It also uses considerably fewer CPU cycles.

  The archives to search are read from the `zipfilenames` attribute, a
  sequence of lists whose first member is a zip file name and whose optional
  second member is the index path used by MapFileToArchive. The wrapper class
  built by create_handler sets it on every request.
  """
  zipfile_cache = {}                # class cache of source zip files
  zipfilenames = ()                 # archives to search; empty until configured
  MAX_AGE = 600                     # max client-side cache lifetime
  PUBLIC = True                     # public cache setting
  CACHE_PREFIX = "cache://"         # memcache key prefix for actual URLs
  NEG_CACHE_PREFIX = "noncache://"  # memcache key prefix for non-existent URLs

  def TrueGet(self, name):
    """The top-level entry point to serving requests.

    Called 'True' get because it does the work when called from the wrapper
    class' get method.

    Args:
      name: URL requested

    Returns:
      None
    """
    name = self.PreprocessUrl(name)

    # see if we have the page in the memcache
    resp_data = self.GetFromCache(name)
    if resp_data is None:
      logging.info('Cache miss for %s', name)

      # a negative-cache hit means an earlier lookup already failed, so the
      # zip files are not searched again
      if self.GetFromNegativeCache(name) is not None:
        self.Write404Error()
        return

      resp_data = self.GetFromStore(name)
      if resp_data is None:
        logging.info('Adding %s to negative cache, serving 404', name)
        self.StoreInNegativeCache(name)
        self.Write404Error()
        return
      self.StoreOrUpdateInCache(name, resp_data)

    content_type = mimetypes.guess_type(name)[0]
    if content_type:
      self.response.headers['Content-Type'] = content_type
    self.SetCachingHeaders()
    self.response.out.write(resp_data)

  def PreprocessUrl(self, name):
    """Any preprocessing work on the URL when it comes in.

    Put any work related to interpreting the incoming URL here. For example,
    this is used to redirect requests for a directory to the index.html file
    in that directory. Subclasses should override this method to do different
    preprocessing.

    Args:
      name: The incoming URL

    Returns:
      The processed URL
    """
    if name.endswith('/'):
      return "%s%s" % (name, 'index.html')
    return name

  def GetFromStore(self, file_path):
    """Retrieve file from zip files.

    Get the file from the source, it must not have been in the memcache. If
    possible, we'll use the zip file index to quickly locate where the file
    should be found. (See MapFileToArchive documentation for assumptions about
    file ordering.) If we don't have an index or don't find the file where the
    index says we should, look through all the zip files to find it.

    Args:
      file_path: the file that we're looking for

    Returns:
      The contents of the requested file, or None if no archive contains it
      (including when no archives are configured)
    """
    # the archive suggested by the index goes first; every other configured
    # archive follows in order, without retrying the indexed one
    indexed_archive = self.MapFileToArchive(file_path)
    candidates = []
    if indexed_archive:
      candidates.append(indexed_archive)
    for entry in self.zipfilenames:
      archive_name = entry[0]
      if archive_name and archive_name != indexed_archive:
        candidates.append(archive_name)

    for archive_name in candidates:
      zip_archive = self.LoadZipFile(archive_name)
      if zip_archive is None:
        continue
      # we expect some lookups will fail, and that's okay, 404s will deal
      # with that
      try:
        resp_data = zip_archive.read(file_path)
      except (KeyError, RuntimeError):
        continue
      logging.info('%s read from %s', file_path, archive_name)
      return resp_data

    return None

  def LoadZipFile(self, zipfilename):
    """Convenience method to load zip file.

    Just a convenience method to load the zip file from the data store. This is
    useful if we ever want to change data stores and also as a means of
    dependency injection for testing. This method will look at our file cache
    first, and then load and cache the file if there's a cache miss.

    Args:
      zipfilename: the name of the zip file to load

    Returns:
      The zip file requested, or None if there is an I/O error
    """
    zip_archive = self.zipfile_cache.get(zipfilename)
    if zip_archive is None:
      try:
        zip_archive = zipfile.ZipFile(zipfilename)
      except (IOError, RuntimeError) as err:
        logging.error('Can\'t open zipfile %s, cause: %s', zipfilename, err)
      else:
        self.zipfile_cache[zipfilename] = zip_archive
    return zip_archive

  def MapFileToArchive(self, file_path):
    """Given a file name, determine what archive it should be in.

    This method makes two critical assumptions.
    (1) The zip files passed as an argument to the handler, if concatenated
        in that same order, would result in a total ordering
        of all the files. See (2) for ordering type.
    (2) Upper case letters before lower case letters. The traversal of a
        directory tree is depth first. A parent directory's files are added
        before the files of any child directories

    Archives are scanned from last to first; the first one whose index path
    (second list member) sorts at or before file_path is chosen. Entries
    without an index path are skipped.

    Args:
      file_path: the file to be mapped to an archive

    Returns:
      The name of the archive where we expect the file to be, or None if no
      indexed archive matches
    """
    for target in reversed(self.zipfilenames):
      if len(target) > 1 and self.CompareFilenames(target[1], file_path) >= 0:
        return target[0]
    return None

  def CompareFilenames(self, file1, file2):
    """Determines whether file1 is lexicographically 'before' file2.

    WARNING: This method assumes that paths are output depth-first, with
    parent directories' files stored before children's.

    We say that file1 is lexicographically before file2 if the first
    non-matching path segment of file1 is alphabetically before file2's.

    Args:
      file1: the first file path
      file2: the second file path

    Returns:
      A positive number if file1 is before file2
      A negative number if file2 is before file1
      0 if filenames are the same
    """
    f1_segments = file1.split('/')
    f2_segments = file2.split('/')
    f1_len = len(f1_segments)
    f2_len = len(f2_segments)

    # skip over the common leading segments
    segment_ptr = 0
    while (segment_ptr < f1_len and
           segment_ptr < f2_len and
           f1_segments[segment_ptr] == f2_segments[segment_ptr]):
      segment_ptr += 1

    # The number of segments differs and we stopped at (or ran past) the last
    # segment of one of the paths: the one with fewer segments is first
    # because files come before directories. Using >= also covers a path that
    # is a strict prefix of a much deeper one (e.g. 'a' vs 'a/b/c'), which
    # would otherwise index past the end of the shorter path below.
    if f1_len != f2_len and (segment_ptr + 1 >= f1_len or
                             segment_ptr + 1 >= f2_len):
      return f2_len - f1_len

    # same number of segments and we fell off the end: the paths are the same
    if segment_ptr == f1_len:
      return 0

    # both paths have a segment here and the loop stopped because they differ
    if f1_segments[segment_ptr] < f2_segments[segment_ptr]:
      return 1
    return -1

  def SetCachingHeaders(self):
    """Set caching headers for the request."""
    max_age = self.MAX_AGE
    self.response.headers['Expires'] = email.Utils.formatdate(
        time.time() + max_age, usegmt=True)
    cache_control = []
    if self.PUBLIC:
      cache_control.append('public')
    cache_control.append('max-age=%d' % max_age)
    self.response.headers['Cache-Control'] = ', '.join(cache_control)

  def GetFromCache(self, filename):
    """Get file from memcache, if available.

    Args:
      filename: The URL of the file to return

    Returns:
      The content of the file, or None if it is not in memcache
    """
    return memcache.get("%s%s" % (self.CACHE_PREFIX, filename))

  def StoreOrUpdateInCache(self, filename, data):
    """Store data in the cache.

    Store a piece of data in the memcache. Memcache has a maximum item size of
    1*10^6 bytes. If the data is too large, fail, but log the failure. Future
    work will consider compressing the data before storing or chunking it.

    Args:
      filename: the name of the file to store
      data: the data of the file

    Returns:
      None
    """
    try:
      # set() adds the key or overwrites an existing one in a single call
      memcache.set("%s%s" % (self.CACHE_PREFIX, filename), data)
    except ValueError as err:
      logging.warning("Data size too large to cache\n%s", err)

  def Write404Error(self):
    """Output a simple 404 response."""
    self.error(404)
    self.response.out.write('Error 404, file not found')

  def StoreInNegativeCache(self, filename):
    """If a non-existent URL is accessed, cache this result as well.

    Future work should consider setting a maximum negative cache size to
    prevent it from negatively impacting the real cache.

    Args:
      filename: URL to add to negative cache

    Returns:
      None
    """
    memcache.add("%s%s" % (self.NEG_CACHE_PREFIX, filename), -1)

  def GetFromNegativeCache(self, filename):
    """Retrieve from negative cache.

    Args:
      filename: URL to retrieve

    Returns:
      The marker stored by StoreInNegativeCache (-1) if the URL is in the
      negative cache, otherwise None
    """
    return memcache.get("%s%s" % (self.NEG_CACHE_PREFIX, filename))
def main():
  """Build the WSGI application for this module and hand it to App Engine.

  URLs of the form /<segment>/<rest> are routed to MemcachedZipHandler.
  """
  # NOTE(review): the class routed here defines no get() of its own; only the
  # wrapper returned by create_handler does -- confirm this entry point is
  # actually meant to serve requests.
  routes = [('/([^/]+)/(.*)', MemcachedZipHandler)]
  util.run_wsgi_app(webapp.WSGIApplication(routes))


if __name__ == '__main__':
  main()