3 #tooltool is a lookaside cache implemented in Python
4 #Copyright (C) 2011 John H. Ford <john@johnford.info>
6 #This program is free software; you can redistribute it and/or
7 #modify it under the terms of the GNU General Public License
8 #as published by the Free Software Foundation version 2
10 #This program is distributed in the hope that it will be useful,
11 #but WITHOUT ANY WARRANTY; without even the implied warranty of
12 #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 #GNU General Public License for more details.
15 #You should have received a copy of the GNU General Public License
16 #along with this program; if not, write to the Free Software
17 #Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# A manifest file specifies files in a directory that are stored
# elsewhere.  The manifest should only list files in the directory in
# which it resides, and it should be called 'manifest.manifest'
# Module-level logger shared by every function in this file; handlers are
# attached near the bottom of the file at script start-up.
log = logging.getLogger(__name__)
class FileRecordJSONEncoderException(Exception):
    """Raised by FileRecordJSONEncoder when asked to encode something that
    is not a FileRecord (or a list of FileRecords)."""
    pass
class InvalidManifest(Exception):
    """Raised when a manifest file is absent or cannot be parsed."""
    pass
class ExceptionWithFilename(Exception):
    """Base class for exceptions that concern one particular file on disk."""

    def __init__(self, filename):
        # Keep the offending filename around so callers can report it.
        super(ExceptionWithFilename, self).__init__()
        self.filename = filename
class DigestMismatchException(ExceptionWithFilename):
    """The named file's content digest did not match the expected digest."""
    pass
class MissingFileException(ExceptionWithFilename):
    """Raised when a file named by a FileRecord is not present on disk."""
    pass
class FileRecord(object):
    """One manifest entry: a filename plus the size and digest (computed
    with 'algorithm') that the file is expected to have on disk."""

    def __init__(self, filename, size, digest, algorithm):
        object.__init__(self)
        self.filename = filename
        self.size = size
        self.digest = digest
        self.algorithm = algorithm
        log.debug("creating %s 0x%x" % (self.__class__.__name__, id(self)))

    def __eq__(self, other):
        if self is other:
            return True
        # Two records are equal when all four recorded fields agree.
        return (self.filename == other.filename and
                self.size == other.size and
                self.digest == other.digest and
                self.algorithm == other.algorithm)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __str__(self):
        return repr(self)

    def __repr__(self):
        return "%s.%s(filename='%s', size='%s', digest='%s', algorithm='%s')" % (
            __name__, self.__class__.__name__, self.filename, self.size,
            self.digest, self.algorithm)

    def present(self):
        # Doesn't check validity
        return os.path.exists(self.filename)

    def validate_size(self):
        """Return True if the on-disk size matches the recorded size.

        Raises MissingFileException when the file is absent."""
        if self.present():
            return self.size == os.path.getsize(self.filename)
        log.debug("trying to validate size on a missing file, %s", self.filename)
        raise MissingFileException(filename=self.filename)

    def validate_digest(self):
        """Return True if the on-disk digest matches the recorded digest.

        Raises MissingFileException when the file is absent."""
        if self.present():
            with open(self.filename, 'rb') as f:
                return self.digest == digest_file(f, self.algorithm)
        # BUG FIX: the filename argument used to be swallowed inside the
        # format-string literal, so it was never interpolated.
        log.debug("trying to validate digest on a missing file, %s", self.filename)
        raise MissingFileException(filename=self.filename)

    def validate(self):
        """True only when both the size and the digest validate."""
        if self.validate_size():
            if self.validate_digest():
                return True
        return False

    def describe(self):
        """Return a one-line human-readable status for this record's file."""
        if self.present() and self.validate():
            return "'%s' is present and valid" % self.filename
        elif self.present():
            return "'%s' is present and invalid" % self.filename
        else:
            return "'%s' is absent" % self.filename
def create_file_record(filename, algorithm):
    """Hash 'filename' with 'algorithm' and return the resulting FileRecord.

    The record stores only the basename; the size and digest are taken from
    the file as it exists right now."""
    # Use a context manager so the handle is closed even if hashing fails
    # (the original opened the file and relied on a manual close).
    with open(filename, 'rb') as fo:
        stored_filename = os.path.split(filename)[1]
        fr = FileRecord(stored_filename, os.path.getsize(filename),
                        digest_file(fo, algorithm), algorithm)
    return fr
class FileRecordJSONEncoder(json.JSONEncoder):
    """Serialize FileRecord objects (or lists of them) as plain JSON dicts."""

    def encode_file_record(self, obj):
        """Map one FileRecord to a JSON-serializable dict.

        Raises FileRecordJSONEncoderException for anything else."""
        if not issubclass(type(obj), FileRecord):
            err = "FileRecordJSONEncoder is only for FileRecord and lists of FileRecords, not %s" % obj.__class__.__name__
            raise FileRecordJSONEncoderException(err)
        return {'filename': obj.filename,
                'size': obj.size,
                'algorithm': obj.algorithm,
                'digest': obj.digest}

    def default(self, f):
        # json.JSONEncoder hook invoked for objects json can't serialize
        # natively; supports a bare FileRecord or a list of them.
        if issubclass(type(f), list):
            return [self.encode_file_record(i) for i in f]
        return self.encode_file_record(f)
class FileRecordJSONDecoder(json.JSONDecoder):
    """I help the json module materialize a FileRecord from
    a JSON file. I understand FileRecords and lists of
    FileRecords. I ignore things that I don't expect for now"""
    # TODO: make this more explicit in what it's looking for
    # and error out on unexpected things

    def process_file_records(self, obj):
        """Recursively convert decoded JSON into FileRecords where possible.

        Lists are filtered down to the FileRecords they contain; a dict with
        exactly the four expected keys becomes a FileRecord; anything else is
        returned unchanged."""
        if isinstance(obj, list):
            record_list = []
            for i in obj:
                record = self.process_file_records(i)
                if issubclass(type(record), FileRecord):
                    record_list.append(record)
            return record_list
        required_keys = ('filename', 'size', 'algorithm', 'digest')
        # NOTE: dict.has_key() was replaced with 'in' (has_key is deprecated
        # in Python 2 and gone in Python 3); the check is otherwise identical.
        if isinstance(obj, dict) and \
           len(obj.keys()) == 4 and \
           all(k in obj for k in required_keys):
            rv = FileRecord(obj['filename'], obj['size'],
                            obj['digest'], obj['algorithm'])
            log.debug("materialized %s" % rv)
            return rv
        return obj

    def decode(self, s):
        decoded = json.JSONDecoder.decode(self, s)
        rv = self.process_file_records(decoded)
        return rv
class Manifest(object):
    """An ordered collection of FileRecords plus (de)serialization helpers."""

    # serialization formats understood by load/loads/dump/dumps
    valid_formats = ('json',)

    def __init__(self, file_records=None):
        # BUG FIX: the default used to be a shared mutable list ([]), so every
        # Manifest() created without arguments aliased the same list.
        self.file_records = file_records if file_records is not None else []

    def __eq__(self, other):
        if self is other:
            return True
        if len(self.file_records) != len(other.file_records):
            log.debug('Manifests differ in number of files')
            return False
        #TODO: Lists in a different order should be equal
        for mine, theirs in zip(self.file_records, other.file_records):
            if mine != theirs:
                log.debug('FileRecords differ, %s vs %s' % (mine, theirs))
                return False
        return True

    def __ne__(self, other):
        return not self.__eq__(other)

    def __deepcopy__(self, memo):
        # This is required for a deep copy
        return Manifest(self.file_records[:])

    def __copy__(self):
        # shallow copy: shares the underlying record list
        return Manifest(self.file_records)

    def copy(self):
        return Manifest(self.file_records[:])

    def present(self):
        return all(i.present() for i in self.file_records)

    def validate_sizes(self):
        return all(i.validate_size() for i in self.file_records)

    def validate_digests(self):
        return all(i.validate_digest() for i in self.file_records)

    def validate(self):
        return all(i.validate() for i in self.file_records)

    def sort(self):
        # smallest files first
        self.file_records.sort(key=lambda x: x.size)

    def load(self, data_file, fmt='json'):
        """Extend file_records with records parsed from an open file object.

        Raises InvalidManifest when the data cannot be parsed."""
        assert fmt in self.valid_formats
        if fmt == 'json':
            try:
                self.file_records.extend(
                    json.load(data_file, cls=FileRecordJSONDecoder))
            except ValueError:
                raise InvalidManifest("trying to read invalid manifest file")

    def loads(self, data_string, fmt='json'):
        """Extend file_records with records parsed from a string.

        Raises InvalidManifest when the data cannot be parsed."""
        assert fmt in self.valid_formats
        if fmt == 'json':
            try:
                self.file_records.extend(
                    json.loads(data_string, cls=FileRecordJSONDecoder))
            except ValueError:
                raise InvalidManifest("trying to read invalid manifest file")

    def dump(self, output_file, fmt='json'):
        """Serialize the manifest to an open file object."""
        assert fmt in self.valid_formats
        if fmt == 'json':
            rv = json.dump(self.file_records, output_file, indent=0,
                           cls=FileRecordJSONEncoder)
            # trailing newline; write() replaces the py2-only
            # "print >> output_file, ''" with an equivalent portable call
            output_file.write('\n')
            return rv

    def dumps(self, fmt='json'):
        """Serialize the manifest and return it as a string."""
        assert fmt in self.valid_formats
        if fmt == 'json':
            return json.dumps(self.file_records, cls=FileRecordJSONEncoder)
def digest_file(f, a):
    """I take a file like object 'f' and return a hex-string containing
    of the result of the algorithm 'a' applied to 'f'."""
    # The hash creation/update/return were lost in the mangled text; this
    # restores the chunked-read hashing loop the docstring describes.
    h = hashlib.new(a)
    chunk_size = 1024 * 10
    data = f.read(chunk_size)
    while data:
        h.update(data)
        data = f.read(chunk_size)
    if hasattr(f, 'name'):
        log.debug('hashed %s with %s to be %s', f.name, a, h.hexdigest())
    else:
        log.debug('hashed a file with %s to be %s', a, h.hexdigest())
    return h.hexdigest()
263 # TODO: write tests for this function
def open_manifest(manifest_file):
    """I know how to take a filename and load it into a Manifest object"""
    # Guard clause: a missing file cannot be a manifest.
    if not os.path.exists(manifest_file):
        log.debug("tried to load absent file '%s' as manifest" % manifest_file)
        raise InvalidManifest("manifest file '%s' does not exist" % manifest_file)
    manifest = Manifest()
    with open(manifest_file) as f:
        manifest.load(f)
        log.debug("loaded manifest from file '%s'" % manifest_file)
    return manifest
276 # TODO: write tests for this function
def list_manifest(manifest_file):
    """I know how print all the files in a location"""
    try:
        manifest = open_manifest(manifest_file)
    except InvalidManifest:
        log.error("failed to load manifest file at '%s'" % manifest_file)
        return False
    for f in manifest.file_records:
        # P = present on disk, V = present and valid
        print("%s\t%s\t%s" % ("P" if f.present() else "-",
                              "V" if f.present() and f.validate() else "-",
                              f.filename))
    return True
def validate_manifest(manifest_file):
    """I validate that all files in a manifest are present and valid but
    don't fetch or delete them if they aren't"""
    try:
        manifest = open_manifest(manifest_file)
    except InvalidManifest:
        log.error("failed to load manifest file at '%s'" % manifest_file)
        return False
    invalid_files = []
    absent_files = []
    for f in manifest.file_records:
        if not f.present():
            absent_files.append(f)
        elif not f.validate():
            invalid_files.append(f)
    # valid only when nothing is absent and nothing is invalid
    if len(invalid_files + absent_files) == 0:
        return True
    return False
311 # TODO: write tests for this function
def add_files(manifest_file, algorithm, filenames):
    """Hash each file in 'filenames' with 'algorithm' and record it in
    manifest_file (which is rewritten).

    Returns True if all files were successfully added, False if not,
    and doesn't catch library Exceptions.  If any files are already
    tracked in the manifest, the return will be False because they
    weren't added this time."""
    all_files_added = True
    # Create a old_manifest object to add to
    if os.path.exists(manifest_file):
        old_manifest = open_manifest(manifest_file)
    else:
        old_manifest = Manifest()
        log.debug("creating a new manifest file")
    new_manifest = Manifest()  # use a different manifest for the output
    for filename in filenames:
        log.debug("adding %s" % filename)
        new_fr = create_file_record(filename, algorithm)
        log.debug("appending a new file record to manifest file")
        add = True
        for fr in old_manifest.file_records:
            log.debug("manifest file has '%s'" % "', ".join(
                [x.filename for x in old_manifest.file_records]))
            if new_fr == fr and new_fr.validate():
                # TODO: Decide if this case should really cause a False return
                log.info("file already in old_manifest file and matches")
                add = False
            elif new_fr == fr and not new_fr.validate():
                log.error("file already in old_manifest file but is invalid")
                add = False
            if filename == fr.filename:
                log.error("manifest already contains file named %s" % filename)
                add = False
        if add:
            new_manifest.file_records.append(new_fr)
            log.debug("added '%s' to manifest" % filename)
        else:
            all_files_added = False
    # NOTE(review): binary mode works on py2; py3 would need 'w' here.
    with open(manifest_file, 'wb') as output:
        new_manifest.dump(output, fmt='json')
    return all_files_added
353 # TODO: write tests for this function
def fetch_file(base_url, file_record, overwrite=False, grabchunk=1024*4):
    """Download the file described by 'file_record' from 'base_url'.

    A file which is requested to be fetched that exists locally will be hashed.
    If the hash matches the requested file's hash, nothing will be done and the
    function will return. If the function is told to overwrite and there is a
    digest mismatch, the existing file will be overwritten.
    Returns True on success, False otherwise."""
    if file_record.present():
        if file_record.validate():
            log.info("existing '%s' is valid, not fetching" % file_record.filename)
            return True
        if overwrite:
            log.info("overwriting '%s' as requested" % file_record.filename)
        else:
            # All of the following is for a useful error message
            with open(file_record.filename, 'rb') as f:
                d = digest_file(f, file_record.algorithm)
            log.error("digest mismatch between manifest(%s...) and local file(%s...)" %
                      (file_record.digest[:8], d[:8]))
            log.debug("full digests: manifest (%s) local file (%s)" %
                      (file_record.digest, d))
            return False

    # Generate the URL for the file on the server side
    url = "%s/%s/%s" % (base_url, file_record.algorithm, file_record.digest)
    log.debug("fetching from '%s'" % url)

    # TODO: This should be abstracted to make generic retrieval protocol handling easy
    # Well, the file doesn't exist locally.  Lets fetch it.
    try:
        f = urllib2.urlopen(url)
        log.debug("opened %s for reading" % url)
        with open(file_record.filename, 'wb') as out:
            # TODO: print statistics as file transfers happen both for info and to stop
            size = 0
            while True:
                indata = f.read(grabchunk)
                out.write(indata)
                size += len(indata)
                if not indata:
                    break
        if size != file_record.size:
            log.error("transfer from %s to %s failed due to a difference of %d bytes" %
                      (url, file_record.filename, file_record.size - size))
            return False
        log.info("fetched %s" % file_record.filename)
    except (urllib2.URLError, urllib2.HTTPError) as e:
        log.error("failed to fetch '%s': %s" % (file_record.filename, e),
                  exc_info=True)
        return False
    except IOError:
        log.error("failed to write to '%s'" % file_record.filename,
                  exc_info=True)
        return False
    return True
412 # TODO: write tests for this function
def fetch_files(manifest_file, base_url, overwrite, filenames=None):
    """Fetch every file listed in manifest_file (or only those named in
    'filenames' when it is non-empty) from base_url, then validate them.

    Returns True when everything fetched and validated, False otherwise."""
    # BUG FIX: default changed from a shared mutable list ([]) to None;
    # an empty/None filenames still means "fetch everything".
    filenames = filenames or []
    # Lets load the manifest file
    try:
        manifest = open_manifest(manifest_file)
    except InvalidManifest:
        log.error("failed to load manifest file at '%s'" % manifest_file)
        return False
    # We want to track files that fail to be fetched as well as
    # files that are fetched
    failed_files = []
    fetched_files = []
    # Lets go through the manifest and fetch the files that we want
    for f in manifest.file_records:
        if f.filename in filenames or len(filenames) == 0:
            log.debug("fetching %s" % f.filename)
            if fetch_file(base_url, f, overwrite):
                fetched_files.append(f)
            else:
                failed_files.append(f.filename)
        else:
            log.debug("skipping %s" % f.filename)
    # Even if we get the file, lets ensure that it matches what the
    # manifest specified
    for localfile in fetched_files:
        if not localfile.validate():
            log.error("'%s'" % localfile.describe())
            failed_files.append(localfile.filename)
    # If we failed to fetch or validate a file, we need to fail
    if len(failed_files) > 0:
        log.error("The following files failed: '%s'" % "', ".join(failed_files))
        return False
    return True
449 # TODO: write tests for this function
def process_command(options, args):
    """ I know how to take a list of program arguments and
    start doing the right thing with them"""
    cmd = args[0]
    cmd_args = args[1:]
    log.debug("processing '%s' command with args '%s'" % (cmd, '", "'.join(cmd_args)))
    log.debug("using options: %s" % options)
    if cmd == 'list':
        return list_manifest(options['manifest'])
    if cmd == 'validate':
        return validate_manifest(options['manifest'])
    if cmd == 'add':
        return add_files(options['manifest'], options['algorithm'], cmd_args)
    if cmd == 'fetch':
        # options.get() returns None both when the key is missing and when
        # it is explicitly None, so this replaces the old has_key() check.
        if options.get('base_url') is None:
            log.critical('fetch command requires url option')
            return False
        return fetch_files(options['manifest'], options['base_url'],
                           options['overwrite'], cmd_args)
    log.critical('command "%s" is not implemented' % cmd)
    return False
473 # http://hostname/algorithm/hash
474 # example: http://people.mozilla.org/sha1/1234567890abcedf
475 # This will make it possible to have the server allow clients to
476 # use different algorithms than what was uploaded to the server
478 # TODO: Implement the following features:
479 # -optimization: do small files first, justification is that they are faster
480 # and cause a faster failure if they are invalid
482 # -local renames i.e. call the file one thing on the server and
483 # something different locally
484 # -deal with the cases:
485 # -local data matches file requested with different filename
486 # -two different files with same name, different hash
487 # -?only ever locally to digest as filename, symlink to real name
488 # -?maybe deal with files as a dir of the filename with all files in that dir as the versions of that file
489 # - e.g. ./python-2.6.7.dmg/0123456789abcdef and ./python-2.6.7.dmg/abcdef0123456789
def main():
    """Console entry point: configure logging, parse command-line options
    and config files, then dispatch to process_command()."""
    # Set up logging, for now just to the console
    ch = logging.StreamHandler()
    cf = logging.Formatter("%(levelname)s - %(message)s")
    ch.setFormatter(cf)

    # Set up option parsing
    parser = optparse.OptionParser()
    # I wish there was a way to say "only allow args to be
    # sequential and at the end of the argv.
    # OH! i could step through sys.argv and check for things starting without -/-- before things starting with them
    parser.add_option('-q', '--quiet', default=False,
                      dest='quiet', action='store_true')
    parser.add_option('-v', '--verbose', default=False,
                      dest='verbose', action='store_true')
    parser.add_option('-m', '--manifest', default='manifest.tt',
                      dest='manifest', action='store',
                      help='specify the manifest file to be operated on')
    parser.add_option('-d', '--algorithm', default='sha512',
                      dest='algorithm', action='store',
                      help='openssl hashing algorithm to use')
    parser.add_option('-o', '--overwrite', default=False,
                      dest='overwrite', action='store_true',
                      help='if fetching, remote copy will overwrite a local copy that is different. ')
    parser.add_option('--url', dest='base_url', action='store',
                      help='base url for fetching files')
    parser.add_option('--ignore-config-files', action='store_true', default=False,
                      dest='ignore_cfg_files')
    (options_obj, args) = parser.parse_args()
    # Dictionaries are easier to work with
    options = vars(options_obj)

    # Use some of the option parser to figure out application
    # log level
    if options.get('verbose'):
        ch.setLevel(logging.DEBUG)
    elif options.get('quiet'):
        ch.setLevel(logging.ERROR)
    else:
        ch.setLevel(logging.INFO)
    log.addHandler(ch)

    cfg_file = ConfigParser.SafeConfigParser()
    if not options.get("ignore_cfg_files"):
        read_files = cfg_file.read(['/etc/tooltool', os.path.expanduser('~/.tooltool'),
                                    os.path.join(os.getcwd(), '.tooltool')])
        log.debug("read in the config files '%s'" % '", '.join(read_files))
    else:
        log.debug("skipping config files")

    # Command-line options take precedence over config-file values.
    for option in ('base_url', 'algorithm'):
        if not options.get(option):
            try:
                options[option] = cfg_file.get('general', option)
                log.debug("read '%s' as '%s' from cfg_file" % (option, options[option]))
            except (ConfigParser.NoSectionError, ConfigParser.NoOptionError) as e:
                log.debug("%s in config file" % e, exc_info=True)

    if 'manifest' not in options:
        parser.error("no manifest file specified")
    if len(args) < 1:
        parser.error('You must specify a command')
    exit(0 if process_command(options, args) else 1)

if __name__ == "__main__":
    main()
else:
    # When imported as a library, avoid "no handlers" warnings from logging.
    log.addHandler(logging.NullHandler())
#log.addHandler(logging.StreamHandler())