Added sha256new algorithm
[zeroinstall/solver.git] / zeroinstall / zerostore / manifest.py
blobbda89e390a4f16cb01b248fe51f97f67e9c57a58
"""Processing of implementation manifests.

A manifest is a string representing a directory tree, with the property
that two trees will generate identical manifest strings if and only if:

 - They have exactly the same set of files, directories and symlinks.
 - For each pair of corresponding directories in the two sets:
   - The mtimes are the same (OldSHA1 only).
 - For each pair of corresponding files in the two sets:
   - The size, executable flag and mtime are the same.
   - The contents have matching secure hash values.
 - For each pair of corresponding symlinks in the two sets:
   - The mtime and size are the same.
   - The targets have matching secure hash values.

The manifest is typically processed with a secure hash itself. So, the idea is that
any significant change to the contents of the tree will change the secure hash value
of the manifest.

A top-level ".manifest" file is ignored.
"""
24 # Copyright (C) 2009, Thomas Leonard
25 # See the README file for details, or visit http://0install.net.
28 import os, stat, base64
29 from zeroinstall import SafeException, _
30 from zeroinstall.zerostore import BadDigest, parse_algorithm_digest_pair, format_algorithm_digest_pair
32 import hashlib
33 sha1_new = hashlib.sha1
class Algorithm:
    """Abstract base class for algorithms.

    An algorithm knows how to generate a manifest from a directory tree.
    Every method here raises NotImplementedError (a subclass of Exception);
    concrete algorithms override all three.

    @ivar rating: how much we like this algorithm (higher is better)
    @type rating: int
    """

    def generate_manifest(self, root):
        """Returns an iterator that yields each line of the manifest for the directory
        tree rooted at 'root'.
        @raise NotImplementedError: always, in this base class"""
        raise NotImplementedError('Abstract')

    def new_digest(self):
        """Create a new digest. Call update() on the returned object to digest the data.
        Call getID() to turn it into a full ID string.
        @raise NotImplementedError: always, in this base class"""
        raise NotImplementedError('Abstract')

    def getID(self, digest):
        """Convert a digest (from new_digest) to a full ID.
        @raise NotImplementedError: always, in this base class"""
        raise NotImplementedError('Abstract')
class OldSHA1(Algorithm):
    """@deprecated: Injector versions before 0.20 only supported this algorithm."""

    rating = 10

    def generate_manifest(self, root):
        """Yield the manifest lines for the directory tree rooted at 'root'.
        Entries are visited in sorted order within each directory, and a
        top-level '.manifest' is skipped. Line formats:
         - "D mtime path" for each directory except the root
         - "X digest mtime size leaf" / "F digest mtime size leaf" for files
           with / without any executable bit set
         - "S digest length leaf" for symlinks (digest and length of the target)
        @raise BadDigest: if a path contains a newline
        @raise SafeException: if an object is not a file, directory or symlink"""
        # Files are hashed in pieces of this size so that a very large file
        # never has to be held in memory all at once.
        chunk_size = 64 * 1024

        def recurse(sub):
            # To ensure that a line-by-line comparison of the manifests
            # is possible, we require that filenames don't contain newlines.
            # Otherwise, you can name a file so that the part after the \n
            # would be interpreted as another line in the manifest.
            if '\n' in sub: raise BadDigest(_("Newline in filename '%s'") % sub)
            assert sub.startswith('/')

            if sub == '/.manifest': return

            # 'sub' is always Unix-style; convert it for the local filesystem.
            full = os.path.join(root, sub[1:].replace('/', os.sep))
            info = os.lstat(full)

            m = info.st_mode
            if stat.S_ISDIR(m):
                if sub != '/':
                    yield "D %s %s" % (int(info.st_mtime), sub)
                items = os.listdir(full)
                items.sort()
                subdir = sub
                if not subdir.endswith('/'):
                    subdir += '/'
                for x in items:
                    for y in recurse(subdir + x):
                        yield y
                return

            assert sub[1:]
            leaf = os.path.basename(sub[1:])
            if stat.S_ISREG(m):
                digest = sha1_new()
                with open(full, 'rb') as stream:
                    while True:
                        chunk = stream.read(chunk_size)
                        if not chunk: break
                        digest.update(chunk)
                d = digest.hexdigest()
                if m & 0o111:
                    yield "X %s %s %s %s" % (d, int(info.st_mtime), info.st_size, leaf)
                else:
                    yield "F %s %s %s %s" % (d, int(info.st_mtime), info.st_size, leaf)
            elif stat.S_ISLNK(m):
                target = os.readlink(full).encode('utf-8')
                d = sha1_new(target).hexdigest()
                # Note: Can't use utime on symlinks, so skip mtime
                # Note: eCryptfs may report length as zero, so count ourselves instead
                yield "S %s %s %s" % (d, len(target), leaf)
            else:
                raise SafeException(_("Unknown object '%s' (not a file, directory or symlink)") %
                        full)
        for x in recurse('/'): yield x

    def new_digest(self):
        """Return a fresh, empty SHA-1 digest object."""
        return sha1_new()

    def getID(self, digest):
        """Return the full ID for 'digest': 'sha1=' followed by its hex digest."""
        return 'sha1=' + digest.hexdigest()
def get_algorithm(name):
    """Look-up an L{Algorithm} by name.
    @param name: key into the module-level 'algorithms' table
    @return: the matching algorithm object
    @raise BadDigest: if the name is unknown."""
    if name not in algorithms:
        raise BadDigest(_("Unknown algorithm '%s'") % name)
    return algorithms[name]
def generate_manifest(root, alg = 'sha1'):
    """@deprecated: use L{get_algorithm} and L{Algorithm.generate_manifest} instead.
    Looks up the algorithm named 'alg' and returns its manifest iterator for 'root'."""
    algorithm = get_algorithm(alg)
    return algorithm.generate_manifest(root)
def add_manifest_file(dir, digest_or_alg):
    """Writes a .manifest file into 'dir', and returns the digest.
    You should call fixup_permissions before this to ensure that the permissions are correct.
    On exit, dir itself has mode 555. Subdirectories are not changed.
    @param dir: root of the implementation
    @param digest_or_alg: should be an instance of Algorithm. Passing a digest
    here is deprecated.
    @return: the digest object, updated with the UTF-8 encoded manifest
    @raise SafeException: if 'dir' already contains a .manifest (file or symlink)"""
    mfile = os.path.join(dir, '.manifest')
    if os.path.islink(mfile) or os.path.exists(mfile):
        raise SafeException(_("Directory '%s' already contains a .manifest file!") % dir)
    if isinstance(digest_or_alg, Algorithm):
        alg = digest_or_alg
        digest = alg.new_digest()
    else:
        # Deprecated calling convention: the caller supplied a digest object,
        # which is used together with the 'sha1' algorithm.
        digest = digest_or_alg
        alg = get_algorithm('sha1')
    # Build the text in one join rather than by repeated string concatenation.
    manifest = ''.join([line + '\n' for line in alg.generate_manifest(dir)])
    manifest = manifest.encode('utf-8')
    digest.update(manifest)

    # Make dir writable just long enough to create the file inside it; it is
    # made read-only again as soon as the file has been opened.
    os.chmod(dir, 0o755)
    with open(mfile, 'wb') as stream:
        os.chmod(dir, 0o555)
        stream.write(manifest)
    os.chmod(mfile, 0o444)
    return digest
def splitID(id):
    """Take an ID in the form 'alg=value' and return a tuple (alg, value),
    where 'alg' is an instance of Algorithm and 'value' is a string.
    @raise BadDigest: if the algorithm isn't known or the ID has the wrong format."""
    alg_name, digest_value = parse_algorithm_digest_pair(id)
    algorithm = get_algorithm(alg_name)
    return (algorithm, digest_value)
def copy_with_verify(src, dest, mode, alg, required_digest):
    """Copy path src to dest, checking that the contents give the right digest.
    dest must not exist. New file is created with a mode of 'mode & umask'.
    If the copy fails part-way, or the digest is wrong, dest is removed again.
    @param src: source filename
    @type src: str
    @param dest: target filename
    @type dest: str
    @param mode: target mode
    @type mode: int
    @param alg: algorithm to generate digest
    @type alg: L{Algorithm}
    @param required_digest: expected digest value (compared against hexdigest())
    @type required_digest: str
    @raise BadDigest: the contents of the file don't match required_digest"""
    chunk_size = 64 * 1024
    with open(src, 'rb') as src_obj:
        # O_EXCL makes the open fail if dest already exists.
        dest_fd = os.open(dest, os.O_WRONLY | os.O_CREAT | os.O_EXCL, mode)
        copied = False
        try:
            digest = alg.new_digest()
            while True:
                data = src_obj.read(chunk_size)
                if not data: break
                digest.update(data)
                # os.write may write only part of the buffer; keep going
                # until the whole chunk has been written.
                while data:
                    written = os.write(dest_fd, data)
                    data = data[written:]
            copied = True
        finally:
            os.close(dest_fd)
            if not copied:
                # We created dest ourselves (O_EXCL), so it is safe to remove
                # the partial copy rather than leave it behind.
                os.unlink(dest)
    actual = digest.hexdigest()
    if actual == required_digest: return
    os.unlink(dest)
    raise BadDigest(_("Copy failed: file '%(src)s' has wrong digest (may have been tampered with)\n"
             "Expected: %(required_digest)s\n"
             "Actual:   %(actual_digest)s") % {'src': src, 'required_digest': required_digest, 'actual_digest': actual})
def verify(root, required_digest = None):
    """Ensure that directory 'root' generates the given digest.
    For a non-error return:
     - required_digest (by default, the name of 'root') must be a digest
       (in the form "alg=value")
     - The calculated digest of the contents must match it.
     - There must be a .manifest file, and its digest must also match.
    @param root: the directory to check
    @param required_digest: the expected ID, or None to use the basename of 'root'
    @raise BadDigest: if verification fails (the 'detail' attribute explains why)."""
    if required_digest is None:
        required_digest = os.path.basename(root)
    alg = splitID(required_digest)[0]

    # Digest of the manifest generated from the directory's current contents.
    digest = alg.new_digest()
    lines = []
    for line in alg.generate_manifest(root):
        line += '\n'
        digest.update(line.encode('utf-8'))
        lines.append(line)
    actual_digest = alg.getID(digest)

    # Digest of the recorded .manifest file, if there is one.
    manifest_file = os.path.join(root, '.manifest')
    if os.path.isfile(manifest_file):
        digest = alg.new_digest()
        with open(manifest_file, 'rb') as stream:
            digest.update(stream.read())
        manifest_digest = alg.getID(digest)
    else:
        manifest_digest = None

    if required_digest == actual_digest == manifest_digest:
        return

    error = BadDigest(_("Cached item does NOT verify."))

    error.detail = _(" Expected: %(required_digest)s\n"
             "   Actual: %(actual_digest)s\n"
             ".manifest digest: %(manifest_digest)s\n\n") \
             % {'required_digest': required_digest, 'actual_digest': actual_digest, 'manifest_digest': manifest_digest or _('No .manifest file')}

    if manifest_digest is None:
        error.detail += _("No .manifest, so no further details available.")
    elif manifest_digest == actual_digest:
        error.detail += _("The .manifest file matches the actual contents. Very strange!")
    elif manifest_digest == required_digest:
        import difflib, io
        # The manifest is stored as UTF-8, so decode it as UTF-8 rather than
        # with the locale's default encoding (io.open accepts 'encoding').
        with io.open(manifest_file, 'rt', encoding = 'utf-8') as stream:
            diff = difflib.unified_diff(stream.readlines(), lines,
                            'Recorded', 'Actual')
        error.detail += _("The .manifest file matches the directory name.\n" \
                "The contents of the directory have changed:\n") + \
                ''.join(diff)
    elif required_digest == actual_digest:
        error.detail += _("The directory contents are correct, but the .manifest file is wrong!")
    else:
        error.detail += _("The .manifest file matches neither of the other digests. Odd.")
    raise error
# XXX: Be more careful about the source tree changing under us. In particular, what happens if:
# - A regular file suddenly turns into a symlink?
# - We find a device file (users can hard-link them if on the same device)
def copy_tree_with_verify(source, target, manifest_data, required_digest):
    """Copy directory source to be a subdirectory of target if it matches the required_digest.
    manifest_data is normally source/.manifest. source and manifest_data are not trusted
    (will typically be under the control of another user).
    The copy is first done to a temporary directory in target, then renamed to the final name
    only if correct. Therefore, an invalid 'target/required_digest' will never exist.
    A successful return means that target/required_digest now exists (whether we created it or not).
    @param source: directory to copy from
    @param target: directory in which the 'required_digest' subdirectory is created
    @param manifest_data: the manifest contents, as bytes (it is digested, then decoded as UTF-8)
    @param required_digest: the expected ID, in the form 'alg=value'
    @raise BadDigest: if manifest_data does not have the required digest
    @raise SafeException: if the algorithm is 'sha1', if files listed in the manifest are
    missing from source, or if the finished copy does not have the required digest"""
    import tempfile
    from logging import info

    alg, digest_value = splitID(required_digest)

    if isinstance(alg, OldSHA1):
        raise SafeException(_("Sorry, the 'sha1' algorithm does not support copying."))

    # Check the supplied manifest against the required digest before trusting it.
    digest = alg.new_digest()
    digest.update(manifest_data)
    manifest_digest = alg.getID(digest)

    if manifest_digest != required_digest:
        raise BadDigest(_("Manifest has been tampered with!\n"
                "Manifest digest: %(actual_digest)s\n"
                "Directory name : %(required_digest)s")
                % {'actual_digest': manifest_digest, 'required_digest': required_digest})

    target_impl = os.path.join(target, required_digest)
    if os.path.isdir(target_impl):
        info(_("Target directory '%s' already exists"), target_impl)
        return

    # We've checked that the source's manifest matches required_digest, so it
    # is what we want. Make a list of all the files we need to copy...

    wanted = _parse_manifest(manifest_data.decode('utf-8'))

    tmpdir = tempfile.mkdtemp(prefix = 'tmp-copy-', dir = target)
    try:
        # _copy_files removes each entry it handles from 'wanted', so anything
        # left afterwards was listed in the manifest but not found in source.
        _copy_files(alg, wanted, source, tmpdir)

        if wanted:
            raise SafeException(_('Copy failed; files missing from source:') + '\n- ' +
                        '\n- '.join(wanted.keys()))

        # Make directories read-only (files are already RO)
        for root, dirs, files in os.walk(tmpdir):
            for d in dirs:
                path = os.path.join(root, d)
                mode = os.stat(path).st_mode
                os.chmod(path, mode & 0o555)

        # Check that the copy is correct
        actual_digest = alg.getID(add_manifest_file(tmpdir, alg))
        if actual_digest != required_digest:
            raise SafeException(_("Copy failed; double-check of target gave the wrong digest.\n"
                         "Unless the target was modified during the copy, this is a BUG\n"
                         "in 0store and should be reported.\n"
                         "Expected: %(required_digest)s\n"
                         "Actual:   %(actual_digest)s") % {'required_digest': required_digest, 'actual_digest': actual_digest})
        try:
            os.chmod(tmpdir, 0o755)     # need write permission to rename on MacOS X
            os.rename(tmpdir, target_impl)
            os.chmod(target_impl, 0o555)
            # The directory now lives at target_impl; tell the cleanup below to leave it.
            tmpdir = None
        except OSError:
            if not os.path.isdir(target_impl):
                raise
            # else someone else installed it already - return success
    finally:
        # Anything still at tmpdir is a failed (or redundant) copy: remove it.
        if tmpdir is not None:
            info(_("Deleting tmpdir '%s'") % tmpdir)
            from zeroinstall.support import ro_rmtree
            ro_rmtree(tmpdir)
329 def _parse_manifest(manifest_data):
330 """Parse a manifest file.
331 @param manifest_data: the contents of the manifest file
332 @type manifest_data: str
333 @return: a mapping from paths to information about that path
334 @rtype: {str: tuple}"""
335 wanted = {}
336 dir = ''
337 for line in manifest_data.split('\n'):
338 if not line: break
339 if line[0] == 'D':
340 data = line.split(' ', 1)
341 if len(data) != 2: raise BadDigest(_("Bad line '%s'") % line)
342 path = data[-1]
343 if not path.startswith('/'): raise BadDigest(_("Not absolute: '%s'") % line)
344 path = path[1:]
345 dir = path
346 elif line[0] == 'S':
347 data = line.split(' ', 3)
348 path = os.path.join(dir, data[-1])
349 if len(data) != 4: raise BadDigest(_("Bad line '%s'") % line)
350 else:
351 data = line.split(' ', 4)
352 path = os.path.join(dir, data[-1])
353 if len(data) != 5: raise BadDigest(_("Bad line '%s'") % line)
354 if path in wanted:
355 raise BadDigest(_('Duplicate entry "%s"') % line)
356 wanted[path] = data[:-1]
357 return wanted
359 def _copy_files(alg, wanted, source, target):
360 """Scan for files under 'source'. For each one:
361 If it is in wanted and has the right details (or they can be fixed; e.g. mtime),
362 then copy it into 'target'.
363 If it's not in wanted, warn and skip it.
364 On exit, wanted contains only files that were not found."""
365 from logging import warn
366 dir = ''
367 for line in alg.generate_manifest(source):
368 if line[0] == 'D':
369 type, name = line.split(' ', 1)
370 assert name.startswith('/')
371 dir = name[1:]
372 path = dir
373 elif line[0] == 'S':
374 type, actual_digest, actual_size, name = line.split(' ', 3)
375 path = os.path.join(dir, name)
376 else:
377 assert line[0] in 'XF'
378 type, actual_digest, actual_mtime, actual_size, name = line.split(' ', 4)
379 path = os.path.join(dir, name)
380 try:
381 required_details = wanted.pop(path)
382 except KeyError:
383 warn(_("Skipping file not in manifest: '%s'"), path)
384 continue
385 if required_details[0] != type:
386 raise BadDigest(_("Item '%s' has wrong type!") % path)
387 if type == 'D':
388 os.mkdir(os.path.join(target, path))
389 elif type in 'XF':
390 required_type, required_digest, required_mtime, required_size = required_details
391 if required_size != actual_size:
392 raise SafeException(_("File '%(path)s' has wrong size (%(actual_size)s bytes, but should be "
393 "%(required_size)s according to manifest)") %
394 {'path': path, 'actual_size': actual_size, 'required_size': required_size})
395 required_mtime = int(required_mtime)
396 dest_path = os.path.join(target, path)
397 if type == 'X':
398 mode = 0o555
399 else:
400 mode = 0o444
401 copy_with_verify(os.path.join(source, path),
402 dest_path,
403 mode,
404 alg,
405 required_digest)
406 os.utime(dest_path, (required_mtime, required_mtime))
407 elif type == 'S':
408 required_type, required_digest, required_size = required_details
409 if required_size != actual_size:
410 raise SafeException(_("Symlink '%(path)s' has wrong size (%(actual_size)s bytes, but should be "
411 "%(required_size)s according to manifest)") %
412 {'path': path, 'actual_size': actual_size, 'required_size': required_size})
413 symlink_target = os.readlink(os.path.join(source, path))
414 symlink_digest = alg.new_digest()
415 symlink_digest.update(symlink_target.encode('utf-8'))
416 if symlink_digest.hexdigest() != required_digest:
417 raise SafeException(_("Symlink '%(path)s' has wrong target (digest should be "
418 "%(digest)s according to manifest)") % {'path': path, 'digest': required_digest})
419 dest_path = os.path.join(target, path)
420 os.symlink(symlink_target, dest_path)
421 else:
422 raise SafeException(_("Unknown manifest type %(type)s for '%(path)s'") % {'type': type, 'path': path})
class HashLibAlgorithm(Algorithm):
    """An algorithm whose digests are created by a constructor from hashlib.
    @ivar name: the algorithm's name, used as the prefix of the IDs it produces
    @ivar rating: how much we like this algorithm (higher is better)"""

    new_digest = None       # Constructor for digest objects

    def __init__(self, name, rating, hash_name = None):
        """@param name: the algorithm's name
        @param rating: how much we like this algorithm (higher is better)
        @param hash_name: name of the hashlib constructor to use (default: 'name')"""
        self.name = name
        self.new_digest = getattr(hashlib, hash_name or name)
        self.rating = rating

    def generate_manifest(self, root):
        """Yield the manifest lines for the directory tree rooted at 'root'.
        For each directory, a "D path" line (omitted for the root) is followed
        by its files and symlinks in sorted order, and then by its
        subdirectories. Regular files named '.manifest' are skipped. Line formats:
         - "X digest mtime size leaf" / "F digest mtime size leaf" for files
           with / without any executable bit set
         - "S digest length leaf" for symlinks (digest and length of the target)
        @raise BadDigest: if a path contains a newline
        @raise SafeException: if an object is not a file, directory or symlink"""
        # Files are hashed in pieces of this size so that a very large file
        # never has to be held in memory all at once.
        chunk_size = 64 * 1024

        def recurse(sub):
            # To ensure that a line-by-line comparison of the manifests
            # is possible, we require that filenames don't contain newlines.
            # Otherwise, you can name a file so that the part after the \n
            # would be interpreted as another line in the manifest.
            if '\n' in sub: raise BadDigest(_("Newline in filename '%s'") % sub)
            assert sub.startswith('/')

            full = os.path.join(root, sub[1:])
            info = os.lstat(full)
            new_digest = self.new_digest

            m = info.st_mode
            if not stat.S_ISDIR(m): raise Exception(_('Not a directory: "%s"') % full)
            if sub != '/':
                yield "D %s" % sub
            items = os.listdir(full)
            items.sort()
            dirs = []
            for leaf in items:
                path = os.path.join(root, sub[1:], leaf)
                info = os.lstat(path)
                m = info.st_mode

                if stat.S_ISREG(m):
                    if leaf == '.manifest': continue

                    digest = new_digest()
                    with open(path, 'rb') as stream:
                        while True:
                            chunk = stream.read(chunk_size)
                            if not chunk: break
                            digest.update(chunk)
                    d = digest.hexdigest()
                    if m & 0o111:
                        yield "X %s %s %s %s" % (d, int(info.st_mtime), info.st_size, leaf)
                    else:
                        yield "F %s %s %s %s" % (d, int(info.st_mtime), info.st_size, leaf)
                elif stat.S_ISLNK(m):
                    target = os.readlink(path).encode('utf-8')
                    d = new_digest(target).hexdigest()
                    # Note: Can't use utime on symlinks, so skip mtime
                    # Note: eCryptfs may report length as zero, so count ourselves instead
                    yield "S %s %s %s" % (d, len(target), leaf)
                elif stat.S_ISDIR(m):
                    # Subdirectories are emitted after all of this directory's own entries.
                    dirs.append(leaf)
                else:
                    raise SafeException(_("Unknown object '%s' (not a file, directory or symlink)") %
                            path)

            if not sub.endswith('/'):
                sub += '/'
            for x in dirs:
                # Note: "sub" is always Unix style. Don't use os.path.join here.
                for y in recurse(sub + x): yield y
            return

        for x in recurse('/'): yield x

    def getID(self, digest):
        """Convert a digest (from new_digest) to a full ID for this algorithm.
        'sha1new' and 'sha256' use the hex digest; any other name uses the
        unpadded base32 encoding of the raw digest."""
        if self.name in ('sha1new', 'sha256'):
            digest_str = digest.hexdigest()
        else:
            # Base32-encode newer algorithms to make the digest shorter.
            # We can't use base64 as Windows is case insensitive.
            # There's no need for padding (and = characters in paths cause problems for some software).
            digest_str = base64.b32encode(digest.digest()).rstrip(b'=').decode('ascii')
        return format_algorithm_digest_pair(self.name, digest_str)
# The supported algorithms, keyed by the name that get_algorithm() looks up
# (the 'alg' part of an 'alg=value' ID).
algorithms = {
    'sha1': OldSHA1(),
    'sha1new': HashLibAlgorithm('sha1new', 50, 'sha1'),
    'sha256': HashLibAlgorithm('sha256', 80),
    'sha256new': HashLibAlgorithm('sha256new', 90, 'sha256'),
}
def fixup_permissions(root):
    """Set permissions recursively for children of root:
     - If any X bit is set, they all must be.
     - World readable, non-writable.
    Every directory visited (including root itself) and every file in it is
    changed to mode 555 or 444; symlinks are left alone.
    @raise Exception: if there are unsafe special bits set (setuid, etc)."""
    for parent, subdirs, filenames in os.walk(root):
        # '.' stands for the directory being visited, so it is fixed up too.
        for entry in ['.'] + filenames:
            item = os.path.join(parent, entry)

            st_mode = os.lstat(item).st_mode
            if stat.S_ISLNK(st_mode):
                continue

            perms = stat.S_IMODE(st_mode)
            if perms & ~0o777:
                raise Exception(_("Unsafe mode: extracted file '%(filename)s' had special bits set in mode '%(mode)s'") % {'filename': item, 'mode': oct(perms)})
            new_perms = 0o555 if perms & 0o111 else 0o444
            os.chmod(item, new_perms)