Don't rely on filesystem's report of symlink length
[zeroinstall.git] / zeroinstall / zerostore / manifest.py
blobd818353aef50e1e9f21eb2f4840f5cda7dc527c4
1 """Processing of implementation manifests.
3 A manifest is a string representing a directory tree, with the property
4 that two trees will generate identical manifest strings if and only if:
6 - They have exactly the same set of files, directories and symlinks.
7 - For each pair of corresponding directories in the two sets:
8 - The mtimes are the same (OldSHA1 only).
9 - For each pair of corresponding files in the two sets:
10 - The size, executable flag and mtime are the same.
11 - The contents have matching secure hash values.
12 - For each pair of corresponding symlinks in the two sets:
13 - The size is the same (the mtime of a symlink is not recorded).
14 - The targets have matching secure hash values.
16 The manifest is typically processed with a secure hash itself. So, the idea is that
17 any significant change to the contents of the tree will change the secure hash value
18 of the manifest.
20 A top-level ".manifest" file is ignored.
21 """
23 # Copyright (C) 2009, Thomas Leonard
24 # See the README file for details, or visit http://0install.net.
26 from __future__ import generators
27 import os, stat
28 from zeroinstall import SafeException
29 from zeroinstall.zerostore import BadDigest
31 try:
32 import hashlib
33 sha1_new = hashlib.sha1
34 except:
35 import sha
36 sha1_new = sha.new
37 hashlib = None
class Algorithm:
    """Abstract base class for algorithms.
    An algorithm knows how to generate a manifest from a directory tree.
    Subclasses are expected to provide all three operations; the versions
    defined here only raise NotImplementedError.
    """
    def generate_manifest(self, root):
        """Returns an iterator that yields each line of the manifest for the directory
        tree rooted at 'root'.
        @raise NotImplementedError: always (abstract method)."""
        raise NotImplementedError('Abstract')

    def new_digest(self):
        """Create a new digest. Call update() on the returned object to digest the data.
        Call getID() to turn it into a full ID string.
        @raise NotImplementedError: always (abstract method)."""
        raise NotImplementedError('Abstract')

    def getID(self, digest):
        """Convert a digest (from new_digest) to a full ID.
        @raise NotImplementedError: always (abstract method)."""
        raise NotImplementedError('Abstract')
class OldSHA1(Algorithm):
    """@deprecated: Injector versions before 0.20 only supported this algorithm.

    Manifest lines produced:
      - "D <mtime> <path>" for every directory except the root
      - "X <sha1> <mtime> <size> <leaf>" for regular files with any execute bit set
      - "F <sha1> <mtime> <size> <leaf>" for all other regular files
      - "S <sha1 of target> <length of target> <leaf>" for symlinks
    IDs have the form "sha1=<hex digest>".
    """
    def generate_manifest(self, root):
        """Yield the manifest lines for the tree rooted at 'root'.
        Entries of each directory are visited in sorted order, descending into
        sub-directories as they are met. A top-level '.manifest' is skipped.
        @raise BadDigest: if a path contains a newline
        @raise SafeException: if an object is not a file, directory or symlink"""
        # Any of the three execute bits (octal 0111) marks a file as executable
        exec_bits = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH

        def hash_file(path):
            # Hash in chunks from a binary stream and always close the handle,
            # rather than slurping the whole file through an unclosed file object.
            digest = sha1_new()
            stream = open(path, 'rb')
            try:
                while True:
                    chunk = stream.read(65536)
                    if not chunk: break
                    digest.update(chunk)
            finally:
                stream.close()
            return digest.hexdigest()

        def recurse(sub):
            # To ensure that a line-by-line comparison of the manifests
            # is possible, we require that filenames don't contain newlines.
            # Otherwise, you can name a file so that the part after the \n
            # would be interpreted as another line in the manifest.
            if '\n' in sub: raise BadDigest("Newline in filename '%s'" % sub)
            assert sub.startswith('/')

            if sub == '/.manifest': return

            full = os.path.join(root, sub[1:])
            info = os.lstat(full)

            m = info.st_mode
            if stat.S_ISDIR(m):
                if sub != '/':
                    yield "D %s %s" % (int(info.st_mtime), sub)
                items = os.listdir(full)
                items.sort()
                for x in items:
                    for y in recurse(os.path.join(sub, x)):
                        yield y
                return

            assert sub[1:]
            leaf = os.path.basename(sub[1:])
            if stat.S_ISREG(m):
                d = hash_file(full)
                if m & exec_bits:
                    yield "X %s %s %s %s" % (d, int(info.st_mtime), info.st_size, leaf)
                else:
                    yield "F %s %s %s %s" % (d, int(info.st_mtime), info.st_size, leaf)
            elif stat.S_ISLNK(m):
                target = os.readlink(full)
                d = sha1_new(target).hexdigest()
                # Note: Can't use utime on symlinks, so skip mtime
                # Note: eCryptfs may report length as zero, so count ourselves instead
                yield "S %s %s %s" % (d, len(target), leaf)
            else:
                raise SafeException("Unknown object '%s' (not a file, directory or symlink)" %
                        full)
        for x in recurse('/'): yield x

    def new_digest(self):
        """Return a fresh, empty SHA-1 digest object."""
        return sha1_new()

    def getID(self, digest):
        """Return the full ID ('sha1=' followed by the hex digest) for 'digest'."""
        return 'sha1=' + digest.hexdigest()
def get_algorithm(name):
    """Return the L{Algorithm} registered under 'name' in the algorithms table.
    @param name: algorithm name (the part of an ID before the '=')
    @raise BadDigest: if the name is unknown."""
    if name in algorithms:
        return algorithms[name]
    raise BadDigest("Unknown algorithm '%s'" % name)
def generate_manifest(root, alg = 'sha1'):
    """@deprecated: use L{get_algorithm} and L{Algorithm.generate_manifest} instead.
    @param root: top of the directory tree
    @param alg: name of the algorithm to look up
    @return: whatever the named algorithm's generate_manifest(root) returns"""
    algorithm = get_algorithm(alg)
    return algorithm.generate_manifest(root)
def add_manifest_file(dir, digest_or_alg):
    """Writes a .manifest file into 'dir', and returns the digest.
    You should call fixup_permissions before this to ensure that the permissions are correct.
    On exit, dir itself has mode 555. Subdirectories are not changed.
    @param dir: root of the implementation
    @param digest_or_alg: should be an instance of Algorithm. Passing a digest
    here is deprecated.
    @return: the digest object, after it has been updated with the manifest text
    @raise SafeException: if dir already contains a .manifest"""
    # Spelled with stat constants so the values read the same on any Python version
    mode_0755 = stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH
    mode_0555 = mode_0755 & ~stat.S_IWUSR
    mode_0444 = stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH

    mfile = os.path.join(dir, '.manifest')
    if os.path.islink(mfile) or os.path.exists(mfile):
        raise SafeException("Directory '%s' already contains a .manifest file!" % dir)

    if isinstance(digest_or_alg, Algorithm):
        alg = digest_or_alg
        digest = alg.new_digest()
    else:
        # Deprecated calling convention: a ready-made digest implies the old 'sha1' format
        digest = digest_or_alg
        alg = get_algorithm('sha1')

    # Join once instead of growing a string line by line
    manifest = ''.join([line + '\n' for line in alg.generate_manifest(dir)])
    digest.update(manifest)

    # The directory only needs to be writable while the file is being created.
    # Restore the read-only mode even if creating the file fails.
    os.chmod(dir, mode_0755)
    try:
        # Binary mode so the bytes on disk are exactly the bytes that were digested
        stream = open(mfile, 'wb')
    finally:
        os.chmod(dir, mode_0555)
    try:
        stream.write(manifest)
    finally:
        stream.close()
    os.chmod(mfile, mode_0444)
    return digest
def splitID(id):
    """Split an ID of the form 'alg=value' at its first '='.
    @param id: the full ID string
    @return: a tuple (alg, value), where 'alg' is an instance of Algorithm
    and 'value' is the string after the first '='
    @raise BadDigest: if the algorithm isn't known or the ID has the wrong format."""
    pieces = id.split('=', 1)
    if len(pieces) == 2:
        alg_name, value = pieces
        return (get_algorithm(alg_name), value)
    raise BadDigest("Digest '%s' is not in the form 'algorithm=value'" % id)
def copy_with_verify(src, dest, mode, alg, required_digest):
    """Copy path src to dest, checking that the contents give the right digest.
    dest must not exist. New file is created with a mode of 'mode & umask'.
    If the digest is wrong, dest is deleted again before the error is raised.
    @param src: source filename
    @type src: str
    @param dest: target filename
    @type dest: str
    @param mode: target mode
    @type mode: int
    @param alg: algorithm to generate digest
    @type alg: L{Algorithm}
    @param required_digest: expected digest value
    @type required_digest: str
    @raise BadDigest: the contents of the file don't match required_digest"""
    # O_BINARY only exists on platforms that distinguish text from binary files
    open_flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL | getattr(os, 'O_BINARY', 0)

    src_obj = open(src, 'rb')
    try:
        # Opened inside the outer try so that src_obj is closed if dest can't be created
        dest_fd = os.open(dest, open_flags, mode)
        try:
            digest = alg.new_digest()
            while True:
                data = src_obj.read(65536)
                if not data: break
                digest.update(data)
                # os.write may write less than it was given; keep going until done
                while data:
                    written = os.write(dest_fd, data)
                    assert written >= 0
                    data = data[written:]
        finally:
            os.close(dest_fd)
    finally:
        src_obj.close()

    actual = digest.hexdigest()
    if actual == required_digest: return
    # Never leave a file with the wrong contents behind
    os.unlink(dest)
    raise BadDigest(("Copy failed: file '%s' has wrong digest (may have been tampered with)\n"
             "Expected: %s\n"
             "Actual:   %s") % (src, required_digest, actual))
def verify(root, required_digest = None):
    """Ensure that directory 'root' generates the given digest.
    For a non-error return:
    - The required digest must be in the form "alg=value" (root's own name is
      used when required_digest is None)
    - The calculated digest of the contents must match it.
    - There must be a .manifest file, and its digest must also match.
    @param root: the directory to check
    @param required_digest: the expected ID, or None to use the basename of root
    @raise BadDigest: if verification fails; the exception's 'detail' attribute
    describes which of the digests disagree."""
    if required_digest is None:
        required_digest = os.path.basename(root)
    alg = splitID(required_digest)[0]

    # Digest of the manifest as regenerated from the directory's current contents
    digest = alg.new_digest()
    lines = []
    for line in alg.generate_manifest(root):
        line += '\n'
        digest.update(line)
        lines.append(line)
    actual_digest = alg.getID(digest)

    # Digest of the manifest as recorded on disk. Read it once, and close the
    # file, so the digest and any later diff are based on the same bytes.
    manifest_file = os.path.join(root, '.manifest')
    if os.path.isfile(manifest_file):
        stream = open(manifest_file, 'rb')
        try:
            manifest_data = stream.read()
        finally:
            stream.close()
        digest = alg.new_digest()
        digest.update(manifest_data)
        manifest_digest = alg.getID(digest)
    else:
        manifest_data = None
        manifest_digest = None

    if required_digest == actual_digest == manifest_digest:
        return

    error = BadDigest("Cached item does NOT verify.")

    error.detail = " Expected digest: " + required_digest + "\n" + \
               "   Actual digest: " + actual_digest + "\n" + \
               ".manifest digest: " + (manifest_digest or 'No .manifest file') + "\n\n"

    if manifest_digest is None:
        error.detail += "No .manifest, so no further details available."
    elif manifest_digest == actual_digest:
        error.detail += "The .manifest file matches the actual contents. Very strange!"
    elif manifest_digest == required_digest:
        import difflib
        # Split the recorded manifest into '\n'-terminated lines for the diff
        pieces = manifest_data.split('\n')
        recorded = [piece + '\n' for piece in pieces[:-1]]
        if pieces[-1]:
            recorded.append(pieces[-1])
        diff = difflib.unified_diff(recorded, lines,
                        'Recorded', 'Actual')
        error.detail += "The .manifest file matches the directory name.\n" \
                "The contents of the directory have changed:\n" + \
                ''.join(diff)
    elif required_digest == actual_digest:
        error.detail += "The directory contents are correct, but the .manifest file is wrong!"
    else:
        error.detail += "The .manifest file matches neither of the other digests. Odd."
    raise error
248 # XXX: Be more careful about the source tree changing under us. In particular, what happens if:
249 # - A regular file suddenly turns into a symlink?
250 # - We find a device file (users can hard-link them if on the same device)
def copy_tree_with_verify(source, target, manifest_data, required_digest):
    """Copy directory source to be a subdirectory of target if it matches the required_digest.
    manifest_data is normally source/.manifest. source and manifest_data are not trusted
    (will typically be under the control of another user).
    The copy is first done to a temporary directory in target, then renamed to the final name
    only if correct. Therefore, an invalid 'target/required_digest' will never exist.
    A successful return means that target/required_digest now exists (whether we created it or not).
    @param source: directory to copy from
    @param target: directory in which the copy is created
    @param manifest_data: contents of the manifest describing source
    @param required_digest: the ID ("alg=value") the copy must have
    @raise BadDigest: if manifest_data does not have the required digest
    @raise SafeException: if the algorithm is the old 'sha1', files are missing
    from source, or the finished copy has the wrong digest"""
    import tempfile, shutil
    from logging import info

    # 0755, spelled with stat constants
    mode_0755 = stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH

    alg = splitID(required_digest)[0]

    if isinstance(alg, OldSHA1):
        raise SafeException("Sorry, the 'sha1' algorithm does not support copying.")

    digest = alg.new_digest()
    digest.update(manifest_data)
    manifest_digest = alg.getID(digest)

    if manifest_digest != required_digest:
        raise BadDigest("Manifest has been tampered with!\n"
                "Manifest digest: " + manifest_digest + "\n"
                "Directory name : " + required_digest)

    target_impl = os.path.join(target, required_digest)
    if os.path.isdir(target_impl):
        info("Target directory '%s' already exists", target_impl)
        return

    # We've checked that the source's manifest matches required_digest, so it
    # is what we want. Make a list of all the files we need to copy...

    wanted = _parse_manifest(manifest_data)

    tmpdir = tempfile.mkdtemp(prefix = 'tmp-copy-', dir = target)

    def discard_tmpdir():
        # add_manifest_file() leaves the directory read-only (mode 555), and
        # entries can't be removed from a read-only directory, so make it
        # writable again before deleting it.
        os.chmod(tmpdir, mode_0755)
        shutil.rmtree(tmpdir)

    already_present = False
    try:
        _copy_files(alg, wanted, source, tmpdir)

        if wanted:
            raise SafeException('Copy failed; files missing from source:\n- ' +
                        '\n- '.join(wanted.keys()))

        # Check that the copy is correct
        actual_digest = alg.getID(add_manifest_file(tmpdir, alg))
        if actual_digest != required_digest:
            raise SafeException(("Copy failed; double-check of target gave the wrong digest.\n"
                         "Unless the target was modified during the copy, this is a BUG\n"
                         "in 0store and should be reported.\n"
                         "Expected: %s\n"
                         "Actual:   %s") % (required_digest, actual_digest))
        try:
            os.rename(tmpdir, target_impl)
        except OSError:
            # The final directory may have been created by someone else while
            # we were copying; that still satisfies our contract.
            if not os.path.isdir(target_impl):
                raise
            already_present = True
    except:
        # Deliberately catches everything: this is clean-up only, and the
        # original exception is always re-raised.
        info("Deleting tmpdir '%s'", tmpdir)
        discard_tmpdir()
        raise

    if already_present:
        info("Target directory '%s' already exists", target_impl)
        discard_tmpdir()
309 def _parse_manifest(manifest_data):
310 """Parse a manifest file.
311 @param manifest_data: the contents of the manifest file
312 @type manifest_data: str
313 @return: a mapping from paths to information about that path
314 @rtype: {str: tuple}"""
315 wanted = {}
316 dir = ''
317 for line in manifest_data.split('\n'):
318 if not line: break
319 if line[0] == 'D':
320 data = line.split(' ', 1)
321 if len(data) != 2: raise BadDigest("Bad line '%s'" % line)
322 path = data[-1]
323 if not path.startswith('/'): raise BadDigest("Not absolute: '%s'" % line)
324 path = path[1:]
325 dir = path
326 elif line[0] == 'S':
327 data = line.split(' ', 3)
328 path = os.path.join(dir, data[-1])
329 if len(data) != 4: raise BadDigest("Bad line '%s'" % line)
330 else:
331 data = line.split(' ', 4)
332 path = os.path.join(dir, data[-1])
333 if len(data) != 5: raise BadDigest("Bad line '%s'" % line)
334 if path in wanted:
335 raise BadDigest('Duplicate entry "%s"' % line)
336 wanted[path] = data[:-1]
337 return wanted
339 def _copy_files(alg, wanted, source, target):
340 """Scan for files under 'source'. For each one:
341 If it is in wanted and has the right details (or they can be fixed; e.g. mtime),
342 then copy it into 'target'.
343 If it's not in wanted, warn and skip it.
344 On exit, wanted contains only files that were not found."""
345 from logging import warn
346 dir = ''
347 for line in alg.generate_manifest(source):
348 if line[0] == 'D':
349 type, name = line.split(' ', 1)
350 assert name.startswith('/')
351 dir = name[1:]
352 path = dir
353 elif line[0] == 'S':
354 type, actual_digest, actual_size, name = line.split(' ', 3)
355 path = os.path.join(dir, name)
356 else:
357 assert line[0] in 'XF'
358 type, actual_digest, actual_mtime, actual_size, name = line.split(' ', 4)
359 path = os.path.join(dir, name)
360 try:
361 required_details = wanted.pop(path)
362 except KeyError:
363 warn("Skipping file not in manifest: '%s'", path)
364 continue
365 if required_details[0] != type:
366 raise BadDigest("Item '%s' has wrong type!" % path)
367 if type == 'D':
368 os.mkdir(os.path.join(target, path))
369 elif type in 'XF':
370 required_type, required_digest, required_mtime, required_size = required_details
371 if required_size != actual_size:
372 raise SafeException("File '%s' has wrong size (%s bytes, but should be "
373 "%s according to manifest)" %
374 (path, actual_size, required_size))
375 required_mtime = int(required_mtime)
376 dest_path = os.path.join(target, path)
377 if type == 'X':
378 mode = 0555
379 else:
380 mode = 0444
381 copy_with_verify(os.path.join(source, path),
382 dest_path,
383 mode,
384 alg,
385 required_digest)
386 os.utime(dest_path, (required_mtime, required_mtime))
387 elif type == 'S':
388 required_type, required_digest, required_size = required_details
389 if required_size != actual_size:
390 raise SafeException("Symlink '%s' has wrong size (%s bytes, but should be "
391 "%s according to manifest)" %
392 (path, actual_size, required_size))
393 symlink_target = os.readlink(os.path.join(source, path))
394 symlink_digest = alg.new_digest()
395 symlink_digest.update(symlink_target)
396 if symlink_digest.hexdigest() != required_digest:
397 raise SafeException("Symlink '%s' has wrong target (digest should be "
398 "%s according to manifest)" % (path, required_digest))
399 dest_path = os.path.join(target, path)
400 os.symlink(symlink_target, dest_path)
401 else:
402 raise SafeException("Unknown manifest type %s for '%s'" % (type, path))
class HashLibAlgorithm(Algorithm):
    """Manifest algorithm that uses a digest constructor looked up by name.

    Manifest lines produced, directory by directory:
      - "D <path>" for every directory except the root (no mtime)
      - "X <digest> <mtime> <size> <leaf>" for regular files with any execute bit set
      - "F <digest> <mtime> <size> <leaf>" for all other regular files
      - "S <digest of target> <length of target> <leaf>" for symlinks
    Files and symlinks of a directory come first, in sorted order; its
    sub-directories follow. IDs have the form "<name>=<hex digest>".
    """
    new_digest = None # Constructor for digest objects

    def __init__(self, name):
        """@param name: 'sha1' (which is given the ID prefix 'sha1new'), or the
        name of a digest constructor in the hashlib module."""
        if name == 'sha1':
            self.new_digest = sha1_new
            self.name = 'sha1new'
        else:
            self.new_digest = getattr(hashlib, name)
            self.name = name

    def _hash_file(self, path):
        """Return the hex digest of the contents of the file at 'path'."""
        # Hash in chunks from a binary stream and always close the handle,
        # rather than slurping the whole file through an unclosed file object.
        digest = self.new_digest()
        stream = open(path, 'rb')
        try:
            while True:
                chunk = stream.read(65536)
                if not chunk: break
                digest.update(chunk)
        finally:
            stream.close()
        return digest.hexdigest()

    def generate_manifest(self, root):
        """Yield the manifest lines for the tree rooted at 'root'.
        Regular files called '.manifest' are left out.
        @raise BadDigest: if any name contains a newline
        @raise SafeException: if an object is not a file, directory or symlink"""
        # Any of the three execute bits (octal 0111) marks a file as executable
        exec_bits = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
        new_digest = self.new_digest
        hash_file = self._hash_file

        def recurse(sub):
            # To ensure that a line-by-line comparison of the manifests
            # is possible, we require that filenames don't contain newlines.
            # Otherwise, you can name a file so that the part after the \n
            # would be interpreted as another line in the manifest.
            if '\n' in sub: raise BadDigest("Newline in filename '%s'" % sub)
            assert sub.startswith('/')

            full = os.path.join(root, sub[1:])
            info = os.lstat(full)

            m = info.st_mode
            if not stat.S_ISDIR(m): raise Exception('Not a directory: "%s"' % full)
            if sub != '/':
                yield "D %s" % sub
            items = os.listdir(full)
            items.sort()
            dirs = []
            for leaf in items:
                # The check at the top only sees directory paths. File and
                # symlink names end up in manifest lines too, so they need
                # the same protection.
                if '\n' in leaf:
                    raise BadDigest("Newline in filename '%s'" % os.path.join(sub, leaf))

                path = os.path.join(root, sub[1:], leaf)
                info = os.lstat(path)
                m = info.st_mode

                if stat.S_ISREG(m):
                    if leaf == '.manifest': continue

                    d = hash_file(path)
                    if m & exec_bits:
                        yield "X %s %s %s %s" % (d, int(info.st_mtime), info.st_size, leaf)
                    else:
                        yield "F %s %s %s %s" % (d, int(info.st_mtime), info.st_size, leaf)
                elif stat.S_ISLNK(m):
                    target = os.readlink(path)
                    d = new_digest(target).hexdigest()
                    # Note: Can't use utime on symlinks, so skip mtime
                    # Note: eCryptfs may report length as zero, so count ourselves instead
                    yield "S %s %s %s" % (d, len(target), leaf)
                elif stat.S_ISDIR(m):
                    # Sub-directories are listed after this directory's own entries
                    dirs.append(leaf)
                else:
                    raise SafeException("Unknown object '%s' (not a file, directory or symlink)" %
                            path)
            for x in dirs:
                for y in recurse(os.path.join(sub, x)): yield y
            return

        for x in recurse('/'): yield x

    def getID(self, digest):
        """Return the full ID (this algorithm's name, '=', then the hex digest)."""
        return self.name + '=' + digest.hexdigest()
# Maps an algorithm name (the part of an ID before the '=') to the object
# implementing it. Looked up by get_algorithm().
algorithms = {
    'sha1': OldSHA1(),
    'sha1new': HashLibAlgorithm('sha1'),
}

# hashlib is None when the import at the top of the file fell back to the old
# 'sha' module, in which case no sha256 constructor is available.
if hashlib is not None:
    algorithms['sha256'] = HashLibAlgorithm('sha256')
def fixup_permissions(root):
    """Normalise the permissions of everything found by walking 'root'.
    Every directory visited (including root itself) and every file in it ends up
    with one of two modes:
    - 555 if any of its execute bits was set;
    - 444 otherwise.
    Symlinks are left alone.
    @param root: top of the tree to process
    @raise Exception: if there are unsafe special bits set (setuid, etc)."""
    # Octal 0111, 0444, 0555 and 0777, spelled with stat constants
    any_exec = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
    read_only = stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH
    read_exec = read_only | any_exec
    plain_bits = stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO

    for dirpath, _subdirs, filenames in os.walk(root):
        # '.' stands for the directory being visited, so it is treated like its files
        for name in ['.'] + filenames:
            full = os.path.join(dirpath, name)

            raw_mode = os.lstat(full).st_mode
            if not stat.S_ISLNK(raw_mode):
                mode = stat.S_IMODE(raw_mode)
                if mode & ~plain_bits:
                    raise Exception("Unsafe mode: extracted file '%s' had special bits set in mode '%s'" % (full, oct(mode)))
                if mode & any_exec:
                    new_mode = read_exec
                else:
                    new_mode = read_only
                os.chmod(full, new_mode)