Added '0store copy' command. This copies a possibly-untrusted directory
[zeroinstall.git] / zeroinstall / zerostore / manifest.py
blobc939b2220dda202c06feaf148a7151820a656cca
1 # Copyright (C) 2006, Thomas Leonard
2 # See the README file for details, or visit http://0install.net.
4 from __future__ import generators
5 import os, stat
6 from sets import Set
7 import sha
8 from zeroinstall import SafeException
9 from zeroinstall.zerostore import BadDigest
11 try:
12 import hashlib
13 except:
14 hashlib = None
16 """A manifest is a string representing a directory tree, with the property
17 that two trees will generate identical manifest strings if and only if:
19 - They have exactly the same set of files, directories and symlinks.
20 - For each pair of corresponding directories in the two sets:
21 - The mtimes are the same.
22 - For each pair of corresponding files in the two sets:
23 - The size, executable flag and mtime are the same.
24 - The contents have matching SHA1 sums.
25 - For each pair of corresponding symlinks in the two sets:
26 - The mtime and size are the same.
27 - The targets have matching SHA1 sums.
29 The manifest is typically processed with SHA1 itself. So, the idea is that
30 any significant change to the contents of the tree will change the SHA1 sum
31 of the manifest.
33 A top-level ".manifest" file is ignored.
34 """
class Algorithm:
    """Abstract interface shared by the manifest algorithms in this module.

    Every method here only raises Exception('Abstract'); concrete subclasses
    are expected to override all three."""

    def generate_manifest(self, root):
        """Returns an iterator that yields each line of the manifest for the directory
        tree rooted at 'root'."""
        # 'self' was missing from the original signature, so calling this on
        # an instance raised TypeError instead of the intended error.
        raise Exception('Abstract')

    def new_digest(self):
        """Create a new digest. Call update() on the returned object to digest the data.
        Call getID() to turn it into a full ID string."""
        raise Exception('Abstract')

    def getID(self, digest):
        """Convert a digest (from new_digest) to a full ID."""
        raise Exception('Abstract')
class OldSHA1(Algorithm):
    """The 'sha1' manifest algorithm.

    Manifest lines have one of these forms:
      D <mtime> <path>                   directory (path is absolute within the tree)
      F <digest> <mtime> <size> <leaf>   regular file
      X <digest> <mtime> <size> <leaf>   regular file with any execute bit set
      S <digest> <size> <leaf>           symlink (digest is of the link target)
    A top-level '.manifest' entry is skipped."""

    def generate_manifest(self, root):
        """Yield each line of the manifest for the tree rooted at 'root'.
        Entries within a directory are visited in sorted order.
        Raises BadDigest if a name contains a newline, and SafeException for
        anything that is not a regular file, directory or symlink."""
        # Same value as the octal literal 0111, spelt portably.
        exec_bits = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH

        def read_all(path):
            # Binary mode, so the digest covers the exact bytes on disk, and an
            # explicit close() so the handle isn't left for the garbage collector.
            stream = open(path, 'rb')
            try:
                return stream.read()
            finally:
                stream.close()

        def recurse(sub):
            # To ensure that a line-by-line comparison of the manifests
            # is possible, we require that filenames don't contain newlines.
            # Otherwise, you can name a file so that the part after the \n
            # would be interpreted as another line in the manifest.
            if '\n' in sub: raise BadDigest("Newline in filename '%s'" % sub)
            assert sub.startswith('/')

            if sub == '/.manifest': return

            full = os.path.join(root, sub[1:])
            info = os.lstat(full)

            m = info.st_mode
            # st_mtime may be a float on some interpreters; always format it as
            # an integer so the manifest text doesn't depend on that.
            mtime = int(info.st_mtime)
            if stat.S_ISDIR(m):
                if sub != '/':
                    yield "D %s %s" % (mtime, sub)
                items = os.listdir(full)
                items.sort()
                for x in items:
                    for y in recurse(os.path.join(sub, x)):
                        yield y
                return

            assert sub[1:]
            leaf = os.path.basename(sub[1:])
            if stat.S_ISREG(m):
                d = sha.new(read_all(full)).hexdigest()
                if m & exec_bits:
                    yield "X %s %s %s %s" % (d, mtime, info.st_size, leaf)
                else:
                    yield "F %s %s %s %s" % (d, mtime, info.st_size, leaf)
            elif stat.S_ISLNK(m):
                d = sha.new(os.readlink(full)).hexdigest()
                # Note: Can't use utime on symlinks, so skip mtime
                yield "S %s %s %s" % (d, info.st_size, leaf)
            else:
                raise SafeException("Unknown object '%s' (not a file, directory or symlink)" %
                        full)

        for x in recurse('/'): yield x

    def new_digest(self):
        """Return a fresh, empty SHA-1 digest object."""
        return sha.new()

    def getID(self, digest):
        """Return the full ID string ('sha1=' followed by the hex digest)."""
        return 'sha1=' + digest.hexdigest()
def get_algorithm(name):
    """Return the Algorithm registered under 'name' in the module's
    'algorithms' table. Raises BadDigest if there is no such entry."""
    if name not in algorithms:
        raise BadDigest("Unknown algorithm '%s'" % name)
    return algorithms[name]
def generate_manifest(root, alg = 'sha1'):
    """Return the manifest line iterator for the tree rooted at 'root', using
    the algorithm named 'alg'. Raises BadDigest for an unknown algorithm name."""
    algorithm = get_algorithm(alg)
    return algorithm.generate_manifest(root)
def add_manifest_file(dir, digest_or_alg):
    """Writes a .manifest file into 'dir', and returns the digest.
    Second argument should be an instance of Algorithm. Passing a digest
    here is deprecated (the 'sha1' algorithm is then used to generate the
    manifest, and the given digest object is updated and returned).
    Raises SafeException if 'dir' already contains a .manifest entry."""
    mfile = os.path.join(dir, '.manifest')
    # islink() is checked as well because exists() is False for a dangling symlink.
    if os.path.islink(mfile) or os.path.exists(mfile):
        raise SafeException("Directory '%s' already contains a .manifest file!" % dir)
    if isinstance(digest_or_alg, Algorithm):
        alg = digest_or_alg
        digest = alg.new_digest()
    else:
        digest = digest_or_alg
        alg = get_algorithm('sha1')
    # Join once instead of growing a string line by line.
    manifest = ''.join([line + '\n' for line in alg.generate_manifest(dir)])
    digest.update(manifest)
    stream = open(mfile, 'w')
    try:
        stream.write(manifest)
    finally:
        # Close the file even if the write fails.
        stream.close()
    return digest
def splitID(id):
    """Split an ID of the form 'alg=value' at the first '=' and return a
    tuple (alg, value), where 'alg' is the Algorithm instance registered for
    that name and 'value' is the remaining string.
    Raises BadDigest if there is no '=' in the ID or the algorithm name
    isn't known."""
    if '=' not in id:
        raise BadDigest("Digest '%s' is not in the form 'algorithm=value'" % id)
    alg_name, value = id.split('=', 1)
    return (get_algorithm(alg_name), value)
def copy_with_verify(src, dest, mode, alg, required_digest):
    """Copy path src to dest, checking that the contents give the right digest.
    dest must not exist (it is opened with O_CREAT | O_EXCL). The new file is
    created with permissions 'mode', as restricted by the process umask.
    'alg' supplies the digest via alg.new_digest(); 'required_digest' is the
    expected hex digest string.
    If the digest of the copied data is wrong, dest is removed again and
    BadDigest is raised."""
    chunk_size = 65536
    # O_BINARY only exists on some platforms; where it does, it keeps the
    # written bytes identical to the ones that were hashed.
    flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL | getattr(os, 'O_BINARY', 0)

    src_obj = open(src, 'rb')
    try:
        dest_fd = os.open(dest, flags, mode)
        try:
            digest = alg.new_digest()
            while True:
                data = src_obj.read(chunk_size)
                if not data: break
                digest.update(data)
                # os.write may write less than requested; loop until done.
                while data:
                    written = os.write(dest_fd, data)
                    assert written >= 0
                    data = data[written:]
        finally:
            # Previously the descriptor was never closed.
            os.close(dest_fd)
    finally:
        src_obj.close()

    actual = digest.hexdigest()
    if actual == required_digest: return

    os.unlink(dest)
    raise BadDigest(("Copy failed: file '%s' has wrong digest (may have been tampered with)\n"
            "Expected: %s\n"
            "Actual: %s") % (src, required_digest, actual))
def verify(root, required_digest = None):
    """Ensure that directory 'root' generates the given digest.
    If required_digest is None, the last component of 'root' is used.
    Raises BadDigest if not (the exception's 'detail' attribute describes
    the mismatch). For a non-error return:
    - required_digest must be in the form "alg=value", with a known algorithm
    - The calculated digest of the contents must match it.
    - root/.manifest must exist as a regular file, and its digest must also
      match (a missing .manifest is reported as a failure)."""
    def read_manifest(path, as_lines):
        # Read the .manifest file, making sure the handle is closed afterwards.
        stream = open(path)
        try:
            if as_lines:
                return stream.readlines()
            return stream.read()
        finally:
            stream.close()

    if required_digest is None:
        required_digest = os.path.basename(root)
    alg = splitID(required_digest)[0]

    # Digest of the manifest generated from what is actually on disk.
    digest = alg.new_digest()
    lines = []
    for line in alg.generate_manifest(root):
        line += '\n'
        digest.update(line)
        lines.append(line)
    actual_digest = alg.getID(digest)

    # Digest of the manifest that was recorded in the directory, if any.
    manifest_file = os.path.join(root, '.manifest')
    if os.path.isfile(manifest_file):
        digest = alg.new_digest()
        digest.update(read_manifest(manifest_file, False))
        manifest_digest = alg.getID(digest)
    else:
        manifest_digest = None

    if required_digest == actual_digest == manifest_digest:
        return

    error = BadDigest("Cached item does NOT verify.")

    error.detail = " Expected digest: " + required_digest + "\n" + \
            " Actual digest: " + actual_digest + "\n" + \
            ".manifest digest: " + (manifest_digest or 'No .manifest file') + "\n\n"

    if manifest_digest is None:
        error.detail += "No .manifest, so no further details available."
    elif manifest_digest == actual_digest:
        error.detail += "The .manifest file matches the actual contents. Very strange!"
    elif manifest_digest == required_digest:
        import difflib
        diff = difflib.unified_diff(read_manifest(manifest_file, True), lines,
                'Recorded', 'Actual')
        error.detail += "The .manifest file matches the directory name.\n" \
                "The contents of the directory have changed:\n" + \
                ''.join(diff)
    elif required_digest == actual_digest:
        error.detail += "The directory contents are correct, but the .manifest file is wrong!"
    else:
        error.detail += "The .manifest file matches neither of the other digests. Odd."
    raise error
213 # XXX: Be more careful about the source tree changing under us. In particular, what happens if:
214 # - A regular file suddenly turns into a symlink?
215 # - We find a device file (users can hard-link them if on the same device)
def copy_tree_with_verify(source, target, manifest_data, required_digest):
    """Copy directory source to be a subdirectory of target if it matches the required_digest.
    manifest_data is normally source/.manifest. source and manifest_data are not trusted
    (will typically be under the control of another user).
    The copy is first done to a temporary directory in target, then renamed to the final name
    only if correct. Therefore, an invalid 'target/required_digest' will never exist.
    A successful return means that target/required_digest now exists (whether we created it or not).
    Raises BadDigest if manifest_data doesn't have the required digest, and SafeException
    if the algorithm is the old 'sha1' one, if files are missing from source, or if the
    finished copy doesn't give the required digest."""
    import shutil
    import tempfile
    from logging import info

    alg = splitID(required_digest)[0]

    if isinstance(alg, OldSHA1):
        raise SafeException("Sorry, the 'sha1' algorithm does not support copying.")

    digest = alg.new_digest()
    digest.update(manifest_data)
    manifest_digest = alg.getID(digest)

    if manifest_digest != required_digest:
        # Note: this used to say 'zerostore.BadDigest', but only the name
        # 'BadDigest' is imported into this module.
        raise BadDigest("Manifest has been tampered with!\n"
                "Manifest digest: " + manifest_digest + "\n"
                "Directory name : " + required_digest)

    target_impl = os.path.join(target, required_digest)
    if os.path.isdir(target_impl):
        info("Target directory '%s' already exists", target_impl)
        return

    # We've checked that the source's manifest matches required_digest, so it
    # is what we want. Make a list of all the files we need to copy...

    wanted = _parse_manifest(manifest_data)

    tmpdir = tempfile.mkdtemp(prefix = 'tmp-copy-', dir = target)

    try:
        _copy_files(alg, wanted, source, tmpdir)

        # _copy_files removes each entry it copies, so anything left is missing.
        if wanted:
            missing = list(wanted.keys())
            missing.sort()
            raise SafeException('Copy failed; files missing from source:\n- ' +
                    '\n- '.join(missing))

        # Check that the copy is correct
        actual_digest = alg.getID(add_manifest_file(tmpdir, alg))
        if actual_digest != required_digest:
            raise SafeException(("Copy failed; double-check of target gave the wrong digest.\n"
                    "Unless the target was modified during the copy, this is a BUG\n"
                    "in 0store and should be reported.\n"
                    "Expected: %s\n"
                    "Actual: %s") % (required_digest, actual_digest))
        os.rename(tmpdir, target_impl)
        # TODO: catch already-exists, delete tmpdir and return success
    except:
        # Deliberately catches everything: the temporary copy must be removed
        # whatever went wrong, and the original exception is re-raised.
        info("Deleting tmpdir '%s'", tmpdir)
        shutil.rmtree(tmpdir)
        raise
274 def _parse_manifest(manifest_data):
275 wanted = {} # Path -> (manifest line tuple)
276 dir = ''
277 for line in manifest_data.split('\n'):
278 if not line: break
279 if line[0] == 'D':
280 data = line.split(' ', 1)
281 if len(data) != 2: raise zerostore.BadDigest("Bad line '%s'" % line)
282 path = data[-1]
283 if not path.startswith('/'): raise zerostore.BadDigest("Not absolute: '%s'" % line)
284 path = path[1:]
285 dir = path
286 elif line[0] == 'S':
287 data = line.split(' ', 3)
288 path = os.path.join(dir, data[-1])
289 if len(data) != 4: raise zerostore.BadDigest("Bad line '%s'" % line)
290 else:
291 data = line.split(' ', 4)
292 path = os.path.join(dir, data[-1])
293 if len(data) != 5: raise zerostore.BadDigest("Bad line '%s'" % line)
294 if path in wanted:
295 raise zerostore.BadDigest('Duplicate entry "%s"' % line)
296 wanted[path] = data[:-1]
297 return wanted
299 def _copy_files(alg, wanted, source, target):
300 """Scan for files under 'source'. For each one:
301 If it is in wanted and has the right details (or they can be fixed; e.g. mtime),
302 then copy it into 'target'.
303 If it's not in wanted, warn and skip it.
304 On exit, wanted contains only files that were not found."""
305 from logging import warn
306 dir = ''
307 for line in alg.generate_manifest(source):
308 if line[0] == 'D':
309 type, name = line.split(' ', 1)
310 assert name.startswith('/')
311 dir = name[1:]
312 path = dir
313 elif line[0] == 'S':
314 type, actual_digest, actual_size, name = line.split(' ', 3)
315 path = os.path.join(dir, name)
316 else:
317 assert line[0] in 'XF'
318 type, actual_digest, actual_mtime, actual_size, name = line.split(' ', 4)
319 path = os.path.join(dir, name)
320 try:
321 required_details = wanted.pop(path)
322 except KeyError:
323 warn("Skipping file not in manifest: '%s'", path)
324 continue
325 if required_details[0] != type:
326 raise zerostore.BadDigest("Item '%s' has wrong type!" % path)
327 if type == 'D':
328 os.mkdir(os.path.join(target, path))
329 elif type in 'XF':
330 required_type, required_digest, required_mtime, required_size = required_details
331 if required_size != actual_size:
332 raise SafeException("File '%s' has wrong size (%s bytes, but should be "
333 "%s according to manifest)" %
334 (path, actual_size, required_size))
335 required_mtime = int(required_mtime)
336 dest_path = os.path.join(target, path)
337 if type == 'X':
338 mode = 0555
339 else:
340 mode = 0444
341 copy_with_verify(os.path.join(source, path),
342 dest_path,
343 mode,
344 alg,
345 required_digest)
346 os.utime(dest_path, (required_mtime, required_mtime))
347 elif type == 'S':
348 required_type, required_digest, required_size = required_details
349 if required_size != actual_size:
350 raise SafeException("Symlink '%s' has wrong size (%s bytes, but should be "
351 "%s according to manifest)" %
352 (path, actual_size, required_size))
353 symlink_target = os.readlink(os.path.join(source, path))
354 symlink_digest = alg.new_digest()
355 symlink_digest.update(symlink_target)
356 if symlink_digest.hexdigest() != required_digest:
357 raise SafeException("Symlink '%s' has wrong target (digest should be "
358 "%s according to manifest)" % (path, required_digest))
359 dest_path = os.path.join(target, path)
360 os.symlink(symlink_target, dest_path)
361 else:
362 raise SafeException("Unknown manifest type %s for '%s'" % (type, path))
class HashLibAlgorithm(Algorithm):
    """Manifest algorithm in which each directory's files and symlinks are
    listed before any of its subdirectories.

    Manifest lines have one of these forms:
      D <path>                           directory (path is absolute within the tree)
      F <digest> <mtime> <size> <leaf>   regular file
      X <digest> <mtime> <size> <leaf>   regular file with any execute bit set
      S <digest> <size> <leaf>           symlink (digest is of the link target)
    A regular file called '.manifest' is skipped in every directory."""

    new_digest = None       # Constructor for digest objects

    def __init__(self, name):
        """'name' is either 'sha1' (digests come from the 'sha' module and IDs
        are prefixed 'sha1new') or the name of a constructor in hashlib (IDs
        are prefixed with that name)."""
        if name == 'sha1':
            import sha
            self.new_digest = sha.new
            self.name = 'sha1new'
        else:
            self.new_digest = getattr(hashlib, name)
            self.name = name

    def generate_manifest(self, root):
        """Yield each line of the manifest for the tree rooted at 'root'.
        Entries within a directory are visited in sorted order.
        Raises BadDigest if a name contains a newline, and SafeException for
        anything that is not a regular file, directory or symlink."""
        new_digest = self.new_digest
        # Same value as the octal literal 0111, spelt portably.
        exec_bits = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH

        def read_all(path):
            # Binary mode, so the digest covers the exact bytes on disk, and an
            # explicit close() so the handle isn't left for the garbage collector.
            stream = open(path, 'rb')
            try:
                return stream.read()
            finally:
                stream.close()

        def recurse(sub):
            # To ensure that a line-by-line comparison of the manifests
            # is possible, we require that filenames don't contain newlines.
            # Otherwise, you can name a file so that the part after the \n
            # would be interpreted as another line in the manifest.
            if '\n' in sub: raise BadDigest("Newline in filename '%s'" % sub)
            assert sub.startswith('/')

            full = os.path.join(root, sub[1:])
            info = os.lstat(full)

            m = info.st_mode
            if not stat.S_ISDIR(m): raise Exception('Not a directory: "%s"' % full)
            if sub != '/':
                yield "D %s" % sub
            items = os.listdir(full)
            items.sort()
            dirs = []       # Subdirectories, visited after this directory's other entries
            for leaf in items:
                path = os.path.join(root, sub[1:], leaf)
                info = os.lstat(path)
                m = info.st_mode

                if stat.S_ISREG(m):
                    if leaf == '.manifest': continue

                    d = new_digest(read_all(path)).hexdigest()
                    # st_mtime may be a float on some interpreters; always
                    # format it as an integer so the manifest text doesn't
                    # depend on that.
                    mtime = int(info.st_mtime)
                    if m & exec_bits:
                        yield "X %s %s %s %s" % (d, mtime, info.st_size, leaf)
                    else:
                        yield "F %s %s %s %s" % (d, mtime, info.st_size, leaf)
                elif stat.S_ISLNK(m):
                    d = new_digest(os.readlink(path)).hexdigest()
                    # Note: Can't use utime on symlinks, so skip mtime
                    yield "S %s %s %s" % (d, info.st_size, leaf)
                elif stat.S_ISDIR(m):
                    dirs.append(leaf)
                else:
                    raise SafeException("Unknown object '%s' (not a file, directory or symlink)" %
                            path)
            for x in dirs:
                for y in recurse(os.path.join(sub, x)): yield y
            return

        for x in recurse('/'): yield x

    def getID(self, digest):
        """Return the full ID string: this algorithm's name, '=', then the hex digest."""
        return self.name + '=' + digest.hexdigest()
# Registry of the available algorithms, keyed by the name used before the '='
# in a digest ID. get_algorithm() looks names up here.
algorithms = {
    'sha1': OldSHA1(),
    'sha1new': HashLibAlgorithm('sha1'),
}

# 'hashlib' is None when the import at the top of the file failed.
if hashlib is not None:
    algorithms['sha256'] = HashLibAlgorithm('sha256')