Guess MIME-types automatically for tbz/tlz/txz suffixes
[zeroinstall/zeroinstall-afb.git] / zeroinstall / zerostore / unpack.py
blob2899ffc1c0eebbd8e5dd3b8bb57f226b5fb6e96f
1 """Unpacking archives of various formats."""
3 # Copyright (C) 2009, Thomas Leonard
4 # See the README file for details, or visit http://0install.net.
6 from zeroinstall import _
7 import os, subprocess
8 import shutil
9 import glob
10 import traceback
11 from tempfile import mkdtemp, mkstemp
12 import re
13 from logging import debug, warn
14 from zeroinstall import SafeException
15 from zeroinstall.support import find_in_path, ro_rmtree
17 _cpio_version = None
18 def _get_cpio_version():
19 global _cpio_version
20 if _cpio_version is None:
21 _cpio_version = os.popen('cpio --version 2>&1').next()
22 debug(_("cpio version = %s"), _cpio_version)
23 return _cpio_version
def _gnu_cpio():
    """Report whether the installed cpio identifies itself as GNU cpio.
    @return: True if the cpio version line contains '(GNU cpio)'"""
    banner = _get_cpio_version()
    is_gnu = '(GNU cpio)' in banner
    debug(_("Is GNU cpio = %s"), is_gnu)
    return is_gnu
30 _tar_version = None
31 def _get_tar_version():
32 global _tar_version
33 if _tar_version is None:
34 _tar_version = os.popen('tar --version 2>&1').next().strip()
35 debug(_("tar version = %s"), _tar_version)
36 return _tar_version
def _gnu_tar():
    """Report whether the installed tar identifies itself as GNU tar.
    @return: True if the tar version line contains '(GNU tar)'"""
    banner = _get_tar_version()
    is_gnu = '(GNU tar)' in banner
    debug(_("Is GNU tar = %s"), is_gnu)
    return is_gnu
def recent_gnu_tar():
    """@deprecated: should be private

    Report whether tar is GNU tar with a version number above 1.13.92.
    A warning is logged if tar is GNU tar but no version number can be
    found in its version line.
    @return: True only for GNU tar whose parsed version is > 1.13.92"""
    is_recent = False
    if _gnu_tar():
        match = re.search(r'\)\s*(\d+(\.\d+)*)', _get_tar_version())
        if match is None:
            warn(_("Failed to extract GNU tar version number"))
        else:
            # Compare component-wise as integers, e.g. [1, 26] > [1, 13, 92].
            components = [int(piece) for piece in match.group(1).split('.')]
            is_recent = components > [1, 13, 92]
    debug(_("Recent GNU tar = %s"), is_recent)
    return is_recent
# Path to the 'pola-run' sandboxing helper, or None to extract unsandboxed.
# Disabled, as Plash does not currently support fchmod(2). While this is
# None, _exec_maybe_sandboxed() execs the requested program directly.
_pola_run = None
#_pola_run = find_in_path('pola-run')
#if _pola_run:
# info('Found pola-run: %s', _pola_run)
#else:
# info('pola-run not found; archive extraction will not be sandboxed')
def type_from_url(url):
    """Guess the MIME type for this resource based on its URL. Returns None if we don't know what it is."""
    # Matching is case-insensitive and looks only at the end of the URL.
    known_suffixes = (
        ('.rpm', 'application/x-rpm'),
        ('.deb', 'application/x-deb'),
        ('.tar.bz2', 'application/x-bzip-compressed-tar'),
        ('.tar.gz', 'application/x-compressed-tar'),
        ('.tar.lzma', 'application/x-lzma-compressed-tar'),
        ('.tar.xz', 'application/x-xz-compressed-tar'),
        ('.tbz', 'application/x-bzip-compressed-tar'),
        ('.tgz', 'application/x-compressed-tar'),
        ('.tlz', 'application/x-lzma-compressed-tar'),
        ('.txz', 'application/x-xz-compressed-tar'),
        ('.tar', 'application/x-tar'),
        ('.zip', 'application/zip'),
        ('.cab', 'application/vnd.ms-cab-compressed'),
        ('.dmg', 'application/x-apple-diskimage'),
        ('.gem', 'application/x-ruby-gem'),
    )
    lowered = url.lower()
    for suffix, mime_type in known_suffixes:
        if lowered.endswith(suffix):
            return mime_type
    return None
def check_type_ok(mime_type):
    """Check we have the needed software to extract from an archive of the given type.
    @param mime_type: the MIME type of the archive (must be non-empty)
    @raise SafeException: if the needed software is not available, or the
    type is not one we know how to unpack"""
    assert mime_type
    if mime_type == 'application/x-rpm':
        if not find_in_path('rpm2cpio'):
            raise SafeException(_("This package looks like an RPM, but you don't have the rpm2cpio command "
                    "I need to extract it. Install the 'rpm' package first (this works even if "
                    "you're on a non-RPM-based distribution such as Debian)."))
    elif mime_type == 'application/x-deb':
        if not find_in_path('ar'):
            raise SafeException(_("This package looks like a Debian package, but you don't have the 'ar' command "
                    "I need to extract it. Install the package containing it (sometimes called 'binutils') "
                    "first. This works even if you're on a non-Debian-based distribution such as Red Hat)."))
    elif mime_type == 'application/x-bzip-compressed-tar':
        pass    # We'll fall back to Python's built-in tar.bz2 support
    elif mime_type == 'application/zip':
        if not find_in_path('unzip'):
            raise SafeException(_("This package looks like a zip-compressed archive, but you don't have the 'unzip' command "
                    "I need to extract it. Install the package containing it first."))
    elif mime_type == 'application/vnd.ms-cab-compressed':
        if not find_in_path('cabextract'):
            raise SafeException(_("This package looks like a Microsoft Cabinet archive, but you don't have the 'cabextract' command "
                    "I need to extract it. Install the package containing it first."))
    elif mime_type == 'application/x-apple-diskimage':
        if not find_in_path('hdiutil'):
            raise SafeException(_("This package looks like a Apple Disk Image, but you don't have the 'hdiutil' command "
                    "I need to extract it."))
    elif mime_type == 'application/x-lzma-compressed-tar':
        pass    # We can get it through Zero Install
    elif mime_type == 'application/x-xz-compressed-tar':
        if not find_in_path('unxz'):
            raise SafeException(_("This package looks like a xz-compressed package, but you don't have the 'unxz' command "
                    "I need to extract it. Install the package containing it (it's probably called 'xz-utils') "
                    "first."))
    elif mime_type in ('application/x-compressed-tar', 'application/x-tar', 'application/x-ruby-gem'):
        # Ruby gems are a plain tar wrapping a gzipped tar, so they need
        # nothing beyond what tar archives need. type_from_url() reports this
        # type and unpack_archive() handles it, so it must be accepted here.
        pass
    else:
        from zeroinstall import version
        raise SafeException(_("Unsupported archive type '%(type)s' (for injector version %(version)s)") % {'type': mime_type, 'version': version})
def _exec_maybe_sandboxed(writable, prog, *args):
    """execlp prog, with (only) the 'writable' directory writable if sandboxing is available.
    If no sandbox is available, run without a sandbox.
    @param writable: directory to make writable inside the sandbox, or None
    @param prog: name of the program to look up in $PATH
    @param args: arguments passed on to the program
    @raise Exception: if prog cannot be found in $PATH"""
    executable = find_in_path(prog)
    if not executable:
        raise Exception(_("'%s' not found in $PATH") % prog)
    if _pola_run is None:
        # No sandbox: replace this process with the program directly.
        os.execlp(executable, executable, *args)
    # We have pola-shell :-)
    sandbox_args = ['--prog', executable, '-f', '/']
    for arg in args:
        sandbox_args.extend(('-a', arg))
    if writable:
        sandbox_args.extend(('-fw', writable))
    os.execl(_pola_run, _pola_run, *sandbox_args)
def unpack_archive_over(url, data, destdir, extract = None, type = None, start_offset = 0):
    """Like unpack_archive, except that we unpack to a temporary directory first and
    then move things over, checking that we're not following symlinks at each stage.
    Use this when you want to unpack an unarchive into a directory which already has
    stuff in it.
    @param url: the archive's URL, used to guess the type when 'type' is None
    @param data: the stream to read the archive from
    @param destdir: the existing directory to unpack into
    @param extract: a single top-level directory name to extract, or None for everything
    @param type: the archive's MIME type, or None to guess it from 'url'
    @param start_offset: offset in 'data' at which the archive starts
    @raise SafeException: if unpacking would replace a symlink, or put a
    directory where a non-directory already exists
    @note: Since 0.49, the leading "extract" component is removed (unlike unpack_archive).
    @since: 0.28"""
    import errno
    import stat
    # Validate before creating the temporary directory, so that a bad
    # argument doesn't leave a stray directory behind in destdir.
    assert extract is None or os.sep not in extract, extract
    tmpdir = mkdtemp(dir = destdir)
    try:
        mtimes = []

        unpack_archive(url, data, tmpdir, extract, type, start_offset)

        if extract is None:
            srcdir = tmpdir
        else:
            srcdir = os.path.join(tmpdir, extract)
            assert not os.path.islink(srcdir)

        stem_len = len(srcdir)
        for root, dirs, files in os.walk(srcdir):
            relative_root = root[stem_len + 1:] or '.'
            target_root = os.path.join(destdir, relative_root)
            try:
                info = os.lstat(target_root)
            except OSError as ex:
                if ex.errno != errno.ENOENT:
                    raise   # Some odd error.
                # Doesn't exist. OK.
                os.mkdir(target_root)
            else:
                if stat.S_ISLNK(info.st_mode):
                    raise SafeException(_('Attempt to unpack dir over symlink "%s"!') % relative_root)
                elif not stat.S_ISDIR(info.st_mode):
                    raise SafeException(_('Attempt to unpack dir over non-directory "%s"!') % relative_root)
            # 'root' already starts with srcdir; joining it onto srcdir again
            # would double the prefix whenever srcdir is a relative path.
            mtimes.append((relative_root, os.lstat(root).st_mtime))

            for s in dirs:  # Symlinks are counted as directories
                src = os.path.join(srcdir, relative_root, s)
                if os.path.islink(src):
                    files.append(s)

            for f in files:
                src = os.path.join(srcdir, relative_root, f)
                dest = os.path.join(destdir, relative_root, f)
                if os.path.islink(dest):
                    raise SafeException(_('Attempt to unpack file over symlink "%s"!') %
                                    os.path.join(relative_root, f))
                os.rename(src, dest)

        # Moving files in changed the directories' mtimes; put back the ones
        # recorded from the archive. The first entry is destdir itself, skipped.
        for path, mtime in mtimes[1:]:
            os.utime(os.path.join(destdir, path), (mtime, mtime))
    finally:
        ro_rmtree(tmpdir)
def unpack_archive(url, data, destdir, extract = None, type = None, start_offset = 0):
    """Unpack stream 'data' into directory 'destdir'. If extract is given, extract just
    that sub-directory from the archive (i.e. destdir/extract will exist afterwards).
    Works out the format from the name.
    @raise SafeException: if the type is not given and cannot be guessed
    from 'url', or is not a type we can unpack"""
    if type is None:
        type = type_from_url(url)
    if type is None:
        raise SafeException(_("Unknown extension (and no MIME type given) in '%s'") % url)

    # tar-based formats differ only in the decompressor handed to extract_tar
    tar_decompressors = {
        'application/x-bzip-compressed-tar': 'bzip2',
        'application/x-tar': None,
        'application/x-lzma-compressed-tar': 'lzma',
        'application/x-xz-compressed-tar': 'xz',
        'application/x-compressed-tar': 'gzip',
    }
    if type in tar_decompressors:
        extract_tar(data, destdir, extract, tar_decompressors[type], start_offset)
        return

    # Everything else has a dedicated extractor sharing one signature
    other_extractors = {
        'application/x-deb': extract_deb,
        'application/x-rpm': extract_rpm,
        'application/zip': extract_zip,
        'application/vnd.ms-cab-compressed': extract_cab,
        'application/x-apple-diskimage': extract_dmg,
        'application/x-ruby-gem': extract_gem,
    }
    extractor = other_extractors.get(type)
    if extractor is None:
        raise SafeException(_('Unknown MIME type "%(type)s" for "%(url)s"') % {'type': type, 'url': url})
    extractor(data, destdir, extract, start_offset)
def extract_deb(stream, destdir, extract = None, start_offset = 0):
    """Unpack the data archive of a Debian package into 'destdir'.

    The package is copied to destdir/archive.deb (ar can't read from stdin),
    its data.tar* member is pulled out with 'ar' and then unpacked with
    extract_tar. Both intermediate files are removed again.
    @param stream: the stream to read the package from
    @param destdir: the directory to unpack into
    @param extract: must be empty; sub-directory extraction is not supported
    @param start_offset: offset in 'stream' at which the package starts
    @raise SafeException: if 'extract' is given, or no data.tar* member is found"""
    if extract:
        raise SafeException(_('Sorry, but the "extract" attribute is not yet supported for Debs'))

    stream.seek(start_offset)
    # ar can't read from stdin, so make a copy...
    deb_copy_name = os.path.join(destdir, 'archive.deb')
    # The package is binary data, so don't let text mode touch it.
    deb_copy = open(deb_copy_name, 'wb')
    try:
        shutil.copyfileobj(stream, deb_copy)
    finally:
        deb_copy.close()

    # Name of the data member inside the package -> decompressor for extract_tar
    compression_by_member = {
        'data.tar': None,
        'data.tar.gz': 'gzip',
        'data.tar.bz2': 'bzip2',
        'data.tar.lzma': 'lzma',
        'data.tar.xz': 'xz',
    }

    try:
        p = subprocess.Popen(('ar', 't', 'archive.deb'), stdout=subprocess.PIPE, cwd=destdir, universal_newlines=True)
        o = p.communicate()[0]
        for line in o.split('\n'):
            if line in compression_by_member:
                data_tar = line
                data_compression = compression_by_member[line]
                break
        else:
            raise SafeException(_("File is not a Debian package."))

        _extract(stream, destdir, ('ar', 'x', 'archive.deb', data_tar))
    finally:
        # Remove our copy whether or not the steps above succeeded.
        os.unlink(deb_copy_name)

    data_name = os.path.join(destdir, data_tar)
    data_stream = open(data_name, 'rb')
    try:
        # Unlink straight away so the tarball isn't left among the unpacked
        # files; the open stream keeps the data readable.
        os.unlink(data_name)
        extract_tar(data_stream, destdir, None, data_compression)
    finally:
        data_stream.close()
def extract_rpm(stream, destdir, extract = None, start_offset = 0):
    """Unpack an RPM package into 'destdir'.

    A forked child runs rpm2cpio with 'stream' (positioned at start_offset)
    as its stdin and a temporary file as its stdout; the resulting cpio
    archive is then unpacked into destdir with cpio.
    @param stream: the stream to read the package from (must have a fileno())
    @param destdir: the directory to unpack into
    @param extract: must be empty; sub-directory extraction is not supported
    @param start_offset: offset in 'stream' at which the package starts
    @raise SafeException: if 'extract' is given, or rpm2cpio fails"""
    if extract:
        raise SafeException(_('Sorry, but the "extract" attribute is not yet supported for RPMs'))
    fd, cpiopath = mkstemp('-rpm-tmp')
    try:
        child = os.fork()
        if child == 0:
            # In the child: this either execs rpm2cpio or exits with status 1.
            try:
                try:
                    os.dup2(stream.fileno(), 0)
                    os.lseek(0, start_offset, 0)
                    os.dup2(fd, 1)
                    _exec_maybe_sandboxed(None, 'rpm2cpio', '-')
                except:
                    # Deliberately catch everything: nothing may propagate
                    # out of the forked child.
                    traceback.print_exc()
            finally:
                os._exit(1)
        pid, status = os.waitpid(child, 0)
        assert pid == child
        if status != 0:
            raise SafeException(_("rpm2cpio failed; can't unpack RPM archive; exit code %d") % status)
        os.close(fd)
        fd = None

        args = ['cpio', '-mid']
        if _gnu_cpio():
            args.append('--quiet')

        cpio_stream = open(cpiopath, 'rb')
        try:
            _extract(cpio_stream, destdir, args)
        finally:
            cpio_stream.close()
        # Set the mtime of every directory under 'tmp' to 0, since cpio doesn't
        # preserve directory mtimes.
        for dirpath, dirnames, filenames in os.walk(destdir):
            os.utime(dirpath, (0, 0))
    finally:
        if fd is not None:
            os.close(fd)
        os.unlink(cpiopath)
def extract_gem(stream, destdir, extract = None, start_offset = 0):
    """Unpack a Ruby gem into 'destdir'.

    The gem is treated as an uncompressed tar archive containing
    'data.tar.gz'; that member is extracted to a temporary directory inside
    destdir and then unpacked into destdir as a gzip-compressed tar.
    @param stream: the stream to read the gem from
    @param destdir: the directory to unpack into
    @param extract: sub-directory of the payload to extract, or None for everything
    @param start_offset: offset in 'stream' at which the gem starts
    @since: 0.53"""
    payload = 'data.tar.gz'
    payload_stream = None
    tmpdir = mkdtemp(dir = destdir)
    try:
        # extract_tar positions the stream itself, so the offset has to be
        # handed on; a seek done here would just be undone.
        extract_tar(stream, destdir=tmpdir, extract=payload, decompress=None, start_offset=start_offset)
        # The payload is gzip data, so read it in binary mode.
        payload_stream = open(os.path.join(tmpdir, payload), 'rb')
        extract_tar(payload_stream, destdir=destdir, extract=extract, decompress='gzip')
    finally:
        if payload_stream:
            payload_stream.close()
        ro_rmtree(tmpdir)
def extract_cab(stream, destdir, extract, start_offset = 0):
    """Unpack a Microsoft Cabinet archive into 'destdir' using cabextract.
    @param stream: the stream to read the archive from
    @param destdir: the directory to unpack into
    @param extract: must be empty; sub-directory extraction is not supported
    @param start_offset: offset in 'stream' at which the archive starts
    @raise SafeException: if 'extract' is given, or cabextract fails
    @since: 0.24"""
    if extract:
        raise SafeException(_('Sorry, but the "extract" attribute is not yet supported for Cabinet files'))

    stream.seek(start_offset)
    # cabextract can't read from stdin, so make a copy...
    cab_copy_name = os.path.join(destdir, 'archive.cab')
    # The archive is binary data, so don't let text mode touch it.
    cab_copy = open(cab_copy_name, 'wb')
    try:
        shutil.copyfileobj(stream, cab_copy)
    finally:
        cab_copy.close()

    try:
        _extract(stream, destdir, ['cabextract', '-s', '-q', 'archive.cab'])
    finally:
        # Remove our copy even if extraction failed.
        os.unlink(cab_copy_name)
def extract_dmg(stream, destdir, extract, start_offset = 0):
    """Unpack an Apple disk image into 'destdir'.

    The image is copied to destdir/archive.dmg (hdiutil can't read from
    stdin), attached at a temporary mount point with hdiutil, its contents
    copied into destdir with 'cp -pR', and then detached again.
    @param stream: the stream to read the image from
    @param destdir: the directory to unpack into
    @param extract: must be empty; sub-directory extraction is not supported
    @param start_offset: offset in 'stream' at which the image starts
    @raise SafeException: if 'extract' is given
    @raise subprocess.CalledProcessError: if hdiutil or cp fails
    @since: 0.46"""
    if extract:
        raise SafeException(_('Sorry, but the "extract" attribute is not yet supported for DMGs'))

    stream.seek(start_offset)
    # hdiutil can't read from stdin, so make a copy...
    dmg_copy_name = os.path.join(destdir, 'archive.dmg')
    # The image is binary data, so don't let text mode touch it.
    dmg_copy = open(dmg_copy_name, 'wb')
    try:
        shutil.copyfileobj(stream, dmg_copy)
    finally:
        dmg_copy.close()

    try:
        mountpoint = mkdtemp(prefix='archive')
        try:
            subprocess.check_call(["hdiutil", "attach", "-quiet", "-mountpoint", mountpoint, "-nobrowse", dmg_copy_name])
            try:
                subprocess.check_call(["cp", "-pR"] + glob.glob("%s/*" % mountpoint) + [destdir])
            finally:
                # Never leave the image mounted, even if the copy failed.
                subprocess.check_call(["hdiutil", "detach", "-quiet", mountpoint])
        finally:
            os.rmdir(mountpoint)
    finally:
        os.unlink(dmg_copy_name)
def extract_zip(stream, destdir, extract, start_offset = 0):
    """Unpack a zip archive into 'destdir' using unzip.
    @param stream: the stream to read the archive from
    @param destdir: the directory to unpack into
    @param extract: sub-directory of the archive to extract, or None/empty for everything
    @param start_offset: offset in 'stream' at which the archive starts
    @raise SafeException: if 'extract' contains an illegal character, or unzip fails"""
    if extract:
        # Limit the characters we accept, to avoid sending dodgy
        # strings to zip
        if not re.match('^[a-zA-Z0-9][- _a-zA-Z0-9.]*$', extract):
            raise SafeException(_('Illegal character in extract attribute'))

    stream.seek(start_offset)
    # unzip can't read from stdin, so make a copy...
    zip_copy_name = os.path.join(destdir, 'archive.zip')
    # The archive is binary data, so don't let text mode touch it.
    zip_copy = open(zip_copy_name, 'wb')
    try:
        shutil.copyfileobj(stream, zip_copy)
    finally:
        zip_copy.close()

    args = ['unzip', '-q', '-o', 'archive.zip']

    if extract:
        args.append(extract + '/*')

    try:
        _extract(stream, destdir, args)
    finally:
        # Remove our copy even if extraction failed.
        os.unlink(zip_copy_name)
def extract_tar(stream, destdir, extract, decompress, start_offset = 0):
    """Unpack a tar archive read from 'stream' into 'destdir'.

    GNU tar is used when available; otherwise Python's tarfile module is
    used, which here handles only uncompressed, bzip2 and gzip archives.
    @param stream: the stream to read the archive from
    @param destdir: the directory to unpack into
    @param extract: sub-directory of the archive to extract, or None/empty for everything
    @param decompress: one of None, 'bzip2', 'gzip', 'lzma' or 'xz'
    @param start_offset: offset in 'stream' at which the archive starts
    @raise SafeException: if 'extract' contains an illegal character, the
    compression is unsupported without GNU tar, extraction fails, or
    'extract' names nothing in the archive (tarfile path)"""
    if extract:
        # Limit the characters we accept, to avoid sending dodgy
        # strings to tar
        if not re.match('^[a-zA-Z0-9][- _a-zA-Z0-9.]*$', extract):
            raise SafeException(_('Illegal character in extract attribute'))

    assert decompress in [None, 'bzip2', 'gzip', 'lzma', 'xz']

    if _gnu_tar():
        ext_cmd = ['tar']
        if decompress:
            if decompress == 'bzip2':
                ext_cmd.append('--bzip2')
            elif decompress == 'gzip':
                ext_cmd.append('-z')
            elif decompress == 'lzma':
                unlzma = find_in_path('unlzma')
                if not unlzma:
                    # Fall back to the '_unlzma' helper next to this module
                    unlzma = os.path.abspath(os.path.join(os.path.dirname(__file__), '_unlzma'))
                ext_cmd.append('--use-compress-program=' + unlzma)
            elif decompress == 'xz':
                unxz = find_in_path('unxz')
                if not unxz:
                    # Fall back to the '_unxz' helper next to this module
                    unxz = os.path.abspath(os.path.join(os.path.dirname(__file__), '_unxz'))
                ext_cmd.append('--use-compress-program=' + unxz)

        if recent_gnu_tar():
            ext_cmd.extend(('-x', '--no-same-owner', '--no-same-permissions'))
        else:
            ext_cmd.extend(('xf', '-'))

        if extract:
            ext_cmd.append(extract)

        _extract(stream, destdir, ext_cmd, start_offset)
    else:
        # Since we don't have GNU tar, use python's tarfile module. This will probably
        # be a lot slower and we do not support lzma and xz; however, it is portable.
        if decompress is None:
            rmode = 'r|'
        elif decompress == 'bzip2':
            rmode = 'r|bz2'
        elif decompress == 'gzip':
            rmode = 'r|gz'
        else:
            raise SafeException(_('GNU tar unavailable; unsupported compression format: %s') % decompress)

        import tarfile

        stream.seek(start_offset)
        # Python 2.5.1 crashes if name is None; see Python bug #1706850
        tar = tarfile.open(name = '', mode = rmode, fileobj = stream)
        try:
            # The umask can only be read by setting it, so set and restore.
            current_umask = os.umask(0)
            os.umask(current_umask)

            uid = gid = None
            try:
                uid = os.geteuid()
                gid = os.getegid()
            except AttributeError:
                # Raised when the platform's os module lacks these functions
                debug(_("Can't get uid/gid"))

            def chmod_extract(tarinfo):
                # If any X bit is set, they all must be
                if tarinfo.mode & 0o111:
                    tarinfo.mode |= 0o111

                # Everyone gets read and write (subject to the umask)
                # No special bits are allowed.
                tarinfo.mode = ((tarinfo.mode | 0o666) & ~current_umask) & 0o777

                # Don't change owner, even if run as root. Compare against
                # None: root's id is 0, which a plain truth test would skip.
                # NOTE(review): tarfile may still prefer the archive's
                # user/group names when chowning as root; confirm.
                if uid is not None:
                    tarinfo.uid = uid
                if gid is not None:
                    tarinfo.gid = gid
                tar.extract(tarinfo, destdir)

            extracted_anything = False
            ext_dirs = []

            for tarinfo in tar:
                # An empty 'extract' means "everything", as in the GNU tar branch
                if not extract or \
                   tarinfo.name.startswith(extract + '/') or \
                   tarinfo.name == extract:
                    if tarinfo.isdir():
                        ext_dirs.append(tarinfo)

                    chmod_extract(tarinfo)
                    extracted_anything = True

            # Due to a bug in tarfile (python versions < 2.5), we have to manually
            # set the mtime of each directory that we extract after extracting everything.

            for tarinfo in ext_dirs:
                dirname = os.path.join(destdir, tarinfo.name)
                os.utime(dirname, (tarinfo.mtime, tarinfo.mtime))
        finally:
            tar.close()

        if extract and not extracted_anything:
            raise SafeException(_('Unable to find specified file = %s in archive') % extract)
478 def _extract(stream, destdir, command, start_offset = 0):
479 """Run execvp('command') inside destdir in a child process, with
480 stream seeked to 'start_offset' as stdin."""
482 # Some zip archives are missing timezone information; force consistent results
483 child_env = os.environ.copy()
484 child_env['TZ'] = 'GMT'
486 stream.seek(start_offset)
488 # TODO: use pola-run if available, once it supports fchmod
489 child = subprocess.Popen(command, cwd = destdir, stdin = stream, stderr = subprocess.PIPE, env = child_env)
491 unused, cerr = child.communicate()
493 status = child.wait()
494 if status != 0:
495 raise SafeException(_('Failed to extract archive (using %(command)s); exit code %(status)d:\n%(err)s') % {'command': command, 'status': status, 'err': cerr.strip()})