Lib/tarfile.py

   1 #!/usr/bin/env python
   2 # -*- coding: iso-8859-1 -*-
   3 #-------------------------------------------------------------------
   4 # tarfile.py
   5 #-------------------------------------------------------------------
   6 # Copyright (C) 2002 Lars Gustäbel <lars@gustaebel.de>
   7 # All rights reserved.
   8 #
   9 # Permission  is  hereby granted,  free  of charge,  to  any person
  10 # obtaining a  copy of  this software  and associated documentation
  11 # files  (the  "Software"),  to   deal  in  the  Software   without
  12 # restriction,  including  without limitation  the  rights to  use,
  13 # copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 # copies  of  the  Software,  and to  permit  persons  to  whom the
  15 # Software  is  furnished  to  do  so,  subject  to  the  following
  16 # conditions:
  17 #
  18 # The above copyright  notice and this  permission notice shall  be
  19 # included in all copies or substantial portions of the Software.
  20 #
  21 # THE SOFTWARE IS PROVIDED "AS  IS", WITHOUT WARRANTY OF ANY  KIND,
  22 # EXPRESS OR IMPLIED, INCLUDING  BUT NOT LIMITED TO  THE WARRANTIES
  23 # OF  MERCHANTABILITY,  FITNESS   FOR  A  PARTICULAR   PURPOSE  AND
  24 # NONINFRINGEMENT.  IN  NO  EVENT SHALL  THE  AUTHORS  OR COPYRIGHT
  25 # HOLDERS  BE LIABLE  FOR ANY  CLAIM, DAMAGES  OR OTHER  LIABILITY,
  26 # WHETHER  IN AN  ACTION OF  CONTRACT, TORT  OR OTHERWISE,  ARISING
  27 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  28 # OTHER DEALINGS IN THE SOFTWARE.
  29 #
  30 """Read from and write to tar format archives.
  31 """
  32
  33 __version__ = "$Revision$"
  34 # $Source$
  35
  36 version     = "0.9.0"
  37 __author__  = "Lars Gustäbel (lars@gustaebel.de)"
  38 __date__    = "$Date$"
  39 __cvsid__   = "$Id$"
  40 __credits__ = "Gustavo Niemeyer, Niels Gustäbel, Richard Townsend."
  41
  42 #---------
  43 # Imports
  44 #---------
  45 import sys
  46 import os
  47 import shutil
  48 import stat
  49 import errno
  50 import time
  51 import struct
  52 import copy
  53 import re
  54 import operator
  55
  56 if sys.platform == 'mac':
  57     # This module needs work for MacOS9, especially in the area of pathname
  58     # handling. In many places it is assumed a simple substitution of / by the
  59     # local os.path.sep is good enough to convert pathnames, but this does not
  60     # work with the mac rooted:path:name versus :nonrooted:path:name syntax
  61     raise ImportError, "tarfile does not work for platform==mac"
  62
  63 try:
  64     import grp, pwd
  65 except ImportError:
  66     grp = pwd = None
  67
# from tarfile import *
# Only the names listed here are exported by a star-import.
__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]

#---------------------------------------------------------
# tar constants
#---------------------------------------------------------
NUL = "\0"                      # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records (20 blocks = 10240)
GNU_MAGIC = "ustar  \0"         # magic gnu tar string
# Note: "\x0000" is the escape \x00 followed by the two characters "00",
# so the value is "ustar", one NUL and "00" (8 characters in total).
POSIX_MAGIC = "ustar\x0000"     # magic posix tar string

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

# Single-character type flags stored in a member header.
REGTYPE = "0"                   # regular file
AREGTYPE = "\0"                 # regular file (NUL type flag)
LNKTYPE = "1"                   # link (inside tarfile)
SYMTYPE = "2"                   # symbolic link
CHRTYPE = "3"                   # character special device
BLKTYPE = "4"                   # block special device
DIRTYPE = "5"                   # directory
FIFOTYPE = "6"                  # fifo special device
CONTTYPE = "7"                  # contiguous file

GNUTYPE_LONGNAME = "L"          # GNU tar longname
GNUTYPE_LONGLINK = "K"          # GNU tar longlink
GNUTYPE_SPARSE = "S"            # GNU tar sparse file

XHDTYPE = "x"                   # POSIX.1-2001 extended header
XGLTYPE = "g"                   # POSIX.1-2001 global header
SOLARIS_XHDTYPE = "X"           # Solaris extended header

# Archive format identifiers.
USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT     # used as the default format by itn()
 106
#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# The tuples below group the type flags defined above so that a
# member's type can be classified with a simple "in" test.

# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields in a pax header that are numbers, all other fields
# are treated as strings.  Maps the field name to the callable
# (float or int) associated with it.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}
 139
#---------------------------------------------------------
# Bits used in the mode field, values in octal.
#---------------------------------------------------------
# File type bits; filemode() tests them to pick the first
# character of its result.
S_IFLNK = 0120000        # symbolic link
S_IFREG = 0100000        # regular file
S_IFBLK = 0060000        # block device
S_IFDIR = 0040000        # directory
S_IFCHR = 0020000        # character device
S_IFIFO = 0010000        # fifo

# Special permission bits.
TSUID   = 04000          # set UID on execution
TSGID   = 02000          # set GID on execution
TSVTX   = 01000          # reserved

# Permission bits for owner (TU*), group (TG*) and other (TO*).
TUREAD  = 0400           # read by owner
TUWRITE = 0200           # write by owner
TUEXEC  = 0100           # execute/search by owner
TGREAD  = 0040           # read by group
TGWRITE = 0020           # write by group
TGEXEC  = 0010           # execute/search by group
TOREAD  = 0004           # read by other
TOWRITE = 0002           # write by other
TOEXEC  = 0001           # execute/search by other
 163
#---------------------------------------------------------
# initialization
#---------------------------------------------------------
# Module-wide encoding: the filesystem encoding when the interpreter
# reports one, otherwise the interpreter's default encoding.
ENCODING = sys.getfilesystemencoding()
if ENCODING is None:
    ENCODING = sys.getdefaultencoding()
 170
 171 #---------------------------------------------------------
 172 # Some useful functions
 173 #---------------------------------------------------------
 174
 175 def stn(s, length):
 176     """Convert a python string to a null-terminated string buffer.
 177     """
 178     return s[:length] + (length - len(s)) * NUL
 179
 180 def nts(s):
 181     """Convert a null-terminated string field to a python string.
 182     """
 183     # Use the string up to the first null char.
 184     p = s.find("\0")
 185     if p == -1:
 186         return s
 187     return s[:p]
 188
 189 def nti(s):
 190     """Convert a number field to a python number.
 191     """
 192     # There are two possible encodings for a number field, see
 193     # itn() below.
 194     if s[0] != chr(0200):
 195         try:
 196             n = int(nts(s) or "0", 8)
 197         except ValueError:
 198             raise HeaderError("invalid header")
 199     else:
 200         n = 0L
 201         for i in xrange(len(s) - 1):
 202             n <<= 8
 203             n += ord(s[i + 1])
 204     return n
 205
 206 def itn(n, digits=8, format=DEFAULT_FORMAT):
 207     """Convert a python number to a number field.
 208     """
 209     # POSIX 1003.1-1988 requires numbers to be encoded as a string of
 210     # octal digits followed by a null-byte, this allows values up to
 211     # (8**(digits-1))-1. GNU tar allows storing numbers greater than
 212     # that if necessary. A leading 0200 byte indicates this particular
 213     # encoding, the following digits-1 bytes are a big-endian
 214     # representation. This allows values up to (256**(digits-1))-1.
 215     if 0 <= n < 8 ** (digits - 1):
 216         s = "%0*o" % (digits - 1, n) + NUL
 217     else:
 218         if format != GNU_FORMAT or n >= 256 ** (digits - 1):
 219             raise ValueError("overflow in number field")
 220
 221         if n < 0:
 222             # XXX We mimic GNU tar's behaviour with negative numbers,
 223             # this could raise OverflowError.
 224             n = struct.unpack("L", struct.pack("l", n))[0]
 225
 226         s = ""
 227         for i in xrange(digits - 1):
 228             s = chr(n & 0377) + s
 229             n >>= 8
 230         s = chr(0200) + s
 231     return s
 232
 233 def uts(s, encoding, errors):
 234     """Convert a unicode object to a string.
 235     """
 236     if errors == "utf-8":
 237         # An extra error handler similar to the -o invalid=UTF-8 option
 238         # in POSIX.1-2001. Replace untranslatable characters with their
 239         # UTF-8 representation.
 240         try:
 241             return s.encode(encoding, "strict")
 242         except UnicodeEncodeError:
 243             x = []
 244             for c in s:
 245                 try:
 246                     x.append(c.encode(encoding, "strict"))
 247                 except UnicodeEncodeError:
 248                     x.append(c.encode("utf8"))
 249             return "".join(x)
 250     else:
 251         return s.encode(encoding, errors)
 252
 253 def calc_chksums(buf):
 254     """Calculate the checksum for a member's header by summing up all
 255        characters except for the chksum field which is treated as if
 256        it was filled with spaces. According to the GNU tar sources,
 257        some tars (Sun and NeXT) calculate chksum with signed char,
 258        which will be different if there are chars in the buffer with
 259        the high bit set. So we calculate two checksums, unsigned and
 260        signed.
 261     """
 262     unsigned_chksum = 256 + sum(struct.unpack("148B", buf[:148]) + struct.unpack("356B", buf[156:512]))
 263     signed_chksum = 256 + sum(struct.unpack("148b", buf[:148]) + struct.unpack("356b", buf[156:512]))
 264     return unsigned_chksum, signed_chksum
 265
 266 def copyfileobj(src, dst, length=None):
 267     """Copy length bytes from fileobj src to fileobj dst.
 268        If length is None, copy the entire content.
 269     """
 270     if length == 0:
 271         return
 272     if length is None:
 273         shutil.copyfileobj(src, dst)
 274         return
 275
 276     BUFSIZE = 16 * 1024
 277     blocks, remainder = divmod(length, BUFSIZE)
 278     for b in xrange(blocks):
 279         buf = src.read(BUFSIZE)
 280         if len(buf) < BUFSIZE:
 281             raise IOError("end of file reached")
 282         dst.write(buf)
 283
 284     if remainder != 0:
 285         buf = src.read(remainder)
 286         if len(buf) < remainder:
 287             raise IOError("end of file reached")
 288         dst.write(buf)
 289     return
 290
# Lookup table used by filemode().  It holds one group per character of
# the resulting string: the file type first, then read/write/execute
# for owner, group and other.  Each group is a sequence of (bits, char)
# pairs; filemode() takes the char of the first pair whose bits are all
# set in the mode, so combined entries (e.g. TUEXEC|TSUID) must come
# before the entries for their individual bits.
filemode_table = (
    ((S_IFLNK,      "l"),
     (S_IFREG,      "-"),
     (S_IFBLK,      "b"),
     (S_IFDIR,      "d"),
     (S_IFCHR,      "c"),
     (S_IFIFO,      "p")),

    ((TUREAD,       "r"),),
    ((TUWRITE,      "w"),),
    ((TUEXEC|TSUID, "s"),
     (TSUID,        "S"),
     (TUEXEC,       "x")),

    ((TGREAD,       "r"),),
    ((TGWRITE,      "w"),),
    ((TGEXEC|TSGID, "s"),
     (TSGID,        "S"),
     (TGEXEC,       "x")),

    ((TOREAD,       "r"),),
    ((TOWRITE,      "w"),),
    ((TOEXEC|TSVTX, "t"),
     (TSVTX,        "T"),
     (TOEXEC,       "x"))
)
 317
 318 def filemode(mode):
 319     """Convert a file's mode to a string of the form
 320        -rwxrwxrwx.
 321        Used by TarFile.list()
 322     """
 323     perm = []
 324     for table in filemode_table:
 325         for bit, char in table:
 326             if mode & bit == bit:
 327                 perm.append(char)
 328                 break
 329         else:
 330             perm.append("-")
 331     return "".join(perm)
 332
 333 if os.sep != "/":
 334     normpath = lambda path: os.path.normpath(path).replace(os.sep, "/")
 335 else:
 336     normpath = os.path.normpath
 337
 338 class TarError(Exception):
 339     """Base exception."""
 340     pass
 341 class ExtractError(TarError):
 342     """General exception for extract errors."""
 343     pass
 344 class ReadError(TarError):
 345     """Exception for unreadble tar archives."""
 346     pass
 347 class CompressionError(TarError):
 348     """Exception for unavailable compression methods."""
 349     pass
 350 class StreamError(TarError):
 351     """Exception for unsupported operations on stream-like TarFiles."""
 352     pass
 353 class HeaderError(TarError):
 354     """Exception for invalid headers."""
 355     pass
 356
 357 #---------------------------
 358 # internal stream interface
 359 #---------------------------
 360 class _LowLevelFile:
 361     """Low-level file object. Supports reading and writing.
 362        It is used instead of a regular file object for streaming
 363        access.
 364     """
 365
 366     def __init__(self, name, mode):
 367         mode = {
 368             "r": os.O_RDONLY,
 369             "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
 370         }[mode]
 371         if hasattr(os, "O_BINARY"):
 372             mode |= os.O_BINARY
 373         self.fd = os.open(name, mode)
 374
 375     def close(self):
 376         os.close(self.fd)
 377
 378     def read(self, size):
 379         return os.read(self.fd, size)
 380
 381     def write(self, s):
 382         os.write(self.fd, s)
 383
 384 class _Stream:
 385     """Class that serves as an adapter between TarFile and
 386        a stream-like object.  The stream-like object only
 387        needs to have a read() or write() method and is accessed
 388        blockwise.  Use of gzip or bzip2 compression is possible.
 389        A stream-like object could be for example: sys.stdin,
 390        sys.stdout, a socket, a tape device etc.
 391
 392        _Stream is intended to be used only internally.
 393     """
 394
 395     def __init__(self, name, mode, comptype, fileobj, bufsize):
 396         """Construct a _Stream object.
 397         """
 398         self._extfileobj = True
 399         if fileobj is None:
 400             fileobj = _LowLevelFile(name, mode)
 401             self._extfileobj = False
 402
 403         if comptype == '*':
 404             # Enable transparent compression detection for the
 405             # stream interface
 406             fileobj = _StreamProxy(fileobj)
 407             comptype = fileobj.getcomptype()
 408
 409         self.name     = name or ""
 410         self.mode     = mode
 411         self.comptype = comptype
 412         self.fileobj  = fileobj
 413         self.bufsize  = bufsize
 414         self.buf      = ""
 415         self.pos      = 0L
 416         self.closed   = False
 417
 418         if comptype == "gz":
 419             try:
 420                 import zlib
 421             except ImportError:
 422                 raise CompressionError("zlib module is not available")
 423             self.zlib = zlib
 424             self.crc = zlib.crc32("") & 0xffffffffL
 425             if mode == "r":
 426                 self._init_read_gz()
 427             else:
 428                 self._init_write_gz()
 429
 430         if comptype == "bz2":
 431             try:
 432                 import bz2
 433             except ImportError:
 434                 raise CompressionError("bz2 module is not available")
 435             if mode == "r":
 436                 self.dbuf = ""
 437                 self.cmp = bz2.BZ2Decompressor()
 438             else:
 439                 self.cmp = bz2.BZ2Compressor()
 440
 441     def __del__(self):
 442         if hasattr(self, "closed") and not self.closed:
 443             self.close()
 444
 445     def _init_write_gz(self):
 446         """Initialize for writing with gzip compression.
 447         """
 448         self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
 449                                             -self.zlib.MAX_WBITS,
 450                                             self.zlib.DEF_MEM_LEVEL,
 451                                             0)
 452         timestamp = struct.pack("<L", long(time.time()))
 453         self.__write("\037\213\010\010%s\002\377" % timestamp)
 454         if self.name.endswith(".gz"):
 455             self.name = self.name[:-3]
 456         self.__write(self.name + NUL)
 457
 458     def write(self, s):
 459         """Write string s to the stream.
 460         """
 461         if self.comptype == "gz":
 462             self.crc = self.zlib.crc32(s, self.crc) & 0xffffffffL
 463         self.pos += len(s)
 464         if self.comptype != "tar":
 465             s = self.cmp.compress(s)
 466         self.__write(s)
 467
 468     def __write(self, s):
 469         """Write string s to the stream if a whole new block
 470            is ready to be written.
 471         """
 472         self.buf += s
 473         while len(self.buf) > self.bufsize:
 474             self.fileobj.write(self.buf[:self.bufsize])
 475             self.buf = self.buf[self.bufsize:]
 476
 477     def close(self):
 478         """Close the _Stream object. No operation should be
 479            done on it afterwards.
 480         """
 481         if self.closed:
 482             return
 483
 484         if self.mode == "w" and self.comptype != "tar":
 485             self.buf += self.cmp.flush()
 486
 487         if self.mode == "w" and self.buf:
 488             self.fileobj.write(self.buf)
 489             self.buf = ""
 490             if self.comptype == "gz":
 491                 # The native zlib crc is an unsigned 32-bit integer, but
 492                 # the Python wrapper implicitly casts that to a signed C
 493                 # long.  So, on a 32-bit box self.crc may "look negative",
 494                 # while the same crc on a 64-bit box may "look positive".
 495                 # To avoid irksome warnings from the `struct` module, force
 496                 # it to look positive on all boxes.
 497                 self.fileobj.write(struct.pack("<L", self.crc & 0xffffffffL))
 498                 self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFFL))
 499
 500         if not self._extfileobj:
 501             self.fileobj.close()
 502
 503         self.closed = True
 504
 505     def _init_read_gz(self):
 506         """Initialize for reading a gzip compressed fileobj.
 507         """
 508         self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
 509         self.dbuf = ""
 510
 511         # taken from gzip.GzipFile with some alterations
 512         if self.__read(2) != "\037\213":
 513             raise ReadError("not a gzip file")
 514         if self.__read(1) != "\010":
 515             raise CompressionError("unsupported compression method")
 516
 517         flag = ord(self.__read(1))
 518         self.__read(6)
 519
 520         if flag & 4:
 521             xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
 522             self.read(xlen)
 523         if flag & 8:
 524             while True:
 525                 s = self.__read(1)
 526                 if not s or s == NUL:
 527                     break
 528         if flag & 16:
 529             while True:
 530                 s = self.__read(1)
 531                 if not s or s == NUL:
 532                     break
 533         if flag & 2:
 534             self.__read(2)
 535
 536     def tell(self):
 537         """Return the stream's file pointer position.
 538         """
 539         return self.pos
 540
 541     def seek(self, pos=0):
 542         """Set the stream's file pointer to pos. Negative seeking
 543            is forbidden.
 544         """
 545         if pos - self.pos >= 0:
 546             blocks, remainder = divmod(pos - self.pos, self.bufsize)
 547             for i in xrange(blocks):
 548                 self.read(self.bufsize)
 549             self.read(remainder)
 550         else:
 551             raise StreamError("seeking backwards is not allowed")
 552         return self.pos
 553
 554     def read(self, size=None):
 555         """Return the next size number of bytes from the stream.
 556            If size is not defined, return all bytes of the stream
 557            up to EOF.
 558         """
 559         if size is None:
 560             t = []
 561             while True:
 562                 buf = self._read(self.bufsize)
 563                 if not buf:
 564                     break
 565                 t.append(buf)
 566             buf = "".join(t)
 567         else:
 568             buf = self._read(size)
 569         self.pos += len(buf)
 570         return buf
 571
 572     def _read(self, size):
 573         """Return size bytes from the stream.
 574         """
 575         if self.comptype == "tar":
 576             return self.__read(size)
 577
 578         c = len(self.dbuf)
 579         t = [self.dbuf]
 580         while c < size:
 581             buf = self.__read(self.bufsize)
 582             if not buf:
 583                 break
 584             try:
 585                 buf = self.cmp.decompress(buf)
 586             except IOError:
 587                 raise ReadError("invalid compressed data")
 588             t.append(buf)
 589             c += len(buf)
 590         t = "".join(t)
 591         self.dbuf = t[size:]
 592         return t[:size]
 593
 594     def __read(self, size):
 595         """Return size bytes from stream. If internal buffer is empty,
 596            read another block from the stream.
 597         """
 598         c = len(self.buf)
 599         t = [self.buf]
 600         while c < size:
 601             buf = self.fileobj.read(self.bufsize)
 602             if not buf:
 603                 break
 604             t.append(buf)
 605             c += len(buf)
 606         t = "".join(t)
 607         self.buf = t[size:]
 608         return t[:size]
 609 # class _Stream
 610
 611 class _StreamProxy(object):
 612     """Small proxy class that enables transparent compression
 613        detection for the Stream interface (mode 'r|*').
 614     """
 615
 616     def __init__(self, fileobj):
 617         self.fileobj = fileobj
 618         self.buf = self.fileobj.read(BLOCKSIZE)
 619
 620     def read(self, size):
 621         self.read = self.fileobj.read
 622         return self.buf
 623
 624     def getcomptype(self):
 625         if self.buf.startswith("\037\213\010"):
 626             return "gz"
 627         if self.buf.startswith("BZh91"):
 628             return "bz2"
 629         return "tar"
 630
 631     def close(self):
 632         self.fileobj.close()
 633 # class StreamProxy
 634
 635 class _BZ2Proxy(object):
 636     """Small proxy class that enables external file object
 637        support for "r:bz2" and "w:bz2" modes. This is actually
 638        a workaround for a limitation in bz2 module's BZ2File
 639        class which (unlike gzip.GzipFile) has no support for
 640        a file object argument.
 641     """
 642
 643     blocksize = 16 * 1024
 644
 645     def __init__(self, fileobj, mode):
 646         self.fileobj = fileobj
 647         self.mode = mode
 648         self.name = getattr(self.fileobj, "name", None)
 649         self.init()
 650
 651     def init(self):
 652         import bz2
 653         self.pos = 0
 654         if self.mode == "r":
 655             self.bz2obj = bz2.BZ2Decompressor()
 656             self.fileobj.seek(0)
 657             self.buf = ""
 658         else:
 659             self.bz2obj = bz2.BZ2Compressor()
 660
 661     def read(self, size):
 662         b = [self.buf]
 663         x = len(self.buf)
 664         while x < size:
 665             try:
 666                 raw = self.fileobj.read(self.blocksize)
 667                 data = self.bz2obj.decompress(raw)
 668                 b.append(data)
 669             except EOFError:
 670                 break
 671             x += len(data)
 672         self.buf = "".join(b)
 673
 674         buf = self.buf[:size]
 675         self.buf = self.buf[size:]
 676         self.pos += len(buf)
 677         return buf
 678
 679     def seek(self, pos):
 680         if pos < self.pos:
 681             self.init()
 682         self.read(pos - self.pos)
 683
 684     def tell(self):
 685         return self.pos
 686
 687     def write(self, data):
 688         self.pos += len(data)
 689         raw = self.bz2obj.compress(data)
 690         self.fileobj.write(raw)
 691
 692     def close(self):
 693         if self.mode == "w":
 694             raw = self.bz2obj.flush()
 695             self.fileobj.write(raw)
 696 # class _BZ2Proxy
 697
 698 #------------------------
 699 # Extraction file object
 700 #------------------------
 701 class _FileInFile(object):
 702     """A thin wrapper around an existing file object that
 703        provides a part of its data as an individual file
 704        object.
 705     """
 706
 707     def __init__(self, fileobj, offset, size, sparse=None):
 708         self.fileobj = fileobj
 709         self.offset = offset
 710         self.size = size
 711         self.sparse = sparse
 712         self.position = 0
 713
 714     def tell(self):
 715         """Return the current file position.
 716         """
 717         return self.position
 718
 719     def seek(self, position):
 720         """Seek to a position in the file.
 721         """
 722         self.position = position
 723
 724     def read(self, size=None):
 725         """Read data from the file.
 726         """
 727         if size is None:
 728             size = self.size - self.position
 729         else:
 730             size = min(size, self.size - self.position)
 731
 732         if self.sparse is None:
 733             return self.readnormal(size)
 734         else:
 735             return self.readsparse(size)
 736
 737     def readnormal(self, size):
 738         """Read operation for regular files.
 739         """
 740         self.fileobj.seek(self.offset + self.position)
 741         self.position += size
 742         return self.fileobj.read(size)
 743
 744     def readsparse(self, size):
 745         """Read operation for sparse files.
 746         """
 747         data = []
 748         while size > 0:
 749             buf = self.readsparsesection(size)
 750             if not buf:
 751                 break
 752             size -= len(buf)
 753             data.append(buf)
 754         return "".join(data)
 755
 756     def readsparsesection(self, size):
 757         """Read a single section of a sparse file.
 758         """
 759         section = self.sparse.find(self.position)
 760
 761         if section is None:
 762             return ""
 763
 764         size = min(size, section.offset + section.size - self.position)
 765
 766         if isinstance(section, _data):
 767             realpos = section.realpos + self.position - section.offset
 768             self.fileobj.seek(self.offset + realpos)
 769             self.position += size
 770             return self.fileobj.read(size)
 771         else:
 772             self.position += size
 773             return NUL * size
 774 #class _FileInFile
 775
 776
 777 class ExFileObject(object):
 778     """File-like object for reading an archive member.
 779        Is returned by TarFile.extractfile().
 780     """
 781     blocksize = 1024
 782
 783     def __init__(self, tarfile, tarinfo):
 784         self.fileobj = _FileInFile(tarfile.fileobj,
 785                                    tarinfo.offset_data,
 786                                    tarinfo.size,
 787                                    getattr(tarinfo, "sparse", None))
 788         self.name = tarinfo.name
 789         self.mode = "r"
 790         self.closed = False
 791         self.size = tarinfo.size
 792
 793         self.position = 0
 794         self.buffer = ""
 795
 796     def read(self, size=None):
 797         """Read at most size bytes from the file. If size is not
 798            present or None, read all data until EOF is reached.
 799         """
 800         if self.closed:
 801             raise ValueError("I/O operation on closed file")
 802
 803         buf = ""
 804         if self.buffer:
 805             if size is None:
 806                 buf = self.buffer
 807                 self.buffer = ""
 808             else:
 809                 buf = self.buffer[:size]
 810                 self.buffer = self.buffer[size:]
 811
 812         if size is None:
 813             buf += self.fileobj.read()
 814         else:
 815             buf += self.fileobj.read(size - len(buf))
 816
 817         self.position += len(buf)
 818         return buf
 819
 820     def readline(self, size=-1):
 821         """Read one entire line from the file. If size is present
 822            and non-negative, return a string with at most that
 823            size, which may be an incomplete line.
 824         """
 825         if self.closed:
 826             raise ValueError("I/O operation on closed file")
 827
 828         if "\n" in self.buffer:
 829             pos = self.buffer.find("\n") + 1
 830         else:
 831             buffers = [self.buffer]
 832             while True:
 833                 buf = self.fileobj.read(self.blocksize)
 834                 buffers.append(buf)
 835                 if not buf or "\n" in buf:
 836                     self.buffer = "".join(buffers)
 837                     pos = self.buffer.find("\n") + 1
 838                     if pos == 0:
 839                         # no newline found.
 840                         pos = len(self.buffer)
 841                     break
 842
 843         if size != -1:
 844             pos = min(size, pos)
 845
 846         buf = self.buffer[:pos]
 847         self.buffer = self.buffer[pos:]
 848         self.position += len(buf)
 849         return buf
 850
 851     def readlines(self):
 852         """Return a list with all remaining lines.
 853         """
 854         result = []
 855         while True:
 856             line = self.readline()
 857             if not line: break
 858             result.append(line)
 859         return result
 860
 861     def tell(self):
 862         """Return the current file position.
 863         """
 864         if self.closed:
 865             raise ValueError("I/O operation on closed file")
 866
 867         return self.position
 868
 869     def seek(self, pos, whence=os.SEEK_SET):
 870         """Seek to a position in the file.
 871         """
 872         if self.closed:
 873             raise ValueError("I/O operation on closed file")
 874
 875         if whence == os.SEEK_SET:
 876             self.position = min(max(pos, 0), self.size)
 877         elif whence == os.SEEK_CUR:
 878             if pos < 0:
 879                 self.position = max(self.position + pos, 0)
 880             else:
 881                 self.position = min(self.position + pos, self.size)
 882         elif whence == os.SEEK_END:
 883             self.position = max(min(self.size + pos, self.size), 0)
 884         else:
 885             raise ValueError("Invalid argument")
 886
 887         self.buffer = ""
 888         self.fileobj.seek(self.position)
 889
    def close(self):
        """Close the file object.

           Only the closed flag is set here; tell() and seek() check
           this flag and raise ValueError afterwards.
        """
        self.closed = True
 894
 895     def __iter__(self):
 896         """Get an iterator over the file's lines.
 897         """
 898         while True:
 899             line = self.readline()
 900             if not line:
 901                 break
 902             yield line
 903 #class ExFileObject
 904
 905 #------------------
 906 # Exported Classes
 907 #------------------
 908 class TarInfo(object):
 909     """Informational class which holds the details about an
 910        archive member given by a tar header block.
 911        TarInfo objects are returned by TarFile.getmember(),
 912        TarFile.getmembers() and TarFile.gettarinfo() and are
 913        usually created internally.
 914     """
 915
 916     def __init__(self, name=""):
 917         """Construct a TarInfo object. name is the optional name
 918            of the member.
 919         """
 920         self.name = name        # member name
 921         self.mode = 0644        # file permissions
 922         self.uid = 0            # user id
 923         self.gid = 0            # group id
 924         self.size = 0           # file size
 925         self.mtime = 0          # modification time
 926         self.chksum = 0         # header checksum
 927         self.type = REGTYPE     # member type
 928         self.linkname = ""      # link name
 929         self.uname = "root"     # user name
 930         self.gname = "root"     # group name
 931         self.devmajor = 0       # device major number
 932         self.devminor = 0       # device minor number
 933
 934         self.offset = 0         # the tar header starts here
 935         self.offset_data = 0    # the file's data starts here
 936
 937         self.pax_headers = {}   # pax header information
 938
 939     # In pax headers the "name" and "linkname" field are called
 940     # "path" and "linkpath".
 941     def _getpath(self):
 942         return self.name
 943     def _setpath(self, name):
 944         self.name = name
 945     path = property(_getpath, _setpath)
 946
 947     def _getlinkpath(self):
 948         return self.linkname
 949     def _setlinkpath(self, linkname):
 950         self.linkname = linkname
 951     linkpath = property(_getlinkpath, _setlinkpath)
 952
 953     def __repr__(self):
 954         return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
 955
 956     def get_info(self, encoding, errors):
 957         """Return the TarInfo's attributes as a dictionary.
 958         """
 959         info = {
 960             "name":     normpath(self.name),
 961             "mode":     self.mode & 07777,
 962             "uid":      self.uid,
 963             "gid":      self.gid,
 964             "size":     self.size,
 965             "mtime":    self.mtime,
 966             "chksum":   self.chksum,
 967             "type":     self.type,
 968             "linkname": normpath(self.linkname) if self.linkname else "",
 969             "uname":    self.uname,
 970             "gname":    self.gname,
 971             "devmajor": self.devmajor,
 972             "devminor": self.devminor
 973         }
 974
 975         if info["type"] == DIRTYPE and not info["name"].endswith("/"):
 976             info["name"] += "/"
 977
 978         for key in ("name", "linkname", "uname", "gname"):
 979             if type(info[key]) is unicode:
 980                 info[key] = info[key].encode(encoding, errors)
 981
 982         return info
 983
 984     def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="strict"):
 985         """Return a tar header as a string of 512 byte blocks.
 986         """
 987         info = self.get_info(encoding, errors)
 988
 989         if format == USTAR_FORMAT:
 990             return self.create_ustar_header(info)
 991         elif format == GNU_FORMAT:
 992             return self.create_gnu_header(info)
 993         elif format == PAX_FORMAT:
 994             return self.create_pax_header(info, encoding, errors)
 995         else:
 996             raise ValueError("invalid format")
 997
 998     def create_ustar_header(self, info):
 999         """Return the object as a ustar header block.
1000         """
1001         info["magic"] = POSIX_MAGIC
1002
1003         if len(info["linkname"]) > LENGTH_LINK:
1004             raise ValueError("linkname is too long")
1005
1006         if len(info["name"]) > LENGTH_NAME:
1007             info["prefix"], info["name"] = self._posix_split_name(info["name"])
1008
1009         return self._create_header(info, USTAR_FORMAT)
1010
1011     def create_gnu_header(self, info):
1012         """Return the object as a GNU header block sequence.
1013         """
1014         info["magic"] = GNU_MAGIC
1015
1016         buf = ""
1017         if len(info["linkname"]) > LENGTH_LINK:
1018             buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK)
1019
1020         if len(info["name"]) > LENGTH_NAME:
1021             buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME)
1022
1023         return buf + self._create_header(info, GNU_FORMAT)
1024
1025     def create_pax_header(self, info, encoding, errors):
1026         """Return the object as a ustar header block. If it cannot be
1027            represented this way, prepend a pax extended header sequence
1028            with supplement information.
1029         """
1030         info["magic"] = POSIX_MAGIC
1031         pax_headers = self.pax_headers.copy()
1032
1033         # Test string fields for values that exceed the field length or cannot
1034         # be represented in ASCII encoding.
1035         for name, hname, length in (
1036                 ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
1037                 ("uname", "uname", 32), ("gname", "gname", 32)):
1038
1039             if hname in pax_headers:
1040                 # The pax header has priority.
1041                 continue
1042
1043             val = info[name].decode(encoding, errors)
1044
1045             # Try to encode the string as ASCII.
1046             try:
1047                 val.encode("ascii")
1048             except UnicodeEncodeError:
1049                 pax_headers[hname] = val
1050                 continue
1051
1052             if len(info[name]) > length:
1053                 pax_headers[hname] = val
1054
1055         # Test number fields for values that exceed the field limit or values
1056         # that like to be stored as float.
1057         for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
1058             if name in pax_headers:
1059                 # The pax header has priority. Avoid overflow.
1060                 info[name] = 0
1061                 continue
1062
1063             val = info[name]
1064             if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
1065                 pax_headers[name] = unicode(val)
1066                 info[name] = 0
1067
1068         # Create a pax extended header if necessary.
1069         if pax_headers:
1070             buf = self._create_pax_generic_header(pax_headers)
1071         else:
1072             buf = ""
1073
1074         return buf + self._create_header(info, USTAR_FORMAT)
1075
    @classmethod
    def create_pax_global_header(cls, pax_headers):
        """Return the object as a pax global header block sequence.

           pax_headers is the dictionary of keyword, value pairs to
           store. The work is done by _create_pax_generic_header()
           with the type set to XGLTYPE.
        """
        return cls._create_pax_generic_header(pax_headers, type=XGLTYPE)
1081
1082     def _posix_split_name(self, name):
1083         """Split a name longer than 100 chars into a prefix
1084            and a name part.
1085         """
1086         prefix = name[:LENGTH_PREFIX + 1]
1087         while prefix and prefix[-1] != "/":
1088             prefix = prefix[:-1]
1089
1090         name = name[len(prefix):]
1091         prefix = prefix[:-1]
1092
1093         if not prefix or len(name) > LENGTH_NAME:
1094             raise ValueError("name is too long")
1095         return prefix, name
1096
1097     @staticmethod
1098     def _create_header(info, format):
1099         """Return a header block. info is a dictionary with file
1100            information, format must be one of the *_FORMAT constants.
1101         """
1102         parts = [
1103             stn(info.get("name", ""), 100),
1104             itn(info.get("mode", 0) & 07777, 8, format),
1105             itn(info.get("uid", 0), 8, format),
1106             itn(info.get("gid", 0), 8, format),
1107             itn(info.get("size", 0), 12, format),
1108             itn(info.get("mtime", 0), 12, format),
1109             "        ", # checksum field
1110             info.get("type", REGTYPE),
1111             stn(info.get("linkname", ""), 100),
1112             stn(info.get("magic", POSIX_MAGIC), 8),
1113             stn(info.get("uname", "root"), 32),
1114             stn(info.get("gname", "root"), 32),
1115             itn(info.get("devmajor", 0), 8, format),
1116             itn(info.get("devminor", 0), 8, format),
1117             stn(info.get("prefix", ""), 155)
1118         ]
1119
1120         buf = struct.pack("%ds" % BLOCKSIZE, "".join(parts))
1121         chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
1122         buf = buf[:-364] + "%06o\0" % chksum + buf[-357:]
1123         return buf
1124
1125     @staticmethod
1126     def _create_payload(payload):
1127         """Return the string payload filled with zero bytes
1128            up to the next 512 byte border.
1129         """
1130         blocks, remainder = divmod(len(payload), BLOCKSIZE)
1131         if remainder > 0:
1132             payload += (BLOCKSIZE - remainder) * NUL
1133         return payload
1134
1135     @classmethod
1136     def _create_gnu_long_header(cls, name, type):
1137         """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
1138            for name.
1139         """
1140         name += NUL
1141
1142         info = {}
1143         info["name"] = "././@LongLink"
1144         info["type"] = type
1145         info["size"] = len(name)
1146         info["magic"] = GNU_MAGIC
1147
1148         # create extended header + name blocks.
1149         return cls._create_header(info, USTAR_FORMAT) + \
1150                 cls._create_payload(name)
1151
1152     @classmethod
1153     def _create_pax_generic_header(cls, pax_headers, type=XHDTYPE):
1154         """Return a POSIX.1-2001 extended or global header sequence
1155            that contains a list of keyword, value pairs. The values
1156            must be unicode objects.
1157         """
1158         records = []
1159         for keyword, value in pax_headers.iteritems():
1160             keyword = keyword.encode("utf8")
1161             value = value.encode("utf8")
1162             l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
1163             n = p = 0
1164             while True:
1165                 n = l + len(str(p))
1166                 if n == p:
1167                     break
1168                 p = n
1169             records.append("%d %s=%s\n" % (p, keyword, value))
1170         records = "".join(records)
1171
1172         # We use a hardcoded "././@PaxHeader" name like star does
1173         # instead of the one that POSIX recommends.
1174         info = {}
1175         info["name"] = "././@PaxHeader"
1176         info["type"] = type
1177         info["size"] = len(records)
1178         info["magic"] = POSIX_MAGIC
1179
1180         # Create pax header + record blocks.
1181         return cls._create_header(info, USTAR_FORMAT) + \
1182                 cls._create_payload(records)
1183
1184     @classmethod
1185     def frombuf(cls, buf):
1186         """Construct a TarInfo object from a 512 byte string buffer.
1187         """
1188         if len(buf) != BLOCKSIZE:
1189             raise HeaderError("truncated header")
1190         if buf.count(NUL) == BLOCKSIZE:
1191             raise HeaderError("empty header")
1192
1193         chksum = nti(buf[148:156])
1194         if chksum not in calc_chksums(buf):
1195             raise HeaderError("bad checksum")
1196
1197         obj = cls()
1198         obj.buf = buf
1199         obj.name = nts(buf[0:100])
1200         obj.mode = nti(buf[100:108])
1201         obj.uid = nti(buf[108:116])
1202         obj.gid = nti(buf[116:124])
1203         obj.size = nti(buf[124:136])
1204         obj.mtime = nti(buf[136:148])
1205         obj.chksum = chksum
1206         obj.type = buf[156:157]
1207         obj.linkname = nts(buf[157:257])
1208         obj.uname = nts(buf[265:297])
1209         obj.gname = nts(buf[297:329])
1210         obj.devmajor = nti(buf[329:337])
1211         obj.devminor = nti(buf[337:345])
1212         prefix = nts(buf[345:500])
1213
1214         # Old V7 tar format represents a directory as a regular
1215         # file with a trailing slash.
1216         if obj.type == AREGTYPE and obj.name.endswith("/"):
1217             obj.type = DIRTYPE
1218
1219         # Remove redundant slashes from directories.
1220         if obj.isdir():
1221             obj.name = obj.name.rstrip("/")
1222
1223         # Reconstruct a ustar longname.
1224         if prefix and obj.type not in GNU_TYPES:
1225             obj.name = prefix + "/" + obj.name
1226         return obj
1227
1228     @classmethod
1229     def fromtarfile(cls, tarfile):
1230         """Return the next TarInfo object from TarFile object
1231            tarfile.
1232         """
1233         buf = tarfile.fileobj.read(BLOCKSIZE)
1234         if not buf:
1235             return
1236         obj = cls.frombuf(buf)
1237         obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
1238         return obj._proc_member(tarfile)
1239
1240     #--------------------------------------------------------------------------
    # The following methods are called depending on the type of a member.
    # The entry point is _proc_member(), which can be overridden in a
    # subclass to add custom _proc_*() methods. A _proc_*() method MUST
    # implement the following operations:
1246     # 1. Set self.offset_data to the position where the data blocks begin,
1247     #    if there is data that follows.
1248     # 2. Set tarfile.offset to the position where the next member's header will
1249     #    begin.
1250     # 3. Return self or another valid TarInfo object.
1251     def _proc_member(self, tarfile):
1252         """Choose the right processing method depending on
1253            the type and call it.
1254         """
1255         if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1256             return self._proc_gnulong(tarfile)
1257         elif self.type == GNUTYPE_SPARSE:
1258             return self._proc_sparse(tarfile)
1259         elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1260             return self._proc_pax(tarfile)
1261         else:
1262             return self._proc_builtin(tarfile)
1263
1264     def _proc_builtin(self, tarfile):
1265         """Process a builtin type or an unknown type which
1266            will be treated as a regular file.
1267         """
1268         self.offset_data = tarfile.fileobj.tell()
1269         offset = self.offset_data
1270         if self.isreg() or self.type not in SUPPORTED_TYPES:
1271             # Skip the following data blocks.
1272             offset += self._block(self.size)
1273         tarfile.offset = offset
1274
1275         # Patch the TarInfo object with saved global
1276         # header information.
1277         self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
1278
1279         return self
1280
1281     def _proc_gnulong(self, tarfile):
1282         """Process the blocks that hold a GNU longname
1283            or longlink member.
1284         """
1285         buf = tarfile.fileobj.read(self._block(self.size))
1286
1287         # Fetch the next header and process it.
1288         next = self.fromtarfile(tarfile)
1289         if next is None:
1290             raise HeaderError("missing subsequent header")
1291
1292         # Patch the TarInfo object from the next header with
1293         # the longname information.
1294         next.offset = self.offset
1295         if self.type == GNUTYPE_LONGNAME:
1296             next.name = nts(buf)
1297         elif self.type == GNUTYPE_LONGLINK:
1298             next.linkname = nts(buf)
1299
1300         return next
1301
    def _proc_sparse(self, tarfile):
        """Process a GNU sparse header plus extra headers.

           Builds a list of _hole and _data sections from the sparse
           structs, stores it in self.sparse, sets self.offset_data and
           tarfile.offset and replaces self.size with the original
           size found in the header. Returns self.
        """
        buf = self.buf
        sp = _ringbuffer()
        pos = 386
        # lastpos is the end of the previous data section, realpos the
        # number of data bytes that have been accounted for so far.
        lastpos = 0L
        realpos = 0L
        # There are 4 possible sparse structs in the
        # first header.
        for i in xrange(4):
            # A struct is made of two 12 byte number fields. The first
            # pair that nti() rejects ends the list.
            try:
                offset = nti(buf[pos:pos + 12])
                numbytes = nti(buf[pos + 12:pos + 24])
            except ValueError:
                break
            # A gap between the previous section and this one is a hole.
            if offset > lastpos:
                sp.append(_hole(lastpos, offset - lastpos))
            sp.append(_data(offset, numbytes, realpos))
            realpos += numbytes
            lastpos = offset + numbytes
            pos += 24

        isextended = ord(buf[482])
        origsize = nti(buf[483:495])

        # If the isextended flag is given,
        # there are extra headers to process.
        while isextended == 1:
            buf = tarfile.fileobj.read(BLOCKSIZE)
            pos = 0
            # An extra header holds up to 21 structs, beginning
            # at the start of the block.
            for i in xrange(21):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                if offset > lastpos:
                    sp.append(_hole(lastpos, offset - lastpos))
                sp.append(_data(offset, numbytes, realpos))
                realpos += numbytes
                lastpos = offset + numbytes
                pos += 24
            # Each extra header has its own isextended flag.
            isextended = ord(buf[504])

        # The rest of the file after the last data section is a hole.
        if lastpos < origsize:
            sp.append(_hole(lastpos, origsize - lastpos))

        self.sparse = sp

        self.offset_data = tarfile.fileobj.tell()
        # The next header position must be computed from the size in
        # the header before self.size is replaced by the original size.
        tarfile.offset = self.offset_data + self._block(self.size)
        self.size = origsize

        return self
1357
1358     def _proc_pax(self, tarfile):
1359         """Process an extended or global header as described in
1360            POSIX.1-2001.
1361         """
1362         # Read the header information.
1363         buf = tarfile.fileobj.read(self._block(self.size))
1364
1365         # A pax header stores supplemental information for either
1366         # the following file (extended) or all following files
1367         # (global).
1368         if self.type == XGLTYPE:
1369             pax_headers = tarfile.pax_headers
1370         else:
1371             pax_headers = tarfile.pax_headers.copy()
1372
1373         # Parse pax header information. A record looks like that:
1374         # "%d %s=%s\n" % (length, keyword, value). length is the size
1375         # of the complete record including the length field itself and
1376         # the newline. keyword and value are both UTF-8 encoded strings.
1377         regex = re.compile(r"(\d+) ([^=]+)=", re.U)
1378         pos = 0
1379         while True:
1380             match = regex.match(buf, pos)
1381             if not match:
1382                 break
1383
1384             length, keyword = match.groups()
1385             length = int(length)
1386             value = buf[match.end(2) + 1:match.start(1) + length - 1]
1387
1388             keyword = keyword.decode("utf8")
1389             value = value.decode("utf8")
1390
1391             pax_headers[keyword] = value
1392             pos += length
1393
1394         # Fetch the next header.
1395         next = self.fromtarfile(tarfile)
1396
1397         if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
1398             if next is None:
1399                 raise HeaderError("missing subsequent header")
1400
1401             # Patch the TarInfo object with the extended header info.
1402             next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
1403             next.offset = self.offset
1404
1405             if "size" in pax_headers:
1406                 # If the extended header replaces the size field,
1407                 # we need to recalculate the offset where the next
1408                 # header starts.
1409                 offset = next.offset_data
1410                 if next.isreg() or next.type not in SUPPORTED_TYPES:
1411                     offset += next._block(next.size)
1412                 tarfile.offset = offset
1413
1414         return next
1415
1416     def _apply_pax_info(self, pax_headers, encoding, errors):
1417         """Replace fields with supplemental information from a previous
1418            pax extended or global header.
1419         """
1420         for keyword, value in pax_headers.iteritems():
1421             if keyword not in PAX_FIELDS:
1422                 continue
1423
1424             if keyword == "path":
1425                 value = value.rstrip("/")
1426
1427             if keyword in PAX_NUMBER_FIELDS:
1428                 try:
1429                     value = PAX_NUMBER_FIELDS[keyword](value)
1430                 except ValueError:
1431                     value = 0
1432             else:
1433                 value = uts(value, encoding, errors)
1434
1435             setattr(self, keyword, value)
1436
1437         self.pax_headers = pax_headers.copy()
1438
1439     def _block(self, count):
1440         """Round up a byte count by BLOCKSIZE and return it,
1441            e.g. _block(834) => 1024.
1442         """
1443         blocks, remainder = divmod(count, BLOCKSIZE)
1444         if remainder:
1445             blocks += 1
1446         return blocks * BLOCKSIZE
1447
    def isreg(self):
        """Return True if the member's type is one of REGULAR_TYPES."""
        return self.type in REGULAR_TYPES
    def isfile(self):
        """Return the result of isreg()."""
        return self.isreg()
    def isdir(self):
        """Return True if the member's type is DIRTYPE."""
        return self.type == DIRTYPE
    def issym(self):
        """Return True if the member's type is SYMTYPE."""
        return self.type == SYMTYPE
    def islnk(self):
        """Return True if the member's type is LNKTYPE."""
        return self.type == LNKTYPE
    def ischr(self):
        """Return True if the member's type is CHRTYPE."""
        return self.type == CHRTYPE
    def isblk(self):
        """Return True if the member's type is BLKTYPE."""
        return self.type == BLKTYPE
    def isfifo(self):
        """Return True if the member's type is FIFOTYPE."""
        return self.type == FIFOTYPE
    def issparse(self):
        """Return True if the member's type is GNUTYPE_SPARSE."""
        return self.type == GNUTYPE_SPARSE
    def isdev(self):
        """Return True if the member's type is CHRTYPE, BLKTYPE or
           FIFOTYPE.
        """
        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1468 # class TarInfo
1469
1470 class TarFile(object):
1471     """The TarFile Class provides an interface to tar archives.
1472     """
1473
    # The following class attributes are defaults. __init__() overrides
    # them on the instance for every argument that is not None.

    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    errorlevel = 0              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.
                                # __init__() always sets it on the instance:
                                # "utf-8" in mode 'r', else "strict", unless
                                # an explicit value is given.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The default ExFileObject class to use.
1495
1496     def __init__(self, name=None, mode="r", fileobj=None, format=None,
1497             tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
1498             errors=None, pax_headers=None, debug=None, errorlevel=None):
1499         """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
1500            read from an existing archive, 'a' to append data to an existing
1501            file or 'w' to create a new file overwriting an existing one. `mode'
1502            defaults to 'r'.
1503            If `fileobj' is given, it is used for reading or writing data. If it
1504            can be determined, `mode' is overridden by `fileobj's mode.
1505            `fileobj' is not closed, when TarFile is closed.
1506         """
1507         if len(mode) > 1 or mode not in "raw":
1508             raise ValueError("mode must be 'r', 'a' or 'w'")
1509         self.mode = mode
1510         self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]
1511
1512         if not fileobj:
1513             if self.mode == "a" and not os.path.exists(name):
1514                 # Create nonexistent files in append mode.
1515                 self.mode = "w"
1516                 self._mode = "wb"
1517             fileobj = bltn_open(name, self._mode)
1518             self._extfileobj = False
1519         else:
1520             if name is None and hasattr(fileobj, "name"):
1521                 name = fileobj.name
1522             if hasattr(fileobj, "mode"):
1523                 self._mode = fileobj.mode
1524             self._extfileobj = True
1525         self.name = os.path.abspath(name) if name else None
1526         self.fileobj = fileobj
1527
1528         # Init attributes.
1529         if format is not None:
1530             self.format = format
1531         if tarinfo is not None:
1532             self.tarinfo = tarinfo
1533         if dereference is not None:
1534             self.dereference = dereference
1535         if ignore_zeros is not None:
1536             self.ignore_zeros = ignore_zeros
1537         if encoding is not None:
1538             self.encoding = encoding
1539
1540         if errors is not None:
1541             self.errors = errors
1542         elif mode == "r":
1543             self.errors = "utf-8"
1544         else:
1545             self.errors = "strict"
1546
1547         if pax_headers is not None and self.format == PAX_FORMAT:
1548             self.pax_headers = pax_headers
1549         else:
1550             self.pax_headers = {}
1551
1552         if debug is not None:
1553             self.debug = debug
1554         if errorlevel is not None:
1555             self.errorlevel = errorlevel
1556
1557         # Init datastructures.
1558         self.closed = False
1559         self.members = []       # list of members as TarInfo objects
1560         self._loaded = False    # flag if all members have been read
1561         self.offset = self.fileobj.tell()
1562                                 # current position in the archive file
1563         self.inodes = {}        # dictionary caching the inodes of
1564                                 # archive members already added
1565
1566         if self.mode == "r":
1567             self.firstmember = None
1568             self.firstmember = self.next()
1569
1570         if self.mode == "a":
1571             # Move to the end of the archive,
1572             # before the first empty block.
1573             self.firstmember = None
1574             while True:
1575                 if self.next() is None:
1576                     if self.offset > 0:
1577                         self.fileobj.seek(- BLOCKSIZE, 1)
1578                     break
1579
1580         if self.mode in "aw":
1581             self._loaded = True
1582
1583             if self.pax_headers:
1584                 buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
1585                 self.fileobj.write(buf)
1586                 self.offset += len(buf)
1587
1588     def _getposix(self):
1589         return self.format == USTAR_FORMAT
1590     def _setposix(self, value):
1591         import warnings
1592         warnings.warn("use the format attribute instead", DeprecationWarning)
1593         if value:
1594             self.format = USTAR_FORMAT
1595         else:
1596             self.format = GNU_FORMAT
1597     posix = property(_getposix, _setposix)
1598
1599     #--------------------------------------------------------------------------
1600     # Below are the classmethods which act as alternate constructors to the
1601     # TarFile class. The open() method is the only one that is needed for
1602     # public use; it is the "super"-constructor and is able to select an
1603     # adequate "sub"-constructor for a particular compression using the mapping
1604     # from OPEN_METH.
1605     #
1606     # This concept allows one to subclass TarFile without losing the comfort of
1607     # the super-constructor. A sub-constructor is registered and made available
1608     # by adding it to the mapping in OPEN_METH.
1609
1610     @classmethod
1611     def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
1612         """Open a tar archive for reading, writing or appending. Return
1613            an appropriate TarFile class.
1614
1615            mode:
1616            'r' or 'r:*' open for reading with transparent compression
1617            'r:'         open for reading exclusively uncompressed
1618            'r:gz'       open for reading with gzip compression
1619            'r:bz2'      open for reading with bzip2 compression
1620            'a' or 'a:'  open for appending, creating the file if necessary
1621            'w' or 'w:'  open for writing without compression
1622            'w:gz'       open for writing with gzip compression
1623            'w:bz2'      open for writing with bzip2 compression
1624
1625            'r|*'        open a stream of tar blocks with transparent compression
1626            'r|'         open an uncompressed stream of tar blocks for reading
1627            'r|gz'       open a gzip compressed stream of tar blocks
1628            'r|bz2'      open a bzip2 compressed stream of tar blocks
1629            'w|'         open an uncompressed stream for writing
1630            'w|gz'       open a gzip compressed stream for writing
1631            'w|bz2'      open a bzip2 compressed stream for writing
1632         """
1633
1634         if not name and not fileobj:
1635             raise ValueError("nothing to open")
1636
1637         if mode in ("r", "r:*"):
1638             # Find out which *open() is appropriate for opening the file.
1639             for comptype in cls.OPEN_METH:
1640                 func = getattr(cls, cls.OPEN_METH[comptype])
1641                 if fileobj is not None:
1642                     saved_pos = fileobj.tell()
1643                 try:
1644                     return func(name, "r", fileobj, **kwargs)
1645                 except (ReadError, CompressionError), e:
1646                     if fileobj is not None:
1647                         fileobj.seek(saved_pos)
1648                     continue
1649             raise ReadError("file could not be opened successfully")
1650
1651         elif ":" in mode:
1652             filemode, comptype = mode.split(":", 1)
1653             filemode = filemode or "r"
1654             comptype = comptype or "tar"
1655
1656             # Select the *open() function according to
1657             # given compression.
1658             if comptype in cls.OPEN_METH:
1659                 func = getattr(cls, cls.OPEN_METH[comptype])
1660             else:
1661                 raise CompressionError("unknown compression type %r" % comptype)
1662             return func(name, filemode, fileobj, **kwargs)
1663
1664         elif "|" in mode:
1665             filemode, comptype = mode.split("|", 1)
1666             filemode = filemode or "r"
1667             comptype = comptype or "tar"
1668
1669             if filemode not in "rw":
1670                 raise ValueError("mode must be 'r' or 'w'")
1671
1672             t = cls(name, filemode,
1673                     _Stream(name, filemode, comptype, fileobj, bufsize),
1674                     **kwargs)
1675             t._extfileobj = False
1676             return t
1677
1678         elif mode in "aw":
1679             return cls.taropen(name, mode, fileobj, **kwargs)
1680
1681         raise ValueError("undiscernible mode")
1682
1683     @classmethod
1684     def taropen(cls, name, mode="r", fileobj=None, **kwargs):
1685         """Open uncompressed tar archive name for reading or writing.
1686         """
1687         if len(mode) > 1 or mode not in "raw":
1688             raise ValueError("mode must be 'r', 'a' or 'w'")
1689         return cls(name, mode, fileobj, **kwargs)
1690
1691     @classmethod
1692     def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
1693         """Open gzip compressed tar archive name for reading or writing.
1694            Appending is not allowed.
1695         """
1696         if len(mode) > 1 or mode not in "rw":
1697             raise ValueError("mode must be 'r' or 'w'")
1698
1699         try:
1700             import gzip
1701             gzip.GzipFile
1702         except (ImportError, AttributeError):
1703             raise CompressionError("gzip module is not available")
1704
1705         if fileobj is None:
1706             fileobj = bltn_open(name, mode + "b")
1707
1708         try:
1709             t = cls.taropen(name, mode,
1710                 gzip.GzipFile(name, mode, compresslevel, fileobj),
1711                 **kwargs)
1712         except IOError:
1713             raise ReadError("not a gzip file")
1714         t._extfileobj = False
1715         return t
1716
1717     @classmethod
1718     def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
1719         """Open bzip2 compressed tar archive name for reading or writing.
1720            Appending is not allowed.
1721         """
1722         if len(mode) > 1 or mode not in "rw":
1723             raise ValueError("mode must be 'r' or 'w'.")
1724
1725         try:
1726             import bz2
1727         except ImportError:
1728             raise CompressionError("bz2 module is not available")
1729
1730         if fileobj is not None:
1731             fileobj = _BZ2Proxy(fileobj, mode)
1732         else:
1733             fileobj = bz2.BZ2File(name, mode, compresslevel=compresslevel)
1734
1735         try:
1736             t = cls.taropen(name, mode, fileobj, **kwargs)
1737         except IOError:
1738             raise ReadError("not a bzip2 file")
1739         t._extfileobj = False
1740         return t
1741
    # All *open() methods are registered here. Each key is a compression
    # type as it appears after the ':' in an open() mode string, each value
    # is the name of the classmethod that open() fetches with getattr().
    # For mode 'r'/'r:*', open() tries the registered methods one by one.
    OPEN_METH = {
        "tar": "taropen",   # uncompressed tar
        "gz":  "gzopen",    # gzip compressed tar
        "bz2": "bz2open"    # bzip2 compressed tar
    }
1748
1749     #--------------------------------------------------------------------------
1750     # The public methods which TarFile provides:
1751
1752     def close(self):
1753         """Close the TarFile. In write-mode, two finishing zero blocks are
1754            appended to the archive.
1755         """
1756         if self.closed:
1757             return
1758
1759         if self.mode in "aw":
1760             self.fileobj.write(NUL * (BLOCKSIZE * 2))
1761             self.offset += (BLOCKSIZE * 2)
1762             # fill up the end with zero-blocks
1763             # (like option -b20 for tar does)
1764             blocks, remainder = divmod(self.offset, RECORDSIZE)
1765             if remainder > 0:
1766                 self.fileobj.write(NUL * (RECORDSIZE - remainder))
1767
1768         if not self._extfileobj:
1769             self.fileobj.close()
1770         self.closed = True
1771
1772     def getmember(self, name):
1773         """Return a TarInfo object for member `name'. If `name' can not be
1774            found in the archive, KeyError is raised. If a member occurs more
1775            than once in the archive, its last occurence is assumed to be the
1776            most up-to-date version.
1777         """
1778         tarinfo = self._getmember(name)
1779         if tarinfo is None:
1780             raise KeyError("filename %r not found" % name)
1781         return tarinfo
1782
1783     def getmembers(self):
1784         """Return the members of the archive as a list of TarInfo objects. The
1785            list has the same order as the members in the archive.
1786         """
1787         self._check()
1788         if not self._loaded:    # if we want to obtain a list of
1789             self._load()        # all members, we first have to
1790                                 # scan the whole archive.
1791         return self.members
1792
1793     def getnames(self):
1794         """Return the members of the archive as a list of their names. It has
1795            the same order as the list returned by getmembers().
1796         """
1797         return [tarinfo.name for tarinfo in self.getmembers()]
1798
1799     def gettarinfo(self, name=None, arcname=None, fileobj=None):
1800         """Create a TarInfo object for either the file `name' or the file
1801            object `fileobj' (using os.fstat on its file descriptor). You can
1802            modify some of the TarInfo's attributes before you add it using
1803            addfile(). If given, `arcname' specifies an alternative name for the
1804            file in the archive.
1805         """
1806         self._check("aw")
1807
1808         # When fileobj is given, replace name by
1809         # fileobj's real name.
1810         if fileobj is not None:
1811             name = fileobj.name
1812
1813         # Building the name of the member in the archive.
1814         # Backward slashes are converted to forward slashes,
1815         # Absolute paths are turned to relative paths.
1816         if arcname is None:
1817             arcname = name
1818         arcname = normpath(arcname)
1819         drv, arcname = os.path.splitdrive(arcname)
1820         while arcname[0:1] == "/":
1821             arcname = arcname[1:]
1822
1823         # Now, fill the TarInfo object with
1824         # information specific for the file.
1825         tarinfo = self.tarinfo()
1826         tarinfo.tarfile = self
1827
1828         # Use os.stat or os.lstat, depending on platform
1829         # and if symlinks shall be resolved.
1830         if fileobj is None:
1831             if hasattr(os, "lstat") and not self.dereference:
1832                 statres = os.lstat(name)
1833             else:
1834                 statres = os.stat(name)
1835         else:
1836             statres = os.fstat(fileobj.fileno())
1837         linkname = ""
1838
1839         stmd = statres.st_mode
1840         if stat.S_ISREG(stmd):
1841             inode = (statres.st_ino, statres.st_dev)
1842             if not self.dereference and statres.st_nlink > 1 and \
1843                     inode in self.inodes and arcname != self.inodes[inode]:
1844                 # Is it a hardlink to an already
1845                 # archived file?
1846                 type = LNKTYPE
1847                 linkname = self.inodes[inode]
1848             else:
1849                 # The inode is added only if its valid.
1850                 # For win32 it is always 0.
1851                 type = REGTYPE
1852                 if inode[0]:
1853                     self.inodes[inode] = arcname
1854         elif stat.S_ISDIR(stmd):
1855             type = DIRTYPE
1856         elif stat.S_ISFIFO(stmd):
1857             type = FIFOTYPE
1858         elif stat.S_ISLNK(stmd):
1859             type = SYMTYPE
1860             linkname = os.readlink(name)
1861         elif stat.S_ISCHR(stmd):
1862             type = CHRTYPE
1863         elif stat.S_ISBLK(stmd):
1864             type = BLKTYPE
1865         else:
1866             return None
1867
1868         # Fill the TarInfo object with all
1869         # information we can get.
1870         tarinfo.name = arcname
1871         tarinfo.mode = stmd
1872         tarinfo.uid = statres.st_uid
1873         tarinfo.gid = statres.st_gid
1874         if stat.S_ISREG(stmd):
1875             tarinfo.size = statres.st_size
1876         else:
1877             tarinfo.size = 0L
1878         tarinfo.mtime = statres.st_mtime
1879         tarinfo.type = type
1880         tarinfo.linkname = linkname
1881         if pwd:
1882             try:
1883                 tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
1884             except KeyError:
1885                 pass
1886         if grp:
1887             try:
1888                 tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
1889             except KeyError:
1890                 pass
1891
1892         if type in (CHRTYPE, BLKTYPE):
1893             if hasattr(os, "major") and hasattr(os, "minor"):
1894                 tarinfo.devmajor = os.major(statres.st_rdev)
1895                 tarinfo.devminor = os.minor(statres.st_rdev)
1896         return tarinfo
1897
1898     def list(self, verbose=True):
1899         """Print a table of contents to sys.stdout. If `verbose' is False, only
1900            the names of the members are printed. If it is True, an `ls -l'-like
1901            output is produced.
1902         """
1903         self._check()
1904
1905         for tarinfo in self:
1906             if verbose:
1907                 print filemode(tarinfo.mode),
1908                 print "%s/%s" % (tarinfo.uname or tarinfo.uid,
1909                                  tarinfo.gname or tarinfo.gid),
1910                 if tarinfo.ischr() or tarinfo.isblk():
1911                     print "%10s" % ("%d,%d" \
1912                                     % (tarinfo.devmajor, tarinfo.devminor)),
1913                 else:
1914                     print "%10d" % tarinfo.size,
1915                 print "%d-%02d-%02d %02d:%02d:%02d" \
1916                       % time.localtime(tarinfo.mtime)[:6],
1917
1918             print tarinfo.name + ("/" if tarinfo.isdir() else ""),
1919
1920             if verbose:
1921                 if tarinfo.issym():
1922                     print "->", tarinfo.linkname,
1923                 if tarinfo.islnk():
1924                     print "link to", tarinfo.linkname,
1925             print
1926
1927     def add(self, name, arcname=None, recursive=True, exclude=None):
1928         """Add the file `name' to the archive. `name' may be any type of file
1929            (directory, fifo, symbolic link, etc.). If given, `arcname'
1930            specifies an alternative name for the file in the archive.
1931            Directories are added recursively by default. This can be avoided by
1932            setting `recursive' to False. `exclude' is a function that should
1933            return True for each filename to be excluded.
1934         """
1935         self._check("aw")
1936
1937         if arcname is None:
1938             arcname = name
1939
1940         # Exclude pathnames.
1941         if exclude is not None and exclude(name):
1942             self._dbg(2, "tarfile: Excluded %r" % name)
1943             return
1944
1945         # Skip if somebody tries to archive the archive...
1946         if self.name is not None and os.path.abspath(name) == self.name:
1947             self._dbg(2, "tarfile: Skipped %r" % name)
1948             return
1949
1950         # Special case: The user wants to add the current
1951         # working directory.
1952         if name == ".":
1953             if recursive:
1954                 if arcname == ".":
1955                     arcname = ""
1956                 for f in os.listdir(name):
1957                     self.add(f, os.path.join(arcname, f), recursive, exclude)
1958             return
1959
1960         self._dbg(1, name)
1961
1962         # Create a TarInfo object from the file.
1963         tarinfo = self.gettarinfo(name, arcname)
1964
1965         if tarinfo is None:
1966             self._dbg(1, "tarfile: Unsupported type %r" % name)
1967             return
1968
1969         # Append the tar header and data to the archive.
1970         if tarinfo.isreg():
1971             f = bltn_open(name, "rb")
1972             self.addfile(tarinfo, f)
1973             f.close()
1974
1975         elif tarinfo.isdir():
1976             self.addfile(tarinfo)
1977             if recursive:
1978                 for f in os.listdir(name):
1979                     self.add(os.path.join(name, f), os.path.join(arcname, f), recursive, exclude)
1980
1981         else:
1982             self.addfile(tarinfo)
1983
1984     def addfile(self, tarinfo, fileobj=None):
1985         """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
1986            given, tarinfo.size bytes are read from it and added to the archive.
1987            You can create TarInfo objects using gettarinfo().
1988            On Windows platforms, `fileobj' should always be opened with mode
1989            'rb' to avoid irritation about the file size.
1990         """
1991         self._check("aw")
1992
1993         tarinfo = copy.copy(tarinfo)
1994
1995         buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
1996         self.fileobj.write(buf)
1997         self.offset += len(buf)
1998
1999         # If there's data to follow, append it.
2000         if fileobj is not None:
2001             copyfileobj(fileobj, self.fileobj, tarinfo.size)
2002             blocks, remainder = divmod(tarinfo.size, BLOCKSIZE)
2003             if remainder > 0:
2004                 self.fileobj.write(NUL * (BLOCKSIZE - remainder))
2005                 blocks += 1
2006             self.offset += blocks * BLOCKSIZE
2007
2008         self.members.append(tarinfo)
2009
2010     def extractall(self, path=".", members=None):
2011         """Extract all members from the archive to the current working
2012            directory and set owner, modification time and permissions on
2013            directories afterwards. `path' specifies a different directory
2014            to extract to. `members' is optional and must be a subset of the
2015            list returned by getmembers().
2016         """
2017         directories = []
2018
2019         if members is None:
2020             members = self
2021
2022         for tarinfo in members:
2023             if tarinfo.isdir():
2024                 # Extract directories with a safe mode.
2025                 directories.append(tarinfo)
2026                 tarinfo = copy.copy(tarinfo)
2027                 tarinfo.mode = 0700
2028             self.extract(tarinfo, path)
2029
2030         # Reverse sort directories.
2031         directories.sort(key=operator.attrgetter('name'))
2032         directories.reverse()
2033
2034         # Set correct owner, mtime and filemode on directories.
2035         for tarinfo in directories:
2036             dirpath = os.path.join(path, tarinfo.name)
2037             try:
2038                 self.chown(tarinfo, dirpath)
2039                 self.utime(tarinfo, dirpath)
2040                 self.chmod(tarinfo, dirpath)
2041             except ExtractError, e:
2042                 if self.errorlevel > 1:
2043                     raise
2044                 else:
2045                     self._dbg(1, "tarfile: %s" % e)
2046
2047     def extract(self, member, path=""):
2048         """Extract a member from the archive to the current working directory,
2049            using its full name. Its file information is extracted as accurately
2050            as possible. `member' may be a filename or a TarInfo object. You can
2051            specify a different directory using `path'.
2052         """
2053         self._check("r")
2054
2055         if isinstance(member, basestring):
2056             tarinfo = self.getmember(member)
2057         else:
2058             tarinfo = member
2059
2060         # Prepare the link target for makelink().
2061         if tarinfo.islnk():
2062             tarinfo._link_target = os.path.join(path, tarinfo.linkname)
2063
2064         try:
2065             self._extract_member(tarinfo, os.path.join(path, tarinfo.name))
2066         except EnvironmentError, e:
2067             if self.errorlevel > 0:
2068                 raise
2069             else:
2070                 if e.filename is None:
2071                     self._dbg(1, "tarfile: %s" % e.strerror)
2072                 else:
2073                     self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
2074         except ExtractError, e:
2075             if self.errorlevel > 1:
2076                 raise
2077             else:
2078                 self._dbg(1, "tarfile: %s" % e)
2079
2080     def extractfile(self, member):
2081         """Extract a member from the archive as a file object. `member' may be
2082            a filename or a TarInfo object. If `member' is a regular file, a
2083            file-like object is returned. If `member' is a link, a file-like
2084            object is constructed from the link's target. If `member' is none of
2085            the above, None is returned.
2086            The file-like object is read-only and provides the following
2087            methods: read(), readline(), readlines(), seek() and tell()
2088         """
2089         self._check("r")
2090
2091         if isinstance(member, basestring):
2092             tarinfo = self.getmember(member)
2093         else:
2094             tarinfo = member
2095
2096         if tarinfo.isreg():
2097             return self.fileobject(self, tarinfo)
2098
2099         elif tarinfo.type not in SUPPORTED_TYPES:
2100             # If a member's type is unknown, it is treated as a
2101             # regular file.
2102             return self.fileobject(self, tarinfo)
2103
2104         elif tarinfo.islnk() or tarinfo.issym():
2105             if isinstance(self.fileobj, _Stream):
2106                 # A small but ugly workaround for the case that someone tries
2107                 # to extract a (sym)link as a file-object from a non-seekable
2108                 # stream of tar blocks.
2109                 raise StreamError("cannot extract (sym)link as file object")
2110             else:
2111                 # A (sym)link's file object is its target's file object.
2112                 return self.extractfile(self._getmember(tarinfo.linkname,
2113                                                         tarinfo))
2114         else:
2115             # If there's no data associated with the member (directory, chrdev,
2116             # blkdev, etc.), return None instead of a file object.
2117             return None
2118
2119     def _extract_member(self, tarinfo, targetpath):
2120         """Extract the TarInfo object tarinfo to a physical
2121            file called targetpath.
2122         """
2123         # Fetch the TarInfo object for the given name
2124         # and build the destination pathname, replacing
2125         # forward slashes to platform specific separators.
2126         if targetpath[-1:] == "/":
2127             targetpath = targetpath[:-1]
2128         targetpath = os.path.normpath(targetpath)
2129
2130         # Create all upper directories.
2131         upperdirs = os.path.dirname(targetpath)
2132         if upperdirs and not os.path.exists(upperdirs):
2133             # Create directories that are not part of the archive with
2134             # default permissions.
2135             os.makedirs(upperdirs)
2136
2137         if tarinfo.islnk() or tarinfo.issym():
2138             self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2139         else:
2140             self._dbg(1, tarinfo.name)
2141
2142         if tarinfo.isreg():
2143             self.makefile(tarinfo, targetpath)
2144         elif tarinfo.isdir():
2145             self.makedir(tarinfo, targetpath)
2146         elif tarinfo.isfifo():
2147             self.makefifo(tarinfo, targetpath)
2148         elif tarinfo.ischr() or tarinfo.isblk():
2149             self.makedev(tarinfo, targetpath)
2150         elif tarinfo.islnk() or tarinfo.issym():
2151             self.makelink(tarinfo, targetpath)
2152         elif tarinfo.type not in SUPPORTED_TYPES:
2153             self.makeunknown(tarinfo, targetpath)
2154         else:
2155             self.makefile(tarinfo, targetpath)
2156
2157         self.chown(tarinfo, targetpath)
2158         if not tarinfo.issym():
2159             self.chmod(tarinfo, targetpath)
2160             self.utime(tarinfo, targetpath)
2161
2162     #--------------------------------------------------------------------------
2163     # Below are the different file methods. They are called via
2164     # _extract_member() when extract() is called. They can be replaced in a
2165     # subclass to implement other functionality.
2166
2167     def makedir(self, tarinfo, targetpath):
2168         """Make a directory called targetpath.
2169         """
2170         try:
2171             # Use a safe mode for the directory, the real mode is set
2172             # later in _extract_member().
2173             os.mkdir(targetpath, 0700)
2174         except EnvironmentError, e:
2175             if e.errno != errno.EEXIST:
2176                 raise
2177
2178     def makefile(self, tarinfo, targetpath):
2179         """Make a file called targetpath.
2180         """
2181         source = self.extractfile(tarinfo)
2182         target = bltn_open(targetpath, "wb")
2183         copyfileobj(source, target)
2184         source.close()
2185         target.close()
2186
2187     def makeunknown(self, tarinfo, targetpath):
2188         """Make a file from a TarInfo object with an unknown type
2189            at targetpath.
2190         """
2191         self.makefile(tarinfo, targetpath)
2192         self._dbg(1, "tarfile: Unknown file type %r, " \
2193                      "extracted as regular file." % tarinfo.type)
2194
2195     def makefifo(self, tarinfo, targetpath):
2196         """Make a fifo called targetpath.
2197         """
2198         if hasattr(os, "mkfifo"):
2199             os.mkfifo(targetpath)
2200         else:
2201             raise ExtractError("fifo not supported by system")
2202
2203     def makedev(self, tarinfo, targetpath):
2204         """Make a character or block device called targetpath.
2205         """
2206         if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
2207             raise ExtractError("special devices not supported by system")
2208
2209         mode = tarinfo.mode
2210         if tarinfo.isblk():
2211             mode |= stat.S_IFBLK
2212         else:
2213             mode |= stat.S_IFCHR
2214
2215         os.mknod(targetpath, mode,
2216                  os.makedev(tarinfo.devmajor, tarinfo.devminor))
2217
2218     def makelink(self, tarinfo, targetpath):
2219         """Make a (symbolic) link called targetpath. If it cannot be created
2220           (platform limitation), we try to make a copy of the referenced file
2221           instead of a link.
2222         """
2223         linkpath = tarinfo.linkname
2224         try:
2225             if tarinfo.issym():
2226                 os.symlink(linkpath, targetpath)
2227             else:
2228                 # See extract().
2229                 os.link(tarinfo._link_target, targetpath)
2230         except AttributeError:
2231             if tarinfo.issym():
2232                 linkpath = os.path.join(os.path.dirname(tarinfo.name),
2233                                         linkpath)
2234                 linkpath = normpath(linkpath)
2235
2236             try:
2237                 self._extract_member(self.getmember(linkpath), targetpath)
2238             except (EnvironmentError, KeyError), e:
2239                 linkpath = os.path.normpath(linkpath)
2240                 try:
2241                     shutil.copy2(linkpath, targetpath)
2242                 except EnvironmentError, e:
2243                     raise IOError("link could not be created")
2244
2245     def chown(self, tarinfo, targetpath):
2246         """Set owner of targetpath according to tarinfo.
2247         """
2248         if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
2249             # We have to be root to do so.
2250             try:
2251                 g = grp.getgrnam(tarinfo.gname)[2]
2252             except KeyError:
2253                 try:
2254                     g = grp.getgrgid(tarinfo.gid)[2]
2255                 except KeyError:
2256                     g = os.getgid()
2257             try:
2258                 u = pwd.getpwnam(tarinfo.uname)[2]
2259             except KeyError:
2260                 try:
2261                     u = pwd.getpwuid(tarinfo.uid)[2]
2262                 except KeyError:
2263                     u = os.getuid()
2264             try:
2265                 if tarinfo.issym() and hasattr(os, "lchown"):
2266                     os.lchown(targetpath, u, g)
2267                 else:
2268                     if sys.platform != "os2emx":
2269                         os.chown(targetpath, u, g)
2270             except EnvironmentError, e:
2271                 raise ExtractError("could not change owner")
2272
2273     def chmod(self, tarinfo, targetpath):
2274         """Set file permissions of targetpath according to tarinfo.
2275         """
2276         if hasattr(os, 'chmod'):
2277             try:
2278                 os.chmod(targetpath, tarinfo.mode)
2279             except EnvironmentError, e:
2280                 raise ExtractError("could not change mode")
2281
2282     def utime(self, tarinfo, targetpath):
2283         """Set modification time of targetpath according to tarinfo.
2284         """
2285         if not hasattr(os, 'utime'):
2286             return
2287         if sys.platform == "win32" and tarinfo.isdir():
2288             # According to msdn.microsoft.com, it is an error (EACCES)
2289             # to use utime() on directories.
2290             return
2291         try:
2292             os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
2293         except EnvironmentError, e:
2294             raise ExtractError("could not change modification time")
2295
2296     #--------------------------------------------------------------------------
2297     def next(self):
2298         """Return the next member of the archive as a TarInfo object, when
2299            TarFile is opened for reading. Return None if there is no more
2300            available.
2301         """
2302         self._check("ra")
2303         if self.firstmember is not None:
2304             m = self.firstmember
2305             self.firstmember = None
2306             return m
2307
2308         # Read the next block.
2309         self.fileobj.seek(self.offset)
2310         while True:
2311             try:
2312                 tarinfo = self.tarinfo.fromtarfile(self)
2313                 if tarinfo is None:
2314                     return
2315                 self.members.append(tarinfo)
2316
2317             except HeaderError, e:
2318                 if self.ignore_zeros:
2319                     self._dbg(2, "0x%X: %s" % (self.offset, e))
2320                     self.offset += BLOCKSIZE
2321                     continue
2322                 else:
2323                     if self.offset == 0:
2324                         raise ReadError(str(e))
2325                     return None
2326             break
2327
2328         return tarinfo
2329
2330     #--------------------------------------------------------------------------
2331     # Little helper methods:
2332
2333     def _getmember(self, name, tarinfo=None):
2334         """Find an archive member by name from bottom to top.
2335            If tarinfo is given, it is used as the starting point.
2336         """
2337         # Ensure that all members have been loaded.
2338         members = self.getmembers()
2339
2340         if tarinfo is None:
2341             end = len(members)
2342         else:
2343             end = members.index(tarinfo)
2344
2345         for i in xrange(end - 1, -1, -1):
2346             if name == members[i].name:
2347                 return members[i]
2348
2349     def _load(self):
2350         """Read through the entire archive file and look for readable
2351            members.
2352         """
2353         while True:
2354             tarinfo = self.next()
2355             if tarinfo is None:
2356                 break
2357         self._loaded = True
2358
2359     def _check(self, mode=None):
2360         """Check if TarFile is still open, and if the operation's mode
2361            corresponds to TarFile's mode.
2362         """
2363         if self.closed:
2364             raise IOError("%s is closed" % self.__class__.__name__)
2365         if mode is not None and self.mode not in mode:
2366             raise IOError("bad operation for mode %r" % self.mode)
2367
2368     def __iter__(self):
2369         """Provide an iterator object.
2370         """
2371         if self._loaded:
2372             return iter(self.members)
2373         else:
2374             return TarIter(self)
2375
2376     def _dbg(self, level, msg):
2377         """Write debugging output to sys.stderr.
2378         """
2379         if level <= self.debug:
2380             print >> sys.stderr, msg
2381 # class TarFile
2382
class TarIter:
    """Iterator over the members of a TarFile.

       for tarinfo in TarFile(...):
           suite...
    """

    def __init__(self, tarfile):
        """Set up iteration over `tarfile', starting at its first member.
        """
        self.tarfile = tarfile
        self.index = 0
    def __iter__(self):
        """An iterator is its own iterator object.
        """
        return self
    def next(self):
        """Return the next member. While the TarFile is not completely
           loaded, members are fetched with its next() method; once that is
           exhausted the TarFile is flagged as _loaded.
        """
        # Fix for SF #1100429: Under rare circumstances it can
        # happen that getmembers() is called during iteration,
        # which will cause TarIter to stop prematurely. For that reason
        # a completely loaded TarFile is served from its member list.
        archive = self.tarfile
        if archive._loaded:
            try:
                member = archive.members[self.index]
            except IndexError:
                raise StopIteration
        else:
            member = archive.next()
            if not member:
                archive._loaded = True
                raise StopIteration
        self.index += 1
        return member
2418
2419 # Helper classes for sparse file support
class _section:
    """Common base of _data and _hole: a range of `size' bytes that
       starts at `offset'.
    """
    def __init__(self, offset, size):
        self.offset, self.size = offset, size
    def __contains__(self, offset):
        # The range is half-open: its end position is not part of it.
        start = self.offset
        return start <= offset < start + self.size
2428
class _data(_section):
    """A section of a sparse file that holds data. In addition to offset
       and size it carries the attribute `realpos'.
    """
    def __init__(self, offset, size, realpos):
        self.realpos = realpos
        _section.__init__(self, offset, size)
2435
class _hole(_section):
    """A section of a sparse file that is a hole. It adds nothing to
       what _section provides.
    """
2440
class _ringbuffer(list):
    """Ringbuffer class which increases performance
       over a regular list.

       find() starts looking at the position of the previous hit instead
       of at the beginning of the list.
    """
    def __init__(self):
        # Position of the item that matched most recently.
        self.idx = 0
    def find(self, offset):
        """Return the first item, searching from the last hit onwards and
           wrapping around at the end, for which `offset in item' is true.
           Return None if no item matches or the buffer is empty.
        """
        if not self:
            # Nothing to search; avoid an IndexError on self[idx].
            return None
        idx = self.idx
        while True:
            item = self[idx]
            if offset in item:
                break
            idx += 1
            if idx == len(self):
                idx = 0
            if idx == self.idx:
                # End of File
                return None
        self.idx = idx
        return item
2461
2462 #---------------------------------------------
2463 # zipfile compatible TarFile class
2464 #---------------------------------------------
TAR_PLAIN = 0           # zipfile.ZIP_STORED
TAR_GZIPPED = 8         # zipfile.ZIP_DEFLATED
class TarFileCompat:
    """Wrapper that gives a TarFile the interface of the standard
       module zipfile's ZipFile class.
    """
    def __init__(self, file, mode="r", compression=TAR_PLAIN):
        # Open file with TarFile.taropen (TAR_PLAIN) or
        # TarFile.gzopen (TAR_GZIPPED); any other compression value
        # raises ValueError.  A py3k warning is issued first.
        from warnings import warnpy3k
        warnpy3k("the TarFileCompat class has been removed in Python 3.0",
                stacklevel=2)
        if compression == TAR_PLAIN:
            opener = TarFile.taropen
        elif compression == TAR_GZIPPED:
            opener = TarFile.gzopen
        else:
            raise ValueError("unknown compression constant")
        self.tarfile = opener(file, mode)
        if mode[:1] == "r":
            # Mirror name, size and mtime under the attribute names
            # filename, file_size and date_time on every member.
            for member in self.tarfile.getmembers():
                member.filename = member.name
                member.file_size = member.size
                member.date_time = time.gmtime(member.mtime)[:6]
    def namelist(self):
        # Names of the members reported by infolist().
        return [info.name for info in self.infolist()]
    def infolist(self):
        # Only members whose type is one of REGULAR_TYPES.
        return [member for member in self.tarfile.getmembers()
                if member.type in REGULAR_TYPES]
    def printdir(self):
        self.tarfile.list()
    def testzip(self):
        # Nothing is checked; always returns None.
        return None
    def getinfo(self, name):
        return self.tarfile.getmember(name)
    def read(self, name):
        # Whole content of the member called name.
        member = self.tarfile.getmember(name)
        return self.tarfile.extractfile(member).read()
    def write(self, filename, arcname=None, compress_type=None):
        # compress_type is accepted for signature compatibility
        # only and is not used.
        self.tarfile.add(filename, arcname)
    def writestr(self, zinfo, bytes):
        # Add the string bytes as a member, taking the name and
        # modification time from zinfo.filename and zinfo.date_time.
        import calendar
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        member = TarInfo(zinfo.filename)
        member.size = len(bytes)
        member.mtime = calendar.timegm(zinfo.date_time)
        self.tarfile.addfile(member, StringIO(bytes))
    def close(self):
        self.tarfile.close()
2514 #class TarFileCompat
2515
2516 #--------------------
2517 # exported functions
2518 #--------------------
def is_tarfile(name):
    """Tell whether name can be opened as a tar archive.

       Returns True when the module-level open() succeeds on name
       (the archive is closed again right away) and False when a
       TarError is raised.  Other exceptions are not caught here.
    """
    try:
        open(name).close()
    except TarError:
        return False
    return True
2529
# Keep the previous binding of open reachable as bltn_open
# (presumably the builtin open -- confirm that nothing earlier in
# the module rebinds it).
bltn_open = open
# From here on the module-level name open is TarFile.open, so
# module functions that look up open at call time get TarFile.open.
open = TarFile.open