# -*- coding: iso-8859-1 -*-
#-------------------------------------------------------------------
# tarfile.py
#-------------------------------------------------------------------
# Copyright (C) 2002 Lars Gustäbel <lars@gustaebel.de>
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
30 """Read from and write to tar format archives.
33 __version__
= "$Revision$"
37 __author__
= "Lars Gustäbel (lars@gustaebel.de)"
40 __credits__
= "Gustavo Niemeyer, Niels Gustäbel, Richard Townsend."
import sys
import os
import shutil
import stat
import errno
import time
import struct
import copy
import re
import operator

try:
    import grp, pwd
except ImportError:
    grp = pwd = None

if sys.platform == 'mac':
    # This module needs work for MacOS9, especially in the area of pathname
    # handling. In many places it is assumed a simple substitution of / by the
    # local os.path.sep is good enough to convert pathnames, but this does not
    # work with the mac rooted:path:name versus :nonrooted:path:name syntax
    raise ImportError, "tarfile does not work for platform==mac"
# from tarfile import *
__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
#---------------------------------------------------------
# tar constants
#---------------------------------------------------------
NUL = "\0"                      # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records
GNU_MAGIC = "ustar  \0"         # magic gnu tar string
POSIX_MAGIC = "ustar\x0000"     # magic posix tar string

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

REGTYPE = "0"                   # regular file
AREGTYPE = "\0"                 # regular file
LNKTYPE = "1"                   # link (inside tarfile)
SYMTYPE = "2"                   # symbolic link
CHRTYPE = "3"                   # character special device
BLKTYPE = "4"                   # block special device
DIRTYPE = "5"                   # directory
FIFOTYPE = "6"                  # fifo special device
CONTTYPE = "7"                  # contiguous file

GNUTYPE_LONGNAME = "L"          # GNU tar longname
GNUTYPE_LONGLINK = "K"          # GNU tar longlink
GNUTYPE_SPARSE = "S"            # GNU tar sparse file

XHDTYPE = "x"                   # POSIX.1-2001 extended header
XGLTYPE = "g"                   # POSIX.1-2001 global header
SOLARIS_XHDTYPE = "X"           # Solaris extended header

USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT
#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields in a pax header that are numbers, all other fields
# are treated as strings.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}
#---------------------------------------------------------
# Bits used in the mode field, values in octal.
#---------------------------------------------------------
S_IFLNK = 0120000        # symbolic link
S_IFREG = 0100000        # regular file
S_IFBLK = 0060000        # block device
S_IFDIR = 0040000        # directory
S_IFCHR = 0020000        # character device
S_IFIFO = 0010000        # fifo

TSUID   = 04000          # set UID on execution
TSGID   = 02000          # set GID on execution
TSVTX   = 01000          # reserved

TUREAD  = 0400           # read by owner
TUWRITE = 0200           # write by owner
TUEXEC  = 0100           # execute/search by owner
TGREAD  = 0040           # read by group
TGWRITE = 0020           # write by group
TGEXEC  = 0010           # execute/search by group
TOREAD  = 0004           # read by other
TOWRITE = 0002           # write by other
TOEXEC  = 0001           # execute/search by other
#---------------------------------------------------------
# initialization
#---------------------------------------------------------
ENCODING = sys.getfilesystemencoding()
if ENCODING is None:
    ENCODING = sys.getdefaultencoding()
#---------------------------------------------------------
# Some useful functions
#---------------------------------------------------------
def stn(s, length):
    """Convert a python string to a null-terminated string buffer.
    """
    return s[:length] + (length - len(s)) * NUL

def nts(s):
    """Convert a null-terminated string field to a python string.
    """
    # Use the string up to the first null char.
    p = s.find("\0")
    if p == -1:
        return s
    return s[:p]
190 """Convert a number field to a python number.
192 # There are two possible encodings for a number field, see
194 if s
[0] != chr(0200):
196 n
= int(nts(s
) or "0", 8)
198 raise HeaderError("invalid header")
201 for i
in xrange(len(s
) - 1):
206 def itn(n
, digits
=8, format
=DEFAULT_FORMAT
):
207 """Convert a python number to a number field.
209 # POSIX 1003.1-1988 requires numbers to be encoded as a string of
210 # octal digits followed by a null-byte, this allows values up to
211 # (8**(digits-1))-1. GNU tar allows storing numbers greater than
212 # that if necessary. A leading 0200 byte indicates this particular
213 # encoding, the following digits-1 bytes are a big-endian
214 # representation. This allows values up to (256**(digits-1))-1.
215 if 0 <= n
< 8 ** (digits
- 1):
216 s
= "%0*o" % (digits
- 1, n
) + NUL
218 if format
!= GNU_FORMAT
or n
>= 256 ** (digits
- 1):
219 raise ValueError("overflow in number field")
222 # XXX We mimic GNU tar's behaviour with negative numbers,
223 # this could raise OverflowError.
224 n
= struct
.unpack("L", struct
.pack("l", n
))[0]
227 for i
in xrange(digits
- 1):
228 s
= chr(n
& 0377) + s
def uts(s, encoding, errors):
    """Convert a unicode object to a string.
    """
    if errors == "utf-8":
        # An extra error handler similar to the -o invalid=UTF-8 option
        # in POSIX.1-2001. Replace untranslatable characters with their
        # UTF-8 representation.
        try:
            return s.encode(encoding, "strict")
        except UnicodeEncodeError:
            x = []
            for c in s:
                try:
                    x.append(c.encode(encoding, "strict"))
                except UnicodeEncodeError:
                    x.append(c.encode("utf8"))
            return "".join(x)
    else:
        return s.encode(encoding, errors)
def calc_chksums(buf):
    """Calculate the checksum for a member's header by summing up all
       characters except for the chksum field which is treated as if
       it was filled with spaces. According to the GNU tar sources,
       some tars (Sun and NeXT) calculate chksum with signed char,
       which will be different if there are chars in the buffer with
       the high bit set. So we calculate two checksums, unsigned and
       signed.
    """
    unsigned_chksum = 256 + sum(struct.unpack("148B", buf[:148]) + struct.unpack("356B", buf[156:512]))
    signed_chksum = 256 + sum(struct.unpack("148b", buf[:148]) + struct.unpack("356b", buf[156:512]))
    return unsigned_chksum, signed_chksum
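
# Illustrative sketch (not part of the original module): the checksum of an
# all-NUL 512-byte block is just the 256 contributed by the blanked-out
# chksum field (eight spaces at offset 148).
def _example_header_checksum():
    unsigned, signed = calc_chksums(NUL * BLOCKSIZE)
    assert unsigned == signed == 256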
def copyfileobj(src, dst, length=None):
    """Copy length bytes from fileobj src to fileobj dst.
       If length is None, copy the entire content.
    """
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst)
        return

    BUFSIZE = 16 * 1024
    blocks, remainder = divmod(length, BUFSIZE)
    for b in xrange(blocks):
        buf = src.read(BUFSIZE)
        if len(buf) < BUFSIZE:
            raise IOError("end of file reached")
        dst.write(buf)

    if remainder != 0:
        buf = src.read(remainder)
        if len(buf) < remainder:
            raise IOError("end of file reached")
        dst.write(buf)
    return
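
# Illustrative sketch (not part of the original module): copying an exact
# byte count between two file objects; a short read raises IOError.
def _example_copyfileobj():
    from cStringIO import StringIO
    src = StringIO("x" * 1000)
    dst = StringIO()
    copyfileobj(src, dst, 512)
    assert len(dst.getvalue()) == 512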
filemode_table = (
    ((S_IFLNK,      "l"),
     (S_IFREG,      "-"),
     (S_IFBLK,      "b"),
     (S_IFDIR,      "d"),
     (S_IFCHR,      "c"),
     (S_IFIFO,      "p")),

    ((TUREAD,       "r"),),
    ((TUWRITE,      "w"),),
    ((TUEXEC|TSUID, "s"),
     (TSUID,        "S"),
     (TUEXEC,       "x")),

    ((TGREAD,       "r"),),
    ((TGWRITE,      "w"),),
    ((TGEXEC|TSGID, "s"),
     (TSGID,        "S"),
     (TGEXEC,       "x")),

    ((TOREAD,       "r"),),
    ((TOWRITE,      "w"),),
    ((TOEXEC|TSVTX, "t"),
     (TSVTX,        "T"),
     (TOEXEC,       "x")))

def filemode(mode):
    """Convert a file's mode to a string of the form
       -rwxrwxrwx.
       Used by TarFile.list()
    """
    perm = []
    for table in filemode_table:
        for bit, char in table:
            if mode & bit == bit:
                perm.append(char)
                break
        else:
            perm.append("-")
    return "".join(perm)
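
# Illustrative sketch (not part of the original module): filemode() renders
# mode bits the way `ls -l` does.
def _example_filemode():
    assert filemode(S_IFREG | 0755) == "-rwxr-xr-x"
    assert filemode(S_IFDIR | 0700) == "drwx------"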
class TarError(Exception):
    """Base exception."""
    pass
class ExtractError(TarError):
    """General exception for extract errors."""
    pass
class ReadError(TarError):
    """Exception for unreadable tar archives."""
    pass
class CompressionError(TarError):
    """Exception for unavailable compression methods."""
    pass
class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""
    pass
class HeaderError(TarError):
    """Exception for invalid headers."""
    pass
#---------------------------
# internal stream interface
#---------------------------
class _LowLevelFile:
    """Low-level file object. Supports reading and writing.
       It is used instead of a regular file object for streaming
       access.
    """

    def __init__(self, name, mode):
        mode = {
            "r": os.O_RDONLY,
            "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
        }[mode]
        if hasattr(os, "O_BINARY"):
            mode |= os.O_BINARY
        self.fd = os.open(name, mode)

    def close(self):
        os.close(self.fd)

    def read(self, size):
        return os.read(self.fd, size)

    def write(self, s):
        os.write(self.fd, s)
380 """Class that serves as an adapter between TarFile and
381 a stream-like object. The stream-like object only
382 needs to have a read() or write() method and is accessed
383 blockwise. Use of gzip or bzip2 compression is possible.
384 A stream-like object could be for example: sys.stdin,
385 sys.stdout, a socket, a tape device etc.
387 _Stream is intended to be used only internally.
390 def __init__(self
, name
, mode
, comptype
, fileobj
, bufsize
):
391 """Construct a _Stream object.
393 self
._extfileobj
= True
395 fileobj
= _LowLevelFile(name
, mode
)
396 self
._extfileobj
= False
399 # Enable transparent compression detection for the
401 fileobj
= _StreamProxy(fileobj
)
402 comptype
= fileobj
.getcomptype()
404 self
.name
= name
or ""
406 self
.comptype
= comptype
407 self
.fileobj
= fileobj
408 self
.bufsize
= bufsize
417 raise CompressionError("zlib module is not available")
419 self
.crc
= zlib
.crc32("") & 0xffffffffL
423 self
._init
_write
_gz
()
425 if comptype
== "bz2":
429 raise CompressionError("bz2 module is not available")
432 self
.cmp = bz2
.BZ2Decompressor()
434 self
.cmp = bz2
.BZ2Compressor()
437 if hasattr(self
, "closed") and not self
.closed
:
    def _init_write_gz(self):
        """Initialize for writing with gzip compression.
        """
        self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
                                         -self.zlib.MAX_WBITS,
                                         self.zlib.DEF_MEM_LEVEL,
                                         0)
        timestamp = struct.pack("<L", long(time.time()))
        self.__write("\037\213\010\010%s\002\377" % timestamp)
        if self.name.endswith(".gz"):
            self.name = self.name[:-3]
        self.__write(self.name + NUL)
454 """Write string s to the stream.
456 if self
.comptype
== "gz":
457 self
.crc
= self
.zlib
.crc32(s
, self
.crc
) & 0xffffffffL
459 if self
.comptype
!= "tar":
460 s
= self
.cmp.compress(s
)
463 def __write(self
, s
):
464 """Write string s to the stream if a whole new block
465 is ready to be written.
468 while len(self
.buf
) > self
.bufsize
:
469 self
.fileobj
.write(self
.buf
[:self
.bufsize
])
470 self
.buf
= self
.buf
[self
.bufsize
:]
473 """Close the _Stream object. No operation should be
474 done on it afterwards.
479 if self
.mode
== "w" and self
.comptype
!= "tar":
480 self
.buf
+= self
.cmp.flush()
482 if self
.mode
== "w" and self
.buf
:
483 self
.fileobj
.write(self
.buf
)
485 if self
.comptype
== "gz":
486 # The native zlib crc is an unsigned 32-bit integer, but
487 # the Python wrapper implicitly casts that to a signed C
488 # long. So, on a 32-bit box self.crc may "look negative",
489 # while the same crc on a 64-bit box may "look positive".
490 # To avoid irksome warnings from the `struct` module, force
491 # it to look positive on all boxes.
492 self
.fileobj
.write(struct
.pack("<L", self
.crc
& 0xffffffffL
))
493 self
.fileobj
.write(struct
.pack("<L", self
.pos
& 0xffffFFFFL
))
495 if not self
._extfileobj
:
    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.
        """
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
        self.dbuf = ""

        # taken from gzip.GzipFile with some alterations
        if self.__read(2) != "\037\213":
            raise ReadError("not a gzip file")
        if self.__read(1) != "\010":
            raise CompressionError("unsupported compression method")

        flag = ord(self.__read(1))
        self.__read(6)

        if flag & 4:
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            self.read(xlen)
        if flag & 8:
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 16:
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 2:
            self.__read(2)
532 """Return the stream's file pointer position.
536 def seek(self
, pos
=0):
537 """Set the stream's file pointer to pos. Negative seeking
540 if pos
- self
.pos
>= 0:
541 blocks
, remainder
= divmod(pos
- self
.pos
, self
.bufsize
)
542 for i
in xrange(blocks
):
543 self
.read(self
.bufsize
)
546 raise StreamError("seeking backwards is not allowed")
    def read(self, size=None):
        """Return the next size number of bytes from the stream.
           If size is not defined, return all bytes of the stream
           up to EOF.
        """
        if size is None:
            t = []
            while True:
                buf = self._read(self.bufsize)
                if not buf:
                    break
                t.append(buf)
            buf = "".join(t)
        else:
            buf = self._read(size)
        self.pos += len(buf)
        return buf

    def _read(self, size):
        """Return size bytes from the stream.
        """
        if self.comptype == "tar":
            return self.__read(size)

        c = len(self.dbuf)
        t = [self.dbuf]
        while c < size:
            buf = self.__read(self.bufsize)
            if not buf:
                break
            try:
                buf = self.cmp.decompress(buf)
            except IOError:
                raise ReadError("invalid compressed data")
            t.append(buf)
            c += len(buf)
        t = "".join(t)
        self.dbuf = t[size:]
        return t[:size]

    def __read(self, size):
        """Return size bytes from stream. If internal buffer is empty,
           read another block from the stream.
        """
        c = len(self.buf)
        t = [self.buf]
        while c < size:
            buf = self.fileobj.read(self.bufsize)
            if not buf:
                break
            t.append(buf)
            c += len(buf)
        t = "".join(t)
        self.buf = t[size:]
        return t[:size]
class _StreamProxy(object):
    """Small proxy class that enables transparent compression
       detection for the Stream interface (mode 'r|*').
    """

    def __init__(self, fileobj):
        self.fileobj = fileobj
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size):
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        if self.buf.startswith("\037\213\010"):
            return "gz"
        if self.buf.startswith("BZh91"):
            return "bz2"
        return "tar"
class _BZ2Proxy(object):
    """Small proxy class that enables external file object
       support for "r:bz2" and "w:bz2" modes. This is actually
       a workaround for a limitation in bz2 module's BZ2File
       class which (unlike gzip.GzipFile) has no support for
       a file object argument.
    """

    blocksize = 16 * 1024

    def __init__(self, fileobj, mode):
        self.fileobj = fileobj
        self.mode = mode
        self.name = getattr(self.fileobj, "name", None)
        self.init()

    def init(self):
        import bz2
        self.pos = 0
        if self.mode == "r":
            self.bz2obj = bz2.BZ2Decompressor()
            self.fileobj.seek(0)
            self.buf = ""
        else:
            self.bz2obj = bz2.BZ2Compressor()

    def read(self, size):
        b = [self.buf]
        x = len(self.buf)
        while x < size:
            raw = self.fileobj.read(self.blocksize)
            if not raw:
                break
            data = self.bz2obj.decompress(raw)
            b.append(data)
            x += len(data)
        self.buf = "".join(b)

        buf = self.buf[:size]
        self.buf = self.buf[size:]
        self.pos += len(buf)
        return buf

    def seek(self, pos):
        if pos < self.pos:
            self.init()
        self.read(pos - self.pos)

    def tell(self):
        return self.pos

    def write(self, data):
        self.pos += len(data)
        raw = self.bz2obj.compress(data)
        self.fileobj.write(raw)

    def close(self):
        if self.mode == "w":
            raw = self.bz2obj.flush()
            self.fileobj.write(raw)
#------------------------
# Extraction file object
#------------------------
class _FileInFile(object):
    """A thin wrapper around an existing file object that
       provides a part of its data as an individual file
       object.
    """

    def __init__(self, fileobj, offset, size, sparse=None):
        self.fileobj = fileobj
        self.offset = offset
        self.size = size
        self.sparse = sparse
        self.position = 0

    def tell(self):
        """Return the current file position.
        """
        return self.position

    def seek(self, position):
        """Seek to a position in the file.
        """
        self.position = position

    def read(self, size=None):
        """Read data from the file.
        """
        if size is None:
            size = self.size - self.position
        else:
            size = min(size, self.size - self.position)

        if self.sparse is None:
            return self.readnormal(size)
        else:
            return self.readsparse(size)

    def readnormal(self, size):
        """Read operation for regular files.
        """
        self.fileobj.seek(self.offset + self.position)
        self.position += size
        return self.fileobj.read(size)

    def readsparse(self, size):
        """Read operation for sparse files.
        """
        data = []
        while size > 0:
            buf = self.readsparsesection(size)
            if not buf:
                break
            size -= len(buf)
            data.append(buf)
        return "".join(data)

    def readsparsesection(self, size):
        """Read a single section of a sparse file.
        """
        section = self.sparse.find(self.position)

        if section is None:
            return ""

        size = min(size, section.offset + section.size - self.position)

        if isinstance(section, _data):
            realpos = section.realpos + self.position - section.offset
            self.fileobj.seek(self.offset + realpos)
            self.position += size
            return self.fileobj.read(size)
        else:
            self.position += size
            return NUL * size
class ExFileObject(object):
    """File-like object for reading an archive member.
       Is returned by TarFile.extractfile().
    """
    blocksize = 1024

    def __init__(self, tarfile, tarinfo):
        self.fileobj = _FileInFile(tarfile.fileobj,
                                   tarinfo.offset_data,
                                   tarinfo.size,
                                   getattr(tarinfo, "sparse", None))
        self.name = tarinfo.name
        self.mode = "r"
        self.closed = False
        self.size = tarinfo.size

        self.position = 0
        self.buffer = ""

    def read(self, size=None):
        """Read at most size bytes from the file. If size is not
           present or None, read all data until EOF is reached.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        buf = ""
        if self.buffer:
            if size is None:
                buf = self.buffer
                self.buffer = ""
            else:
                buf = self.buffer[:size]
                self.buffer = self.buffer[size:]

        if size is None:
            buf += self.fileobj.read()
        else:
            buf += self.fileobj.read(size - len(buf))

        self.position += len(buf)
        return buf

    def readline(self, size=-1):
        """Read one entire line from the file. If size is present
           and non-negative, return a string with at most that
           size, which may be an incomplete line.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        if "\n" in self.buffer:
            pos = self.buffer.find("\n") + 1
        else:
            buffers = [self.buffer]
            while True:
                buf = self.fileobj.read(self.blocksize)
                buffers.append(buf)
                if not buf or "\n" in buf:
                    self.buffer = "".join(buffers)
                    pos = self.buffer.find("\n") + 1
                    if pos == 0:
                        # no newline found.
                        pos = len(self.buffer)
                    break

        if size != -1:
            pos = min(size, pos)

        buf = self.buffer[:pos]
        self.buffer = self.buffer[pos:]
        self.position += len(buf)
        return buf

    def readlines(self):
        """Return a list with all remaining lines.
        """
        result = []
        while True:
            line = self.readline()
            if not line:
                break
            result.append(line)
        return result

    def tell(self):
        """Return the current file position.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        return self.position

    def seek(self, pos, whence=os.SEEK_SET):
        """Seek to a position in the file.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file")

        if whence == os.SEEK_SET:
            self.position = min(max(pos, 0), self.size)
        elif whence == os.SEEK_CUR:
            if pos < 0:
                self.position = max(self.position + pos, 0)
            else:
                self.position = min(self.position + pos, self.size)
        elif whence == os.SEEK_END:
            self.position = max(min(self.size + pos, self.size), 0)
        else:
            raise ValueError("Invalid argument")

        self.buffer = ""
        self.fileobj.seek(self.position)

    def close(self):
        """Close the file object.
        """
        self.closed = True

    def __iter__(self):
        """Get an iterator over the file's lines.
        """
        while True:
            line = self.readline()
            if not line:
                break
            yield line
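
# Illustrative sketch (not part of the original module): reading a member
# through the file-like object returned by TarFile.extractfile().  `tar` is
# assumed to be an already opened TarFile and `membername` a regular file
# inside it.
def _example_read_member(tar, membername):
    fobj = tar.extractfile(membername)
    first_line = fobj.readline()
    rest = fobj.read()
    fobj.close()
    return first_line, rest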
class TarInfo(object):
    """Informational class which holds the details about an
       archive member given by a tar header block.
       TarInfo objects are returned by TarFile.getmember(),
       TarFile.getmembers() and TarFile.gettarinfo() and are
       usually created internally.
    """

    def __init__(self, name=""):
        """Construct a TarInfo object. name is the optional name
           of the member.
        """
        self.name = name        # member name
        self.mode = 0644        # file permissions
        self.uid = 0            # user id
        self.gid = 0            # group id
        self.size = 0           # file size
        self.mtime = 0          # modification time
        self.chksum = 0         # header checksum
        self.type = REGTYPE     # member type
        self.linkname = ""      # link name
        self.uname = "root"     # user name
        self.gname = "root"     # group name
        self.devmajor = 0       # device major number
        self.devminor = 0       # device minor number

        self.offset = 0         # the tar header starts here
        self.offset_data = 0    # the file's data starts here

        self.pax_headers = {}   # pax header information
    # In pax headers the "name" and "linkname" field are called
    # "path" and "linkpath".
    def _getpath(self):
        return self.name
    def _setpath(self, name):
        self.name = name
    path = property(_getpath, _setpath)

    def _getlinkpath(self):
        return self.linkname
    def _setlinkpath(self, linkname):
        self.linkname = linkname
    linkpath = property(_getlinkpath, _setlinkpath)

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.name, id(self))
    def get_info(self, encoding, errors):
        """Return the TarInfo's attributes as a dictionary.
        """
        info = {
            "name":     self.name,
            "mode":     self.mode & 07777,
            "uid":      self.uid,
            "gid":      self.gid,
            "size":     self.size,
            "mtime":    self.mtime,
            "chksum":   self.chksum,
            "type":     self.type,
            "linkname": self.linkname,
            "uname":    self.uname,
            "gname":    self.gname,
            "devmajor": self.devmajor,
            "devminor": self.devminor
        }

        if info["type"] == DIRTYPE and not info["name"].endswith("/"):
            info["name"] += "/"

        for key in ("name", "linkname", "uname", "gname"):
            if type(info[key]) is unicode:
                info[key] = info[key].encode(encoding, errors)

        return info
    def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="strict"):
        """Return a tar header as a string of 512 byte blocks.
        """
        info = self.get_info(encoding, errors)

        if format == USTAR_FORMAT:
            return self.create_ustar_header(info)
        elif format == GNU_FORMAT:
            return self.create_gnu_header(info)
        elif format == PAX_FORMAT:
            return self.create_pax_header(info, encoding, errors)
        else:
            raise ValueError("invalid format")
    def create_ustar_header(self, info):
        """Return the object as a ustar header block.
        """
        info["magic"] = POSIX_MAGIC

        if len(info["linkname"]) > LENGTH_LINK:
            raise ValueError("linkname is too long")

        if len(info["name"]) > LENGTH_NAME:
            info["prefix"], info["name"] = self._posix_split_name(info["name"])

        return self._create_header(info, USTAR_FORMAT)
    def create_gnu_header(self, info):
        """Return the object as a GNU header block sequence.
        """
        info["magic"] = GNU_MAGIC

        buf = ""
        if len(info["linkname"]) > LENGTH_LINK:
            buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK)

        if len(info["name"]) > LENGTH_NAME:
            buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME)

        return buf + self._create_header(info, GNU_FORMAT)
    def create_pax_header(self, info, encoding, errors):
        """Return the object as a ustar header block. If it cannot be
           represented this way, prepend a pax extended header sequence
           with supplement information.
        """
        info["magic"] = POSIX_MAGIC
        pax_headers = self.pax_headers.copy()

        # Test string fields for values that exceed the field length or cannot
        # be represented in ASCII encoding.
        for name, hname, length in (
                ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
                ("uname", "uname", 32), ("gname", "gname", 32)):

            if hname in pax_headers:
                # The pax header has priority.
                continue

            val = info[name].decode(encoding, errors)

            # Try to encode the string as ASCII.
            try:
                val.encode("ascii")
            except UnicodeEncodeError:
                pax_headers[hname] = val
                continue

            if len(info[name]) > length:
                pax_headers[hname] = val

        # Test number fields for values that exceed the field limit or values
        # that like to be stored as float.
        for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
            if name in pax_headers:
                # The pax header has priority. Avoid overflow.
                info[name] = 0
                continue

            val = info[name]
            if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
                pax_headers[name] = unicode(val)
                info[name] = 0

        # Create a pax extended header if necessary.
        if pax_headers:
            buf = self._create_pax_generic_header(pax_headers)
        else:
            buf = ""

        return buf + self._create_header(info, USTAR_FORMAT)
    @classmethod
    def create_pax_global_header(cls, pax_headers):
        """Return the object as a pax global header block sequence.
        """
        return cls._create_pax_generic_header(pax_headers, type=XGLTYPE)
    def _posix_split_name(self, name):
        """Split a name longer than 100 chars into a prefix
           and a name part.
        """
        prefix = name[:LENGTH_PREFIX + 1]
        while prefix and prefix[-1] != "/":
            prefix = prefix[:-1]

        name = name[len(prefix):]
        prefix = prefix[:-1]

        if not prefix or len(name) > LENGTH_NAME:
            raise ValueError("name is too long")
        return prefix, name
    @staticmethod
    def _create_header(info, format):
        """Return a header block. info is a dictionary with file
           information, format must be one of the *_FORMAT constants.
        """
        parts = [
            stn(info.get("name", ""), 100),
            itn(info.get("mode", 0) & 07777, 8, format),
            itn(info.get("uid", 0), 8, format),
            itn(info.get("gid", 0), 8, format),
            itn(info.get("size", 0), 12, format),
            itn(info.get("mtime", 0), 12, format),
            "        ", # checksum field
            info.get("type", REGTYPE),
            stn(info.get("linkname", ""), 100),
            stn(info.get("magic", POSIX_MAGIC), 8),
            stn(info.get("uname", "root"), 32),
            stn(info.get("gname", "root"), 32),
            itn(info.get("devmajor", 0), 8, format),
            itn(info.get("devminor", 0), 8, format),
            stn(info.get("prefix", ""), 155)
        ]

        buf = struct.pack("%ds" % BLOCKSIZE, "".join(parts))
        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
        buf = buf[:-364] + "%06o\0" % chksum + buf[-357:]
        return buf
    @staticmethod
    def _create_payload(payload):
        """Return the string payload filled with zero bytes
           up to the next 512 byte border.
        """
        blocks, remainder = divmod(len(payload), BLOCKSIZE)
        if remainder > 0:
            payload += (BLOCKSIZE - remainder) * NUL
        return payload
    @classmethod
    def _create_gnu_long_header(cls, name, type):
        """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
           for name.
        """
        name += NUL

        info = {}
        info["name"] = "././@LongLink"
        info["type"] = type
        info["size"] = len(name)
        info["magic"] = GNU_MAGIC

        # create extended header + name blocks.
        return cls._create_header(info, USTAR_FORMAT) + \
               cls._create_payload(name)
    @classmethod
    def _create_pax_generic_header(cls, pax_headers, type=XHDTYPE):
        """Return a POSIX.1-2001 extended or global header sequence
           that contains a list of keyword, value pairs. The values
           must be unicode objects.
        """
        records = []
        for keyword, value in pax_headers.iteritems():
            keyword = keyword.encode("utf8")
            value = value.encode("utf8")
            l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
            n = p = 0
            while True:
                n = l + len(str(p))
                if n == p:
                    break
                p = n
            records.append("%d %s=%s\n" % (p, keyword, value))
        records = "".join(records)

        # We use a hardcoded "././@PaxHeader" name like star does
        # instead of the one that POSIX recommends.
        info = {}
        info["name"] = "././@PaxHeader"
        info["type"] = type
        info["size"] = len(records)
        info["magic"] = POSIX_MAGIC

        # Create pax header + record blocks.
        return cls._create_header(info, USTAR_FORMAT) + \
               cls._create_payload(records)
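
    # Illustrative note (not part of the original module): each pax record is
    # "<length> <keyword>=<value>\n", where <length> counts the whole record,
    # including the length digits themselves, e.g.:
    #
    #     "20 path=foo/bar.txt\n"      (2 digits + 18 other characters = 20)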
    @classmethod
    def frombuf(cls, buf):
        """Construct a TarInfo object from a 512 byte string buffer.
        """
        if len(buf) != BLOCKSIZE:
            raise HeaderError("truncated header")
        if buf.count(NUL) == BLOCKSIZE:
            raise HeaderError("empty header")

        chksum = nti(buf[148:156])
        if chksum not in calc_chksums(buf):
            raise HeaderError("bad checksum")

        obj = cls()
        obj.buf = buf
        obj.name = nts(buf[0:100])
        obj.mode = nti(buf[100:108])
        obj.uid = nti(buf[108:116])
        obj.gid = nti(buf[116:124])
        obj.size = nti(buf[124:136])
        obj.mtime = nti(buf[136:148])
        obj.chksum = chksum
        obj.type = buf[156:157]
        obj.linkname = nts(buf[157:257])
        obj.uname = nts(buf[265:297])
        obj.gname = nts(buf[297:329])
        obj.devmajor = nti(buf[329:337])
        obj.devminor = nti(buf[337:345])
        prefix = nts(buf[345:500])

        # Old V7 tar format represents a directory as a regular
        # file with a trailing slash.
        if obj.type == AREGTYPE and obj.name.endswith("/"):
            obj.type = DIRTYPE

        # Remove redundant slashes from directories.
        if obj.isdir():
            obj.name = obj.name.rstrip("/")

        # Reconstruct a ustar longname.
        if prefix and obj.type not in GNU_TYPES:
            obj.name = prefix + "/" + obj.name
        return obj
    @classmethod
    def fromtarfile(cls, tarfile):
        """Return the next TarInfo object from TarFile object
           tarfile.
        """
        buf = tarfile.fileobj.read(BLOCKSIZE)
        if not buf:
            return
        obj = cls.frombuf(buf)
        obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
        return obj._proc_member(tarfile)
    #--------------------------------------------------------------------------
    # The following are methods that are called depending on the type of a
    # member. The entry point is _proc_member() which can be overridden in a
    # subclass to add custom _proc_*() methods. A _proc_*() method MUST
    # implement the following operations:
    #
    # 1. Set self.offset_data to the position where the data blocks begin,
    #    if there is data that follows.
    # 2. Set tarfile.offset to the position where the next member's header will
    #    begin.
    # 3. Return self or another valid TarInfo object.
    def _proc_member(self, tarfile):
        """Choose the right processing method depending on
           the type and call it.
        """
        if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
            return self._proc_gnulong(tarfile)
        elif self.type == GNUTYPE_SPARSE:
            return self._proc_sparse(tarfile)
        elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
            return self._proc_pax(tarfile)
        else:
            return self._proc_builtin(tarfile)
    def _proc_builtin(self, tarfile):
        """Process a builtin type or an unknown type which
           will be treated as a regular file.
        """
        self.offset_data = tarfile.fileobj.tell()
        offset = self.offset_data
        if self.isreg() or self.type not in SUPPORTED_TYPES:
            # Skip the following data blocks.
            offset += self._block(self.size)
        tarfile.offset = offset

        # Patch the TarInfo object with saved global
        # header information.
        self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

        return self
    def _proc_gnulong(self, tarfile):
        """Process the blocks that hold a GNU longname
           or longlink member.
        """
        buf = tarfile.fileobj.read(self._block(self.size))

        # Fetch the next header and process it.
        next = self.fromtarfile(tarfile)
        if next is None:
            raise HeaderError("missing subsequent header")

        # Patch the TarInfo object from the next header with
        # the longname information.
        next.offset = self.offset
        if self.type == GNUTYPE_LONGNAME:
            next.name = nts(buf)
        elif self.type == GNUTYPE_LONGLINK:
            next.linkname = nts(buf)

        return next
    def _proc_sparse(self, tarfile):
        """Process a GNU sparse header plus extra headers.
        """
        buf = self.buf
        sp = _ringbuffer()
        pos = 386
        lastpos = 0L
        realpos = 0L
        # There are 4 possible sparse structs in the
        # first header.
        for i in xrange(4):
            try:
                offset = nti(buf[pos:pos + 12])
                numbytes = nti(buf[pos + 12:pos + 24])
            except ValueError:
                break
            if offset > lastpos:
                sp.append(_hole(lastpos, offset - lastpos))
            sp.append(_data(offset, numbytes, realpos))
            realpos += numbytes
            lastpos = offset + numbytes
            pos += 24

        isextended = ord(buf[482])
        origsize = nti(buf[483:495])

        # If the isextended flag is given,
        # there are extra headers to process.
        while isextended == 1:
            buf = tarfile.fileobj.read(BLOCKSIZE)
            pos = 0
            for i in xrange(21):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                if offset > lastpos:
                    sp.append(_hole(lastpos, offset - lastpos))
                sp.append(_data(offset, numbytes, realpos))
                realpos += numbytes
                lastpos = offset + numbytes
                pos += 24
            isextended = ord(buf[504])

        if lastpos < origsize:
            sp.append(_hole(lastpos, origsize - lastpos))

        self.sparse = sp

        self.offset_data = tarfile.fileobj.tell()
        tarfile.offset = self.offset_data + self._block(self.size)
        self.size = origsize

        return self
    def _proc_pax(self, tarfile):
        """Process an extended or global header as described in
           POSIX.1-2001.
        """
        # Read the header information.
        buf = tarfile.fileobj.read(self._block(self.size))

        # A pax header stores supplemental information for either
        # the following file (extended) or all following files
        # (global).
        if self.type == XGLTYPE:
            pax_headers = tarfile.pax_headers
        else:
            pax_headers = tarfile.pax_headers.copy()

        # Parse pax header information. A record looks like that:
        # "%d %s=%s\n" % (length, keyword, value). length is the size
        # of the complete record including the length field itself and
        # the newline. keyword and value are both UTF-8 encoded strings.
        regex = re.compile(r"(\d+) ([^=]+)=", re.U)
        pos = 0
        while True:
            match = regex.match(buf, pos)
            if not match:
                break

            length, keyword = match.groups()
            length = int(length)
            value = buf[match.end(2) + 1:match.start(1) + length - 1]

            keyword = keyword.decode("utf8")
            value = value.decode("utf8")

            pax_headers[keyword] = value
            pos += length

        # Fetch the next header.
        next = self.fromtarfile(tarfile)

        if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
            if next is None:
                raise HeaderError("missing subsequent header")

            # Patch the TarInfo object with the extended header info.
            next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
            next.offset = self.offset

            if "size" in pax_headers:
                # If the extended header replaces the size field,
                # we need to recalculate the offset where the next
                # header starts.
                offset = next.offset_data
                if next.isreg() or next.type not in SUPPORTED_TYPES:
                    offset += next._block(next.size)
                tarfile.offset = offset

        return next
    def _apply_pax_info(self, pax_headers, encoding, errors):
        """Replace fields with supplemental information from a previous
           pax extended or global header.
        """
        for keyword, value in pax_headers.iteritems():
            if keyword not in PAX_FIELDS:
                continue

            if keyword == "path":
                value = value.rstrip("/")

            if keyword in PAX_NUMBER_FIELDS:
                try:
                    value = PAX_NUMBER_FIELDS[keyword](value)
                except ValueError:
                    value = 0
            else:
                value = uts(value, encoding, errors)

            setattr(self, keyword, value)

        self.pax_headers = pax_headers.copy()
    def _block(self, count):
        """Round up a byte count by BLOCKSIZE and return it,
           e.g. _block(834) => 1024.
        """
        blocks, remainder = divmod(count, BLOCKSIZE)
        if remainder:
            blocks += 1
        return blocks * BLOCKSIZE

    def isreg(self):
        return self.type in REGULAR_TYPES
    def isdir(self):
        return self.type == DIRTYPE
    def issym(self):
        return self.type == SYMTYPE
    def islnk(self):
        return self.type == LNKTYPE
    def ischr(self):
        return self.type == CHRTYPE
    def isblk(self):
        return self.type == BLKTYPE
    def isfifo(self):
        return self.type == FIFOTYPE
    def issparse(self):
        return self.type == GNUTYPE_SPARSE
    def isdev(self):
        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
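
# Illustrative sketch (not part of the original module): building a header
# block by hand.  tobuf() always returns whole 512-byte blocks.
def _example_tarinfo_header():
    ti = TarInfo("example.txt")
    ti.size = 42
    ti.mtime = 0
    buf = ti.tobuf(format=USTAR_FORMAT)
    assert len(buf) % BLOCKSIZE == 0
    assert TarInfo.frombuf(buf[:BLOCKSIZE]).name == "example.txt"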
class TarFile(object):
    """The TarFile Class provides an interface to tar archives.
    """

    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    errorlevel = 0              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The default ExFileObject class to use.
    def __init__(self, name=None, mode="r", fileobj=None, format=None,
            tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
            errors=None, pax_headers=None, debug=None, errorlevel=None):
        """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
           read from an existing archive, 'a' to append data to an existing
           file or 'w' to create a new file overwriting an existing one. `mode'
           defaults to 'r'.
           If `fileobj' is given, it is used for reading or writing data. If it
           can be determined, `mode' is overridden by `fileobj's mode.
           `fileobj' is not closed, when TarFile is closed.
        """
        if len(mode) > 1 or mode not in "raw":
            raise ValueError("mode must be 'r', 'a' or 'w'")
        self.mode = mode
        self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]

        if not fileobj:
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
                self.mode = "w"
                self._mode = "wb"
            fileobj = bltn_open(name, self._mode)
            self._extfileobj = False
        else:
            if name is None and hasattr(fileobj, "name"):
                name = fileobj.name
            if hasattr(fileobj, "mode"):
                self._mode = fileobj.mode
            self._extfileobj = True
        self.name = os.path.abspath(name) if name else None
        self.fileobj = fileobj

        # Init attributes.
        if format is not None:
            self.format = format
        if tarinfo is not None:
            self.tarinfo = tarinfo
        if dereference is not None:
            self.dereference = dereference
        if ignore_zeros is not None:
            self.ignore_zeros = ignore_zeros
        if encoding is not None:
            self.encoding = encoding

        if errors is not None:
            self.errors = errors
        elif mode == "r":
            self.errors = "utf-8"
        else:
            self.errors = "strict"

        if pax_headers is not None and self.format == PAX_FORMAT:
            self.pax_headers = pax_headers
        else:
            self.pax_headers = {}

        if debug is not None:
            self.debug = debug
        if errorlevel is not None:
            self.errorlevel = errorlevel

        # Init datastructures.
        self.closed = False
        self.members = []       # list of members as TarInfo objects
        self._loaded = False    # flag if all members have been read
        self.offset = self.fileobj.tell()
                                # current position in the archive file
        self.inodes = {}        # dictionary caching the inodes of
                                # archive members already added

        if self.mode == "r":
            self.firstmember = None
            self.firstmember = self.next()

        if self.mode == "a":
            # Move to the end of the archive,
            # before the first empty block.
            self.firstmember = None
            while True:
                if self.next() is None:
                    if self.offset > 0:
                        self.fileobj.seek(- BLOCKSIZE, 1)
                    break

        if self.mode in "aw":
            self._loaded = True

            if self.pax_headers:
                buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                self.fileobj.write(buf)
                self.offset += len(buf)
    def _getposix(self):
        return self.format == USTAR_FORMAT
    def _setposix(self, value):
        import warnings
        warnings.warn("use the format attribute instead", DeprecationWarning,
                      2)
        if value:
            self.format = USTAR_FORMAT
        else:
            self.format = GNU_FORMAT
    posix = property(_getposix, _setposix)
    #--------------------------------------------------------------------------
    # Below are the classmethods which act as alternate constructors to the
    # TarFile class. The open() method is the only one that is needed for
    # public use; it is the "super"-constructor and is able to select an
    # adequate "sub"-constructor for a particular compression using the mapping
    # from OPEN_METH.
    #
    # This concept allows one to subclass TarFile without losing the comfort of
    # the super-constructor. A sub-constructor is registered and made available
    # by adding it to the mapping in OPEN_METH.

    @classmethod
    def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
        """Open a tar archive for reading, writing or appending. Return
           an appropriate TarFile class.

           mode:
           'r' or 'r:*' open for reading with transparent compression
           'r:'         open for reading exclusively uncompressed
           'r:gz'       open for reading with gzip compression
           'r:bz2'      open for reading with bzip2 compression
           'a' or 'a:'  open for appending, creating the file if necessary
           'w' or 'w:'  open for writing without compression
           'w:gz'       open for writing with gzip compression
           'w:bz2'      open for writing with bzip2 compression

           'r|*'        open a stream of tar blocks with transparent compression
           'r|'         open an uncompressed stream of tar blocks for reading
           'r|gz'       open a gzip compressed stream of tar blocks
           'r|bz2'      open a bzip2 compressed stream of tar blocks
           'w|'         open an uncompressed stream for writing
           'w|gz'       open a gzip compressed stream for writing
           'w|bz2'      open a bzip2 compressed stream for writing
        """
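
        # Illustrative examples (not part of the original module):
        #   TarFile.open("archive.tar.gz", "r:gz")   # seekable, gzip-compressed file
        #   TarFile.open(mode="r|*", fileobj=pipe)   # non-seekable stream, compression
        #                                            # detected transparently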
        if not name and not fileobj:
            raise ValueError("nothing to open")

        if mode in ("r", "r:*"):
            # Find out which *open() is appropriate for opening the file.
            for comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
                if fileobj is not None:
                    saved_pos = fileobj.tell()
                try:
                    return func(name, "r", fileobj, **kwargs)
                except (ReadError, CompressionError), e:
                    if fileobj is not None:
                        fileobj.seek(saved_pos)
                    continue
            raise ReadError("file could not be opened successfully")

        elif ":" in mode:
            filemode, comptype = mode.split(":", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            # Select the *open() function according to
            # given compression.
            if comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
            else:
                raise CompressionError("unknown compression type %r" % comptype)
            return func(name, filemode, fileobj, **kwargs)

        elif "|" in mode:
            filemode, comptype = mode.split("|", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            if filemode not in "rw":
                raise ValueError("mode must be 'r' or 'w'")

            t = cls(name, filemode,
                    _Stream(name, filemode, comptype, fileobj, bufsize),
                    **kwargs)
            t._extfileobj = False
            return t

        elif mode in "aw":
            return cls.taropen(name, mode, fileobj, **kwargs)

        raise ValueError("undiscernible mode")
    @classmethod
    def taropen(cls, name, mode="r", fileobj=None, **kwargs):
        """Open uncompressed tar archive name for reading or writing.
        """
        if len(mode) > 1 or mode not in "raw":
            raise ValueError("mode must be 'r', 'a' or 'w'")
        return cls(name, mode, fileobj, **kwargs)
    @classmethod
    def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open gzip compressed tar archive name for reading or writing.
           Appending is not allowed.
        """
        if len(mode) > 1 or mode not in "rw":
            raise ValueError("mode must be 'r' or 'w'")

        try:
            import gzip
            gzip.GzipFile
        except (ImportError, AttributeError):
            raise CompressionError("gzip module is not available")

        if fileobj is None:
            fileobj = bltn_open(name, mode + "b")

        try:
            t = cls.taropen(name, mode,
                gzip.GzipFile(name, mode, compresslevel, fileobj),
                **kwargs)
        except IOError:
            raise ReadError("not a gzip file")
        t._extfileobj = False
        return t
    @classmethod
    def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open bzip2 compressed tar archive name for reading or writing.
           Appending is not allowed.
        """
        if len(mode) > 1 or mode not in "rw":
            raise ValueError("mode must be 'r' or 'w'.")

        try:
            import bz2
        except ImportError:
            raise CompressionError("bz2 module is not available")

        if fileobj is not None:
            fileobj = _BZ2Proxy(fileobj, mode)
        else:
            fileobj = bz2.BZ2File(name, mode, compresslevel=compresslevel)

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except IOError:
            raise ReadError("not a bzip2 file")
        t._extfileobj = False
        return t

    # All *open() methods are registered here.
    OPEN_METH = {
        "tar": "taropen",   # uncompressed tar
        "gz":  "gzopen",    # gzip compressed tar
        "bz2": "bz2open"    # bzip2 compressed tar
    }
    #--------------------------------------------------------------------------
    # The public methods which TarFile provides:

    def close(self):
        """Close the TarFile. In write-mode, two finishing zero blocks are
           appended to the archive.
        """
        if self.closed:
            return

        if self.mode in "aw":
            self.fileobj.write(NUL * (BLOCKSIZE * 2))
            self.offset += (BLOCKSIZE * 2)
            # fill up the end with zero-blocks
            # (like option -b20 for tar does)
            blocks, remainder = divmod(self.offset, RECORDSIZE)
            if remainder > 0:
                self.fileobj.write(NUL * (RECORDSIZE - remainder))

        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True
    def getmember(self, name):
        """Return a TarInfo object for member `name'. If `name' can not be
           found in the archive, KeyError is raised. If a member occurs more
           than once in the archive, its last occurrence is assumed to be the
           most up-to-date version.
        """
        tarinfo = self._getmember(name)
        if tarinfo is None:
            raise KeyError("filename %r not found" % name)
        return tarinfo
    def getmembers(self):
        """Return the members of the archive as a list of TarInfo objects. The
           list has the same order as the members in the archive.
        """
        self._check()
        if not self._loaded:    # if we want to obtain a list of
            self._load()        # all members, we first have to
                                # scan the whole archive.
        return self.members

    def getnames(self):
        """Return the members of the archive as a list of their names. It has
           the same order as the list returned by getmembers().
        """
        return [tarinfo.name for tarinfo in self.getmembers()]
    def gettarinfo(self, name=None, arcname=None, fileobj=None):
        """Create a TarInfo object for either the file `name' or the file
           object `fileobj' (using os.fstat on its file descriptor). You can
           modify some of the TarInfo's attributes before you add it using
           addfile(). If given, `arcname' specifies an alternative name for the
           file in the archive.
        """
        self._check("aw")

        # When fileobj is given, replace name by
        # fileobj's real name.
        if fileobj is not None:
            name = fileobj.name

        # Building the name of the member in the archive.
        # Backward slashes are converted to forward slashes,
        # Absolute paths are turned to relative paths.
        if arcname is None:
            arcname = name
        drv, arcname = os.path.splitdrive(arcname)
        arcname = arcname.replace(os.sep, "/")
        arcname = arcname.lstrip("/")

        # Now, fill the TarInfo object with
        # information specific for the file.
        tarinfo = self.tarinfo()
        tarinfo.tarfile = self

        # Use os.stat or os.lstat, depending on platform
        # and if symlinks shall be resolved.
        if fileobj is None:
            if hasattr(os, "lstat") and not self.dereference:
                statres = os.lstat(name)
            else:
                statres = os.stat(name)
        else:
            statres = os.fstat(fileobj.fileno())
        linkname = ""

        stmd = statres.st_mode
        if stat.S_ISREG(stmd):
            inode = (statres.st_ino, statres.st_dev)
            if not self.dereference and statres.st_nlink > 1 and \
                    inode in self.inodes and arcname != self.inodes[inode]:
                # Is it a hardlink to an already
                # archived file?
                type = LNKTYPE
                linkname = self.inodes[inode]
            else:
                # The inode is added only if its valid.
                # For win32 it is always 0.
                type = REGTYPE
                if inode[0]:
                    self.inodes[inode] = arcname
        elif stat.S_ISDIR(stmd):
            type = DIRTYPE
        elif stat.S_ISFIFO(stmd):
            type = FIFOTYPE
        elif stat.S_ISLNK(stmd):
            type = SYMTYPE
            linkname = os.readlink(name)
        elif stat.S_ISCHR(stmd):
            type = CHRTYPE
        elif stat.S_ISBLK(stmd):
            type = BLKTYPE
        else:
            return None

        # Fill the TarInfo object with all
        # information we can get.
        tarinfo.name = arcname
        tarinfo.mode = stmd
        tarinfo.uid = statres.st_uid
        tarinfo.gid = statres.st_gid
        if stat.S_ISREG(stmd):
            tarinfo.size = statres.st_size
        else:
            tarinfo.size = 0L
        tarinfo.mtime = statres.st_mtime
        tarinfo.type = type
        tarinfo.linkname = linkname
        if pwd:
            try:
                tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
            except KeyError:
                pass
        if grp:
            try:
                tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
            except KeyError:
                pass

        if type in (CHRTYPE, BLKTYPE):
            if hasattr(os, "major") and hasattr(os, "minor"):
                tarinfo.devmajor = os.major(statres.st_rdev)
                tarinfo.devminor = os.minor(statres.st_rdev)
        return tarinfo
    def list(self, verbose=True):
        """Print a table of contents to sys.stdout. If `verbose' is False, only
           the names of the members are printed. If it is True, an `ls -l'-like
           output is produced.
        """
        self._check()

        for tarinfo in self:
            if verbose:
                print filemode(tarinfo.mode),
                print "%s/%s" % (tarinfo.uname or tarinfo.uid,
                                 tarinfo.gname or tarinfo.gid),
                if tarinfo.ischr() or tarinfo.isblk():
                    print "%10s" % ("%d,%d" \
                                    % (tarinfo.devmajor, tarinfo.devminor)),
                else:
                    print "%10d" % tarinfo.size,
                print "%d-%02d-%02d %02d:%02d:%02d" \
                      % time.localtime(tarinfo.mtime)[:6],

            print tarinfo.name + ("/" if tarinfo.isdir() else ""),

            if verbose:
                if tarinfo.issym():
                    print "->", tarinfo.linkname,
                if tarinfo.islnk():
                    print "link to", tarinfo.linkname,
            print
    def add(self, name, arcname=None, recursive=True, exclude=None, filter=None):
        """Add the file `name' to the archive. `name' may be any type of file
           (directory, fifo, symbolic link, etc.). If given, `arcname'
           specifies an alternative name for the file in the archive.
           Directories are added recursively by default. This can be avoided by
           setting `recursive' to False. `exclude' is a function that should
           return True for each filename to be excluded. `filter' is a function
           that expects a TarInfo object argument and returns the changed
           TarInfo object, if it returns None the TarInfo object will be
           excluded from the archive.
        """
        self._check("aw")

        if arcname is None:
            arcname = name

        # Exclude pathnames.
        if exclude is not None:
            import warnings
            warnings.warn("use the filter argument instead",
                          DeprecationWarning, 2)
            if exclude(name):
                self._dbg(2, "tarfile: Excluded %r" % name)
                return

        # Skip if somebody tries to archive the archive...
        if self.name is not None and os.path.abspath(name) == self.name:
            self._dbg(2, "tarfile: Skipped %r" % name)
            return

        self._dbg(1, name)

        # Create a TarInfo object from the file.
        tarinfo = self.gettarinfo(name, arcname)

        if tarinfo is None:
            self._dbg(1, "tarfile: Unsupported type %r" % name)
            return

        # Change or exclude the TarInfo object.
        if filter is not None:
            tarinfo = filter(tarinfo)
            if tarinfo is None:
                self._dbg(2, "tarfile: Excluded %r" % name)
                return

        # Append the tar header and data to the archive.
        if tarinfo.isreg():
            f = bltn_open(name, "rb")
            self.addfile(tarinfo, f)
            f.close()

        elif tarinfo.isdir():
            self.addfile(tarinfo)
            if recursive:
                for f in os.listdir(name):
                    self.add(os.path.join(name, f), os.path.join(arcname, f),
                            recursive, exclude, filter)

        else:
            self.addfile(tarinfo)
    def addfile(self, tarinfo, fileobj=None):
        """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
           given, tarinfo.size bytes are read from it and added to the archive.
           You can create TarInfo objects using gettarinfo().
           On Windows platforms, `fileobj' should always be opened with mode
           'rb' to avoid irritation about the file size.
        """
        self._check("aw")

        tarinfo = copy.copy(tarinfo)

        buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
        self.fileobj.write(buf)
        self.offset += len(buf)

        # If there's data to follow, append it.
        if fileobj is not None:
            copyfileobj(fileobj, self.fileobj, tarinfo.size)
            blocks, remainder = divmod(tarinfo.size, BLOCKSIZE)
            if remainder > 0:
                self.fileobj.write(NUL * (BLOCKSIZE - remainder))
                blocks += 1
            self.offset += blocks * BLOCKSIZE

        self.members.append(tarinfo)
    def extractall(self, path=".", members=None):
        """Extract all members from the archive to the current working
           directory and set owner, modification time and permissions on
           directories afterwards. `path' specifies a different directory
           to extract to. `members' is optional and must be a subset of the
           list returned by getmembers().
        """
        directories = []

        if members is None:
            members = self

        for tarinfo in members:
            if tarinfo.isdir():
                # Extract directories with a safe mode.
                directories.append(tarinfo)
                tarinfo = copy.copy(tarinfo)
                tarinfo.mode = 0700
            self.extract(tarinfo, path)

        # Reverse sort directories.
        directories.sort(key=operator.attrgetter('name'))
        directories.reverse()

        # Set correct owner, mtime and filemode on directories.
        for tarinfo in directories:
            dirpath = os.path.join(path, tarinfo.name)
            try:
                self.chown(tarinfo, dirpath)
                self.utime(tarinfo, dirpath)
                self.chmod(tarinfo, dirpath)
            except ExtractError, e:
                if self.errorlevel > 1:
                    raise
                else:
                    self._dbg(1, "tarfile: %s" % e)
    def extract(self, member, path=""):
        """Extract a member from the archive to the current working directory,
           using its full name. Its file information is extracted as accurately
           as possible. `member' may be a filename or a TarInfo object. You can
           specify a different directory using `path'.
        """
        self._check("r")

        if isinstance(member, basestring):
            tarinfo = self.getmember(member)
        else:
            tarinfo = member

        # Prepare the link target for makelink().
        if tarinfo.islnk():
            tarinfo._link_target = os.path.join(path, tarinfo.linkname)

        try:
            self._extract_member(tarinfo, os.path.join(path, tarinfo.name))
        except EnvironmentError, e:
            if self.errorlevel > 0:
                raise
            else:
                if e.filename is None:
                    self._dbg(1, "tarfile: %s" % e.strerror)
                else:
                    self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
        except ExtractError, e:
            if self.errorlevel > 1:
                raise
            else:
                self._dbg(1, "tarfile: %s" % e)
    def extractfile(self, member):
        """Extract a member from the archive as a file object. `member' may be
           a filename or a TarInfo object. If `member' is a regular file, a
           file-like object is returned. If `member' is a link, a file-like
           object is constructed from the link's target. If `member' is none of
           the above, None is returned.
           The file-like object is read-only and provides the following
           methods: read(), readline(), readlines(), seek() and tell()
        """
        self._check("r")

        if isinstance(member, basestring):
            tarinfo = self.getmember(member)
        else:
            tarinfo = member

        if tarinfo.isreg():
            return self.fileobject(self, tarinfo)

        elif tarinfo.type not in SUPPORTED_TYPES:
            # If a member's type is unknown, it is treated as a
            # regular file.
            return self.fileobject(self, tarinfo)

        elif tarinfo.islnk() or tarinfo.issym():
            if isinstance(self.fileobj, _Stream):
                # A small but ugly workaround for the case that someone tries
                # to extract a (sym)link as a file-object from a non-seekable
                # stream of tar blocks.
                raise StreamError("cannot extract (sym)link as file object")
            else:
                # A (sym)link's file object is its target's file object.
                return self.extractfile(self._getmember(tarinfo.linkname,
                                                         tarinfo))
        else:
            # If there's no data associated with the member (directory, chrdev,
            # blkdev, etc.), return None instead of a file object.
            return None
    def _extract_member(self, tarinfo, targetpath):
        """Extract the TarInfo object tarinfo to a physical
           file called targetpath.
        """
        # Fetch the TarInfo object for the given name
        # and build the destination pathname, replacing
        # forward slashes to platform specific separators.
        targetpath = targetpath.rstrip("/")
        targetpath = targetpath.replace("/", os.sep)

        # Create all upper directories.
        upperdirs = os.path.dirname(targetpath)
        if upperdirs and not os.path.exists(upperdirs):
            # Create directories that are not part of the archive with
            # default permissions.
            os.makedirs(upperdirs)

        if tarinfo.islnk() or tarinfo.issym():
            self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
        else:
            self._dbg(1, tarinfo.name)

        if tarinfo.isreg():
            self.makefile(tarinfo, targetpath)
        elif tarinfo.isdir():
            self.makedir(tarinfo, targetpath)
        elif tarinfo.isfifo():
            self.makefifo(tarinfo, targetpath)
        elif tarinfo.ischr() or tarinfo.isblk():
            self.makedev(tarinfo, targetpath)
        elif tarinfo.islnk() or tarinfo.issym():
            self.makelink(tarinfo, targetpath)
        elif tarinfo.type not in SUPPORTED_TYPES:
            self.makeunknown(tarinfo, targetpath)
        else:
            self.makefile(tarinfo, targetpath)

        self.chown(tarinfo, targetpath)
        if not tarinfo.issym():
            self.chmod(tarinfo, targetpath)
            self.utime(tarinfo, targetpath)
    #--------------------------------------------------------------------------
    # Below are the different file methods. They are called via
    # _extract_member() when extract() is called. They can be replaced in a
    # subclass to implement other functionality.

    def makedir(self, tarinfo, targetpath):
        """Make a directory called targetpath.
        """
        try:
            # Use a safe mode for the directory, the real mode is set
            # later in _extract_member().
            os.mkdir(targetpath, 0700)
        except EnvironmentError, e:
            if e.errno != errno.EEXIST:
                raise

    def makefile(self, tarinfo, targetpath):
        """Make a file called targetpath.
        """
        source = self.extractfile(tarinfo)
        target = bltn_open(targetpath, "wb")
        copyfileobj(source, target)
        source.close()
        target.close()

    def makeunknown(self, tarinfo, targetpath):
        """Make a file from a TarInfo object with an unknown type
           at targetpath.
        """
        self.makefile(tarinfo, targetpath)
        self._dbg(1, "tarfile: Unknown file type %r, " \
                     "extracted as regular file." % tarinfo.type)

    def makefifo(self, tarinfo, targetpath):
        """Make a fifo called targetpath.
        """
        if hasattr(os, "mkfifo"):
            os.mkfifo(targetpath)
        else:
            raise ExtractError("fifo not supported by system")
    def makedev(self, tarinfo, targetpath):
        """Make a character or block device called targetpath.
        """
        if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
            raise ExtractError("special devices not supported by system")

        mode = tarinfo.mode
        if tarinfo.isblk():
            mode |= stat.S_IFBLK
        else:
            mode |= stat.S_IFCHR

        os.mknod(targetpath, mode,
                 os.makedev(tarinfo.devmajor, tarinfo.devminor))
    def makelink(self, tarinfo, targetpath):
        """Make a (symbolic) link called targetpath. If it cannot be created
           (platform limitation), we try to make a copy of the referenced file
           instead of a link.
        """
        try:
            if tarinfo.issym():
                os.symlink(tarinfo.linkname, targetpath)
            else:
                # See extract().
                os.link(tarinfo._link_target, targetpath)
        except AttributeError:
            if tarinfo.issym():
                linkpath = os.path.dirname(tarinfo.name) + "/" + \
                           tarinfo.linkname
            else:
                linkpath = tarinfo.linkname

            try:
                self._extract_member(self.getmember(linkpath), targetpath)
            except (EnvironmentError, KeyError), e:
                linkpath = linkpath.replace("/", os.sep)
                try:
                    shutil.copy2(linkpath, targetpath)
                except EnvironmentError, e:
                    raise IOError("link could not be created")
    def chown(self, tarinfo, targetpath):
        """Set owner of targetpath according to tarinfo.
        """
        if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
            # We have to be root to do so.
            try:
                g = grp.getgrnam(tarinfo.gname)[2]
            except KeyError:
                try:
                    g = grp.getgrgid(tarinfo.gid)[2]
                except KeyError:
                    g = os.getgid()
            try:
                u = pwd.getpwnam(tarinfo.uname)[2]
            except KeyError:
                try:
                    u = pwd.getpwuid(tarinfo.uid)[2]
                except KeyError:
                    u = os.getuid()
            try:
                if tarinfo.issym() and hasattr(os, "lchown"):
                    os.lchown(targetpath, u, g)
                else:
                    if sys.platform != "os2emx":
                        os.chown(targetpath, u, g)
            except EnvironmentError, e:
                raise ExtractError("could not change owner")
    def chmod(self, tarinfo, targetpath):
        """Set file permissions of targetpath according to tarinfo.
        """
        if hasattr(os, 'chmod'):
            try:
                os.chmod(targetpath, tarinfo.mode)
            except EnvironmentError, e:
                raise ExtractError("could not change mode")

    def utime(self, tarinfo, targetpath):
        """Set modification time of targetpath according to tarinfo.
        """
        if not hasattr(os, 'utime'):
            return
        try:
            os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
        except EnvironmentError, e:
            raise ExtractError("could not change modification time")
    #--------------------------------------------------------------------------
    def next(self):
        """Return the next member of the archive as a TarInfo object, when
           TarFile is opened for reading. Return None if there is no more
           available.
        """
        self._check("ra")
        if self.firstmember is not None:
            m = self.firstmember
            self.firstmember = None
            return m

        # Read the next block.
        self.fileobj.seek(self.offset)
        while True:
            try:
                tarinfo = self.tarinfo.fromtarfile(self)
                if tarinfo is None:
                    return
                self.members.append(tarinfo)

            except HeaderError, e:
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
                else:
                    if self.offset == 0:
                        raise ReadError(str(e))
                    return None
            break

        return tarinfo
    #--------------------------------------------------------------------------
    # Little helper methods:

    def _getmember(self, name, tarinfo=None):
        """Find an archive member by name from bottom to top.
           If tarinfo is given, it is used as the starting point.
        """
        # Ensure that all members have been loaded.
        members = self.getmembers()

        if tarinfo is None:
            end = len(members)
        else:
            end = members.index(tarinfo)

        for i in xrange(end - 1, -1, -1):
            if name == members[i].name:
                return members[i]

    def _load(self):
        """Read through the entire archive file and look for readable
           members.
        """
        while True:
            tarinfo = self.next()
            if tarinfo is None:
                break
        self._loaded = True
    def _check(self, mode=None):
        """Check if TarFile is still open, and if the operation's mode
           corresponds to TarFile's mode.
        """
        if self.closed:
            raise IOError("%s is closed" % self.__class__.__name__)
        if mode is not None and self.mode not in mode:
            raise IOError("bad operation for mode %r" % self.mode)

    def __iter__(self):
        """Provide an iterator object.
        """
        if self._loaded:
            return iter(self.members)
        else:
            return TarIter(self)

    def _dbg(self, level, msg):
        """Write debugging output to sys.stderr.
        """
        if level <= self.debug:
            print >> sys.stderr, msg
class TarIter:
    """Iterator Class.

       for tarinfo in TarFile(...):
           suite...
    """

    def __init__(self, tarfile):
        """Construct a TarIter object.
        """
        self.tarfile = tarfile
        self.index = 0
    def __iter__(self):
        """Return iterator object.
        """
        return self
    def next(self):
        """Return the next item using TarFile's next() method.
           When all members have been read, set TarFile as _loaded.
        """
        # Fix for SF #1100429: Under rare circumstances it can
        # happen that getmembers() is called during iteration,
        # which will cause TarIter to stop prematurely.
        if not self.tarfile._loaded:
            tarinfo = self.tarfile.next()
            if not tarinfo:
                self.tarfile._loaded = True
                raise StopIteration
        else:
            try:
                tarinfo = self.tarfile.members[self.index]
            except IndexError:
                raise StopIteration
        self.index += 1
        return tarinfo
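
# Illustrative sketch (not part of the original module): iterating a TarFile
# yields TarInfo objects lazily via TarIter, so even large archives can be
# walked without loading the full member list first.  `tar` is assumed to be
# an open TarFile.
def _example_iterate(tar):
    names = []
    for tarinfo in tar:
        if tarinfo.isreg():
            names.append(tarinfo.name)
    return names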
# Helper classes for sparse file support
class _section:
    """Base class for _data and _hole.
    """
    def __init__(self, offset, size):
        self.offset = offset
        self.size = size
    def __contains__(self, offset):
        return self.offset <= offset < self.offset + self.size

class _data(_section):
    """Represent a data section in a sparse file.
    """
    def __init__(self, offset, size, realpos):
        _section.__init__(self, offset, size)
        self.realpos = realpos

class _hole(_section):
    """Represent a hole section in a sparse file.
    """
    pass

class _ringbuffer(list):
    """Ringbuffer class which increases performance
       over a regular list.
    """
    def __init__(self):
        self.idx = 0
    def find(self, offset):
        idx = self.idx
        while True:
            item = self[idx]
            if offset in item:
                break
            idx += 1
            if idx == len(self):
                idx = 0
            if idx == self.idx:
                # End of File
                return None
        self.idx = idx
        return item
#---------------------------------------------
# zipfile compatible TarFile class
#---------------------------------------------
TAR_PLAIN = 0           # zipfile.ZIP_STORED
TAR_GZIPPED = 8         # zipfile.ZIP_DEFLATED
class TarFileCompat:
    """TarFile class compatible with standard module zipfile's
       ZipFile class.
    """
    def __init__(self, file, mode="r", compression=TAR_PLAIN):
        from warnings import warnpy3k
        warnpy3k("the TarFileCompat class has been removed in Python 3.0",
                 stacklevel=2)
        if compression == TAR_PLAIN:
            self.tarfile = TarFile.taropen(file, mode)
        elif compression == TAR_GZIPPED:
            self.tarfile = TarFile.gzopen(file, mode)
        else:
            raise ValueError("unknown compression constant")
        if mode[0:1] == "r":
            members = self.tarfile.getmembers()
            for m in members:
                m.filename = m.name
                m.file_size = m.size
                m.date_time = time.gmtime(m.mtime)[:6]
    def namelist(self):
        return map(lambda m: m.name, self.infolist())
    def infolist(self):
        return filter(lambda m: m.type in REGULAR_TYPES,
                      self.tarfile.getmembers())
    def getinfo(self, name):
        return self.tarfile.getmember(name)
    def read(self, name):
        return self.tarfile.extractfile(self.tarfile.getmember(name)).read()
    def write(self, filename, arcname=None, compress_type=None):
        self.tarfile.add(filename, arcname)
    def writestr(self, zinfo, bytes):
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        import calendar
        tinfo = TarInfo(zinfo.filename)
        tinfo.size = len(bytes)
        tinfo.mtime = calendar.timegm(zinfo.date_time)
        self.tarfile.addfile(tinfo, StringIO(bytes))
    def close(self):
        self.tarfile.close()
#class TarFileCompat
#--------------------
# exported functions
#--------------------
def is_tarfile(name):
    """Return True if name points to a tar archive that we
       are able to handle, else return False.
    """
    try:
        t = open(name)
        t.close()
        return True
    except TarError:
        return False

bltn_open = open
open = TarFile.open
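
# Illustrative sketch (not part of the original module): a typical round trip
# with the public API.  "example.tar.gz", "somefile.txt" and the output
# directory are assumptions made only for the sake of the example.
def _example_roundtrip():
    # create a gzip-compressed archive
    tar = TarFile.open("example.tar.gz", "w:gz")
    tar.add("somefile.txt", arcname="docs/somefile.txt")
    tar.close()

    # list and extract it again
    if is_tarfile("example.tar.gz"):
        tar = TarFile.open("example.tar.gz", "r:*")
        print tar.getnames()
        tar.extractall(path="unpacked")
        tar.close()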