1 """Functions that read and write gzipped files.
3 The user of the file doesn't have to worry about the compression,
4 but random access is not allowed."""
6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
8 import struct, sys, time, os
9 import zlib
10 import io
11 import __builtin__
13 __all__ = ["GzipFile","open"]
15 FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
17 READ, WRITE = 1, 2
19 def write32u(output, value):
20 # The L format writes the bit pattern correctly whether signed
21 # or unsigned.
22 output.write(struct.pack("<L", value))
24 def read32(input):
25 return struct.unpack("<I", input.read(4))[0]
27 def open(filename, mode="rb", compresslevel=9):
28 """Shorthand for GzipFile(filename, mode, compresslevel).
30 The filename argument is required; mode defaults to 'rb'
31 and compresslevel defaults to 9.
33 """
34 return GzipFile(filename, mode, compresslevel)
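# A minimal usage sketch for open() (not part of the module; the file name
# below is hypothetical). Writing a small file and reading it back:
#
#     import gzip
#     f = gzip.open('example.txt.gz', 'wb')
#     f.write('Hello, gzip!\n')
#     f.close()
#     f = gzip.open('example.txt.gz', 'rb')
#     print f.read()          # -> 'Hello, gzip!\n'
#     f.close()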
class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    myfileobj = None
    max_read_chunk = 10 * 1024 * 1024 # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name'): filename = fileobj.name
            else: filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'): mode = fileobj.mode
            else: mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = ""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise IOError, "Mode " + mode + " not supported"

        self.fileobj = fileobj
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()
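    # A minimal sketch of compressing to an in-memory buffer via the fileobj
    # argument (assumes cStringIO from the standard library; the file name
    # and variable names are illustrative only):
    #
    #     import gzip, cStringIO
    #     buf = cStringIO.StringIO()
    #     gz = gzip.GzipFile(filename='payload.txt', mode='wb', fileobj=buf)
    #     gz.write('some data')
    #     gz.close()                     # writes the CRC/size trailer to buf
    #     compressed = buf.getvalue()    # the complete gzip stream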
    @property
    def filename(self):
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        self.name = filename
        self.crc = zlib.crc32("") & 0xffffffffL
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        fname = os.path.basename(self.name)
        if fname.endswith(".gz"):
            fname = fname[:-3]
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, long(mtime))
        self.fileobj.write('\002')
        self.fileobj.write('\377')
        if fname:
            self.fileobj.write(fname + '\000')
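    # For reference, _write_gzip_header() emits the fixed 10-byte header of
    # RFC 1952, optionally followed by the NUL-terminated original filename.
    # As a worked example (a sketch), with mtime=0 and no filename the bytes
    # written above are, in hex:
    #
    #     1f 8b          magic number
    #     08             compression method (deflate)
    #     00             flags (FNAME would be 0x08)
    #     00 00 00 00    mtime, 32-bit little-endian
    #     02             extra flags (slowest compression)
    #     ff             OS (unknown)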
    def _init_read(self):
        self.crc = zlib.crc32("") & 0xffffffffL
        self.size = 0

    def _read_gzip_header(self):
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError, 'Not a gzipped file'
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError, 'Unknown compression method'
        flag = ord( self.fileobj.read(1) )
        self.mtime = read32(self.fileobj)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self,data):
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError, "write() on closed GzipFile object"

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        if self.fileobj is None:
            raise EOFError, "Reached EOF"

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.

            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError, "Reached EOF"
            else:
                self.fileobj.seek( pos ) # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError, 'Reached EOF'

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True
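    # Note: because _read() restarts at the next gzip header whenever
    # unused_data is found, concatenated gzip members decompress as one
    # continuous stream.  A minimal sketch (file names hypothetical):
    #
    #     # shell: cat a.gz b.gz > both.gz
    #     f = gzip.open('both.gz', 'rb')
    #     data = f.read()    # contents of a followed by contents of b
    #     f.close()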
    def _add_read_data(self, data):
        self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data match the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffffL):
            raise IOError, "Incorrect length of data produced"

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = "\x00"
        while c == "\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.seek(-1, 1)

    @property
    def closed(self):
        return self.fileobj is None

    def close(self):
        if self.fileobj is None:
            return
        if self.mode == WRITE:
            self.fileobj.write(self.compress.flush())
            write32u(self.fileobj, self.crc)
            # self.size may exceed 2GB, or even 4GB
            write32u(self.fileobj, self.size & 0xffffffffL)
            self.fileobj = None
        elif self.mode == READ:
            self.fileobj = None
        if self.myfileobj:
            self.myfileobj.close()
            self.myfileobj = None

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = ""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            for i in range(count // 1024):
                self.write(1024 * '\0')
            self.write((count % 1024) * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find('\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxint
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find('\n')

            # We set i = size - 1 to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == '':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return ''.join(bufs)   # Return resulting line
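    # readline() is also what makes line-by-line iteration work, via the
    # io.IOBase machinery.  A minimal sketch, assuming a gzipped text file
    # named 'log.txt.gz' and a caller-supplied process() function:
    #
    #     for line in gzip.open('log.txt.gz', 'rb'):
    #         process(line)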
def _test():
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
                g = sys.stdout
            else:
                if arg[-3:] != ".gz":
                    print "filename doesn't end in .gz:", repr(arg)
                    continue
                f = open(arg, "rb")
                g = __builtin__.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
            else:
                f = __builtin__.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        while True:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        if g is not sys.stdout:
            g.close()
        if f is not sys.stdin:
            f.close()

if __name__ == '__main__':
    _test()