gzip.py

   1 """Functions that read and write gzipped files.
   2
   3 The user of the file doesn't have to worry about the compression,
   4 but random access is not allowed."""
   5
   6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
   7
   8 import struct, sys, time, os
   9 import zlib
  10 import io
  11 import __builtin__
  12
  13 __all__ = ["GzipFile","open"]
  14
  15 FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
  16
  17 READ, WRITE = 1, 2
  18
  19 def write32u(output, value):
  20     # The L format writes the bit pattern correctly whether signed
  21     # or unsigned.
  22     output.write(struct.pack("<L", value))
  23
  24 def read32(input):
  25     return struct.unpack("<I", input.read(4))[0]
  26
  27 def open(filename, mode="rb", compresslevel=9):
  28     """Shorthand for GzipFile(filename, mode, compresslevel).
  29
  30     The filename argument is required; mode defaults to 'rb'
  31     and compresslevel defaults to 9.
  32
  33     """
  34     return GzipFile(filename, mode, compresslevel)
  35
  36 class GzipFile(io.BufferedIOBase):
  37     """The GzipFile class simulates most of the methods of a file object with
  38     the exception of the readinto() and truncate() methods.
  39
  40     """
  41
  42     myfileobj = None
  43     max_read_chunk = 10 * 1024 * 1024   # 10Mb
  44
  45     def __init__(self, filename=None, mode=None,
  46                  compresslevel=9, fileobj=None, mtime=None):
  47         """Constructor for the GzipFile class.
  48
  49         At least one of fileobj and filename must be given a
  50         non-trivial value.
  51
  52         The new class instance is based on fileobj, which can be a regular
  53         file, a StringIO object, or any other object which simulates a file.
  54         It defaults to None, in which case filename is opened to provide
  55         a file object.
  56
  57         When fileobj is not None, the filename argument is only used to be
  58         included in the gzip file header, which may includes the original
  59         filename of the uncompressed file.  It defaults to the filename of
  60         fileobj, if discernible; otherwise, it defaults to the empty string,
  61         and in this case the original filename is not included in the header.
  62
  63         The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
  64         depending on whether the file will be read or written.  The default
  65         is the mode of fileobj if discernible; otherwise, the default is 'rb'.
  66         Be aware that only the 'rb', 'ab', and 'wb' values should be used
  67         for cross-platform portability.
  68
  69         The compresslevel argument is an integer from 0 to 9 controlling the
  70         level of compression; 1 is fastest and produces the least compression,
  71         and 9 is slowest and produces the most compression. 0 is no compression
  72         at all. The default is 9.
  73
  74         The mtime argument is an optional numeric timestamp to be written
  75         to the stream when compressing.  All gzip compressed streams
  76         are required to contain a timestamp.  If omitted or None, the
  77         current time is used.  This module ignores the timestamp when
  78         decompressing; however, some programs, such as gunzip, make use
  79         of it.  The format of the timestamp is the same as that of the
  80         return value of time.time() and of the st_mtime member of the
  81         object returned by os.stat().
  82
  83         """
  84
  85         # Make sure we don't inadvertently enable universal newlines on the
  86         # underlying file object - in read mode, this causes data corruption.
  87         if mode:
  88             mode = mode.replace('U', '')
  89         # guarantee the file is opened in binary mode on platforms
  90         # that care about that sort of thing
  91         if mode and 'b' not in mode:
  92             mode += 'b'
  93         if fileobj is None:
  94             fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
  95         if filename is None:
  96             # Issue #13781: os.fdopen() creates a fileobj with a bogus name
  97             # attribute. Avoid saving this in the gzip header's filename field.
  98             if hasattr(fileobj, 'name') and fileobj.name != '<fdopen>':
  99                 filename = fileobj.name
 100             else:
 101                 filename = ''
 102         if mode is None:
 103             if hasattr(fileobj, 'mode'): mode = fileobj.mode
 104             else: mode = 'rb'
 105
 106         if mode[0:1] == 'r':
 107             self.mode = READ
 108             # Set flag indicating start of a new member
 109             self._new_member = True
 110             # Buffer data read from gzip file. extrastart is offset in
 111             # stream where buffer starts. extrasize is number of
 112             # bytes remaining in buffer from current stream position.
 113             self.extrabuf = ""
 114             self.extrasize = 0
 115             self.extrastart = 0
 116             self.name = filename
 117             # Starts small, scales exponentially
 118             self.min_readsize = 100
 119
 120         elif mode[0:1] == 'w' or mode[0:1] == 'a':
 121             self.mode = WRITE
 122             self._init_write(filename)
 123             self.compress = zlib.compressobj(compresslevel,
 124                                              zlib.DEFLATED,
 125                                              -zlib.MAX_WBITS,
 126                                              zlib.DEF_MEM_LEVEL,
 127                                              0)
 128         else:
 129             raise IOError, "Mode " + mode + " not supported"
 130
 131         self.fileobj = fileobj
 132         self.offset = 0
 133         self.mtime = mtime
 134
 135         if self.mode == WRITE:
 136             self._write_gzip_header()
 137
 138     @property
 139     def filename(self):
 140         import warnings
 141         warnings.warn("use the name attribute", DeprecationWarning, 2)
 142         if self.mode == WRITE and self.name[-3:] != ".gz":
 143             return self.name + ".gz"
 144         return self.name
 145
 146     def __repr__(self):
 147         s = repr(self.fileobj)
 148         return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
 149
 150     def _check_closed(self):
 151         """Raises a ValueError if the underlying file object has been closed.
 152
 153         """
 154         if self.closed:
 155             raise ValueError('I/O operation on closed file.')
 156
 157     def _init_write(self, filename):
 158         self.name = filename
 159         self.crc = zlib.crc32("") & 0xffffffffL
 160         self.size = 0
 161         self.writebuf = []
 162         self.bufsize = 0
 163
 164     def _write_gzip_header(self):
 165         self.fileobj.write('\037\213')             # magic header
 166         self.fileobj.write('\010')                 # compression method
 167         fname = os.path.basename(self.name)
 168         if fname.endswith(".gz"):
 169             fname = fname[:-3]
 170         flags = 0
 171         if fname:
 172             flags = FNAME
 173         self.fileobj.write(chr(flags))
 174         mtime = self.mtime
 175         if mtime is None:
 176             mtime = time.time()
 177         write32u(self.fileobj, long(mtime))
 178         self.fileobj.write('\002')
 179         self.fileobj.write('\377')
 180         if fname:
 181             self.fileobj.write(fname + '\000')
 182
 183     def _init_read(self):
 184         self.crc = zlib.crc32("") & 0xffffffffL
 185         self.size = 0
 186
 187     def _read_gzip_header(self):
 188         magic = self.fileobj.read(2)
 189         if magic != '\037\213':
 190             raise IOError, 'Not a gzipped file'
 191         method = ord( self.fileobj.read(1) )
 192         if method != 8:
 193             raise IOError, 'Unknown compression method'
 194         flag = ord( self.fileobj.read(1) )
 195         self.mtime = read32(self.fileobj)
 196         # extraflag = self.fileobj.read(1)
 197         # os = self.fileobj.read(1)
 198         self.fileobj.read(2)
 199
 200         if flag & FEXTRA:
 201             # Read & discard the extra field, if present
 202             xlen = ord(self.fileobj.read(1))
 203             xlen = xlen + 256*ord(self.fileobj.read(1))
 204             self.fileobj.read(xlen)
 205         if flag & FNAME:
 206             # Read and discard a null-terminated string containing the filename
 207             while True:
 208                 s = self.fileobj.read(1)
 209                 if not s or s=='\000':
 210                     break
 211         if flag & FCOMMENT:
 212             # Read and discard a null-terminated string containing a comment
 213             while True:
 214                 s = self.fileobj.read(1)
 215                 if not s or s=='\000':
 216                     break
 217         if flag & FHCRC:
 218             self.fileobj.read(2)     # Read & discard the 16-bit header CRC
 219
 220     def write(self,data):
 221         self._check_closed()
 222         if self.mode != WRITE:
 223             import errno
 224             raise IOError(errno.EBADF, "write() on read-only GzipFile object")
 225
 226         if self.fileobj is None:
 227             raise ValueError, "write() on closed GzipFile object"
 228
 229         # Convert data type if called by io.BufferedWriter.
 230         if isinstance(data, memoryview):
 231             data = data.tobytes()
 232
 233         if len(data) > 0:
 234             self.size = self.size + len(data)
 235             self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
 236             self.fileobj.write( self.compress.compress(data) )
 237             self.offset += len(data)
 238
 239         return len(data)
 240
 241     def read(self, size=-1):
 242         self._check_closed()
 243         if self.mode != READ:
 244             import errno
 245             raise IOError(errno.EBADF, "read() on write-only GzipFile object")
 246
 247         if self.extrasize <= 0 and self.fileobj is None:
 248             return ''
 249
 250         readsize = 1024
 251         if size < 0:        # get the whole thing
 252             try:
 253                 while True:
 254                     self._read(readsize)
 255                     readsize = min(self.max_read_chunk, readsize * 2)
 256             except EOFError:
 257                 size = self.extrasize
 258         else:               # just get some more of it
 259             try:
 260                 while size > self.extrasize:
 261                     self._read(readsize)
 262                     readsize = min(self.max_read_chunk, readsize * 2)
 263             except EOFError:
 264                 if size > self.extrasize:
 265                     size = self.extrasize
 266
 267         offset = self.offset - self.extrastart
 268         chunk = self.extrabuf[offset: offset + size]
 269         self.extrasize = self.extrasize - size
 270
 271         self.offset += size
 272         return chunk
 273
 274     def _unread(self, buf):
 275         self.extrasize = len(buf) + self.extrasize
 276         self.offset -= len(buf)
 277
 278     def _read(self, size=1024):
 279         if self.fileobj is None:
 280             raise EOFError, "Reached EOF"
 281
 282         if self._new_member:
 283             # If the _new_member flag is set, we have to
 284             # jump to the next member, if there is one.
 285             #
 286             # First, check if we're at the end of the file;
 287             # if so, it's time to stop; no more members to read.
 288             pos = self.fileobj.tell()   # Save current position
 289             self.fileobj.seek(0, 2)     # Seek to end of file
 290             if pos == self.fileobj.tell():
 291                 raise EOFError, "Reached EOF"
 292             else:
 293                 self.fileobj.seek( pos ) # Return to original position
 294
 295             self._init_read()
 296             self._read_gzip_header()
 297             self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
 298             self._new_member = False
 299
 300         # Read a chunk of data from the file
 301         buf = self.fileobj.read(size)
 302
 303         # If the EOF has been reached, flush the decompression object
 304         # and mark this object as finished.
 305
 306         if buf == "":
 307             uncompress = self.decompress.flush()
 308             self._read_eof()
 309             self._add_read_data( uncompress )
 310             raise EOFError, 'Reached EOF'
 311
 312         uncompress = self.decompress.decompress(buf)
 313         self._add_read_data( uncompress )
 314
 315         if self.decompress.unused_data != "":
 316             # Ending case: we've come to the end of a member in the file,
 317             # so seek back to the start of the unused data, finish up
 318             # this member, and read a new gzip header.
 319             # (The number of bytes to seek back is the length of the unused
 320             # data, minus 8 because _read_eof() will rewind a further 8 bytes)
 321             self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
 322
 323             # Check the CRC and file size, and set the flag so we read
 324             # a new member on the next call
 325             self._read_eof()
 326             self._new_member = True
 327
 328     def _add_read_data(self, data):
 329         self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
 330         offset = self.offset - self.extrastart
 331         self.extrabuf = self.extrabuf[offset:] + data
 332         self.extrasize = self.extrasize + len(data)
 333         self.extrastart = self.offset
 334         self.size = self.size + len(data)
 335
 336     def _read_eof(self):
 337         # We've read to the end of the file, so we have to rewind in order
 338         # to reread the 8 bytes containing the CRC and the file size.
 339         # We check the that the computed CRC and size of the
 340         # uncompressed data matches the stored values.  Note that the size
 341         # stored is the true file size mod 2**32.
 342         self.fileobj.seek(-8, 1)
 343         crc32 = read32(self.fileobj)
 344         isize = read32(self.fileobj)  # may exceed 2GB
 345         if crc32 != self.crc:
 346             raise IOError("CRC check failed %s != %s" % (hex(crc32),
 347                                                          hex(self.crc)))
 348         elif isize != (self.size & 0xffffffffL):
 349             raise IOError, "Incorrect length of data produced"
 350
 351         # Gzip files can be padded with zeroes and still have archives.
 352         # Consume all zero bytes and set the file position to the first
 353         # non-zero byte. See http://www.gzip.org/#faq8
 354         c = "\x00"
 355         while c == "\x00":
 356             c = self.fileobj.read(1)
 357         if c:
 358             self.fileobj.seek(-1, 1)
 359
 360     @property
 361     def closed(self):
 362         return self.fileobj is None
 363
 364     def close(self):
 365         if self.fileobj is None:
 366             return
 367         if self.mode == WRITE:
 368             self.fileobj.write(self.compress.flush())
 369             write32u(self.fileobj, self.crc)
 370             # self.size may exceed 2GB, or even 4GB
 371             write32u(self.fileobj, self.size & 0xffffffffL)
 372             self.fileobj = None
 373         elif self.mode == READ:
 374             self.fileobj = None
 375         if self.myfileobj:
 376             self.myfileobj.close()
 377             self.myfileobj = None
 378
 379     def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
 380         self._check_closed()
 381         if self.mode == WRITE:
 382             # Ensure the compressor's buffer is flushed
 383             self.fileobj.write(self.compress.flush(zlib_mode))
 384             self.fileobj.flush()
 385
 386     def fileno(self):
 387         """Invoke the underlying file object's fileno() method.
 388
 389         This will raise AttributeError if the underlying file object
 390         doesn't support fileno().
 391         """
 392         return self.fileobj.fileno()
 393
 394     def rewind(self):
 395         '''Return the uncompressed stream file position indicator to the
 396         beginning of the file'''
 397         if self.mode != READ:
 398             raise IOError("Can't rewind in write mode")
 399         self.fileobj.seek(0)
 400         self._new_member = True
 401         self.extrabuf = ""
 402         self.extrasize = 0
 403         self.extrastart = 0
 404         self.offset = 0
 405
 406     def readable(self):
 407         return self.mode == READ
 408
 409     def writable(self):
 410         return self.mode == WRITE
 411
 412     def seekable(self):
 413         return True
 414
 415     def seek(self, offset, whence=0):
 416         if whence:
 417             if whence == 1:
 418                 offset = self.offset + offset
 419             else:
 420                 raise ValueError('Seek from end not supported')
 421         if self.mode == WRITE:
 422             if offset < self.offset:
 423                 raise IOError('Negative seek in write mode')
 424             count = offset - self.offset
 425             for i in xrange(count // 1024):
 426                 self.write(1024 * '\0')
 427             self.write((count % 1024) * '\0')
 428         elif self.mode == READ:
 429             if offset < self.offset:
 430                 # for negative seek, rewind and do positive seek
 431                 self.rewind()
 432             count = offset - self.offset
 433             for i in xrange(count // 1024):
 434                 self.read(1024)
 435             self.read(count % 1024)
 436
 437         return self.offset
 438
 439     def readline(self, size=-1):
 440         if size < 0:
 441             # Shortcut common case - newline found in buffer.
 442             offset = self.offset - self.extrastart
 443             i = self.extrabuf.find('\n', offset) + 1
 444             if i > 0:
 445                 self.extrasize -= i - offset
 446                 self.offset += i - offset
 447                 return self.extrabuf[offset: i]
 448
 449             size = sys.maxint
 450             readsize = self.min_readsize
 451         else:
 452             readsize = size
 453         bufs = []
 454         while size != 0:
 455             c = self.read(readsize)
 456             i = c.find('\n')
 457
 458             # We set i=size to break out of the loop under two
 459             # conditions: 1) there's no newline, and the chunk is
 460             # larger than size, or 2) there is a newline, but the
 461             # resulting line would be longer than 'size'.
 462             if (size <= i) or (i == -1 and len(c) > size):
 463                 i = size - 1
 464
 465             if i >= 0 or c == '':
 466                 bufs.append(c[:i + 1])    # Add portion of last chunk
 467                 self._unread(c[i + 1:])   # Push back rest of chunk
 468                 break
 469
 470             # Append chunk to list, decrease 'size',
 471             bufs.append(c)
 472             size = size - len(c)
 473             readsize = min(size, readsize * 2)
 474         if readsize > self.min_readsize:
 475             self.min_readsize = min(readsize, self.min_readsize * 2, 512)
 476         return ''.join(bufs) # Return resulting line
 477
 478
 479 def _test():
 480     # Act like gzip; with -d, act like gunzip.
 481     # The input file is not deleted, however, nor are any other gzip
 482     # options or features supported.
 483     args = sys.argv[1:]
 484     decompress = args and args[0] == "-d"
 485     if decompress:
 486         args = args[1:]
 487     if not args:
 488         args = ["-"]
 489     for arg in args:
 490         if decompress:
 491             if arg == "-":
 492                 f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
 493                 g = sys.stdout
 494             else:
 495                 if arg[-3:] != ".gz":
 496                     print "filename doesn't end in .gz:", repr(arg)
 497                     continue
 498                 f = open(arg, "rb")
 499                 g = __builtin__.open(arg[:-3], "wb")
 500         else:
 501             if arg == "-":
 502                 f = sys.stdin
 503                 g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
 504             else:
 505                 f = __builtin__.open(arg, "rb")
 506                 g = open(arg + ".gz", "wb")
 507         while True:
 508             chunk = f.read(1024)
 509             if not chunk:
 510                 break
 511             g.write(chunk)
 512         if g is not sys.stdout:
 513             g.close()
 514         if f is not sys.stdin:
 515             f.close()
 516
 517 if __name__ == '__main__':
 518     _test()