Lib/gzip.py

   1 """Functions that read and write gzipped files.
   2
   3 The user of the file doesn't have to worry about the compression,
   4 but random access is not allowed."""
   5
   6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
   7
   8 import struct, sys, time
   9 import zlib
  10 import __builtin__
  11
  12 __all__ = ["GzipFile","open"]
  13
  14 FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
  15
  16 READ, WRITE = 1, 2
  17
  18 def U32(i):
  19     """Return i as an unsigned integer, assuming it fits in 32 bits.
  20
  21     If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
  22     """
  23     if i < 0:
  24         i += 1L << 32
  25     return i
  26
  27 def LOWU32(i):
  28     """Return the low-order 32 bits of an int, as a non-negative int."""
  29     return i & 0xFFFFFFFFL
  30
  31 def write32(output, value):
  32     output.write(struct.pack("<l", value))
  33
  34 def write32u(output, value):
  35     # The L format writes the bit pattern correctly whether signed
  36     # or unsigned.
  37     output.write(struct.pack("<L", value))
  38
  39 def read32(input):
  40     return struct.unpack("<l", input.read(4))[0]
  41
  42 def open(filename, mode="rb", compresslevel=9):
  43     """Shorthand for GzipFile(filename, mode, compresslevel).
  44
  45     The filename argument is required; mode defaults to 'rb'
  46     and compresslevel defaults to 9.
  47
  48     """
  49     return GzipFile(filename, mode, compresslevel)
  50
  51 class GzipFile:
  52     """The GzipFile class simulates most of the methods of a file object with
  53     the exception of the readinto() and truncate() methods.
  54
  55     """
  56
  57     myfileobj = None
  58     max_read_chunk = 10 * 1024 * 1024   # 10Mb
  59
  60     def __init__(self, filename=None, mode=None,
  61                  compresslevel=9, fileobj=None):
  62         """Constructor for the GzipFile class.
  63
  64         At least one of fileobj and filename must be given a
  65         non-trivial value.
  66
  67         The new class instance is based on fileobj, which can be a regular
  68         file, a StringIO object, or any other object which simulates a file.
  69         It defaults to None, in which case filename is opened to provide
  70         a file object.
  71
  72         When fileobj is not None, the filename argument is only used to be
  73         included in the gzip file header, which may includes the original
  74         filename of the uncompressed file.  It defaults to the filename of
  75         fileobj, if discernible; otherwise, it defaults to the empty string,
  76         and in this case the original filename is not included in the header.
  77
  78         The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
  79         depending on whether the file will be read or written.  The default
  80         is the mode of fileobj if discernible; otherwise, the default is 'rb'.
  81         Be aware that only the 'rb', 'ab', and 'wb' values should be used
  82         for cross-platform portability.
  83
  84         The compresslevel argument is an integer from 1 to 9 controlling the
  85         level of compression; 1 is fastest and produces the least compression,
  86         and 9 is slowest and produces the most compression.  The default is 9.
  87
  88         """
  89
  90         # guarantee the file is opened in binary mode on platforms
  91         # that care about that sort of thing
  92         if mode and 'b' not in mode:
  93             mode += 'b'
  94         if fileobj is None:
  95             fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
  96         if filename is None:
  97             if hasattr(fileobj, 'name'): filename = fileobj.name
  98             else: filename = ''
  99         if mode is None:
 100             if hasattr(fileobj, 'mode'): mode = fileobj.mode
 101             else: mode = 'rb'
 102
 103         if mode[0:1] == 'r':
 104             self.mode = READ
 105             # Set flag indicating start of a new member
 106             self._new_member = True
 107             self.extrabuf = ""
 108             self.extrasize = 0
 109             self.filename = filename
 110             # Starts small, scales exponentially
 111             self.min_readsize = 100
 112
 113         elif mode[0:1] == 'w' or mode[0:1] == 'a':
 114             self.mode = WRITE
 115             self._init_write(filename)
 116             self.compress = zlib.compressobj(compresslevel,
 117                                              zlib.DEFLATED,
 118                                              -zlib.MAX_WBITS,
 119                                              zlib.DEF_MEM_LEVEL,
 120                                              0)
 121         else:
 122             raise IOError, "Mode " + mode + " not supported"
 123
 124         self.fileobj = fileobj
 125         self.offset = 0
 126
 127         if self.mode == WRITE:
 128             self._write_gzip_header()
 129
 130     def __repr__(self):
 131         s = repr(self.fileobj)
 132         return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
 133
 134     def _init_write(self, filename):
 135         if filename[-3:] != '.gz':
 136             filename = filename + '.gz'
 137         self.filename = filename
 138         self.crc = zlib.crc32("")
 139         self.size = 0
 140         self.writebuf = []
 141         self.bufsize = 0
 142
 143     def _write_gzip_header(self):
 144         self.fileobj.write('\037\213')             # magic header
 145         self.fileobj.write('\010')                 # compression method
 146         fname = self.filename[:-3]
 147         flags = 0
 148         if fname:
 149             flags = FNAME
 150         self.fileobj.write(chr(flags))
 151         write32u(self.fileobj, long(time.time()))
 152         self.fileobj.write('\002')
 153         self.fileobj.write('\377')
 154         if fname:
 155             self.fileobj.write(fname + '\000')
 156
 157     def _init_read(self):
 158         self.crc = zlib.crc32("")
 159         self.size = 0
 160
 161     def _read_gzip_header(self):
 162         magic = self.fileobj.read(2)
 163         if magic != '\037\213':
 164             raise IOError, 'Not a gzipped file'
 165         method = ord( self.fileobj.read(1) )
 166         if method != 8:
 167             raise IOError, 'Unknown compression method'
 168         flag = ord( self.fileobj.read(1) )
 169         # modtime = self.fileobj.read(4)
 170         # extraflag = self.fileobj.read(1)
 171         # os = self.fileobj.read(1)
 172         self.fileobj.read(6)
 173
 174         if flag & FEXTRA:
 175             # Read & discard the extra field, if present
 176             xlen = ord(self.fileobj.read(1))
 177             xlen = xlen + 256*ord(self.fileobj.read(1))
 178             self.fileobj.read(xlen)
 179         if flag & FNAME:
 180             # Read and discard a null-terminated string containing the filename
 181             while True:
 182                 s = self.fileobj.read(1)
 183                 if not s or s=='\000':
 184                     break
 185         if flag & FCOMMENT:
 186             # Read and discard a null-terminated string containing a comment
 187             while True:
 188                 s = self.fileobj.read(1)
 189                 if not s or s=='\000':
 190                     break
 191         if flag & FHCRC:
 192             self.fileobj.read(2)     # Read & discard the 16-bit header CRC
 193
 194
 195     def write(self,data):
 196         if self.mode != WRITE:
 197             import errno
 198             raise IOError(errno.EBADF, "write() on read-only GzipFile object")
 199
 200         if self.fileobj is None:
 201             raise ValueError, "write() on closed GzipFile object"
 202         if len(data) > 0:
 203             self.size = self.size + len(data)
 204             self.crc = zlib.crc32(data, self.crc)
 205             self.fileobj.write( self.compress.compress(data) )
 206             self.offset += len(data)
 207
 208     def read(self, size=-1):
 209         if self.mode != READ:
 210             import errno
 211             raise IOError(errno.EBADF, "read() on write-only GzipFile object")
 212
 213         if self.extrasize <= 0 and self.fileobj is None:
 214             return ''
 215
 216         readsize = 1024
 217         if size < 0:        # get the whole thing
 218             try:
 219                 while True:
 220                     self._read(readsize)
 221                     readsize = min(self.max_read_chunk, readsize * 2)
 222             except EOFError:
 223                 size = self.extrasize
 224         else:               # just get some more of it
 225             try:
 226                 while size > self.extrasize:
 227                     self._read(readsize)
 228                     readsize = min(self.max_read_chunk, readsize * 2)
 229             except EOFError:
 230                 if size > self.extrasize:
 231                     size = self.extrasize
 232
 233         chunk = self.extrabuf[:size]
 234         self.extrabuf = self.extrabuf[size:]
 235         self.extrasize = self.extrasize - size
 236
 237         self.offset += size
 238         return chunk
 239
 240     def _unread(self, buf):
 241         self.extrabuf = buf + self.extrabuf
 242         self.extrasize = len(buf) + self.extrasize
 243         self.offset -= len(buf)
 244
 245     def _read(self, size=1024):
 246         if self.fileobj is None:
 247             raise EOFError, "Reached EOF"
 248
 249         if self._new_member:
 250             # If the _new_member flag is set, we have to
 251             # jump to the next member, if there is one.
 252             #
 253             # First, check if we're at the end of the file;
 254             # if so, it's time to stop; no more members to read.
 255             pos = self.fileobj.tell()   # Save current position
 256             self.fileobj.seek(0, 2)     # Seek to end of file
 257             if pos == self.fileobj.tell():
 258                 raise EOFError, "Reached EOF"
 259             else:
 260                 self.fileobj.seek( pos ) # Return to original position
 261
 262             self._init_read()
 263             self._read_gzip_header()
 264             self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
 265             self._new_member = False
 266
 267         # Read a chunk of data from the file
 268         buf = self.fileobj.read(size)
 269
 270         # If the EOF has been reached, flush the decompression object
 271         # and mark this object as finished.
 272
 273         if buf == "":
 274             uncompress = self.decompress.flush()
 275             self._read_eof()
 276             self._add_read_data( uncompress )
 277             raise EOFError, 'Reached EOF'
 278
 279         uncompress = self.decompress.decompress(buf)
 280         self._add_read_data( uncompress )
 281
 282         if self.decompress.unused_data != "":
 283             # Ending case: we've come to the end of a member in the file,
 284             # so seek back to the start of the unused data, finish up
 285             # this member, and read a new gzip header.
 286             # (The number of bytes to seek back is the length of the unused
 287             # data, minus 8 because _read_eof() will rewind a further 8 bytes)
 288             self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
 289
 290             # Check the CRC and file size, and set the flag so we read
 291             # a new member on the next call
 292             self._read_eof()
 293             self._new_member = True
 294
 295     def _add_read_data(self, data):
 296         self.crc = zlib.crc32(data, self.crc)
 297         self.extrabuf = self.extrabuf + data
 298         self.extrasize = self.extrasize + len(data)
 299         self.size = self.size + len(data)
 300
 301     def _read_eof(self):
 302         # We've read to the end of the file, so we have to rewind in order
 303         # to reread the 8 bytes containing the CRC and the file size.
 304         # We check the that the computed CRC and size of the
 305         # uncompressed data matches the stored values.  Note that the size
 306         # stored is the true file size mod 2**32.
 307         self.fileobj.seek(-8, 1)
 308         crc32 = read32(self.fileobj)
 309         isize = U32(read32(self.fileobj))   # may exceed 2GB
 310         if U32(crc32) != U32(self.crc):
 311             raise IOError, "CRC check failed"
 312         elif isize != LOWU32(self.size):
 313             raise IOError, "Incorrect length of data produced"
 314
 315     def close(self):
 316         if self.mode == WRITE:
 317             self.fileobj.write(self.compress.flush())
 318             # The native zlib crc is an unsigned 32-bit integer, but
 319             # the Python wrapper implicitly casts that to a signed C
 320             # long.  So, on a 32-bit box self.crc may "look negative",
 321             # while the same crc on a 64-bit box may "look positive".
 322             # To avoid irksome warnings from the `struct` module, force
 323             # it to look positive on all boxes.
 324             write32u(self.fileobj, LOWU32(self.crc))
 325             # self.size may exceed 2GB, or even 4GB
 326             write32u(self.fileobj, LOWU32(self.size))
 327             self.fileobj = None
 328         elif self.mode == READ:
 329             self.fileobj = None
 330         if self.myfileobj:
 331             self.myfileobj.close()
 332             self.myfileobj = None
 333
 334     def __del__(self):
 335         try:
 336             if (self.myfileobj is None and
 337                 self.fileobj is None):
 338                 return
 339         except AttributeError:
 340             return
 341         self.close()
 342
 343     def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
 344         if self.mode == WRITE:
 345             # Ensure the compressor's buffer is flushed
 346             self.fileobj.write(self.compress.flush(zlib_mode))
 347         self.fileobj.flush()
 348
 349     def fileno(self):
 350         """Invoke the underlying file object's fileno() method.
 351
 352         This will raise AttributeError if the underlying file object
 353         doesn't support fileno().
 354         """
 355         return self.fileobj.fileno()
 356
 357     def isatty(self):
 358         return False
 359
 360     def tell(self):
 361         return self.offset
 362
 363     def rewind(self):
 364         '''Return the uncompressed stream file position indicator to the
 365         beginning of the file'''
 366         if self.mode != READ:
 367             raise IOError("Can't rewind in write mode")
 368         self.fileobj.seek(0)
 369         self._new_member = True
 370         self.extrabuf = ""
 371         self.extrasize = 0
 372         self.offset = 0
 373
 374     def seek(self, offset):
 375         if self.mode == WRITE:
 376             if offset < self.offset:
 377                 raise IOError('Negative seek in write mode')
 378             count = offset - self.offset
 379             for i in range(count // 1024):
 380                 self.write(1024 * '\0')
 381             self.write((count % 1024) * '\0')
 382         elif self.mode == READ:
 383             if offset < self.offset:
 384                 # for negative seek, rewind and do positive seek
 385                 self.rewind()
 386             count = offset - self.offset
 387             for i in range(count // 1024):
 388                 self.read(1024)
 389             self.read(count % 1024)
 390
 391     def readline(self, size=-1):
 392         if size < 0:
 393             size = sys.maxint
 394             readsize = self.min_readsize
 395         else:
 396             readsize = size
 397         bufs = []
 398         while size != 0:
 399             c = self.read(readsize)
 400             i = c.find('\n')
 401
 402             # We set i=size to break out of the loop under two
 403             # conditions: 1) there's no newline, and the chunk is
 404             # larger than size, or 2) there is a newline, but the
 405             # resulting line would be longer than 'size'.
 406             if (size <= i) or (i == -1 and len(c) > size):
 407                 i = size - 1
 408
 409             if i >= 0 or c == '':
 410                 bufs.append(c[:i + 1])    # Add portion of last chunk
 411                 self._unread(c[i + 1:])   # Push back rest of chunk
 412                 break
 413
 414             # Append chunk to list, decrease 'size',
 415             bufs.append(c)
 416             size = size - len(c)
 417             readsize = min(size, readsize * 2)
 418         if readsize > self.min_readsize:
 419             self.min_readsize = min(readsize, self.min_readsize * 2, 512)
 420         return ''.join(bufs) # Return resulting line
 421
 422     def readlines(self, sizehint=0):
 423         # Negative numbers result in reading all the lines
 424         if sizehint <= 0:
 425             sizehint = sys.maxint
 426         L = []
 427         while sizehint > 0:
 428             line = self.readline()
 429             if line == "":
 430                 break
 431             L.append(line)
 432             sizehint = sizehint - len(line)
 433
 434         return L
 435
 436     def writelines(self, L):
 437         for line in L:
 438             self.write(line)
 439
 440     def __iter__(self):
 441         return self
 442
 443     def next(self):
 444         line = self.readline()
 445         if line:
 446             return line
 447         else:
 448             raise StopIteration
 449
 450
 451 def _test():
 452     # Act like gzip; with -d, act like gunzip.
 453     # The input file is not deleted, however, nor are any other gzip
 454     # options or features supported.
 455     args = sys.argv[1:]
 456     decompress = args and args[0] == "-d"
 457     if decompress:
 458         args = args[1:]
 459     if not args:
 460         args = ["-"]
 461     for arg in args:
 462         if decompress:
 463             if arg == "-":
 464                 f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
 465                 g = sys.stdout
 466             else:
 467                 if arg[-3:] != ".gz":
 468                     print "filename doesn't end in .gz:", repr(arg)
 469                     continue
 470                 f = open(arg, "rb")
 471                 g = __builtin__.open(arg[:-3], "wb")
 472         else:
 473             if arg == "-":
 474                 f = sys.stdin
 475                 g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
 476             else:
 477                 f = __builtin__.open(arg, "rb")
 478                 g = open(arg + ".gz", "wb")
 479         while True:
 480             chunk = f.read(1024)
 481             if not chunk:
 482                 break
 483             g.write(chunk)
 484         if g is not sys.stdout:
 485             g.close()
 486         if f is not sys.stdin:
 487             f.close()
 488
 489 if __name__ == '__main__':
 490     _test()