Lib/gzip.py

   1 """Functions that read and write gzipped files.
   2
   3 The user of the file doesn't have to worry about the compression,
   4 but random access is not allowed."""
   5
   6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
   7
   8 import struct, sys, time
   9 import zlib
  10 import __builtin__
  11
__all__ = ["GzipFile","open"]

# Bit masks tested against the flag byte of a gzip member header
# (see GzipFile._read_gzip_header / _write_gzip_header).
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode to record whether the object was
# opened for reading or for writing.
READ, WRITE = 1, 2
  17
  18 def write32u(output, value):
  19     # The L format writes the bit pattern correctly whether signed
  20     # or unsigned.
  21     output.write(struct.pack("<L", value))
  22
  23 def read32(input):
  24     return struct.unpack("<I", input.read(4))[0]
  25
  26 def open(filename, mode="rb", compresslevel=9):
  27     """Shorthand for GzipFile(filename, mode, compresslevel).
  28
  29     The filename argument is required; mode defaults to 'rb'
  30     and compresslevel defaults to 9.
  31
  32     """
  33     return GzipFile(filename, mode, compresslevel)
  34
  35 class GzipFile:
  36     """The GzipFile class simulates most of the methods of a file object with
  37     the exception of the readinto() and truncate() methods.
  38
  39     """
  40
  41     myfileobj = None
  42     max_read_chunk = 10 * 1024 * 1024   # 10Mb
  43
  44     def __init__(self, filename=None, mode=None,
  45                  compresslevel=9, fileobj=None):
  46         """Constructor for the GzipFile class.
  47
  48         At least one of fileobj and filename must be given a
  49         non-trivial value.
  50
  51         The new class instance is based on fileobj, which can be a regular
  52         file, a StringIO object, or any other object which simulates a file.
  53         It defaults to None, in which case filename is opened to provide
  54         a file object.
  55
  56         When fileobj is not None, the filename argument is only used to be
  57         included in the gzip file header, which may includes the original
  58         filename of the uncompressed file.  It defaults to the filename of
  59         fileobj, if discernible; otherwise, it defaults to the empty string,
  60         and in this case the original filename is not included in the header.
  61
  62         The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
  63         depending on whether the file will be read or written.  The default
  64         is the mode of fileobj if discernible; otherwise, the default is 'rb'.
  65         Be aware that only the 'rb', 'ab', and 'wb' values should be used
  66         for cross-platform portability.
  67
  68         The compresslevel argument is an integer from 1 to 9 controlling the
  69         level of compression; 1 is fastest and produces the least compression,
  70         and 9 is slowest and produces the most compression.  The default is 9.
  71
  72         """
  73
  74         # guarantee the file is opened in binary mode on platforms
  75         # that care about that sort of thing
  76         if mode and 'b' not in mode:
  77             mode += 'b'
  78         if fileobj is None:
  79             fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
  80         if filename is None:
  81             if hasattr(fileobj, 'name'): filename = fileobj.name
  82             else: filename = ''
  83         if mode is None:
  84             if hasattr(fileobj, 'mode'): mode = fileobj.mode
  85             else: mode = 'rb'
  86
  87         if mode[0:1] == 'r':
  88             self.mode = READ
  89             # Set flag indicating start of a new member
  90             self._new_member = True
  91             self.extrabuf = ""
  92             self.extrasize = 0
  93             self.name = filename
  94             # Starts small, scales exponentially
  95             self.min_readsize = 100
  96
  97         elif mode[0:1] == 'w' or mode[0:1] == 'a':
  98             self.mode = WRITE
  99             self._init_write(filename)
 100             self.compress = zlib.compressobj(compresslevel,
 101                                              zlib.DEFLATED,
 102                                              -zlib.MAX_WBITS,
 103                                              zlib.DEF_MEM_LEVEL,
 104                                              0)
 105         else:
 106             raise IOError, "Mode " + mode + " not supported"
 107
 108         self.fileobj = fileobj
 109         self.offset = 0
 110
 111         if self.mode == WRITE:
 112             self._write_gzip_header()
 113
 114     @property
 115     def filename(self):
 116         import warnings
 117         warnings.warn("use the name attribute", DeprecationWarning)
 118         if self.mode == WRITE and self.name[-3:] != ".gz":
 119             return self.name + ".gz"
 120         return self.name
 121
 122     def __repr__(self):
 123         s = repr(self.fileobj)
 124         return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
 125
 126     def _init_write(self, filename):
 127         self.name = filename
 128         self.crc = zlib.crc32("") & 0xffffffffL
 129         self.size = 0
 130         self.writebuf = []
 131         self.bufsize = 0
 132
 133     def _write_gzip_header(self):
 134         self.fileobj.write('\037\213')             # magic header
 135         self.fileobj.write('\010')                 # compression method
 136         fname = self.name
 137         if fname.endswith(".gz"):
 138             fname = fname[:-3]
 139         flags = 0
 140         if fname:
 141             flags = FNAME
 142         self.fileobj.write(chr(flags))
 143         write32u(self.fileobj, long(time.time()))
 144         self.fileobj.write('\002')
 145         self.fileobj.write('\377')
 146         if fname:
 147             self.fileobj.write(fname + '\000')
 148
 149     def _init_read(self):
 150         self.crc = zlib.crc32("") & 0xffffffffL
 151         self.size = 0
 152
 153     def _read_gzip_header(self):
 154         magic = self.fileobj.read(2)
 155         if magic != '\037\213':
 156             raise IOError, 'Not a gzipped file'
 157         method = ord( self.fileobj.read(1) )
 158         if method != 8:
 159             raise IOError, 'Unknown compression method'
 160         flag = ord( self.fileobj.read(1) )
 161         # modtime = self.fileobj.read(4)
 162         # extraflag = self.fileobj.read(1)
 163         # os = self.fileobj.read(1)
 164         self.fileobj.read(6)
 165
 166         if flag & FEXTRA:
 167             # Read & discard the extra field, if present
 168             xlen = ord(self.fileobj.read(1))
 169             xlen = xlen + 256*ord(self.fileobj.read(1))
 170             self.fileobj.read(xlen)
 171         if flag & FNAME:
 172             # Read and discard a null-terminated string containing the filename
 173             while True:
 174                 s = self.fileobj.read(1)
 175                 if not s or s=='\000':
 176                     break
 177         if flag & FCOMMENT:
 178             # Read and discard a null-terminated string containing a comment
 179             while True:
 180                 s = self.fileobj.read(1)
 181                 if not s or s=='\000':
 182                     break
 183         if flag & FHCRC:
 184             self.fileobj.read(2)     # Read & discard the 16-bit header CRC
 185
 186
 187     def write(self,data):
 188         if self.mode != WRITE:
 189             import errno
 190             raise IOError(errno.EBADF, "write() on read-only GzipFile object")
 191
 192         if self.fileobj is None:
 193             raise ValueError, "write() on closed GzipFile object"
 194         if len(data) > 0:
 195             self.size = self.size + len(data)
 196             self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
 197             self.fileobj.write( self.compress.compress(data) )
 198             self.offset += len(data)
 199
 200     def read(self, size=-1):
 201         if self.mode != READ:
 202             import errno
 203             raise IOError(errno.EBADF, "read() on write-only GzipFile object")
 204
 205         if self.extrasize <= 0 and self.fileobj is None:
 206             return ''
 207
 208         readsize = 1024
 209         if size < 0:        # get the whole thing
 210             try:
 211                 while True:
 212                     self._read(readsize)
 213                     readsize = min(self.max_read_chunk, readsize * 2)
 214             except EOFError:
 215                 size = self.extrasize
 216         else:               # just get some more of it
 217             try:
 218                 while size > self.extrasize:
 219                     self._read(readsize)
 220                     readsize = min(self.max_read_chunk, readsize * 2)
 221             except EOFError:
 222                 if size > self.extrasize:
 223                     size = self.extrasize
 224
 225         chunk = self.extrabuf[:size]
 226         self.extrabuf = self.extrabuf[size:]
 227         self.extrasize = self.extrasize - size
 228
 229         self.offset += size
 230         return chunk
 231
 232     def _unread(self, buf):
 233         self.extrabuf = buf + self.extrabuf
 234         self.extrasize = len(buf) + self.extrasize
 235         self.offset -= len(buf)
 236
 237     def _read(self, size=1024):
 238         if self.fileobj is None:
 239             raise EOFError, "Reached EOF"
 240
 241         if self._new_member:
 242             # If the _new_member flag is set, we have to
 243             # jump to the next member, if there is one.
 244             #
 245             # First, check if we're at the end of the file;
 246             # if so, it's time to stop; no more members to read.
 247             pos = self.fileobj.tell()   # Save current position
 248             self.fileobj.seek(0, 2)     # Seek to end of file
 249             if pos == self.fileobj.tell():
 250                 raise EOFError, "Reached EOF"
 251             else:
 252                 self.fileobj.seek( pos ) # Return to original position
 253
 254             self._init_read()
 255             self._read_gzip_header()
 256             self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
 257             self._new_member = False
 258
 259         # Read a chunk of data from the file
 260         buf = self.fileobj.read(size)
 261
 262         # If the EOF has been reached, flush the decompression object
 263         # and mark this object as finished.
 264
 265         if buf == "":
 266             uncompress = self.decompress.flush()
 267             self._read_eof()
 268             self._add_read_data( uncompress )
 269             raise EOFError, 'Reached EOF'
 270
 271         uncompress = self.decompress.decompress(buf)
 272         self._add_read_data( uncompress )
 273
 274         if self.decompress.unused_data != "":
 275             # Ending case: we've come to the end of a member in the file,
 276             # so seek back to the start of the unused data, finish up
 277             # this member, and read a new gzip header.
 278             # (The number of bytes to seek back is the length of the unused
 279             # data, minus 8 because _read_eof() will rewind a further 8 bytes)
 280             self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
 281
 282             # Check the CRC and file size, and set the flag so we read
 283             # a new member on the next call
 284             self._read_eof()
 285             self._new_member = True
 286
 287     def _add_read_data(self, data):
 288         self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
 289         self.extrabuf = self.extrabuf + data
 290         self.extrasize = self.extrasize + len(data)
 291         self.size = self.size + len(data)
 292
 293     def _read_eof(self):
 294         # We've read to the end of the file, so we have to rewind in order
 295         # to reread the 8 bytes containing the CRC and the file size.
 296         # We check the that the computed CRC and size of the
 297         # uncompressed data matches the stored values.  Note that the size
 298         # stored is the true file size mod 2**32.
 299         self.fileobj.seek(-8, 1)
 300         crc32 = read32(self.fileobj)
 301         isize = read32(self.fileobj)  # may exceed 2GB
 302         if crc32 != self.crc:
 303             raise IOError("CRC check failed %s != %s" % (hex(crc32),
 304                                                          hex(self.crc)))
 305         elif isize != (self.size & 0xffffffffL):
 306             raise IOError, "Incorrect length of data produced"
 307
 308     def close(self):
 309         if self.fileobj is None:
 310             return
 311         if self.mode == WRITE:
 312             self.fileobj.write(self.compress.flush())
 313             write32u(self.fileobj, self.crc)
 314             # self.size may exceed 2GB, or even 4GB
 315             write32u(self.fileobj, self.size & 0xffffffffL)
 316             self.fileobj = None
 317         elif self.mode == READ:
 318             self.fileobj = None
 319         if self.myfileobj:
 320             self.myfileobj.close()
 321             self.myfileobj = None
 322
 323     def __del__(self):
 324         try:
 325             if (self.myfileobj is None and
 326                 self.fileobj is None):
 327                 return
 328         except AttributeError:
 329             return
 330         self.close()
 331
 332     def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
 333         if self.mode == WRITE:
 334             # Ensure the compressor's buffer is flushed
 335             self.fileobj.write(self.compress.flush(zlib_mode))
 336         self.fileobj.flush()
 337
 338     def fileno(self):
 339         """Invoke the underlying file object's fileno() method.
 340
 341         This will raise AttributeError if the underlying file object
 342         doesn't support fileno().
 343         """
 344         return self.fileobj.fileno()
 345
 346     def isatty(self):
 347         return False
 348
 349     def tell(self):
 350         return self.offset
 351
 352     def rewind(self):
 353         '''Return the uncompressed stream file position indicator to the
 354         beginning of the file'''
 355         if self.mode != READ:
 356             raise IOError("Can't rewind in write mode")
 357         self.fileobj.seek(0)
 358         self._new_member = True
 359         self.extrabuf = ""
 360         self.extrasize = 0
 361         self.offset = 0
 362
 363     def seek(self, offset, whence=0):
 364         if whence:
 365             if whence == 1:
 366                 offset = self.offset + offset
 367             else:
 368                 raise ValueError('Seek from end not supported')
 369         if self.mode == WRITE:
 370             if offset < self.offset:
 371                 raise IOError('Negative seek in write mode')
 372             count = offset - self.offset
 373             for i in range(count // 1024):
 374                 self.write(1024 * '\0')
 375             self.write((count % 1024) * '\0')
 376         elif self.mode == READ:
 377             if offset < self.offset:
 378                 # for negative seek, rewind and do positive seek
 379                 self.rewind()
 380             count = offset - self.offset
 381             for i in range(count // 1024):
 382                 self.read(1024)
 383             self.read(count % 1024)
 384
 385     def readline(self, size=-1):
 386         if size < 0:
 387             size = sys.maxint
 388             readsize = self.min_readsize
 389         else:
 390             readsize = size
 391         bufs = []
 392         while size != 0:
 393             c = self.read(readsize)
 394             i = c.find('\n')
 395
 396             # We set i=size to break out of the loop under two
 397             # conditions: 1) there's no newline, and the chunk is
 398             # larger than size, or 2) there is a newline, but the
 399             # resulting line would be longer than 'size'.
 400             if (size <= i) or (i == -1 and len(c) > size):
 401                 i = size - 1
 402
 403             if i >= 0 or c == '':
 404                 bufs.append(c[:i + 1])    # Add portion of last chunk
 405                 self._unread(c[i + 1:])   # Push back rest of chunk
 406                 break
 407
 408             # Append chunk to list, decrease 'size',
 409             bufs.append(c)
 410             size = size - len(c)
 411             readsize = min(size, readsize * 2)
 412         if readsize > self.min_readsize:
 413             self.min_readsize = min(readsize, self.min_readsize * 2, 512)
 414         return ''.join(bufs) # Return resulting line
 415
 416     def readlines(self, sizehint=0):
 417         # Negative numbers result in reading all the lines
 418         if sizehint <= 0:
 419             sizehint = sys.maxint
 420         L = []
 421         while sizehint > 0:
 422             line = self.readline()
 423             if line == "":
 424                 break
 425             L.append(line)
 426             sizehint = sizehint - len(line)
 427
 428         return L
 429
 430     def writelines(self, L):
 431         for line in L:
 432             self.write(line)
 433
 434     def __iter__(self):
 435         return self
 436
 437     def next(self):
 438         line = self.readline()
 439         if line:
 440             return line
 441         else:
 442             raise StopIteration
 443
 444
 445 def _test():
 446     # Act like gzip; with -d, act like gunzip.
 447     # The input file is not deleted, however, nor are any other gzip
 448     # options or features supported.
 449     args = sys.argv[1:]
 450     decompress = args and args[0] == "-d"
 451     if decompress:
 452         args = args[1:]
 453     if not args:
 454         args = ["-"]
 455     for arg in args:
 456         if decompress:
 457             if arg == "-":
 458                 f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
 459                 g = sys.stdout
 460             else:
 461                 if arg[-3:] != ".gz":
 462                     print "filename doesn't end in .gz:", repr(arg)
 463                     continue
 464                 f = open(arg, "rb")
 465                 g = __builtin__.open(arg[:-3], "wb")
 466         else:
 467             if arg == "-":
 468                 f = sys.stdin
 469                 g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
 470             else:
 471                 f = __builtin__.open(arg, "rb")
 472                 g = open(arg + ".gz", "wb")
 473         while True:
 474             chunk = f.read(1024)
 475             if not chunk:
 476                 break
 477             g.write(chunk)
 478         if g is not sys.stdout:
 479             g.close()
 480         if f is not sys.stdin:
 481             f.close()
 482
# When executed as a script, run the minimal gzip/gunzip command-line driver.
if __name__ == '__main__':
    _test()