Lib/gzip.py

   1 """Functions that read and write gzipped files.
   2
   3 The user of the file doesn't have to worry about the compression,
   4 but random access is not allowed."""
   5
   6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
   7
   8 import struct, sys, time, os
   9 import zlib
  10 import io
  11 import __builtin__
  12
# Names exported by "from gzip import *".
__all__ = ["GzipFile","open"]

# Bit masks tested against the flag byte of a gzip member header
# (see GzipFile._read_gzip_header).
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ, WRITE = 1, 2
  18
  19 def write32u(output, value):
  20     # The L format writes the bit pattern correctly whether signed
  21     # or unsigned.
  22     output.write(struct.pack("<L", value))
  23
  24 def read32(input):
  25     return struct.unpack("<I", input.read(4))[0]
  26
  27 def open(filename, mode="rb", compresslevel=9):
  28     """Shorthand for GzipFile(filename, mode, compresslevel).
  29
  30     The filename argument is required; mode defaults to 'rb'
  31     and compresslevel defaults to 9.
  32
  33     """
  34     return GzipFile(filename, mode, compresslevel)
  35
  36 class GzipFile(io.BufferedIOBase):
  37     """The GzipFile class simulates most of the methods of a file object with
  38     the exception of the readinto() and truncate() methods.
  39
  40     """
  41
  42     myfileobj = None
  43     max_read_chunk = 10 * 1024 * 1024   # 10Mb
  44
  45     def __init__(self, filename=None, mode=None,
  46                  compresslevel=9, fileobj=None, mtime=None):
  47         """Constructor for the GzipFile class.
  48
  49         At least one of fileobj and filename must be given a
  50         non-trivial value.
  51
  52         The new class instance is based on fileobj, which can be a regular
  53         file, a StringIO object, or any other object which simulates a file.
  54         It defaults to None, in which case filename is opened to provide
  55         a file object.
  56
  57         When fileobj is not None, the filename argument is only used to be
  58         included in the gzip file header, which may includes the original
  59         filename of the uncompressed file.  It defaults to the filename of
  60         fileobj, if discernible; otherwise, it defaults to the empty string,
  61         and in this case the original filename is not included in the header.
  62
  63         The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
  64         depending on whether the file will be read or written.  The default
  65         is the mode of fileobj if discernible; otherwise, the default is 'rb'.
  66         Be aware that only the 'rb', 'ab', and 'wb' values should be used
  67         for cross-platform portability.
  68
  69         The compresslevel argument is an integer from 1 to 9 controlling the
  70         level of compression; 1 is fastest and produces the least compression,
  71         and 9 is slowest and produces the most compression.  The default is 9.
  72
  73         The mtime argument is an optional numeric timestamp to be written
  74         to the stream when compressing.  All gzip compressed streams
  75         are required to contain a timestamp.  If omitted or None, the
  76         current time is used.  This module ignores the timestamp when
  77         decompressing; however, some programs, such as gunzip, make use
  78         of it.  The format of the timestamp is the same as that of the
  79         return value of time.time() and of the st_mtime member of the
  80         object returned by os.stat().
  81
  82         """
  83
  84         # guarantee the file is opened in binary mode on platforms
  85         # that care about that sort of thing
  86         if mode and 'b' not in mode:
  87             mode += 'b'
  88         if fileobj is None:
  89             fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
  90         if filename is None:
  91             if hasattr(fileobj, 'name'): filename = fileobj.name
  92             else: filename = ''
  93         if mode is None:
  94             if hasattr(fileobj, 'mode'): mode = fileobj.mode
  95             else: mode = 'rb'
  96
  97         if mode[0:1] == 'r':
  98             self.mode = READ
  99             # Set flag indicating start of a new member
 100             self._new_member = True
 101             # Buffer data read from gzip file. extrastart is offset in
 102             # stream where buffer starts. extrasize is number of
 103             # bytes remaining in buffer from current stream position.
 104             self.extrabuf = ""
 105             self.extrasize = 0
 106             self.extrastart = 0
 107             self.name = filename
 108             # Starts small, scales exponentially
 109             self.min_readsize = 100
 110
 111         elif mode[0:1] == 'w' or mode[0:1] == 'a':
 112             self.mode = WRITE
 113             self._init_write(filename)
 114             self.compress = zlib.compressobj(compresslevel,
 115                                              zlib.DEFLATED,
 116                                              -zlib.MAX_WBITS,
 117                                              zlib.DEF_MEM_LEVEL,
 118                                              0)
 119         else:
 120             raise IOError, "Mode " + mode + " not supported"
 121
 122         self.fileobj = fileobj
 123         self.offset = 0
 124         self.mtime = mtime
 125
 126         if self.mode == WRITE:
 127             self._write_gzip_header()
 128
 129     @property
 130     def filename(self):
 131         import warnings
 132         warnings.warn("use the name attribute", DeprecationWarning, 2)
 133         if self.mode == WRITE and self.name[-3:] != ".gz":
 134             return self.name + ".gz"
 135         return self.name
 136
 137     def __repr__(self):
 138         s = repr(self.fileobj)
 139         return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
 140
 141     def _init_write(self, filename):
 142         self.name = filename
 143         self.crc = zlib.crc32("") & 0xffffffffL
 144         self.size = 0
 145         self.writebuf = []
 146         self.bufsize = 0
 147
 148     def _write_gzip_header(self):
 149         self.fileobj.write('\037\213')             # magic header
 150         self.fileobj.write('\010')                 # compression method
 151         fname = os.path.basename(self.name)
 152         if fname.endswith(".gz"):
 153             fname = fname[:-3]
 154         flags = 0
 155         if fname:
 156             flags = FNAME
 157         self.fileobj.write(chr(flags))
 158         mtime = self.mtime
 159         if mtime is None:
 160             mtime = time.time()
 161         write32u(self.fileobj, long(mtime))
 162         self.fileobj.write('\002')
 163         self.fileobj.write('\377')
 164         if fname:
 165             self.fileobj.write(fname + '\000')
 166
 167     def _init_read(self):
 168         self.crc = zlib.crc32("") & 0xffffffffL
 169         self.size = 0
 170
 171     def _read_gzip_header(self):
 172         magic = self.fileobj.read(2)
 173         if magic != '\037\213':
 174             raise IOError, 'Not a gzipped file'
 175         method = ord( self.fileobj.read(1) )
 176         if method != 8:
 177             raise IOError, 'Unknown compression method'
 178         flag = ord( self.fileobj.read(1) )
 179         self.mtime = read32(self.fileobj)
 180         # extraflag = self.fileobj.read(1)
 181         # os = self.fileobj.read(1)
 182         self.fileobj.read(2)
 183
 184         if flag & FEXTRA:
 185             # Read & discard the extra field, if present
 186             xlen = ord(self.fileobj.read(1))
 187             xlen = xlen + 256*ord(self.fileobj.read(1))
 188             self.fileobj.read(xlen)
 189         if flag & FNAME:
 190             # Read and discard a null-terminated string containing the filename
 191             while True:
 192                 s = self.fileobj.read(1)
 193                 if not s or s=='\000':
 194                     break
 195         if flag & FCOMMENT:
 196             # Read and discard a null-terminated string containing a comment
 197             while True:
 198                 s = self.fileobj.read(1)
 199                 if not s or s=='\000':
 200                     break
 201         if flag & FHCRC:
 202             self.fileobj.read(2)     # Read & discard the 16-bit header CRC
 203
 204     def write(self,data):
 205         if self.mode != WRITE:
 206             import errno
 207             raise IOError(errno.EBADF, "write() on read-only GzipFile object")
 208
 209         if self.fileobj is None:
 210             raise ValueError, "write() on closed GzipFile object"
 211
 212         # Convert data type if called by io.BufferedWriter.
 213         if isinstance(data, memoryview):
 214             data = data.tobytes()
 215
 216         if len(data) > 0:
 217             self.size = self.size + len(data)
 218             self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
 219             self.fileobj.write( self.compress.compress(data) )
 220             self.offset += len(data)
 221
 222         return len(data)
 223
 224     def read(self, size=-1):
 225         if self.mode != READ:
 226             import errno
 227             raise IOError(errno.EBADF, "read() on write-only GzipFile object")
 228
 229         if self.extrasize <= 0 and self.fileobj is None:
 230             return ''
 231
 232         readsize = 1024
 233         if size < 0:        # get the whole thing
 234             try:
 235                 while True:
 236                     self._read(readsize)
 237                     readsize = min(self.max_read_chunk, readsize * 2)
 238             except EOFError:
 239                 size = self.extrasize
 240         else:               # just get some more of it
 241             try:
 242                 while size > self.extrasize:
 243                     self._read(readsize)
 244                     readsize = min(self.max_read_chunk, readsize * 2)
 245             except EOFError:
 246                 if size > self.extrasize:
 247                     size = self.extrasize
 248
 249         offset = self.offset - self.extrastart
 250         chunk = self.extrabuf[offset: offset + size]
 251         self.extrasize = self.extrasize - size
 252
 253         self.offset += size
 254         return chunk
 255
 256     def _unread(self, buf):
 257         self.extrasize = len(buf) + self.extrasize
 258         self.offset -= len(buf)
 259
 260     def _read(self, size=1024):
 261         if self.fileobj is None:
 262             raise EOFError, "Reached EOF"
 263
 264         if self._new_member:
 265             # If the _new_member flag is set, we have to
 266             # jump to the next member, if there is one.
 267             #
 268             # First, check if we're at the end of the file;
 269             # if so, it's time to stop; no more members to read.
 270             pos = self.fileobj.tell()   # Save current position
 271             self.fileobj.seek(0, 2)     # Seek to end of file
 272             if pos == self.fileobj.tell():
 273                 raise EOFError, "Reached EOF"
 274             else:
 275                 self.fileobj.seek( pos ) # Return to original position
 276
 277             self._init_read()
 278             self._read_gzip_header()
 279             self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
 280             self._new_member = False
 281
 282         # Read a chunk of data from the file
 283         buf = self.fileobj.read(size)
 284
 285         # If the EOF has been reached, flush the decompression object
 286         # and mark this object as finished.
 287
 288         if buf == "":
 289             uncompress = self.decompress.flush()
 290             self._read_eof()
 291             self._add_read_data( uncompress )
 292             raise EOFError, 'Reached EOF'
 293
 294         uncompress = self.decompress.decompress(buf)
 295         self._add_read_data( uncompress )
 296
 297         if self.decompress.unused_data != "":
 298             # Ending case: we've come to the end of a member in the file,
 299             # so seek back to the start of the unused data, finish up
 300             # this member, and read a new gzip header.
 301             # (The number of bytes to seek back is the length of the unused
 302             # data, minus 8 because _read_eof() will rewind a further 8 bytes)
 303             self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
 304
 305             # Check the CRC and file size, and set the flag so we read
 306             # a new member on the next call
 307             self._read_eof()
 308             self._new_member = True
 309
 310     def _add_read_data(self, data):
 311         self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
 312         offset = self.offset - self.extrastart
 313         self.extrabuf = self.extrabuf[offset:] + data
 314         self.extrasize = self.extrasize + len(data)
 315         self.extrastart = self.offset
 316         self.size = self.size + len(data)
 317
 318     def _read_eof(self):
 319         # We've read to the end of the file, so we have to rewind in order
 320         # to reread the 8 bytes containing the CRC and the file size.
 321         # We check the that the computed CRC and size of the
 322         # uncompressed data matches the stored values.  Note that the size
 323         # stored is the true file size mod 2**32.
 324         self.fileobj.seek(-8, 1)
 325         crc32 = read32(self.fileobj)
 326         isize = read32(self.fileobj)  # may exceed 2GB
 327         if crc32 != self.crc:
 328             raise IOError("CRC check failed %s != %s" % (hex(crc32),
 329                                                          hex(self.crc)))
 330         elif isize != (self.size & 0xffffffffL):
 331             raise IOError, "Incorrect length of data produced"
 332
 333     @property
 334     def closed(self):
 335         return self.fileobj is None
 336
 337     def close(self):
 338         if self.fileobj is None:
 339             return
 340         if self.mode == WRITE:
 341             self.fileobj.write(self.compress.flush())
 342             write32u(self.fileobj, self.crc)
 343             # self.size may exceed 2GB, or even 4GB
 344             write32u(self.fileobj, self.size & 0xffffffffL)
 345             self.fileobj = None
 346         elif self.mode == READ:
 347             self.fileobj = None
 348         if self.myfileobj:
 349             self.myfileobj.close()
 350             self.myfileobj = None
 351
 352     def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
 353         if self.mode == WRITE:
 354             # Ensure the compressor's buffer is flushed
 355             self.fileobj.write(self.compress.flush(zlib_mode))
 356         self.fileobj.flush()
 357
 358     def fileno(self):
 359         """Invoke the underlying file object's fileno() method.
 360
 361         This will raise AttributeError if the underlying file object
 362         doesn't support fileno().
 363         """
 364         return self.fileobj.fileno()
 365
 366     def rewind(self):
 367         '''Return the uncompressed stream file position indicator to the
 368         beginning of the file'''
 369         if self.mode != READ:
 370             raise IOError("Can't rewind in write mode")
 371         self.fileobj.seek(0)
 372         self._new_member = True
 373         self.extrabuf = ""
 374         self.extrasize = 0
 375         self.extrastart = 0
 376         self.offset = 0
 377
 378     def readable(self):
 379         return self.mode == READ
 380
 381     def writable(self):
 382         return self.mode == WRITE
 383
 384     def seekable(self):
 385         return True
 386
 387     def seek(self, offset, whence=0):
 388         if whence:
 389             if whence == 1:
 390                 offset = self.offset + offset
 391             else:
 392                 raise ValueError('Seek from end not supported')
 393         if self.mode == WRITE:
 394             if offset < self.offset:
 395                 raise IOError('Negative seek in write mode')
 396             count = offset - self.offset
 397             for i in range(count // 1024):
 398                 self.write(1024 * '\0')
 399             self.write((count % 1024) * '\0')
 400         elif self.mode == READ:
 401             if offset < self.offset:
 402                 # for negative seek, rewind and do positive seek
 403                 self.rewind()
 404             count = offset - self.offset
 405             for i in range(count // 1024):
 406                 self.read(1024)
 407             self.read(count % 1024)
 408
 409         return self.offset
 410
 411     def readline(self, size=-1):
 412         if size < 0:
 413             # Shortcut common case - newline found in buffer.
 414             offset = self.offset - self.extrastart
 415             i = self.extrabuf.find('\n', offset) + 1
 416             if i > 0:
 417                 self.extrasize -= i - offset
 418                 self.offset += i - offset
 419                 return self.extrabuf[offset: i]
 420
 421             size = sys.maxint
 422             readsize = self.min_readsize
 423         else:
 424             readsize = size
 425         bufs = []
 426         while size != 0:
 427             c = self.read(readsize)
 428             i = c.find('\n')
 429
 430             # We set i=size to break out of the loop under two
 431             # conditions: 1) there's no newline, and the chunk is
 432             # larger than size, or 2) there is a newline, but the
 433             # resulting line would be longer than 'size'.
 434             if (size <= i) or (i == -1 and len(c) > size):
 435                 i = size - 1
 436
 437             if i >= 0 or c == '':
 438                 bufs.append(c[:i + 1])    # Add portion of last chunk
 439                 self._unread(c[i + 1:])   # Push back rest of chunk
 440                 break
 441
 442             # Append chunk to list, decrease 'size',
 443             bufs.append(c)
 444             size = size - len(c)
 445             readsize = min(size, readsize * 2)
 446         if readsize > self.min_readsize:
 447             self.min_readsize = min(readsize, self.min_readsize * 2, 512)
 448         return ''.join(bufs) # Return resulting line
 449
 450
 451 def _test():
 452     # Act like gzip; with -d, act like gunzip.
 453     # The input file is not deleted, however, nor are any other gzip
 454     # options or features supported.
 455     args = sys.argv[1:]
 456     decompress = args and args[0] == "-d"
 457     if decompress:
 458         args = args[1:]
 459     if not args:
 460         args = ["-"]
 461     for arg in args:
 462         if decompress:
 463             if arg == "-":
 464                 f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
 465                 g = sys.stdout
 466             else:
 467                 if arg[-3:] != ".gz":
 468                     print "filename doesn't end in .gz:", repr(arg)
 469                     continue
 470                 f = open(arg, "rb")
 471                 g = __builtin__.open(arg[:-3], "wb")
 472         else:
 473             if arg == "-":
 474                 f = sys.stdin
 475                 g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
 476             else:
 477                 f = __builtin__.open(arg, "rb")
 478                 g = open(arg + ".gz", "wb")
 479         while True:
 480             chunk = f.read(1024)
 481             if not chunk:
 482                 break
 483             g.write(chunk)
 484         if g is not sys.stdout:
 485             g.close()
 486         if f is not sys.stdin:
 487             f.close()
 488
# Run the command-line gzip/gunzip emulation when executed as a script.
if __name__ == '__main__':
    _test()