Lib/gzip.py

   1 """Functions that read and write gzipped files.
   2
   3 The user of the file doesn't have to worry about the compression,
   4 but random access is not allowed."""
   5
   6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
   7
   8 import struct, sys, time, os
   9 import zlib
  10 import __builtin__
  11
  12 __all__ = ["GzipFile","open"]
  13
  14 FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
  15
  16 READ, WRITE = 1, 2
  17
  18 def write32u(output, value):
  19     # The L format writes the bit pattern correctly whether signed
  20     # or unsigned.
  21     output.write(struct.pack("<L", value))
  22
  23 def read32(input):
  24     return struct.unpack("<I", input.read(4))[0]
  25
  26 def open(filename, mode="rb", compresslevel=9):
  27     """Shorthand for GzipFile(filename, mode, compresslevel).
  28
  29     The filename argument is required; mode defaults to 'rb'
  30     and compresslevel defaults to 9.
  31
  32     """
  33     return GzipFile(filename, mode, compresslevel)
  34
  35 class GzipFile:
  36     """The GzipFile class simulates most of the methods of a file object with
  37     the exception of the readinto() and truncate() methods.
  38
  39     """
  40
  41     myfileobj = None
  42     max_read_chunk = 10 * 1024 * 1024   # 10Mb
  43
  44     def __init__(self, filename=None, mode=None,
  45                  compresslevel=9, fileobj=None, mtime=None):
  46         """Constructor for the GzipFile class.
  47
  48         At least one of fileobj and filename must be given a
  49         non-trivial value.
  50
  51         The new class instance is based on fileobj, which can be a regular
  52         file, a StringIO object, or any other object which simulates a file.
  53         It defaults to None, in which case filename is opened to provide
  54         a file object.
  55
  56         When fileobj is not None, the filename argument is only used to be
  57         included in the gzip file header, which may includes the original
  58         filename of the uncompressed file.  It defaults to the filename of
  59         fileobj, if discernible; otherwise, it defaults to the empty string,
  60         and in this case the original filename is not included in the header.
  61
  62         The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
  63         depending on whether the file will be read or written.  The default
  64         is the mode of fileobj if discernible; otherwise, the default is 'rb'.
  65         Be aware that only the 'rb', 'ab', and 'wb' values should be used
  66         for cross-platform portability.
  67
  68         The compresslevel argument is an integer from 1 to 9 controlling the
  69         level of compression; 1 is fastest and produces the least compression,
  70         and 9 is slowest and produces the most compression.  The default is 9.
  71
  72         The mtime argument is an optional numeric timestamp to be written
  73         to the stream when compressing.  All gzip compressed streams
  74         are required to contain a timestamp.  If omitted or None, the
  75         current time is used.  This module ignores the timestamp when
  76         decompressing; however, some programs, such as gunzip, make use
  77         of it.  The format of the timestamp is the same as that of the
  78         return value of time.time() and of the st_mtime member of the
  79         object returned by os.stat().
  80
  81         """
  82
  83         # guarantee the file is opened in binary mode on platforms
  84         # that care about that sort of thing
  85         if mode and 'b' not in mode:
  86             mode += 'b'
  87         if fileobj is None:
  88             fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
  89         if filename is None:
  90             if hasattr(fileobj, 'name'): filename = fileobj.name
  91             else: filename = ''
  92         if mode is None:
  93             if hasattr(fileobj, 'mode'): mode = fileobj.mode
  94             else: mode = 'rb'
  95
  96         if mode[0:1] == 'r':
  97             self.mode = READ
  98             # Set flag indicating start of a new member
  99             self._new_member = True
 100             self.extrabuf = ""
 101             self.extrasize = 0
 102             self.name = filename
 103             # Starts small, scales exponentially
 104             self.min_readsize = 100
 105
 106         elif mode[0:1] == 'w' or mode[0:1] == 'a':
 107             self.mode = WRITE
 108             self._init_write(filename)
 109             self.compress = zlib.compressobj(compresslevel,
 110                                              zlib.DEFLATED,
 111                                              -zlib.MAX_WBITS,
 112                                              zlib.DEF_MEM_LEVEL,
 113                                              0)
 114         else:
 115             raise IOError, "Mode " + mode + " not supported"
 116
 117         self.fileobj = fileobj
 118         self.offset = 0
 119         self.mtime = mtime
 120
 121         if self.mode == WRITE:
 122             self._write_gzip_header()
 123
 124     @property
 125     def filename(self):
 126         import warnings
 127         warnings.warn("use the name attribute", DeprecationWarning, 2)
 128         if self.mode == WRITE and self.name[-3:] != ".gz":
 129             return self.name + ".gz"
 130         return self.name
 131
 132     def __repr__(self):
 133         s = repr(self.fileobj)
 134         return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
 135
 136     def _init_write(self, filename):
 137         self.name = filename
 138         self.crc = zlib.crc32("") & 0xffffffffL
 139         self.size = 0
 140         self.writebuf = []
 141         self.bufsize = 0
 142
 143     def _write_gzip_header(self):
 144         self.fileobj.write('\037\213')             # magic header
 145         self.fileobj.write('\010')                 # compression method
 146         fname = os.path.basename(self.name)
 147         if fname.endswith(".gz"):
 148             fname = fname[:-3]
 149         flags = 0
 150         if fname:
 151             flags = FNAME
 152         self.fileobj.write(chr(flags))
 153         mtime = self.mtime
 154         if mtime is None:
 155             mtime = time.time()
 156         write32u(self.fileobj, long(mtime))
 157         self.fileobj.write('\002')
 158         self.fileobj.write('\377')
 159         if fname:
 160             self.fileobj.write(fname + '\000')
 161
 162     def _init_read(self):
 163         self.crc = zlib.crc32("") & 0xffffffffL
 164         self.size = 0
 165
 166     def _read_gzip_header(self):
 167         magic = self.fileobj.read(2)
 168         if magic != '\037\213':
 169             raise IOError, 'Not a gzipped file'
 170         method = ord( self.fileobj.read(1) )
 171         if method != 8:
 172             raise IOError, 'Unknown compression method'
 173         flag = ord( self.fileobj.read(1) )
 174         self.mtime = read32(self.fileobj)
 175         # extraflag = self.fileobj.read(1)
 176         # os = self.fileobj.read(1)
 177         self.fileobj.read(2)
 178
 179         if flag & FEXTRA:
 180             # Read & discard the extra field, if present
 181             xlen = ord(self.fileobj.read(1))
 182             xlen = xlen + 256*ord(self.fileobj.read(1))
 183             self.fileobj.read(xlen)
 184         if flag & FNAME:
 185             # Read and discard a null-terminated string containing the filename
 186             while True:
 187                 s = self.fileobj.read(1)
 188                 if not s or s=='\000':
 189                     break
 190         if flag & FCOMMENT:
 191             # Read and discard a null-terminated string containing a comment
 192             while True:
 193                 s = self.fileobj.read(1)
 194                 if not s or s=='\000':
 195                     break
 196         if flag & FHCRC:
 197             self.fileobj.read(2)     # Read & discard the 16-bit header CRC
 198
 199
 200     def write(self,data):
 201         if self.mode != WRITE:
 202             import errno
 203             raise IOError(errno.EBADF, "write() on read-only GzipFile object")
 204
 205         if self.fileobj is None:
 206             raise ValueError, "write() on closed GzipFile object"
 207         if len(data) > 0:
 208             self.size = self.size + len(data)
 209             self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
 210             self.fileobj.write( self.compress.compress(data) )
 211             self.offset += len(data)
 212
 213     def read(self, size=-1):
 214         if self.mode != READ:
 215             import errno
 216             raise IOError(errno.EBADF, "read() on write-only GzipFile object")
 217
 218         if self.extrasize <= 0 and self.fileobj is None:
 219             return ''
 220
 221         readsize = 1024
 222         if size < 0:        # get the whole thing
 223             try:
 224                 while True:
 225                     self._read(readsize)
 226                     readsize = min(self.max_read_chunk, readsize * 2)
 227             except EOFError:
 228                 size = self.extrasize
 229         else:               # just get some more of it
 230             try:
 231                 while size > self.extrasize:
 232                     self._read(readsize)
 233                     readsize = min(self.max_read_chunk, readsize * 2)
 234             except EOFError:
 235                 if size > self.extrasize:
 236                     size = self.extrasize
 237
 238         chunk = self.extrabuf[:size]
 239         self.extrabuf = self.extrabuf[size:]
 240         self.extrasize = self.extrasize - size
 241
 242         self.offset += size
 243         return chunk
 244
 245     def _unread(self, buf):
 246         self.extrabuf = buf + self.extrabuf
 247         self.extrasize = len(buf) + self.extrasize
 248         self.offset -= len(buf)
 249
 250     def _read(self, size=1024):
 251         if self.fileobj is None:
 252             raise EOFError, "Reached EOF"
 253
 254         if self._new_member:
 255             # If the _new_member flag is set, we have to
 256             # jump to the next member, if there is one.
 257             #
 258             # First, check if we're at the end of the file;
 259             # if so, it's time to stop; no more members to read.
 260             pos = self.fileobj.tell()   # Save current position
 261             self.fileobj.seek(0, 2)     # Seek to end of file
 262             if pos == self.fileobj.tell():
 263                 raise EOFError, "Reached EOF"
 264             else:
 265                 self.fileobj.seek( pos ) # Return to original position
 266
 267             self._init_read()
 268             self._read_gzip_header()
 269             self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
 270             self._new_member = False
 271
 272         # Read a chunk of data from the file
 273         buf = self.fileobj.read(size)
 274
 275         # If the EOF has been reached, flush the decompression object
 276         # and mark this object as finished.
 277
 278         if buf == "":
 279             uncompress = self.decompress.flush()
 280             self._read_eof()
 281             self._add_read_data( uncompress )
 282             raise EOFError, 'Reached EOF'
 283
 284         uncompress = self.decompress.decompress(buf)
 285         self._add_read_data( uncompress )
 286
 287         if self.decompress.unused_data != "":
 288             # Ending case: we've come to the end of a member in the file,
 289             # so seek back to the start of the unused data, finish up
 290             # this member, and read a new gzip header.
 291             # (The number of bytes to seek back is the length of the unused
 292             # data, minus 8 because _read_eof() will rewind a further 8 bytes)
 293             self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
 294
 295             # Check the CRC and file size, and set the flag so we read
 296             # a new member on the next call
 297             self._read_eof()
 298             self._new_member = True
 299
 300     def _add_read_data(self, data):
 301         self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
 302         self.extrabuf = self.extrabuf + data
 303         self.extrasize = self.extrasize + len(data)
 304         self.size = self.size + len(data)
 305
 306     def _read_eof(self):
 307         # We've read to the end of the file, so we have to rewind in order
 308         # to reread the 8 bytes containing the CRC and the file size.
 309         # We check the that the computed CRC and size of the
 310         # uncompressed data matches the stored values.  Note that the size
 311         # stored is the true file size mod 2**32.
 312         self.fileobj.seek(-8, 1)
 313         crc32 = read32(self.fileobj)
 314         isize = read32(self.fileobj)  # may exceed 2GB
 315         if crc32 != self.crc:
 316             raise IOError("CRC check failed %s != %s" % (hex(crc32),
 317                                                          hex(self.crc)))
 318         elif isize != (self.size & 0xffffffffL):
 319             raise IOError, "Incorrect length of data produced"
 320
 321     def close(self):
 322         if self.fileobj is None:
 323             return
 324         if self.mode == WRITE:
 325             self.fileobj.write(self.compress.flush())
 326             write32u(self.fileobj, self.crc)
 327             # self.size may exceed 2GB, or even 4GB
 328             write32u(self.fileobj, self.size & 0xffffffffL)
 329             self.fileobj = None
 330         elif self.mode == READ:
 331             self.fileobj = None
 332         if self.myfileobj:
 333             self.myfileobj.close()
 334             self.myfileobj = None
 335
 336     def __del__(self):
 337         try:
 338             if (self.myfileobj is None and
 339                 self.fileobj is None):
 340                 return
 341         except AttributeError:
 342             return
 343         self.close()
 344
 345     def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
 346         if self.mode == WRITE:
 347             # Ensure the compressor's buffer is flushed
 348             self.fileobj.write(self.compress.flush(zlib_mode))
 349         self.fileobj.flush()
 350
 351     def fileno(self):
 352         """Invoke the underlying file object's fileno() method.
 353
 354         This will raise AttributeError if the underlying file object
 355         doesn't support fileno().
 356         """
 357         return self.fileobj.fileno()
 358
 359     def isatty(self):
 360         return False
 361
 362     def tell(self):
 363         return self.offset
 364
 365     def rewind(self):
 366         '''Return the uncompressed stream file position indicator to the
 367         beginning of the file'''
 368         if self.mode != READ:
 369             raise IOError("Can't rewind in write mode")
 370         self.fileobj.seek(0)
 371         self._new_member = True
 372         self.extrabuf = ""
 373         self.extrasize = 0
 374         self.offset = 0
 375
 376     def seek(self, offset, whence=0):
 377         if whence:
 378             if whence == 1:
 379                 offset = self.offset + offset
 380             else:
 381                 raise ValueError('Seek from end not supported')
 382         if self.mode == WRITE:
 383             if offset < self.offset:
 384                 raise IOError('Negative seek in write mode')
 385             count = offset - self.offset
 386             for i in range(count // 1024):
 387                 self.write(1024 * '\0')
 388             self.write((count % 1024) * '\0')
 389         elif self.mode == READ:
 390             if offset < self.offset:
 391                 # for negative seek, rewind and do positive seek
 392                 self.rewind()
 393             count = offset - self.offset
 394             for i in range(count // 1024):
 395                 self.read(1024)
 396             self.read(count % 1024)
 397
 398     def readline(self, size=-1):
 399         if size < 0:
 400             size = sys.maxint
 401             readsize = self.min_readsize
 402         else:
 403             readsize = size
 404         bufs = []
 405         while size != 0:
 406             c = self.read(readsize)
 407             i = c.find('\n')
 408
 409             # We set i=size to break out of the loop under two
 410             # conditions: 1) there's no newline, and the chunk is
 411             # larger than size, or 2) there is a newline, but the
 412             # resulting line would be longer than 'size'.
 413             if (size <= i) or (i == -1 and len(c) > size):
 414                 i = size - 1
 415
 416             if i >= 0 or c == '':
 417                 bufs.append(c[:i + 1])    # Add portion of last chunk
 418                 self._unread(c[i + 1:])   # Push back rest of chunk
 419                 break
 420
 421             # Append chunk to list, decrease 'size',
 422             bufs.append(c)
 423             size = size - len(c)
 424             readsize = min(size, readsize * 2)
 425         if readsize > self.min_readsize:
 426             self.min_readsize = min(readsize, self.min_readsize * 2, 512)
 427         return ''.join(bufs) # Return resulting line
 428
 429     def readlines(self, sizehint=0):
 430         # Negative numbers result in reading all the lines
 431         if sizehint <= 0:
 432             sizehint = sys.maxint
 433         L = []
 434         while sizehint > 0:
 435             line = self.readline()
 436             if line == "":
 437                 break
 438             L.append(line)
 439             sizehint = sizehint - len(line)
 440
 441         return L
 442
 443     def writelines(self, L):
 444         for line in L:
 445             self.write(line)
 446
 447     def __iter__(self):
 448         return self
 449
 450     def next(self):
 451         line = self.readline()
 452         if line:
 453             return line
 454         else:
 455             raise StopIteration
 456
 457     def __enter__(self):
 458         if self.fileobj is None:
 459             raise ValueError("I/O operation on closed GzipFile object")
 460         return self
 461
 462     def __exit__(self, *args):
 463         self.close()
 464
 465
 466 def _test():
 467     # Act like gzip; with -d, act like gunzip.
 468     # The input file is not deleted, however, nor are any other gzip
 469     # options or features supported.
 470     args = sys.argv[1:]
 471     decompress = args and args[0] == "-d"
 472     if decompress:
 473         args = args[1:]
 474     if not args:
 475         args = ["-"]
 476     for arg in args:
 477         if decompress:
 478             if arg == "-":
 479                 f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
 480                 g = sys.stdout
 481             else:
 482                 if arg[-3:] != ".gz":
 483                     print "filename doesn't end in .gz:", repr(arg)
 484                     continue
 485                 f = open(arg, "rb")
 486                 g = __builtin__.open(arg[:-3], "wb")
 487         else:
 488             if arg == "-":
 489                 f = sys.stdin
 490                 g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
 491             else:
 492                 f = __builtin__.open(arg, "rb")
 493                 g = open(arg + ".gz", "wb")
 494         while True:
 495             chunk = f.read(1024)
 496             if not chunk:
 497                 break
 498             g.write(chunk)
 499         if g is not sys.stdout:
 500             g.close()
 501         if f is not sys.stdin:
 502             f.close()
 503
 504 if __name__ == '__main__':
 505     _test()