Lib/gzip.py

   1 """Functions that read and write gzipped files.
   2
   3 The user of the file doesn't have to worry about the compression,
   4 but random access is not allowed."""
   5
   6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
   7
   8 import struct, sys, time
   9 import zlib
  10 import builtins
  11
  12 __all__ = ["GzipFile","open"]
  13
  14 FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
  15
  16 READ, WRITE = 1, 2
  17
  18 def U32(i):
  19     """Return i as an unsigned integer, assuming it fits in 32 bits.
  20     If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
  21     """
  22     if i < 0:
  23         i += 1 << 32
  24     return i
  25
  26 def LOWU32(i):
  27     """Return the low-order 32 bits, as a non-negative int"""
  28     return i & 0xFFFFFFFF
  29
  30 def write32u(output, value):
  31     # The L format writes the bit pattern correctly whether signed
  32     # or unsigned.
  33     output.write(struct.pack("<L", value))
  34
  35 def read32(input):
  36     return struct.unpack("<I", input.read(4))[0]
  37
  38 def open(filename, mode="rb", compresslevel=9):
  39     """Shorthand for GzipFile(filename, mode, compresslevel).
  40
  41     The filename argument is required; mode defaults to 'rb'
  42     and compresslevel defaults to 9.
  43
  44     """
  45     return GzipFile(filename, mode, compresslevel)
  46
  47 class GzipFile:
  48     """The GzipFile class simulates most of the methods of a file object with
  49     the exception of the readinto() and truncate() methods.
  50
  51     """
  52
  53     myfileobj = None
  54     max_read_chunk = 10 * 1024 * 1024   # 10Mb
  55
  56     def __init__(self, filename=None, mode=None,
  57                  compresslevel=9, fileobj=None, mtime=None):
  58         """Constructor for the GzipFile class.
  59
  60         At least one of fileobj and filename must be given a
  61         non-trivial value.
  62
  63         The new class instance is based on fileobj, which can be a regular
  64         file, a StringIO object, or any other object which simulates a file.
  65         It defaults to None, in which case filename is opened to provide
  66         a file object.
  67
  68         When fileobj is not None, the filename argument is only used to be
  69         included in the gzip file header, which may includes the original
  70         filename of the uncompressed file.  It defaults to the filename of
  71         fileobj, if discernible; otherwise, it defaults to the empty string,
  72         and in this case the original filename is not included in the header.
  73
  74         The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
  75         depending on whether the file will be read or written.  The default
  76         is the mode of fileobj if discernible; otherwise, the default is 'rb'.
  77         Be aware that only the 'rb', 'ab', and 'wb' values should be used
  78         for cross-platform portability.
  79
  80         The compresslevel argument is an integer from 1 to 9 controlling the
  81         level of compression; 1 is fastest and produces the least compression,
  82         and 9 is slowest and produces the most compression.  The default is 9.
  83
  84         The mtime argument is an optional numeric timestamp to be written
  85         to the stream when compressing.  All gzip compressed streams
  86         are required to contain a timestamp.  If omitted or None, the
  87         current time is used.  This module ignores the timestamp when
  88         decompressing; however, some programs, such as gunzip, make use
  89         of it.  The format of the timestamp is the same as that of the
  90         return value of time.time() and of the st_mtime member of the
  91         object returned by os.stat().
  92
  93         """
  94
  95         # guarantee the file is opened in binary mode on platforms
  96         # that care about that sort of thing
  97         if mode and 'b' not in mode:
  98             mode += 'b'
  99         if fileobj is None:
 100             fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
 101         if filename is None:
 102             if hasattr(fileobj, 'name'): filename = fileobj.name
 103             else: filename = ''
 104         if mode is None:
 105             if hasattr(fileobj, 'mode'): mode = fileobj.mode
 106             else: mode = 'rb'
 107
 108         if mode[0:1] == 'r':
 109             self.mode = READ
 110             # Set flag indicating start of a new member
 111             self._new_member = True
 112             self.extrabuf = b""
 113             self.extrasize = 0
 114             self.name = filename
 115             # Starts small, scales exponentially
 116             self.min_readsize = 100
 117
 118         elif mode[0:1] == 'w' or mode[0:1] == 'a':
 119             self.mode = WRITE
 120             self._init_write(filename)
 121             self.compress = zlib.compressobj(compresslevel,
 122                                              zlib.DEFLATED,
 123                                              -zlib.MAX_WBITS,
 124                                              zlib.DEF_MEM_LEVEL,
 125                                              0)
 126         else:
 127             raise IOError("Mode " + mode + " not supported")
 128
 129         self.fileobj = fileobj
 130         self.offset = 0
 131         self.mtime = mtime
 132
 133         if self.mode == WRITE:
 134             self._write_gzip_header()
 135
 136     @property
 137     def filename(self):
 138         import warnings
 139         warnings.warn("use the name attribute", DeprecationWarning, 2)
 140         if self.mode == WRITE and self.name[-3:] != ".gz":
 141             return self.name + ".gz"
 142         return self.name
 143
 144     def __repr__(self):
 145         s = repr(self.fileobj)
 146         return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
 147
 148     def _init_write(self, filename):
 149         self.name = filename
 150         self.crc = zlib.crc32("") & 0xffffffff
 151         self.size = 0
 152         self.writebuf = []
 153         self.bufsize = 0
 154
 155     def _write_gzip_header(self):
 156         self.fileobj.write(b'\037\213')             # magic header
 157         self.fileobj.write(b'\010')                 # compression method
 158         try:
 159             # RFC 1952 requires the FNAME field to be Latin-1. Do not
 160             # include filenames that cannot be represented that way.
 161             fname = self.name.encode('latin-1')
 162             if fname.endswith(b'.gz'):
 163                 fname = fname[:-3]
 164         except UnicodeEncodeError:
 165             fname = b''
 166         flags = 0
 167         if fname:
 168             flags = FNAME
 169         self.fileobj.write(chr(flags).encode('latin-1'))
 170         mtime = self.mtime
 171         if mtime is None:
 172             mtime = time.time()
 173         write32u(self.fileobj, int(mtime))
 174         self.fileobj.write(b'\002')
 175         self.fileobj.write(b'\377')
 176         if fname:
 177             self.fileobj.write(fname + b'\000')
 178
 179     def _init_read(self):
 180         self.crc = zlib.crc32("") & 0xffffffff
 181         self.size = 0
 182
 183     def _read_gzip_header(self):
 184         magic = self.fileobj.read(2)
 185         if magic != b'\037\213':
 186             raise IOError('Not a gzipped file')
 187         method = ord( self.fileobj.read(1) )
 188         if method != 8:
 189             raise IOError('Unknown compression method')
 190         flag = ord( self.fileobj.read(1) )
 191         self.mtime = read32(self.fileobj)
 192         # extraflag = self.fileobj.read(1)
 193         # os = self.fileobj.read(1)
 194         self.fileobj.read(2)
 195
 196         if flag & FEXTRA:
 197             # Read & discard the extra field, if present
 198             xlen = ord(self.fileobj.read(1))
 199             xlen = xlen + 256*ord(self.fileobj.read(1))
 200             self.fileobj.read(xlen)
 201         if flag & FNAME:
 202             # Read and discard a null-terminated string containing the filename
 203             while True:
 204                 s = self.fileobj.read(1)
 205                 if not s or s==b'\000':
 206                     break
 207         if flag & FCOMMENT:
 208             # Read and discard a null-terminated string containing a comment
 209             while True:
 210                 s = self.fileobj.read(1)
 211                 if not s or s==b'\000':
 212                     break
 213         if flag & FHCRC:
 214             self.fileobj.read(2)     # Read & discard the 16-bit header CRC
 215
 216
 217     def write(self,data):
 218         if self.mode != WRITE:
 219             import errno
 220             raise IOError(errno.EBADF, "write() on read-only GzipFile object")
 221
 222         if self.fileobj is None:
 223             raise ValueError("write() on closed GzipFile object")
 224         if len(data) > 0:
 225             self.size = self.size + len(data)
 226             self.crc = zlib.crc32(data, self.crc) & 0xffffffff
 227             self.fileobj.write( self.compress.compress(data) )
 228             self.offset += len(data)
 229
 230     def read(self, size=-1):
 231         if self.mode != READ:
 232             import errno
 233             raise IOError(errno.EBADF, "read() on write-only GzipFile object")
 234
 235         if self.extrasize <= 0 and self.fileobj is None:
 236             return b''
 237
 238         readsize = 1024
 239         if size < 0:        # get the whole thing
 240             try:
 241                 while True:
 242                     self._read(readsize)
 243                     readsize = min(self.max_read_chunk, readsize * 2)
 244             except EOFError:
 245                 size = self.extrasize
 246         else:               # just get some more of it
 247             try:
 248                 while size > self.extrasize:
 249                     self._read(readsize)
 250                     readsize = min(self.max_read_chunk, readsize * 2)
 251             except EOFError:
 252                 if size > self.extrasize:
 253                     size = self.extrasize
 254
 255         chunk = self.extrabuf[:size]
 256         self.extrabuf = self.extrabuf[size:]
 257         self.extrasize = self.extrasize - size
 258
 259         self.offset += size
 260         return chunk
 261
 262     def _unread(self, buf):
 263         self.extrabuf = buf + self.extrabuf
 264         self.extrasize = len(buf) + self.extrasize
 265         self.offset -= len(buf)
 266
 267     def _read(self, size=1024):
 268         if self.fileobj is None:
 269             raise EOFError("Reached EOF")
 270
 271         if self._new_member:
 272             # If the _new_member flag is set, we have to
 273             # jump to the next member, if there is one.
 274             #
 275             # First, check if we're at the end of the file;
 276             # if so, it's time to stop; no more members to read.
 277             pos = self.fileobj.tell()   # Save current position
 278             self.fileobj.seek(0, 2)     # Seek to end of file
 279             if pos == self.fileobj.tell():
 280                 raise EOFError("Reached EOF")
 281             else:
 282                 self.fileobj.seek( pos ) # Return to original position
 283
 284             self._init_read()
 285             self._read_gzip_header()
 286             self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
 287             self._new_member = False
 288
 289         # Read a chunk of data from the file
 290         buf = self.fileobj.read(size)
 291
 292         # If the EOF has been reached, flush the decompression object
 293         # and mark this object as finished.
 294
 295         if buf == b"":
 296             uncompress = self.decompress.flush()
 297             self._read_eof()
 298             self._add_read_data( uncompress )
 299             raise EOFError('Reached EOF')
 300
 301         uncompress = self.decompress.decompress(buf)
 302         self._add_read_data( uncompress )
 303
 304         if self.decompress.unused_data != b"":
 305             # Ending case: we've come to the end of a member in the file,
 306             # so seek back to the start of the unused data, finish up
 307             # this member, and read a new gzip header.
 308             # (The number of bytes to seek back is the length of the unused
 309             # data, minus 8 because _read_eof() will rewind a further 8 bytes)
 310             self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
 311
 312             # Check the CRC and file size, and set the flag so we read
 313             # a new member on the next call
 314             self._read_eof()
 315             self._new_member = True
 316
 317     def _add_read_data(self, data):
 318         self.crc = zlib.crc32(data, self.crc) & 0xffffffff
 319         self.extrabuf = self.extrabuf + data
 320         self.extrasize = self.extrasize + len(data)
 321         self.size = self.size + len(data)
 322
 323     def _read_eof(self):
 324         # We've read to the end of the file, so we have to rewind in order
 325         # to reread the 8 bytes containing the CRC and the file size.
 326         # We check the that the computed CRC and size of the
 327         # uncompressed data matches the stored values.  Note that the size
 328         # stored is the true file size mod 2**32.
 329         self.fileobj.seek(-8, 1)
 330         crc32 = read32(self.fileobj)
 331         isize = read32(self.fileobj)  # may exceed 2GB
 332         if crc32 != self.crc:
 333             raise IOError("CRC check failed %s != %s" % (hex(crc32),
 334                                                          hex(self.crc)))
 335         elif isize != (self.size & 0xffffffff):
 336             raise IOError("Incorrect length of data produced")
 337
 338     def close(self):
 339         if self.fileobj is None:
 340             return
 341         if self.mode == WRITE:
 342             self.fileobj.write(self.compress.flush())
 343             write32u(self.fileobj, self.crc)
 344             # self.size may exceed 2GB, or even 4GB
 345             write32u(self.fileobj, self.size & 0xffffffff)
 346             self.fileobj = None
 347         elif self.mode == READ:
 348             self.fileobj = None
 349         if self.myfileobj:
 350             self.myfileobj.close()
 351             self.myfileobj = None
 352
 353     def __del__(self):
 354         try:
 355             if (self.myfileobj is None and
 356                 self.fileobj is None):
 357                 return
 358         except AttributeError:
 359             return
 360         self.close()
 361
 362     def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
 363         if self.mode == WRITE:
 364             # Ensure the compressor's buffer is flushed
 365             self.fileobj.write(self.compress.flush(zlib_mode))
 366         self.fileobj.flush()
 367
 368     def fileno(self):
 369         """Invoke the underlying file object's fileno() method.
 370
 371         This will raise AttributeError if the underlying file object
 372         doesn't support fileno().
 373         """
 374         return self.fileobj.fileno()
 375
 376     def isatty(self):
 377         return False
 378
 379     def tell(self):
 380         return self.offset
 381
 382     def rewind(self):
 383         '''Return the uncompressed stream file position indicator to the
 384         beginning of the file'''
 385         if self.mode != READ:
 386             raise IOError("Can't rewind in write mode")
 387         self.fileobj.seek(0)
 388         self._new_member = True
 389         self.extrabuf = b""
 390         self.extrasize = 0
 391         self.offset = 0
 392
 393     def seek(self, offset, whence=0):
 394         if whence:
 395             if whence == 1:
 396                 offset = self.offset + offset
 397             else:
 398                 raise ValueError('Seek from end not supported')
 399         if self.mode == WRITE:
 400             if offset < self.offset:
 401                 raise IOError('Negative seek in write mode')
 402             count = offset - self.offset
 403             chunk = bytes(1024)
 404             for i in range(count // 1024):
 405                 self.write(chunk)
 406             self.write(bytes(count % 1024))
 407         elif self.mode == READ:
 408             if offset < self.offset:
 409                 # for negative seek, rewind and do positive seek
 410                 self.rewind()
 411             count = offset - self.offset
 412             for i in range(count // 1024):
 413                 self.read(1024)
 414             self.read(count % 1024)
 415
 416     def readline(self, size=-1):
 417         if size < 0:
 418             size = sys.maxsize
 419             readsize = self.min_readsize
 420         else:
 421             readsize = size
 422         bufs = []
 423         while size != 0:
 424             c = self.read(readsize)
 425             i = c.find(b'\n')
 426
 427             # We set i=size to break out of the loop under two
 428             # conditions: 1) there's no newline, and the chunk is
 429             # larger than size, or 2) there is a newline, but the
 430             # resulting line would be longer than 'size'.
 431             if (size <= i) or (i == -1 and len(c) > size):
 432                 i = size - 1
 433
 434             if i >= 0 or c == b'':
 435                 bufs.append(c[:i + 1])    # Add portion of last chunk
 436                 self._unread(c[i + 1:])   # Push back rest of chunk
 437                 break
 438
 439             # Append chunk to list, decrease 'size',
 440             bufs.append(c)
 441             size = size - len(c)
 442             readsize = min(size, readsize * 2)
 443         if readsize > self.min_readsize:
 444             self.min_readsize = min(readsize, self.min_readsize * 2, 512)
 445         return b''.join(bufs) # Return resulting line
 446
 447     def readlines(self, sizehint=0):
 448         # Negative numbers result in reading all the lines
 449         if sizehint <= 0:
 450             sizehint = sys.maxsize
 451         L = []
 452         while sizehint > 0:
 453             line = self.readline()
 454             if line == b"":
 455                 break
 456             L.append(line)
 457             sizehint = sizehint - len(line)
 458
 459         return L
 460
 461     def writelines(self, L):
 462         for line in L:
 463             self.write(line)
 464
 465     def __iter__(self):
 466         return self
 467
 468     def __next__(self):
 469         line = self.readline()
 470         if line:
 471             return line
 472         else:
 473             raise StopIteration
 474
 475     def __enter__(self):
 476         if self.fileobj is None:
 477             raise ValueError("I/O operation on closed GzipFile object")
 478         return self
 479
 480     def __exit__(self, *args):
 481         self.close()
 482
 483
 484 def _test():
 485     # Act like gzip; with -d, act like gunzip.
 486     # The input file is not deleted, however, nor are any other gzip
 487     # options or features supported.
 488     args = sys.argv[1:]
 489     decompress = args and args[0] == "-d"
 490     if decompress:
 491         args = args[1:]
 492     if not args:
 493         args = ["-"]
 494     for arg in args:
 495         if decompress:
 496             if arg == "-":
 497                 f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
 498                 g = sys.stdout.buffer
 499             else:
 500                 if arg[-3:] != ".gz":
 501                     print("filename doesn't end in .gz:", repr(arg))
 502                     continue
 503                 f = open(arg, "rb")
 504                 g = builtins.open(arg[:-3], "wb")
 505         else:
 506             if arg == "-":
 507                 f = sys.stdin.buffer
 508                 g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer)
 509             else:
 510                 f = builtins.open(arg, "rb")
 511                 g = open(arg + ".gz", "wb")
 512         while True:
 513             chunk = f.read(1024)
 514             if not chunk:
 515                 break
 516             g.write(chunk)
 517         if g is not sys.stdout:
 518             g.close()
 519         if f is not sys.stdin:
 520             f.close()
 521
 522 if __name__ == '__main__':
 523     _test()