1 """Functions that read and write gzipped files.
3 The user of the file doesn't have to worry about the compression,
4 but random access is not allowed."""
6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
8 import struct, sys, time
9 import zlib
10 import builtins
12 __all__ = ["GzipFile","open"]
14 FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
16 READ, WRITE = 1, 2
def U32(i):
    """Return i as an unsigned integer, assuming it fits in 32 bits.
    If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
    """
    if i < 0:
        i += 1 << 32
    return i

def LOWU32(i):
    """Return the low-order 32 bits, as a non-negative int"""
    return i & 0xFFFFFFFF

def write32u(output, value):
    # The L format writes the bit pattern correctly whether signed
    # or unsigned.
    output.write(struct.pack("<L", value))

def read32(input):
    return struct.unpack("<I", input.read(4))[0]

def open(filename, mode="rb", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    The filename argument is required; mode defaults to 'rb'
    and compresslevel defaults to 9.

    """
    return GzipFile(filename, mode, compresslevel)

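# Illustrative sketch (not part of the module): a typical round trip through
# the open() shorthand above. The path 'example.gz' is hypothetical.
#
#   import gzip
#   f = gzip.open('example.gz', 'wb')
#   f.write(b'Hello, compressed world!\n')
#   f.close()
#   f = gzip.open('example.gz', 'rb')
#   assert f.read() == b'Hello, compressed world!\n'
#   f.close()
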
class GzipFile:
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    myfileobj = None
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name'): filename = fileobj.name
            else: filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'): mode = fileobj.mode
            else: mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            self.extrabuf = b""
            self.extrasize = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

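    # Illustrative sketch (not part of the module): the fileobj and mtime
    # arguments let GzipFile compress into any file-like object, e.g. an
    # in-memory buffer. The names below are hypothetical.
    #
    #   import io, gzip
    #   buf = io.BytesIO()
    #   gz = gzip.GzipFile(filename='log.txt', mode='wb', fileobj=buf, mtime=0)
    #   gz.write(b'some data')
    #   gz.close()                      # buf now holds a complete gzip stream
    #   compressed = buf.getvalue()
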
    @property
    def filename(self):
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        self.name = filename
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = self.name.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        self.fileobj.write(b'\002')                 # extra flags: slowest/best compression
        self.fileobj.write(b'\377')                 # OS byte: 255 = unknown
        if fname:
            self.fileobj.write(fname + b'\000')

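    # Illustrative sketch (not part of the module): the fixed portion of the
    # header written above is 10 bytes -- magic, method, flags, mtime, the
    # extra-flags byte and the OS byte. Assuming 'header' holds the first
    # 10 bytes of a gzip stream, it can be unpacked with struct:
    #
    #   import struct
    #   magic, method, flags, mtime, xfl, os_byte = \
    #       struct.unpack("<HBBIBB", header[:10])
    #   assert magic == 0x8b1f and method == 8
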
    def _init_read(self):
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0

    def _read_gzip_header(self):
        magic = self.fileobj.read(2)
        if magic != b'\037\213':
            raise IOError('Not a gzipped file')
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord( self.fileobj.read(1) )
        self.mtime = read32(self.fileobj)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self,data):
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")
        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

    def read(self, size=-1):
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        chunk = self.extrabuf[:size]
        self.extrabuf = self.extrabuf[size:]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.

            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek( pos ) # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

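    # Illustrative sketch (not part of the module): the multi-member handling
    # above means that concatenated gzip streams decompress as one file. The
    # paths used are hypothetical.
    #
    #   import gzip
    #   with open('combined.gz', 'wb') as out:            # builtin open()
    #       for name in ('part1.gz', 'part2.gz'):
    #           with open(name, 'rb') as part:
    #               out.write(part.read())
    #   data = gzip.open('combined.gz', 'rb').read()      # part1 + part2 payloads
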
    def _add_read_data(self, data):
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        self.extrabuf = self.extrabuf + data
        self.extrasize = self.extrasize + len(data)
        self.size = self.size + len(data)

    def _read_eof(self):
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data match the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

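    # Illustrative sketch (not part of the module): the 8-byte trailer checked
    # above can also be read directly. Assuming 'raw' holds one complete gzip
    # member and 'payload' its uncompressed data:
    #
    #   import struct, zlib
    #   crc32, isize = struct.unpack("<II", raw[-8:])
    #   assert crc32 == (zlib.crc32(payload) & 0xffffffff)
    #   assert isize == (len(payload) & 0xffffffff)
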
    def close(self):
        if self.fileobj is None:
            return
        if self.mode == WRITE:
            self.fileobj.write(self.compress.flush())
            write32u(self.fileobj, self.crc)
            # self.size may exceed 2GB, or even 4GB
            write32u(self.fileobj, self.size & 0xffffffff)
            self.fileobj = None
        elif self.mode == READ:
            self.fileobj = None
        if self.myfileobj:
            self.myfileobj.close()
            self.myfileobj = None

    def __del__(self):
        try:
            if (self.myfileobj is None and
                self.fileobj is None):
                return
        except AttributeError:
            return
        self.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def isatty(self):
        return False

    def tell(self):
        return self.offset

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = b""
        self.extrasize = 0
        self.offset = 0

    def seek(self, offset, whence=0):
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            chunk = bytes(1024)
            for i in range(count // 1024):
                self.write(chunk)
            self.write(bytes(count % 1024))
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

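    # Illustrative sketch (not part of the module): because the stream is
    # compressed, seek() above only emulates random access -- a backwards seek
    # in read mode rewinds and re-reads, and a forwards seek in write mode
    # writes zero bytes. The path below is hypothetical.
    #
    #   import gzip
    #   f = gzip.open('example.gz', 'rb')
    #   f.read(100)
    #   f.seek(10)          # rewinds to the start, then reads 10 bytes forward
    #   f.seek(60, 1)       # relative: reads another 60 bytes forward
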
    def readline(self, size=-1):
        if size < 0:
            size = sys.maxsize
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find(b'\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == b'':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return b''.join(bufs) # Return resulting line

    def readlines(self, sizehint=0):
        # Negative numbers result in reading all the lines
        if sizehint <= 0:
            sizehint = sys.maxsize
        L = []
        while sizehint > 0:
            line = self.readline()
            if line == b"":
                break
            L.append(line)
            sizehint = sizehint - len(line)

        return L

    def writelines(self, L):
        for line in L:
            self.write(line)

    def __iter__(self):
        return self

    def __next__(self):
        line = self.readline()
        if line:
            return line
        else:
            raise StopIteration

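    # Illustrative sketch (not part of the module): readline() and the iterator
    # protocol above allow line-by-line reading; lines are returned as bytes.
    # The path is hypothetical.
    #
    #   import gzip
    #   for line in gzip.open('log.txt.gz', 'rb'):
    #       print(line.decode('utf-8'), end='')
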
    def __enter__(self):
        if self.fileobj is None:
            raise ValueError("I/O operation on closed GzipFile object")
        return self

    def __exit__(self, *args):
        self.close()

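    # Illustrative sketch (not part of the module): __enter__/__exit__ above
    # make GzipFile usable as a context manager, so callers can rely on the
    # trailer being written even if an exception occurs. The path is
    # hypothetical.
    #
    #   import gzip
    #   with gzip.open('example.gz', 'wb') as f:
    #       f.write(b'data written inside a with block\n')
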
def _test():
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz:", repr(arg))
                    continue
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer)
            else:
                f = builtins.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        while True:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        # Close everything except the standard streams.
        if g is not sys.stdout.buffer:
            g.close()
        if f is not sys.stdin.buffer:
            f.close()

if __name__ == '__main__':
    _test()
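
# Illustrative note (not part of the module): _test() above gives this file a
# minimal command-line interface, e.g. (file names hypothetical):
#
#   python gzip.py somefile        # compresses to somefile.gz
#   python gzip.py -d somefile.gz  # decompresses to somefile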