1 """Functions that read and write gzipped files.
3 The user of the file doesn't have to worry about the compression,
4 but random access is not allowed."""
6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
8 import struct
, sys
, time
12 __all__
= ["GzipFile","open"]
14 FTEXT
, FHCRC
, FEXTRA
, FNAME
, FCOMMENT
= 1, 2, 4, 8, 16
19 """Return i as an unsigned integer, assuming it fits in 32 bits.
20 If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
27 """Return the low-order 32 bits, as a non-negative int"""
def write32u(output, value):
    """Write *value* to *output* as a 32-bit little-endian unsigned integer.

    Args:
        output: Object with a ``write()`` method that accepts bytes.
        value: Integer to write.  Only its low-order 32 bits are emitted.
    """
    # The "<L" format only accepts 0 <= value < 2**32.  Masking first means
    # the same bit pattern is written whether the caller holds the quantity
    # as a signed or an unsigned number, instead of struct.pack() raising
    # struct.error for negative or over-wide values.
    output.write(struct.pack("<L", value & 0xffffffff))
36 return struct
.unpack("<I", input.read(4))[0]
def open(filename, mode="rb", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    The filename argument is required; mode defaults to 'rb'
    and compresslevel defaults to 9.

    Returns the newly constructed GzipFile object.
    """
    return GzipFile(filename, mode, compresslevel)
48 """The GzipFile class simulates most of the methods of a file object with
49 the exception of the readinto() and truncate() methods.
54 max_read_chunk
= 10 * 1024 * 1024 # 10Mb
56 def __init__(self
, filename
=None, mode
=None,
57 compresslevel
=9, fileobj
=None, mtime
=None):
58 """Constructor for the GzipFile class.
60 At least one of fileobj and filename must be given a
63 The new class instance is based on fileobj, which can be a regular
64 file, a StringIO object, or any other object which simulates a file.
65 It defaults to None, in which case filename is opened to provide
68 When fileobj is not None, the filename argument is only used to be
69 included in the gzip file header, which may include the original
70 filename of the uncompressed file. It defaults to the filename of
71 fileobj, if discernible; otherwise, it defaults to the empty string,
72 and in this case the original filename is not included in the header.
74 The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
75 depending on whether the file will be read or written. The default
76 is the mode of fileobj if discernible; otherwise, the default is 'rb'.
77 Be aware that only the 'rb', 'ab', and 'wb' values should be used
78 for cross-platform portability.
80 The compresslevel argument is an integer from 1 to 9 controlling the
81 level of compression; 1 is fastest and produces the least compression,
82 and 9 is slowest and produces the most compression. The default is 9.
84 The mtime argument is an optional numeric timestamp to be written
85 to the stream when compressing. All gzip compressed streams
86 are required to contain a timestamp. If omitted or None, the
87 current time is used. This module ignores the timestamp when
88 decompressing; however, some programs, such as gunzip, make use
89 of it. The format of the timestamp is the same as that of the
90 return value of time.time() and of the st_mtime member of the
91 object returned by os.stat().
95 # guarantee the file is opened in binary mode on platforms
96 # that care about that sort of thing
97 if mode
and 'b' not in mode
:
100 fileobj
= self
.myfileobj
= builtins
.open(filename
, mode
or 'rb')
102 if hasattr(fileobj
, 'name'): filename
= fileobj
.name
105 if hasattr(fileobj
, 'mode'): mode
= fileobj
.mode
110 # Set flag indicating start of a new member
111 self
._new
_member
= True
115 # Starts small, scales exponentially
116 self
.min_readsize
= 100
118 elif mode
[0:1] == 'w' or mode
[0:1] == 'a':
120 self
._init
_write
(filename
)
121 self
.compress
= zlib
.compressobj(compresslevel
,
127 raise IOError("Mode " + mode
+ " not supported")
129 self
.fileobj
= fileobj
133 if self
.mode
== WRITE
:
134 self
._write
_gzip
_header
()
139 warnings
.warn("use the name attribute", DeprecationWarning, 2)
140 if self
.mode
== WRITE
and self
.name
[-3:] != ".gz":
141 return self
.name
+ ".gz"
145 s
= repr(self
.fileobj
)
146 return '<gzip ' + s
[1:-1] + ' ' + hex(id(self
)) + '>'
148 def _init_write(self
, filename
):
150 self
.crc
= zlib
.crc32("") & 0xffffffff
155 def _write_gzip_header(self
):
156 self
.fileobj
.write(b
'\037\213') # magic header
157 self
.fileobj
.write(b
'\010') # compression method
159 # RFC 1952 requires the FNAME field to be Latin-1. Do not
160 # include filenames that cannot be represented that way.
161 fname
= self
.name
.encode('latin-1')
162 if fname
.endswith(b
'.gz'):
164 except UnicodeEncodeError:
169 self
.fileobj
.write(chr(flags
).encode('latin-1'))
173 write32u(self
.fileobj
, int(mtime
))
174 self
.fileobj
.write(b
'\002')
175 self
.fileobj
.write(b
'\377')
177 self
.fileobj
.write(fname
+ b
'\000')
179 def _init_read(self
):
180 self
.crc
= zlib
.crc32("") & 0xffffffff
183 def _read_gzip_header(self
):
184 magic
= self
.fileobj
.read(2)
185 if magic
!= b
'\037\213':
186 raise IOError('Not a gzipped file')
187 method
= ord( self
.fileobj
.read(1) )
189 raise IOError('Unknown compression method')
190 flag
= ord( self
.fileobj
.read(1) )
191 self
.mtime
= read32(self
.fileobj
)
192 # extraflag = self.fileobj.read(1)
193 # os = self.fileobj.read(1)
197 # Read & discard the extra field, if present
198 xlen
= ord(self
.fileobj
.read(1))
199 xlen
= xlen
+ 256*ord(self
.fileobj
.read(1))
200 self
.fileobj
.read(xlen
)
202 # Read and discard a null-terminated string containing the filename
204 s
= self
.fileobj
.read(1)
205 if not s
or s
==b
'\000':
208 # Read and discard a null-terminated string containing a comment
210 s
= self
.fileobj
.read(1)
211 if not s
or s
==b
'\000':
214 self
.fileobj
.read(2) # Read & discard the 16-bit header CRC
217 def write(self
,data
):
218 if self
.mode
!= WRITE
:
220 raise IOError(errno
.EBADF
, "write() on read-only GzipFile object")
222 if self
.fileobj
is None:
223 raise ValueError("write() on closed GzipFile object")
225 self
.size
= self
.size
+ len(data
)
226 self
.crc
= zlib
.crc32(data
, self
.crc
) & 0xffffffff
227 self
.fileobj
.write( self
.compress
.compress(data
) )
228 self
.offset
+= len(data
)
230 def read(self
, size
=-1):
231 if self
.mode
!= READ
:
233 raise IOError(errno
.EBADF
, "read() on write-only GzipFile object")
235 if self
.extrasize
<= 0 and self
.fileobj
is None:
239 if size
< 0: # get the whole thing
243 readsize
= min(self
.max_read_chunk
, readsize
* 2)
245 size
= self
.extrasize
246 else: # just get some more of it
248 while size
> self
.extrasize
:
250 readsize
= min(self
.max_read_chunk
, readsize
* 2)
252 if size
> self
.extrasize
:
253 size
= self
.extrasize
255 chunk
= self
.extrabuf
[:size
]
256 self
.extrabuf
= self
.extrabuf
[size
:]
257 self
.extrasize
= self
.extrasize
- size
262 def _unread(self
, buf
):
263 self
.extrabuf
= buf
+ self
.extrabuf
264 self
.extrasize
= len(buf
) + self
.extrasize
265 self
.offset
-= len(buf
)
267 def _read(self
, size
=1024):
268 if self
.fileobj
is None:
269 raise EOFError("Reached EOF")
272 # If the _new_member flag is set, we have to
273 # jump to the next member, if there is one.
275 # First, check if we're at the end of the file;
276 # if so, it's time to stop; no more members to read.
277 pos
= self
.fileobj
.tell() # Save current position
278 self
.fileobj
.seek(0, 2) # Seek to end of file
279 if pos
== self
.fileobj
.tell():
280 raise EOFError("Reached EOF")
282 self
.fileobj
.seek( pos
) # Return to original position
285 self
._read
_gzip
_header
()
286 self
.decompress
= zlib
.decompressobj(-zlib
.MAX_WBITS
)
287 self
._new
_member
= False
289 # Read a chunk of data from the file
290 buf
= self
.fileobj
.read(size
)
292 # If the EOF has been reached, flush the decompression object
293 # and mark this object as finished.
296 uncompress
= self
.decompress
.flush()
298 self
._add
_read
_data
( uncompress
)
299 raise EOFError('Reached EOF')
301 uncompress
= self
.decompress
.decompress(buf
)
302 self
._add
_read
_data
( uncompress
)
304 if self
.decompress
.unused_data
!= b
"":
305 # Ending case: we've come to the end of a member in the file,
306 # so seek back to the start of the unused data, finish up
307 # this member, and read a new gzip header.
308 # (The number of bytes to seek back is the length of the unused
309 # data, minus 8 because _read_eof() will rewind a further 8 bytes)
310 self
.fileobj
.seek( -len(self
.decompress
.unused_data
)+8, 1)
312 # Check the CRC and file size, and set the flag so we read
313 # a new member on the next call
315 self
._new
_member
= True
317 def _add_read_data(self
, data
):
318 self
.crc
= zlib
.crc32(data
, self
.crc
) & 0xffffffff
319 self
.extrabuf
= self
.extrabuf
+ data
320 self
.extrasize
= self
.extrasize
+ len(data
)
321 self
.size
= self
.size
+ len(data
)
324 # We've read to the end of the file, so we have to rewind in order
325 # to reread the 8 bytes containing the CRC and the file size.
326 # We check that the computed CRC and size of the
327 # uncompressed data matches the stored values. Note that the size
328 # stored is the true file size mod 2**32.
329 self
.fileobj
.seek(-8, 1)
330 crc32
= read32(self
.fileobj
)
331 isize
= read32(self
.fileobj
) # may exceed 2GB
332 if crc32
!= self
.crc
:
333 raise IOError("CRC check failed %s != %s" % (hex(crc32
),
335 elif isize
!= (self
.size
& 0xffffffff):
336 raise IOError("Incorrect length of data produced")
339 if self
.fileobj
is None:
341 if self
.mode
== WRITE
:
342 self
.fileobj
.write(self
.compress
.flush())
343 write32u(self
.fileobj
, self
.crc
)
344 # self.size may exceed 2GB, or even 4GB
345 write32u(self
.fileobj
, self
.size
& 0xffffffff)
347 elif self
.mode
== READ
:
350 self
.myfileobj
.close()
351 self
.myfileobj
= None
355 if (self
.myfileobj
is None and
356 self
.fileobj
is None):
358 except AttributeError:
362 def flush(self
,zlib_mode
=zlib
.Z_SYNC_FLUSH
):
363 if self
.mode
== WRITE
:
364 # Ensure the compressor's buffer is flushed
365 self
.fileobj
.write(self
.compress
.flush(zlib_mode
))
369 """Invoke the underlying file object's fileno() method.
371 This will raise AttributeError if the underlying file object
372 doesn't support fileno().
374 return self
.fileobj
.fileno()
383 '''Return the uncompressed stream file position indicator to the
384 beginning of the file'''
385 if self
.mode
!= READ
:
386 raise IOError("Can't rewind in write mode")
388 self
._new
_member
= True
393 def seek(self
, offset
, whence
=0):
396 offset
= self
.offset
+ offset
398 raise ValueError('Seek from end not supported')
399 if self
.mode
== WRITE
:
400 if offset
< self
.offset
:
401 raise IOError('Negative seek in write mode')
402 count
= offset
- self
.offset
404 for i
in range(count
// 1024):
406 self
.write(bytes(count
% 1024))
407 elif self
.mode
== READ
:
408 if offset
< self
.offset
:
409 # for negative seek, rewind and do positive seek
411 count
= offset
- self
.offset
412 for i
in range(count
// 1024):
414 self
.read(count
% 1024)
416 def readline(self
, size
=-1):
419 readsize
= self
.min_readsize
424 c
= self
.read(readsize
)
427 # We set i=size to break out of the loop under two
428 # conditions: 1) there's no newline, and the chunk is
429 # larger than size, or 2) there is a newline, but the
430 # resulting line would be longer than 'size'.
431 if (size
<= i
) or (i
== -1 and len(c
) > size
):
434 if i
>= 0 or c
== b
'':
435 bufs
.append(c
[:i
+ 1]) # Add portion of last chunk
436 self
._unread
(c
[i
+ 1:]) # Push back rest of chunk
439 # Append chunk to list, decrease 'size',
442 readsize
= min(size
, readsize
* 2)
443 if readsize
> self
.min_readsize
:
444 self
.min_readsize
= min(readsize
, self
.min_readsize
* 2, 512)
445 return b
''.join(bufs
) # Return resulting line
447 def readlines(self
, sizehint
=0):
448 # Negative numbers result in reading all the lines
450 sizehint
= sys
.maxsize
453 line
= self
.readline()
457 sizehint
= sizehint
- len(line
)
461 def writelines(self
, L
):
469 line
= self
.readline()
476 if self
.fileobj
is None:
477 raise ValueError("I/O operation on closed GzipFile object")
480 def __exit__(self
, *args
):
485 # Act like gzip; with -d, act like gunzip.
486 # The input file is not deleted, however, nor are any other gzip
487 # options or features supported.
489 decompress
= args
and args
[0] == "-d"
497 f
= GzipFile(filename
="", mode
="rb", fileobj
=sys
.stdin
.buffer)
498 g
= sys
.stdout
.buffer
500 if arg
[-3:] != ".gz":
501 print("filename doesn't end in .gz:", repr(arg
))
504 g
= builtins
.open(arg
[:-3], "wb")
508 g
= GzipFile(filename
="", mode
="wb", fileobj
=sys
.stdout
.buffer)
510 f
= builtins
.open(arg
, "rb")
511 g
= open(arg
+ ".gz", "wb")
517 if g
is not sys
.stdout
:
519 if f
is not sys
.stdin
:
522 if __name__
== '__main__':