1 """Functions that read and write gzipped files.
3 The user of the file doesn't have to worry about the compression,
4 but random access is not allowed."""
6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
8 import struct
, sys
, time
12 __all__
= ["GzipFile","open"]
# Bit masks for the individual flags stored in a gzip member header;
# each constant occupies its own bit so they can be OR-ed together.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16
19 """Return i as an unsigned integer, assuming it fits in 32 bits.
21 If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
28 """Return the low-order 32 bits of an int, as a non-negative int."""
29 return i
& 0xFFFFFFFFL
def write32(output, value):
    """Write value to output as a little-endian signed 32-bit integer.

    output is any object with a write() method; value is packed with
    the struct format "<l", so exactly four bytes are written.
    """
    packed = struct.pack("<l", value)
    output.write(packed)
def write32u(output, value):
    """Write value to output as a little-endian unsigned 32-bit integer.

    output is any object with a write() method; value is packed with
    the struct format "<L", so exactly four bytes are written.
    """
    # "<L" emits the 32-bit pattern of the value in little-endian order.
    packed = struct.pack("<L", value)
    output.write(packed)
40 return struct
.unpack("<l", input.read(4))[0]
def open(filename, mode="rb", compresslevel=9):
    """Create and return a GzipFile for filename.

    This is a convenience wrapper equivalent to calling
    GzipFile(filename, mode, compresslevel).  The filename argument
    must be supplied; mode falls back to 'rb' and compresslevel
    falls back to 9 when they are omitted.

    """
    gzip_file = GzipFile(filename, mode, compresslevel)
    return gzip_file
52 """The GzipFile class simulates most of the methods of a file object with
53 the exception of the readinto() and truncate() methods.
58 max_read_chunk
= 10 * 1024 * 1024 # 10Mb
60 def __init__(self
, filename
=None, mode
=None,
61 compresslevel
=9, fileobj
=None):
62 """Constructor for the GzipFile class.
64 At least one of fileobj and filename must be given a
67 The new class instance is based on fileobj, which can be a regular
68 file, a StringIO object, or any other object which simulates a file.
69 It defaults to None, in which case filename is opened to provide
72 When fileobj is not None, the filename argument is only used to be
73 included in the gzip file header, which may include the original
74 filename of the uncompressed file. It defaults to the filename of
75 fileobj, if discernible; otherwise, it defaults to the empty string,
76 and in this case the original filename is not included in the header.
78 The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
79 depending on whether the file will be read or written. The default
80 is the mode of fileobj if discernible; otherwise, the default is 'rb'.
81 Be aware that only the 'rb', 'ab', and 'wb' values should be used
82 for cross-platform portability.
84 The compresslevel argument is an integer from 1 to 9 controlling the
85 level of compression; 1 is fastest and produces the least compression,
86 and 9 is slowest and produces the most compression. The default is 9.
90 # guarantee the file is opened in binary mode on platforms
91 # that care about that sort of thing
92 if mode
and 'b' not in mode
:
95 fileobj
= self
.myfileobj
= __builtin__
.open(filename
, mode
or 'rb')
97 if hasattr(fileobj
, 'name'): filename
= fileobj
.name
100 if hasattr(fileobj
, 'mode'): mode
= fileobj
.mode
105 # Set flag indicating start of a new member
106 self
._new
_member
= True
109 self
.filename
= filename
110 # Starts small, scales exponentially
111 self
.min_readsize
= 100
113 elif mode
[0:1] == 'w' or mode
[0:1] == 'a':
115 self
._init
_write
(filename
)
116 self
.compress
= zlib
.compressobj(compresslevel
,
122 raise IOError, "Mode " + mode
+ " not supported"
124 self
.fileobj
= fileobj
127 if self
.mode
== WRITE
:
128 self
._write
_gzip
_header
()
131 s
= repr(self
.fileobj
)
132 return '<gzip ' + s
[1:-1] + ' ' + hex(id(self
)) + '>'
134 def _init_write(self
, filename
):
135 if filename
[-3:] != '.gz':
136 filename
= filename
+ '.gz'
137 self
.filename
= filename
138 self
.crc
= zlib
.crc32("")
143 def _write_gzip_header(self
):
144 self
.fileobj
.write('\037\213') # magic header
145 self
.fileobj
.write('\010') # compression method
146 fname
= self
.filename
[:-3]
150 self
.fileobj
.write(chr(flags
))
151 write32u(self
.fileobj
, long(time
.time()))
152 self
.fileobj
.write('\002')
153 self
.fileobj
.write('\377')
155 self
.fileobj
.write(fname
+ '\000')
157 def _init_read(self
):
158 self
.crc
= zlib
.crc32("")
161 def _read_gzip_header(self
):
162 magic
= self
.fileobj
.read(2)
163 if magic
!= '\037\213':
164 raise IOError, 'Not a gzipped file'
165 method
= ord( self
.fileobj
.read(1) )
167 raise IOError, 'Unknown compression method'
168 flag
= ord( self
.fileobj
.read(1) )
169 # modtime = self.fileobj.read(4)
170 # extraflag = self.fileobj.read(1)
171 # os = self.fileobj.read(1)
175 # Read & discard the extra field, if present
176 xlen
= ord(self
.fileobj
.read(1))
177 xlen
= xlen
+ 256*ord(self
.fileobj
.read(1))
178 self
.fileobj
.read(xlen
)
180 # Read and discard a null-terminated string containing the filename
182 s
= self
.fileobj
.read(1)
183 if not s
or s
=='\000':
186 # Read and discard a null-terminated string containing a comment
188 s
= self
.fileobj
.read(1)
189 if not s
or s
=='\000':
192 self
.fileobj
.read(2) # Read & discard the 16-bit header CRC
195 def write(self
,data
):
196 if self
.mode
!= WRITE
:
198 raise IOError(errno
.EBADF
, "write() on read-only GzipFile object")
200 if self
.fileobj
is None:
201 raise ValueError, "write() on closed GzipFile object"
203 self
.size
= self
.size
+ len(data
)
204 self
.crc
= zlib
.crc32(data
, self
.crc
)
205 self
.fileobj
.write( self
.compress
.compress(data
) )
206 self
.offset
+= len(data
)
208 def read(self
, size
=-1):
209 if self
.mode
!= READ
:
211 raise IOError(errno
.EBADF
, "read() on write-only GzipFile object")
213 if self
.extrasize
<= 0 and self
.fileobj
is None:
217 if size
< 0: # get the whole thing
221 readsize
= min(self
.max_read_chunk
, readsize
* 2)
223 size
= self
.extrasize
224 else: # just get some more of it
226 while size
> self
.extrasize
:
228 readsize
= min(self
.max_read_chunk
, readsize
* 2)
230 if size
> self
.extrasize
:
231 size
= self
.extrasize
233 chunk
= self
.extrabuf
[:size
]
234 self
.extrabuf
= self
.extrabuf
[size
:]
235 self
.extrasize
= self
.extrasize
- size
240 def _unread(self
, buf
):
241 self
.extrabuf
= buf
+ self
.extrabuf
242 self
.extrasize
= len(buf
) + self
.extrasize
243 self
.offset
-= len(buf
)
245 def _read(self
, size
=1024):
246 if self
.fileobj
is None:
247 raise EOFError, "Reached EOF"
250 # If the _new_member flag is set, we have to
251 # jump to the next member, if there is one.
253 # First, check if we're at the end of the file;
254 # if so, it's time to stop; no more members to read.
255 pos
= self
.fileobj
.tell() # Save current position
256 self
.fileobj
.seek(0, 2) # Seek to end of file
257 if pos
== self
.fileobj
.tell():
258 raise EOFError, "Reached EOF"
260 self
.fileobj
.seek( pos
) # Return to original position
263 self
._read
_gzip
_header
()
264 self
.decompress
= zlib
.decompressobj(-zlib
.MAX_WBITS
)
265 self
._new
_member
= False
267 # Read a chunk of data from the file
268 buf
= self
.fileobj
.read(size
)
270 # If the EOF has been reached, flush the decompression object
271 # and mark this object as finished.
274 uncompress
= self
.decompress
.flush()
276 self
._add
_read
_data
( uncompress
)
277 raise EOFError, 'Reached EOF'
279 uncompress
= self
.decompress
.decompress(buf
)
280 self
._add
_read
_data
( uncompress
)
282 if self
.decompress
.unused_data
!= "":
283 # Ending case: we've come to the end of a member in the file,
284 # so seek back to the start of the unused data, finish up
285 # this member, and read a new gzip header.
286 # (The number of bytes to seek back is the length of the unused
287 # data, minus 8 because _read_eof() will rewind a further 8 bytes)
288 self
.fileobj
.seek( -len(self
.decompress
.unused_data
)+8, 1)
290 # Check the CRC and file size, and set the flag so we read
291 # a new member on the next call
293 self
._new
_member
= True
295 def _add_read_data(self
, data
):
296 self
.crc
= zlib
.crc32(data
, self
.crc
)
297 self
.extrabuf
= self
.extrabuf
+ data
298 self
.extrasize
= self
.extrasize
+ len(data
)
299 self
.size
= self
.size
+ len(data
)
302 # We've read to the end of the file, so we have to rewind in order
303 # to reread the 8 bytes containing the CRC and the file size.
304 # We check that the computed CRC and size of the
305 # uncompressed data matches the stored values. Note that the size
306 # stored is the true file size mod 2**32.
307 self
.fileobj
.seek(-8, 1)
308 crc32
= read32(self
.fileobj
)
309 isize
= U32(read32(self
.fileobj
)) # may exceed 2GB
310 if U32(crc32
) != U32(self
.crc
):
311 raise IOError, "CRC check failed"
312 elif isize
!= LOWU32(self
.size
):
313 raise IOError, "Incorrect length of data produced"
316 if self
.mode
== WRITE
:
317 self
.fileobj
.write(self
.compress
.flush())
318 # The native zlib crc is an unsigned 32-bit integer, but
319 # the Python wrapper implicitly casts that to a signed C
320 # long. So, on a 32-bit box self.crc may "look negative",
321 # while the same crc on a 64-bit box may "look positive".
322 # To avoid irksome warnings from the `struct` module, force
323 # it to look positive on all boxes.
324 write32u(self
.fileobj
, LOWU32(self
.crc
))
325 # self.size may exceed 2GB, or even 4GB
326 write32u(self
.fileobj
, LOWU32(self
.size
))
328 elif self
.mode
== READ
:
331 self
.myfileobj
.close()
332 self
.myfileobj
= None
336 if (self
.myfileobj
is None and
337 self
.fileobj
is None):
339 except AttributeError:
343 def flush(self
,zlib_mode
=zlib
.Z_SYNC_FLUSH
):
344 if self
.mode
== WRITE
:
345 # Ensure the compressor's buffer is flushed
346 self
.fileobj
.write(self
.compress
.flush(zlib_mode
))
350 """Invoke the underlying file object's fileno() method.
352 This will raise AttributeError if the underlying file object
353 doesn't support fileno().
355 return self
.fileobj
.fileno()
364 '''Return the uncompressed stream file position indicator to the
365 beginning of the file'''
366 if self
.mode
!= READ
:
367 raise IOError("Can't rewind in write mode")
369 self
._new
_member
= True
374 def seek(self
, offset
):
375 if self
.mode
== WRITE
:
376 if offset
< self
.offset
:
377 raise IOError('Negative seek in write mode')
378 count
= offset
- self
.offset
379 for i
in range(count
// 1024):
380 self
.write(1024 * '\0')
381 self
.write((count
% 1024) * '\0')
382 elif self
.mode
== READ
:
383 if offset
< self
.offset
:
384 # for negative seek, rewind and do positive seek
386 count
= offset
- self
.offset
387 for i
in range(count
// 1024):
389 self
.read(count
% 1024)
391 def readline(self
, size
=-1):
394 readsize
= self
.min_readsize
399 c
= self
.read(readsize
)
402 # We set i=size to break out of the loop under two
403 # conditions: 1) there's no newline, and the chunk is
404 # larger than size, or 2) there is a newline, but the
405 # resulting line would be longer than 'size'.
406 if (size
<= i
) or (i
== -1 and len(c
) > size
):
409 if i
>= 0 or c
== '':
410 bufs
.append(c
[:i
+ 1]) # Add portion of last chunk
411 self
._unread
(c
[i
+ 1:]) # Push back rest of chunk
414 # Append chunk to list, decrease 'size',
417 readsize
= min(size
, readsize
* 2)
418 if readsize
> self
.min_readsize
:
419 self
.min_readsize
= min(readsize
, self
.min_readsize
* 2, 512)
420 return ''.join(bufs
) # Return resulting line
422 def readlines(self
, sizehint
=0):
423 # Negative numbers result in reading all the lines
425 sizehint
= sys
.maxint
428 line
= self
.readline()
432 sizehint
= sizehint
- len(line
)
436 def writelines(self
, L
):
444 line
= self
.readline()
452 # Act like gzip; with -d, act like gunzip.
453 # The input file is not deleted, however, nor are any other gzip
454 # options or features supported.
456 decompress
= args
and args
[0] == "-d"
464 f
= GzipFile(filename
="", mode
="rb", fileobj
=sys
.stdin
)
467 if arg
[-3:] != ".gz":
468 print "filename doesn't end in .gz:", repr(arg
)
471 g
= __builtin__
.open(arg
[:-3], "wb")
475 g
= GzipFile(filename
="", mode
="wb", fileobj
=sys
.stdout
)
477 f
= __builtin__
.open(arg
, "rb")
478 g
= open(arg
+ ".gz", "wb")
484 if g
is not sys
.stdout
:
486 if f
is not sys
.stdin
:
489 if __name__
== '__main__':