[Bug #1618083] Add missing word; make a few grammar fixes
[pytest.git] / Lib / gzip.py
blob0bf29e86bb5dcc4d521e21eb5bc891c91fb0a729
1 """Functions that read and write gzipped files.
3 The user of the file doesn't have to worry about the compression,
4 but random access is not allowed."""
6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
8 import struct, sys, time
9 import zlib
10 import __builtin__
12 __all__ = ["GzipFile","open"]
# Bit masks tested against the flag byte of a gzip member header.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ = 1
WRITE = 2
def U32(i):
    """Return i reinterpreted as an unsigned 32-bit quantity.

    Non-negative values are returned unchanged; a negative value (a 32-bit
    pattern that was read as signed) is shifted up by 2**32.
    """
    if i >= 0:
        return i
    return i + (1 << 32)
def LOWU32(i):
    """Keep only the 32 least-significant bits of i (result is never negative)."""
    mask = (1 << 32) - 1
    return i & mask
def write32(output, value):
    """Write value to output as a 4-byte little-endian signed integer."""
    packed = struct.pack("<l", value)
    output.write(packed)
def write32u(output, value):
    """Write value to output as a 4-byte little-endian unsigned integer."""
    # "<L" is the unsigned counterpart of the "<l" format used by write32().
    packed = struct.pack("<L", value)
    output.write(packed)
def read32(input):
    """Read 4 bytes from input and decode them as a little-endian signed int."""
    raw = input.read(4)
    (value,) = struct.unpack("<l", raw)
    return value
def open(filename, mode="rb", compresslevel=9):
    """Convenience wrapper that builds a GzipFile for filename.

    filename must be supplied. mode falls back to 'rb' and compresslevel
    falls back to 9 when they are omitted.

    """
    return GzipFile(filename=filename, mode=mode, compresslevel=compresslevel)
class GzipFile:
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    # Set only when the constructor opened the underlying file itself, so
    # that close() knows it is responsible for closing it.
    myfileobj = None
    # Upper bound on the number of compressed bytes read() requests per step.
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        Raises IOError when mode does not start with 'r', 'w' or 'a'.

        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name'):
                filename = fileobj.name
            else:
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'):
                mode = fileobj.mode
            else:
                mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Decompressed data not yet handed to the caller, and its length
            self.extrabuf = ""
            self.extrasize = 0
            self.filename = filename
            # Starts small, scales exponentially
            self.min_readsize = 100

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Negative window bits ask zlib for a raw deflate stream; the
            # gzip header and trailer are written by this class instead.
            # The final argument is the strategy (0 is zlib's default).
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        # Current position in the uncompressed stream, as reported by tell()
        self.offset = 0

        if self.mode == WRITE:
            self._write_gzip_header()

    def __repr__(self):
        """Return '<gzip ...>' built from the underlying file object's repr."""
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the write-side state; self.filename always ends in '.gz'."""
        if filename[-3:] != '.gz':
            filename = filename + '.gz'
        self.filename = filename
        self.crc = zlib.crc32("")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        # _init_write() guarantees a trailing '.gz'; strip it to recover
        # the name recorded in the header (may be empty).
        fname = self.filename[:-3]
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        write32u(self.fileobj, long(time.time()))  # modification time
        self.fileobj.write('\002')                 # extra flags byte
        self.fileobj.write('\377')                 # OS byte
        if fname:
            self.fileobj.write(fname + '\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self.crc = zlib.crc32("")
        self.size = 0

    def _read_gzip_header(self):
        """Consume one gzip member header from the underlying file object.

        Raises IOError if the magic number or compression method is wrong.
        All optional header fields are read and discarded.
        """
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError('Not a gzipped file')
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord( self.fileobj.read(1) )
        # Skip modtime (4 bytes), extraflag (1 byte) and os (1 byte)
        self.fileobj.read(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self,data):
        """Compress data and write it to the underlying file object.

        Raises IOError(EBADF) on an object opened for reading and
        ValueError on a closed object.
        """
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")
        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc)
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

    def read(self, size=-1):
        """Return up to size bytes of uncompressed data (all of it if size < 0).

        Raises IOError(EBADF) on an object opened for writing.
        """
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        chunk = self.extrabuf[:size]
        self.extrabuf = self.extrabuf[size:]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        """Push buf back onto the front of the pending decompressed data."""
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read up to size compressed bytes and buffer what they decompress to.

        Raises EOFError when there is nothing left to read.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek( pos ) # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed data to the buffer and update CRC and size."""
        self.crc = zlib.crc32(data, self.crc)
        self.extrabuf = self.extrabuf + data
        self.extrasize = self.extrasize + len(data)
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the CRC and size stored in the 8-byte member trailer.

        Raises IOError when either value disagrees with what was computed
        while decompressing.
        """
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data match the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = U32(read32(self.fileobj))   # may exceed 2GB
        if U32(crc32) != U32(self.crc):
            raise IOError("CRC check failed")
        elif isize != LOWU32(self.size):
            raise IOError("Incorrect length of data produced")

    def close(self):
        """Finish the stream and release the underlying file object.

        In write mode the remaining compressed data and the CRC/size
        trailer are written first.  A file opened by the constructor is
        closed; a caller-supplied fileobj is left open.  Calling close()
        again on an already-closed object does nothing.
        """
        try:
            # Skip the trailer when already closed: fileobj is None then and
            # writing to it would raise AttributeError.
            if self.mode == WRITE and self.fileobj is not None:
                self.fileobj.write(self.compress.flush())
                # The native zlib crc is an unsigned 32-bit integer, but
                # the Python wrapper implicitly casts that to a signed C
                # long.  So, on a 32-bit box self.crc may "look negative",
                # while the same crc on a 64-bit box may "look positive".
                # To avoid irksome warnings from the `struct` module, force
                # it to look positive on all boxes.
                write32u(self.fileobj, LOWU32(self.crc))
                # self.size may exceed 2GB, or even 4GB
                write32u(self.fileobj, LOWU32(self.size))
        finally:
            # Always drop the references, even if writing the trailer
            # failed, so a file we opened ourselves is not leaked.
            self.fileobj = None
            if self.myfileobj:
                self.myfileobj.close()
                self.myfileobj = None

    def __del__(self):
        """Close the object on garbage collection if it is still open."""
        try:
            if (self.myfileobj is None and
                self.fileobj is None):
                return
        except AttributeError:
            # __init__ did not get far enough to set fileobj
            return
        self.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush the underlying file object.

        In write mode the compressor's pending output is written first,
        using zlib_mode as the zlib flush mode.
        """
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
        self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def isatty(self):
        """Always return False."""
        return False

    def tell(self):
        """Return the current position in the uncompressed stream."""
        return self.offset

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = ""
        self.extrasize = 0
        self.offset = 0

    def seek(self, offset):
        """Move to the absolute position offset in the uncompressed stream.

        In write mode the gap is filled by writing zero bytes, and seeking
        backwards raises IOError.  In read mode a backward seek rewinds and
        reads forward again from the start.
        """
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            for i in range(count // 1024):
                self.write(1024 * '\0')
            self.write((count % 1024) * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

    def readline(self, size=-1):
        """Return the next line, including its newline if one was found.

        At most size bytes are returned when size is non-negative.
        """
        if size < 0:
            size = sys.maxint
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find('\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == '':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        # Remember a larger starting read size (capped at 512) for later calls
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return ''.join(bufs) # Return resulting line

    def readlines(self, sizehint=0):
        """Return a list of lines, stopping once about sizehint bytes are read.

        A sizehint of zero or less reads every remaining line.
        """
        # Negative numbers result in reading all the lines
        if sizehint <= 0:
            sizehint = sys.maxint
        lines = []
        while sizehint > 0:
            line = self.readline()
            if line == "":
                break
            lines.append(line)
            sizehint = sizehint - len(line)

        return lines

    def writelines(self, L):
        """Write every item of the iterable L with write()."""
        for line in L:
            self.write(line)

    def __iter__(self):
        """Return self; iteration yields lines via next()."""
        return self

    def next(self):
        """Return the next line, raising StopIteration at end of data."""
        line = self.readline()
        if line:
            return line
        else:
            raise StopIteration
def _test():
    """Tiny command-line driver: compress the named files, or decompress
    them when the first argument is -d.

    A file name of "-" (also the default when no names are given) means
    stdin/stdout.  The input file is not deleted, nor are any other gzip
    options or features supported.
    """
    names = sys.argv[1:]
    decompress = names and names[0] == "-d"
    if decompress:
        names = names[1:]
    if not names:
        names = ["-"]
    for name in names:
        use_std_streams = (name == "-")
        if decompress:
            if use_std_streams:
                src = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
                dst = sys.stdout
            elif name[-3:] != ".gz":
                print("%s %s" % ("filename doesn't end in .gz:", repr(name)))
                continue
            else:
                src = open(name, "rb")
                dst = __builtin__.open(name[:-3], "wb")
        elif use_std_streams:
            src = sys.stdin
            dst = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
        else:
            src = __builtin__.open(name, "rb")
            dst = open(name + ".gz", "wb")
        # Copy in 1024-byte pieces until the source is exhausted
        chunk = src.read(1024)
        while chunk:
            dst.write(chunk)
            chunk = src.read(1024)
        # Never close the process-wide standard streams themselves
        if dst is not sys.stdout:
            dst.close()
        if src is not sys.stdin:
            src.close()
# Run the command-line driver only when this file is executed as a script.
if __name__ == '__main__':
    _test()