3 # Copyright (c) 2005-2010 Marko Kreen <markokr@gmail.com>
5 # Permission to use, copy, modify, and/or distribute this software for any
6 # purpose with or without fee is hereby granted, provided that the above
7 # copyright notice and this permission notice appear in all copies.
9 # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 """RAR archive reader.
This is a Python module for reading RAR archives. The interface
is made as `zipfile`-like as possible.
22 The archive structure parsing and uncompressed files
23 are handled in pure Python. Decompression is done
24 via 'unrar' command line utility.
28 - Works with both Python 2.x and 3.x
29 - Supports RAR 3.x archives.
30 - Supports multi volume archives.
31 - Supports Unicode filenames.
32 - Supports password-protected archives.
33 - Supports archive comments.
39 from struct
import pack
, unpack
40 from binascii
import crc32
41 from tempfile
import mkstemp
42 from subprocess
import Popen
, PIPE
44 # py2.6 has broken bytes()
45 if sys
.hexversion
< 0x3000000:
# export only interesting items
__all__ = ['is_rarfile', 'RarInfo', 'RarFile']

# default fallback charset
# (used by RarFile when no charset argument is given)
DEFAULT_CHARSET = "windows-1252"

# 'unrar', 'rar' or full path to either one
# (becomes the first element of the extraction command line)
EXTRACT_TOOL = "unrar"
58 # Must be 'rar', because 'unrar' does not have 'cw' command.
59 # Can be full path, or None to disable comment extraction
# command line args to use for extracting. (rar, file) will be added.
# Appended right after EXTRACT_TOOL when the command is built.
EXTRACT_ARGS = ('p', '-inul')

# how to extract comment from archive. (rar, tmpfile) will be added.
# Appended right after COMMENT_TOOL when the command is built.
COMMENT_ARGS = ('cw', '-y', '-inul', '-p-')
68 # whether to speed up decompression by using tmp archive
# Block type ids, compared against the 'type' field of a parsed header.
# The trailing comment on each line is the ASCII character of the value.
RAR_BLOCK_MARK = 0x72 # r
RAR_BLOCK_MAIN = 0x73 # s
RAR_BLOCK_FILE = 0x74 # t
RAR_BLOCK_OLD_COMMENT = 0x75 # u
RAR_BLOCK_OLD_EXTRA = 0x76 # v
RAR_BLOCK_OLD_SUB = 0x77 # w
RAR_BLOCK_OLD_RECOVERY = 0x78 # x
RAR_BLOCK_OLD_AUTH = 0x79 # y
RAR_BLOCK_SUB = 0x7a # z
RAR_BLOCK_ENDARC = 0x7b # {
# Flag bits tested against the 'flags' field of a RAR_BLOCK_MAIN header.
RAR_MAIN_VOLUME = 0x0001
RAR_MAIN_COMMENT = 0x0002
RAR_MAIN_LOCK = 0x0004
RAR_MAIN_SOLID = 0x0008
RAR_MAIN_NEWNUMBERING = 0x0010
RAR_MAIN_AUTH = 0x0020
RAR_MAIN_RECOVERY = 0x0040
RAR_MAIN_PASSWORD = 0x0080
RAR_MAIN_FIRSTVOLUME = 0x0100
RAR_MAIN_ENCRYPTVER = 0x0200
# Flag bits tested against the 'flags' field of file entries.
RAR_FILE_SPLIT_BEFORE = 0x0001
RAR_FILE_SPLIT_AFTER = 0x0002
RAR_FILE_PASSWORD = 0x0004
RAR_FILE_COMMENT = 0x0008
RAR_FILE_SOLID = 0x0010
# The RAR_FILE_DICT* values and RAR_FILE_DIRECTORY all lie inside this
# three-bit mask, so they are values of one field, not independent bits.
RAR_FILE_DICTMASK = 0x00e0
RAR_FILE_DICT64 = 0x0000
RAR_FILE_DICT128 = 0x0020
RAR_FILE_DICT256 = 0x0040
RAR_FILE_DICT512 = 0x0060
RAR_FILE_DICT1024 = 0x0080
RAR_FILE_DICT2048 = 0x00a0
RAR_FILE_DICT4096 = 0x00c0
# Same value as the mask: an entry is a directory when all three bits are set.
RAR_FILE_DIRECTORY = 0x00e0
RAR_FILE_LARGE = 0x0100
RAR_FILE_UNICODE = 0x0200
RAR_FILE_SALT = 0x0400
RAR_FILE_VERSION = 0x0800
RAR_FILE_EXTTIME = 0x1000
RAR_FILE_EXTFLAGS = 0x2000
# Flag bits tested against the 'flags' field of a RAR_BLOCK_ENDARC header.
RAR_ENDARC_NEXT_VOLUME = 0x0001
RAR_ENDARC_DATACRC = 0x0002
RAR_ENDARC_REVSPACE = 0x0004

# flags common to all blocks
RAR_SKIP_IF_UNKNOWN = 0x4000
# When set, the first 4 bytes of the header data are read as add_size.
RAR_LONG_BLOCK = 0x8000
# internal byte constants
# RAR_ID: its length decides how many leading bytes of a file are read
# when checking for an archive; it is also written at the start of the
# temporary archive built for extraction.
RAR_ID = bytes("Rar!\x1a\x07\x00", 'ascii')
# ZERO: the separator searched for inside unicode-flagged file names.
ZERO = bytes("\0", 'ascii')
# EMPTY: header data of a block whose header has no extra bytes.
EMPTY = bytes("", 'ascii')
class Error(Exception):
    """Base class for rarfile errors."""
class BadRarFile(Error):
    """Incorrect data in archive (failed read, short data or CRC mismatch)."""
class NotRarFile(Error):
    """The file is not a RAR archive."""
class BadRarName(Error):
    """Cannot guess multipart name components, so no volume name can be built."""
class NoRarEntry(Error):
    """File not found in the RAR archive."""
class PasswordRequired(Error):
    """File requires a password, and none was supplied."""
class NeedFirstVolume(Error):
    """Need to start from first volume of a multi-volume archive."""
162 '''Check quickly whether file is rar archive.'''
163 buf
= open(fn
, "rb").read(len(RAR_ID
))
167 '''An entry in rar archive.
170 File name with relative path.
171 Note that Rar uses "\" as directory separator.
172 Always unicode string.
174 Modification time, tuple of (year, month, day, hour, minute, second).
180 Compression method: 0x30 - 0x35.
181 @ivar extract_version:
182 Minimal Rar version needed for decompressing.
184 Host OS type, one of RAR_OS_* constants.
186 File attributes. May be either dos-style or unix-style, depending on host_os.
188 CRC-32 of uncompressed file, unsigned int.
190 Volume nr, starting from 0.
192 One of RAR_BLOCK_* types. Only entries with type==RAR_BLOCK_FILE are shown in .infolist().
194 For files, RAR_FILE_* bits.
196 Byte string of non-unicode representation.
199 Optional time field: Modification time, tuple of (year, month, day, hour, minute, second).
201 Optional time field: ctime time.
203 Optional time field: access time.
205 Optional time field: archival time.
223 # optional extended time fields
224 # same format as date_time, but sec is float
246 '''Returns True if the entry is a directory.'''
247 if self
.type == RAR_BLOCK_FILE
:
248 return (self
.flags
& RAR_FILE_DIRECTORY
) == RAR_FILE_DIRECTORY
def needs_password(self):
    '''Returns True if the entry is flagged as password-protected.

    The RAR_FILE_PASSWORD bit of self.flags is tested.  A real boolean is
    returned rather than the raw masked integer, matching isdir() and the
    archive-level needs_password(); truthiness is unchanged for callers
    that only use the result in a condition.
    '''
    return (self.flags & RAR_FILE_PASSWORD) != 0
255 '''Rar archive handling.'''
256 def __init__(self
, rarfile
, mode
="r", charset
=None, info_callback
=None, crc_check
= True):
257 """Open and parse a RAR archive.
259 @param rarfile: archive file name
260 @param mode: only 'r' is supported.
261 @param charset: fallback charset to use, if filenames are not already Unicode-enabled.
262 @param info_callback: debug callback, gets to see all archive entries.
263 @param crc_check: set to False to disable CRC checks
265 self
.rarfile
= rarfile
267 self
._charset
= charset
or DEFAULT_CHARSET
268 self
._info
_callback
= info_callback
271 self
._gen
_volname
= self
._gen
_oldvol
272 self
._needs
_password
= False
273 self
._password
= None
274 self
._crc
_check
= crc_check
279 raise NotImplementedError("RarFile supports only mode=r")
283 if self
._main
.flags
& RAR_MAIN_COMMENT
:
def setpassword(self, password):
    '''Sets the password to use when extracting.

    The value is stored on the instance; open() falls back to it when
    called without an explicit psw argument.
    '''
    self._password = password
def needs_password(self):
    '''Returns True if any archive entries require password for extraction.'''
    # The flag starts out False in __init__ and is set by _process_entry
    # as soon as one listed entry reports needs_password().
    return self._needs_password
295 '''Return list of filenames in archive.'''
297 for f
in self
._info
_list
:
298 res
.append(f
.filename
)
302 '''Return RarInfo objects for all files/directories in archive.'''
303 return self
._info
_list
305 def getinfo(self
, fname
):
306 '''Return RarInfo for file.'''
307 fname2
= fname
.replace("/", "\\")
308 for f
in self
._info
_list
:
309 if fname
== f
.filename
or fname2
== f
.filename
:
312 def open(self
, fname
, psw
= None):
313 '''Return open file object, where the data can be read.
315 The object has only .read() and .close() methods.
317 inf
= self
.getinfo(fname
)
319 raise NoRarEntry("No such file")
322 raise TypeError("Directory does not have any data")
323 if inf
.needs_password():
324 psw
= psw
or self
._password
326 raise PasswordRequired("File %s requires password" % fname
)
330 is_solid
= self
._main
.flags
& RAR_MAIN_SOLID
331 uses_vols
= self
._main
.flags
& RAR_MAIN_VOLUME
332 if inf
.compress_type
== 0x30 and psw
is None:
333 return self
._open
_clear
(inf
)
334 elif USE_EXTRACT_HACK
and not is_solid
and not uses_vols
:
335 return self
._open
_hack
(inf
, psw
)
337 return self
._open
_unrar
(self
.rarfile
, inf
, psw
)
339 def read(self
, fname
, psw
= None):
340 """Return uncompressed data for archive entry.
342 For longer files using .open() may be better idea.
345 f
= self
.open(fname
, psw
)
351 """Release open resources."""
355 """Print archive file list to stdout."""
356 for f
in self
._info
_list
:
360 def _process_entry(self
, item
):
361 # RAR_BLOCK_NEWSUB has files too: CMT, RR
362 if item
.type == RAR_BLOCK_FILE
:
363 # use only first part
364 if (item
.flags
& RAR_FILE_SPLIT_BEFORE
) == 0:
365 self
._info
_list
.append(item
)
366 # remember if any items require password
367 if item
.needs_password():
368 self
._needs
_password
= True
369 elif len(self
._info
_list
) > 0:
370 # final crc is in last block
371 old
= self
._info
_list
[-1]
374 if self
._info
_callback
:
375 self
._info
_callback
(item
)
379 fd
= open(self
.rarfile
, "rb")
380 id = fd
.read(len(RAR_ID
))
382 raise NotRarFile("Not a Rar archive")
384 volume
= 0 # first vol (.rar) is 0
387 h
= self
._parse
_header
(fd
)
391 fd
= open(self
._gen
_volname
(volume
), "rb")
398 if h
.type == RAR_BLOCK_MAIN
and not self
._main
:
400 if h
.flags
& RAR_MAIN_VOLUME
:
401 if not h
.flags
& RAR_MAIN_FIRSTVOLUME
:
402 raise NeedFirstVolume("Need to start from first volume")
403 if h
.flags
& RAR_MAIN_NEWNUMBERING
:
404 self
._gen
_volname
= self
._gen
_newvol
405 elif h
.type == RAR_BLOCK_ENDARC
:
406 more_vols
= h
.flags
& RAR_ENDARC_NEXT_VOLUME
409 self
._process
_entry
(h
)
413 fd
.seek(h
.file_offset
+ h
.add_size
, 0)
417 def _parse_header(self
, fd
):
418 h
= self
._parse
_block
_header
(fd
)
419 if h
and (h
.type == RAR_BLOCK_FILE
or h
.type == RAR_BLOCK_SUB
):
420 self
._parse
_file
_header
(h
)
424 def _parse_block_header(self
, fd
):
427 h
.header_offset
= fd
.tell()
428 buf
= fd
.read(HDRLEN
)
432 t
= unpack("<HBHH", buf
)
433 h
.header_crc
, h
.type, h
.flags
, h
.header_size
= t
434 h
.header_unknown
= h
.header_size
- HDRLEN
436 if h
.header_size
> HDRLEN
:
437 h
.header_data
= fd
.read(h
.header_size
- HDRLEN
)
439 h
.header_data
= EMPTY
440 h
.file_offset
= fd
.tell()
442 if h
.flags
& RAR_LONG_BLOCK
:
443 h
.add_size
= unpack("<L", h
.header_data
[:4])[0]
447 # no crc check on that
448 if h
.type == RAR_BLOCK_MARK
:
452 if h
.type == RAR_BLOCK_MAIN
:
453 crcdat
= buf
[2:] + h
.header_data
[:6]
454 elif h
.type == RAR_BLOCK_OLD_AUTH
:
455 crcdat
= buf
[2:] + h
.header_data
[:8]
456 elif h
.type == RAR_BLOCK_OLD_SUB
:
457 crcdat
= buf
[2:] + h
.header_data
+ fd
.read(h
.add_size
)
459 crcdat
= buf
[2:] + h
.header_data
461 calc_crc
= crc32(crcdat
) & 0xFFFF
464 if h
.header_crc
== calc_crc
:
# instead of panicking, send eof
470 # read file-specific header
471 def _parse_file_header(self
, h
):
472 HDRLEN
= 4+4+1+4+4+1+1+2+4
473 fld
= unpack("<LLBLLBBHL", h
.header_data
[ : HDRLEN
])
474 h
.compress_size
= fld
[0]
478 h
.date_time
= self
._parse
_dos
_time
(fld
[4])
479 h
.extract_version
= fld
[5]
480 h
.compress_type
= fld
[6]
485 if h
.flags
& RAR_FILE_LARGE
:
486 h1
, h2
= unpack("<LL", h
.header_data
[pos
:pos
+8])
487 h
.compress_size |
= h1
<< 32
488 h
.file_size |
= h2
<< 32
491 name
= h
.header_data
[pos
: pos
+ h
.name_size
]
493 if h
.flags
& RAR_FILE_UNICODE
:
494 nul
= name
.find(ZERO
)
495 h
.orig_filename
= name
[:nul
]
496 u
= _UnicodeFilename(h
.orig_filename
, name
[nul
+ 1 : ])
497 h
.unicode_filename
= u
.decode()
499 h
.orig_filename
= name
500 h
.unicode_filename
= name
.decode(self
._charset
, "replace")
502 h
.filename
= h
.unicode_filename
504 if h
.flags
& RAR_FILE_SALT
:
505 h
.salt
= h
.header_data
[pos
: pos
+ 8]
510 # optional extended time stamps
511 if h
.flags
& RAR_FILE_EXTTIME
:
512 pos
= self
._parse
_ext
_time
(h
, pos
)
514 h
.mtime
= h
.atime
= h
.ctime
= h
.arctime
= None
517 h
.header_unknown
-= pos
521 def _parse_dos_time(self
, stamp
):
522 sec
= stamp
& 0x1F; stamp
= stamp
>> 5
523 min = stamp
& 0x3F; stamp
= stamp
>> 6
524 hr
= stamp
& 0x1F; stamp
= stamp
>> 5
525 day
= stamp
& 0x1F; stamp
= stamp
>> 5
526 mon
= stamp
& 0x0F; stamp
= stamp
>> 4
527 yr
= (stamp
& 0x7F) + 1980
528 return (yr
, mon
, day
, hr
, min, sec
)
530 def _parse_ext_time(self
, h
, pos
):
532 flags
= unpack("<H", data
[pos
: pos
+ 2])[0]
534 h
.mtime
, pos
= self
._parse
_xtime
(flags
>> 3*4, data
, pos
, h
.date_time
)
535 h
.ctime
, pos
= self
._parse
_xtime
(flags
>> 2*4, data
, pos
)
536 h
.atime
, pos
= self
._parse
_xtime
(flags
>> 1*4, data
, pos
)
537 h
.arctime
, pos
= self
._parse
_xtime
(flags
>> 0*4, data
, pos
)
540 def _parse_xtime(self
, flag
, data
, pos
, dostime
= None):
541 unit
= 10000000.0 # 100 ns units
544 t
= unpack("<I", data
[pos
: pos
+ 4])[0]
545 dostime
= self
._parse
_dos
_time
(t
)
552 rem
+= unpack("B", data
[pos
: pos
+ 1])[0]
554 sec
= dostime
[5] + rem
/ unit
557 dostime
= dostime
[:5] + (sec
,)
560 # new-style volume name
561 def _gen_newvol(self
, volume
):
562 # allow % in filenames
563 fn
= self
.rarfile
.replace("%", "%%")
565 m
= re
.search(r
"([0-9][0-9]*)[^0-9]*$", fn
)
567 raise BadRarName("Cannot construct volume name")
570 fmt
= "%%0%dd" % (n2
- n1
)
571 volfmt
= fn
[:n1
] + fmt
+ fn
[n2
:]
572 return volfmt
% (volume
+ 1)
574 # old-style volume naming
575 def _gen_oldvol(self
, volume
):
578 # although 'rar' can generate them, it's unlikely they work well
580 raise BadRarName("Cannot construct volume name")
583 i
= self
.rarfile
.rfind(".")
585 base
= self
.rarfile
[:i
]
589 # generate new extension
590 d
, m
= divmod(volume
- 1, 100)
591 ext
= '.%c%02d' % (ord('r') + d
, m
)
def _open_clear(self, inf):
    # open() routes here for entries with compress_type 0x30 and no
    # password; the data is read straight from the archive file.
    return DirectReader(self, inf)
597 # put file compressed data into temporary .rar archive, and run
598 # unrar on that, thus avoiding unrar going over whole archive
599 def _open_hack(self
, inf
, psw
= None):
602 size
= inf
.compress_size
+ inf
.header_size
603 rf
= open(self
.rarfile
, "rb")
604 rf
.seek(inf
.header_offset
)
606 tmpfd
, tmpname
= mkstemp(suffix
='.rar')
607 tmpf
= os
.fdopen(tmpfd
, "wb")
610 # create main header: crc, type, flags, size, res1, res2
611 mh
= pack("<HBHHHL", 0x90CF, 0x73, 0, 13, 0, 0)
612 tmpf
.write(RAR_ID
+ mh
)
619 raise BadRarFile('read failed - broken archive')
627 return self
._open
_unrar
(tmpname
, inf
, psw
, tmpname
)
629 # extract using unrar
630 def _open_unrar(self
, rarfile
, inf
, psw
= None, tmpfile
= None):
631 cmd
= [EXTRACT_TOOL
] + list(EXTRACT_ARGS
)
633 cmd
.append("-p" + psw
)
636 # not giving filename avoids encoding related problems
639 fn
= fn
.replace('\\', os
.sep
)
642 # 3xPIPE seems unreliable, at least on osx
644 null
= open("/dev/null", "wb")
652 p
= Popen(cmd
, stdout
= PIPE
, stdin
= _in
, stderr
= _err
)
653 return PipeReader(self
, inf
, p
, tmpfile
)
655 def _read_comment(self
):
658 tmpfd
, tmpname
= mkstemp(suffix
='.txt')
660 cmd
= [COMMENT_TOOL
] + list(COMMENT_ARGS
)
661 cmd
.append(self
.rarfile
)
667 cmt
= os
.fdopen(tmpfd
, 'rb').read()
669 self
.comment
= cmt
.decode('utf8')
671 self
.comment
= cmt
.decode(self
._charset
, 'replace')
672 except (OSError, IOError):
677 # handle unicode filename compression
678 class _UnicodeFilename
:
679 def __init__(self
, name
, encdata
):
680 self
.std_name
= bytearray(name
)
681 self
.encdata
= bytearray(encdata
)
682 self
.pos
= self
.encpos
= 0
683 self
.buf
= bytearray()
686 c
= self
.encdata
[self
.encpos
]
691 return self
.std_name
[self
.pos
]
693 def put(self
, lo
, hi
):
701 while self
.encpos
< len(self
.encdata
):
703 flags
= self
.enc_byte()
706 t
= (flags
>> flagbits
) & 3
708 self
.put(self
.enc_byte(), 0)
710 self
.put(self
.enc_byte(), hi
)
712 self
.put(self
.enc_byte(), self
.enc_byte())
717 for i
in range((n
& 0x7f) + 2):
718 lo
= (self
.std_byte() + c
) & 0xFF
721 for i
in range(n
+ 2):
722 self
.put(self
.std_byte(), 0)
723 return self
.buf
.decode("utf-16le", "replace")
727 """Base class for 'file-like' object that RarFile.open() returns.
729 Provides public methods and common crc checking.
732 def __init__(self
, rf
, inf
, tempfile
= None):
735 self
.crc_check
= rf
._crc
_check
737 self
.remain
= inf
.file_size
738 self
.tempfile
= tempfile
741 def read(self
, cnt
= None):
742 """Read all or specified amount of data from archive entry."""
747 elif cnt
> self
.remain
:
753 data
= self
._read
(cnt
)
755 self
.CRC
= crc32(data
, self
.CRC
)
756 self
.remain
-= len(data
)
759 if not data
or self
.remain
== 0:
765 """Check final CRC."""
766 if not self
.crc_check
:
769 raise BadRarFile("Failed the read enough data")
772 crc
+= (long(1) << 32)
773 if crc
!= self
.inf
.CRC
:
774 raise BadRarFile("Corrupt file - CRC check failed")
776 def _read(self
, cnt
):
777 """Actual read that gets sanitized cnt."""
780 """Close open resources."""
786 os
.unlink(self
.tempfile
)
790 """Hook delete to make sure tempfile is removed."""
class PipeReader(BaseReader):
    """Read data from pipe, handle tempfile cleanup."""

    def __init__(self, rf, inf, proc, tempfile=None):
        # rf, inf and tempfile are handed to BaseReader unchanged.
        BaseReader.__init__(self, rf, inf, tempfile)
        # Only the child's stdout is kept; the process object itself
        # is not stored on the reader.
        self.fd = proc.stdout

    def _read(self, cnt):
        """Read from pipe."""
        return self.fd.read(cnt)
806 class DirectReader(BaseReader
):
807 """Read uncompressed data directly from archive."""
809 def __init__(self
, rf
, inf
):
810 BaseReader
.__init
__(self
, rf
, inf
)
811 self
.vol
= inf
.volume
812 self
.size
= inf
.file_size
814 self
.fd
= open(self
.rf
._gen
_volname
(self
.vol
), "rb")
815 self
.fd
.seek(self
.inf
.header_offset
, 0)
816 self
.cur
= self
.rf
._parse
_header
(self
.fd
)
817 self
.cur_avail
= self
.cur
.add_size
819 def _read(self
, cnt
):
820 """Read from potentially multi-volume archive."""
825 if self
.cur_avail
== 0:
826 if not self
._open
_next
():
829 # fd is in read pos, do the read
830 if cnt
> self
.cur_avail
:
831 data
= self
.fd
.read(self
.cur_avail
)
833 data
= self
.fd
.read(cnt
)
839 self
.cur_avail
-= len(data
)
844 def _open_next(self
):
845 """Proceed to next volume."""
847 # is the file split over archives?
848 if (self
.cur
.flags
& RAR_FILE_SPLIT_AFTER
) == 0:
853 fd
= open(self
.rf
._gen
_volname
(self
.vol
), "rb")
856 # loop until first file header
858 cur
= self
.rf
._parse
_header
(fd
)
860 raise BadRarFile("Unexpected EOF")
861 if cur
.type in (RAR_BLOCK_MARK
, RAR_BLOCK_MAIN
):
863 fd
.seek(cur
.add_size
, 1)
865 if cur
.orig_filename
!= self
.inf
.orig_filename
:
866 raise BadRarFile("Did not found file entry")
868 self
.cur_avail
= cur
.add_size
871 # see if compat bytearray() is needed
def __init__(self, val = ''):
    # Backing store: an array of unsigned bytes initialised from val.
    self.arr = array.array('B', val)
    # Re-export the array's own append, indexing and length operations
    # as attributes of this instance.
    self.append = self.arr.append
    self.__getitem__ = self.arr.__getitem__
    self.__len__ = self.arr.__len__
def decode(self, *args):
    # Serialise the array to a byte string, then decode it with whatever
    # arguments the caller passed (encoding and, optionally, error mode).
    return self.arr.tostring().decode(*args)