1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Package bzip2 implements bzip2 decompression.
10 // There's no RFC for bzip2. I used the Wikipedia page for reference and a lot
11 // of guessing: https://en.wikipedia.org/wiki/Bzip2
12 // The source code to pyflate was useful for debugging:
13 // http://www.paul.sladen.org/projects/pyflate
15 // A StructuralError is returned when the bzip2 data is found to be
16 // syntactically invalid.
17 type StructuralError
string
19 func (s StructuralError
) Error() string {
20 return "bzip2 data invalid: " + string(s
)
23 // A reader decompresses bzip2 compressed data.
29 setupDone
bool // true if we have parsed the bzip2 header.
30 blockSize
int // blockSize in bytes, i.e. 900 * 1000.
32 c
[256]uint // the `C' array for the inverse BWT.
33 tt
[]uint32 // mirrors the `tt' array in the bzip2 source and contains the P array in the upper 24 bits.
34 tPos
uint32 // Index of the next output byte in tt.
36 preRLE
[]uint32 // contains the RLE data still to be processed.
37 preRLEUsed
int // number of entries of preRLE used.
38 lastByte
int // the last byte value seen.
39 byteRepeats
uint // the number of repeats of lastByte seen.
40 repeats
uint // the number of copies of lastByte to output.
43 // NewReader returns an io.Reader which decompresses bzip2 data from r.
44 // If r does not also implement io.ByteReader,
45 // the decompressor may read more data than necessary from r.
46 func NewReader(r io
.Reader
) io
.Reader
{
48 bz2
.br
= newBitReader(r
)
52 const bzip2FileMagic
= 0x425a // "BZ"
53 const bzip2BlockMagic
= 0x314159265359
54 const bzip2FinalMagic
= 0x177245385090
56 // setup parses the bzip2 header.
57 func (bz2
*reader
) setup(needMagic
bool) error
{
61 magic
:= br
.ReadBits(16)
62 if magic
!= bzip2FileMagic
{
63 return StructuralError("bad magic value")
69 return StructuralError("non-Huffman entropy encoding")
72 level
:= br
.ReadBits(8)
73 if level
< '1' || level
> '9' {
74 return StructuralError("invalid compression level")
78 bz2
.blockSize
= 100 * 1000 * (level
- '0')
79 if bz2
.blockSize
> len(bz2
.tt
) {
80 bz2
.tt
= make([]uint32, bz2
.blockSize
)
85 func (bz2
*reader
) Read(buf
[]byte) (n
int, err error
) {
102 n
, err
= bz2
.read(buf
)
103 brErr
:= bz2
.br
.Err()
110 func (bz2
*reader
) readFromBlock(buf
[]byte) int {
111 // bzip2 is a block based compressor, except that it has a run-length
112 // preprocessing step. The block based nature means that we can
113 // preallocate fixed-size buffers and reuse them. However, the RLE
114 // preprocessing would require allocating huge buffers to store the
115 // maximum expansion. Thus we process blocks all at once, except for
116 // the RLE which we decompress as required.
118 for (bz2
.repeats
> 0 || bz2
.preRLEUsed
< len(bz2
.preRLE
)) && n
< len(buf
) {
119 // We have RLE data pending.
121 // The run-length encoding works like this:
122 // Any sequence of four equal bytes is followed by a length
123 // byte which contains the number of repeats of that byte to
124 // include. (The number of repeats can be zero.) Because we are
125 // decompressing on-demand our state is kept in the reader
129 buf
[n
] = byte(bz2
.lastByte
)
132 if bz2
.repeats
== 0 {
138 bz2
.tPos
= bz2
.preRLE
[bz2
.tPos
]
143 if bz2
.byteRepeats
== 3 {
144 bz2
.repeats
= uint(b
)
149 if bz2
.lastByte
== int(b
) {
154 bz2
.lastByte
= int(b
)
163 func (bz2
*reader
) read(buf
[]byte) (int, error
) {
165 n
:= bz2
.readFromBlock(buf
)
166 if n
> 0 ||
len(buf
) == 0 {
167 bz2
.blockCRC
= updateCRC(bz2
.blockCRC
, buf
[:n
])
171 // End of block. Check CRC.
172 if bz2
.blockCRC
!= bz2
.wantBlockCRC
{
173 bz2
.br
.err
= StructuralError("block checksum mismatch")
179 switch br
.ReadBits64(48) {
181 return 0, StructuralError("bad magic value found")
183 case bzip2BlockMagic
:
185 err
:= bz2
.readBlock()
190 case bzip2FinalMagic
:
191 // Check end-of-file CRC.
192 wantFileCRC
:= uint32(br
.ReadBits64(32))
196 if bz2
.fileCRC
!= wantFileCRC
{
197 br
.err
= StructuralError("file checksum mismatch")
201 // Skip ahead to byte boundary.
202 // Is there a file concatenated to this one?
203 // It would start with BZ.
205 br
.ReadBits(br
.bits
% 8)
207 b
, err
:= br
.r
.ReadByte()
217 z
, err
:= br
.r
.ReadByte()
220 err
= io
.ErrUnexpectedEOF
225 if b
!= 'B' || z
!= 'Z' {
226 return 0, StructuralError("bad magic value in continuation file")
228 if err
:= bz2
.setup(false); err
!= nil {
235 // readBlock reads a bzip2 block. The magic number should already have been consumed.
236 func (bz2
*reader
) readBlock() (err error
) {
238 bz2
.wantBlockCRC
= uint32(br
.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is.
240 bz2
.fileCRC
= (bz2
.fileCRC
<<1 | bz2
.fileCRC
>>31) ^ bz2
.wantBlockCRC
241 randomized
:= br
.ReadBits(1)
243 return StructuralError("deprecated randomized files")
245 origPtr
:= uint(br
.ReadBits(24))
247 // If not every byte value is used in the block (i.e., it's text) then
248 // the symbol set is reduced. The symbols used are stored as a
249 // two-level, 16x16 bitmap.
250 symbolRangeUsedBitmap
:= br
.ReadBits(16)
251 symbolPresent
:= make([]bool, 256)
253 for symRange
:= uint(0); symRange
< 16; symRange
++ {
254 if symbolRangeUsedBitmap
&(1<<(15-symRange
)) != 0 {
255 bits
:= br
.ReadBits(16)
256 for symbol
:= uint(0); symbol
< 16; symbol
++ {
257 if bits
&(1<<(15-symbol
)) != 0 {
258 symbolPresent
[16*symRange
+symbol
] = true
266 // There must be an EOF symbol.
267 return StructuralError("no symbols in input")
270 // A block uses between two and six different Huffman trees.
271 numHuffmanTrees
:= br
.ReadBits(3)
272 if numHuffmanTrees
< 2 || numHuffmanTrees
> 6 {
273 return StructuralError("invalid number of Huffman trees")
276 // The Huffman tree can switch every 50 symbols so there's a list of
277 // tree indexes telling us which tree to use for each 50 symbol block.
278 numSelectors
:= br
.ReadBits(15)
279 treeIndexes
:= make([]uint8, numSelectors
)
281 // The tree indexes are move-to-front transformed and stored as unary
283 mtfTreeDecoder
:= newMTFDecoderWithRange(numHuffmanTrees
)
284 for i
:= range treeIndexes
{
287 inc
:= br
.ReadBits(1)
293 if c
>= numHuffmanTrees
{
294 return StructuralError("tree index too large")
296 treeIndexes
[i
] = mtfTreeDecoder
.Decode(c
)
299 // The list of symbols for the move-to-front transform is taken from
300 // the previously decoded symbol bitmap.
301 symbols
:= make([]byte, numSymbols
)
303 for i
:= 0; i
< 256; i
++ {
304 if symbolPresent
[i
] {
305 symbols
[nextSymbol
] = byte(i
)
309 mtf
:= newMTFDecoder(symbols
)
311 numSymbols
+= 2 // to account for RUNA and RUNB symbols
312 huffmanTrees
:= make([]huffmanTree
, numHuffmanTrees
)
314 // Now we decode the arrays of code-lengths for each tree.
315 lengths
:= make([]uint8, numSymbols
)
316 for i
:= range huffmanTrees
{
317 // The code lengths are delta encoded from a 5-bit base value.
318 length
:= br
.ReadBits(5)
319 for j
:= range lengths
{
321 if length
< 1 || length
> 20 {
322 return StructuralError("Huffman length out of range")
333 lengths
[j
] = uint8(length
)
335 huffmanTrees
[i
], err
= newHuffmanTree(lengths
)
341 selectorIndex
:= 1 // the next tree index to use
342 if len(treeIndexes
) == 0 {
343 return StructuralError("no tree selectors given")
345 if int(treeIndexes
[0]) >= len(huffmanTrees
) {
346 return StructuralError("tree selector out of range")
348 currentHuffmanTree
:= huffmanTrees
[treeIndexes
[0]]
349 bufIndex
:= 0 // indexes bz2.buf, the output buffer.
350 // The output of the move-to-front transform is run-length encoded and
351 // we merge the decoding into the Huffman parsing loop. These two
352 // variables accumulate the repeat count. See the Wikipedia page for
357 // The `C' array (used by the inverse BWT) needs to be zero initialized.
358 for i
:= range bz2
.c
{
362 decoded
:= 0 // counts the number of symbols decoded by the current tree.
365 if selectorIndex
>= numSelectors
{
366 return StructuralError("insufficient selector indices for number of symbols")
368 if int(treeIndexes
[selectorIndex
]) >= len(huffmanTrees
) {
369 return StructuralError("tree selector out of range")
371 currentHuffmanTree
= huffmanTrees
[treeIndexes
[selectorIndex
]]
376 v
:= currentHuffmanTree
.Decode(br
)
380 // This is either the RUNA or RUNB symbol.
384 repeat
+= repeatPower
<< v
387 // This limit of 2 million comes from the bzip2 source
388 // code. It prevents repeat from overflowing.
389 if repeat
> 2*1024*1024 {
390 return StructuralError("repeat count too large")
396 // We have decoded a complete run-length so we need to
397 // replicate the last output symbol.
398 if repeat
> bz2
.blockSize
-bufIndex
{
399 return StructuralError("repeats past end of block")
401 for i
:= 0; i
< repeat
; i
++ {
403 bz2
.tt
[bufIndex
] = uint32(b
)
410 if int(v
) == numSymbols
-1 {
411 // This is the EOF symbol. Because it's always at the
412 // end of the move-to-front list, and never gets moved
413 // to the front, it has this unique value.
417 // Since two metasymbols (RUNA and RUNB) have values 0 and 1,
418 // one would expect |v-2| to be passed to the MTF decoder.
419 // However, the front of the MTF list is never referenced as 0,
420 // it's always referenced with a run-length of 1. Thus 0
421 // doesn't need to be encoded and we have |v-1| in the next
423 b
:= mtf
.Decode(int(v
- 1))
424 if bufIndex
>= bz2
.blockSize
{
425 return StructuralError("data exceeds block size")
427 bz2
.tt
[bufIndex
] = uint32(b
)
432 if origPtr
>= uint(bufIndex
) {
433 return StructuralError("origPtr out of bounds")
436 // We have completed the entropy decoding. Now we can perform the
437 // inverse BWT and setup the RLE buffer.
438 bz2
.preRLE
= bz2
.tt
[:bufIndex
]
440 bz2
.tPos
= inverseBWT(bz2
.preRLE
, origPtr
, bz2
.c
[:])
448 // inverseBWT implements the inverse Burrows-Wheeler transform as described in
449 // http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2.
450 // In that document, origPtr is called `I' and c is the `C' array after the
451 // first pass over the data. It's an argument here because we merge the first
452 // pass with the Huffman decoding.
454 // This also implements the `single array' method from the bzip2 source code
455 // which leaves the output, still shuffled, in the bottom 8 bits of tt with the
456 // index of the next byte in the top 24-bits. The index of the first byte is
458 func inverseBWT(tt
[]uint32, origPtr
uint, c
[]uint) uint32 {
460 for i
:= 0; i
< 256; i
++ {
467 tt
[c
[b
]] |
= uint32(i
) << 8
471 return tt
[origPtr
] >> 8
474 // This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed,
475 // causing the bits in the input to be processed in the reverse of the usual order.
477 var crctab
[256]uint32
480 const poly
= 0x04C11DB7
481 for i
:= range crctab
{
482 crc
:= uint32(i
) << 24
483 for j
:= 0; j
< 8; j
++ {
484 if crc
&0x80000000 != 0 {
485 crc
= (crc
<< 1) ^ poly
494 // updateCRC updates the crc value to incorporate the data in b.
495 // The initial value is 0.
496 func updateCRC(val
uint32, b
[]byte) uint32 {
498 for _
, v
:= range b
{
499 crc
= crctab
[byte(crc
>>24)^v
] ^ (crc
<< 8)