libgo/go/compress/bzip2/bzip2.go

   1 // Copyright 2011 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Package bzip2 implements bzip2 decompression.
   6 package bzip2
   7
   8 import "io"
   9
  10 // There's no RFC for bzip2. I used the Wikipedia page for reference and a lot
  11 // of guessing: https://en.wikipedia.org/wiki/Bzip2
  12 // The source code to pyflate was useful for debugging:
  13 // http://www.paul.sladen.org/projects/pyflate
  14
  15 // A StructuralError is returned when the bzip2 data is found to be
  16 // syntactically invalid.
  17 type StructuralError string
  18
  19 func (s StructuralError) Error() string {
  20         return "bzip2 data invalid: " + string(s)
  21 }
  22
  23 // A reader decompresses bzip2 compressed data.
  24 type reader struct {
  25         br           bitReader
  26         fileCRC      uint32
  27         blockCRC     uint32
  28         wantBlockCRC uint32
  29         setupDone    bool // true if we have parsed the bzip2 header.
  30         blockSize    int  // blockSize in bytes, i.e. 900 * 1000.
  31         eof          bool
  32         c            [256]uint // the `C' array for the inverse BWT.
  33         tt           []uint32  // mirrors the `tt' array in the bzip2 source and contains the P array in the upper 24 bits.
  34         tPos         uint32    // Index of the next output byte in tt.
  35
  36         preRLE      []uint32 // contains the RLE data still to be processed.
  37         preRLEUsed  int      // number of entries of preRLE used.
  38         lastByte    int      // the last byte value seen.
  39         byteRepeats uint     // the number of repeats of lastByte seen.
  40         repeats     uint     // the number of copies of lastByte to output.
  41 }
  42
  43 // NewReader returns an io.Reader which decompresses bzip2 data from r.
  44 // If r does not also implement io.ByteReader,
  45 // the decompressor may read more data than necessary from r.
  46 func NewReader(r io.Reader) io.Reader {
  47         bz2 := new(reader)
  48         bz2.br = newBitReader(r)
  49         return bz2
  50 }
  51
  52 const bzip2FileMagic = 0x425a // "BZ"
  53 const bzip2BlockMagic = 0x314159265359
  54 const bzip2FinalMagic = 0x177245385090
  55
  56 // setup parses the bzip2 header.
  57 func (bz2 *reader) setup(needMagic bool) error {
  58         br := &bz2.br
  59
  60         if needMagic {
  61                 magic := br.ReadBits(16)
  62                 if magic != bzip2FileMagic {
  63                         return StructuralError("bad magic value")
  64                 }
  65         }
  66
  67         t := br.ReadBits(8)
  68         if t != 'h' {
  69                 return StructuralError("non-Huffman entropy encoding")
  70         }
  71
  72         level := br.ReadBits(8)
  73         if level < '1' || level > '9' {
  74                 return StructuralError("invalid compression level")
  75         }
  76
  77         bz2.fileCRC = 0
  78         bz2.blockSize = 100 * 1000 * (level - '0')
  79         if bz2.blockSize > len(bz2.tt) {
  80                 bz2.tt = make([]uint32, bz2.blockSize)
  81         }
  82         return nil
  83 }
  84
  85 func (bz2 *reader) Read(buf []byte) (n int, err error) {
  86         if bz2.eof {
  87                 return 0, io.EOF
  88         }
  89
  90         if !bz2.setupDone {
  91                 err = bz2.setup(true)
  92                 brErr := bz2.br.Err()
  93                 if brErr != nil {
  94                         err = brErr
  95                 }
  96                 if err != nil {
  97                         return 0, err
  98                 }
  99                 bz2.setupDone = true
 100         }
 101
 102         n, err = bz2.read(buf)
 103         brErr := bz2.br.Err()
 104         if brErr != nil {
 105                 err = brErr
 106         }
 107         return
 108 }
 109
 110 func (bz2 *reader) readFromBlock(buf []byte) int {
 111         // bzip2 is a block based compressor, except that it has a run-length
 112         // preprocessing step. The block based nature means that we can
 113         // preallocate fixed-size buffers and reuse them. However, the RLE
 114         // preprocessing would require allocating huge buffers to store the
 115         // maximum expansion. Thus we process blocks all at once, except for
 116         // the RLE which we decompress as required.
 117         n := 0
 118         for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) {
 119                 // We have RLE data pending.
 120
 121                 // The run-length encoding works like this:
 122                 // Any sequence of four equal bytes is followed by a length
 123                 // byte which contains the number of repeats of that byte to
 124                 // include. (The number of repeats can be zero.) Because we are
 125                 // decompressing on-demand our state is kept in the reader
 126                 // object.
 127
 128                 if bz2.repeats > 0 {
 129                         buf[n] = byte(bz2.lastByte)
 130                         n++
 131                         bz2.repeats--
 132                         if bz2.repeats == 0 {
 133                                 bz2.lastByte = -1
 134                         }
 135                         continue
 136                 }
 137
 138                 bz2.tPos = bz2.preRLE[bz2.tPos]
 139                 b := byte(bz2.tPos)
 140                 bz2.tPos >>= 8
 141                 bz2.preRLEUsed++
 142
 143                 if bz2.byteRepeats == 3 {
 144                         bz2.repeats = uint(b)
 145                         bz2.byteRepeats = 0
 146                         continue
 147                 }
 148
 149                 if bz2.lastByte == int(b) {
 150                         bz2.byteRepeats++
 151                 } else {
 152                         bz2.byteRepeats = 0
 153                 }
 154                 bz2.lastByte = int(b)
 155
 156                 buf[n] = b
 157                 n++
 158         }
 159
 160         return n
 161 }
 162
 163 func (bz2 *reader) read(buf []byte) (int, error) {
 164         for {
 165                 n := bz2.readFromBlock(buf)
 166                 if n > 0 || len(buf) == 0 {
 167                         bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n])
 168                         return n, nil
 169                 }
 170
 171                 // End of block. Check CRC.
 172                 if bz2.blockCRC != bz2.wantBlockCRC {
 173                         bz2.br.err = StructuralError("block checksum mismatch")
 174                         return 0, bz2.br.err
 175                 }
 176
 177                 // Find next block.
 178                 br := &bz2.br
 179                 switch br.ReadBits64(48) {
 180                 default:
 181                         return 0, StructuralError("bad magic value found")
 182
 183                 case bzip2BlockMagic:
 184                         // Start of block.
 185                         err := bz2.readBlock()
 186                         if err != nil {
 187                                 return 0, err
 188                         }
 189
 190                 case bzip2FinalMagic:
 191                         // Check end-of-file CRC.
 192                         wantFileCRC := uint32(br.ReadBits64(32))
 193                         if br.err != nil {
 194                                 return 0, br.err
 195                         }
 196                         if bz2.fileCRC != wantFileCRC {
 197                                 br.err = StructuralError("file checksum mismatch")
 198                                 return 0, br.err
 199                         }
 200
 201                         // Skip ahead to byte boundary.
 202                         // Is there a file concatenated to this one?
 203                         // It would start with BZ.
 204                         if br.bits%8 != 0 {
 205                                 br.ReadBits(br.bits % 8)
 206                         }
 207                         b, err := br.r.ReadByte()
 208                         if err == io.EOF {
 209                                 br.err = io.EOF
 210                                 bz2.eof = true
 211                                 return 0, io.EOF
 212                         }
 213                         if err != nil {
 214                                 br.err = err
 215                                 return 0, err
 216                         }
 217                         z, err := br.r.ReadByte()
 218                         if err != nil {
 219                                 if err == io.EOF {
 220                                         err = io.ErrUnexpectedEOF
 221                                 }
 222                                 br.err = err
 223                                 return 0, err
 224                         }
 225                         if b != 'B' || z != 'Z' {
 226                                 return 0, StructuralError("bad magic value in continuation file")
 227                         }
 228                         if err := bz2.setup(false); err != nil {
 229                                 return 0, err
 230                         }
 231                 }
 232         }
 233 }
 234
 235 // readBlock reads a bzip2 block. The magic number should already have been consumed.
 236 func (bz2 *reader) readBlock() (err error) {
 237         br := &bz2.br
 238         bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is.
 239         bz2.blockCRC = 0
 240         bz2.fileCRC = (bz2.fileCRC<<1 | bz2.fileCRC>>31) ^ bz2.wantBlockCRC
 241         randomized := br.ReadBits(1)
 242         if randomized != 0 {
 243                 return StructuralError("deprecated randomized files")
 244         }
 245         origPtr := uint(br.ReadBits(24))
 246
 247         // If not every byte value is used in the block (i.e., it's text) then
 248         // the symbol set is reduced. The symbols used are stored as a
 249         // two-level, 16x16 bitmap.
 250         symbolRangeUsedBitmap := br.ReadBits(16)
 251         symbolPresent := make([]bool, 256)
 252         numSymbols := 0
 253         for symRange := uint(0); symRange < 16; symRange++ {
 254                 if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 {
 255                         bits := br.ReadBits(16)
 256                         for symbol := uint(0); symbol < 16; symbol++ {
 257                                 if bits&(1<<(15-symbol)) != 0 {
 258                                         symbolPresent[16*symRange+symbol] = true
 259                                         numSymbols++
 260                                 }
 261                         }
 262                 }
 263         }
 264
 265         if numSymbols == 0 {
 266                 // There must be an EOF symbol.
 267                 return StructuralError("no symbols in input")
 268         }
 269
 270         // A block uses between two and six different Huffman trees.
 271         numHuffmanTrees := br.ReadBits(3)
 272         if numHuffmanTrees < 2 || numHuffmanTrees > 6 {
 273                 return StructuralError("invalid number of Huffman trees")
 274         }
 275
 276         // The Huffman tree can switch every 50 symbols so there's a list of
 277         // tree indexes telling us which tree to use for each 50 symbol block.
 278         numSelectors := br.ReadBits(15)
 279         treeIndexes := make([]uint8, numSelectors)
 280
 281         // The tree indexes are move-to-front transformed and stored as unary
 282         // numbers.
 283         mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees)
 284         for i := range treeIndexes {
 285                 c := 0
 286                 for {
 287                         inc := br.ReadBits(1)
 288                         if inc == 0 {
 289                                 break
 290                         }
 291                         c++
 292                 }
 293                 if c >= numHuffmanTrees {
 294                         return StructuralError("tree index too large")
 295                 }
 296                 treeIndexes[i] = mtfTreeDecoder.Decode(c)
 297         }
 298
 299         // The list of symbols for the move-to-front transform is taken from
 300         // the previously decoded symbol bitmap.
 301         symbols := make([]byte, numSymbols)
 302         nextSymbol := 0
 303         for i := 0; i < 256; i++ {
 304                 if symbolPresent[i] {
 305                         symbols[nextSymbol] = byte(i)
 306                         nextSymbol++
 307                 }
 308         }
 309         mtf := newMTFDecoder(symbols)
 310
 311         numSymbols += 2 // to account for RUNA and RUNB symbols
 312         huffmanTrees := make([]huffmanTree, numHuffmanTrees)
 313
 314         // Now we decode the arrays of code-lengths for each tree.
 315         lengths := make([]uint8, numSymbols)
 316         for i := range huffmanTrees {
 317                 // The code lengths are delta encoded from a 5-bit base value.
 318                 length := br.ReadBits(5)
 319                 for j := range lengths {
 320                         for {
 321                                 if length < 1 || length > 20 {
 322                                         return StructuralError("Huffman length out of range")
 323                                 }
 324                                 if !br.ReadBit() {
 325                                         break
 326                                 }
 327                                 if br.ReadBit() {
 328                                         length--
 329                                 } else {
 330                                         length++
 331                                 }
 332                         }
 333                         lengths[j] = uint8(length)
 334                 }
 335                 huffmanTrees[i], err = newHuffmanTree(lengths)
 336                 if err != nil {
 337                         return err
 338                 }
 339         }
 340
 341         selectorIndex := 1 // the next tree index to use
 342         if len(treeIndexes) == 0 {
 343                 return StructuralError("no tree selectors given")
 344         }
 345         if int(treeIndexes[0]) >= len(huffmanTrees) {
 346                 return StructuralError("tree selector out of range")
 347         }
 348         currentHuffmanTree := huffmanTrees[treeIndexes[0]]
 349         bufIndex := 0 // indexes bz2.buf, the output buffer.
 350         // The output of the move-to-front transform is run-length encoded and
 351         // we merge the decoding into the Huffman parsing loop. These two
 352         // variables accumulate the repeat count. See the Wikipedia page for
 353         // details.
 354         repeat := 0
 355         repeatPower := 0
 356
 357         // The `C' array (used by the inverse BWT) needs to be zero initialized.
 358         for i := range bz2.c {
 359                 bz2.c[i] = 0
 360         }
 361
 362         decoded := 0 // counts the number of symbols decoded by the current tree.
 363         for {
 364                 if decoded == 50 {
 365                         if selectorIndex >= numSelectors {
 366                                 return StructuralError("insufficient selector indices for number of symbols")
 367                         }
 368                         if int(treeIndexes[selectorIndex]) >= len(huffmanTrees) {
 369                                 return StructuralError("tree selector out of range")
 370                         }
 371                         currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]]
 372                         selectorIndex++
 373                         decoded = 0
 374                 }
 375
 376                 v := currentHuffmanTree.Decode(br)
 377                 decoded++
 378
 379                 if v < 2 {
 380                         // This is either the RUNA or RUNB symbol.
 381                         if repeat == 0 {
 382                                 repeatPower = 1
 383                         }
 384                         repeat += repeatPower << v
 385                         repeatPower <<= 1
 386
 387                         // This limit of 2 million comes from the bzip2 source
 388                         // code. It prevents repeat from overflowing.
 389                         if repeat > 2*1024*1024 {
 390                                 return StructuralError("repeat count too large")
 391                         }
 392                         continue
 393                 }
 394
 395                 if repeat > 0 {
 396                         // We have decoded a complete run-length so we need to
 397                         // replicate the last output symbol.
 398                         if repeat > bz2.blockSize-bufIndex {
 399                                 return StructuralError("repeats past end of block")
 400                         }
 401                         for i := 0; i < repeat; i++ {
 402                                 b := mtf.First()
 403                                 bz2.tt[bufIndex] = uint32(b)
 404                                 bz2.c[b]++
 405                                 bufIndex++
 406                         }
 407                         repeat = 0
 408                 }
 409
 410                 if int(v) == numSymbols-1 {
 411                         // This is the EOF symbol. Because it's always at the
 412                         // end of the move-to-front list, and never gets moved
 413                         // to the front, it has this unique value.
 414                         break
 415                 }
 416
 417                 // Since two metasymbols (RUNA and RUNB) have values 0 and 1,
 418                 // one would expect |v-2| to be passed to the MTF decoder.
 419                 // However, the front of the MTF list is never referenced as 0,
 420                 // it's always referenced with a run-length of 1. Thus 0
 421                 // doesn't need to be encoded and we have |v-1| in the next
 422                 // line.
 423                 b := mtf.Decode(int(v - 1))
 424                 if bufIndex >= bz2.blockSize {
 425                         return StructuralError("data exceeds block size")
 426                 }
 427                 bz2.tt[bufIndex] = uint32(b)
 428                 bz2.c[b]++
 429                 bufIndex++
 430         }
 431
 432         if origPtr >= uint(bufIndex) {
 433                 return StructuralError("origPtr out of bounds")
 434         }
 435
 436         // We have completed the entropy decoding. Now we can perform the
 437         // inverse BWT and setup the RLE buffer.
 438         bz2.preRLE = bz2.tt[:bufIndex]
 439         bz2.preRLEUsed = 0
 440         bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:])
 441         bz2.lastByte = -1
 442         bz2.byteRepeats = 0
 443         bz2.repeats = 0
 444
 445         return nil
 446 }
 447
 448 // inverseBWT implements the inverse Burrows-Wheeler transform as described in
 449 // http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2.
 450 // In that document, origPtr is called `I' and c is the `C' array after the
 451 // first pass over the data. It's an argument here because we merge the first
 452 // pass with the Huffman decoding.
 453 //
 454 // This also implements the `single array' method from the bzip2 source code
 455 // which leaves the output, still shuffled, in the bottom 8 bits of tt with the
 456 // index of the next byte in the top 24-bits. The index of the first byte is
 457 // returned.
 458 func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
 459         sum := uint(0)
 460         for i := 0; i < 256; i++ {
 461                 sum += c[i]
 462                 c[i] = sum - c[i]
 463         }
 464
 465         for i := range tt {
 466                 b := tt[i] & 0xff
 467                 tt[c[b]] |= uint32(i) << 8
 468                 c[b]++
 469         }
 470
 471         return tt[origPtr] >> 8
 472 }
 473
 474 // This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed,
 475 // causing the bits in the input to be processed in the reverse of the usual order.
 476
 477 var crctab [256]uint32
 478
 479 func init() {
 480         const poly = 0x04C11DB7
 481         for i := range crctab {
 482                 crc := uint32(i) << 24
 483                 for j := 0; j < 8; j++ {
 484                         if crc&0x80000000 != 0 {
 485                                 crc = (crc << 1) ^ poly
 486                         } else {
 487                                 crc <<= 1
 488                         }
 489                 }
 490                 crctab[i] = crc
 491         }
 492 }
 493
 494 // updateCRC updates the crc value to incorporate the data in b.
 495 // The initial value is 0.
 496 func updateCRC(val uint32, b []byte) uint32 {
 497         crc := ^val
 498         for _, v := range b {
 499                 crc = crctab[byte(crc>>24)^v] ^ (crc << 8)
 500         }
 501         return ^crc
 502 }