libgo/go/compress/flate/huffman_bit_writer.go

   1 // Copyright 2009 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package flate
   6
   7 import (
   8         "io"
   9         "math"
  10         "os"
  11         "strconv"
  12 )
  13
  14 const (
  15         // The largest offset code.
  16         offsetCodeCount = 30
  17
  18         // The largest offset code in the extensions.
  19         extendedOffsetCodeCount = 42
  20
  21         // The special code used to mark the end of a block.
  22         endBlockMarker = 256
  23
  24         // The first length code.
  25         lengthCodesStart = 257
  26
  27         // The number of codegen codes.
  28         codegenCodeCount = 19
  29         badCode          = 255
  30 )
  31
  32 // The number of extra bits needed by length code X - LENGTH_CODES_START.
  33 var lengthExtraBits = []int8{
  34         /* 257 */ 0, 0, 0,
  35         /* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2,
  36         /* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
  37         /* 280 */ 4, 5, 5, 5, 5, 0,
  38 }
  39
  40 // The length indicated by length code X - LENGTH_CODES_START.
  41 var lengthBase = []uint32{
  42         0, 1, 2, 3, 4, 5, 6, 7, 8, 10,
  43         12, 14, 16, 20, 24, 28, 32, 40, 48, 56,
  44         64, 80, 96, 112, 128, 160, 192, 224, 255,
  45 }
  46
  47 // offset code word extra bits.
  48 var offsetExtraBits = []int8{
  49         0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
  50         4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
  51         9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
  52         /* extended window */
  53         14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20,
  54 }
  55
  56 var offsetBase = []uint32{
  57         /* normal deflate */
  58         0x000000, 0x000001, 0x000002, 0x000003, 0x000004,
  59         0x000006, 0x000008, 0x00000c, 0x000010, 0x000018,
  60         0x000020, 0x000030, 0x000040, 0x000060, 0x000080,
  61         0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300,
  62         0x000400, 0x000600, 0x000800, 0x000c00, 0x001000,
  63         0x001800, 0x002000, 0x003000, 0x004000, 0x006000,
  64
  65         /* extended window */
  66         0x008000, 0x00c000, 0x010000, 0x018000, 0x020000,
  67         0x030000, 0x040000, 0x060000, 0x080000, 0x0c0000,
  68         0x100000, 0x180000, 0x200000, 0x300000,
  69 }
  70
  71 // The odd order in which the codegen code sizes are written.
  72 var codegenOrder = []uint32{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}
  73
  74 type huffmanBitWriter struct {
  75         w io.Writer
  76         // Data waiting to be written is bytes[0:nbytes]
  77         // and then the low nbits of bits.
  78         bits            uint32
  79         nbits           uint32
  80         bytes           [64]byte
  81         nbytes          int
  82         literalFreq     []int32
  83         offsetFreq      []int32
  84         codegen         []uint8
  85         codegenFreq     []int32
  86         literalEncoding *huffmanEncoder
  87         offsetEncoding  *huffmanEncoder
  88         codegenEncoding *huffmanEncoder
  89         err             os.Error
  90 }
  91
  92 type WrongValueError struct {
  93         name  string
  94         from  int32
  95         to    int32
  96         value int32
  97 }
  98
  99 func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
 100         return &huffmanBitWriter{
 101                 w:               w,
 102                 literalFreq:     make([]int32, maxLit),
 103                 offsetFreq:      make([]int32, extendedOffsetCodeCount),
 104                 codegen:         make([]uint8, maxLit+extendedOffsetCodeCount+1),
 105                 codegenFreq:     make([]int32, codegenCodeCount),
 106                 literalEncoding: newHuffmanEncoder(maxLit),
 107                 offsetEncoding:  newHuffmanEncoder(extendedOffsetCodeCount),
 108                 codegenEncoding: newHuffmanEncoder(codegenCodeCount),
 109         }
 110 }
 111
 112 func (err WrongValueError) String() string {
 113         return "huffmanBitWriter: " + err.name + " should belong to [" + strconv.Itoa64(int64(err.from)) + ";" +
 114                 strconv.Itoa64(int64(err.to)) + "] but actual value is " + strconv.Itoa64(int64(err.value))
 115 }
 116
 117 func (w *huffmanBitWriter) flushBits() {
 118         if w.err != nil {
 119                 w.nbits = 0
 120                 return
 121         }
 122         bits := w.bits
 123         w.bits >>= 16
 124         w.nbits -= 16
 125         n := w.nbytes
 126         w.bytes[n] = byte(bits)
 127         w.bytes[n+1] = byte(bits >> 8)
 128         if n += 2; n >= len(w.bytes) {
 129                 _, w.err = w.w.Write(w.bytes[0:])
 130                 n = 0
 131         }
 132         w.nbytes = n
 133 }
 134
 135 func (w *huffmanBitWriter) flush() {
 136         if w.err != nil {
 137                 w.nbits = 0
 138                 return
 139         }
 140         n := w.nbytes
 141         if w.nbits > 8 {
 142                 w.bytes[n] = byte(w.bits)
 143                 w.bits >>= 8
 144                 w.nbits -= 8
 145                 n++
 146         }
 147         if w.nbits > 0 {
 148                 w.bytes[n] = byte(w.bits)
 149                 w.nbits = 0
 150                 n++
 151         }
 152         w.bits = 0
 153         _, w.err = w.w.Write(w.bytes[0:n])
 154         w.nbytes = 0
 155 }
 156
 157 func (w *huffmanBitWriter) writeBits(b, nb int32) {
 158         w.bits |= uint32(b) << w.nbits
 159         if w.nbits += uint32(nb); w.nbits >= 16 {
 160                 w.flushBits()
 161         }
 162 }
 163
 164 func (w *huffmanBitWriter) writeBytes(bytes []byte) {
 165         if w.err != nil {
 166                 return
 167         }
 168         n := w.nbytes
 169         if w.nbits == 8 {
 170                 w.bytes[n] = byte(w.bits)
 171                 w.nbits = 0
 172                 n++
 173         }
 174         if w.nbits != 0 {
 175                 w.err = InternalError("writeBytes with unfinished bits")
 176                 return
 177         }
 178         if n != 0 {
 179                 _, w.err = w.w.Write(w.bytes[0:n])
 180                 if w.err != nil {
 181                         return
 182                 }
 183         }
 184         w.nbytes = 0
 185         _, w.err = w.w.Write(bytes)
 186 }
 187
 188 // RFC 1951 3.2.7 specifies a special run-length encoding for specifiying
 189 // the literal and offset lengths arrays (which are concatenated into a single
 190 // array).  This method generates that run-length encoding.
 191 //
 192 // The result is written into the codegen array, and the frequencies
 193 // of each code is written into the codegenFreq array.
 194 // Codes 0-15 are single byte codes. Codes 16-18 are followed by additional
 195 // information.  Code badCode is an end marker
 196 //
 197 //  numLiterals      The number of literals in literalEncoding
 198 //  numOffsets       The number of offsets in offsetEncoding
 199 func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int) {
 200         fillInt32s(w.codegenFreq, 0)
 201         // Note that we are using codegen both as a temporary variable for holding
 202         // a copy of the frequencies, and as the place where we put the result.
 203         // This is fine because the output is always shorter than the input used
 204         // so far.
 205         codegen := w.codegen // cache
 206         // Copy the concatenated code sizes to codegen.  Put a marker at the end.
 207         copyUint8s(codegen[0:numLiterals], w.literalEncoding.codeBits)
 208         copyUint8s(codegen[numLiterals:numLiterals+numOffsets], w.offsetEncoding.codeBits)
 209         codegen[numLiterals+numOffsets] = badCode
 210
 211         size := codegen[0]
 212         count := 1
 213         outIndex := 0
 214         for inIndex := 1; size != badCode; inIndex++ {
 215                 // INVARIANT: We have seen "count" copies of size that have not yet
 216                 // had output generated for them.
 217                 nextSize := codegen[inIndex]
 218                 if nextSize == size {
 219                         count++
 220                         continue
 221                 }
 222                 // We need to generate codegen indicating "count" of size.
 223                 if size != 0 {
 224                         codegen[outIndex] = size
 225                         outIndex++
 226                         w.codegenFreq[size]++
 227                         count--
 228                         for count >= 3 {
 229                                 n := min(count, 6)
 230                                 codegen[outIndex] = 16
 231                                 outIndex++
 232                                 codegen[outIndex] = uint8(n - 3)
 233                                 outIndex++
 234                                 w.codegenFreq[16]++
 235                                 count -= n
 236                         }
 237                 } else {
 238                         for count >= 11 {
 239                                 n := min(count, 138)
 240                                 codegen[outIndex] = 18
 241                                 outIndex++
 242                                 codegen[outIndex] = uint8(n - 11)
 243                                 outIndex++
 244                                 w.codegenFreq[18]++
 245                                 count -= n
 246                         }
 247                         if count >= 3 {
 248                                 // count >= 3 && count <= 10
 249                                 codegen[outIndex] = 17
 250                                 outIndex++
 251                                 codegen[outIndex] = uint8(count - 3)
 252                                 outIndex++
 253                                 w.codegenFreq[17]++
 254                                 count = 0
 255                         }
 256                 }
 257                 count--
 258                 for ; count >= 0; count-- {
 259                         codegen[outIndex] = size
 260                         outIndex++
 261                         w.codegenFreq[size]++
 262                 }
 263                 // Set up invariant for next time through the loop.
 264                 size = nextSize
 265                 count = 1
 266         }
 267         // Marker indicating the end of the codegen.
 268         codegen[outIndex] = badCode
 269 }
 270
 271 func (w *huffmanBitWriter) writeCode(code *huffmanEncoder, literal uint32) {
 272         if w.err != nil {
 273                 return
 274         }
 275         w.writeBits(int32(code.code[literal]), int32(code.codeBits[literal]))
 276 }
 277
 278 // Write the header of a dynamic Huffman block to the output stream.
 279 //
 280 //  numLiterals  The number of literals specified in codegen
 281 //  numOffsets   The number of offsets specified in codegen
 282 //  numCodegens  Tne number of codegens used in codegen
 283 func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, numCodegens int, isEof bool) {
 284         if w.err != nil {
 285                 return
 286         }
 287         var firstBits int32 = 4
 288         if isEof {
 289                 firstBits = 5
 290         }
 291         w.writeBits(firstBits, 3)
 292         w.writeBits(int32(numLiterals-257), 5)
 293         if numOffsets > offsetCodeCount {
 294                 // Extended version of decompressor
 295                 w.writeBits(int32(offsetCodeCount+((numOffsets-(1+offsetCodeCount))>>3)), 5)
 296                 w.writeBits(int32((numOffsets-(1+offsetCodeCount))&0x7), 3)
 297         } else {
 298                 w.writeBits(int32(numOffsets-1), 5)
 299         }
 300         w.writeBits(int32(numCodegens-4), 4)
 301
 302         for i := 0; i < numCodegens; i++ {
 303                 value := w.codegenEncoding.codeBits[codegenOrder[i]]
 304                 w.writeBits(int32(value), 3)
 305         }
 306
 307         i := 0
 308         for {
 309                 var codeWord int = int(w.codegen[i])
 310                 i++
 311                 if codeWord == badCode {
 312                         break
 313                 }
 314                 // The low byte contains the actual code to generate.
 315                 w.writeCode(w.codegenEncoding, uint32(codeWord))
 316
 317                 switch codeWord {
 318                 case 16:
 319                         w.writeBits(int32(w.codegen[i]), 2)
 320                         i++
 321                         break
 322                 case 17:
 323                         w.writeBits(int32(w.codegen[i]), 3)
 324                         i++
 325                         break
 326                 case 18:
 327                         w.writeBits(int32(w.codegen[i]), 7)
 328                         i++
 329                         break
 330                 }
 331         }
 332 }
 333
 334 func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) {
 335         if w.err != nil {
 336                 return
 337         }
 338         var flag int32
 339         if isEof {
 340                 flag = 1
 341         }
 342         w.writeBits(flag, 3)
 343         w.flush()
 344         w.writeBits(int32(length), 16)
 345         w.writeBits(int32(^uint16(length)), 16)
 346 }
 347
 348 func (w *huffmanBitWriter) writeFixedHeader(isEof bool) {
 349         if w.err != nil {
 350                 return
 351         }
 352         // Indicate that we are a fixed Huffman block
 353         var value int32 = 2
 354         if isEof {
 355                 value = 3
 356         }
 357         w.writeBits(value, 3)
 358 }
 359
 360 func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
 361         if w.err != nil {
 362                 return
 363         }
 364         fillInt32s(w.literalFreq, 0)
 365         fillInt32s(w.offsetFreq, 0)
 366
 367         n := len(tokens)
 368         tokens = tokens[0 : n+1]
 369         tokens[n] = endBlockMarker
 370
 371         totalLength := -1 // Subtract 1 for endBlock.
 372         for _, t := range tokens {
 373                 switch t.typ() {
 374                 case literalType:
 375                         w.literalFreq[t.literal()]++
 376                         totalLength++
 377                         break
 378                 case matchType:
 379                         length := t.length()
 380                         offset := t.offset()
 381                         totalLength += int(length + 3)
 382                         w.literalFreq[lengthCodesStart+lengthCode(length)]++
 383                         w.offsetFreq[offsetCode(offset)]++
 384                         break
 385                 }
 386         }
 387         w.literalEncoding.generate(w.literalFreq, 15)
 388         w.offsetEncoding.generate(w.offsetFreq, 15)
 389
 390         // get the number of literals
 391         numLiterals := len(w.literalFreq)
 392         for w.literalFreq[numLiterals-1] == 0 {
 393                 numLiterals--
 394         }
 395         // get the number of offsets
 396         numOffsets := len(w.offsetFreq)
 397         for numOffsets > 1 && w.offsetFreq[numOffsets-1] == 0 {
 398                 numOffsets--
 399         }
 400         storedBytes := 0
 401         if input != nil {
 402                 storedBytes = len(input)
 403         }
 404         var extraBits int64
 405         var storedSize int64
 406         if storedBytes <= maxStoreBlockSize && input != nil {
 407                 storedSize = int64((storedBytes + 5) * 8)
 408                 // We only bother calculating the costs of the extra bits required by
 409                 // the length of offset fields (which will be the same for both fixed
 410                 // and dynamic encoding), if we need to compare those two encodings
 411                 // against stored encoding.
 412                 for lengthCode := lengthCodesStart + 8; lengthCode < numLiterals; lengthCode++ {
 413                         // First eight length codes have extra size = 0.
 414                         extraBits += int64(w.literalFreq[lengthCode]) * int64(lengthExtraBits[lengthCode-lengthCodesStart])
 415                 }
 416                 for offsetCode := 4; offsetCode < numOffsets; offsetCode++ {
 417                         // First four offset codes have extra size = 0.
 418                         extraBits += int64(w.offsetFreq[offsetCode]) * int64(offsetExtraBits[offsetCode])
 419                 }
 420         } else {
 421                 storedSize = math.MaxInt32
 422         }
 423
 424         // Figure out which generates smaller code, fixed Huffman, dynamic
 425         // Huffman, or just storing the data.
 426         var fixedSize int64 = math.MaxInt64
 427         if numOffsets <= offsetCodeCount {
 428                 fixedSize = int64(3) +
 429                         fixedLiteralEncoding.bitLength(w.literalFreq) +
 430                         fixedOffsetEncoding.bitLength(w.offsetFreq) +
 431                         extraBits
 432         }
 433         // Generate codegen and codegenFrequencies, which indicates how to encode
 434         // the literalEncoding and the offsetEncoding.
 435         w.generateCodegen(numLiterals, numOffsets)
 436         w.codegenEncoding.generate(w.codegenFreq, 7)
 437         numCodegens := len(w.codegenFreq)
 438         for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
 439                 numCodegens--
 440         }
 441         extensionSummand := 0
 442         if numOffsets > offsetCodeCount {
 443                 extensionSummand = 3
 444         }
 445         dynamicHeader := int64(3+5+5+4+(3*numCodegens)) +
 446                 // Following line is an extension.
 447                 int64(extensionSummand) +
 448                 w.codegenEncoding.bitLength(w.codegenFreq) +
 449                 int64(extraBits) +
 450                 int64(w.codegenFreq[16]*2) +
 451                 int64(w.codegenFreq[17]*3) +
 452                 int64(w.codegenFreq[18]*7)
 453         dynamicSize := dynamicHeader +
 454                 w.literalEncoding.bitLength(w.literalFreq) +
 455                 w.offsetEncoding.bitLength(w.offsetFreq)
 456
 457         if storedSize < fixedSize && storedSize < dynamicSize {
 458                 w.writeStoredHeader(storedBytes, eof)
 459                 w.writeBytes(input[0:storedBytes])
 460                 return
 461         }
 462         var literalEncoding *huffmanEncoder
 463         var offsetEncoding *huffmanEncoder
 464
 465         if fixedSize <= dynamicSize {
 466                 w.writeFixedHeader(eof)
 467                 literalEncoding = fixedLiteralEncoding
 468                 offsetEncoding = fixedOffsetEncoding
 469         } else {
 470                 // Write the header.
 471                 w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
 472                 literalEncoding = w.literalEncoding
 473                 offsetEncoding = w.offsetEncoding
 474         }
 475
 476         // Write the tokens.
 477         for _, t := range tokens {
 478                 switch t.typ() {
 479                 case literalType:
 480                         w.writeCode(literalEncoding, t.literal())
 481                         break
 482                 case matchType:
 483                         // Write the length
 484                         length := t.length()
 485                         lengthCode := lengthCode(length)
 486                         w.writeCode(literalEncoding, lengthCode+lengthCodesStart)
 487                         extraLengthBits := int32(lengthExtraBits[lengthCode])
 488                         if extraLengthBits > 0 {
 489                                 extraLength := int32(length - lengthBase[lengthCode])
 490                                 w.writeBits(extraLength, extraLengthBits)
 491                         }
 492                         // Write the offset
 493                         offset := t.offset()
 494                         offsetCode := offsetCode(offset)
 495                         w.writeCode(offsetEncoding, offsetCode)
 496                         extraOffsetBits := int32(offsetExtraBits[offsetCode])
 497                         if extraOffsetBits > 0 {
 498                                 extraOffset := int32(offset - offsetBase[offsetCode])
 499                                 w.writeBits(extraOffset, extraOffsetBits)
 500                         }
 501                         break
 502                 default:
 503                         panic("unknown token type: " + string(t))
 504                 }
 505         }
 506 }