1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Package flate implements the DEFLATE compressed data format, described in
6 // RFC 1951. The gzip and zlib packages implement access to DEFLATE-based file
7 // formats.
19 maxCodeLen
= 16 // max length of Huffman code
20 // The next three numbers come from the RFC section 3.2.7, with the
21 // additional proviso in section 3.2.5 which implies that distance codes
22 // 30 and 31 should never occur in compressed data.
25 numCodes
= 19 // number of codes in Huffman meta-code
28 // Initialize the fixedHuffmanDecoder only once upon first use.
29 var fixedOnce sync
.Once
30 var fixedHuffmanDecoder huffmanDecoder
// CorruptInputError reports the presence of corrupt input. Its value is the
// input offset before which the corruption was detected.
type CorruptInputError int64

// Error renders the offset in base 10 after a fixed package prefix.
func (e CorruptInputError) Error() string {
	offset := strconv.FormatInt(int64(e), 10)
	return "flate: corrupt input before offset " + offset
}
// InternalError describes a failure in the flate code itself.
type InternalError string

// Error returns the message with the package's internal-error prefix.
func (e InternalError) Error() string {
	const prefix = "flate: internal error: "
	return prefix + string(e)
}
// ReadError describes an error encountered while reading input.
//
// Deprecated: No longer returned.
type ReadError struct {
	Offset int64 // byte offset where error occurred
	Err    error // error returned by underlying Read
}

// Error reports the offset followed by the underlying error's message.
// Err must be non-nil, since its Error method is called unconditionally.
func (e *ReadError) Error() string {
	offset := strconv.FormatInt(e.Offset, 10)
	return "flate: read error at offset " + offset + ": " + e.Err.Error()
}
// WriteError describes an error encountered while writing output.
//
// Deprecated: No longer returned.
type WriteError struct {
	Offset int64 // byte offset where error occurred
	Err    error // error returned by underlying Write
}

// Error reports the offset followed by the underlying error's message.
// Err must be non-nil, since its Error method is called unconditionally.
func (e *WriteError) Error() string {
	offset := strconv.FormatInt(e.Offset, 10)
	return "flate: write error at offset " + offset + ": " + e.Err.Error()
}
// Resetter resets a ReadCloser returned by NewReader or NewReaderDict
// to switch to a new underlying Reader. This permits reusing a ReadCloser
// instead of allocating a new one.
type Resetter interface {
	// Reset drops any buffered data and returns the Resetter to the state
	// it would have if it had just been initialized with the reader r.
	Reset(r io.Reader, dict []byte) error
}
77 // The data structure for decoding Huffman tables is based on that of
78 // zlib. There is a lookup table of a fixed bit width (huffmanChunkBits).
79 // For codes smaller than the table width, there are multiple entries
80 // (each combination of trailing bits has the same value). For codes
81 // larger than the table width, the table contains a link to an overflow
82 // table. The width of each entry in the link table is the maximum code
83 // size minus the chunk width.
85 // Note that you can do a lookup in the table even without all bits
86 // filled. Since the extra bits are zero, and the DEFLATE Huffman codes
87 // have the property that shorter codes come before longer ones, the
88 // bit length estimate in the result is a lower bound on the actual
89 // number of bits. See:
92 // http://www.gzip.org/algorithm.txt
94 // chunk & 15 is number of bits
95 // chunk >> 4 is value, including table link
99 huffmanNumChunks
= 1 << huffmanChunkBits
100 huffmanCountMask
= 15
101 huffmanValueShift
= 4
104 type huffmanDecoder
struct {
105 min
int // the minimum code length
106 chunks
[huffmanNumChunks
]uint32 // chunks as described above
107 links
[][]uint32 // overflow links
108 linkMask
uint32 // mask the width of the link table
111 // Initialize Huffman decoding tables from array of code lengths.
112 // Following this function, h is guaranteed to be initialized into a complete
113 // tree (i.e., neither over-subscribed nor under-subscribed). The exception is a
114 // degenerate case where the tree has only a single symbol with length 1. Empty
115 // trees are permitted.
116 func (h
*huffmanDecoder
) init(bits
[]int) bool {
117 // Sanity enables additional runtime tests during Huffman
118 // table construction. It's intended to be used during
119 // development to supplement the currently ad-hoc unit tests.
123 *h
= huffmanDecoder
{}
126 // Count number of codes of each length,
127 // compute min and max length.
128 var count
[maxCodeLen
]int
130 for _
, n
:= range bits
{
134 if min
== 0 || n
< min
{
143 // Empty tree. The decompressor.huffSym function will fail later if the tree
144 // is used. Technically, an empty tree is only valid for the HDIST tree and
145 // not the HCLEN and HLIT tree. However, a stream with an empty HCLEN tree
146 // is guaranteed to fail since it will attempt to use the tree to decode the
147 // codes for the HLIT and HDIST trees. Similarly, an empty HLIT tree is
148 // guaranteed to fail later since the compressed data section must be
149 // composed of at least one symbol (the end-of-block marker).
155 var nextcode
[maxCodeLen
]int
156 for i
:= min
; i
<= max
; i
++ {
162 // Check that the coding is complete (i.e., that we've
163 // assigned all 2-to-the-max possible bit sequences).
164 // Exception: To be compatible with zlib, we also need to
165 // accept degenerate single-code codings. See also
166 // TestDegenerateHuffmanCoding.
167 if code
!= 1<<uint(max
) && !(code
== 1 && max
== 1) {
172 if max
> huffmanChunkBits
{
173 numLinks
:= 1 << (uint(max
) - huffmanChunkBits
)
174 h
.linkMask
= uint32(numLinks
- 1)
176 // create link tables
177 link
:= nextcode
[huffmanChunkBits
+1] >> 1
178 h
.links
= make([][]uint32, huffmanNumChunks
-link
)
179 for j
:= uint(link
); j
< huffmanNumChunks
; j
++ {
180 reverse
:= int(mathbits
.Reverse16(uint16(j
)))
181 reverse
>>= uint(16 - huffmanChunkBits
)
182 off
:= j
- uint(link
)
183 if sanity
&& h
.chunks
[reverse
] != 0 {
184 panic("impossible: overwriting existing chunk")
186 h
.chunks
[reverse
] = uint32(off
<<huffmanValueShift |
(huffmanChunkBits
+ 1))
187 h
.links
[off
] = make([]uint32, numLinks
)
191 for i
, n
:= range bits
{
197 chunk
:= uint32(i
<<huffmanValueShift | n
)
198 reverse
:= int(mathbits
.Reverse16(uint16(code
)))
199 reverse
>>= uint(16 - n
)
200 if n
<= huffmanChunkBits
{
201 for off
:= reverse
; off
< len(h
.chunks
); off
+= 1 << uint(n
) {
202 // We should never need to overwrite
203 // an existing chunk. Also, 0 is
204 // never a valid chunk, because the
205 // lower 4 "count" bits should be
206 // between 1 and 15.
207 if sanity
&& h
.chunks
[off
] != 0 {
208 panic("impossible: overwriting existing chunk")
210 h
.chunks
[off
] = chunk
213 j
:= reverse
& (huffmanNumChunks
- 1)
214 if sanity
&& h
.chunks
[j
]&huffmanCountMask
!= huffmanChunkBits
+1 {
215 // Longer codes should have been
216 // associated with a link table above.
217 panic("impossible: not an indirect chunk")
219 value
:= h
.chunks
[j
] >> huffmanValueShift
220 linktab
:= h
.links
[value
]
221 reverse
>>= huffmanChunkBits
222 for off
:= reverse
; off
< len(linktab
); off
+= 1 << uint(n
-huffmanChunkBits
) {
223 if sanity
&& linktab
[off
] != 0 {
224 panic("impossible: overwriting existing chunk")
232 // Above we've sanity checked that we never overwrote
233 // an existing entry. Here we additionally check that
234 // we filled the tables completely.
235 for i
, chunk
:= range h
.chunks
{
237 // As an exception, in the degenerate
238 // single-code case, we allow odd
239 // chunks to be missing.
240 if code
== 1 && i%2
== 1 {
243 panic("impossible: missing chunk")
246 for _
, linktab
:= range h
.links
{
247 for _
, chunk
:= range linktab
{
249 panic("impossible: missing chunk")
258 // The actual read interface needed by NewReader.
259 // If the passed-in io.Reader does not also have ReadByte,
260 // NewReader will introduce its own buffering.
261 type Reader
interface {
267 type decompressor
struct {
272 // Input bits, in top of b.
276 // Huffman decoders for literal/length, distance.
277 h1
, h2 huffmanDecoder
279 // Length arrays used to define Huffman codes.
280 bits
*[maxNumLit
+ maxNumDist
]int
281 codebits
*[numCodes
]int
283 // Output history, buffer.
286 // Temporary buffer (avoids repeated allocation).
289 // Next step in the decompression,
290 // and decompression state.
291 step
func(*decompressor
)
296 hl
, hd
*huffmanDecoder
301 func (f
*decompressor
) nextBlock() {
303 if f
.err
= f
.moreBits(); f
.err
!= nil {
316 // compressed, fixed Huffman tables
317 f
.hl
= &fixedHuffmanDecoder
321 // compressed, dynamic Huffman tables
322 if f
.err
= f
.readHuffman(); f
.err
!= nil {
330 f
.err
= CorruptInputError(f
.roffset
)
334 func (f
*decompressor
) Read(b
[]byte) (int, error
) {
336 if len(f
.toRead
) > 0 {
337 n
:= copy(b
, f
.toRead
)
338 f
.toRead
= f
.toRead
[n
:]
339 if len(f
.toRead
) == 0 {
348 if f
.err
!= nil && len(f
.toRead
) == 0 {
349 f
.toRead
= f
.dict
.readFlush() // Flush what's left in case of error
354 func (f
*decompressor
) Close() error
{
// RFC 1951 section 3.2.7.
// Compression with dynamic Huffman codes

// codeOrder gives the order in which the code lengths of the code-length
// code appear in a dynamic block header: the i-th 3-bit length read is
// stored at index codeOrder[i] of the code-length table.
var codeOrder = [...]int{
	16, 17, 18, 0,
	8, 7, 9, 6,
	10, 5, 11, 4,
	12, 3, 13, 2,
	14, 1, 15,
}
366 func (f
*decompressor
) readHuffman() error
{
367 // HLIT[5], HDIST[5], HCLEN[4].
369 if err
:= f
.moreBits(); err
!= nil {
373 nlit
:= int(f
.b
&0x1F) + 257
374 if nlit
> maxNumLit
{
375 return CorruptInputError(f
.roffset
)
378 ndist
:= int(f
.b
&0x1F) + 1
379 if ndist
> maxNumDist
{
380 return CorruptInputError(f
.roffset
)
383 nclen
:= int(f
.b
&0xF) + 4
384 // numCodes is 19, so nclen is always valid.
388 // (HCLEN+4)*3 bits: code lengths in the magic codeOrder order.
389 for i
:= 0; i
< nclen
; i
++ {
391 if err
:= f
.moreBits(); err
!= nil {
395 f
.codebits
[codeOrder
[i
]] = int(f
.b
& 0x7)
399 for i
:= nclen
; i
< len(codeOrder
); i
++ {
400 f
.codebits
[codeOrder
[i
]] = 0
402 if !f
.h1
.init(f
.codebits
[0:]) {
403 return CorruptInputError(f
.roffset
)
406 // HLIT + 257 code lengths, HDIST + 1 code lengths,
407 // using the code length Huffman code.
408 for i
, n
:= 0, nlit
+ndist
; i
< n
; {
409 x
, err
:= f
.huffSym(&f
.h1
)
419 // Repeat previous length or zero.
425 return InternalError("unexpected length code")
430 return CorruptInputError(f
.roffset
)
443 if err
:= f
.moreBits(); err
!= nil {
447 rep
+= int(f
.b
& uint32(1<<nb
-1))
451 return CorruptInputError(f
.roffset
)
453 for j
:= 0; j
< rep
; j
++ {
459 if !f
.h1
.init(f
.bits
[0:nlit
]) ||
!f
.h2
.init(f
.bits
[nlit
:nlit
+ndist
]) {
460 return CorruptInputError(f
.roffset
)
463 // As an optimization, we can initialize the min bits to read at a time
464 // for the HLIT tree to the length of the EOB marker since we know that
465 // every block must terminate with one. This preserves the property that
466 // we never read any extra bytes after the end of the DEFLATE stream.
467 if f
.h1
.min
< f
.bits
[endBlockMarker
] {
468 f
.h1
.min
= f
.bits
[endBlockMarker
]
474 // Decode a single Huffman block from f.
475 // hl and hd are the Huffman states for the lit/length values
476 // and the distance values, respectively. If hd == nil, the fixed
477 // distance encoding associated with fixed Huffman blocks is used.
478 func (f
*decompressor
) huffmanBlock() {
480 stateInit
= iota // Zero value must be stateInit
492 // Read literal and/or (length, distance) according to RFC section 3.2.3.
494 v
, err
:= f
.huffSym(f
.hl
)
499 var n
uint // number of bits extra
503 f
.dict
.writeByte(byte(v
))
504 if f
.dict
.availWrite() == 0 {
505 f
.toRead
= f
.dict
.readFlush()
506 f
.step
= (*decompressor
).huffmanBlock
507 f
.stepState
= stateInit
514 // otherwise, reference to older data
516 length
= v
- (257 - 3)
519 length
= v
*2 - (265*2 - 11)
522 length
= v
*4 - (269*4 - 19)
525 length
= v
*8 - (273*8 - 35)
528 length
= v
*16 - (277*16 - 67)
531 length
= v
*32 - (281*32 - 131)
537 f
.err
= CorruptInputError(f
.roffset
)
542 if err
= f
.moreBits(); err
!= nil {
547 length
+= int(f
.b
& uint32(1<<n
-1))
555 if err
= f
.moreBits(); err
!= nil {
560 dist
= int(mathbits
.Reverse8(uint8(f
.b
& 0x1F << 3)))
564 if dist
, err
= f
.huffSym(f
.hd
); err
!= nil {
573 case dist
< maxNumDist
:
574 nb
:= uint(dist
-2) >> 1
575 // have 1 bit in bottom of dist, need nb more.
576 extra
:= (dist
& 1) << nb
578 if err
= f
.moreBits(); err
!= nil {
583 extra |
= int(f
.b
& uint32(1<<nb
-1))
586 dist
= 1<<(nb
+1) + 1 + extra
588 f
.err
= CorruptInputError(f
.roffset
)
592 // No check on length; encoding can be prescient.
593 if dist
> f
.dict
.histSize() {
594 f
.err
= CorruptInputError(f
.roffset
)
598 f
.copyLen
, f
.copyDist
= length
, dist
603 // Perform a backwards copy according to RFC section 3.2.3.
605 cnt
:= f
.dict
.tryWriteCopy(f
.copyDist
, f
.copyLen
)
607 cnt
= f
.dict
.writeCopy(f
.copyDist
, f
.copyLen
)
611 if f
.dict
.availWrite() == 0 || f
.copyLen
> 0 {
612 f
.toRead
= f
.dict
.readFlush()
613 f
.step
= (*decompressor
).huffmanBlock
// We need to continue this work
614 f
.stepState
= stateDict
621 // Copy a single uncompressed data block from input to output.
622 func (f
*decompressor
) dataBlock() {
624 // Discard current half-byte.
628 // Length then ones-complement of length.
629 nr
, err
:= io
.ReadFull(f
.r
, f
.buf
[0:4])
630 f
.roffset
+= int64(nr
)
633 err
= io
.ErrUnexpectedEOF
638 n
:= int(f
.buf
[0]) |
int(f
.buf
[1])<<8
639 nn
:= int(f
.buf
[2]) |
int(f
.buf
[3])<<8
640 if uint16(nn
) != uint16(^n
) {
641 f
.err
= CorruptInputError(f
.roffset
)
646 f
.toRead
= f
.dict
.readFlush()
655 // copyData copies f.copyLen bytes from the underlying reader into f.dict.
656 // It pauses for reads when f.dict is full.
657 func (f
*decompressor
) copyData() {
658 buf
:= f
.dict
.writeSlice()
659 if len(buf
) > f
.copyLen
{
660 buf
= buf
[:f
.copyLen
]
663 cnt
, err
:= io
.ReadFull(f
.r
, buf
)
664 f
.roffset
+= int64(cnt
)
666 f
.dict
.writeMark(cnt
)
669 err
= io
.ErrUnexpectedEOF
675 if f
.dict
.availWrite() == 0 || f
.copyLen
> 0 {
676 f
.toRead
= f
.dict
.readFlush()
677 f
.step
= (*decompressor
).copyData
683 func (f
*decompressor
) finishBlock() {
685 if f
.dict
.availRead() > 0 {
686 f
.toRead
= f
.dict
.readFlush()
690 f
.step
= (*decompressor
).nextBlock
693 func (f
*decompressor
) moreBits() error
{
694 c
, err
:= f
.r
.ReadByte()
697 err
= io
.ErrUnexpectedEOF
702 f
.b |
= uint32(c
) << f
.nb
707 // Read the next Huffman-encoded symbol from f according to h.
708 func (f
*decompressor
) huffSym(h
*huffmanDecoder
) (int, error
) {
709 // Since a huffmanDecoder can be empty or be composed of a degenerate tree
710 // with a single element, huffSym must error on these two edge cases. In both
711 // cases, the chunks entry will be 0 for the invalid sequence, leading it
712 // to satisfy the n == 0 check below.
716 if err
:= f
.moreBits(); err
!= nil {
720 chunk
:= h
.chunks
[f
.b
&(huffmanNumChunks
-1)]
721 n
= uint(chunk
& huffmanCountMask
)
722 if n
> huffmanChunkBits
{
723 chunk
= h
.links
[chunk
>>huffmanValueShift
][(f
.b
>>huffmanChunkBits
)&h
.linkMask
]
724 n
= uint(chunk
& huffmanCountMask
)
728 f
.err
= CorruptInputError(f
.roffset
)
733 return int(chunk
>> huffmanValueShift
), nil
738 func makeReader(r io
.Reader
) Reader
{
739 if rr
, ok
:= r
.(Reader
); ok
{
742 return bufio
.NewReader(r
)
745 func fixedHuffmanDecoderInit() {
746 fixedOnce
.Do(func() {
747 // These come from the RFC section 3.2.6.
749 for i
:= 0; i
< 144; i
++ {
752 for i
:= 144; i
< 256; i
++ {
755 for i
:= 256; i
< 280; i
++ {
758 for i
:= 280; i
< 288; i
++ {
761 fixedHuffmanDecoder
.init(bits
[:])
765 func (f
*decompressor
) Reset(r io
.Reader
, dict
[]byte) error
{
769 codebits
: f
.codebits
,
771 step
: (*decompressor
).nextBlock
,
773 f
.dict
.init(maxMatchOffset
, dict
)
777 // NewReader returns a new ReadCloser that can be used
778 // to read the uncompressed version of r.
779 // If r does not also implement io.ByteReader,
780 // the decompressor may read more data than necessary from r.
781 // It is the caller's responsibility to call Close on the ReadCloser
782 // when finished reading.
784 // The ReadCloser returned by NewReader also implements Resetter.
785 func NewReader(r io
.Reader
) io
.ReadCloser
{
786 fixedHuffmanDecoderInit()
790 f
.bits
= new([maxNumLit
+ maxNumDist
]int)
791 f
.codebits
= new([numCodes
]int)
792 f
.step
= (*decompressor
).nextBlock
793 f
.dict
.init(maxMatchOffset
, nil)
797 // NewReaderDict is like NewReader but initializes the reader
798 // with a preset dictionary. The returned Reader behaves as if
799 // the uncompressed data stream started with the given dictionary,
800 // which has already been read. NewReaderDict is typically used
801 // to read data compressed by NewWriterDict.
803 // The ReadCloser returned by NewReaderDict also implements Resetter.
804 func NewReaderDict(r io
.Reader
, dict
[]byte) io
.ReadCloser
{
805 fixedHuffmanDecoderInit()
809 f
.bits
= new([maxNumLit
+ maxNumDist
]int)
810 f
.codebits
= new([numCodes
]int)
811 f
.step
= (*decompressor
).nextBlock
812 f
.dict
.init(maxMatchOffset
, dict
)