libgo/go/compress/flate/deflatefast.go

// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package flate

// This encoding algorithm, which prioritizes speed over output size, is
// based on Snappy's LZ77-style encoder: github.com/golang/snappy
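//
// The encoder emits flate tokens: a literal token for each unmatched byte and
// a match token for each (length, offset) back-reference into the last 32 KiB
// of input. Huffman coding of the token stream is handled elsewhere in this
// package.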

const (
	tableBits  = 14             // Bits used in the table.
	tableSize  = 1 << tableBits // Size of the table.
	tableMask  = tableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
	tableShift = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32.
)
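
// load32 returns the four bytes b[i:i+4] as a little-endian uint32.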
func load32(b []byte, i int32) uint32 {
	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
}
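
// load64 returns the eight bytes b[i:i+8] as a little-endian uint64.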
func load64(b []byte, i int32) uint64 {
	b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
}
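
// hash is a multiplicative hash over the four input bytes: multiplying by a
// large odd constant (the same one Snappy uses) mixes the bits, and the shift
// keeps the top tableBits bits as the table index.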
func hash(u uint32) uint32 {
	return (u * 0x1e35a7bd) >> tableShift
}

// These constants are defined by the Snappy implementation so that its
// assembly implementation can fast-path some 16-bytes-at-a-time copies. They
// aren't necessary in the pure Go implementation, as we don't use those same
// optimizations, but using the same thresholds doesn't really hurt.
const (
	inputMargin            = 16 - 1
	minNonLiteralBlockSize = 1 + 1 + inputMargin
)

type tableEntry struct {
	val    uint32 // Value at destination
	offset int32
}

// deflateFast maintains the table for matches,
// and the previous byte block for cross-block matching.
type deflateFast struct {
	table [tableSize]tableEntry
	prev  []byte // Previous block, zero length if unknown.
	cur   int32  // Current match offset.
}

func newDeflateFast() *deflateFast {
	return &deflateFast{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}
}
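
// A minimal usage sketch (hypothetical driver; in this package the compressor
// calls encode block by block, and each block is at most maxStoreBlockSize
// bytes):
//
//	e := newDeflateFast()
//	tokens := e.encode(nil, blockA)       // tokens for the first block
//	tokens = e.encode(tokens[:0], blockB) // matches may reach back into blockA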

// encode encodes a block given in src, appends tokens to dst,
// and returns the result.
func (e *deflateFast) encode(dst []token, src []byte) []token {
	// Ensure that e.cur doesn't wrap.
	if e.cur > 1<<30 {
		e.resetAll()
	}

	// This check isn't in the Snappy implementation, but there, the caller
	// instead of the callee handles this case.
	if len(src) < minNonLiteralBlockSize {
		e.cur += maxStoreBlockSize
		e.prev = e.prev[:0]
		return emitLiteral(dst, src)
	}

	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := int32(len(src) - inputMargin)

	// nextEmit is where in src the next emitLiteral should start from.
	nextEmit := int32(0)
	s := int32(0)
	cv := load32(src, s)
	nextHash := hash(cv)

	for {
		// Copied from the C++ snappy implementation:
		//
		// Heuristic match skipping: If 32 bytes are scanned with no matches
		// found, start looking only at every other byte. If 32 more bytes are
		// scanned (or skipped), look at every third byte, etc. When a match
		// is found, immediately go back to looking at every byte. This is a
		// small loss (~5% performance, ~0.1% density) for compressible data
		// due to more bookkeeping, but for non-compressible data (such as
		// JPEG) it's a huge win since the compressor quickly "realizes" the
		// data is incompressible and doesn't bother looking for matches
		// everywhere.
		//
		// The "skip" variable keeps track of how many bytes there are since
		// the last match; dividing it by 32 (i.e. right-shifting by five)
		// gives the number of bytes to move ahead for each iteration.
		skip := int32(32)

		nextS := s
		var candidate tableEntry
		for {
			s = nextS
			bytesBetweenHashLookups := skip >> 5
			nextS = s + bytesBetweenHashLookups
			skip += bytesBetweenHashLookups
			if nextS > sLimit {
				goto emitRemainder
			}
			candidate = e.table[nextHash&tableMask]
			now := load32(src, nextS)
			e.table[nextHash&tableMask] = tableEntry{offset: s + e.cur, val: cv}
			nextHash = hash(now)
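
			// Table entries store positions relative to e.cur, so subtracting
			// e.cur recovers the candidate's position within src (negative if
			// it lies in the previous block); a stale entry from long ago
			// simply fails the distance check below.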
			offset := s - (candidate.offset - e.cur)
			if offset > maxMatchOffset || cv != candidate.val {
				// Out of range or not matched.
				cv = now
				continue
			}
			break
		}

		// A 4-byte match has been found. We'll later see if more than 4 bytes
		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
		// them as literal bytes.
		dst = emitLiteral(dst, src[nextEmit:s])

		// Call emitCopy, and then see if another emitCopy could be our next
		// move. Repeat until we find no match for the input immediately after
		// what was consumed by the last emitCopy call.
		//
		// If we exit this loop normally then we need to call emitLiteral next,
		// though we don't yet know how big the literal will be. We handle that
		// by proceeding to the next iteration of the main loop. We also can
		// exit this loop via goto if we get close to exhausting the input.
		for {
			// Invariant: we have a 4-byte match at s, and no need to emit any
			// literal bytes prior to s.

			// Extend the 4-byte match as long as possible.
			s += 4
			t := candidate.offset - e.cur + 4
			l := e.matchLen(s, t, src)
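
			// l counts only the bytes matched beyond the initial 4, so the
			// full match length is l+4; the emitted token stores values
			// relative to baseMatchLength and baseMatchOffset (flate's
			// minimum match length and offset).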
			// matchToken is flate's equivalent of Snappy's emitCopy. (length,offset)
			dst = append(dst, matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset)))
			s += l
			nextEmit = s
			if s >= sLimit {
				goto emitRemainder
			}

			// We could immediately start working at s now, but to improve
			// compression we first update the hash table at s-1 and at s. If
			// another emitCopy is not our next move, also calculate nextHash
			// at s+1. At least on GOARCH=amd64, these three hash calculations
			// are faster as one load64 call (with some shifts) instead of
			// three load32 calls.
			x := load64(src, s-1)
			prevHash := hash(uint32(x))
			e.table[prevHash&tableMask] = tableEntry{offset: e.cur + s - 1, val: uint32(x)}
			x >>= 8
			currHash := hash(uint32(x))
			candidate = e.table[currHash&tableMask]
			e.table[currHash&tableMask] = tableEntry{offset: e.cur + s, val: uint32(x)}

			offset := s - (candidate.offset - e.cur)
			if offset > maxMatchOffset || uint32(x) != candidate.val {
				cv = uint32(x >> 8)
				nextHash = hash(cv)
				// Advance s so that cv and nextHash, computed above for the
				// bytes at s+1, describe the bytes at the new s.
				s++
				break
			}
		}
	}

emitRemainder:
	if int(nextEmit) < len(src) {
		dst = emitLiteral(dst, src[nextEmit:])
	}
	e.cur += int32(len(src))
	e.prev = e.prev[:len(src)]
	copy(e.prev, src)
	return dst
}
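
// emitLiteral appends one literal token per byte of lit to dst and returns
// the extended slice.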
func emitLiteral(dst []token, lit []byte) []token {
	for _, v := range lit {
		dst = append(dst, literalToken(uint32(v)))
	}
	return dst
}

// matchLen returns the match length between src[s:] and src[t:].
// t can be negative to indicate the match is starting in e.prev.
// We assume that src[s-4:s] and src[t-4:t] already match.
func (e *deflateFast) matchLen(s, t int32, src []byte) int32 {
	s1 := int(s) + maxMatchLength - 4
	if s1 > len(src) {
		s1 = len(src)
	}

	// If we are inside the current block.
	if t >= 0 {
		b := src[t:]
		a := src[s:s1]
		b = b[:len(a)]
		// Extend the match to be as long as possible.
		for i := range a {
			if a[i] != b[i] {
				return int32(i)
			}
		}
		return int32(len(a))
	}

	// We found a match in the previous block.
	tp := int32(len(e.prev)) + t
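	// t is negative here, so tp counts back from the end of the previous
	// block to the index in e.prev where the match starts.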
	if tp < 0 {
		return 0
	}

	// Extend the match to be as long as possible.
	a := src[s:s1]
	b := e.prev[tp:]
	if len(b) > len(a) {
		b = b[:len(a)]
	}
	a = a[:len(b)]
	for i := range b {
		if a[i] != b[i] {
			return int32(i)
		}
	}

	// If we reached our limit, we matched everything we are
	// allowed to in the previous block and we return.
	n := int32(len(b))
	if int(s+n) == s1 {
		return n
	}

	// Continue looking for more matches in the current block.
	a = src[s+n : s1]
	b = src[:len(a)]
	for i := range a {
		if a[i] != b[i] {
			return int32(i) + n
		}
	}
	return int32(len(a)) + n
}

// reset resets the encoding history.
// This ensures that no matches are made to the previous block.
func (e *deflateFast) reset() {
	e.prev = e.prev[:0]
	// Bump the offset, so all matches will fail the distance check.
	e.cur += maxMatchOffset

	// Protect against e.cur wraparound.
	if e.cur > 1<<30 {
		e.resetAll()
	}
}

// resetAll resets the deflateFast struct and is only called in rare
// situations to prevent integer overflow. It manually resets each field
// to avoid causing large stack growth.
//
// See https://golang.org/issue/18636.
func (e *deflateFast) resetAll() {
	// This is equivalent to:
	//	*e = deflateFast{cur: maxStoreBlockSize, prev: e.prev[:0]}
	e.cur = maxStoreBlockSize
	e.prev = e.prev[:0]
	for i := range e.table {
		e.table[i] = tableEntry{}
	}
}