Diffstat (limited to 'vendor/github.com/klauspost/compress/flate')
12 files changed, 276 insertions, 165 deletions
diff --git a/vendor/github.com/klauspost/compress/flate/deflate.go b/vendor/github.com/klauspost/compress/flate/deflate.go
index f8435998e..82882961a 100644
--- a/vendor/github.com/klauspost/compress/flate/deflate.go
+++ b/vendor/github.com/klauspost/compress/flate/deflate.go
@@ -131,7 +131,8 @@ func (d *compressor) fillDeflate(b []byte) int {
 	s := d.state
 	if s.index >= 2*windowSize-(minMatchLength+maxMatchLength) {
 		// shift the window by windowSize
-		copy(d.window[:], d.window[windowSize:2*windowSize])
+		//copy(d.window[:], d.window[windowSize:2*windowSize])
+		*(*[windowSize]byte)(d.window) = *(*[windowSize]byte)(d.window[windowSize:])
 		s.index -= windowSize
 		d.windowEnd -= windowSize
 		if d.blockStart >= windowSize {
@@ -293,7 +294,6 @@ func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, of
 	}
 	offset = 0
 
-	cGain := 0
 	if d.chain < 100 {
 		for i := prevHead; tries > 0; tries-- {
 			if wEnd == win[i+length] {
@@ -321,10 +321,14 @@ func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, of
 		return
 	}
 
+	// Minimum gain to accept a match.
+	cGain := 4
+
 	// Some like it higher (CSV), some like it lower (JSON)
-	const baseCost = 6
+	const baseCost = 3
 	// Base is 4 bytes at with an additional cost.
 	// Matches must be better than this.
+
 	for i := prevHead; tries > 0; tries-- {
 		if wEnd == win[i+length] {
 			n := matchLen(win[i:i+minMatchLook], wPos)
@@ -332,7 +336,7 @@ func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, of
 				// Calculate gain. Estimate
 				newGain := d.h.bitLengthRaw(wPos[:n]) - int(offsetExtraBits[offsetCode(uint32(pos-i))]) - baseCost - int(lengthExtraBits[lengthCodes[(n-3)&255]])
 
-				//fmt.Println(n, "gain:", newGain, "prev:", cGain, "raw:", d.h.bitLengthRaw(wPos[:n]))
+				//fmt.Println("gain:", newGain, "prev:", cGain, "raw:", d.h.bitLengthRaw(wPos[:n]), "this-len:", n, "prev-len:", length)
 				if newGain > cGain {
 					length = n
 					offset = pos - i
@@ -373,6 +377,12 @@ func hash4(b []byte) uint32 {
 	return hash4u(binary.LittleEndian.Uint32(b), hashBits)
 }
 
+// hash4 returns the hash of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <32.
+func hash4u(u uint32, h uint8) uint32 {
+	return (u * prime4bytes) >> (32 - h)
+}
+
 // bulkHash4 will compute hashes using the same
 // algorithm as hash4
 func bulkHash4(b []byte, dst []uint32) {
@@ -483,27 +493,103 @@ func (d *compressor) deflateLazy() {
 		}
 
 		if prevLength >= minMatchLength && s.length <= prevLength {
-			// Check for better match at end...
+			// No better match, but check for better match at end...
 			//
-			// checkOff must be >=2 since we otherwise risk checking s.index
-			// Offset of 2 seems to yield best results.
+			// Skip forward a number of bytes.
+			// Offset of 2 seems to yield best results. 3 is sometimes better.
 			const checkOff = 2
-			prevIndex := s.index - 1
-			if prevIndex+prevLength+checkOff < s.maxInsertIndex {
-				end := lookahead
-				if lookahead > maxMatchLength {
-					end = maxMatchLength
-				}
-				end += prevIndex
-				idx := prevIndex + prevLength - (4 - checkOff)
-				h := hash4(d.window[idx:])
-				ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength + (4 - checkOff)
-				if ch2 > minIndex {
-					length := matchLen(d.window[prevIndex:end], d.window[ch2:])
-					// It seems like a pure length metric is best.
-					if length > prevLength {
-						prevLength = length
-						prevOffset = prevIndex - ch2
+
+			// Check all, except full length
+			if prevLength < maxMatchLength-checkOff {
+				prevIndex := s.index - 1
+				if prevIndex+prevLength < s.maxInsertIndex {
+					end := lookahead
+					if lookahead > maxMatchLength+checkOff {
+						end = maxMatchLength + checkOff
+					}
+					end += prevIndex
+
+					// Hash at match end.
+					h := hash4(d.window[prevIndex+prevLength:])
+					ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength
+					if prevIndex-ch2 != prevOffset && ch2 > minIndex+checkOff {
+						length := matchLen(d.window[prevIndex+checkOff:end], d.window[ch2+checkOff:])
+						// It seems like a pure length metric is best.
+						if length > prevLength {
+							prevLength = length
+							prevOffset = prevIndex - ch2
+
+							// Extend back...
+							for i := checkOff - 1; i >= 0; i-- {
+								if prevLength >= maxMatchLength || d.window[prevIndex+i] != d.window[ch2+i] {
+									// Emit tokens we "owe"
+									for j := 0; j <= i; j++ {
+										d.tokens.AddLiteral(d.window[prevIndex+j])
+										if d.tokens.n == maxFlateBlockTokens {
+											// The block includes the current character
+											if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
+												return
+											}
+											d.tokens.Reset()
+										}
+										s.index++
+										if s.index < s.maxInsertIndex {
+											h := hash4(d.window[s.index:])
+											ch := s.hashHead[h]
+											s.chainHead = int(ch)
+											s.hashPrev[s.index&windowMask] = ch
+											s.hashHead[h] = uint32(s.index + s.hashOffset)
+										}
+									}
+									break
+								} else {
+									prevLength++
+								}
+							}
+						} else if false {
+							// Check one further ahead.
+							// Only rarely better, disabled for now.
+							prevIndex++
+							h := hash4(d.window[prevIndex+prevLength:])
+							ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength
+							if prevIndex-ch2 != prevOffset && ch2 > minIndex+checkOff {
+								length := matchLen(d.window[prevIndex+checkOff:end], d.window[ch2+checkOff:])
+								// It seems like a pure length metric is best.
+								if length > prevLength+checkOff {
+									prevLength = length
+									prevOffset = prevIndex - ch2
+									prevIndex--
+
+									// Extend back...
+									for i := checkOff; i >= 0; i-- {
+										if prevLength >= maxMatchLength || d.window[prevIndex+i] != d.window[ch2+i-1] {
+											// Emit tokens we "owe"
+											for j := 0; j <= i; j++ {
+												d.tokens.AddLiteral(d.window[prevIndex+j])
+												if d.tokens.n == maxFlateBlockTokens {
+													// The block includes the current character
+													if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
+														return
+													}
+													d.tokens.Reset()
+												}
+												s.index++
+												if s.index < s.maxInsertIndex {
+													h := hash4(d.window[s.index:])
+													ch := s.hashHead[h]
+													s.chainHead = int(ch)
+													s.hashPrev[s.index&windowMask] = ch
+													s.hashHead[h] = uint32(s.index + s.hashOffset)
+												}
+											}
+											break
+										} else {
+											prevLength++
+										}
+									}
+								}
+							}
+						}
+					}
 				}
 			}
 		}
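The window shift in fillDeflate swaps a copy call for a slice-to-array-pointer conversion (Go 1.17+), which compiles to a single fixed-size move with one up-front length check. A minimal standalone sketch of the idiom, with an illustrative size in place of the real windowSize:

package main

import "fmt"

const windowSize = 4 // illustrative; the real flate windowSize is 32768

func main() {
	window := make([]byte, 2*windowSize)
	copy(window[windowSize:], []byte{1, 2, 3, 4})

	// Same effect as copy(window[:], window[windowSize:2*windowSize]),
	// but the conversion to *[windowSize]byte (Go 1.17+) turns the copy
	// into one fixed-size move, bounds-checked once.
	*(*[windowSize]byte)(window) = *(*[windowSize]byte)(window[windowSize:])

	fmt.Println(window[:windowSize]) // [1 2 3 4]
}

The conversion panics if the slice is shorter than the array, so the compiler needs no per-element checks inside the move.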
diff --git a/vendor/github.com/klauspost/compress/flate/dict_decoder.go b/vendor/github.com/klauspost/compress/flate/dict_decoder.go
index 71c75a065..bb36351a5 100644
--- a/vendor/github.com/klauspost/compress/flate/dict_decoder.go
+++ b/vendor/github.com/klauspost/compress/flate/dict_decoder.go
@@ -7,19 +7,19 @@ package flate
 // dictDecoder implements the LZ77 sliding dictionary as used in decompression.
 // LZ77 decompresses data through sequences of two forms of commands:
 //
-// * Literal insertions: Runs of one or more symbols are inserted into the data
-// stream as is. This is accomplished through the writeByte method for a
-// single symbol, or combinations of writeSlice/writeMark for multiple symbols.
-// Any valid stream must start with a literal insertion if no preset dictionary
-// is used.
+//   - Literal insertions: Runs of one or more symbols are inserted into the data
+//     stream as is. This is accomplished through the writeByte method for a
+//     single symbol, or combinations of writeSlice/writeMark for multiple symbols.
+//     Any valid stream must start with a literal insertion if no preset dictionary
+//     is used.
 //
-// * Backward copies: Runs of one or more symbols are copied from previously
-// emitted data. Backward copies come as the tuple (dist, length) where dist
-// determines how far back in the stream to copy from and length determines how
-// many bytes to copy. Note that it is valid for the length to be greater than
-// the distance. Since LZ77 uses forward copies, that situation is used to
-// perform a form of run-length encoding on repeated runs of symbols.
-// The writeCopy and tryWriteCopy are used to implement this command.
+//   - Backward copies: Runs of one or more symbols are copied from previously
+//     emitted data. Backward copies come as the tuple (dist, length) where dist
+//     determines how far back in the stream to copy from and length determines how
+//     many bytes to copy. Note that it is valid for the length to be greater than
+//     the distance. Since LZ77 uses forward copies, that situation is used to
+//     perform a form of run-length encoding on repeated runs of symbols.
+//     The writeCopy and tryWriteCopy are used to implement this command.
 //
 // For performance reasons, this implementation performs little to no sanity
 // checks about the arguments. As such, the invariants documented for each
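The point about length exceeding distance is easiest to see in a toy forward-copy loop; this sketch is illustrative and is not the package's writeCopy:

package main

import "fmt"

// lz77Copy expands a (dist, length) backward copy at pos. Copying forward
// one byte at a time is what makes length > dist legal: later iterations
// re-read bytes the same copy just produced, giving run-length expansion.
func lz77Copy(out []byte, pos, dist, length int) int {
	for i := 0; i < length; i++ {
		out[pos+i] = out[pos+i-dist]
	}
	return pos + length
}

func main() {
	buf := make([]byte, 8)
	n := copy(buf, "ab")
	n = lz77Copy(buf, n, 2, 6) // dist 2, length 6
	fmt.Printf("%s\n", buf[:n]) // abababab
}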
diff --git a/vendor/github.com/klauspost/compress/flate/fast_encoder.go b/vendor/github.com/klauspost/compress/flate/fast_encoder.go
index f781aaa62..24caf5f70 100644
--- a/vendor/github.com/klauspost/compress/flate/fast_encoder.go
+++ b/vendor/github.com/klauspost/compress/flate/fast_encoder.go
@@ -58,17 +58,6 @@ const (
 	prime8bytes = 0xcf1bbcdcb7a56463
 )
 
-func load32(b []byte, i int) uint32 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:4]
-	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
-}
-
-func load64(b []byte, i int) uint64 {
-	return binary.LittleEndian.Uint64(b[i:])
-}
-
 func load3232(b []byte, i int32) uint32 {
 	return binary.LittleEndian.Uint32(b[i:])
 }
@@ -77,10 +66,6 @@ func load6432(b []byte, i int32) uint64 {
 	return binary.LittleEndian.Uint64(b[i:])
 }
 
-func hash(u uint32) uint32 {
-	return (u * 0x1e35a7bd) >> tableShift
-}
-
 type tableEntry struct {
 	offset int32
 }
@@ -104,7 +89,8 @@ func (e *fastGen) addBlock(src []byte) int32 {
 		}
 		// Move down
 		offset := int32(len(e.hist)) - maxMatchOffset
-		copy(e.hist[0:maxMatchOffset], e.hist[offset:])
+		// copy(e.hist[0:maxMatchOffset], e.hist[offset:])
+		*(*[maxMatchOffset]byte)(e.hist) = *(*[maxMatchOffset]byte)(e.hist[offset:])
 		e.cur += offset
 		e.hist = e.hist[:maxMatchOffset]
 	}
@@ -114,39 +100,36 @@ func (e *fastGen) addBlock(src []byte) int32 {
 	return s
 }
 
-// hash4 returns the hash of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <32.
-func hash4u(u uint32, h uint8) uint32 {
-	return (u * prime4bytes) >> (32 - h)
-}
-
 type tableEntryPrev struct {
 	Cur  tableEntry
 	Prev tableEntry
}
 
-// hash4x64 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <32.
-func hash4x64(u uint64, h uint8) uint32 {
-	return (uint32(u) * prime4bytes) >> ((32 - h) & reg8SizeMask32)
-}
-
 // hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
 // Preferably h should be a constant and should always be <64.
 func hash7(u uint64, h uint8) uint32 {
 	return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & reg8SizeMask64))
 }
 
-// hash8 returns the hash of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <64.
-func hash8(u uint64, h uint8) uint32 {
-	return uint32((u * prime8bytes) >> ((64 - h) & reg8SizeMask64))
-}
-
-// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <64.
-func hash6(u uint64, h uint8) uint32 {
-	return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & reg8SizeMask64))
+// hashLen returns a hash of the lowest mls bytes of with length output bits.
+// mls must be >=3 and <=8. Any other value will return hash for 4 bytes.
+// length should always be < 32.
+// Preferably length and mls should be a constant for inlining.
+func hashLen(u uint64, length, mls uint8) uint32 {
+	switch mls {
+	case 3:
+		return (uint32(u<<8) * prime3bytes) >> (32 - length)
+	case 5:
+		return uint32(((u << (64 - 40)) * prime5bytes) >> (64 - length))
+	case 6:
+		return uint32(((u << (64 - 48)) * prime6bytes) >> (64 - length))
+	case 7:
+		return uint32(((u << (64 - 56)) * prime7bytes) >> (64 - length))
+	case 8:
+		return uint32((u * prime8bytes) >> (64 - length))
+	default:
+		return (uint32(u) * prime4bytes) >> (32 - length)
+	}
 }
 
 // matchlen will return the match length between offsets and t in src.
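hashLen works by shifting the unwanted high bytes out of a 64-bit little-endian load before the multiply, so only the lowest mls bytes influence the hash. A self-contained sketch of the mechanics; it reuses prime8bytes (quoted in the hunk above) for both widths, where the real code picks a different prime per width:

package main

import "fmt"

const prime8bytes = 0xcf1bbcdcb7a56463

// hashBytes hashes the lowest mls bytes of u into `length` output bits:
// shift the unwanted high bytes away, multiply, keep the top bits.
func hashBytes(u uint64, length, mls uint8) uint32 {
	return uint32(((u << (64 - 8*mls)) * prime8bytes) >> (64 - length))
}

func main() {
	v := uint64(0x0102030405060708)
	w := v | 0xffff<<48 // differs only in the two highest bytes

	fmt.Println(hashBytes(v, 15, 6) == hashBytes(w, 15, 6)) // true: high bytes shifted out
	fmt.Println(hashBytes(v, 15, 8) == hashBytes(w, 15, 8)) // false here: all 8 bytes count
}

Keeping length and mls constant at each call site lets the compiler fold the switch away and inline the single multiply-and-shift, which is the point of the refactor.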
diff --git a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
index 40ef45c2f..89a5dd89f 100644
--- a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
+++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
@@ -265,9 +265,9 @@ func (w *huffmanBitWriter) writeBytes(bytes []byte) {
 // Codes 0-15 are single byte codes. Codes 16-18 are followed by additional
 // information. Code badCode is an end marker
 //
-// numLiterals  The number of literals in literalEncoding
-// numOffsets  The number of offsets in offsetEncoding
-// litenc, offenc  The literal and offset encoder to use
+//	numLiterals  The number of literals in literalEncoding
+//	numOffsets  The number of offsets in offsetEncoding
+//	litenc, offenc  The literal and offset encoder to use
 func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litEnc, offEnc *huffmanEncoder) {
 	for i := range w.codegenFreq {
 		w.codegenFreq[i] = 0
@@ -460,9 +460,9 @@ func (w *huffmanBitWriter) writeOutBits() {
 // Write the header of a dynamic Huffman block to the output stream.
 //
-// numLiterals  The number of literals specified in codegen
-// numOffsets  The number of offsets specified in codegen
-// numCodegens  The number of codegens used in codegen
+//	numLiterals  The number of literals specified in codegen
+//	numOffsets  The number of offsets specified in codegen
+//	numCodegens  The number of codegens used in codegen
 func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, numCodegens int, isEof bool) {
 	if w.err != nil {
 		return
 	}
@@ -790,9 +790,11 @@ func (w *huffmanBitWriter) fillTokens() {
 // and offsetEncoding.
 // The number of literal and offset tokens is returned.
 func (w *huffmanBitWriter) indexTokens(t *tokens, filled bool) (numLiterals, numOffsets int) {
-	copy(w.literalFreq[:], t.litHist[:])
-	copy(w.literalFreq[256:], t.extraHist[:])
-	copy(w.offsetFreq[:], t.offHist[:offsetCodeCount])
+	//copy(w.literalFreq[:], t.litHist[:])
+	*(*[256]uint16)(w.literalFreq[:]) = t.litHist
+	//copy(w.literalFreq[256:], t.extraHist[:])
+	*(*[32]uint16)(w.literalFreq[256:]) = t.extraHist
+	w.offsetFreq = t.offHist
 
 	if t.n == 0 {
 		return
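The indexTokens change leans on the fact that Go arrays are values: one assignment through a converted array pointer copies the whole histogram, replacing three bounds-checked copy calls. A small sketch with illustrative sizes (the diff uses 256 and 32):

package main

import "fmt"

func main() {
	hist := [4]uint16{1, 2, 3, 4} // an array: a value with compile-time size
	freq := make([]uint16, 8)

	// One assignment copies all four elements into freq[0:4].
	*(*[4]uint16)(freq) = hist

	hist[0] = 99 // freq is unaffected: the assignment copied by value
	fmt.Println(freq, hist) // [1 2 3 4 0 0 0 0] [99 2 3 4]
}

The same value semantics explain the plain `w.offsetFreq = t.offHist` line: when both sides are array types, assignment is itself a full copy, not an aliasing of storage.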
diff --git a/vendor/github.com/klauspost/compress/flate/huffman_code.go b/vendor/github.com/klauspost/compress/flate/huffman_code.go
index 5ac144f28..be7b58b47 100644
--- a/vendor/github.com/klauspost/compress/flate/huffman_code.go
+++ b/vendor/github.com/klauspost/compress/flate/huffman_code.go
@@ -168,13 +168,18 @@ func (h *huffmanEncoder) canReuseBits(freq []uint16) int {
 // The cases of 0, 1, and 2 literals are handled by special case code.
 //
 // list An array of the literals with non-zero frequencies
-// and their associated frequencies. The array is in order of increasing
-// frequency, and has as its last element a special element with frequency
-// MaxInt32
+//
+//	and their associated frequencies. The array is in order of increasing
+//	frequency, and has as its last element a special element with frequency
+//	MaxInt32
+//
 // maxBits The maximum number of bits that should be used to encode any literal.
-// Must be less than 16.
+//
+//	Must be less than 16.
+//
 // return An integer array in which array[i] indicates the number of literals
-// that should be encoded in i bits.
+//
+//	that should be encoded in i bits.
 func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {
 	if maxBits >= maxBitsLimit {
 		panic("flate: maxBits too large")
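For the bitCounts contract being re-documented here: with the list sorted by increasing frequency, a caller peels bitCount[i] symbols off the front for the largest i first, so the rarest symbols receive the longest codes. A hedged sketch of that consumption step; the names are hypothetical, not the package's:

package main

import "fmt"

// assignLengths consumes a bitCounts-style result: bitCount[i] symbols get
// i-bit codes, longest codes first, drawn from the least-frequent end.
func assignLengths(symsByFreq []byte, bitCount []int32) map[byte]int32 {
	lengths := make(map[byte]int32)
	for bits := int32(len(bitCount) - 1); bits > 0; bits-- {
		n := bitCount[bits]
		for _, sym := range symsByFreq[:n] {
			lengths[sym] = bits
		}
		symsByFreq = symsByFreq[n:]
	}
	return lengths
}

func main() {
	// 'd' rarest ... 'a' most frequent. The counts satisfy Kraft's
	// inequality exactly: 2*2^-3 + 2^-2 + 2^-1 = 1.
	fmt.Println(assignLengths([]byte("dcba"), []int32{0, 1, 1, 2}))
	// map[a:1 b:2 c:3 d:3]
}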
diff --git a/vendor/github.com/klauspost/compress/flate/level1.go b/vendor/github.com/klauspost/compress/flate/level1.go
index 0f14f8d63..703b9a89a 100644
--- a/vendor/github.com/klauspost/compress/flate/level1.go
+++ b/vendor/github.com/klauspost/compress/flate/level1.go
@@ -19,6 +19,7 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 	const (
 		inputMargin            = 12 - 1
 		minNonLiteralBlockSize = 1 + 1 + inputMargin
+		hashBytes              = 5
 	)
 	if debugDeflate && e.cur < 0 {
 		panic(fmt.Sprint("e.cur < 0: ", e.cur))
@@ -68,7 +69,7 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 	sLimit := int32(len(src) - inputMargin)
 
 	// nextEmit is where in src the next emitLiteral should start from.
-	cv := load3232(src, s)
+	cv := load6432(src, s)
 
 	for {
 		const skipLog = 5
@@ -77,7 +78,7 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 		nextS := s
 		var candidate tableEntry
 		for {
-			nextHash := hash(cv)
+			nextHash := hashLen(cv, tableBits, hashBytes)
 			candidate = e.table[nextHash]
 			nextS = s + doEvery + (s-nextEmit)>>skipLog
 			if nextS > sLimit {
 				goto emitRemainder
 			}
 
 			now := load6432(src, nextS)
 			e.table[nextHash] = tableEntry{offset: s + e.cur}
-			nextHash = hash(uint32(now))
+			nextHash = hashLen(now, tableBits, hashBytes)
 
 			offset := s - (candidate.offset - e.cur)
-			if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) {
+			if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) {
 				e.table[nextHash] = tableEntry{offset: nextS + e.cur}
 				break
 			}
 
 			// Do one right away...
-			cv = uint32(now)
+			cv = now
 			s = nextS
 			nextS++
 			candidate = e.table[nextHash]
@@ -103,11 +104,11 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 			e.table[nextHash] = tableEntry{offset: s + e.cur}
 
 			offset = s - (candidate.offset - e.cur)
-			if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) {
+			if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) {
 				e.table[nextHash] = tableEntry{offset: nextS + e.cur}
 				break
 			}
-			cv = uint32(now)
+			cv = now
 			s = nextS
 		}
@@ -198,9 +199,9 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 		}
 		if s >= sLimit {
 			// Index first pair after match end.
-			if int(s+l+4) < len(src) {
-				cv := load3232(src, s)
-				e.table[hash(cv)] = tableEntry{offset: s + e.cur}
+			if int(s+l+8) < len(src) {
+				cv := load6432(src, s)
+				e.table[hashLen(cv, tableBits, hashBytes)] = tableEntry{offset: s + e.cur}
 			}
 			goto emitRemainder
 		}
@@ -213,16 +214,16 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 		// three load32 calls.
 		x := load6432(src, s-2)
 		o := e.cur + s - 2
-		prevHash := hash(uint32(x))
+		prevHash := hashLen(x, tableBits, hashBytes)
 		e.table[prevHash] = tableEntry{offset: o}
 		x >>= 16
-		currHash := hash(uint32(x))
+		currHash := hashLen(x, tableBits, hashBytes)
 		candidate = e.table[currHash]
 		e.table[currHash] = tableEntry{offset: o + 2}
 
 		offset := s - (candidate.offset - e.cur)
 		if offset > maxMatchOffset || uint32(x) != load3232(src, candidate.offset-e.cur) {
-			cv = uint32(x >> 8)
+			cv = x >> 8
 			s++
 			break
 		}
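The probe step `nextS = s + doEvery + (s-nextEmit)>>skipLog` used throughout these fast encoders advances further the longer the current literal run has gone without a match, trading ratio for speed on incompressible stretches. A quick illustration of the step growth (doEvery is assumed to be 2 here; its value is not shown in this hunk):

package main

import "fmt"

func main() {
	const doEvery, skipLog = 2, 5
	// run = s - nextEmit: bytes emitted as literals since the last match.
	for _, run := range []int32{0, 32, 128, 512, 2048} {
		fmt.Printf("literal run %4d -> advance %d\n", run, doEvery+run>>skipLog)
	}
}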
diff --git a/vendor/github.com/klauspost/compress/flate/level2.go b/vendor/github.com/klauspost/compress/flate/level2.go
index 8603fbd55..876dfbe30 100644
--- a/vendor/github.com/klauspost/compress/flate/level2.go
+++ b/vendor/github.com/klauspost/compress/flate/level2.go
@@ -16,6 +16,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 	const (
 		inputMargin            = 12 - 1
 		minNonLiteralBlockSize = 1 + 1 + inputMargin
+		hashBytes              = 5
 	)
 
 	if debugDeflate && e.cur < 0 {
@@ -66,7 +67,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 	sLimit := int32(len(src) - inputMargin)
 
 	// nextEmit is where in src the next emitLiteral should start from.
-	cv := load3232(src, s)
+	cv := load6432(src, s)
 	for {
 		// When should we start skipping if we haven't found matches in a long while.
 		const skipLog = 5
@@ -75,7 +76,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 		nextS := s
 		var candidate tableEntry
 		for {
-			nextHash := hash4u(cv, bTableBits)
+			nextHash := hashLen(cv, bTableBits, hashBytes)
 			s = nextS
 			nextS = s + doEvery + (s-nextEmit)>>skipLog
 			if nextS > sLimit {
@@ -84,16 +85,16 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 			candidate = e.table[nextHash]
 			now := load6432(src, nextS)
 			e.table[nextHash] = tableEntry{offset: s + e.cur}
-			nextHash = hash4u(uint32(now), bTableBits)
+			nextHash = hashLen(now, bTableBits, hashBytes)
 
 			offset := s - (candidate.offset - e.cur)
-			if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) {
+			if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) {
 				e.table[nextHash] = tableEntry{offset: nextS + e.cur}
 				break
 			}
 
 			// Do one right away...
-			cv = uint32(now)
+			cv = now
 			s = nextS
 			nextS++
 			candidate = e.table[nextHash]
@@ -101,10 +102,10 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 			e.table[nextHash] = tableEntry{offset: s + e.cur}
 
 			offset = s - (candidate.offset - e.cur)
-			if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) {
+			if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) {
 				break
 			}
-			cv = uint32(now)
+			cv = now
 		}
@@ -154,9 +155,9 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 
 		if s >= sLimit {
 			// Index first pair after match end.
-			if int(s+l+4) < len(src) {
-				cv := load3232(src, s)
-				e.table[hash4u(cv, bTableBits)] = tableEntry{offset: s + e.cur}
+			if int(s+l+8) < len(src) {
+				cv := load6432(src, s)
+				e.table[hashLen(cv, bTableBits, hashBytes)] = tableEntry{offset: s + e.cur}
 			}
 			goto emitRemainder
 		}
@@ -164,15 +165,15 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 		// Store every second hash in-between, but offset by 1.
 		for i := s - l + 2; i < s-5; i += 7 {
 			x := load6432(src, i)
-			nextHash := hash4u(uint32(x), bTableBits)
+			nextHash := hashLen(x, bTableBits, hashBytes)
 			e.table[nextHash] = tableEntry{offset: e.cur + i}
 			// Skip one
 			x >>= 16
-			nextHash = hash4u(uint32(x), bTableBits)
+			nextHash = hashLen(x, bTableBits, hashBytes)
 			e.table[nextHash] = tableEntry{offset: e.cur + i + 2}
 			// Skip one
 			x >>= 16
-			nextHash = hash4u(uint32(x), bTableBits)
+			nextHash = hashLen(x, bTableBits, hashBytes)
 			e.table[nextHash] = tableEntry{offset: e.cur + i + 4}
 		}
@@ -184,17 +185,17 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 		// three load32 calls.
 		x := load6432(src, s-2)
 		o := e.cur + s - 2
-		prevHash := hash4u(uint32(x), bTableBits)
-		prevHash2 := hash4u(uint32(x>>8), bTableBits)
+		prevHash := hashLen(x, bTableBits, hashBytes)
+		prevHash2 := hashLen(x>>8, bTableBits, hashBytes)
 		e.table[prevHash] = tableEntry{offset: o}
 		e.table[prevHash2] = tableEntry{offset: o + 1}
-		currHash := hash4u(uint32(x>>16), bTableBits)
+		currHash := hashLen(x>>16, bTableBits, hashBytes)
 		candidate = e.table[currHash]
 		e.table[currHash] = tableEntry{offset: o + 2}
 
 		offset := s - (candidate.offset - e.cur)
 		if offset > maxMatchOffset || uint32(x>>16) != load3232(src, candidate.offset-e.cur) {
-			cv = uint32(x >> 24)
+			cv = x >> 24
 			s++
 			break
 		}
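The `x >>= 16` pattern in the hunks above reuses one 64-bit load to hash several adjacent positions instead of issuing a fresh load per position. The equivalence it relies on, checked directly:

package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	src := []byte("abcdefgh")
	x := binary.LittleEndian.Uint64(src) // one load covers s..s+7

	// Shifting replaces further loads: x>>8 starts at s+1, x>>16 at s+2.
	fmt.Println(uint32(x>>8) == binary.LittleEndian.Uint32(src[1:]))  // true
	fmt.Println(uint32(x>>16) == binary.LittleEndian.Uint32(src[2:])) // true
}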
diff --git a/vendor/github.com/klauspost/compress/flate/level3.go b/vendor/github.com/klauspost/compress/flate/level3.go
index 039639f89..7aa2b72a1 100644
--- a/vendor/github.com/klauspost/compress/flate/level3.go
+++ b/vendor/github.com/klauspost/compress/flate/level3.go
@@ -11,10 +11,11 @@ type fastEncL3 struct {
 // Encode uses a similar algorithm to level 2, will check up to two candidates.
 func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 	const (
-		inputMargin            = 8 - 1
+		inputMargin            = 12 - 1
 		minNonLiteralBlockSize = 1 + 1 + inputMargin
 		tableBits              = 16
 		tableSize              = 1 << tableBits
+		hashBytes              = 5
 	)
 
 	if debugDeflate && e.cur < 0 {
@@ -69,20 +70,20 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 	sLimit := int32(len(src) - inputMargin)
 
 	// nextEmit is where in src the next emitLiteral should start from.
-	cv := load3232(src, s)
+	cv := load6432(src, s)
 	for {
-		const skipLog = 6
+		const skipLog = 7
 		nextS := s
 		var candidate tableEntry
 		for {
-			nextHash := hash4u(cv, tableBits)
+			nextHash := hashLen(cv, tableBits, hashBytes)
 			s = nextS
 			nextS = s + 1 + (s-nextEmit)>>skipLog
 			if nextS > sLimit {
 				goto emitRemainder
 			}
 			candidates := e.table[nextHash]
-			now := load3232(src, nextS)
+			now := load6432(src, nextS)
 
 			// Safe offset distance until s + 4...
 			minOffset := e.cur + s - (maxMatchOffset - 4)
@@ -96,8 +97,8 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 				continue
 			}
 
-			if cv == load3232(src, candidate.offset-e.cur) {
-				if candidates.Prev.offset < minOffset || cv != load3232(src, candidates.Prev.offset-e.cur) {
+			if uint32(cv) == load3232(src, candidate.offset-e.cur) {
+				if candidates.Prev.offset < minOffset || uint32(cv) != load3232(src, candidates.Prev.offset-e.cur) {
 					break
 				}
 				// Both match and are valid, pick longest.
@@ -112,7 +113,7 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 				// We only check if value mismatches.
 				// Offset will always be invalid in other cases.
 				candidate = candidates.Prev
-				if candidate.offset > minOffset && cv == load3232(src, candidate.offset-e.cur) {
+				if candidate.offset > minOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) {
 					break
 				}
 			}
@@ -164,9 +165,9 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 		if s >= sLimit {
 			t += l
 			// Index first pair after match end.
-			if int(t+4) < len(src) && t > 0 {
-				cv := load3232(src, t)
-				nextHash := hash4u(cv, tableBits)
+			if int(t+8) < len(src) && t > 0 {
+				cv = load6432(src, t)
+				nextHash := hashLen(cv, tableBits, hashBytes)
 				e.table[nextHash] = tableEntryPrev{
 					Prev: e.table[nextHash].Cur,
 					Cur:  tableEntry{offset: e.cur + t},
@@ -176,8 +177,8 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 		}
 
 		// Store every 5th hash in-between.
-		for i := s - l + 2; i < s-5; i += 5 {
-			nextHash := hash4u(load3232(src, i), tableBits)
+		for i := s - l + 2; i < s-5; i += 6 {
+			nextHash := hashLen(load6432(src, i), tableBits, hashBytes)
 			e.table[nextHash] = tableEntryPrev{
 				Prev: e.table[nextHash].Cur,
 				Cur:  tableEntry{offset: e.cur + i}}
@@ -185,23 +186,23 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 		// We could immediately start working at s now, but to improve
 		// compression we first update the hash table at s-2 to s.
 		x := load6432(src, s-2)
-		prevHash := hash4u(uint32(x), tableBits)
+		prevHash := hashLen(x, tableBits, hashBytes)
 		e.table[prevHash] = tableEntryPrev{
 			Prev: e.table[prevHash].Cur,
 			Cur:  tableEntry{offset: e.cur + s - 2},
 		}
 		x >>= 8
-		prevHash = hash4u(uint32(x), tableBits)
+		prevHash = hashLen(x, tableBits, hashBytes)
 		e.table[prevHash] = tableEntryPrev{
 			Prev: e.table[prevHash].Cur,
 			Cur:  tableEntry{offset: e.cur + s - 1},
 		}
 		x >>= 8
-		currHash := hash4u(uint32(x), tableBits)
+		currHash := hashLen(x, tableBits, hashBytes)
 		candidates := e.table[currHash]
-		cv = uint32(x)
+		cv = x
 		e.table[currHash] = tableEntryPrev{
 			Prev: candidates.Cur,
 			Cur:  tableEntry{offset: s + e.cur},
 		}
@@ -212,17 +213,17 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 		minOffset := e.cur + s - (maxMatchOffset - 4)
 
 		if candidate.offset > minOffset {
-			if cv == load3232(src, candidate.offset-e.cur) {
+			if uint32(cv) == load3232(src, candidate.offset-e.cur) {
 				// Found a match...
 				continue
 			}
 			candidate = candidates.Prev
-			if candidate.offset > minOffset && cv == load3232(src, candidate.offset-e.cur) {
+			if candidate.offset > minOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) {
 				// Match at prev...
 				continue
 			}
 		}
-		cv = uint32(x >> 8)
+		cv = x >> 8
 		s++
 		break
 	}
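Level 3's tableEntryPrev keeps two candidates per hash bucket; every insert demotes the current entry to Prev, so lookups can try the newest candidate first and fall back to the older one. A minimal sketch of the update rule:

package main

import "fmt"

type tableEntry struct{ offset int32 }

// Two candidates per bucket, newest first, mirroring tableEntryPrev above.
type tableEntryPrev struct {
	Cur  tableEntry
	Prev tableEntry
}

func insert(b *tableEntryPrev, off int32) {
	b.Prev, b.Cur = b.Cur, tableEntry{offset: off} // demote Cur, keep newest
}

func main() {
	var bucket tableEntryPrev
	insert(&bucket, 100)
	insert(&bucket, 200)
	fmt.Println(bucket.Cur.offset, bucket.Prev.offset) // 200 100
}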
diff --git a/vendor/github.com/klauspost/compress/flate/level4.go b/vendor/github.com/klauspost/compress/flate/level4.go
index 1cbffa1ae..23c08b325 100644
--- a/vendor/github.com/klauspost/compress/flate/level4.go
+++ b/vendor/github.com/klauspost/compress/flate/level4.go
@@ -12,6 +12,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) {
 	const (
 		inputMargin            = 12 - 1
 		minNonLiteralBlockSize = 1 + 1 + inputMargin
+		hashShortBytes         = 4
 	)
 	if debugDeflate && e.cur < 0 {
 		panic(fmt.Sprint("e.cur < 0: ", e.cur))
@@ -80,7 +81,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) {
 		nextS := s
 		var t int32
 		for {
-			nextHashS := hash4x64(cv, tableBits)
+			nextHashS := hashLen(cv, tableBits, hashShortBytes)
 			nextHashL := hash7(cv, tableBits)
 
 			s = nextS
@@ -168,7 +169,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) {
 			// Index first pair after match end.
 			if int(s+8) < len(src) {
 				cv := load6432(src, s)
-				e.table[hash4x64(cv, tableBits)] = tableEntry{offset: s + e.cur}
+				e.table[hashLen(cv, tableBits, hashShortBytes)] = tableEntry{offset: s + e.cur}
 				e.bTable[hash7(cv, tableBits)] = tableEntry{offset: s + e.cur}
 			}
 			goto emitRemainder
@@ -183,7 +184,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) {
 			t2 := tableEntry{offset: t.offset + 1}
 			e.bTable[hash7(cv, tableBits)] = t
 			e.bTable[hash7(cv>>8, tableBits)] = t2
-			e.table[hash4u(uint32(cv>>8), tableBits)] = t2
+			e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2
 
 			i += 3
 			for ; i < s-1; i += 3 {
 				cv := load6432(src, i)
 				t := tableEntry{offset: i + e.cur}
 				t2 := tableEntry{offset: t.offset + 1}
 				e.bTable[hash7(cv, tableBits)] = t
 				e.bTable[hash7(cv>>8, tableBits)] = t2
-				e.table[hash4u(uint32(cv>>8), tableBits)] = t2
+				e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2
 			}
 		}
 	}
@@ -201,7 +202,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) {
 	// compression we first update the hash table at s-1 and at s.
 	x := load6432(src, s-1)
 	o := e.cur + s - 1
-	prevHashS := hash4x64(x, tableBits)
+	prevHashS := hashLen(x, tableBits, hashShortBytes)
 	prevHashL := hash7(x, tableBits)
 	e.table[prevHashS] = tableEntry{offset: o}
 	e.bTable[prevHashL] = tableEntry{offset: o}
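Levels 4 through 6 pair the 4-byte short hash with the 7-byte hash7 in a second table: the short table finds matches early, while a hit in the long table guarantees a longer shared prefix and is preferred. A rough sketch of that preference order; maps stand in for the fixed-size arrays the real encoder uses:

package main

import "fmt"

type entry struct{ offset int32 }

type dualTable struct {
	short map[uint32]entry // indexed by a hash of 4 bytes
	long  map[uint32]entry // indexed by a hash of 7 bytes
}

// lookup prefers the long-hash candidate: its 7 matching bytes make a
// long extension far more likely than the short table's 4.
func (t *dualTable) lookup(h4, h7 uint32) (entry, bool) {
	if e, ok := t.long[h7]; ok {
		return e, true
	}
	e, ok := t.short[h4]
	return e, ok
}

func main() {
	t := dualTable{
		short: map[uint32]entry{1: {offset: 10}},
		long:  map[uint32]entry{2: {offset: 20}},
	}
	fmt.Println(t.lookup(1, 2)) // {20} true: the long-hash candidate wins
}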
diff --git a/vendor/github.com/klauspost/compress/flate/level5.go b/vendor/github.com/klauspost/compress/flate/level5.go
index 4b97576bd..83ef50ba4 100644
--- a/vendor/github.com/klauspost/compress/flate/level5.go
+++ b/vendor/github.com/klauspost/compress/flate/level5.go
@@ -12,6 +12,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 	const (
 		inputMargin            = 12 - 1
 		minNonLiteralBlockSize = 1 + 1 + inputMargin
+		hashShortBytes         = 4
 	)
 	if debugDeflate && e.cur < 0 {
 		panic(fmt.Sprint("e.cur < 0: ", e.cur))
@@ -88,7 +89,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 		var l int32
 		var t int32
 		for {
-			nextHashS := hash4x64(cv, tableBits)
+			nextHashS := hashLen(cv, tableBits, hashShortBytes)
 			nextHashL := hash7(cv, tableBits)
 
 			s = nextS
@@ -105,7 +106,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 			eLong := &e.bTable[nextHashL]
 			eLong.Cur, eLong.Prev = entry, eLong.Cur
 
-			nextHashS = hash4x64(next, tableBits)
+			nextHashS = hashLen(next, tableBits, hashShortBytes)
 			nextHashL = hash7(next, tableBits)
 
 			t = lCandidate.Cur.offset - e.cur
@@ -191,14 +192,21 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 			// Try to locate a better match by checking the end of best match...
 			if sAt := s + l; l < 30 && sAt < sLimit {
+				// Allow some bytes at the beginning to mismatch.
+				// Sweet spot is 2/3 bytes depending on input.
+				// 3 is only a little better when it is but sometimes a lot worse.
+				// The skipped bytes are tested in Extend backwards,
+				// and still picked up as part of the match if they do.
+				const skipBeginning = 2
 				eLong := e.bTable[hash7(load6432(src, sAt), tableBits)].Cur.offset
-				// Test current
-				t2 := eLong - e.cur - l
-				off := s - t2
+				t2 := eLong - e.cur - l + skipBeginning
+				s2 := s + skipBeginning
+				off := s2 - t2
 				if t2 >= 0 && off < maxMatchOffset && off > 0 {
-					if l2 := e.matchlenLong(s, t2, src); l2 > l {
+					if l2 := e.matchlenLong(s2, t2, src); l2 > l {
 						t = t2
 						l = l2
+						s = s2
 					}
 				}
 			}
@@ -250,7 +258,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 			if i < s-1 {
 				cv := load6432(src, i)
 				t := tableEntry{offset: i + e.cur}
-				e.table[hash4x64(cv, tableBits)] = t
+				e.table[hashLen(cv, tableBits, hashShortBytes)] = t
 				eLong := &e.bTable[hash7(cv, tableBits)]
 				eLong.Cur, eLong.Prev = t, eLong.Cur
 
@@ -263,7 +271,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 				// We only have enough bits for a short entry at i+2
 				cv >>= 8
 				t = tableEntry{offset: t.offset + 1}
-				e.table[hash4x64(cv, tableBits)] = t
+				e.table[hashLen(cv, tableBits, hashShortBytes)] = t
 
 				// Skip one - otherwise we risk hitting 's'
 				i += 4
@@ -273,7 +281,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 					t2 := tableEntry{offset: t.offset + 1}
 					eLong := &e.bTable[hash7(cv, tableBits)]
 					eLong.Cur, eLong.Prev = t, eLong.Cur
-					e.table[hash4u(uint32(cv>>8), tableBits)] = t2
+					e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2
 				}
 			}
 		}
@@ -282,7 +290,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 	// compression we first update the hash table at s-1 and at s.
 	x := load6432(src, s-1)
 	o := e.cur + s - 1
-	prevHashS := hash4x64(x, tableBits)
+	prevHashS := hashLen(x, tableBits, hashShortBytes)
 	prevHashL := hash7(x, tableBits)
 	e.table[prevHashS] = tableEntry{offset: o}
 	eLong := &e.bTable[prevHashL]
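The skipBeginning trick probes a candidate two bytes past its start, so a match whose first bytes happen to differ is not rejected outright; if the skipped bytes do match after all, the encoder's separate extend-backwards step reclaims them. An illustrative comparison:

package main

import "fmt"

func matchLen(a, b []byte) int {
	n := 0
	for n < len(a) && n < len(b) && a[n] == b[n] {
		n++
	}
	return n
}

func main() {
	const skipBeginning = 2
	cand := []byte("XXcdefghij") // candidate whose first two bytes differ
	cur := []byte("abcdefghij")

	fmt.Println(matchLen(cand, cur)) // 0: rejected when compared head-on
	fmt.Println(matchLen(cand[skipBeginning:], cur[skipBeginning:])) // 8
}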
diff --git a/vendor/github.com/klauspost/compress/flate/level6.go b/vendor/github.com/klauspost/compress/flate/level6.go
index 62888edf3..f1e9d98fa 100644
--- a/vendor/github.com/klauspost/compress/flate/level6.go
+++ b/vendor/github.com/klauspost/compress/flate/level6.go
@@ -12,6 +12,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 	const (
 		inputMargin            = 12 - 1
 		minNonLiteralBlockSize = 1 + 1 + inputMargin
+		hashShortBytes         = 4
 	)
 	if debugDeflate && e.cur < 0 {
 		panic(fmt.Sprint("e.cur < 0: ", e.cur))
@@ -90,7 +91,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 		var l int32
 		var t int32
 		for {
-			nextHashS := hash4x64(cv, tableBits)
+			nextHashS := hashLen(cv, tableBits, hashShortBytes)
 			nextHashL := hash7(cv, tableBits)
 			s = nextS
 			nextS = s + doEvery + (s-nextEmit)>>skipLog
@@ -107,7 +108,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 			eLong.Cur, eLong.Prev = entry, eLong.Cur
 
 			// Calculate hashes of 'next'
-			nextHashS = hash4x64(next, tableBits)
+			nextHashS = hashLen(next, tableBits, hashShortBytes)
 			nextHashL = hash7(next, tableBits)
 
 			t = lCandidate.Cur.offset - e.cur
@@ -213,24 +214,33 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 			// Try to locate a better match by checking the end-of-match...
 			if sAt := s + l; sAt < sLimit {
+				// Allow some bytes at the beginning to mismatch.
+				// Sweet spot is 2/3 bytes depending on input.
+				// 3 is only a little better when it is but sometimes a lot worse.
+				// The skipped bytes are tested in Extend backwards,
+				// and still picked up as part of the match if they do.
+				const skipBeginning = 2
 				eLong := &e.bTable[hash7(load6432(src, sAt), tableBits)]
 				// Test current
-				t2 := eLong.Cur.offset - e.cur - l
-				off := s - t2
+				t2 := eLong.Cur.offset - e.cur - l + skipBeginning
+				s2 := s + skipBeginning
+				off := s2 - t2
 				if off < maxMatchOffset {
 					if off > 0 && t2 >= 0 {
-						if l2 := e.matchlenLong(s, t2, src); l2 > l {
+						if l2 := e.matchlenLong(s2, t2, src); l2 > l {
 							t = t2
 							l = l2
+							s = s2
 						}
 					}
 					// Test next:
-					t2 = eLong.Prev.offset - e.cur - l
-					off := s - t2
+					t2 = eLong.Prev.offset - e.cur - l + skipBeginning
+					off := s2 - t2
 					if off > 0 && off < maxMatchOffset && t2 >= 0 {
-						if l2 := e.matchlenLong(s, t2, src); l2 > l {
+						if l2 := e.matchlenLong(s2, t2, src); l2 > l {
 							t = t2
 							l = l2
+							s = s2
 						}
 					}
 				}
@@ -277,7 +287,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 			// Index after match end.
 			for i := nextS + 1; i < int32(len(src))-8; i += 2 {
 				cv := load6432(src, i)
-				e.table[hash4x64(cv, tableBits)] = tableEntry{offset: i + e.cur}
+				e.table[hashLen(cv, tableBits, hashShortBytes)] = tableEntry{offset: i + e.cur}
 				eLong := &e.bTable[hash7(cv, tableBits)]
 				eLong.Cur, eLong.Prev = tableEntry{offset: i + e.cur}, eLong.Cur
 			}
@@ -292,7 +302,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 				t2 := tableEntry{offset: t.offset + 1}
 				eLong := &e.bTable[hash7(cv, tableBits)]
 				eLong2 := &e.bTable[hash7(cv>>8, tableBits)]
-				e.table[hash4x64(cv, tableBits)] = t
+				e.table[hashLen(cv, tableBits, hashShortBytes)] = t
 				eLong.Cur, eLong.Prev = t, eLong.Cur
 				eLong2.Cur, eLong2.Prev = t2, eLong2.Cur
 			}
diff --git a/vendor/github.com/klauspost/compress/flate/stateless.go b/vendor/github.com/klauspost/compress/flate/stateless.go
index 93a1d1503..f3d4139ef 100644
--- a/vendor/github.com/klauspost/compress/flate/stateless.go
+++ b/vendor/github.com/klauspost/compress/flate/stateless.go
@@ -86,11 +86,19 @@ func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error {
 		dict = dict[len(dict)-maxStatelessDict:]
 	}
 
+	// For subsequent loops, keep shallow dict reference to avoid alloc+copy.
+	var inDict []byte
+
 	for len(in) > 0 {
 		todo := in
-		if len(todo) > maxStatelessBlock-len(dict) {
+		if len(inDict) > 0 {
+			if len(todo) > maxStatelessBlock-maxStatelessDict {
+				todo = todo[:maxStatelessBlock-maxStatelessDict]
+			}
+		} else if len(todo) > maxStatelessBlock-len(dict) {
 			todo = todo[:maxStatelessBlock-len(dict)]
 		}
+		inOrg := in
 		in = in[len(todo):]
 		uncompressed := todo
 		if len(dict) > 0 {
@@ -102,7 +110,11 @@ func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error {
 			todo = combined
 		}
 		// Compress
-		statelessEnc(&dst, todo, int16(len(dict)))
+		if len(inDict) == 0 {
+			statelessEnc(&dst, todo, int16(len(dict)))
+		} else {
+			statelessEnc(&dst, inDict[:maxStatelessDict+len(todo)], maxStatelessDict)
+		}
 		isEof := eof && len(in) == 0
 
 		if dst.n == 0 {
@@ -119,7 +131,8 @@ func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error {
 		}
 		if len(in) > 0 {
 			// Retain a dict if we have more
-			dict = todo[len(todo)-maxStatelessDict:]
+			inDict = inOrg[len(uncompressed)-maxStatelessDict:]
+			dict = nil
 			dst.Reset()
 		}
 		if bw.err != nil {
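For context on the function being patched, a round-trip through StatelessDeflate as exported by this package; the output is plain deflate, readable by the standard library:

package main

import (
	"bytes"
	"compress/flate"
	"fmt"
	"io"

	kflate "github.com/klauspost/compress/flate"
)

func main() {
	payload := bytes.Repeat([]byte("stateless deflate "), 100)

	var buf bytes.Buffer
	// eof=true emits a final block; dict=nil means no preset dictionary.
	if err := kflate.StatelessDeflate(&buf, payload, true, nil); err != nil {
		panic(err)
	}

	got, err := io.ReadAll(flate.NewReader(&buf))
	if err != nil {
		panic(err)
	}
	fmt.Println(bytes.Equal(got, payload), "->", buf.Len(), "compressed bytes")
}

The patch above keeps a shallow reference into the caller's input (inDict) instead of concatenating dictionary and block into a fresh buffer on every loop iteration, which is where the removed alloc+copy used to happen.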