diff options
| author | 2023-02-27 10:21:58 +0100 | |
|---|---|---|
| committer | 2023-02-27 10:21:58 +0100 | |
| commit | 752c38b0d5c01dbcc8f846518a61cbde4b7537cf (patch) | |
| tree | bf00aaf2eb2703cd50491e5359099159665ecaab /vendor/github.com/klauspost/compress | |
| parent | [chore] Use latest containers when building (#1554) (diff) | |
| download | gotosocial-752c38b0d5c01dbcc8f846518a61cbde4b7537cf.tar.xz | |
[chore]: Bump github.com/minio/minio-go/v7 from 7.0.48 to 7.0.49 (#1567)
Bumps [github.com/minio/minio-go/v7](https://github.com/minio/minio-go) from 7.0.48 to 7.0.49.
- [Release notes](https://github.com/minio/minio-go/releases)
- [Commits](https://github.com/minio/minio-go/compare/v7.0.48...v7.0.49)
---
updated-dependencies:
- dependency-name: github.com/minio/minio-go/v7
  dependency-type: direct:production
  update-type: version-update:semver-patch
...
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Diffstat (limited to 'vendor/github.com/klauspost/compress')
22 files changed, 1060 insertions, 810 deletions
| diff --git a/vendor/github.com/klauspost/compress/flate/deflate.go b/vendor/github.com/klauspost/compress/flate/deflate.go index f8435998e..82882961a 100644 --- a/vendor/github.com/klauspost/compress/flate/deflate.go +++ b/vendor/github.com/klauspost/compress/flate/deflate.go @@ -131,7 +131,8 @@ func (d *compressor) fillDeflate(b []byte) int {  	s := d.state  	if s.index >= 2*windowSize-(minMatchLength+maxMatchLength) {  		// shift the window by windowSize -		copy(d.window[:], d.window[windowSize:2*windowSize]) +		//copy(d.window[:], d.window[windowSize:2*windowSize]) +		*(*[windowSize]byte)(d.window) = *(*[windowSize]byte)(d.window[windowSize:])  		s.index -= windowSize  		d.windowEnd -= windowSize  		if d.blockStart >= windowSize { @@ -293,7 +294,6 @@ func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, of  	}  	offset = 0 -	cGain := 0  	if d.chain < 100 {  		for i := prevHead; tries > 0; tries-- {  			if wEnd == win[i+length] { @@ -321,10 +321,14 @@ func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, of  		return  	} +	// Minimum gain to accept a match. +	cGain := 4 +  	// Some like it higher (CSV), some like it lower (JSON) -	const baseCost = 6 +	const baseCost = 3  	// Base is 4 bytes at with an additional cost.  	// Matches must be better than this. +  	for i := prevHead; tries > 0; tries-- {  		if wEnd == win[i+length] {  			n := matchLen(win[i:i+minMatchLook], wPos) @@ -332,7 +336,7 @@ func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, of  				// Calculate gain. Estimate  				newGain := d.h.bitLengthRaw(wPos[:n]) - int(offsetExtraBits[offsetCode(uint32(pos-i))]) - baseCost - int(lengthExtraBits[lengthCodes[(n-3)&255]]) -				//fmt.Println(n, "gain:", newGain, "prev:", cGain, "raw:", d.h.bitLengthRaw(wPos[:n])) +				//fmt.Println("gain:", newGain, "prev:", cGain, "raw:", d.h.bitLengthRaw(wPos[:n]), "this-len:", n, "prev-len:", length)  				if newGain > cGain {  					length = n  					offset = pos - i @@ -373,6 +377,12 @@ func hash4(b []byte) uint32 {  	return hash4u(binary.LittleEndian.Uint32(b), hashBits)  } +// hash4 returns the hash of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <32. +func hash4u(u uint32, h uint8) uint32 { +	return (u * prime4bytes) >> (32 - h) +} +  // bulkHash4 will compute hashes using the same  // algorithm as hash4  func bulkHash4(b []byte, dst []uint32) { @@ -483,27 +493,103 @@ func (d *compressor) deflateLazy() {  		}  		if prevLength >= minMatchLength && s.length <= prevLength { -			// Check for better match at end... +			// No better match, but check for better match at end...  			// -			// checkOff must be >=2 since we otherwise risk checking s.index -			// Offset of 2 seems to yield best results. +			// Skip forward a number of bytes. +			// Offset of 2 seems to yield best results. 3 is sometimes better.  			const checkOff = 2 -			prevIndex := s.index - 1 -			if prevIndex+prevLength+checkOff < s.maxInsertIndex { -				end := lookahead -				if lookahead > maxMatchLength { -					end = maxMatchLength -				} -				end += prevIndex -				idx := prevIndex + prevLength - (4 - checkOff) -				h := hash4(d.window[idx:]) -				ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength + (4 - checkOff) -				if ch2 > minIndex { -					length := matchLen(d.window[prevIndex:end], d.window[ch2:]) -					// It seems like a pure length metric is best. -					if length > prevLength { -						prevLength = length -						prevOffset = prevIndex - ch2 + +			// Check all, except full length +			if prevLength < maxMatchLength-checkOff { +				prevIndex := s.index - 1 +				if prevIndex+prevLength < s.maxInsertIndex { +					end := lookahead +					if lookahead > maxMatchLength+checkOff { +						end = maxMatchLength + checkOff +					} +					end += prevIndex + +					// Hash at match end. +					h := hash4(d.window[prevIndex+prevLength:]) +					ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength +					if prevIndex-ch2 != prevOffset && ch2 > minIndex+checkOff { +						length := matchLen(d.window[prevIndex+checkOff:end], d.window[ch2+checkOff:]) +						// It seems like a pure length metric is best. +						if length > prevLength { +							prevLength = length +							prevOffset = prevIndex - ch2 + +							// Extend back... +							for i := checkOff - 1; i >= 0; i-- { +								if prevLength >= maxMatchLength || d.window[prevIndex+i] != d.window[ch2+i] { +									// Emit tokens we "owe" +									for j := 0; j <= i; j++ { +										d.tokens.AddLiteral(d.window[prevIndex+j]) +										if d.tokens.n == maxFlateBlockTokens { +											// The block includes the current character +											if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { +												return +											} +											d.tokens.Reset() +										} +										s.index++ +										if s.index < s.maxInsertIndex { +											h := hash4(d.window[s.index:]) +											ch := s.hashHead[h] +											s.chainHead = int(ch) +											s.hashPrev[s.index&windowMask] = ch +											s.hashHead[h] = uint32(s.index + s.hashOffset) +										} +									} +									break +								} else { +									prevLength++ +								} +							} +						} else if false { +							// Check one further ahead. +							// Only rarely better, disabled for now. +							prevIndex++ +							h := hash4(d.window[prevIndex+prevLength:]) +							ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength +							if prevIndex-ch2 != prevOffset && ch2 > minIndex+checkOff { +								length := matchLen(d.window[prevIndex+checkOff:end], d.window[ch2+checkOff:]) +								// It seems like a pure length metric is best. +								if length > prevLength+checkOff { +									prevLength = length +									prevOffset = prevIndex - ch2 +									prevIndex-- + +									// Extend back... +									for i := checkOff; i >= 0; i-- { +										if prevLength >= maxMatchLength || d.window[prevIndex+i] != d.window[ch2+i-1] { +											// Emit tokens we "owe" +											for j := 0; j <= i; j++ { +												d.tokens.AddLiteral(d.window[prevIndex+j]) +												if d.tokens.n == maxFlateBlockTokens { +													// The block includes the current character +													if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { +														return +													} +													d.tokens.Reset() +												} +												s.index++ +												if s.index < s.maxInsertIndex { +													h := hash4(d.window[s.index:]) +													ch := s.hashHead[h] +													s.chainHead = int(ch) +													s.hashPrev[s.index&windowMask] = ch +													s.hashHead[h] = uint32(s.index + s.hashOffset) +												} +											} +											break +										} else { +											prevLength++ +										} +									} +								} +							} +						}  					}  				}  			} diff --git a/vendor/github.com/klauspost/compress/flate/dict_decoder.go b/vendor/github.com/klauspost/compress/flate/dict_decoder.go index 71c75a065..bb36351a5 100644 --- a/vendor/github.com/klauspost/compress/flate/dict_decoder.go +++ b/vendor/github.com/klauspost/compress/flate/dict_decoder.go @@ -7,19 +7,19 @@ package flate  // dictDecoder implements the LZ77 sliding dictionary as used in decompression.  // LZ77 decompresses data through sequences of two forms of commands:  // -//	* Literal insertions: Runs of one or more symbols are inserted into the data -//	stream as is. This is accomplished through the writeByte method for a -//	single symbol, or combinations of writeSlice/writeMark for multiple symbols. -//	Any valid stream must start with a literal insertion if no preset dictionary -//	is used. +//   - Literal insertions: Runs of one or more symbols are inserted into the data +//     stream as is. This is accomplished through the writeByte method for a +//     single symbol, or combinations of writeSlice/writeMark for multiple symbols. +//     Any valid stream must start with a literal insertion if no preset dictionary +//     is used.  // -//	* Backward copies: Runs of one or more symbols are copied from previously -//	emitted data. Backward copies come as the tuple (dist, length) where dist -//	determines how far back in the stream to copy from and length determines how -//	many bytes to copy. Note that it is valid for the length to be greater than -//	the distance. Since LZ77 uses forward copies, that situation is used to -//	perform a form of run-length encoding on repeated runs of symbols. -//	The writeCopy and tryWriteCopy are used to implement this command. +//   - Backward copies: Runs of one or more symbols are copied from previously +//     emitted data. Backward copies come as the tuple (dist, length) where dist +//     determines how far back in the stream to copy from and length determines how +//     many bytes to copy. Note that it is valid for the length to be greater than +//     the distance. Since LZ77 uses forward copies, that situation is used to +//     perform a form of run-length encoding on repeated runs of symbols. +//     The writeCopy and tryWriteCopy are used to implement this command.  //  // For performance reasons, this implementation performs little to no sanity  // checks about the arguments. As such, the invariants documented for each diff --git a/vendor/github.com/klauspost/compress/flate/fast_encoder.go b/vendor/github.com/klauspost/compress/flate/fast_encoder.go index f781aaa62..24caf5f70 100644 --- a/vendor/github.com/klauspost/compress/flate/fast_encoder.go +++ b/vendor/github.com/klauspost/compress/flate/fast_encoder.go @@ -58,17 +58,6 @@ const (  	prime8bytes = 0xcf1bbcdcb7a56463  ) -func load32(b []byte, i int) uint32 { -	// Help the compiler eliminate bounds checks on the read so it can be done in a single read. -	b = b[i:] -	b = b[:4] -	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 -} - -func load64(b []byte, i int) uint64 { -	return binary.LittleEndian.Uint64(b[i:]) -} -  func load3232(b []byte, i int32) uint32 {  	return binary.LittleEndian.Uint32(b[i:])  } @@ -77,10 +66,6 @@ func load6432(b []byte, i int32) uint64 {  	return binary.LittleEndian.Uint64(b[i:])  } -func hash(u uint32) uint32 { -	return (u * 0x1e35a7bd) >> tableShift -} -  type tableEntry struct {  	offset int32  } @@ -104,7 +89,8 @@ func (e *fastGen) addBlock(src []byte) int32 {  			}  			// Move down  			offset := int32(len(e.hist)) - maxMatchOffset -			copy(e.hist[0:maxMatchOffset], e.hist[offset:]) +			// copy(e.hist[0:maxMatchOffset], e.hist[offset:]) +			*(*[maxMatchOffset]byte)(e.hist) = *(*[maxMatchOffset]byte)(e.hist[offset:])  			e.cur += offset  			e.hist = e.hist[:maxMatchOffset]  		} @@ -114,39 +100,36 @@ func (e *fastGen) addBlock(src []byte) int32 {  	return s  } -// hash4 returns the hash of u to fit in a hash table with h bits. -// Preferably h should be a constant and should always be <32. -func hash4u(u uint32, h uint8) uint32 { -	return (u * prime4bytes) >> (32 - h) -} -  type tableEntryPrev struct {  	Cur  tableEntry  	Prev tableEntry  } -// hash4x64 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits. -// Preferably h should be a constant and should always be <32. -func hash4x64(u uint64, h uint8) uint32 { -	return (uint32(u) * prime4bytes) >> ((32 - h) & reg8SizeMask32) -} -  // hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.  // Preferably h should be a constant and should always be <64.  func hash7(u uint64, h uint8) uint32 {  	return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & reg8SizeMask64))  } -// hash8 returns the hash of u to fit in a hash table with h bits. -// Preferably h should be a constant and should always be <64. -func hash8(u uint64, h uint8) uint32 { -	return uint32((u * prime8bytes) >> ((64 - h) & reg8SizeMask64)) -} - -// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits. -// Preferably h should be a constant and should always be <64. -func hash6(u uint64, h uint8) uint32 { -	return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & reg8SizeMask64)) +// hashLen returns a hash of the lowest mls bytes of with length output bits. +// mls must be >=3 and <=8. Any other value will return hash for 4 bytes. +// length should always be < 32. +// Preferably length and mls should be a constant for inlining. +func hashLen(u uint64, length, mls uint8) uint32 { +	switch mls { +	case 3: +		return (uint32(u<<8) * prime3bytes) >> (32 - length) +	case 5: +		return uint32(((u << (64 - 40)) * prime5bytes) >> (64 - length)) +	case 6: +		return uint32(((u << (64 - 48)) * prime6bytes) >> (64 - length)) +	case 7: +		return uint32(((u << (64 - 56)) * prime7bytes) >> (64 - length)) +	case 8: +		return uint32((u * prime8bytes) >> (64 - length)) +	default: +		return (uint32(u) * prime4bytes) >> (32 - length) +	}  }  // matchlen will return the match length between offsets and t in src. diff --git a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go index 40ef45c2f..89a5dd89f 100644 --- a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go +++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go @@ -265,9 +265,9 @@ func (w *huffmanBitWriter) writeBytes(bytes []byte) {  // Codes 0-15 are single byte codes. Codes 16-18 are followed by additional  // information. Code badCode is an end marker  // -//  numLiterals      The number of literals in literalEncoding -//  numOffsets       The number of offsets in offsetEncoding -//  litenc, offenc   The literal and offset encoder to use +//	numLiterals      The number of literals in literalEncoding +//	numOffsets       The number of offsets in offsetEncoding +//	litenc, offenc   The literal and offset encoder to use  func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litEnc, offEnc *huffmanEncoder) {  	for i := range w.codegenFreq {  		w.codegenFreq[i] = 0 @@ -460,9 +460,9 @@ func (w *huffmanBitWriter) writeOutBits() {  // Write the header of a dynamic Huffman block to the output stream.  // -//  numLiterals  The number of literals specified in codegen -//  numOffsets   The number of offsets specified in codegen -//  numCodegens  The number of codegens used in codegen +//	numLiterals  The number of literals specified in codegen +//	numOffsets   The number of offsets specified in codegen +//	numCodegens  The number of codegens used in codegen  func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, numCodegens int, isEof bool) {  	if w.err != nil {  		return @@ -790,9 +790,11 @@ func (w *huffmanBitWriter) fillTokens() {  // and offsetEncoding.  // The number of literal and offset tokens is returned.  func (w *huffmanBitWriter) indexTokens(t *tokens, filled bool) (numLiterals, numOffsets int) { -	copy(w.literalFreq[:], t.litHist[:]) -	copy(w.literalFreq[256:], t.extraHist[:]) -	copy(w.offsetFreq[:], t.offHist[:offsetCodeCount]) +	//copy(w.literalFreq[:], t.litHist[:]) +	*(*[256]uint16)(w.literalFreq[:]) = t.litHist +	//copy(w.literalFreq[256:], t.extraHist[:]) +	*(*[32]uint16)(w.literalFreq[256:]) = t.extraHist +	w.offsetFreq = t.offHist  	if t.n == 0 {  		return diff --git a/vendor/github.com/klauspost/compress/flate/huffman_code.go b/vendor/github.com/klauspost/compress/flate/huffman_code.go index 5ac144f28..be7b58b47 100644 --- a/vendor/github.com/klauspost/compress/flate/huffman_code.go +++ b/vendor/github.com/klauspost/compress/flate/huffman_code.go @@ -168,13 +168,18 @@ func (h *huffmanEncoder) canReuseBits(freq []uint16) int {  // The cases of 0, 1, and 2 literals are handled by special case code.  //  // list  An array of the literals with non-zero frequencies -//             and their associated frequencies. The array is in order of increasing -//             frequency, and has as its last element a special element with frequency -//             MaxInt32 +// +//	and their associated frequencies. The array is in order of increasing +//	frequency, and has as its last element a special element with frequency +//	MaxInt32 +//  // maxBits     The maximum number of bits that should be used to encode any literal. -//             Must be less than 16. +// +//	Must be less than 16. +//  // return      An integer array in which array[i] indicates the number of literals -//             that should be encoded in i bits. +// +//	that should be encoded in i bits.  func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {  	if maxBits >= maxBitsLimit {  		panic("flate: maxBits too large") diff --git a/vendor/github.com/klauspost/compress/flate/level1.go b/vendor/github.com/klauspost/compress/flate/level1.go index 0f14f8d63..703b9a89a 100644 --- a/vendor/github.com/klauspost/compress/flate/level1.go +++ b/vendor/github.com/klauspost/compress/flate/level1.go @@ -19,6 +19,7 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {  	const (  		inputMargin            = 12 - 1  		minNonLiteralBlockSize = 1 + 1 + inputMargin +		hashBytes              = 5  	)  	if debugDeflate && e.cur < 0 {  		panic(fmt.Sprint("e.cur < 0: ", e.cur)) @@ -68,7 +69,7 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {  	sLimit := int32(len(src) - inputMargin)  	// nextEmit is where in src the next emitLiteral should start from. -	cv := load3232(src, s) +	cv := load6432(src, s)  	for {  		const skipLog = 5 @@ -77,7 +78,7 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {  		nextS := s  		var candidate tableEntry  		for { -			nextHash := hash(cv) +			nextHash := hashLen(cv, tableBits, hashBytes)  			candidate = e.table[nextHash]  			nextS = s + doEvery + (s-nextEmit)>>skipLog  			if nextS > sLimit { @@ -86,16 +87,16 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {  			now := load6432(src, nextS)  			e.table[nextHash] = tableEntry{offset: s + e.cur} -			nextHash = hash(uint32(now)) +			nextHash = hashLen(now, tableBits, hashBytes)  			offset := s - (candidate.offset - e.cur) -			if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) { +			if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) {  				e.table[nextHash] = tableEntry{offset: nextS + e.cur}  				break  			}  			// Do one right away... -			cv = uint32(now) +			cv = now  			s = nextS  			nextS++  			candidate = e.table[nextHash] @@ -103,11 +104,11 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {  			e.table[nextHash] = tableEntry{offset: s + e.cur}  			offset = s - (candidate.offset - e.cur) -			if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) { +			if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) {  				e.table[nextHash] = tableEntry{offset: nextS + e.cur}  				break  			} -			cv = uint32(now) +			cv = now  			s = nextS  		} @@ -198,9 +199,9 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {  			}  			if s >= sLimit {  				// Index first pair after match end. -				if int(s+l+4) < len(src) { -					cv := load3232(src, s) -					e.table[hash(cv)] = tableEntry{offset: s + e.cur} +				if int(s+l+8) < len(src) { +					cv := load6432(src, s) +					e.table[hashLen(cv, tableBits, hashBytes)] = tableEntry{offset: s + e.cur}  				}  				goto emitRemainder  			} @@ -213,16 +214,16 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {  			// three load32 calls.  			x := load6432(src, s-2)  			o := e.cur + s - 2 -			prevHash := hash(uint32(x)) +			prevHash := hashLen(x, tableBits, hashBytes)  			e.table[prevHash] = tableEntry{offset: o}  			x >>= 16 -			currHash := hash(uint32(x)) +			currHash := hashLen(x, tableBits, hashBytes)  			candidate = e.table[currHash]  			e.table[currHash] = tableEntry{offset: o + 2}  			offset := s - (candidate.offset - e.cur)  			if offset > maxMatchOffset || uint32(x) != load3232(src, candidate.offset-e.cur) { -				cv = uint32(x >> 8) +				cv = x >> 8  				s++  				break  			} diff --git a/vendor/github.com/klauspost/compress/flate/level2.go b/vendor/github.com/klauspost/compress/flate/level2.go index 8603fbd55..876dfbe30 100644 --- a/vendor/github.com/klauspost/compress/flate/level2.go +++ b/vendor/github.com/klauspost/compress/flate/level2.go @@ -16,6 +16,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {  	const (  		inputMargin            = 12 - 1  		minNonLiteralBlockSize = 1 + 1 + inputMargin +		hashBytes              = 5  	)  	if debugDeflate && e.cur < 0 { @@ -66,7 +67,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {  	sLimit := int32(len(src) - inputMargin)  	// nextEmit is where in src the next emitLiteral should start from. -	cv := load3232(src, s) +	cv := load6432(src, s)  	for {  		// When should we start skipping if we haven't found matches in a long while.  		const skipLog = 5 @@ -75,7 +76,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {  		nextS := s  		var candidate tableEntry  		for { -			nextHash := hash4u(cv, bTableBits) +			nextHash := hashLen(cv, bTableBits, hashBytes)  			s = nextS  			nextS = s + doEvery + (s-nextEmit)>>skipLog  			if nextS > sLimit { @@ -84,16 +85,16 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {  			candidate = e.table[nextHash]  			now := load6432(src, nextS)  			e.table[nextHash] = tableEntry{offset: s + e.cur} -			nextHash = hash4u(uint32(now), bTableBits) +			nextHash = hashLen(now, bTableBits, hashBytes)  			offset := s - (candidate.offset - e.cur) -			if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) { +			if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) {  				e.table[nextHash] = tableEntry{offset: nextS + e.cur}  				break  			}  			// Do one right away... -			cv = uint32(now) +			cv = now  			s = nextS  			nextS++  			candidate = e.table[nextHash] @@ -101,10 +102,10 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {  			e.table[nextHash] = tableEntry{offset: s + e.cur}  			offset = s - (candidate.offset - e.cur) -			if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) { +			if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) {  				break  			} -			cv = uint32(now) +			cv = now  		}  		// A 4-byte match has been found. We'll later see if more than 4 bytes @@ -154,9 +155,9 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {  			if s >= sLimit {  				// Index first pair after match end. -				if int(s+l+4) < len(src) { -					cv := load3232(src, s) -					e.table[hash4u(cv, bTableBits)] = tableEntry{offset: s + e.cur} +				if int(s+l+8) < len(src) { +					cv := load6432(src, s) +					e.table[hashLen(cv, bTableBits, hashBytes)] = tableEntry{offset: s + e.cur}  				}  				goto emitRemainder  			} @@ -164,15 +165,15 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {  			// Store every second hash in-between, but offset by 1.  			for i := s - l + 2; i < s-5; i += 7 {  				x := load6432(src, i) -				nextHash := hash4u(uint32(x), bTableBits) +				nextHash := hashLen(x, bTableBits, hashBytes)  				e.table[nextHash] = tableEntry{offset: e.cur + i}  				// Skip one  				x >>= 16 -				nextHash = hash4u(uint32(x), bTableBits) +				nextHash = hashLen(x, bTableBits, hashBytes)  				e.table[nextHash] = tableEntry{offset: e.cur + i + 2}  				// Skip one  				x >>= 16 -				nextHash = hash4u(uint32(x), bTableBits) +				nextHash = hashLen(x, bTableBits, hashBytes)  				e.table[nextHash] = tableEntry{offset: e.cur + i + 4}  			} @@ -184,17 +185,17 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {  			// three load32 calls.  			x := load6432(src, s-2)  			o := e.cur + s - 2 -			prevHash := hash4u(uint32(x), bTableBits) -			prevHash2 := hash4u(uint32(x>>8), bTableBits) +			prevHash := hashLen(x, bTableBits, hashBytes) +			prevHash2 := hashLen(x>>8, bTableBits, hashBytes)  			e.table[prevHash] = tableEntry{offset: o}  			e.table[prevHash2] = tableEntry{offset: o + 1} -			currHash := hash4u(uint32(x>>16), bTableBits) +			currHash := hashLen(x>>16, bTableBits, hashBytes)  			candidate = e.table[currHash]  			e.table[currHash] = tableEntry{offset: o + 2}  			offset := s - (candidate.offset - e.cur)  			if offset > maxMatchOffset || uint32(x>>16) != load3232(src, candidate.offset-e.cur) { -				cv = uint32(x >> 24) +				cv = x >> 24  				s++  				break  			} diff --git a/vendor/github.com/klauspost/compress/flate/level3.go b/vendor/github.com/klauspost/compress/flate/level3.go index 039639f89..7aa2b72a1 100644 --- a/vendor/github.com/klauspost/compress/flate/level3.go +++ b/vendor/github.com/klauspost/compress/flate/level3.go @@ -11,10 +11,11 @@ type fastEncL3 struct {  // Encode uses a similar algorithm to level 2, will check up to two candidates.  func (e *fastEncL3) Encode(dst *tokens, src []byte) {  	const ( -		inputMargin            = 8 - 1 +		inputMargin            = 12 - 1  		minNonLiteralBlockSize = 1 + 1 + inputMargin  		tableBits              = 16  		tableSize              = 1 << tableBits +		hashBytes              = 5  	)  	if debugDeflate && e.cur < 0 { @@ -69,20 +70,20 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {  	sLimit := int32(len(src) - inputMargin)  	// nextEmit is where in src the next emitLiteral should start from. -	cv := load3232(src, s) +	cv := load6432(src, s)  	for { -		const skipLog = 6 +		const skipLog = 7  		nextS := s  		var candidate tableEntry  		for { -			nextHash := hash4u(cv, tableBits) +			nextHash := hashLen(cv, tableBits, hashBytes)  			s = nextS  			nextS = s + 1 + (s-nextEmit)>>skipLog  			if nextS > sLimit {  				goto emitRemainder  			}  			candidates := e.table[nextHash] -			now := load3232(src, nextS) +			now := load6432(src, nextS)  			// Safe offset distance until s + 4...  			minOffset := e.cur + s - (maxMatchOffset - 4) @@ -96,8 +97,8 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {  				continue  			} -			if cv == load3232(src, candidate.offset-e.cur) { -				if candidates.Prev.offset < minOffset || cv != load3232(src, candidates.Prev.offset-e.cur) { +			if uint32(cv) == load3232(src, candidate.offset-e.cur) { +				if candidates.Prev.offset < minOffset || uint32(cv) != load3232(src, candidates.Prev.offset-e.cur) {  					break  				}  				// Both match and are valid, pick longest. @@ -112,7 +113,7 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {  				// We only check if value mismatches.  				// Offset will always be invalid in other cases.  				candidate = candidates.Prev -				if candidate.offset > minOffset && cv == load3232(src, candidate.offset-e.cur) { +				if candidate.offset > minOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) {  					break  				}  			} @@ -164,9 +165,9 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {  			if s >= sLimit {  				t += l  				// Index first pair after match end. -				if int(t+4) < len(src) && t > 0 { -					cv := load3232(src, t) -					nextHash := hash4u(cv, tableBits) +				if int(t+8) < len(src) && t > 0 { +					cv = load6432(src, t) +					nextHash := hashLen(cv, tableBits, hashBytes)  					e.table[nextHash] = tableEntryPrev{  						Prev: e.table[nextHash].Cur,  						Cur:  tableEntry{offset: e.cur + t}, @@ -176,8 +177,8 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {  			}  			// Store every 5th hash in-between. -			for i := s - l + 2; i < s-5; i += 5 { -				nextHash := hash4u(load3232(src, i), tableBits) +			for i := s - l + 2; i < s-5; i += 6 { +				nextHash := hashLen(load6432(src, i), tableBits, hashBytes)  				e.table[nextHash] = tableEntryPrev{  					Prev: e.table[nextHash].Cur,  					Cur:  tableEntry{offset: e.cur + i}} @@ -185,23 +186,23 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {  			// We could immediately start working at s now, but to improve  			// compression we first update the hash table at s-2 to s.  			x := load6432(src, s-2) -			prevHash := hash4u(uint32(x), tableBits) +			prevHash := hashLen(x, tableBits, hashBytes)  			e.table[prevHash] = tableEntryPrev{  				Prev: e.table[prevHash].Cur,  				Cur:  tableEntry{offset: e.cur + s - 2},  			}  			x >>= 8 -			prevHash = hash4u(uint32(x), tableBits) +			prevHash = hashLen(x, tableBits, hashBytes)  			e.table[prevHash] = tableEntryPrev{  				Prev: e.table[prevHash].Cur,  				Cur:  tableEntry{offset: e.cur + s - 1},  			}  			x >>= 8 -			currHash := hash4u(uint32(x), tableBits) +			currHash := hashLen(x, tableBits, hashBytes)  			candidates := e.table[currHash] -			cv = uint32(x) +			cv = x  			e.table[currHash] = tableEntryPrev{  				Prev: candidates.Cur,  				Cur:  tableEntry{offset: s + e.cur}, @@ -212,17 +213,17 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {  			minOffset := e.cur + s - (maxMatchOffset - 4)  			if candidate.offset > minOffset { -				if cv == load3232(src, candidate.offset-e.cur) { +				if uint32(cv) == load3232(src, candidate.offset-e.cur) {  					// Found a match...  					continue  				}  				candidate = candidates.Prev -				if candidate.offset > minOffset && cv == load3232(src, candidate.offset-e.cur) { +				if candidate.offset > minOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) {  					// Match at prev...  					continue  				}  			} -			cv = uint32(x >> 8) +			cv = x >> 8  			s++  			break  		} diff --git a/vendor/github.com/klauspost/compress/flate/level4.go b/vendor/github.com/klauspost/compress/flate/level4.go index 1cbffa1ae..23c08b325 100644 --- a/vendor/github.com/klauspost/compress/flate/level4.go +++ b/vendor/github.com/klauspost/compress/flate/level4.go @@ -12,6 +12,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) {  	const (  		inputMargin            = 12 - 1  		minNonLiteralBlockSize = 1 + 1 + inputMargin +		hashShortBytes         = 4  	)  	if debugDeflate && e.cur < 0 {  		panic(fmt.Sprint("e.cur < 0: ", e.cur)) @@ -80,7 +81,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) {  		nextS := s  		var t int32  		for { -			nextHashS := hash4x64(cv, tableBits) +			nextHashS := hashLen(cv, tableBits, hashShortBytes)  			nextHashL := hash7(cv, tableBits)  			s = nextS @@ -168,7 +169,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) {  			// Index first pair after match end.  			if int(s+8) < len(src) {  				cv := load6432(src, s) -				e.table[hash4x64(cv, tableBits)] = tableEntry{offset: s + e.cur} +				e.table[hashLen(cv, tableBits, hashShortBytes)] = tableEntry{offset: s + e.cur}  				e.bTable[hash7(cv, tableBits)] = tableEntry{offset: s + e.cur}  			}  			goto emitRemainder @@ -183,7 +184,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) {  				t2 := tableEntry{offset: t.offset + 1}  				e.bTable[hash7(cv, tableBits)] = t  				e.bTable[hash7(cv>>8, tableBits)] = t2 -				e.table[hash4u(uint32(cv>>8), tableBits)] = t2 +				e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2  				i += 3  				for ; i < s-1; i += 3 { @@ -192,7 +193,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) {  					t2 := tableEntry{offset: t.offset + 1}  					e.bTable[hash7(cv, tableBits)] = t  					e.bTable[hash7(cv>>8, tableBits)] = t2 -					e.table[hash4u(uint32(cv>>8), tableBits)] = t2 +					e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2  				}  			}  		} @@ -201,7 +202,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) {  		// compression we first update the hash table at s-1 and at s.  		x := load6432(src, s-1)  		o := e.cur + s - 1 -		prevHashS := hash4x64(x, tableBits) +		prevHashS := hashLen(x, tableBits, hashShortBytes)  		prevHashL := hash7(x, tableBits)  		e.table[prevHashS] = tableEntry{offset: o}  		e.bTable[prevHashL] = tableEntry{offset: o} diff --git a/vendor/github.com/klauspost/compress/flate/level5.go b/vendor/github.com/klauspost/compress/flate/level5.go index 4b97576bd..83ef50ba4 100644 --- a/vendor/github.com/klauspost/compress/flate/level5.go +++ b/vendor/github.com/klauspost/compress/flate/level5.go @@ -12,6 +12,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {  	const (  		inputMargin            = 12 - 1  		minNonLiteralBlockSize = 1 + 1 + inputMargin +		hashShortBytes         = 4  	)  	if debugDeflate && e.cur < 0 {  		panic(fmt.Sprint("e.cur < 0: ", e.cur)) @@ -88,7 +89,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {  		var l int32  		var t int32  		for { -			nextHashS := hash4x64(cv, tableBits) +			nextHashS := hashLen(cv, tableBits, hashShortBytes)  			nextHashL := hash7(cv, tableBits)  			s = nextS @@ -105,7 +106,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {  			eLong := &e.bTable[nextHashL]  			eLong.Cur, eLong.Prev = entry, eLong.Cur -			nextHashS = hash4x64(next, tableBits) +			nextHashS = hashLen(next, tableBits, hashShortBytes)  			nextHashL = hash7(next, tableBits)  			t = lCandidate.Cur.offset - e.cur @@ -191,14 +192,21 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {  		// Try to locate a better match by checking the end of best match...  		if sAt := s + l; l < 30 && sAt < sLimit { +			// Allow some bytes at the beginning to mismatch. +			// Sweet spot is 2/3 bytes depending on input. +			// 3 is only a little better when it is but sometimes a lot worse. +			// The skipped bytes are tested in Extend backwards, +			// and still picked up as part of the match if they do. +			const skipBeginning = 2  			eLong := e.bTable[hash7(load6432(src, sAt), tableBits)].Cur.offset -			// Test current -			t2 := eLong - e.cur - l -			off := s - t2 +			t2 := eLong - e.cur - l + skipBeginning +			s2 := s + skipBeginning +			off := s2 - t2  			if t2 >= 0 && off < maxMatchOffset && off > 0 { -				if l2 := e.matchlenLong(s, t2, src); l2 > l { +				if l2 := e.matchlenLong(s2, t2, src); l2 > l {  					t = t2  					l = l2 +					s = s2  				}  			}  		} @@ -250,7 +258,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {  			if i < s-1 {  				cv := load6432(src, i)  				t := tableEntry{offset: i + e.cur} -				e.table[hash4x64(cv, tableBits)] = t +				e.table[hashLen(cv, tableBits, hashShortBytes)] = t  				eLong := &e.bTable[hash7(cv, tableBits)]  				eLong.Cur, eLong.Prev = t, eLong.Cur @@ -263,7 +271,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {  				// We only have enough bits for a short entry at i+2  				cv >>= 8  				t = tableEntry{offset: t.offset + 1} -				e.table[hash4x64(cv, tableBits)] = t +				e.table[hashLen(cv, tableBits, hashShortBytes)] = t  				// Skip one - otherwise we risk hitting 's'  				i += 4 @@ -273,7 +281,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {  					t2 := tableEntry{offset: t.offset + 1}  					eLong := &e.bTable[hash7(cv, tableBits)]  					eLong.Cur, eLong.Prev = t, eLong.Cur -					e.table[hash4u(uint32(cv>>8), tableBits)] = t2 +					e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2  				}  			}  		} @@ -282,7 +290,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {  		// compression we first update the hash table at s-1 and at s.  		x := load6432(src, s-1)  		o := e.cur + s - 1 -		prevHashS := hash4x64(x, tableBits) +		prevHashS := hashLen(x, tableBits, hashShortBytes)  		prevHashL := hash7(x, tableBits)  		e.table[prevHashS] = tableEntry{offset: o}  		eLong := &e.bTable[prevHashL] diff --git a/vendor/github.com/klauspost/compress/flate/level6.go b/vendor/github.com/klauspost/compress/flate/level6.go index 62888edf3..f1e9d98fa 100644 --- a/vendor/github.com/klauspost/compress/flate/level6.go +++ b/vendor/github.com/klauspost/compress/flate/level6.go @@ -12,6 +12,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {  	const (  		inputMargin            = 12 - 1  		minNonLiteralBlockSize = 1 + 1 + inputMargin +		hashShortBytes         = 4  	)  	if debugDeflate && e.cur < 0 {  		panic(fmt.Sprint("e.cur < 0: ", e.cur)) @@ -90,7 +91,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {  		var l int32  		var t int32  		for { -			nextHashS := hash4x64(cv, tableBits) +			nextHashS := hashLen(cv, tableBits, hashShortBytes)  			nextHashL := hash7(cv, tableBits)  			s = nextS  			nextS = s + doEvery + (s-nextEmit)>>skipLog @@ -107,7 +108,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {  			eLong.Cur, eLong.Prev = entry, eLong.Cur  			// Calculate hashes of 'next' -			nextHashS = hash4x64(next, tableBits) +			nextHashS = hashLen(next, tableBits, hashShortBytes)  			nextHashL = hash7(next, tableBits)  			t = lCandidate.Cur.offset - e.cur @@ -213,24 +214,33 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {  		// Try to locate a better match by checking the end-of-match...  		if sAt := s + l; sAt < sLimit { +			// Allow some bytes at the beginning to mismatch. +			// Sweet spot is 2/3 bytes depending on input. +			// 3 is only a little better when it is but sometimes a lot worse. +			// The skipped bytes are tested in Extend backwards, +			// and still picked up as part of the match if they do. +			const skipBeginning = 2  			eLong := &e.bTable[hash7(load6432(src, sAt), tableBits)]  			// Test current -			t2 := eLong.Cur.offset - e.cur - l -			off := s - t2 +			t2 := eLong.Cur.offset - e.cur - l + skipBeginning +			s2 := s + skipBeginning +			off := s2 - t2  			if off < maxMatchOffset {  				if off > 0 && t2 >= 0 { -					if l2 := e.matchlenLong(s, t2, src); l2 > l { +					if l2 := e.matchlenLong(s2, t2, src); l2 > l {  						t = t2  						l = l2 +						s = s2  					}  				}  				// Test next: -				t2 = eLong.Prev.offset - e.cur - l -				off := s - t2 +				t2 = eLong.Prev.offset - e.cur - l + skipBeginning +				off := s2 - t2  				if off > 0 && off < maxMatchOffset && t2 >= 0 { -					if l2 := e.matchlenLong(s, t2, src); l2 > l { +					if l2 := e.matchlenLong(s2, t2, src); l2 > l {  						t = t2  						l = l2 +						s = s2  					}  				}  			} @@ -277,7 +287,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {  			// Index after match end.  			for i := nextS + 1; i < int32(len(src))-8; i += 2 {  				cv := load6432(src, i) -				e.table[hash4x64(cv, tableBits)] = tableEntry{offset: i + e.cur} +				e.table[hashLen(cv, tableBits, hashShortBytes)] = tableEntry{offset: i + e.cur}  				eLong := &e.bTable[hash7(cv, tableBits)]  				eLong.Cur, eLong.Prev = tableEntry{offset: i + e.cur}, eLong.Cur  			} @@ -292,7 +302,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {  				t2 := tableEntry{offset: t.offset + 1}  				eLong := &e.bTable[hash7(cv, tableBits)]  				eLong2 := &e.bTable[hash7(cv>>8, tableBits)] -				e.table[hash4x64(cv, tableBits)] = t +				e.table[hashLen(cv, tableBits, hashShortBytes)] = t  				eLong.Cur, eLong.Prev = t, eLong.Cur  				eLong2.Cur, eLong2.Prev = t2, eLong2.Cur  			} diff --git a/vendor/github.com/klauspost/compress/flate/stateless.go b/vendor/github.com/klauspost/compress/flate/stateless.go index 93a1d1503..f3d4139ef 100644 --- a/vendor/github.com/klauspost/compress/flate/stateless.go +++ b/vendor/github.com/klauspost/compress/flate/stateless.go @@ -86,11 +86,19 @@ func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error {  		dict = dict[len(dict)-maxStatelessDict:]  	} +	// For subsequent loops, keep shallow dict reference to avoid alloc+copy. +	var inDict []byte +  	for len(in) > 0 {  		todo := in -		if len(todo) > maxStatelessBlock-len(dict) { +		if len(inDict) > 0 { +			if len(todo) > maxStatelessBlock-maxStatelessDict { +				todo = todo[:maxStatelessBlock-maxStatelessDict] +			} +		} else if len(todo) > maxStatelessBlock-len(dict) {  			todo = todo[:maxStatelessBlock-len(dict)]  		} +		inOrg := in  		in = in[len(todo):]  		uncompressed := todo  		if len(dict) > 0 { @@ -102,7 +110,11 @@ func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error {  			todo = combined  		}  		// Compress -		statelessEnc(&dst, todo, int16(len(dict))) +		if len(inDict) == 0 { +			statelessEnc(&dst, todo, int16(len(dict))) +		} else { +			statelessEnc(&dst, inDict[:maxStatelessDict+len(todo)], maxStatelessDict) +		}  		isEof := eof && len(in) == 0  		if dst.n == 0 { @@ -119,7 +131,8 @@ func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error {  		}  		if len(in) > 0 {  			// Retain a dict if we have more -			dict = todo[len(todo)-maxStatelessDict:] +			inDict = inOrg[len(uncompressed)-maxStatelessDict:] +			dict = nil  			dst.Reset()  		}  		if bw.err != nil { diff --git a/vendor/github.com/klauspost/compress/s2/README.md b/vendor/github.com/klauspost/compress/s2/README.md index 73c0c462d..1d80c42a5 100644 --- a/vendor/github.com/klauspost/compress/s2/README.md +++ b/vendor/github.com/klauspost/compress/s2/README.md @@ -325,35 +325,35 @@ The content compressed in this mode is fully compatible with the standard decode  Snappy vs S2 **compression** speed on 16 core (32 thread) computer, using all threads and a single thread (1 CPU): -| File                                                                                                | S2 speed | S2 Throughput | S2 % smaller | S2 "better" | "better" throughput | "better" % smaller | -|-----------------------------------------------------------------------------------------------------|----------|---------------|--------------|-------------|---------------------|--------------------| -| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z)                    | 12.70x   | 10556 MB/s    | 7.35%        | 4.15x       | 3455 MB/s           | 12.79%             | -| (1 CPU)                                                                                             | 1.14x    | 948 MB/s      | -            | 0.42x       | 349 MB/s            | -                  | -| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 17.13x   | 14484 MB/s    | 31.60%       | 10.09x      | 8533 MB/s           | 37.71%             | -| (1 CPU)                                                                                             | 1.33x    | 1127 MB/s     | -            | 0.70x       | 589 MB/s            | -                  | -| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst)         | 15.14x   | 12000 MB/s    | -5.79%       | 6.59x       | 5223 MB/s           | 5.80%              | -| (1 CPU)                                                                                             | 1.11x    | 877 MB/s      | -            | 0.47x       | 370 MB/s            | -                  | -| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst)                     | 14.62x   | 12116 MB/s    | 15.90%       | 5.35x       | 4430 MB/s           | 16.08%             | -| (1 CPU)                                                                                             | 1.38x    | 1146 MB/s     | -            | 0.38x       | 312 MB/s            | -                  | -| [adresser.json](https://files.klauspost.com/compress/adresser.json.zst)                             | 8.83x    | 17579 MB/s    | 43.86%       | 6.54x       | 13011 MB/s          | 47.23%             | -| (1 CPU)                                                                                             | 1.14x    | 2259 MB/s     | -            | 0.74x       | 1475 MB/s           | -                  | -| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z)                                    | 16.72x   | 14019 MB/s    | 24.02%       | 10.11x      | 8477 MB/s           | 30.48%             | -| (1 CPU)                                                                                             | 1.24x    | 1043 MB/s     | -            | 0.70x       | 586 MB/s            | -                  | -| [10gb.tar](http://mattmahoney.net/dc/10gb.html)                                                     | 13.33x   | 9254 MB/s     | 1.84%        | 6.75x       | 4686 MB/s           | 6.72%              | -| (1 CPU)                                                                                             | 0.97x    | 672 MB/s      | -            | 0.53x       | 366 MB/s            | -                  | -| sharnd.out.2gb                                                                                      | 2.11x    | 12639 MB/s    | 0.01%        | 1.98x       | 11833 MB/s          | 0.01%              | -| (1 CPU)                                                                                             | 0.93x    | 5594 MB/s     | -            | 1.34x       | 8030 MB/s           | -                  | -| [enwik9](http://mattmahoney.net/dc/textdata.html)                                                   | 19.34x   | 8220 MB/s     | 3.98%        | 7.87x       | 3345 MB/s           | 15.82%             | -| (1 CPU)                                                                                             | 1.06x    | 452 MB/s      | -            | 0.50x       | 213 MB/s            | -                  | -| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip)                                    | 10.48x   | 6124 MB/s     | 5.67%        | 3.76x       | 2197 MB/s           | 12.60%             | -| (1 CPU)                                                                                             | 0.97x    | 568 MB/s      | -            | 0.46x       | 271 MB/s            | -                  | -| [enwik10](https://encode.su/threads/3315-enwik10-benchmark-results)                                 | 21.07x   | 9020 MB/s     | 6.36%        | 6.91x       | 2959 MB/s           | 16.95%             | -| (1 CPU)                                                                                             | 1.07x    | 460 MB/s      | -            | 0.51x       | 220 MB/s            | -                  | +| File                                                                                                    | S2 Speed | S2 Throughput | S2 % smaller | S2 "better" | "better" throughput | "better" % smaller | +|---------------------------------------------------------------------------------------------------------|----------|---------------|--------------|-------------|---------------------|--------------------| +| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z)                        | 16.33x   | 10556 MB/s    | 8.0%         | 6.04x       | 5252 MB/s           | 14.7%              | +| (1 CPU)                                                                                                 | 1.08x    | 940 MB/s      | -            | 0.46x       | 400 MB/s            | -                  | +| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst)     | 16.51x   | 15224 MB/s    | 31.70%       | 9.47x       | 8734 MB/s           | 37.71%             | +| (1 CPU)                                                                                                 | 1.26x    | 1157 MB/s     | -            | 0.60x       | 556 MB/s            | -                  | +| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst)             | 15.14x   | 12598 MB/s    | -5.76%       | 6.23x       | 5675 MB/s           | 3.62%              | +| (1 CPU)                                                                                                 | 1.02x    | 932 MB/s      | -            | 0.47x       | 432 MB/s            | -                  | +| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst)                         | 11.21x   | 12116 MB/s    | 15.95%       | 3.24x       | 3500 MB/s           | 18.00%             | +| (1 CPU)                                                                                                 | 1.05x    | 1135 MB/s     | -            | 0.27x       | 292 MB/s            | -                  | +| [apache.log](https://files.klauspost.com/compress/apache.log.zst)                                       | 8.55x    | 16673 MB/s    | 20.54%       | 5.85x       | 11420 MB/s          | 24.97%             | +| (1 CPU)                                                                                                 | 1.91x    | 1771 MB/s     | -            | 0.53x       | 1041 MB/s           | -                  | +| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z)                                        | 15.76x   | 14357 MB/s    | 24.01%       | 8.67x       | 7891 MB/s           | 33.68%             | +| (1 CPU)                                                                                                 | 1.17x    | 1064 MB/s     | -            | 0.65x       | 595 MB/s            | -                  | +| [10gb.tar](http://mattmahoney.net/dc/10gb.html)                                                         | 13.33x   | 9835 MB/s     | 2.34%        | 6.85x       | 4863 MB/s           | 9.96%              | +| (1 CPU)                                                                                                 | 0.97x    | 689 MB/s      | -            | 0.55x       | 387 MB/s            | -                  | +| sharnd.out.2gb                                                                                          | 9.11x    | 13213 MB/s    | 0.01%        | 1.49x       | 9184 MB/s           | 0.01%              | +| (1 CPU)                                                                                                 | 0.88x    | 5418 MB/s     | -            | 0.77x       | 5417 MB/s           | -                  | +| [sofia-air-quality-dataset csv](https://files.klauspost.com/compress/sofia-air-quality-dataset.tar.zst) | 22.00x   | 11477 MB/s    | 18.73%       | 11.15x      | 5817 MB/s           | 27.88%             | +| (1 CPU)                                                                                                 | 1.23x    | 642 MB/s      | -            | 0.71x       | 642 MB/s            | -                  | +| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip)                                        | 11.23x   | 6520 MB/s     | 5.9%         | 5.35x       | 3109 MB/s           | 15.88%             | +| (1 CPU)                                                                                                 | 1.05x    | 607 MB/s      | -            | 0.52x       | 304 MB/s            | -                  | +| [enwik9](https://files.klauspost.com/compress/enwik9.zst)                                               | 19.28x   | 8440 MB/s     | 4.04%        | 9.31x       | 4076 MB/s           | 18.04%             | +| (1 CPU)                                                                                                 | 1.12x    | 488 MB/s      | -            | 0.57x       | 250 MB/s            | -                  |  ### Legend -* `S2 speed`: Speed of S2 compared to Snappy, using 16 cores and 1 core. -* `S2 throughput`: Throughput of S2 in MB/s.  +* `S2 Speed`: Speed of S2 compared to Snappy, using 16 cores and 1 core. +* `S2 Throughput`: Throughput of S2 in MB/s.   * `S2 % smaller`: How many percent of the Snappy output size is S2 better.  * `S2 "better"`: Speed when enabling "better" compression mode in S2 compared to Snappy.   * `"better" throughput`: Speed when enabling "better" compression mode in S2 compared to Snappy.  @@ -361,7 +361,7 @@ Snappy vs S2 **compression** speed on 16 core (32 thread) computer, using all th  There is a good speedup across the board when using a single thread and a significant speedup when using multiple threads. -Machine generated data gets by far the biggest compression boost, with size being being reduced by up to 45% of Snappy size. +Machine generated data gets by far the biggest compression boost, with size being reduced by up to 35% of Snappy size.  The "better" compression mode sees a good improvement in all cases, but usually at a performance cost. @@ -404,15 +404,15 @@ The "better" compression mode will actively look for shorter matches, which is w  Without assembly decompression is also very fast; single goroutine decompression speed. No assembly:  | File                           | S2 Throughput | S2 throughput | -|--------------------------------|--------------|---------------| -| consensus.db.10gb.s2           | 1.84x        | 2289.8 MB/s   | -| 10gb.tar.s2                    | 1.30x        | 867.07 MB/s   | -| rawstudio-mint14.tar.s2        | 1.66x        | 1329.65 MB/s  | -| github-june-2days-2019.json.s2 | 2.36x        | 1831.59 MB/s  | -| github-ranks-backup.bin.s2     | 1.73x        | 1390.7 MB/s   | -| enwik9.s2                      | 1.67x        | 681.53 MB/s   | -| adresser.json.s2               | 3.41x        | 4230.53 MB/s  | -| silesia.tar.s2                 | 1.52x        | 811.58        | +|--------------------------------|---------------|---------------| +| consensus.db.10gb.s2           | 1.84x         | 2289.8 MB/s   | +| 10gb.tar.s2                    | 1.30x         | 867.07 MB/s   | +| rawstudio-mint14.tar.s2        | 1.66x         | 1329.65 MB/s  | +| github-june-2days-2019.json.s2 | 2.36x         | 1831.59 MB/s  | +| github-ranks-backup.bin.s2     | 1.73x         | 1390.7 MB/s   | +| enwik9.s2                      | 1.67x         | 681.53 MB/s   | +| adresser.json.s2               | 3.41x         | 4230.53 MB/s  | +| silesia.tar.s2                 | 1.52x         | 811.58        |  Even though S2 typically compresses better than Snappy, decompression speed is always better.  @@ -450,14 +450,14 @@ The most reliable is a wide dataset.  For this we use [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z),  53927 files, total input size: 4,014,735,833 bytes. Single goroutine used. -| *                 | Input      | Output     | Reduction | MB/s   | -|-------------------|------------|------------|-----------|--------| -| S2                | 4014735833 | 1059723369 | 73.60%    | **934.34** | -| S2 Better         | 4014735833 | 969670507  | 75.85%    | 532.70 | -| S2 Best           | 4014735833 | 906625668  | **77.85%** | 46.84 | -| Snappy            | 4014735833 | 1128706759 | 71.89%    | 762.59 | -| S2, Snappy Output | 4014735833 | 1093821420 | 72.75%    | 908.60 | -| LZ4               | 4014735833 | 1079259294 | 73.12%    | 526.94 | +| *                 | Input      | Output     | Reduction  | MB/s       | +|-------------------|------------|------------|------------|------------| +| S2                | 4014735833 | 1059723369 | 73.60%     | **936.73** | +| S2 Better         | 4014735833 | 961580539  | 76.05%     | 451.10     | +| S2 Best           | 4014735833 | 899182886  | **77.60%** | 46.84      | +| Snappy            | 4014735833 | 1128706759 | 71.89%     | 790.15     | +| S2, Snappy Output | 4014735833 | 1093823291 | 72.75%     | 936.60     | +| LZ4               | 4014735833 | 1063768713 | 73.50%     | 452.02     |  S2 delivers both the best single threaded throughput with regular mode and the best compression rate with "best".  "Better" mode provides the same compression speed as LZ4 with better compression ratio.  @@ -489,42 +489,23 @@ AMD64 assembly is use for both S2 and Snappy.  | Absolute Perf         | Snappy size | S2 Size | Snappy Speed | S2 Speed    | Snappy dec  | S2 dec      |  |-----------------------|-------------|---------|--------------|-------------|-------------|-------------| -| html                  | 22843       | 21111   | 16246 MB/s   | 17438 MB/s  | 40972 MB/s  | 49263 MB/s  | -| urls.10K              | 335492      | 287326  | 7943 MB/s    | 9693 MB/s   | 22523 MB/s  | 26484 MB/s  | -| fireworks.jpeg        | 123034      | 123100  | 349544 MB/s  | 273889 MB/s | 718321 MB/s | 827552 MB/s | -| fireworks.jpeg (200B) | 146         | 155     | 8869 MB/s    | 17773 MB/s  | 33691 MB/s  | 52421 MB/s  | -| paper-100k.pdf        | 85304       | 84459   | 167546 MB/s  | 101263 MB/s | 326905 MB/s | 291944 MB/s | -| html_x_4              | 92234       | 21113   | 15194 MB/s   | 50670 MB/s  | 30843 MB/s  | 32217 MB/s  | -| alice29.txt           | 88034       | 85975   | 5936 MB/s    | 6139 MB/s   | 12882 MB/s  | 20044 MB/s  | -| asyoulik.txt          | 77503       | 79650   | 5517 MB/s    | 6366 MB/s   | 12735 MB/s  | 22806 MB/s  | -| lcet10.txt            | 234661      | 220670  | 6235 MB/s    | 6067 MB/s   | 14519 MB/s  | 18697 MB/s  | -| plrabn12.txt          | 319267      | 317985  | 5159 MB/s    | 5726 MB/s   | 11923 MB/s  | 19901 MB/s  | -| geo.protodata         | 23335       | 18690   | 21220 MB/s   | 26529 MB/s  | 56271 MB/s  | 62540 MB/s  | -| kppkn.gtb             | 69526       | 65312   | 9732 MB/s    | 8559 MB/s   | 18491 MB/s  | 18969 MB/s  | -| alice29.txt (128B)    | 80          | 82      | 6691 MB/s    | 15489 MB/s  | 31883 MB/s  | 38874 MB/s  | -| alice29.txt (1000B)   | 774         | 774     | 12204 MB/s   | 13000 MB/s  | 48056 MB/s  | 52341 MB/s  | -| alice29.txt (10000B)  | 6648        | 6933    | 10044 MB/s   | 12806 MB/s  | 32378 MB/s  | 46322 MB/s  | -| alice29.txt (20000B)  | 12686       | 13574   | 7733 MB/s    | 11210 MB/s  | 30566 MB/s  | 58969 MB/s  | - - -| Relative Perf         | Snappy size | S2 size improved | S2 Speed | S2 Dec Speed | -|-----------------------|-------------|------------------|----------|--------------| -| html                  | 22.31%      | 7.58%            | 1.07x    | 1.20x        | -| urls.10K              | 47.78%      | 14.36%           | 1.22x    | 1.18x        | -| fireworks.jpeg        | 99.95%      | -0.05%           | 0.78x    | 1.15x        | -| fireworks.jpeg (200B) | 73.00%      | -6.16%           | 2.00x    | 1.56x        | -| paper-100k.pdf        | 83.30%      | 0.99%            | 0.60x    | 0.89x        | -| html_x_4              | 22.52%      | 77.11%           | 3.33x    | 1.04x        | -| alice29.txt           | 57.88%      | 2.34%            | 1.03x    | 1.56x        | -| asyoulik.txt          | 61.91%      | -2.77%           | 1.15x    | 1.79x        | -| lcet10.txt            | 54.99%      | 5.96%            | 0.97x    | 1.29x        | -| plrabn12.txt          | 66.26%      | 0.40%            | 1.11x    | 1.67x        | -| geo.protodata         | 19.68%      | 19.91%           | 1.25x    | 1.11x        | -| kppkn.gtb             | 37.72%      | 6.06%            | 0.88x    | 1.03x        | -| alice29.txt (128B)    | 62.50%      | -2.50%           | 2.31x    | 1.22x        | -| alice29.txt (1000B)   | 77.40%      | 0.00%            | 1.07x    | 1.09x        | -| alice29.txt (10000B)  | 66.48%      | -4.29%           | 1.27x    | 1.43x        | -| alice29.txt (20000B)  | 63.43%      | -7.00%           | 1.45x    | 1.93x        | +| html                  | 22843       | 20868   | 16246 MB/s   | 18617 MB/s  | 40972 MB/s  | 49263 MB/s  | +| urls.10K              | 335492      | 286541  | 7943 MB/s    | 10201 MB/s  | 22523 MB/s  | 26484 MB/s  | +| fireworks.jpeg        | 123034      | 123100  | 349544 MB/s  | 303228 MB/s | 718321 MB/s | 827552 MB/s | +| fireworks.jpeg (200B) | 146         | 155     | 8869 MB/s    | 20180 MB/s  | 33691 MB/s  | 52421 MB/s  | +| paper-100k.pdf        | 85304       | 84202   | 167546 MB/s  | 112988 MB/s | 326905 MB/s | 291944 MB/s | +| html_x_4              | 92234       | 20870   | 15194 MB/s   | 54457 MB/s  | 30843 MB/s  | 32217 MB/s  | +| alice29.txt           | 88034       | 85934   | 5936 MB/s    | 6540 MB/s   | 12882 MB/s  | 20044 MB/s  | +| asyoulik.txt          | 77503       | 79575   | 5517 MB/s    | 6657 MB/s   | 12735 MB/s  | 22806 MB/s  | +| lcet10.txt            | 234661      | 220383  | 6235 MB/s    | 6303 MB/s   | 14519 MB/s  | 18697 MB/s  | +| plrabn12.txt          | 319267      | 318196  | 5159 MB/s    | 6074 MB/s   | 11923 MB/s  | 19901 MB/s  | +| geo.protodata         | 23335       | 18606   | 21220 MB/s   | 25432 MB/s  | 56271 MB/s  | 62540 MB/s  | +| kppkn.gtb             | 69526       | 65019   | 9732 MB/s    | 8905 MB/s   | 18491 MB/s  | 18969 MB/s  | +| alice29.txt (128B)    | 80          | 82      | 6691 MB/s    | 17179 MB/s  | 31883 MB/s  | 38874 MB/s  | +| alice29.txt (1000B)   | 774         | 774     | 12204 MB/s   | 13273 MB/s  | 48056 MB/s  | 52341 MB/s  | +| alice29.txt (10000B)  | 6648        | 6933    | 10044 MB/s   | 12824 MB/s  | 32378 MB/s  | 46322 MB/s  | +| alice29.txt (20000B)  | 12686       | 13516   | 7733 MB/s    | 12160 MB/s  | 30566 MB/s  | 58969 MB/s  | +  Speed is generally at or above Snappy. Small blocks gets a significant speedup, although at the expense of size.  @@ -543,42 +524,23 @@ So individual benchmarks should only be seen as a guideline and the overall pict  | Absolute Perf         | Snappy size | Better Size | Snappy Speed | Better Speed | Snappy dec  | Better dec  |  |-----------------------|-------------|-------------|--------------|--------------|-------------|-------------| -| html                  | 22843       | 19833       | 16246 MB/s   | 7731 MB/s    | 40972 MB/s  | 40292 MB/s  | -| urls.10K              | 335492      | 253529      | 7943 MB/s    | 3980 MB/s    | 22523 MB/s  | 20981 MB/s  | -| fireworks.jpeg        | 123034      | 123100      | 349544 MB/s  | 9760 MB/s    | 718321 MB/s | 823698 MB/s | -| fireworks.jpeg (200B) | 146         | 142         | 8869 MB/s    | 594 MB/s     | 33691 MB/s  | 30101 MB/s  | -| paper-100k.pdf        | 85304       | 82915       | 167546 MB/s  | 7470 MB/s    | 326905 MB/s | 198869 MB/s | -| html_x_4              | 92234       | 19841       | 15194 MB/s   | 23403 MB/s   | 30843 MB/s  | 30937 MB/s  | -| alice29.txt           | 88034       | 73218       | 5936 MB/s    | 2945 MB/s    | 12882 MB/s  | 16611 MB/s  | -| asyoulik.txt          | 77503       | 66844       | 5517 MB/s    | 2739 MB/s    | 12735 MB/s  | 14975 MB/s  | -| lcet10.txt            | 234661      | 190589      | 6235 MB/s    | 3099 MB/s    | 14519 MB/s  | 16634 MB/s  | -| plrabn12.txt          | 319267      | 270828      | 5159 MB/s    | 2600 MB/s    | 11923 MB/s  | 13382 MB/s  | -| geo.protodata         | 23335       | 18278       | 21220 MB/s   | 11208 MB/s   | 56271 MB/s  | 57961 MB/s  | -| kppkn.gtb             | 69526       | 61851       | 9732 MB/s    | 4556 MB/s    | 18491 MB/s  | 16524 MB/s  | -| alice29.txt (128B)    | 80          | 81          | 6691 MB/s    | 529 MB/s     | 31883 MB/s  | 34225 MB/s  | -| alice29.txt (1000B)   | 774         | 748         | 12204 MB/s   | 1943 MB/s    | 48056 MB/s  | 42068 MB/s  | -| alice29.txt (10000B)  | 6648        | 6234        | 10044 MB/s   | 2949 MB/s    | 32378 MB/s  | 28813 MB/s  | -| alice29.txt (20000B)  | 12686       | 11584       | 7733 MB/s    | 2822 MB/s    | 30566 MB/s  | 27315 MB/s  | - - -| Relative Perf         | Snappy size | Better size | Better Speed | Better dec | -|-----------------------|-------------|-------------|--------------|------------| -| html                  | 22.31%      | 13.18%      | 0.48x        | 0.98x      | -| urls.10K              | 47.78%      | 24.43%      | 0.50x        | 0.93x      | -| fireworks.jpeg        | 99.95%      | -0.05%      | 0.03x        | 1.15x      | -| fireworks.jpeg (200B) | 73.00%      | 2.74%       | 0.07x        | 0.89x      | -| paper-100k.pdf        | 83.30%      | 2.80%       | 0.07x        | 0.61x      | -| html_x_4              | 22.52%      | 78.49%      | 0.04x        | 1.00x      | -| alice29.txt           | 57.88%      | 16.83%      | 1.54x        | 1.29x      | -| asyoulik.txt          | 61.91%      | 13.75%      | 0.50x        | 1.18x      | -| lcet10.txt            | 54.99%      | 18.78%      | 0.50x        | 1.15x      | -| plrabn12.txt          | 66.26%      | 15.17%      | 0.50x        | 1.12x      | -| geo.protodata         | 19.68%      | 21.67%      | 0.50x        | 1.03x      | -| kppkn.gtb             | 37.72%      | 11.04%      | 0.53x        | 0.89x      | -| alice29.txt (128B)    | 62.50%      | -1.25%      | 0.47x        | 1.07x      | -| alice29.txt (1000B)   | 77.40%      | 3.36%       | 0.08x        | 0.88x      | -| alice29.txt (10000B)  | 66.48%      | 6.23%       | 0.16x        | 0.89x      | -| alice29.txt (20000B)  | 63.43%      | 8.69%       | 0.29x        | 0.89x      | +| html                  | 22843       | 18972       | 16246 MB/s   | 8621 MB/s    | 40972 MB/s  | 40292 MB/s  | +| urls.10K              | 335492      | 248079      | 7943 MB/s    | 5104 MB/s    | 22523 MB/s  | 20981 MB/s  | +| fireworks.jpeg        | 123034      | 123100      | 349544 MB/s  | 84429 MB/s   | 718321 MB/s | 823698 MB/s | +| fireworks.jpeg (200B) | 146         | 149         | 8869 MB/s    | 7125 MB/s    | 33691 MB/s  | 30101 MB/s  | +| paper-100k.pdf        | 85304       | 82887       | 167546 MB/s  | 11087 MB/s   | 326905 MB/s | 198869 MB/s | +| html_x_4              | 92234       | 18982       | 15194 MB/s   | 29316 MB/s   | 30843 MB/s  | 30937 MB/s  | +| alice29.txt           | 88034       | 71611       | 5936 MB/s    | 3709 MB/s    | 12882 MB/s  | 16611 MB/s  | +| asyoulik.txt          | 77503       | 65941       | 5517 MB/s    | 3380 MB/s    | 12735 MB/s  | 14975 MB/s  | +| lcet10.txt            | 234661      | 184939      | 6235 MB/s    | 3537 MB/s    | 14519 MB/s  | 16634 MB/s  | +| plrabn12.txt          | 319267      | 264990      | 5159 MB/s    | 2960 MB/s    | 11923 MB/s  | 13382 MB/s  | +| geo.protodata         | 23335       | 17689       | 21220 MB/s   | 10859 MB/s   | 56271 MB/s  | 57961 MB/s  | +| kppkn.gtb             | 69526       | 55398       | 9732 MB/s    | 5206 MB/s    | 18491 MB/s  | 16524 MB/s  | +| alice29.txt (128B)    | 80          | 78          | 6691 MB/s    | 7422 MB/s    | 31883 MB/s  | 34225 MB/s  | +| alice29.txt (1000B)   | 774         | 746         | 12204 MB/s   | 5734 MB/s    | 48056 MB/s  | 42068 MB/s  | +| alice29.txt (10000B)  | 6648        | 6218        | 10044 MB/s   | 6055 MB/s    | 32378 MB/s  | 28813 MB/s  | +| alice29.txt (20000B)  | 12686       | 11492       | 7733 MB/s    | 3143 MB/s    | 30566 MB/s  | 27315 MB/s  | +  Except for the mostly incompressible JPEG image compression is better and usually in the   double digits in terms of percentage reduction over Snappy. @@ -605,29 +567,29 @@ Some examples compared on 16 core CPU, amd64 assembly used:  ```  * enwik10 -Default... 10000000000 -> 4761467548 [47.61%]; 1.098s, 8685.6MB/s -Better...  10000000000 -> 4219438251 [42.19%]; 1.925s, 4954.2MB/s -Best...    10000000000 -> 3627364337 [36.27%]; 43.051s, 221.5MB/s +Default... 10000000000 -> 4759950115 [47.60%]; 1.03s, 9263.0MB/s +Better...  10000000000 -> 4084706676 [40.85%]; 2.16s, 4415.4MB/s +Best...    10000000000 -> 3615520079 [36.16%]; 42.259s, 225.7MB/s  * github-june-2days-2019.json -Default... 6273951764 -> 1043196283 [16.63%]; 431ms, 13882.3MB/s -Better...  6273951764 -> 949146808 [15.13%]; 547ms, 10938.4MB/s -Best...    6273951764 -> 832855506 [13.27%]; 9.455s, 632.8MB/s +Default... 6273951764 -> 1041700255 [16.60%]; 431ms, 13882.3MB/s +Better...  6273951764 -> 945841238 [15.08%]; 547ms, 10938.4MB/s +Best...    6273951764 -> 826392576 [13.17%]; 9.455s, 632.8MB/s  * nyc-taxi-data-10M.csv -Default... 3325605752 -> 1095998837 [32.96%]; 324ms, 9788.7MB/s -Better...  3325605752 -> 954776589 [28.71%]; 491ms, 6459.4MB/s -Best...    3325605752 -> 779098746 [23.43%]; 8.29s, 382.6MB/s +Default... 3325605752 -> 1093516949 [32.88%]; 324ms, 9788.7MB/s +Better...  3325605752 -> 885394158 [26.62%]; 491ms, 6459.4MB/s +Best...    3325605752 -> 773681257 [23.26%]; 8.29s, 412.0MB/s  * 10gb.tar -Default... 10065157632 -> 5916578242 [58.78%]; 1.028s, 9337.4MB/s -Better...  10065157632 -> 5649207485 [56.13%]; 1.597s, 6010.6MB/s -Best...    10065157632 -> 5208719802 [51.75%]; 32.78s, 292.8MB/ +Default... 10065157632 -> 5915541066 [58.77%]; 1.028s, 9337.4MB/s +Better...  10065157632 -> 5453844650 [54.19%]; 1.597s, 4862.7MB/s +Best...    10065157632 -> 5192495021 [51.59%]; 32.78s, 308.2MB/  * consensus.db.10gb -Default... 10737418240 -> 4562648848 [42.49%]; 882ms, 11610.0MB/s -Better...  10737418240 -> 4542428129 [42.30%]; 1.533s, 6679.7MB/s -Best...    10737418240 -> 4244773384 [39.53%]; 42.96s, 238.4MB/s +Default... 10737418240 -> 4549762344 [42.37%]; 882ms, 12118.4MB/s +Better...  10737418240 -> 4438535064 [41.34%]; 1.533s, 3500.9MB/s +Best...    10737418240 -> 4210602774 [39.21%]; 42.96s, 254.4MB/s  ```  Decompression speed should be around the same as using the 'better' compression mode.  @@ -648,10 +610,10 @@ If you would like more control, you can use the s2 package as described below:  Snappy compatible blocks can be generated with the S2 encoder.   Compression and speed is typically a bit better `MaxEncodedLen` is also smaller for smaller memory usage. Replace  -| Snappy                     | S2 replacement          | -|----------------------------|-------------------------| -| snappy.Encode(...)         | s2.EncodeSnappy(...)   | -| snappy.MaxEncodedLen(...)  | s2.MaxEncodedLen(...)   | +| Snappy                    | S2 replacement        | +|---------------------------|-----------------------| +| snappy.Encode(...)        | s2.EncodeSnappy(...)  | +| snappy.MaxEncodedLen(...) | s2.MaxEncodedLen(...) |  `s2.EncodeSnappy` can be replaced with `s2.EncodeSnappyBetter` or `s2.EncodeSnappyBest` to get more efficiently compressed snappy compatible output.  @@ -660,12 +622,12 @@ Compression and speed is typically a bit better `MaxEncodedLen` is also smaller  Comparison of [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z),  53927 files, total input size: 4,014,735,833 bytes. amd64, single goroutine used: -| Encoder               | Size       | MB/s       | Reduction | -|-----------------------|------------|------------|------------ -| snappy.Encode         | 1128706759 | 725.59     | 71.89%    | -| s2.EncodeSnappy       | 1093823291 | **899.16** | 72.75%    | -| s2.EncodeSnappyBetter | 1001158548 | 578.49     | 75.06%    | -| s2.EncodeSnappyBest   | 944507998  | 66.00      | **76.47%**| +| Encoder               | Size       | MB/s       | Reduction  | +|-----------------------|------------|------------|------------| +| snappy.Encode         | 1128706759 | 725.59     | 71.89%     | +| s2.EncodeSnappy       | 1093823291 | **899.16** | 72.75%     | +| s2.EncodeSnappyBetter | 1001158548 | 578.49     | 75.06%     | +| s2.EncodeSnappyBest   | 944507998  | 66.00      | **76.47%** |  ## Streams @@ -835,6 +797,13 @@ This is done using the regular "Skip" function:  This will ensure that we are at exactly the offset we want, and reading from `dec` will start at the requested offset. +# Compact storage + +For compact storage [RemoveIndexHeaders](https://pkg.go.dev/github.com/klauspost/compress/s2#RemoveIndexHeaders) can be used to remove any redundant info from  +a serialized index. If you remove the header it must be restored before [Loading](https://pkg.go.dev/github.com/klauspost/compress/s2#Index.Load). + +This is expected to save 20 bytes. These can be restored using [RestoreIndexHeaders](https://pkg.go.dev/github.com/klauspost/compress/s2#RestoreIndexHeaders). This removes a layer of security, but is the most compact representation. Returns nil if headers contains errors. +  ## Index Format:  Each block is structured as a snappy skippable block, with the chunk ID 0x99. @@ -844,20 +813,20 @@ The block can be read from the front, but contains information so it can be read  Numbers are stored as fixed size little endian values or [zigzag encoded](https://developers.google.com/protocol-buffers/docs/encoding#signed_integers) [base 128 varints](https://developers.google.com/protocol-buffers/docs/encoding),   with un-encoded value length of 64 bits, unless other limits are specified.  -| Content                                                                   | Format                                                                                                                      | -|---------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------| -| ID, `[1]byte`                                                           | Always 0x99.                                                                                                                  | -| Data Length, `[3]byte`                                                  | 3 byte little-endian length of the chunk in bytes, following this.                                                            | -| Header `[6]byte`                                                        | Header, must be `[115, 50, 105, 100, 120, 0]` or in text: "s2idx\x00".                                                        | -| UncompressedSize, Varint                                                | Total Uncompressed size.                                                                                                      | -| CompressedSize, Varint                                                  | Total Compressed size if known. Should be -1 if unknown.                                                                      | -| EstBlockSize, Varint                                                    | Block Size, used for guessing uncompressed offsets. Must be >= 0.                                                             | -| Entries, Varint                                                         | Number of Entries in index, must be < 65536 and >=0.                                                                          | -| HasUncompressedOffsets `byte`                                           | 0 if no uncompressed offsets are present, 1 if present. Other values are invalid.                                             | -| UncompressedOffsets, [Entries]VarInt                                    | Uncompressed offsets. See below how to decode.                                                                                | -| CompressedOffsets, [Entries]VarInt                                      | Compressed offsets. See below how to decode.                                                                                  | -| Block Size, `[4]byte`                                                   | Little Endian total encoded size (including header and trailer). Can be used for searching backwards to start of block.       | -| Trailer `[6]byte`                                                       | Trailer, must be `[0, 120, 100, 105, 50, 115]` or in text: "\x00xdi2s". Can be used for identifying block from end of stream. | +| Content                              | Format                                                                                                                        | +|--------------------------------------|-------------------------------------------------------------------------------------------------------------------------------| +| ID, `[1]byte`                        | Always 0x99.                                                                                                                  | +| Data Length, `[3]byte`               | 3 byte little-endian length of the chunk in bytes, following this.                                                            | +| Header `[6]byte`                     | Header, must be `[115, 50, 105, 100, 120, 0]` or in text: "s2idx\x00".                                                        | +| UncompressedSize, Varint             | Total Uncompressed size.                                                                                                      | +| CompressedSize, Varint               | Total Compressed size if known. Should be -1 if unknown.                                                                      | +| EstBlockSize, Varint                 | Block Size, used for guessing uncompressed offsets. Must be >= 0.                                                             | +| Entries, Varint                      | Number of Entries in index, must be < 65536 and >=0.                                                                          | +| HasUncompressedOffsets `byte`        | 0 if no uncompressed offsets are present, 1 if present. Other values are invalid.                                             | +| UncompressedOffsets, [Entries]VarInt | Uncompressed offsets. See below how to decode.                                                                                | +| CompressedOffsets, [Entries]VarInt   | Compressed offsets. See below how to decode.                                                                                  | +| Block Size, `[4]byte`                | Little Endian total encoded size (including header and trailer). Can be used for searching backwards to start of block.       | +| Trailer `[6]byte`                    | Trailer, must be `[0, 120, 100, 105, 50, 115]` or in text: "\x00xdi2s". Can be used for identifying block from end of stream. |  For regular streams the uncompressed offsets are fully predictable,  so `HasUncompressedOffsets` allows to specify that compressed blocks all have  @@ -929,6 +898,7 @@ To decode from any given uncompressed offset `(wantOffset)`:  See [using indexes](https://github.com/klauspost/compress/tree/master/s2#using-indexes) for functions that perform the operations with a simpler interface. +  # Format Extensions  * Frame [Stream identifier](https://github.com/google/snappy/blob/master/framing_format.txt#L68) changed from `sNaPpY` to `S2sTwO`. @@ -951,10 +921,11 @@ The length is specified by reading the 3-bit length specified in the tag and dec  | 7      | 65540 + read 3 bytes |  This allows any repeat offset + length to be represented by 2 to 5 bytes. +It also allows to emit matches longer than 64 bytes with one copy + one repeat instead of several 64 byte copies.  Lengths are stored as little endian values. -The first copy of a block cannot be a repeat offset and the offset is not carried across blocks in streams. +The first copy of a block cannot be a repeat offset and the offset is reset on every block in streams.  Default streaming block size is 1MB. diff --git a/vendor/github.com/klauspost/compress/s2/decode.go b/vendor/github.com/klauspost/compress/s2/decode.go index 27c0f3c2c..00c5cc72c 100644 --- a/vendor/github.com/klauspost/compress/s2/decode.go +++ b/vendor/github.com/klauspost/compress/s2/decode.go @@ -952,7 +952,11 @@ func (r *Reader) ReadSeeker(random bool, index []byte) (*ReadSeeker, error) {  // Seek allows seeking in compressed data.  func (r *ReadSeeker) Seek(offset int64, whence int) (int64, error) {  	if r.err != nil { -		return 0, r.err +		if !errors.Is(r.err, io.EOF) { +			return 0, r.err +		} +		// Reset on EOF +		r.err = nil  	}  	if offset == 0 && whence == io.SeekCurrent {  		return r.blockStart + int64(r.i), nil diff --git a/vendor/github.com/klauspost/compress/s2/decode_other.go b/vendor/github.com/klauspost/compress/s2/decode_other.go index 1074ebd21..11300c3a8 100644 --- a/vendor/github.com/klauspost/compress/s2/decode_other.go +++ b/vendor/github.com/klauspost/compress/s2/decode_other.go @@ -28,6 +28,9 @@ func s2Decode(dst, src []byte) int {  	// As long as we can read at least 5 bytes...  	for s < len(src)-5 { +		// Removing bounds checks is SLOWER, when if doing +		// in := src[s:s+5] +		// Checked on Go 1.18  		switch src[s] & 0x03 {  		case tagLiteral:  			x := uint32(src[s] >> 2) @@ -38,14 +41,19 @@ func s2Decode(dst, src []byte) int {  				s += 2  				x = uint32(src[s-1])  			case x == 61: +				in := src[s : s+3] +				x = uint32(in[1]) | uint32(in[2])<<8  				s += 3 -				x = uint32(src[s-2]) | uint32(src[s-1])<<8  			case x == 62: +				in := src[s : s+4] +				// Load as 32 bit and shift down. +				x = uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24 +				x >>= 8  				s += 4 -				x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16  			case x == 63: +				in := src[s : s+5] +				x = uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24  				s += 5 -				x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24  			}  			length = int(x) + 1  			if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) { @@ -62,8 +70,8 @@ func s2Decode(dst, src []byte) int {  		case tagCopy1:  			s += 2 -			length = int(src[s-2]) >> 2 & 0x7  			toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) +			length = int(src[s-2]) >> 2 & 0x7  			if toffset == 0 {  				if debug {  					fmt.Print("(repeat) ") @@ -71,14 +79,16 @@ func s2Decode(dst, src []byte) int {  				// keep last offset  				switch length {  				case 5: +					length = int(src[s]) + 4  					s += 1 -					length = int(uint32(src[s-1])) + 4  				case 6: +					in := src[s : s+2] +					length = int(uint32(in[0])|(uint32(in[1])<<8)) + (1 << 8)  					s += 2 -					length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8)  				case 7: +					in := src[s : s+3] +					length = int((uint32(in[2])<<16)|(uint32(in[1])<<8)|uint32(in[0])) + (1 << 16)  					s += 3 -					length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16)  				default: // 0-> 4  				}  			} else { @@ -86,14 +96,16 @@ func s2Decode(dst, src []byte) int {  			}  			length += 4  		case tagCopy2: +			in := src[s : s+3] +			offset = int(uint32(in[1]) | uint32(in[2])<<8) +			length = 1 + int(in[0])>>2  			s += 3 -			length = 1 + int(src[s-3])>>2 -			offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)  		case tagCopy4: +			in := src[s : s+5] +			offset = int(uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24) +			length = 1 + int(in[0])>>2  			s += 5 -			length = 1 + int(src[s-5])>>2 -			offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)  		}  		if offset <= 0 || d < offset || length > len(dst)-d { diff --git a/vendor/github.com/klauspost/compress/s2/encode_all.go b/vendor/github.com/klauspost/compress/s2/encode_all.go index 8b16c38a6..54c71d3b5 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_all.go +++ b/vendor/github.com/klauspost/compress/s2/encode_all.go @@ -58,8 +58,9 @@ func encodeGo(dst, src []byte) []byte {  // been written.  //  // It also assumes that: +//  //	len(dst) >= MaxEncodedLen(len(src)) && -// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize  func encodeBlockGo(dst, src []byte) (d int) {  	// Initialize the hash table.  	const ( diff --git a/vendor/github.com/klauspost/compress/s2/encode_amd64.go b/vendor/github.com/klauspost/compress/s2/encode_amd64.go index e612225f4..6b93daa5a 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_amd64.go +++ b/vendor/github.com/klauspost/compress/s2/encode_amd64.go @@ -8,8 +8,9 @@ package s2  // been written.  //  // It also assumes that: +//  //	len(dst) >= MaxEncodedLen(len(src)) && -// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize  func encodeBlock(dst, src []byte) (d int) {  	const (  		// Use 12 bit table when less than... @@ -43,8 +44,9 @@ func encodeBlock(dst, src []byte) (d int) {  // been written.  //  // It also assumes that: +//  //	len(dst) >= MaxEncodedLen(len(src)) && -// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize  func encodeBlockBetter(dst, src []byte) (d int) {  	const (  		// Use 12 bit table when less than... @@ -78,8 +80,9 @@ func encodeBlockBetter(dst, src []byte) (d int) {  // been written.  //  // It also assumes that: +//  //	len(dst) >= MaxEncodedLen(len(src)) && -// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize  func encodeBlockSnappy(dst, src []byte) (d int) {  	const (  		// Use 12 bit table when less than... @@ -112,8 +115,9 @@ func encodeBlockSnappy(dst, src []byte) (d int) {  // been written.  //  // It also assumes that: +//  //	len(dst) >= MaxEncodedLen(len(src)) && -// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize  func encodeBlockBetterSnappy(dst, src []byte) (d int) {  	const (  		// Use 12 bit table when less than... diff --git a/vendor/github.com/klauspost/compress/s2/encode_best.go b/vendor/github.com/klauspost/compress/s2/encode_best.go index 4bc80bc6a..1b7ea394f 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_best.go +++ b/vendor/github.com/klauspost/compress/s2/encode_best.go @@ -15,8 +15,9 @@ import (  // been written.  //  // It also assumes that: +//  //	len(dst) >= MaxEncodedLen(len(src)) && -// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize  func encodeBlockBest(dst, src []byte) (d int) {  	// Initialize the hash tables.  	const ( @@ -176,14 +177,21 @@ func encodeBlockBest(dst, src []byte) (d int) {  						best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false))  					}  					// Search for a match at best match end, see if that is better. -					if sAt := best.s + best.length; sAt < sLimit { -						sBack := best.s -						backL := best.length +					// Allow some bytes at the beginning to mismatch. +					// Sweet spot is around 1-2 bytes, but depends on input. +					// The skipped bytes are tested in Extend backwards, +					// and still picked up as part of the match if they do. +					const skipBeginning = 2 +					const skipEnd = 1 +					if sAt := best.s + best.length - skipEnd; sAt < sLimit { + +						sBack := best.s + skipBeginning - skipEnd +						backL := best.length - skipBeginning  						// Load initial values  						cv = load64(src, sBack) -						// Search for mismatch + +						// Grab candidates...  						next := lTable[hash8(load64(src, sAt), lTableBits)] -						//next := sTable[hash4(load64(src, sAt), sTableBits)]  						if checkAt := getCur(next) - backL; checkAt > 0 {  							best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) @@ -191,6 +199,16 @@ func encodeBlockBest(dst, src []byte) (d int) {  						if checkAt := getPrev(next) - backL; checkAt > 0 {  							best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))  						} +						// Disabled: Extremely small gain +						if false { +							next = sTable[hash4(load64(src, sAt), sTableBits)] +							if checkAt := getCur(next) - backL; checkAt > 0 { +								best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) +							} +							if checkAt := getPrev(next) - backL; checkAt > 0 { +								best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) +							} +						}  					}  				}  			} @@ -288,8 +306,9 @@ emitRemainder:  // been written.  //  // It also assumes that: +//  //	len(dst) >= MaxEncodedLen(len(src)) && -// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize  func encodeBlockBestSnappy(dst, src []byte) (d int) {  	// Initialize the hash tables.  	const ( @@ -546,6 +565,7 @@ emitRemainder:  // emitCopySize returns the size to encode the offset+length  //  // It assumes that: +//  //	1 <= offset && offset <= math.MaxUint32  //	4 <= length && length <= 1 << 24  func emitCopySize(offset, length int) int { @@ -584,6 +604,7 @@ func emitCopySize(offset, length int) int {  // emitCopyNoRepeatSize returns the size to encode the offset+length  //  // It assumes that: +//  //	1 <= offset && offset <= math.MaxUint32  //	4 <= length && length <= 1 << 24  func emitCopyNoRepeatSize(offset, length int) int { diff --git a/vendor/github.com/klauspost/compress/s2/encode_better.go b/vendor/github.com/klauspost/compress/s2/encode_better.go index 943215b8a..3b66ba42b 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_better.go +++ b/vendor/github.com/klauspost/compress/s2/encode_better.go @@ -42,8 +42,9 @@ func hash8(u uint64, h uint8) uint32 {  // been written.  //  // It also assumes that: +//  //	len(dst) >= MaxEncodedLen(len(src)) && -// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize  func encodeBlockBetterGo(dst, src []byte) (d int) {  	// sLimit is when to stop looking for offset/length copies. The inputMargin  	// lets us use a fast path for emitLiteral in the main loop, while we are @@ -56,7 +57,7 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {  	// Initialize the hash tables.  	const (  		// Long hash matches. -		lTableBits    = 16 +		lTableBits    = 17  		maxLTableSize = 1 << lTableBits  		// Short hash matches. @@ -97,9 +98,26 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {  			lTable[hashL] = uint32(s)  			sTable[hashS] = uint32(s) +			valLong := load64(src, candidateL) +			valShort := load64(src, candidateS) + +			// If long matches at least 8 bytes, use that. +			if cv == valLong { +				break +			} +			if cv == valShort { +				candidateL = candidateS +				break +			} +  			// Check repeat at offset checkRep.  			const checkRep = 1 -			if false && uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { +			// Minimum length of a repeat. Tested with various values. +			// While 4-5 offers improvements in some, 6 reduces +			// regressions significantly. +			const wantRepeatBytes = 6 +			const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep) +			if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask {  				base := s + checkRep  				// Extend back  				for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { @@ -109,8 +127,8 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {  				d += emitLiteral(dst[d:], src[nextEmit:base])  				// Extend forward -				candidate := s - repeat + 4 + checkRep -				s += 4 + checkRep +				candidate := s - repeat + wantRepeatBytes + checkRep +				s += wantRepeatBytes + checkRep  				for s < len(src) {  					if len(src)-s < 8 {  						if src[s] == src[candidate] { @@ -127,28 +145,40 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {  					s += 8  					candidate += 8  				} -				if nextEmit > 0 { -					// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. -					d += emitRepeat(dst[d:], repeat, s-base) -				} else { -					// First match, cannot be repeat. -					d += emitCopy(dst[d:], repeat, s-base) -				} +				// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. +				d += emitRepeat(dst[d:], repeat, s-base)  				nextEmit = s  				if s >= sLimit {  					goto emitRemainder  				} +				// Index in-between +				index0 := base + 1 +				index1 := s - 2 + +				cv = load64(src, s) +				for index0 < index1 { +					cv0 := load64(src, index0) +					cv1 := load64(src, index1) +					lTable[hash7(cv0, lTableBits)] = uint32(index0) +					sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + +					lTable[hash7(cv1, lTableBits)] = uint32(index1) +					sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) +					index0 += 2 +					index1 -= 2 +				}  				cv = load64(src, s)  				continue  			} -			if uint32(cv) == load32(src, candidateL) { +			// Long likely matches 7, so take that. +			if uint32(cv) == uint32(valLong) {  				break  			}  			// Check our short candidate -			if uint32(cv) == load32(src, candidateS) { +			if uint32(cv) == uint32(valShort) {  				// Try a long candidate at s+1  				hashL = hash7(cv>>8, lTableBits)  				candidateL = int(lTable[hashL]) @@ -227,21 +257,29 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {  			// Do we have space for more, if not bail.  			return 0  		} -		// Index match start+1 (long) and start+2 (short) + +		// Index short & long  		index0 := base + 1 -		// Index match end-2 (long) and end-1 (short)  		index1 := s - 2  		cv0 := load64(src, index0)  		cv1 := load64(src, index1) -		cv = load64(src, s)  		lTable[hash7(cv0, lTableBits)] = uint32(index0) -		lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1) -		lTable[hash7(cv1, lTableBits)] = uint32(index1) -		lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1)  		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) -		sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2) + +		lTable[hash7(cv1, lTableBits)] = uint32(index1)  		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) +		index0 += 1 +		index1 -= 1 +		cv = load64(src, s) + +		// index every second long in between. +		for index0 < index1 { +			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0) +			lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1) +			index0 += 2 +			index1 -= 2 +		}  	}  emitRemainder: @@ -260,8 +298,9 @@ emitRemainder:  // been written.  //  // It also assumes that: +//  //	len(dst) >= MaxEncodedLen(len(src)) && -// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize  func encodeBlockBetterSnappyGo(dst, src []byte) (d int) {  	// sLimit is when to stop looking for offset/length copies. The inputMargin  	// lets us use a fast path for emitLiteral in the main loop, while we are @@ -402,21 +441,29 @@ func encodeBlockBetterSnappyGo(dst, src []byte) (d int) {  			// Do we have space for more, if not bail.  			return 0  		} -		// Index match start+1 (long) and start+2 (short) + +		// Index short & long  		index0 := base + 1 -		// Index match end-2 (long) and end-1 (short)  		index1 := s - 2  		cv0 := load64(src, index0)  		cv1 := load64(src, index1) -		cv = load64(src, s)  		lTable[hash7(cv0, lTableBits)] = uint32(index0) -		lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1) -		lTable[hash7(cv1, lTableBits)] = uint32(index1) -		lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1)  		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) -		sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2) + +		lTable[hash7(cv1, lTableBits)] = uint32(index1)  		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) +		index0 += 1 +		index1 -= 1 +		cv = load64(src, s) + +		// index every second long in between. +		for index0 < index1 { +			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0) +			lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1) +			index0 += 2 +			index1 -= 2 +		}  	}  emitRemainder: diff --git a/vendor/github.com/klauspost/compress/s2/encode_go.go b/vendor/github.com/klauspost/compress/s2/encode_go.go index 94784b82a..db08fc355 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_go.go +++ b/vendor/github.com/klauspost/compress/s2/encode_go.go @@ -12,6 +12,7 @@ import (  // been written.  //  // It also assumes that: +//  //	len(dst) >= MaxEncodedLen(len(src))  func encodeBlock(dst, src []byte) (d int) {  	if len(src) < minNonLiteralBlockSize { @@ -25,6 +26,7 @@ func encodeBlock(dst, src []byte) (d int) {  // been written.  //  // It also assumes that: +//  //	len(dst) >= MaxEncodedLen(len(src))  func encodeBlockBetter(dst, src []byte) (d int) {  	return encodeBlockBetterGo(dst, src) @@ -35,6 +37,7 @@ func encodeBlockBetter(dst, src []byte) (d int) {  // been written.  //  // It also assumes that: +//  //	len(dst) >= MaxEncodedLen(len(src))  func encodeBlockBetterSnappy(dst, src []byte) (d int) {  	return encodeBlockBetterSnappyGo(dst, src) @@ -45,6 +48,7 @@ func encodeBlockBetterSnappy(dst, src []byte) (d int) {  // been written.  //  // It also assumes that: +//  //	len(dst) >= MaxEncodedLen(len(src))  func encodeBlockSnappy(dst, src []byte) (d int) {  	if len(src) < minNonLiteralBlockSize { @@ -56,6 +60,7 @@ func encodeBlockSnappy(dst, src []byte) (d int) {  // emitLiteral writes a literal chunk and returns the number of bytes written.  //  // It assumes that: +//  //	dst is long enough to hold the encoded bytes  //	0 <= len(lit) && len(lit) <= math.MaxUint32  func emitLiteral(dst, lit []byte) int { @@ -146,6 +151,7 @@ func emitRepeat(dst []byte, offset, length int) int {  // emitCopy writes a copy chunk and returns the number of bytes written.  //  // It assumes that: +//  //	dst is long enough to hold the encoded bytes  //	1 <= offset && offset <= math.MaxUint32  //	4 <= length && length <= 1 << 24 @@ -214,6 +220,7 @@ func emitCopy(dst []byte, offset, length int) int {  // emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.  //  // It assumes that: +//  //	dst is long enough to hold the encoded bytes  //	1 <= offset && offset <= math.MaxUint32  //	4 <= length && length <= 1 << 24 @@ -273,8 +280,8 @@ func emitCopyNoRepeat(dst []byte, offset, length int) int {  // matchLen returns how many bytes match in a and b  //  // It assumes that: -//   len(a) <= len(b)  // +//	len(a) <= len(b)  func matchLen(a []byte, b []byte) int {  	b = b[:len(a)]  	var checked int diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go index 88f27c099..7e00bac3e 100644 --- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go +++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go @@ -1,7 +1,6 @@  // Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.  //go:build !appengine && !noasm && gc && !noasm -// +build !appengine,!noasm,gc,!noasm  package s2 @@ -150,8 +149,9 @@ func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int  // emitLiteral writes a literal chunk and returns the number of bytes written.  //  // It assumes that: -//   dst is long enough to hold the encoded bytes with margin of 0 bytes -//   0 <= len(lit) && len(lit) <= math.MaxUint32 +// +//	dst is long enough to hold the encoded bytes with margin of 0 bytes +//	0 <= len(lit) && len(lit) <= math.MaxUint32  //  //go:noescape  func emitLiteral(dst []byte, lit []byte) int @@ -165,9 +165,10 @@ func emitRepeat(dst []byte, offset int, length int) int  // emitCopy writes a copy chunk and returns the number of bytes written.  //  // It assumes that: -//   dst is long enough to hold the encoded bytes -//   1 <= offset && offset <= math.MaxUint32 -//   4 <= length && length <= 1 << 24 +// +//	dst is long enough to hold the encoded bytes +//	1 <= offset && offset <= math.MaxUint32 +//	4 <= length && length <= 1 << 24  //  //go:noescape  func emitCopy(dst []byte, offset int, length int) int @@ -175,9 +176,10 @@ func emitCopy(dst []byte, offset int, length int) int  // emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.  //  // It assumes that: -//   dst is long enough to hold the encoded bytes -//   1 <= offset && offset <= math.MaxUint32 -//   4 <= length && length <= 1 << 24 +// +//	dst is long enough to hold the encoded bytes +//	1 <= offset && offset <= math.MaxUint32 +//	4 <= length && length <= 1 << 24  //  //go:noescape  func emitCopyNoRepeat(dst []byte, offset int, length int) int @@ -185,7 +187,8 @@ func emitCopyNoRepeat(dst []byte, offset int, length int) int  // matchLen returns how many bytes match in a and b  //  // It assumes that: -//   len(a) <= len(b) +// +//	len(a) <= len(b)  //  //go:noescape  func matchLen(a []byte, b []byte) int diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s index 36915d949..81a487d6d 100644 --- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s +++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s @@ -1,7 +1,6 @@  // Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.  //go:build !appengine && !noasm && gc && !noasm -// +build !appengine,!noasm,gc,!noasm  #include "textflag.h" @@ -5743,9 +5742,9 @@ emit_literal_done_emit_remainder_encodeBlockAsm8B:  // func encodeBetterBlockAsm(dst []byte, src []byte) int  // Requires: BMI, SSE2 -TEXT ·encodeBetterBlockAsm(SB), $327704-56 +TEXT ·encodeBetterBlockAsm(SB), $589848-56  	MOVQ dst_base+0(FP), AX -	MOVQ $0x00000a00, CX +	MOVQ $0x00001200, CX  	LEAQ 24(SP), DX  	PXOR X0, X0 @@ -5797,27 +5796,37 @@ check_maxskip_cont_encodeBetterBlockAsm:  	MOVQ  DI, R11  	SHLQ  $0x08, R10  	IMULQ R9, R10 -	SHRQ  $0x30, R10 +	SHRQ  $0x2f, R10  	SHLQ  $0x20, R11  	IMULQ SI, R11  	SHRQ  $0x32, R11  	MOVL  24(SP)(R10*4), SI -	MOVL  262168(SP)(R11*4), R8 +	MOVL  524312(SP)(R11*4), R8  	MOVL  CX, 24(SP)(R10*4) -	MOVL  CX, 262168(SP)(R11*4) -	CMPL  (DX)(SI*1), DI +	MOVL  CX, 524312(SP)(R11*4) +	MOVQ  (DX)(SI*1), R10 +	MOVQ  (DX)(R8*1), R11 +	CMPQ  R10, DI  	JEQ   candidate_match_encodeBetterBlockAsm -	CMPL  (DX)(R8*1), DI -	JEQ   candidateS_match_encodeBetterBlockAsm -	MOVL  20(SP), CX -	JMP   search_loop_encodeBetterBlockAsm +	CMPQ  R11, DI +	JNE   no_short_found_encodeBetterBlockAsm +	MOVL  R8, SI +	JMP   candidate_match_encodeBetterBlockAsm + +no_short_found_encodeBetterBlockAsm: +	CMPL R10, DI +	JEQ  candidate_match_encodeBetterBlockAsm +	CMPL R11, DI +	JEQ  candidateS_match_encodeBetterBlockAsm +	MOVL 20(SP), CX +	JMP  search_loop_encodeBetterBlockAsm  candidateS_match_encodeBetterBlockAsm:  	SHRQ  $0x08, DI  	MOVQ  DI, R10  	SHLQ  $0x08, R10  	IMULQ R9, R10 -	SHRQ  $0x30, R10 +	SHRQ  $0x2f, R10  	MOVL  24(SP)(R10*4), SI  	INCL  CX  	MOVL  CX, 24(SP)(R10*4) @@ -6590,52 +6599,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm:  match_nolit_dst_ok_encodeBetterBlockAsm:  	MOVQ  $0x00cf1bbcdcbfa563, SI  	MOVQ  $0x9e3779b1, R8 -	INCL  DI -	MOVQ  (DX)(DI*1), R9 -	MOVQ  R9, R10 -	MOVQ  R9, R11 -	MOVQ  R9, R12 -	SHRQ  $0x08, R11 -	MOVQ  R11, R13 -	SHRQ  $0x10, R12 -	LEAL  1(DI), R14 -	LEAL  2(DI), R15 -	MOVQ  -2(DX)(CX*1), R9 +	LEAQ  1(DI), DI +	LEAQ  -2(CX), R9 +	MOVQ  (DX)(DI*1), R10 +	MOVQ  1(DX)(DI*1), R11 +	MOVQ  (DX)(R9*1), R12 +	MOVQ  1(DX)(R9*1), R13  	SHLQ  $0x08, R10  	IMULQ SI, R10 -	SHRQ  $0x30, R10 -	SHLQ  $0x08, R13 -	IMULQ SI, R13 -	SHRQ  $0x30, R13 +	SHRQ  $0x2f, R10  	SHLQ  $0x20, R11  	IMULQ R8, R11  	SHRQ  $0x32, R11 -	SHLQ  $0x20, R12 -	IMULQ R8, R12 -	SHRQ  $0x32, R12 +	SHLQ  $0x08, R12 +	IMULQ SI, R12 +	SHRQ  $0x2f, R12 +	SHLQ  $0x20, R13 +	IMULQ R8, R13 +	SHRQ  $0x32, R13 +	LEAQ  1(DI), R8 +	LEAQ  1(R9), R14  	MOVL  DI, 24(SP)(R10*4) -	MOVL  R14, 24(SP)(R13*4) -	MOVL  R14, 262168(SP)(R11*4) -	MOVL  R15, 262168(SP)(R12*4) -	MOVQ  R9, R10 -	MOVQ  R9, R11 -	SHRQ  $0x08, R11 -	MOVQ  R11, R13 -	LEAL  -2(CX), R9 -	LEAL  -1(CX), DI +	MOVL  R9, 24(SP)(R12*4) +	MOVL  R8, 524312(SP)(R11*4) +	MOVL  R14, 524312(SP)(R13*4) +	ADDQ  $0x01, DI +	SUBQ  $0x01, R9 + +index_loop_encodeBetterBlockAsm: +	CMPQ  DI, R9 +	JAE   search_loop_encodeBetterBlockAsm +	MOVQ  (DX)(DI*1), R8 +	MOVQ  (DX)(R9*1), R10 +	SHLQ  $0x08, R8 +	IMULQ SI, R8 +	SHRQ  $0x2f, R8  	SHLQ  $0x08, R10  	IMULQ SI, R10 -	SHRQ  $0x30, R10 -	SHLQ  $0x20, R11 -	IMULQ R8, R11 -	SHRQ  $0x32, R11 -	SHLQ  $0x08, R13 -	IMULQ SI, R13 -	SHRQ  $0x30, R13 +	SHRQ  $0x2f, R10 +	MOVL  DI, 24(SP)(R8*4)  	MOVL  R9, 24(SP)(R10*4) -	MOVL  DI, 262168(SP)(R11*4) -	MOVL  DI, 24(SP)(R13*4) -	JMP   search_loop_encodeBetterBlockAsm +	ADDQ  $0x02, DI +	SUBQ  $0x02, R9 +	JMP   index_loop_encodeBetterBlockAsm  emit_remainder_encodeBetterBlockAsm:  	MOVQ src_len+32(FP), CX @@ -6815,9 +6821,9 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm:  // func encodeBetterBlockAsm4MB(dst []byte, src []byte) int  // Requires: BMI, SSE2 -TEXT ·encodeBetterBlockAsm4MB(SB), $327704-56 +TEXT ·encodeBetterBlockAsm4MB(SB), $589848-56  	MOVQ dst_base+0(FP), AX -	MOVQ $0x00000a00, CX +	MOVQ $0x00001200, CX  	LEAQ 24(SP), DX  	PXOR X0, X0 @@ -6869,27 +6875,37 @@ check_maxskip_cont_encodeBetterBlockAsm4MB:  	MOVQ  DI, R11  	SHLQ  $0x08, R10  	IMULQ R9, R10 -	SHRQ  $0x30, R10 +	SHRQ  $0x2f, R10  	SHLQ  $0x20, R11  	IMULQ SI, R11  	SHRQ  $0x32, R11  	MOVL  24(SP)(R10*4), SI -	MOVL  262168(SP)(R11*4), R8 +	MOVL  524312(SP)(R11*4), R8  	MOVL  CX, 24(SP)(R10*4) -	MOVL  CX, 262168(SP)(R11*4) -	CMPL  (DX)(SI*1), DI +	MOVL  CX, 524312(SP)(R11*4) +	MOVQ  (DX)(SI*1), R10 +	MOVQ  (DX)(R8*1), R11 +	CMPQ  R10, DI  	JEQ   candidate_match_encodeBetterBlockAsm4MB -	CMPL  (DX)(R8*1), DI -	JEQ   candidateS_match_encodeBetterBlockAsm4MB -	MOVL  20(SP), CX -	JMP   search_loop_encodeBetterBlockAsm4MB +	CMPQ  R11, DI +	JNE   no_short_found_encodeBetterBlockAsm4MB +	MOVL  R8, SI +	JMP   candidate_match_encodeBetterBlockAsm4MB + +no_short_found_encodeBetterBlockAsm4MB: +	CMPL R10, DI +	JEQ  candidate_match_encodeBetterBlockAsm4MB +	CMPL R11, DI +	JEQ  candidateS_match_encodeBetterBlockAsm4MB +	MOVL 20(SP), CX +	JMP  search_loop_encodeBetterBlockAsm4MB  candidateS_match_encodeBetterBlockAsm4MB:  	SHRQ  $0x08, DI  	MOVQ  DI, R10  	SHLQ  $0x08, R10  	IMULQ R9, R10 -	SHRQ  $0x30, R10 +	SHRQ  $0x2f, R10  	MOVL  24(SP)(R10*4), SI  	INCL  CX  	MOVL  CX, 24(SP)(R10*4) @@ -7600,52 +7616,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB:  match_nolit_dst_ok_encodeBetterBlockAsm4MB:  	MOVQ  $0x00cf1bbcdcbfa563, SI  	MOVQ  $0x9e3779b1, R8 -	INCL  DI -	MOVQ  (DX)(DI*1), R9 -	MOVQ  R9, R10 -	MOVQ  R9, R11 -	MOVQ  R9, R12 -	SHRQ  $0x08, R11 -	MOVQ  R11, R13 -	SHRQ  $0x10, R12 -	LEAL  1(DI), R14 -	LEAL  2(DI), R15 -	MOVQ  -2(DX)(CX*1), R9 +	LEAQ  1(DI), DI +	LEAQ  -2(CX), R9 +	MOVQ  (DX)(DI*1), R10 +	MOVQ  1(DX)(DI*1), R11 +	MOVQ  (DX)(R9*1), R12 +	MOVQ  1(DX)(R9*1), R13  	SHLQ  $0x08, R10  	IMULQ SI, R10 -	SHRQ  $0x30, R10 -	SHLQ  $0x08, R13 -	IMULQ SI, R13 -	SHRQ  $0x30, R13 +	SHRQ  $0x2f, R10  	SHLQ  $0x20, R11  	IMULQ R8, R11  	SHRQ  $0x32, R11 -	SHLQ  $0x20, R12 -	IMULQ R8, R12 -	SHRQ  $0x32, R12 +	SHLQ  $0x08, R12 +	IMULQ SI, R12 +	SHRQ  $0x2f, R12 +	SHLQ  $0x20, R13 +	IMULQ R8, R13 +	SHRQ  $0x32, R13 +	LEAQ  1(DI), R8 +	LEAQ  1(R9), R14  	MOVL  DI, 24(SP)(R10*4) -	MOVL  R14, 24(SP)(R13*4) -	MOVL  R14, 262168(SP)(R11*4) -	MOVL  R15, 262168(SP)(R12*4) -	MOVQ  R9, R10 -	MOVQ  R9, R11 -	SHRQ  $0x08, R11 -	MOVQ  R11, R13 -	LEAL  -2(CX), R9 -	LEAL  -1(CX), DI +	MOVL  R9, 24(SP)(R12*4) +	MOVL  R8, 524312(SP)(R11*4) +	MOVL  R14, 524312(SP)(R13*4) +	ADDQ  $0x01, DI +	SUBQ  $0x01, R9 + +index_loop_encodeBetterBlockAsm4MB: +	CMPQ  DI, R9 +	JAE   search_loop_encodeBetterBlockAsm4MB +	MOVQ  (DX)(DI*1), R8 +	MOVQ  (DX)(R9*1), R10 +	SHLQ  $0x08, R8 +	IMULQ SI, R8 +	SHRQ  $0x2f, R8  	SHLQ  $0x08, R10  	IMULQ SI, R10 -	SHRQ  $0x30, R10 -	SHLQ  $0x20, R11 -	IMULQ R8, R11 -	SHRQ  $0x32, R11 -	SHLQ  $0x08, R13 -	IMULQ SI, R13 -	SHRQ  $0x30, R13 +	SHRQ  $0x2f, R10 +	MOVL  DI, 24(SP)(R8*4)  	MOVL  R9, 24(SP)(R10*4) -	MOVL  DI, 262168(SP)(R11*4) -	MOVL  DI, 24(SP)(R13*4) -	JMP   search_loop_encodeBetterBlockAsm4MB +	ADDQ  $0x02, DI +	SUBQ  $0x02, R9 +	JMP   index_loop_encodeBetterBlockAsm4MB  emit_remainder_encodeBetterBlockAsm4MB:  	MOVQ src_len+32(FP), CX @@ -7871,12 +7884,22 @@ search_loop_encodeBetterBlockAsm12B:  	MOVL  65560(SP)(R11*4), R8  	MOVL  CX, 24(SP)(R10*4)  	MOVL  CX, 65560(SP)(R11*4) -	CMPL  (DX)(SI*1), DI +	MOVQ  (DX)(SI*1), R10 +	MOVQ  (DX)(R8*1), R11 +	CMPQ  R10, DI  	JEQ   candidate_match_encodeBetterBlockAsm12B -	CMPL  (DX)(R8*1), DI -	JEQ   candidateS_match_encodeBetterBlockAsm12B -	MOVL  20(SP), CX -	JMP   search_loop_encodeBetterBlockAsm12B +	CMPQ  R11, DI +	JNE   no_short_found_encodeBetterBlockAsm12B +	MOVL  R8, SI +	JMP   candidate_match_encodeBetterBlockAsm12B + +no_short_found_encodeBetterBlockAsm12B: +	CMPL R10, DI +	JEQ  candidate_match_encodeBetterBlockAsm12B +	CMPL R11, DI +	JEQ  candidateS_match_encodeBetterBlockAsm12B +	MOVL 20(SP), CX +	JMP  search_loop_encodeBetterBlockAsm12B  candidateS_match_encodeBetterBlockAsm12B:  	SHRQ  $0x08, DI @@ -8447,52 +8470,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm12B:  match_nolit_dst_ok_encodeBetterBlockAsm12B:  	MOVQ  $0x0000cf1bbcdcbf9b, SI  	MOVQ  $0x9e3779b1, R8 -	INCL  DI -	MOVQ  (DX)(DI*1), R9 -	MOVQ  R9, R10 -	MOVQ  R9, R11 -	MOVQ  R9, R12 -	SHRQ  $0x08, R11 -	MOVQ  R11, R13 -	SHRQ  $0x10, R12 -	LEAL  1(DI), R14 -	LEAL  2(DI), R15 -	MOVQ  -2(DX)(CX*1), R9 +	LEAQ  1(DI), DI +	LEAQ  -2(CX), R9 +	MOVQ  (DX)(DI*1), R10 +	MOVQ  1(DX)(DI*1), R11 +	MOVQ  (DX)(R9*1), R12 +	MOVQ  1(DX)(R9*1), R13  	SHLQ  $0x10, R10  	IMULQ SI, R10  	SHRQ  $0x32, R10 -	SHLQ  $0x10, R13 -	IMULQ SI, R13 -	SHRQ  $0x32, R13  	SHLQ  $0x20, R11  	IMULQ R8, R11  	SHRQ  $0x34, R11 -	SHLQ  $0x20, R12 -	IMULQ R8, R12 -	SHRQ  $0x34, R12 +	SHLQ  $0x10, R12 +	IMULQ SI, R12 +	SHRQ  $0x32, R12 +	SHLQ  $0x20, R13 +	IMULQ R8, R13 +	SHRQ  $0x34, R13 +	LEAQ  1(DI), R8 +	LEAQ  1(R9), R14  	MOVL  DI, 24(SP)(R10*4) -	MOVL  R14, 24(SP)(R13*4) -	MOVL  R14, 65560(SP)(R11*4) -	MOVL  R15, 65560(SP)(R12*4) -	MOVQ  R9, R10 -	MOVQ  R9, R11 -	SHRQ  $0x08, R11 -	MOVQ  R11, R13 -	LEAL  -2(CX), R9 -	LEAL  -1(CX), DI +	MOVL  R9, 24(SP)(R12*4) +	MOVL  R8, 65560(SP)(R11*4) +	MOVL  R14, 65560(SP)(R13*4) +	ADDQ  $0x01, DI +	SUBQ  $0x01, R9 + +index_loop_encodeBetterBlockAsm12B: +	CMPQ  DI, R9 +	JAE   search_loop_encodeBetterBlockAsm12B +	MOVQ  (DX)(DI*1), R8 +	MOVQ  (DX)(R9*1), R10 +	SHLQ  $0x10, R8 +	IMULQ SI, R8 +	SHRQ  $0x32, R8  	SHLQ  $0x10, R10  	IMULQ SI, R10  	SHRQ  $0x32, R10 -	SHLQ  $0x20, R11 -	IMULQ R8, R11 -	SHRQ  $0x34, R11 -	SHLQ  $0x10, R13 -	IMULQ SI, R13 -	SHRQ  $0x32, R13 +	MOVL  DI, 24(SP)(R8*4)  	MOVL  R9, 24(SP)(R10*4) -	MOVL  DI, 65560(SP)(R11*4) -	MOVL  DI, 24(SP)(R13*4) -	JMP   search_loop_encodeBetterBlockAsm12B +	ADDQ  $0x02, DI +	SUBQ  $0x02, R9 +	JMP   index_loop_encodeBetterBlockAsm12B  emit_remainder_encodeBetterBlockAsm12B:  	MOVQ src_len+32(FP), CX @@ -8707,12 +8727,22 @@ search_loop_encodeBetterBlockAsm10B:  	MOVL  16408(SP)(R11*4), R8  	MOVL  CX, 24(SP)(R10*4)  	MOVL  CX, 16408(SP)(R11*4) -	CMPL  (DX)(SI*1), DI +	MOVQ  (DX)(SI*1), R10 +	MOVQ  (DX)(R8*1), R11 +	CMPQ  R10, DI  	JEQ   candidate_match_encodeBetterBlockAsm10B -	CMPL  (DX)(R8*1), DI -	JEQ   candidateS_match_encodeBetterBlockAsm10B -	MOVL  20(SP), CX -	JMP   search_loop_encodeBetterBlockAsm10B +	CMPQ  R11, DI +	JNE   no_short_found_encodeBetterBlockAsm10B +	MOVL  R8, SI +	JMP   candidate_match_encodeBetterBlockAsm10B + +no_short_found_encodeBetterBlockAsm10B: +	CMPL R10, DI +	JEQ  candidate_match_encodeBetterBlockAsm10B +	CMPL R11, DI +	JEQ  candidateS_match_encodeBetterBlockAsm10B +	MOVL 20(SP), CX +	JMP  search_loop_encodeBetterBlockAsm10B  candidateS_match_encodeBetterBlockAsm10B:  	SHRQ  $0x08, DI @@ -9283,52 +9313,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm10B:  match_nolit_dst_ok_encodeBetterBlockAsm10B:  	MOVQ  $0x0000cf1bbcdcbf9b, SI  	MOVQ  $0x9e3779b1, R8 -	INCL  DI -	MOVQ  (DX)(DI*1), R9 -	MOVQ  R9, R10 -	MOVQ  R9, R11 -	MOVQ  R9, R12 -	SHRQ  $0x08, R11 -	MOVQ  R11, R13 -	SHRQ  $0x10, R12 -	LEAL  1(DI), R14 -	LEAL  2(DI), R15 -	MOVQ  -2(DX)(CX*1), R9 +	LEAQ  1(DI), DI +	LEAQ  -2(CX), R9 +	MOVQ  (DX)(DI*1), R10 +	MOVQ  1(DX)(DI*1), R11 +	MOVQ  (DX)(R9*1), R12 +	MOVQ  1(DX)(R9*1), R13  	SHLQ  $0x10, R10  	IMULQ SI, R10  	SHRQ  $0x34, R10 -	SHLQ  $0x10, R13 -	IMULQ SI, R13 -	SHRQ  $0x34, R13  	SHLQ  $0x20, R11  	IMULQ R8, R11  	SHRQ  $0x36, R11 -	SHLQ  $0x20, R12 -	IMULQ R8, R12 -	SHRQ  $0x36, R12 +	SHLQ  $0x10, R12 +	IMULQ SI, R12 +	SHRQ  $0x34, R12 +	SHLQ  $0x20, R13 +	IMULQ R8, R13 +	SHRQ  $0x36, R13 +	LEAQ  1(DI), R8 +	LEAQ  1(R9), R14  	MOVL  DI, 24(SP)(R10*4) -	MOVL  R14, 24(SP)(R13*4) -	MOVL  R14, 16408(SP)(R11*4) -	MOVL  R15, 16408(SP)(R12*4) -	MOVQ  R9, R10 -	MOVQ  R9, R11 -	SHRQ  $0x08, R11 -	MOVQ  R11, R13 -	LEAL  -2(CX), R9 -	LEAL  -1(CX), DI +	MOVL  R9, 24(SP)(R12*4) +	MOVL  R8, 16408(SP)(R11*4) +	MOVL  R14, 16408(SP)(R13*4) +	ADDQ  $0x01, DI +	SUBQ  $0x01, R9 + +index_loop_encodeBetterBlockAsm10B: +	CMPQ  DI, R9 +	JAE   search_loop_encodeBetterBlockAsm10B +	MOVQ  (DX)(DI*1), R8 +	MOVQ  (DX)(R9*1), R10 +	SHLQ  $0x10, R8 +	IMULQ SI, R8 +	SHRQ  $0x34, R8  	SHLQ  $0x10, R10  	IMULQ SI, R10  	SHRQ  $0x34, R10 -	SHLQ  $0x20, R11 -	IMULQ R8, R11 -	SHRQ  $0x36, R11 -	SHLQ  $0x10, R13 -	IMULQ SI, R13 -	SHRQ  $0x34, R13 +	MOVL  DI, 24(SP)(R8*4)  	MOVL  R9, 24(SP)(R10*4) -	MOVL  DI, 16408(SP)(R11*4) -	MOVL  DI, 24(SP)(R13*4) -	JMP   search_loop_encodeBetterBlockAsm10B +	ADDQ  $0x02, DI +	SUBQ  $0x02, R9 +	JMP   index_loop_encodeBetterBlockAsm10B  emit_remainder_encodeBetterBlockAsm10B:  	MOVQ src_len+32(FP), CX @@ -9543,12 +9570,22 @@ search_loop_encodeBetterBlockAsm8B:  	MOVL  4120(SP)(R11*4), R8  	MOVL  CX, 24(SP)(R10*4)  	MOVL  CX, 4120(SP)(R11*4) -	CMPL  (DX)(SI*1), DI +	MOVQ  (DX)(SI*1), R10 +	MOVQ  (DX)(R8*1), R11 +	CMPQ  R10, DI  	JEQ   candidate_match_encodeBetterBlockAsm8B -	CMPL  (DX)(R8*1), DI -	JEQ   candidateS_match_encodeBetterBlockAsm8B -	MOVL  20(SP), CX -	JMP   search_loop_encodeBetterBlockAsm8B +	CMPQ  R11, DI +	JNE   no_short_found_encodeBetterBlockAsm8B +	MOVL  R8, SI +	JMP   candidate_match_encodeBetterBlockAsm8B + +no_short_found_encodeBetterBlockAsm8B: +	CMPL R10, DI +	JEQ  candidate_match_encodeBetterBlockAsm8B +	CMPL R11, DI +	JEQ  candidateS_match_encodeBetterBlockAsm8B +	MOVL 20(SP), CX +	JMP  search_loop_encodeBetterBlockAsm8B  candidateS_match_encodeBetterBlockAsm8B:  	SHRQ  $0x08, DI @@ -10105,52 +10142,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm8B:  match_nolit_dst_ok_encodeBetterBlockAsm8B:  	MOVQ  $0x0000cf1bbcdcbf9b, SI  	MOVQ  $0x9e3779b1, R8 -	INCL  DI -	MOVQ  (DX)(DI*1), R9 -	MOVQ  R9, R10 -	MOVQ  R9, R11 -	MOVQ  R9, R12 -	SHRQ  $0x08, R11 -	MOVQ  R11, R13 -	SHRQ  $0x10, R12 -	LEAL  1(DI), R14 -	LEAL  2(DI), R15 -	MOVQ  -2(DX)(CX*1), R9 +	LEAQ  1(DI), DI +	LEAQ  -2(CX), R9 +	MOVQ  (DX)(DI*1), R10 +	MOVQ  1(DX)(DI*1), R11 +	MOVQ  (DX)(R9*1), R12 +	MOVQ  1(DX)(R9*1), R13  	SHLQ  $0x10, R10  	IMULQ SI, R10  	SHRQ  $0x36, R10 -	SHLQ  $0x10, R13 -	IMULQ SI, R13 -	SHRQ  $0x36, R13  	SHLQ  $0x20, R11  	IMULQ R8, R11  	SHRQ  $0x38, R11 -	SHLQ  $0x20, R12 -	IMULQ R8, R12 -	SHRQ  $0x38, R12 +	SHLQ  $0x10, R12 +	IMULQ SI, R12 +	SHRQ  $0x36, R12 +	SHLQ  $0x20, R13 +	IMULQ R8, R13 +	SHRQ  $0x38, R13 +	LEAQ  1(DI), R8 +	LEAQ  1(R9), R14  	MOVL  DI, 24(SP)(R10*4) -	MOVL  R14, 24(SP)(R13*4) -	MOVL  R14, 4120(SP)(R11*4) -	MOVL  R15, 4120(SP)(R12*4) -	MOVQ  R9, R10 -	MOVQ  R9, R11 -	SHRQ  $0x08, R11 -	MOVQ  R11, R13 -	LEAL  -2(CX), R9 -	LEAL  -1(CX), DI +	MOVL  R9, 24(SP)(R12*4) +	MOVL  R8, 4120(SP)(R11*4) +	MOVL  R14, 4120(SP)(R13*4) +	ADDQ  $0x01, DI +	SUBQ  $0x01, R9 + +index_loop_encodeBetterBlockAsm8B: +	CMPQ  DI, R9 +	JAE   search_loop_encodeBetterBlockAsm8B +	MOVQ  (DX)(DI*1), R8 +	MOVQ  (DX)(R9*1), R10 +	SHLQ  $0x10, R8 +	IMULQ SI, R8 +	SHRQ  $0x36, R8  	SHLQ  $0x10, R10  	IMULQ SI, R10  	SHRQ  $0x36, R10 -	SHLQ  $0x20, R11 -	IMULQ R8, R11 -	SHRQ  $0x38, R11 -	SHLQ  $0x10, R13 -	IMULQ SI, R13 -	SHRQ  $0x36, R13 +	MOVL  DI, 24(SP)(R8*4)  	MOVL  R9, 24(SP)(R10*4) -	MOVL  DI, 4120(SP)(R11*4) -	MOVL  DI, 24(SP)(R13*4) -	JMP   search_loop_encodeBetterBlockAsm8B +	ADDQ  $0x02, DI +	SUBQ  $0x02, R9 +	JMP   index_loop_encodeBetterBlockAsm8B  emit_remainder_encodeBetterBlockAsm8B:  	MOVQ src_len+32(FP), CX @@ -14287,9 +14321,9 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B:  // func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int  // Requires: BMI, SSE2 -TEXT ·encodeSnappyBetterBlockAsm(SB), $327704-56 +TEXT ·encodeSnappyBetterBlockAsm(SB), $589848-56  	MOVQ dst_base+0(FP), AX -	MOVQ $0x00000a00, CX +	MOVQ $0x00001200, CX  	LEAQ 24(SP), DX  	PXOR X0, X0 @@ -14341,27 +14375,37 @@ check_maxskip_cont_encodeSnappyBetterBlockAsm:  	MOVQ  DI, R11  	SHLQ  $0x08, R10  	IMULQ R9, R10 -	SHRQ  $0x30, R10 +	SHRQ  $0x2f, R10  	SHLQ  $0x20, R11  	IMULQ SI, R11  	SHRQ  $0x32, R11  	MOVL  24(SP)(R10*4), SI -	MOVL  262168(SP)(R11*4), R8 +	MOVL  524312(SP)(R11*4), R8  	MOVL  CX, 24(SP)(R10*4) -	MOVL  CX, 262168(SP)(R11*4) -	CMPL  (DX)(SI*1), DI +	MOVL  CX, 524312(SP)(R11*4) +	MOVQ  (DX)(SI*1), R10 +	MOVQ  (DX)(R8*1), R11 +	CMPQ  R10, DI  	JEQ   candidate_match_encodeSnappyBetterBlockAsm -	CMPL  (DX)(R8*1), DI -	JEQ   candidateS_match_encodeSnappyBetterBlockAsm -	MOVL  20(SP), CX -	JMP   search_loop_encodeSnappyBetterBlockAsm +	CMPQ  R11, DI +	JNE   no_short_found_encodeSnappyBetterBlockAsm +	MOVL  R8, SI +	JMP   candidate_match_encodeSnappyBetterBlockAsm + +no_short_found_encodeSnappyBetterBlockAsm: +	CMPL R10, DI +	JEQ  candidate_match_encodeSnappyBetterBlockAsm +	CMPL R11, DI +	JEQ  candidateS_match_encodeSnappyBetterBlockAsm +	MOVL 20(SP), CX +	JMP  search_loop_encodeSnappyBetterBlockAsm  candidateS_match_encodeSnappyBetterBlockAsm:  	SHRQ  $0x08, DI  	MOVQ  DI, R10  	SHLQ  $0x08, R10  	IMULQ R9, R10 -	SHRQ  $0x30, R10 +	SHRQ  $0x2f, R10  	MOVL  24(SP)(R10*4), SI  	INCL  CX  	MOVL  CX, 24(SP)(R10*4) @@ -14685,52 +14729,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm:  match_nolit_dst_ok_encodeSnappyBetterBlockAsm:  	MOVQ  $0x00cf1bbcdcbfa563, SI  	MOVQ  $0x9e3779b1, R8 -	INCL  DI -	MOVQ  (DX)(DI*1), R9 -	MOVQ  R9, R10 -	MOVQ  R9, R11 -	MOVQ  R9, R12 -	SHRQ  $0x08, R11 -	MOVQ  R11, R13 -	SHRQ  $0x10, R12 -	LEAL  1(DI), R14 -	LEAL  2(DI), R15 -	MOVQ  -2(DX)(CX*1), R9 +	LEAQ  1(DI), DI +	LEAQ  -2(CX), R9 +	MOVQ  (DX)(DI*1), R10 +	MOVQ  1(DX)(DI*1), R11 +	MOVQ  (DX)(R9*1), R12 +	MOVQ  1(DX)(R9*1), R13  	SHLQ  $0x08, R10  	IMULQ SI, R10 -	SHRQ  $0x30, R10 -	SHLQ  $0x08, R13 -	IMULQ SI, R13 -	SHRQ  $0x30, R13 +	SHRQ  $0x2f, R10  	SHLQ  $0x20, R11  	IMULQ R8, R11  	SHRQ  $0x32, R11 -	SHLQ  $0x20, R12 -	IMULQ R8, R12 -	SHRQ  $0x32, R12 +	SHLQ  $0x08, R12 +	IMULQ SI, R12 +	SHRQ  $0x2f, R12 +	SHLQ  $0x20, R13 +	IMULQ R8, R13 +	SHRQ  $0x32, R13 +	LEAQ  1(DI), R8 +	LEAQ  1(R9), R14  	MOVL  DI, 24(SP)(R10*4) -	MOVL  R14, 24(SP)(R13*4) -	MOVL  R14, 262168(SP)(R11*4) -	MOVL  R15, 262168(SP)(R12*4) -	MOVQ  R9, R10 -	MOVQ  R9, R11 -	SHRQ  $0x08, R11 -	MOVQ  R11, R13 -	LEAL  -2(CX), R9 -	LEAL  -1(CX), DI +	MOVL  R9, 24(SP)(R12*4) +	MOVL  R8, 524312(SP)(R11*4) +	MOVL  R14, 524312(SP)(R13*4) +	ADDQ  $0x01, DI +	SUBQ  $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm: +	CMPQ  DI, R9 +	JAE   search_loop_encodeSnappyBetterBlockAsm +	MOVQ  (DX)(DI*1), R8 +	MOVQ  (DX)(R9*1), R10 +	SHLQ  $0x08, R8 +	IMULQ SI, R8 +	SHRQ  $0x2f, R8  	SHLQ  $0x08, R10  	IMULQ SI, R10 -	SHRQ  $0x30, R10 -	SHLQ  $0x20, R11 -	IMULQ R8, R11 -	SHRQ  $0x32, R11 -	SHLQ  $0x08, R13 -	IMULQ SI, R13 -	SHRQ  $0x30, R13 +	SHRQ  $0x2f, R10 +	MOVL  DI, 24(SP)(R8*4)  	MOVL  R9, 24(SP)(R10*4) -	MOVL  DI, 262168(SP)(R11*4) -	MOVL  DI, 24(SP)(R13*4) -	JMP   search_loop_encodeSnappyBetterBlockAsm +	ADDQ  $0x02, DI +	SUBQ  $0x02, R9 +	JMP   index_loop_encodeSnappyBetterBlockAsm  emit_remainder_encodeSnappyBetterBlockAsm:  	MOVQ src_len+32(FP), CX @@ -14964,12 +15005,22 @@ search_loop_encodeSnappyBetterBlockAsm64K:  	MOVL  262168(SP)(R11*4), R8  	MOVL  CX, 24(SP)(R10*4)  	MOVL  CX, 262168(SP)(R11*4) -	CMPL  (DX)(SI*1), DI +	MOVQ  (DX)(SI*1), R10 +	MOVQ  (DX)(R8*1), R11 +	CMPQ  R10, DI  	JEQ   candidate_match_encodeSnappyBetterBlockAsm64K -	CMPL  (DX)(R8*1), DI -	JEQ   candidateS_match_encodeSnappyBetterBlockAsm64K -	MOVL  20(SP), CX -	JMP   search_loop_encodeSnappyBetterBlockAsm64K +	CMPQ  R11, DI +	JNE   no_short_found_encodeSnappyBetterBlockAsm64K +	MOVL  R8, SI +	JMP   candidate_match_encodeSnappyBetterBlockAsm64K + +no_short_found_encodeSnappyBetterBlockAsm64K: +	CMPL R10, DI +	JEQ  candidate_match_encodeSnappyBetterBlockAsm64K +	CMPL R11, DI +	JEQ  candidateS_match_encodeSnappyBetterBlockAsm64K +	MOVL 20(SP), CX +	JMP  search_loop_encodeSnappyBetterBlockAsm64K  candidateS_match_encodeSnappyBetterBlockAsm64K:  	SHRQ  $0x08, DI @@ -15248,52 +15299,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K:  match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K:  	MOVQ  $0x00cf1bbcdcbfa563, SI  	MOVQ  $0x9e3779b1, R8 -	INCL  DI -	MOVQ  (DX)(DI*1), R9 -	MOVQ  R9, R10 -	MOVQ  R9, R11 -	MOVQ  R9, R12 -	SHRQ  $0x08, R11 -	MOVQ  R11, R13 -	SHRQ  $0x10, R12 -	LEAL  1(DI), R14 -	LEAL  2(DI), R15 -	MOVQ  -2(DX)(CX*1), R9 +	LEAQ  1(DI), DI +	LEAQ  -2(CX), R9 +	MOVQ  (DX)(DI*1), R10 +	MOVQ  1(DX)(DI*1), R11 +	MOVQ  (DX)(R9*1), R12 +	MOVQ  1(DX)(R9*1), R13  	SHLQ  $0x08, R10  	IMULQ SI, R10  	SHRQ  $0x30, R10 -	SHLQ  $0x08, R13 -	IMULQ SI, R13 -	SHRQ  $0x30, R13  	SHLQ  $0x20, R11  	IMULQ R8, R11  	SHRQ  $0x32, R11 -	SHLQ  $0x20, R12 -	IMULQ R8, R12 -	SHRQ  $0x32, R12 +	SHLQ  $0x08, R12 +	IMULQ SI, R12 +	SHRQ  $0x30, R12 +	SHLQ  $0x20, R13 +	IMULQ R8, R13 +	SHRQ  $0x32, R13 +	LEAQ  1(DI), R8 +	LEAQ  1(R9), R14  	MOVL  DI, 24(SP)(R10*4) -	MOVL  R14, 24(SP)(R13*4) -	MOVL  R14, 262168(SP)(R11*4) -	MOVL  R15, 262168(SP)(R12*4) -	MOVQ  R9, R10 -	MOVQ  R9, R11 -	SHRQ  $0x08, R11 -	MOVQ  R11, R13 -	LEAL  -2(CX), R9 -	LEAL  -1(CX), DI +	MOVL  R9, 24(SP)(R12*4) +	MOVL  R8, 262168(SP)(R11*4) +	MOVL  R14, 262168(SP)(R13*4) +	ADDQ  $0x01, DI +	SUBQ  $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm64K: +	CMPQ  DI, R9 +	JAE   search_loop_encodeSnappyBetterBlockAsm64K +	MOVQ  (DX)(DI*1), R8 +	MOVQ  (DX)(R9*1), R10 +	SHLQ  $0x08, R8 +	IMULQ SI, R8 +	SHRQ  $0x30, R8  	SHLQ  $0x08, R10  	IMULQ SI, R10  	SHRQ  $0x30, R10 -	SHLQ  $0x20, R11 -	IMULQ R8, R11 -	SHRQ  $0x32, R11 -	SHLQ  $0x08, R13 -	IMULQ SI, R13 -	SHRQ  $0x30, R13 +	MOVL  DI, 24(SP)(R8*4)  	MOVL  R9, 24(SP)(R10*4) -	MOVL  DI, 262168(SP)(R11*4) -	MOVL  DI, 24(SP)(R13*4) -	JMP   search_loop_encodeSnappyBetterBlockAsm64K +	ADDQ  $0x02, DI +	SUBQ  $0x02, R9 +	JMP   index_loop_encodeSnappyBetterBlockAsm64K  emit_remainder_encodeSnappyBetterBlockAsm64K:  	MOVQ src_len+32(FP), CX @@ -15508,12 +15556,22 @@ search_loop_encodeSnappyBetterBlockAsm12B:  	MOVL  65560(SP)(R11*4), R8  	MOVL  CX, 24(SP)(R10*4)  	MOVL  CX, 65560(SP)(R11*4) -	CMPL  (DX)(SI*1), DI +	MOVQ  (DX)(SI*1), R10 +	MOVQ  (DX)(R8*1), R11 +	CMPQ  R10, DI  	JEQ   candidate_match_encodeSnappyBetterBlockAsm12B -	CMPL  (DX)(R8*1), DI -	JEQ   candidateS_match_encodeSnappyBetterBlockAsm12B -	MOVL  20(SP), CX -	JMP   search_loop_encodeSnappyBetterBlockAsm12B +	CMPQ  R11, DI +	JNE   no_short_found_encodeSnappyBetterBlockAsm12B +	MOVL  R8, SI +	JMP   candidate_match_encodeSnappyBetterBlockAsm12B + +no_short_found_encodeSnappyBetterBlockAsm12B: +	CMPL R10, DI +	JEQ  candidate_match_encodeSnappyBetterBlockAsm12B +	CMPL R11, DI +	JEQ  candidateS_match_encodeSnappyBetterBlockAsm12B +	MOVL 20(SP), CX +	JMP  search_loop_encodeSnappyBetterBlockAsm12B  candidateS_match_encodeSnappyBetterBlockAsm12B:  	SHRQ  $0x08, DI @@ -15792,52 +15850,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B:  match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B:  	MOVQ  $0x0000cf1bbcdcbf9b, SI  	MOVQ  $0x9e3779b1, R8 -	INCL  DI -	MOVQ  (DX)(DI*1), R9 -	MOVQ  R9, R10 -	MOVQ  R9, R11 -	MOVQ  R9, R12 -	SHRQ  $0x08, R11 -	MOVQ  R11, R13 -	SHRQ  $0x10, R12 -	LEAL  1(DI), R14 -	LEAL  2(DI), R15 -	MOVQ  -2(DX)(CX*1), R9 +	LEAQ  1(DI), DI +	LEAQ  -2(CX), R9 +	MOVQ  (DX)(DI*1), R10 +	MOVQ  1(DX)(DI*1), R11 +	MOVQ  (DX)(R9*1), R12 +	MOVQ  1(DX)(R9*1), R13  	SHLQ  $0x10, R10  	IMULQ SI, R10  	SHRQ  $0x32, R10 -	SHLQ  $0x10, R13 -	IMULQ SI, R13 -	SHRQ  $0x32, R13  	SHLQ  $0x20, R11  	IMULQ R8, R11  	SHRQ  $0x34, R11 -	SHLQ  $0x20, R12 -	IMULQ R8, R12 -	SHRQ  $0x34, R12 +	SHLQ  $0x10, R12 +	IMULQ SI, R12 +	SHRQ  $0x32, R12 +	SHLQ  $0x20, R13 +	IMULQ R8, R13 +	SHRQ  $0x34, R13 +	LEAQ  1(DI), R8 +	LEAQ  1(R9), R14  	MOVL  DI, 24(SP)(R10*4) -	MOVL  R14, 24(SP)(R13*4) -	MOVL  R14, 65560(SP)(R11*4) -	MOVL  R15, 65560(SP)(R12*4) -	MOVQ  R9, R10 -	MOVQ  R9, R11 -	SHRQ  $0x08, R11 -	MOVQ  R11, R13 -	LEAL  -2(CX), R9 -	LEAL  -1(CX), DI +	MOVL  R9, 24(SP)(R12*4) +	MOVL  R8, 65560(SP)(R11*4) +	MOVL  R14, 65560(SP)(R13*4) +	ADDQ  $0x01, DI +	SUBQ  $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm12B: +	CMPQ  DI, R9 +	JAE   search_loop_encodeSnappyBetterBlockAsm12B +	MOVQ  (DX)(DI*1), R8 +	MOVQ  (DX)(R9*1), R10 +	SHLQ  $0x10, R8 +	IMULQ SI, R8 +	SHRQ  $0x32, R8  	SHLQ  $0x10, R10  	IMULQ SI, R10  	SHRQ  $0x32, R10 -	SHLQ  $0x20, R11 -	IMULQ R8, R11 -	SHRQ  $0x34, R11 -	SHLQ  $0x10, R13 -	IMULQ SI, R13 -	SHRQ  $0x32, R13 +	MOVL  DI, 24(SP)(R8*4)  	MOVL  R9, 24(SP)(R10*4) -	MOVL  DI, 65560(SP)(R11*4) -	MOVL  DI, 24(SP)(R13*4) -	JMP   search_loop_encodeSnappyBetterBlockAsm12B +	ADDQ  $0x02, DI +	SUBQ  $0x02, R9 +	JMP   index_loop_encodeSnappyBetterBlockAsm12B  emit_remainder_encodeSnappyBetterBlockAsm12B:  	MOVQ src_len+32(FP), CX @@ -16052,12 +16107,22 @@ search_loop_encodeSnappyBetterBlockAsm10B:  	MOVL  16408(SP)(R11*4), R8  	MOVL  CX, 24(SP)(R10*4)  	MOVL  CX, 16408(SP)(R11*4) -	CMPL  (DX)(SI*1), DI +	MOVQ  (DX)(SI*1), R10 +	MOVQ  (DX)(R8*1), R11 +	CMPQ  R10, DI  	JEQ   candidate_match_encodeSnappyBetterBlockAsm10B -	CMPL  (DX)(R8*1), DI -	JEQ   candidateS_match_encodeSnappyBetterBlockAsm10B -	MOVL  20(SP), CX -	JMP   search_loop_encodeSnappyBetterBlockAsm10B +	CMPQ  R11, DI +	JNE   no_short_found_encodeSnappyBetterBlockAsm10B +	MOVL  R8, SI +	JMP   candidate_match_encodeSnappyBetterBlockAsm10B + +no_short_found_encodeSnappyBetterBlockAsm10B: +	CMPL R10, DI +	JEQ  candidate_match_encodeSnappyBetterBlockAsm10B +	CMPL R11, DI +	JEQ  candidateS_match_encodeSnappyBetterBlockAsm10B +	MOVL 20(SP), CX +	JMP  search_loop_encodeSnappyBetterBlockAsm10B  candidateS_match_encodeSnappyBetterBlockAsm10B:  	SHRQ  $0x08, DI @@ -16336,52 +16401,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B:  match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B:  	MOVQ  $0x0000cf1bbcdcbf9b, SI  	MOVQ  $0x9e3779b1, R8 -	INCL  DI -	MOVQ  (DX)(DI*1), R9 -	MOVQ  R9, R10 -	MOVQ  R9, R11 -	MOVQ  R9, R12 -	SHRQ  $0x08, R11 -	MOVQ  R11, R13 -	SHRQ  $0x10, R12 -	LEAL  1(DI), R14 -	LEAL  2(DI), R15 -	MOVQ  -2(DX)(CX*1), R9 +	LEAQ  1(DI), DI +	LEAQ  -2(CX), R9 +	MOVQ  (DX)(DI*1), R10 +	MOVQ  1(DX)(DI*1), R11 +	MOVQ  (DX)(R9*1), R12 +	MOVQ  1(DX)(R9*1), R13  	SHLQ  $0x10, R10  	IMULQ SI, R10  	SHRQ  $0x34, R10 -	SHLQ  $0x10, R13 -	IMULQ SI, R13 -	SHRQ  $0x34, R13  	SHLQ  $0x20, R11  	IMULQ R8, R11  	SHRQ  $0x36, R11 -	SHLQ  $0x20, R12 -	IMULQ R8, R12 -	SHRQ  $0x36, R12 +	SHLQ  $0x10, R12 +	IMULQ SI, R12 +	SHRQ  $0x34, R12 +	SHLQ  $0x20, R13 +	IMULQ R8, R13 +	SHRQ  $0x36, R13 +	LEAQ  1(DI), R8 +	LEAQ  1(R9), R14  	MOVL  DI, 24(SP)(R10*4) -	MOVL  R14, 24(SP)(R13*4) -	MOVL  R14, 16408(SP)(R11*4) -	MOVL  R15, 16408(SP)(R12*4) -	MOVQ  R9, R10 -	MOVQ  R9, R11 -	SHRQ  $0x08, R11 -	MOVQ  R11, R13 -	LEAL  -2(CX), R9 -	LEAL  -1(CX), DI +	MOVL  R9, 24(SP)(R12*4) +	MOVL  R8, 16408(SP)(R11*4) +	MOVL  R14, 16408(SP)(R13*4) +	ADDQ  $0x01, DI +	SUBQ  $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm10B: +	CMPQ  DI, R9 +	JAE   search_loop_encodeSnappyBetterBlockAsm10B +	MOVQ  (DX)(DI*1), R8 +	MOVQ  (DX)(R9*1), R10 +	SHLQ  $0x10, R8 +	IMULQ SI, R8 +	SHRQ  $0x34, R8  	SHLQ  $0x10, R10  	IMULQ SI, R10  	SHRQ  $0x34, R10 -	SHLQ  $0x20, R11 -	IMULQ R8, R11 -	SHRQ  $0x36, R11 -	SHLQ  $0x10, R13 -	IMULQ SI, R13 -	SHRQ  $0x34, R13 +	MOVL  DI, 24(SP)(R8*4)  	MOVL  R9, 24(SP)(R10*4) -	MOVL  DI, 16408(SP)(R11*4) -	MOVL  DI, 24(SP)(R13*4) -	JMP   search_loop_encodeSnappyBetterBlockAsm10B +	ADDQ  $0x02, DI +	SUBQ  $0x02, R9 +	JMP   index_loop_encodeSnappyBetterBlockAsm10B  emit_remainder_encodeSnappyBetterBlockAsm10B:  	MOVQ src_len+32(FP), CX @@ -16596,12 +16658,22 @@ search_loop_encodeSnappyBetterBlockAsm8B:  	MOVL  4120(SP)(R11*4), R8  	MOVL  CX, 24(SP)(R10*4)  	MOVL  CX, 4120(SP)(R11*4) -	CMPL  (DX)(SI*1), DI +	MOVQ  (DX)(SI*1), R10 +	MOVQ  (DX)(R8*1), R11 +	CMPQ  R10, DI  	JEQ   candidate_match_encodeSnappyBetterBlockAsm8B -	CMPL  (DX)(R8*1), DI -	JEQ   candidateS_match_encodeSnappyBetterBlockAsm8B -	MOVL  20(SP), CX -	JMP   search_loop_encodeSnappyBetterBlockAsm8B +	CMPQ  R11, DI +	JNE   no_short_found_encodeSnappyBetterBlockAsm8B +	MOVL  R8, SI +	JMP   candidate_match_encodeSnappyBetterBlockAsm8B + +no_short_found_encodeSnappyBetterBlockAsm8B: +	CMPL R10, DI +	JEQ  candidate_match_encodeSnappyBetterBlockAsm8B +	CMPL R11, DI +	JEQ  candidateS_match_encodeSnappyBetterBlockAsm8B +	MOVL 20(SP), CX +	JMP  search_loop_encodeSnappyBetterBlockAsm8B  candidateS_match_encodeSnappyBetterBlockAsm8B:  	SHRQ  $0x08, DI @@ -16878,52 +16950,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B:  match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B:  	MOVQ  $0x0000cf1bbcdcbf9b, SI  	MOVQ  $0x9e3779b1, R8 -	INCL  DI -	MOVQ  (DX)(DI*1), R9 -	MOVQ  R9, R10 -	MOVQ  R9, R11 -	MOVQ  R9, R12 -	SHRQ  $0x08, R11 -	MOVQ  R11, R13 -	SHRQ  $0x10, R12 -	LEAL  1(DI), R14 -	LEAL  2(DI), R15 -	MOVQ  -2(DX)(CX*1), R9 +	LEAQ  1(DI), DI +	LEAQ  -2(CX), R9 +	MOVQ  (DX)(DI*1), R10 +	MOVQ  1(DX)(DI*1), R11 +	MOVQ  (DX)(R9*1), R12 +	MOVQ  1(DX)(R9*1), R13  	SHLQ  $0x10, R10  	IMULQ SI, R10  	SHRQ  $0x36, R10 -	SHLQ  $0x10, R13 -	IMULQ SI, R13 -	SHRQ  $0x36, R13  	SHLQ  $0x20, R11  	IMULQ R8, R11  	SHRQ  $0x38, R11 -	SHLQ  $0x20, R12 -	IMULQ R8, R12 -	SHRQ  $0x38, R12 +	SHLQ  $0x10, R12 +	IMULQ SI, R12 +	SHRQ  $0x36, R12 +	SHLQ  $0x20, R13 +	IMULQ R8, R13 +	SHRQ  $0x38, R13 +	LEAQ  1(DI), R8 +	LEAQ  1(R9), R14  	MOVL  DI, 24(SP)(R10*4) -	MOVL  R14, 24(SP)(R13*4) -	MOVL  R14, 4120(SP)(R11*4) -	MOVL  R15, 4120(SP)(R12*4) -	MOVQ  R9, R10 -	MOVQ  R9, R11 -	SHRQ  $0x08, R11 -	MOVQ  R11, R13 -	LEAL  -2(CX), R9 -	LEAL  -1(CX), DI +	MOVL  R9, 24(SP)(R12*4) +	MOVL  R8, 4120(SP)(R11*4) +	MOVL  R14, 4120(SP)(R13*4) +	ADDQ  $0x01, DI +	SUBQ  $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm8B: +	CMPQ  DI, R9 +	JAE   search_loop_encodeSnappyBetterBlockAsm8B +	MOVQ  (DX)(DI*1), R8 +	MOVQ  (DX)(R9*1), R10 +	SHLQ  $0x10, R8 +	IMULQ SI, R8 +	SHRQ  $0x36, R8  	SHLQ  $0x10, R10  	IMULQ SI, R10  	SHRQ  $0x36, R10 -	SHLQ  $0x20, R11 -	IMULQ R8, R11 -	SHRQ  $0x38, R11 -	SHLQ  $0x10, R13 -	IMULQ SI, R13 -	SHRQ  $0x36, R13 +	MOVL  DI, 24(SP)(R8*4)  	MOVL  R9, 24(SP)(R10*4) -	MOVL  DI, 4120(SP)(R11*4) -	MOVL  DI, 24(SP)(R13*4) -	JMP   search_loop_encodeSnappyBetterBlockAsm8B +	ADDQ  $0x02, DI +	SUBQ  $0x02, R9 +	JMP   index_loop_encodeSnappyBetterBlockAsm8B  emit_remainder_encodeSnappyBetterBlockAsm8B:  	MOVQ src_len+32(FP), CX | 
