diff options
Diffstat (limited to 'vendor/github.com/klauspost/compress')
22 files changed, 1060 insertions, 810 deletions
diff --git a/vendor/github.com/klauspost/compress/flate/deflate.go b/vendor/github.com/klauspost/compress/flate/deflate.go index f8435998e..82882961a 100644 --- a/vendor/github.com/klauspost/compress/flate/deflate.go +++ b/vendor/github.com/klauspost/compress/flate/deflate.go @@ -131,7 +131,8 @@ func (d *compressor) fillDeflate(b []byte) int { s := d.state if s.index >= 2*windowSize-(minMatchLength+maxMatchLength) { // shift the window by windowSize - copy(d.window[:], d.window[windowSize:2*windowSize]) + //copy(d.window[:], d.window[windowSize:2*windowSize]) + *(*[windowSize]byte)(d.window) = *(*[windowSize]byte)(d.window[windowSize:]) s.index -= windowSize d.windowEnd -= windowSize if d.blockStart >= windowSize { @@ -293,7 +294,6 @@ func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, of } offset = 0 - cGain := 0 if d.chain < 100 { for i := prevHead; tries > 0; tries-- { if wEnd == win[i+length] { @@ -321,10 +321,14 @@ func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, of return } + // Minimum gain to accept a match. + cGain := 4 + // Some like it higher (CSV), some like it lower (JSON) - const baseCost = 6 + const baseCost = 3 // Base is 4 bytes at with an additional cost. // Matches must be better than this. + for i := prevHead; tries > 0; tries-- { if wEnd == win[i+length] { n := matchLen(win[i:i+minMatchLook], wPos) @@ -332,7 +336,7 @@ func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, of // Calculate gain. Estimate newGain := d.h.bitLengthRaw(wPos[:n]) - int(offsetExtraBits[offsetCode(uint32(pos-i))]) - baseCost - int(lengthExtraBits[lengthCodes[(n-3)&255]]) - //fmt.Println(n, "gain:", newGain, "prev:", cGain, "raw:", d.h.bitLengthRaw(wPos[:n])) + //fmt.Println("gain:", newGain, "prev:", cGain, "raw:", d.h.bitLengthRaw(wPos[:n]), "this-len:", n, "prev-len:", length) if newGain > cGain { length = n offset = pos - i @@ -373,6 +377,12 @@ func hash4(b []byte) uint32 { return hash4u(binary.LittleEndian.Uint32(b), hashBits) } +// hash4 returns the hash of u to fit in a hash table with h bits. +// Preferably h should be a constant and should always be <32. +func hash4u(u uint32, h uint8) uint32 { + return (u * prime4bytes) >> (32 - h) +} + // bulkHash4 will compute hashes using the same // algorithm as hash4 func bulkHash4(b []byte, dst []uint32) { @@ -483,27 +493,103 @@ func (d *compressor) deflateLazy() { } if prevLength >= minMatchLength && s.length <= prevLength { - // Check for better match at end... + // No better match, but check for better match at end... // - // checkOff must be >=2 since we otherwise risk checking s.index - // Offset of 2 seems to yield best results. + // Skip forward a number of bytes. + // Offset of 2 seems to yield best results. 3 is sometimes better. const checkOff = 2 - prevIndex := s.index - 1 - if prevIndex+prevLength+checkOff < s.maxInsertIndex { - end := lookahead - if lookahead > maxMatchLength { - end = maxMatchLength - } - end += prevIndex - idx := prevIndex + prevLength - (4 - checkOff) - h := hash4(d.window[idx:]) - ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength + (4 - checkOff) - if ch2 > minIndex { - length := matchLen(d.window[prevIndex:end], d.window[ch2:]) - // It seems like a pure length metric is best. - if length > prevLength { - prevLength = length - prevOffset = prevIndex - ch2 + + // Check all, except full length + if prevLength < maxMatchLength-checkOff { + prevIndex := s.index - 1 + if prevIndex+prevLength < s.maxInsertIndex { + end := lookahead + if lookahead > maxMatchLength+checkOff { + end = maxMatchLength + checkOff + } + end += prevIndex + + // Hash at match end. + h := hash4(d.window[prevIndex+prevLength:]) + ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength + if prevIndex-ch2 != prevOffset && ch2 > minIndex+checkOff { + length := matchLen(d.window[prevIndex+checkOff:end], d.window[ch2+checkOff:]) + // It seems like a pure length metric is best. + if length > prevLength { + prevLength = length + prevOffset = prevIndex - ch2 + + // Extend back... + for i := checkOff - 1; i >= 0; i-- { + if prevLength >= maxMatchLength || d.window[prevIndex+i] != d.window[ch2+i] { + // Emit tokens we "owe" + for j := 0; j <= i; j++ { + d.tokens.AddLiteral(d.window[prevIndex+j]) + if d.tokens.n == maxFlateBlockTokens { + // The block includes the current character + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { + return + } + d.tokens.Reset() + } + s.index++ + if s.index < s.maxInsertIndex { + h := hash4(d.window[s.index:]) + ch := s.hashHead[h] + s.chainHead = int(ch) + s.hashPrev[s.index&windowMask] = ch + s.hashHead[h] = uint32(s.index + s.hashOffset) + } + } + break + } else { + prevLength++ + } + } + } else if false { + // Check one further ahead. + // Only rarely better, disabled for now. + prevIndex++ + h := hash4(d.window[prevIndex+prevLength:]) + ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength + if prevIndex-ch2 != prevOffset && ch2 > minIndex+checkOff { + length := matchLen(d.window[prevIndex+checkOff:end], d.window[ch2+checkOff:]) + // It seems like a pure length metric is best. + if length > prevLength+checkOff { + prevLength = length + prevOffset = prevIndex - ch2 + prevIndex-- + + // Extend back... + for i := checkOff; i >= 0; i-- { + if prevLength >= maxMatchLength || d.window[prevIndex+i] != d.window[ch2+i-1] { + // Emit tokens we "owe" + for j := 0; j <= i; j++ { + d.tokens.AddLiteral(d.window[prevIndex+j]) + if d.tokens.n == maxFlateBlockTokens { + // The block includes the current character + if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil { + return + } + d.tokens.Reset() + } + s.index++ + if s.index < s.maxInsertIndex { + h := hash4(d.window[s.index:]) + ch := s.hashHead[h] + s.chainHead = int(ch) + s.hashPrev[s.index&windowMask] = ch + s.hashHead[h] = uint32(s.index + s.hashOffset) + } + } + break + } else { + prevLength++ + } + } + } + } + } } } } diff --git a/vendor/github.com/klauspost/compress/flate/dict_decoder.go b/vendor/github.com/klauspost/compress/flate/dict_decoder.go index 71c75a065..bb36351a5 100644 --- a/vendor/github.com/klauspost/compress/flate/dict_decoder.go +++ b/vendor/github.com/klauspost/compress/flate/dict_decoder.go @@ -7,19 +7,19 @@ package flate // dictDecoder implements the LZ77 sliding dictionary as used in decompression. // LZ77 decompresses data through sequences of two forms of commands: // -// * Literal insertions: Runs of one or more symbols are inserted into the data -// stream as is. This is accomplished through the writeByte method for a -// single symbol, or combinations of writeSlice/writeMark for multiple symbols. -// Any valid stream must start with a literal insertion if no preset dictionary -// is used. +// - Literal insertions: Runs of one or more symbols are inserted into the data +// stream as is. This is accomplished through the writeByte method for a +// single symbol, or combinations of writeSlice/writeMark for multiple symbols. +// Any valid stream must start with a literal insertion if no preset dictionary +// is used. // -// * Backward copies: Runs of one or more symbols are copied from previously -// emitted data. Backward copies come as the tuple (dist, length) where dist -// determines how far back in the stream to copy from and length determines how -// many bytes to copy. Note that it is valid for the length to be greater than -// the distance. Since LZ77 uses forward copies, that situation is used to -// perform a form of run-length encoding on repeated runs of symbols. -// The writeCopy and tryWriteCopy are used to implement this command. +// - Backward copies: Runs of one or more symbols are copied from previously +// emitted data. Backward copies come as the tuple (dist, length) where dist +// determines how far back in the stream to copy from and length determines how +// many bytes to copy. Note that it is valid for the length to be greater than +// the distance. Since LZ77 uses forward copies, that situation is used to +// perform a form of run-length encoding on repeated runs of symbols. +// The writeCopy and tryWriteCopy are used to implement this command. // // For performance reasons, this implementation performs little to no sanity // checks about the arguments. As such, the invariants documented for each diff --git a/vendor/github.com/klauspost/compress/flate/fast_encoder.go b/vendor/github.com/klauspost/compress/flate/fast_encoder.go index f781aaa62..24caf5f70 100644 --- a/vendor/github.com/klauspost/compress/flate/fast_encoder.go +++ b/vendor/github.com/klauspost/compress/flate/fast_encoder.go @@ -58,17 +58,6 @@ const ( prime8bytes = 0xcf1bbcdcb7a56463 ) -func load32(b []byte, i int) uint32 { - // Help the compiler eliminate bounds checks on the read so it can be done in a single read. - b = b[i:] - b = b[:4] - return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 -} - -func load64(b []byte, i int) uint64 { - return binary.LittleEndian.Uint64(b[i:]) -} - func load3232(b []byte, i int32) uint32 { return binary.LittleEndian.Uint32(b[i:]) } @@ -77,10 +66,6 @@ func load6432(b []byte, i int32) uint64 { return binary.LittleEndian.Uint64(b[i:]) } -func hash(u uint32) uint32 { - return (u * 0x1e35a7bd) >> tableShift -} - type tableEntry struct { offset int32 } @@ -104,7 +89,8 @@ func (e *fastGen) addBlock(src []byte) int32 { } // Move down offset := int32(len(e.hist)) - maxMatchOffset - copy(e.hist[0:maxMatchOffset], e.hist[offset:]) + // copy(e.hist[0:maxMatchOffset], e.hist[offset:]) + *(*[maxMatchOffset]byte)(e.hist) = *(*[maxMatchOffset]byte)(e.hist[offset:]) e.cur += offset e.hist = e.hist[:maxMatchOffset] } @@ -114,39 +100,36 @@ func (e *fastGen) addBlock(src []byte) int32 { return s } -// hash4 returns the hash of u to fit in a hash table with h bits. -// Preferably h should be a constant and should always be <32. -func hash4u(u uint32, h uint8) uint32 { - return (u * prime4bytes) >> (32 - h) -} - type tableEntryPrev struct { Cur tableEntry Prev tableEntry } -// hash4x64 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits. -// Preferably h should be a constant and should always be <32. -func hash4x64(u uint64, h uint8) uint32 { - return (uint32(u) * prime4bytes) >> ((32 - h) & reg8SizeMask32) -} - // hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits. // Preferably h should be a constant and should always be <64. func hash7(u uint64, h uint8) uint32 { return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & reg8SizeMask64)) } -// hash8 returns the hash of u to fit in a hash table with h bits. -// Preferably h should be a constant and should always be <64. -func hash8(u uint64, h uint8) uint32 { - return uint32((u * prime8bytes) >> ((64 - h) & reg8SizeMask64)) -} - -// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits. -// Preferably h should be a constant and should always be <64. -func hash6(u uint64, h uint8) uint32 { - return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & reg8SizeMask64)) +// hashLen returns a hash of the lowest mls bytes of with length output bits. +// mls must be >=3 and <=8. Any other value will return hash for 4 bytes. +// length should always be < 32. +// Preferably length and mls should be a constant for inlining. +func hashLen(u uint64, length, mls uint8) uint32 { + switch mls { + case 3: + return (uint32(u<<8) * prime3bytes) >> (32 - length) + case 5: + return uint32(((u << (64 - 40)) * prime5bytes) >> (64 - length)) + case 6: + return uint32(((u << (64 - 48)) * prime6bytes) >> (64 - length)) + case 7: + return uint32(((u << (64 - 56)) * prime7bytes) >> (64 - length)) + case 8: + return uint32((u * prime8bytes) >> (64 - length)) + default: + return (uint32(u) * prime4bytes) >> (32 - length) + } } // matchlen will return the match length between offsets and t in src. diff --git a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go index 40ef45c2f..89a5dd89f 100644 --- a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go +++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go @@ -265,9 +265,9 @@ func (w *huffmanBitWriter) writeBytes(bytes []byte) { // Codes 0-15 are single byte codes. Codes 16-18 are followed by additional // information. Code badCode is an end marker // -// numLiterals The number of literals in literalEncoding -// numOffsets The number of offsets in offsetEncoding -// litenc, offenc The literal and offset encoder to use +// numLiterals The number of literals in literalEncoding +// numOffsets The number of offsets in offsetEncoding +// litenc, offenc The literal and offset encoder to use func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litEnc, offEnc *huffmanEncoder) { for i := range w.codegenFreq { w.codegenFreq[i] = 0 @@ -460,9 +460,9 @@ func (w *huffmanBitWriter) writeOutBits() { // Write the header of a dynamic Huffman block to the output stream. // -// numLiterals The number of literals specified in codegen -// numOffsets The number of offsets specified in codegen -// numCodegens The number of codegens used in codegen +// numLiterals The number of literals specified in codegen +// numOffsets The number of offsets specified in codegen +// numCodegens The number of codegens used in codegen func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, numCodegens int, isEof bool) { if w.err != nil { return @@ -790,9 +790,11 @@ func (w *huffmanBitWriter) fillTokens() { // and offsetEncoding. // The number of literal and offset tokens is returned. func (w *huffmanBitWriter) indexTokens(t *tokens, filled bool) (numLiterals, numOffsets int) { - copy(w.literalFreq[:], t.litHist[:]) - copy(w.literalFreq[256:], t.extraHist[:]) - copy(w.offsetFreq[:], t.offHist[:offsetCodeCount]) + //copy(w.literalFreq[:], t.litHist[:]) + *(*[256]uint16)(w.literalFreq[:]) = t.litHist + //copy(w.literalFreq[256:], t.extraHist[:]) + *(*[32]uint16)(w.literalFreq[256:]) = t.extraHist + w.offsetFreq = t.offHist if t.n == 0 { return diff --git a/vendor/github.com/klauspost/compress/flate/huffman_code.go b/vendor/github.com/klauspost/compress/flate/huffman_code.go index 5ac144f28..be7b58b47 100644 --- a/vendor/github.com/klauspost/compress/flate/huffman_code.go +++ b/vendor/github.com/klauspost/compress/flate/huffman_code.go @@ -168,13 +168,18 @@ func (h *huffmanEncoder) canReuseBits(freq []uint16) int { // The cases of 0, 1, and 2 literals are handled by special case code. // // list An array of the literals with non-zero frequencies -// and their associated frequencies. The array is in order of increasing -// frequency, and has as its last element a special element with frequency -// MaxInt32 +// +// and their associated frequencies. The array is in order of increasing +// frequency, and has as its last element a special element with frequency +// MaxInt32 +// // maxBits The maximum number of bits that should be used to encode any literal. -// Must be less than 16. +// +// Must be less than 16. +// // return An integer array in which array[i] indicates the number of literals -// that should be encoded in i bits. +// +// that should be encoded in i bits. func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 { if maxBits >= maxBitsLimit { panic("flate: maxBits too large") diff --git a/vendor/github.com/klauspost/compress/flate/level1.go b/vendor/github.com/klauspost/compress/flate/level1.go index 0f14f8d63..703b9a89a 100644 --- a/vendor/github.com/klauspost/compress/flate/level1.go +++ b/vendor/github.com/klauspost/compress/flate/level1.go @@ -19,6 +19,7 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { const ( inputMargin = 12 - 1 minNonLiteralBlockSize = 1 + 1 + inputMargin + hashBytes = 5 ) if debugDeflate && e.cur < 0 { panic(fmt.Sprint("e.cur < 0: ", e.cur)) @@ -68,7 +69,7 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { sLimit := int32(len(src) - inputMargin) // nextEmit is where in src the next emitLiteral should start from. - cv := load3232(src, s) + cv := load6432(src, s) for { const skipLog = 5 @@ -77,7 +78,7 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { nextS := s var candidate tableEntry for { - nextHash := hash(cv) + nextHash := hashLen(cv, tableBits, hashBytes) candidate = e.table[nextHash] nextS = s + doEvery + (s-nextEmit)>>skipLog if nextS > sLimit { @@ -86,16 +87,16 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { now := load6432(src, nextS) e.table[nextHash] = tableEntry{offset: s + e.cur} - nextHash = hash(uint32(now)) + nextHash = hashLen(now, tableBits, hashBytes) offset := s - (candidate.offset - e.cur) - if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) { + if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) { e.table[nextHash] = tableEntry{offset: nextS + e.cur} break } // Do one right away... - cv = uint32(now) + cv = now s = nextS nextS++ candidate = e.table[nextHash] @@ -103,11 +104,11 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { e.table[nextHash] = tableEntry{offset: s + e.cur} offset = s - (candidate.offset - e.cur) - if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) { + if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) { e.table[nextHash] = tableEntry{offset: nextS + e.cur} break } - cv = uint32(now) + cv = now s = nextS } @@ -198,9 +199,9 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { } if s >= sLimit { // Index first pair after match end. - if int(s+l+4) < len(src) { - cv := load3232(src, s) - e.table[hash(cv)] = tableEntry{offset: s + e.cur} + if int(s+l+8) < len(src) { + cv := load6432(src, s) + e.table[hashLen(cv, tableBits, hashBytes)] = tableEntry{offset: s + e.cur} } goto emitRemainder } @@ -213,16 +214,16 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) { // three load32 calls. x := load6432(src, s-2) o := e.cur + s - 2 - prevHash := hash(uint32(x)) + prevHash := hashLen(x, tableBits, hashBytes) e.table[prevHash] = tableEntry{offset: o} x >>= 16 - currHash := hash(uint32(x)) + currHash := hashLen(x, tableBits, hashBytes) candidate = e.table[currHash] e.table[currHash] = tableEntry{offset: o + 2} offset := s - (candidate.offset - e.cur) if offset > maxMatchOffset || uint32(x) != load3232(src, candidate.offset-e.cur) { - cv = uint32(x >> 8) + cv = x >> 8 s++ break } diff --git a/vendor/github.com/klauspost/compress/flate/level2.go b/vendor/github.com/klauspost/compress/flate/level2.go index 8603fbd55..876dfbe30 100644 --- a/vendor/github.com/klauspost/compress/flate/level2.go +++ b/vendor/github.com/klauspost/compress/flate/level2.go @@ -16,6 +16,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { const ( inputMargin = 12 - 1 minNonLiteralBlockSize = 1 + 1 + inputMargin + hashBytes = 5 ) if debugDeflate && e.cur < 0 { @@ -66,7 +67,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { sLimit := int32(len(src) - inputMargin) // nextEmit is where in src the next emitLiteral should start from. - cv := load3232(src, s) + cv := load6432(src, s) for { // When should we start skipping if we haven't found matches in a long while. const skipLog = 5 @@ -75,7 +76,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { nextS := s var candidate tableEntry for { - nextHash := hash4u(cv, bTableBits) + nextHash := hashLen(cv, bTableBits, hashBytes) s = nextS nextS = s + doEvery + (s-nextEmit)>>skipLog if nextS > sLimit { @@ -84,16 +85,16 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { candidate = e.table[nextHash] now := load6432(src, nextS) e.table[nextHash] = tableEntry{offset: s + e.cur} - nextHash = hash4u(uint32(now), bTableBits) + nextHash = hashLen(now, bTableBits, hashBytes) offset := s - (candidate.offset - e.cur) - if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) { + if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) { e.table[nextHash] = tableEntry{offset: nextS + e.cur} break } // Do one right away... - cv = uint32(now) + cv = now s = nextS nextS++ candidate = e.table[nextHash] @@ -101,10 +102,10 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { e.table[nextHash] = tableEntry{offset: s + e.cur} offset = s - (candidate.offset - e.cur) - if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) { + if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) { break } - cv = uint32(now) + cv = now } // A 4-byte match has been found. We'll later see if more than 4 bytes @@ -154,9 +155,9 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { if s >= sLimit { // Index first pair after match end. - if int(s+l+4) < len(src) { - cv := load3232(src, s) - e.table[hash4u(cv, bTableBits)] = tableEntry{offset: s + e.cur} + if int(s+l+8) < len(src) { + cv := load6432(src, s) + e.table[hashLen(cv, bTableBits, hashBytes)] = tableEntry{offset: s + e.cur} } goto emitRemainder } @@ -164,15 +165,15 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { // Store every second hash in-between, but offset by 1. for i := s - l + 2; i < s-5; i += 7 { x := load6432(src, i) - nextHash := hash4u(uint32(x), bTableBits) + nextHash := hashLen(x, bTableBits, hashBytes) e.table[nextHash] = tableEntry{offset: e.cur + i} // Skip one x >>= 16 - nextHash = hash4u(uint32(x), bTableBits) + nextHash = hashLen(x, bTableBits, hashBytes) e.table[nextHash] = tableEntry{offset: e.cur + i + 2} // Skip one x >>= 16 - nextHash = hash4u(uint32(x), bTableBits) + nextHash = hashLen(x, bTableBits, hashBytes) e.table[nextHash] = tableEntry{offset: e.cur + i + 4} } @@ -184,17 +185,17 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) { // three load32 calls. x := load6432(src, s-2) o := e.cur + s - 2 - prevHash := hash4u(uint32(x), bTableBits) - prevHash2 := hash4u(uint32(x>>8), bTableBits) + prevHash := hashLen(x, bTableBits, hashBytes) + prevHash2 := hashLen(x>>8, bTableBits, hashBytes) e.table[prevHash] = tableEntry{offset: o} e.table[prevHash2] = tableEntry{offset: o + 1} - currHash := hash4u(uint32(x>>16), bTableBits) + currHash := hashLen(x>>16, bTableBits, hashBytes) candidate = e.table[currHash] e.table[currHash] = tableEntry{offset: o + 2} offset := s - (candidate.offset - e.cur) if offset > maxMatchOffset || uint32(x>>16) != load3232(src, candidate.offset-e.cur) { - cv = uint32(x >> 24) + cv = x >> 24 s++ break } diff --git a/vendor/github.com/klauspost/compress/flate/level3.go b/vendor/github.com/klauspost/compress/flate/level3.go index 039639f89..7aa2b72a1 100644 --- a/vendor/github.com/klauspost/compress/flate/level3.go +++ b/vendor/github.com/klauspost/compress/flate/level3.go @@ -11,10 +11,11 @@ type fastEncL3 struct { // Encode uses a similar algorithm to level 2, will check up to two candidates. func (e *fastEncL3) Encode(dst *tokens, src []byte) { const ( - inputMargin = 8 - 1 + inputMargin = 12 - 1 minNonLiteralBlockSize = 1 + 1 + inputMargin tableBits = 16 tableSize = 1 << tableBits + hashBytes = 5 ) if debugDeflate && e.cur < 0 { @@ -69,20 +70,20 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { sLimit := int32(len(src) - inputMargin) // nextEmit is where in src the next emitLiteral should start from. - cv := load3232(src, s) + cv := load6432(src, s) for { - const skipLog = 6 + const skipLog = 7 nextS := s var candidate tableEntry for { - nextHash := hash4u(cv, tableBits) + nextHash := hashLen(cv, tableBits, hashBytes) s = nextS nextS = s + 1 + (s-nextEmit)>>skipLog if nextS > sLimit { goto emitRemainder } candidates := e.table[nextHash] - now := load3232(src, nextS) + now := load6432(src, nextS) // Safe offset distance until s + 4... minOffset := e.cur + s - (maxMatchOffset - 4) @@ -96,8 +97,8 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { continue } - if cv == load3232(src, candidate.offset-e.cur) { - if candidates.Prev.offset < minOffset || cv != load3232(src, candidates.Prev.offset-e.cur) { + if uint32(cv) == load3232(src, candidate.offset-e.cur) { + if candidates.Prev.offset < minOffset || uint32(cv) != load3232(src, candidates.Prev.offset-e.cur) { break } // Both match and are valid, pick longest. @@ -112,7 +113,7 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { // We only check if value mismatches. // Offset will always be invalid in other cases. candidate = candidates.Prev - if candidate.offset > minOffset && cv == load3232(src, candidate.offset-e.cur) { + if candidate.offset > minOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) { break } } @@ -164,9 +165,9 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { if s >= sLimit { t += l // Index first pair after match end. - if int(t+4) < len(src) && t > 0 { - cv := load3232(src, t) - nextHash := hash4u(cv, tableBits) + if int(t+8) < len(src) && t > 0 { + cv = load6432(src, t) + nextHash := hashLen(cv, tableBits, hashBytes) e.table[nextHash] = tableEntryPrev{ Prev: e.table[nextHash].Cur, Cur: tableEntry{offset: e.cur + t}, @@ -176,8 +177,8 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { } // Store every 5th hash in-between. - for i := s - l + 2; i < s-5; i += 5 { - nextHash := hash4u(load3232(src, i), tableBits) + for i := s - l + 2; i < s-5; i += 6 { + nextHash := hashLen(load6432(src, i), tableBits, hashBytes) e.table[nextHash] = tableEntryPrev{ Prev: e.table[nextHash].Cur, Cur: tableEntry{offset: e.cur + i}} @@ -185,23 +186,23 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { // We could immediately start working at s now, but to improve // compression we first update the hash table at s-2 to s. x := load6432(src, s-2) - prevHash := hash4u(uint32(x), tableBits) + prevHash := hashLen(x, tableBits, hashBytes) e.table[prevHash] = tableEntryPrev{ Prev: e.table[prevHash].Cur, Cur: tableEntry{offset: e.cur + s - 2}, } x >>= 8 - prevHash = hash4u(uint32(x), tableBits) + prevHash = hashLen(x, tableBits, hashBytes) e.table[prevHash] = tableEntryPrev{ Prev: e.table[prevHash].Cur, Cur: tableEntry{offset: e.cur + s - 1}, } x >>= 8 - currHash := hash4u(uint32(x), tableBits) + currHash := hashLen(x, tableBits, hashBytes) candidates := e.table[currHash] - cv = uint32(x) + cv = x e.table[currHash] = tableEntryPrev{ Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur}, @@ -212,17 +213,17 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) { minOffset := e.cur + s - (maxMatchOffset - 4) if candidate.offset > minOffset { - if cv == load3232(src, candidate.offset-e.cur) { + if uint32(cv) == load3232(src, candidate.offset-e.cur) { // Found a match... continue } candidate = candidates.Prev - if candidate.offset > minOffset && cv == load3232(src, candidate.offset-e.cur) { + if candidate.offset > minOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) { // Match at prev... continue } } - cv = uint32(x >> 8) + cv = x >> 8 s++ break } diff --git a/vendor/github.com/klauspost/compress/flate/level4.go b/vendor/github.com/klauspost/compress/flate/level4.go index 1cbffa1ae..23c08b325 100644 --- a/vendor/github.com/klauspost/compress/flate/level4.go +++ b/vendor/github.com/klauspost/compress/flate/level4.go @@ -12,6 +12,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { const ( inputMargin = 12 - 1 minNonLiteralBlockSize = 1 + 1 + inputMargin + hashShortBytes = 4 ) if debugDeflate && e.cur < 0 { panic(fmt.Sprint("e.cur < 0: ", e.cur)) @@ -80,7 +81,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { nextS := s var t int32 for { - nextHashS := hash4x64(cv, tableBits) + nextHashS := hashLen(cv, tableBits, hashShortBytes) nextHashL := hash7(cv, tableBits) s = nextS @@ -168,7 +169,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { // Index first pair after match end. if int(s+8) < len(src) { cv := load6432(src, s) - e.table[hash4x64(cv, tableBits)] = tableEntry{offset: s + e.cur} + e.table[hashLen(cv, tableBits, hashShortBytes)] = tableEntry{offset: s + e.cur} e.bTable[hash7(cv, tableBits)] = tableEntry{offset: s + e.cur} } goto emitRemainder @@ -183,7 +184,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { t2 := tableEntry{offset: t.offset + 1} e.bTable[hash7(cv, tableBits)] = t e.bTable[hash7(cv>>8, tableBits)] = t2 - e.table[hash4u(uint32(cv>>8), tableBits)] = t2 + e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 i += 3 for ; i < s-1; i += 3 { @@ -192,7 +193,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { t2 := tableEntry{offset: t.offset + 1} e.bTable[hash7(cv, tableBits)] = t e.bTable[hash7(cv>>8, tableBits)] = t2 - e.table[hash4u(uint32(cv>>8), tableBits)] = t2 + e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 } } } @@ -201,7 +202,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) { // compression we first update the hash table at s-1 and at s. x := load6432(src, s-1) o := e.cur + s - 1 - prevHashS := hash4x64(x, tableBits) + prevHashS := hashLen(x, tableBits, hashShortBytes) prevHashL := hash7(x, tableBits) e.table[prevHashS] = tableEntry{offset: o} e.bTable[prevHashL] = tableEntry{offset: o} diff --git a/vendor/github.com/klauspost/compress/flate/level5.go b/vendor/github.com/klauspost/compress/flate/level5.go index 4b97576bd..83ef50ba4 100644 --- a/vendor/github.com/klauspost/compress/flate/level5.go +++ b/vendor/github.com/klauspost/compress/flate/level5.go @@ -12,6 +12,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { const ( inputMargin = 12 - 1 minNonLiteralBlockSize = 1 + 1 + inputMargin + hashShortBytes = 4 ) if debugDeflate && e.cur < 0 { panic(fmt.Sprint("e.cur < 0: ", e.cur)) @@ -88,7 +89,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { var l int32 var t int32 for { - nextHashS := hash4x64(cv, tableBits) + nextHashS := hashLen(cv, tableBits, hashShortBytes) nextHashL := hash7(cv, tableBits) s = nextS @@ -105,7 +106,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { eLong := &e.bTable[nextHashL] eLong.Cur, eLong.Prev = entry, eLong.Cur - nextHashS = hash4x64(next, tableBits) + nextHashS = hashLen(next, tableBits, hashShortBytes) nextHashL = hash7(next, tableBits) t = lCandidate.Cur.offset - e.cur @@ -191,14 +192,21 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { // Try to locate a better match by checking the end of best match... if sAt := s + l; l < 30 && sAt < sLimit { + // Allow some bytes at the beginning to mismatch. + // Sweet spot is 2/3 bytes depending on input. + // 3 is only a little better when it is but sometimes a lot worse. + // The skipped bytes are tested in Extend backwards, + // and still picked up as part of the match if they do. + const skipBeginning = 2 eLong := e.bTable[hash7(load6432(src, sAt), tableBits)].Cur.offset - // Test current - t2 := eLong - e.cur - l - off := s - t2 + t2 := eLong - e.cur - l + skipBeginning + s2 := s + skipBeginning + off := s2 - t2 if t2 >= 0 && off < maxMatchOffset && off > 0 { - if l2 := e.matchlenLong(s, t2, src); l2 > l { + if l2 := e.matchlenLong(s2, t2, src); l2 > l { t = t2 l = l2 + s = s2 } } } @@ -250,7 +258,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { if i < s-1 { cv := load6432(src, i) t := tableEntry{offset: i + e.cur} - e.table[hash4x64(cv, tableBits)] = t + e.table[hashLen(cv, tableBits, hashShortBytes)] = t eLong := &e.bTable[hash7(cv, tableBits)] eLong.Cur, eLong.Prev = t, eLong.Cur @@ -263,7 +271,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { // We only have enough bits for a short entry at i+2 cv >>= 8 t = tableEntry{offset: t.offset + 1} - e.table[hash4x64(cv, tableBits)] = t + e.table[hashLen(cv, tableBits, hashShortBytes)] = t // Skip one - otherwise we risk hitting 's' i += 4 @@ -273,7 +281,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { t2 := tableEntry{offset: t.offset + 1} eLong := &e.bTable[hash7(cv, tableBits)] eLong.Cur, eLong.Prev = t, eLong.Cur - e.table[hash4u(uint32(cv>>8), tableBits)] = t2 + e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2 } } } @@ -282,7 +290,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) { // compression we first update the hash table at s-1 and at s. x := load6432(src, s-1) o := e.cur + s - 1 - prevHashS := hash4x64(x, tableBits) + prevHashS := hashLen(x, tableBits, hashShortBytes) prevHashL := hash7(x, tableBits) e.table[prevHashS] = tableEntry{offset: o} eLong := &e.bTable[prevHashL] diff --git a/vendor/github.com/klauspost/compress/flate/level6.go b/vendor/github.com/klauspost/compress/flate/level6.go index 62888edf3..f1e9d98fa 100644 --- a/vendor/github.com/klauspost/compress/flate/level6.go +++ b/vendor/github.com/klauspost/compress/flate/level6.go @@ -12,6 +12,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { const ( inputMargin = 12 - 1 minNonLiteralBlockSize = 1 + 1 + inputMargin + hashShortBytes = 4 ) if debugDeflate && e.cur < 0 { panic(fmt.Sprint("e.cur < 0: ", e.cur)) @@ -90,7 +91,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { var l int32 var t int32 for { - nextHashS := hash4x64(cv, tableBits) + nextHashS := hashLen(cv, tableBits, hashShortBytes) nextHashL := hash7(cv, tableBits) s = nextS nextS = s + doEvery + (s-nextEmit)>>skipLog @@ -107,7 +108,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { eLong.Cur, eLong.Prev = entry, eLong.Cur // Calculate hashes of 'next' - nextHashS = hash4x64(next, tableBits) + nextHashS = hashLen(next, tableBits, hashShortBytes) nextHashL = hash7(next, tableBits) t = lCandidate.Cur.offset - e.cur @@ -213,24 +214,33 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { // Try to locate a better match by checking the end-of-match... if sAt := s + l; sAt < sLimit { + // Allow some bytes at the beginning to mismatch. + // Sweet spot is 2/3 bytes depending on input. + // 3 is only a little better when it is but sometimes a lot worse. + // The skipped bytes are tested in Extend backwards, + // and still picked up as part of the match if they do. + const skipBeginning = 2 eLong := &e.bTable[hash7(load6432(src, sAt), tableBits)] // Test current - t2 := eLong.Cur.offset - e.cur - l - off := s - t2 + t2 := eLong.Cur.offset - e.cur - l + skipBeginning + s2 := s + skipBeginning + off := s2 - t2 if off < maxMatchOffset { if off > 0 && t2 >= 0 { - if l2 := e.matchlenLong(s, t2, src); l2 > l { + if l2 := e.matchlenLong(s2, t2, src); l2 > l { t = t2 l = l2 + s = s2 } } // Test next: - t2 = eLong.Prev.offset - e.cur - l - off := s - t2 + t2 = eLong.Prev.offset - e.cur - l + skipBeginning + off := s2 - t2 if off > 0 && off < maxMatchOffset && t2 >= 0 { - if l2 := e.matchlenLong(s, t2, src); l2 > l { + if l2 := e.matchlenLong(s2, t2, src); l2 > l { t = t2 l = l2 + s = s2 } } } @@ -277,7 +287,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { // Index after match end. for i := nextS + 1; i < int32(len(src))-8; i += 2 { cv := load6432(src, i) - e.table[hash4x64(cv, tableBits)] = tableEntry{offset: i + e.cur} + e.table[hashLen(cv, tableBits, hashShortBytes)] = tableEntry{offset: i + e.cur} eLong := &e.bTable[hash7(cv, tableBits)] eLong.Cur, eLong.Prev = tableEntry{offset: i + e.cur}, eLong.Cur } @@ -292,7 +302,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) { t2 := tableEntry{offset: t.offset + 1} eLong := &e.bTable[hash7(cv, tableBits)] eLong2 := &e.bTable[hash7(cv>>8, tableBits)] - e.table[hash4x64(cv, tableBits)] = t + e.table[hashLen(cv, tableBits, hashShortBytes)] = t eLong.Cur, eLong.Prev = t, eLong.Cur eLong2.Cur, eLong2.Prev = t2, eLong2.Cur } diff --git a/vendor/github.com/klauspost/compress/flate/stateless.go b/vendor/github.com/klauspost/compress/flate/stateless.go index 93a1d1503..f3d4139ef 100644 --- a/vendor/github.com/klauspost/compress/flate/stateless.go +++ b/vendor/github.com/klauspost/compress/flate/stateless.go @@ -86,11 +86,19 @@ func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error { dict = dict[len(dict)-maxStatelessDict:] } + // For subsequent loops, keep shallow dict reference to avoid alloc+copy. + var inDict []byte + for len(in) > 0 { todo := in - if len(todo) > maxStatelessBlock-len(dict) { + if len(inDict) > 0 { + if len(todo) > maxStatelessBlock-maxStatelessDict { + todo = todo[:maxStatelessBlock-maxStatelessDict] + } + } else if len(todo) > maxStatelessBlock-len(dict) { todo = todo[:maxStatelessBlock-len(dict)] } + inOrg := in in = in[len(todo):] uncompressed := todo if len(dict) > 0 { @@ -102,7 +110,11 @@ func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error { todo = combined } // Compress - statelessEnc(&dst, todo, int16(len(dict))) + if len(inDict) == 0 { + statelessEnc(&dst, todo, int16(len(dict))) + } else { + statelessEnc(&dst, inDict[:maxStatelessDict+len(todo)], maxStatelessDict) + } isEof := eof && len(in) == 0 if dst.n == 0 { @@ -119,7 +131,8 @@ func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error { } if len(in) > 0 { // Retain a dict if we have more - dict = todo[len(todo)-maxStatelessDict:] + inDict = inOrg[len(uncompressed)-maxStatelessDict:] + dict = nil dst.Reset() } if bw.err != nil { diff --git a/vendor/github.com/klauspost/compress/s2/README.md b/vendor/github.com/klauspost/compress/s2/README.md index 73c0c462d..1d80c42a5 100644 --- a/vendor/github.com/klauspost/compress/s2/README.md +++ b/vendor/github.com/klauspost/compress/s2/README.md @@ -325,35 +325,35 @@ The content compressed in this mode is fully compatible with the standard decode Snappy vs S2 **compression** speed on 16 core (32 thread) computer, using all threads and a single thread (1 CPU): -| File | S2 speed | S2 Throughput | S2 % smaller | S2 "better" | "better" throughput | "better" % smaller | -|-----------------------------------------------------------------------------------------------------|----------|---------------|--------------|-------------|---------------------|--------------------| -| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z) | 12.70x | 10556 MB/s | 7.35% | 4.15x | 3455 MB/s | 12.79% | -| (1 CPU) | 1.14x | 948 MB/s | - | 0.42x | 349 MB/s | - | -| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 17.13x | 14484 MB/s | 31.60% | 10.09x | 8533 MB/s | 37.71% | -| (1 CPU) | 1.33x | 1127 MB/s | - | 0.70x | 589 MB/s | - | -| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) | 15.14x | 12000 MB/s | -5.79% | 6.59x | 5223 MB/s | 5.80% | -| (1 CPU) | 1.11x | 877 MB/s | - | 0.47x | 370 MB/s | - | -| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst) | 14.62x | 12116 MB/s | 15.90% | 5.35x | 4430 MB/s | 16.08% | -| (1 CPU) | 1.38x | 1146 MB/s | - | 0.38x | 312 MB/s | - | -| [adresser.json](https://files.klauspost.com/compress/adresser.json.zst) | 8.83x | 17579 MB/s | 43.86% | 6.54x | 13011 MB/s | 47.23% | -| (1 CPU) | 1.14x | 2259 MB/s | - | 0.74x | 1475 MB/s | - | -| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z) | 16.72x | 14019 MB/s | 24.02% | 10.11x | 8477 MB/s | 30.48% | -| (1 CPU) | 1.24x | 1043 MB/s | - | 0.70x | 586 MB/s | - | -| [10gb.tar](http://mattmahoney.net/dc/10gb.html) | 13.33x | 9254 MB/s | 1.84% | 6.75x | 4686 MB/s | 6.72% | -| (1 CPU) | 0.97x | 672 MB/s | - | 0.53x | 366 MB/s | - | -| sharnd.out.2gb | 2.11x | 12639 MB/s | 0.01% | 1.98x | 11833 MB/s | 0.01% | -| (1 CPU) | 0.93x | 5594 MB/s | - | 1.34x | 8030 MB/s | - | -| [enwik9](http://mattmahoney.net/dc/textdata.html) | 19.34x | 8220 MB/s | 3.98% | 7.87x | 3345 MB/s | 15.82% | -| (1 CPU) | 1.06x | 452 MB/s | - | 0.50x | 213 MB/s | - | -| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip) | 10.48x | 6124 MB/s | 5.67% | 3.76x | 2197 MB/s | 12.60% | -| (1 CPU) | 0.97x | 568 MB/s | - | 0.46x | 271 MB/s | - | -| [enwik10](https://encode.su/threads/3315-enwik10-benchmark-results) | 21.07x | 9020 MB/s | 6.36% | 6.91x | 2959 MB/s | 16.95% | -| (1 CPU) | 1.07x | 460 MB/s | - | 0.51x | 220 MB/s | - | +| File | S2 Speed | S2 Throughput | S2 % smaller | S2 "better" | "better" throughput | "better" % smaller | +|---------------------------------------------------------------------------------------------------------|----------|---------------|--------------|-------------|---------------------|--------------------| +| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z) | 16.33x | 10556 MB/s | 8.0% | 6.04x | 5252 MB/s | 14.7% | +| (1 CPU) | 1.08x | 940 MB/s | - | 0.46x | 400 MB/s | - | +| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 16.51x | 15224 MB/s | 31.70% | 9.47x | 8734 MB/s | 37.71% | +| (1 CPU) | 1.26x | 1157 MB/s | - | 0.60x | 556 MB/s | - | +| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) | 15.14x | 12598 MB/s | -5.76% | 6.23x | 5675 MB/s | 3.62% | +| (1 CPU) | 1.02x | 932 MB/s | - | 0.47x | 432 MB/s | - | +| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst) | 11.21x | 12116 MB/s | 15.95% | 3.24x | 3500 MB/s | 18.00% | +| (1 CPU) | 1.05x | 1135 MB/s | - | 0.27x | 292 MB/s | - | +| [apache.log](https://files.klauspost.com/compress/apache.log.zst) | 8.55x | 16673 MB/s | 20.54% | 5.85x | 11420 MB/s | 24.97% | +| (1 CPU) | 1.91x | 1771 MB/s | - | 0.53x | 1041 MB/s | - | +| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z) | 15.76x | 14357 MB/s | 24.01% | 8.67x | 7891 MB/s | 33.68% | +| (1 CPU) | 1.17x | 1064 MB/s | - | 0.65x | 595 MB/s | - | +| [10gb.tar](http://mattmahoney.net/dc/10gb.html) | 13.33x | 9835 MB/s | 2.34% | 6.85x | 4863 MB/s | 9.96% | +| (1 CPU) | 0.97x | 689 MB/s | - | 0.55x | 387 MB/s | - | +| sharnd.out.2gb | 9.11x | 13213 MB/s | 0.01% | 1.49x | 9184 MB/s | 0.01% | +| (1 CPU) | 0.88x | 5418 MB/s | - | 0.77x | 5417 MB/s | - | +| [sofia-air-quality-dataset csv](https://files.klauspost.com/compress/sofia-air-quality-dataset.tar.zst) | 22.00x | 11477 MB/s | 18.73% | 11.15x | 5817 MB/s | 27.88% | +| (1 CPU) | 1.23x | 642 MB/s | - | 0.71x | 642 MB/s | - | +| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip) | 11.23x | 6520 MB/s | 5.9% | 5.35x | 3109 MB/s | 15.88% | +| (1 CPU) | 1.05x | 607 MB/s | - | 0.52x | 304 MB/s | - | +| [enwik9](https://files.klauspost.com/compress/enwik9.zst) | 19.28x | 8440 MB/s | 4.04% | 9.31x | 4076 MB/s | 18.04% | +| (1 CPU) | 1.12x | 488 MB/s | - | 0.57x | 250 MB/s | - | ### Legend -* `S2 speed`: Speed of S2 compared to Snappy, using 16 cores and 1 core. -* `S2 throughput`: Throughput of S2 in MB/s. +* `S2 Speed`: Speed of S2 compared to Snappy, using 16 cores and 1 core. +* `S2 Throughput`: Throughput of S2 in MB/s. * `S2 % smaller`: How many percent of the Snappy output size is S2 better. * `S2 "better"`: Speed when enabling "better" compression mode in S2 compared to Snappy. * `"better" throughput`: Speed when enabling "better" compression mode in S2 compared to Snappy. @@ -361,7 +361,7 @@ Snappy vs S2 **compression** speed on 16 core (32 thread) computer, using all th There is a good speedup across the board when using a single thread and a significant speedup when using multiple threads. -Machine generated data gets by far the biggest compression boost, with size being being reduced by up to 45% of Snappy size. +Machine generated data gets by far the biggest compression boost, with size being reduced by up to 35% of Snappy size. The "better" compression mode sees a good improvement in all cases, but usually at a performance cost. @@ -404,15 +404,15 @@ The "better" compression mode will actively look for shorter matches, which is w Without assembly decompression is also very fast; single goroutine decompression speed. No assembly: | File | S2 Throughput | S2 throughput | -|--------------------------------|--------------|---------------| -| consensus.db.10gb.s2 | 1.84x | 2289.8 MB/s | -| 10gb.tar.s2 | 1.30x | 867.07 MB/s | -| rawstudio-mint14.tar.s2 | 1.66x | 1329.65 MB/s | -| github-june-2days-2019.json.s2 | 2.36x | 1831.59 MB/s | -| github-ranks-backup.bin.s2 | 1.73x | 1390.7 MB/s | -| enwik9.s2 | 1.67x | 681.53 MB/s | -| adresser.json.s2 | 3.41x | 4230.53 MB/s | -| silesia.tar.s2 | 1.52x | 811.58 | +|--------------------------------|---------------|---------------| +| consensus.db.10gb.s2 | 1.84x | 2289.8 MB/s | +| 10gb.tar.s2 | 1.30x | 867.07 MB/s | +| rawstudio-mint14.tar.s2 | 1.66x | 1329.65 MB/s | +| github-june-2days-2019.json.s2 | 2.36x | 1831.59 MB/s | +| github-ranks-backup.bin.s2 | 1.73x | 1390.7 MB/s | +| enwik9.s2 | 1.67x | 681.53 MB/s | +| adresser.json.s2 | 3.41x | 4230.53 MB/s | +| silesia.tar.s2 | 1.52x | 811.58 | Even though S2 typically compresses better than Snappy, decompression speed is always better. @@ -450,14 +450,14 @@ The most reliable is a wide dataset. For this we use [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z), 53927 files, total input size: 4,014,735,833 bytes. Single goroutine used. -| * | Input | Output | Reduction | MB/s | -|-------------------|------------|------------|-----------|--------| -| S2 | 4014735833 | 1059723369 | 73.60% | **934.34** | -| S2 Better | 4014735833 | 969670507 | 75.85% | 532.70 | -| S2 Best | 4014735833 | 906625668 | **77.85%** | 46.84 | -| Snappy | 4014735833 | 1128706759 | 71.89% | 762.59 | -| S2, Snappy Output | 4014735833 | 1093821420 | 72.75% | 908.60 | -| LZ4 | 4014735833 | 1079259294 | 73.12% | 526.94 | +| * | Input | Output | Reduction | MB/s | +|-------------------|------------|------------|------------|------------| +| S2 | 4014735833 | 1059723369 | 73.60% | **936.73** | +| S2 Better | 4014735833 | 961580539 | 76.05% | 451.10 | +| S2 Best | 4014735833 | 899182886 | **77.60%** | 46.84 | +| Snappy | 4014735833 | 1128706759 | 71.89% | 790.15 | +| S2, Snappy Output | 4014735833 | 1093823291 | 72.75% | 936.60 | +| LZ4 | 4014735833 | 1063768713 | 73.50% | 452.02 | S2 delivers both the best single threaded throughput with regular mode and the best compression rate with "best". "Better" mode provides the same compression speed as LZ4 with better compression ratio. @@ -489,42 +489,23 @@ AMD64 assembly is use for both S2 and Snappy. | Absolute Perf | Snappy size | S2 Size | Snappy Speed | S2 Speed | Snappy dec | S2 dec | |-----------------------|-------------|---------|--------------|-------------|-------------|-------------| -| html | 22843 | 21111 | 16246 MB/s | 17438 MB/s | 40972 MB/s | 49263 MB/s | -| urls.10K | 335492 | 287326 | 7943 MB/s | 9693 MB/s | 22523 MB/s | 26484 MB/s | -| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 273889 MB/s | 718321 MB/s | 827552 MB/s | -| fireworks.jpeg (200B) | 146 | 155 | 8869 MB/s | 17773 MB/s | 33691 MB/s | 52421 MB/s | -| paper-100k.pdf | 85304 | 84459 | 167546 MB/s | 101263 MB/s | 326905 MB/s | 291944 MB/s | -| html_x_4 | 92234 | 21113 | 15194 MB/s | 50670 MB/s | 30843 MB/s | 32217 MB/s | -| alice29.txt | 88034 | 85975 | 5936 MB/s | 6139 MB/s | 12882 MB/s | 20044 MB/s | -| asyoulik.txt | 77503 | 79650 | 5517 MB/s | 6366 MB/s | 12735 MB/s | 22806 MB/s | -| lcet10.txt | 234661 | 220670 | 6235 MB/s | 6067 MB/s | 14519 MB/s | 18697 MB/s | -| plrabn12.txt | 319267 | 317985 | 5159 MB/s | 5726 MB/s | 11923 MB/s | 19901 MB/s | -| geo.protodata | 23335 | 18690 | 21220 MB/s | 26529 MB/s | 56271 MB/s | 62540 MB/s | -| kppkn.gtb | 69526 | 65312 | 9732 MB/s | 8559 MB/s | 18491 MB/s | 18969 MB/s | -| alice29.txt (128B) | 80 | 82 | 6691 MB/s | 15489 MB/s | 31883 MB/s | 38874 MB/s | -| alice29.txt (1000B) | 774 | 774 | 12204 MB/s | 13000 MB/s | 48056 MB/s | 52341 MB/s | -| alice29.txt (10000B) | 6648 | 6933 | 10044 MB/s | 12806 MB/s | 32378 MB/s | 46322 MB/s | -| alice29.txt (20000B) | 12686 | 13574 | 7733 MB/s | 11210 MB/s | 30566 MB/s | 58969 MB/s | - - -| Relative Perf | Snappy size | S2 size improved | S2 Speed | S2 Dec Speed | -|-----------------------|-------------|------------------|----------|--------------| -| html | 22.31% | 7.58% | 1.07x | 1.20x | -| urls.10K | 47.78% | 14.36% | 1.22x | 1.18x | -| fireworks.jpeg | 99.95% | -0.05% | 0.78x | 1.15x | -| fireworks.jpeg (200B) | 73.00% | -6.16% | 2.00x | 1.56x | -| paper-100k.pdf | 83.30% | 0.99% | 0.60x | 0.89x | -| html_x_4 | 22.52% | 77.11% | 3.33x | 1.04x | -| alice29.txt | 57.88% | 2.34% | 1.03x | 1.56x | -| asyoulik.txt | 61.91% | -2.77% | 1.15x | 1.79x | -| lcet10.txt | 54.99% | 5.96% | 0.97x | 1.29x | -| plrabn12.txt | 66.26% | 0.40% | 1.11x | 1.67x | -| geo.protodata | 19.68% | 19.91% | 1.25x | 1.11x | -| kppkn.gtb | 37.72% | 6.06% | 0.88x | 1.03x | -| alice29.txt (128B) | 62.50% | -2.50% | 2.31x | 1.22x | -| alice29.txt (1000B) | 77.40% | 0.00% | 1.07x | 1.09x | -| alice29.txt (10000B) | 66.48% | -4.29% | 1.27x | 1.43x | -| alice29.txt (20000B) | 63.43% | -7.00% | 1.45x | 1.93x | +| html | 22843 | 20868 | 16246 MB/s | 18617 MB/s | 40972 MB/s | 49263 MB/s | +| urls.10K | 335492 | 286541 | 7943 MB/s | 10201 MB/s | 22523 MB/s | 26484 MB/s | +| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 303228 MB/s | 718321 MB/s | 827552 MB/s | +| fireworks.jpeg (200B) | 146 | 155 | 8869 MB/s | 20180 MB/s | 33691 MB/s | 52421 MB/s | +| paper-100k.pdf | 85304 | 84202 | 167546 MB/s | 112988 MB/s | 326905 MB/s | 291944 MB/s | +| html_x_4 | 92234 | 20870 | 15194 MB/s | 54457 MB/s | 30843 MB/s | 32217 MB/s | +| alice29.txt | 88034 | 85934 | 5936 MB/s | 6540 MB/s | 12882 MB/s | 20044 MB/s | +| asyoulik.txt | 77503 | 79575 | 5517 MB/s | 6657 MB/s | 12735 MB/s | 22806 MB/s | +| lcet10.txt | 234661 | 220383 | 6235 MB/s | 6303 MB/s | 14519 MB/s | 18697 MB/s | +| plrabn12.txt | 319267 | 318196 | 5159 MB/s | 6074 MB/s | 11923 MB/s | 19901 MB/s | +| geo.protodata | 23335 | 18606 | 21220 MB/s | 25432 MB/s | 56271 MB/s | 62540 MB/s | +| kppkn.gtb | 69526 | 65019 | 9732 MB/s | 8905 MB/s | 18491 MB/s | 18969 MB/s | +| alice29.txt (128B) | 80 | 82 | 6691 MB/s | 17179 MB/s | 31883 MB/s | 38874 MB/s | +| alice29.txt (1000B) | 774 | 774 | 12204 MB/s | 13273 MB/s | 48056 MB/s | 52341 MB/s | +| alice29.txt (10000B) | 6648 | 6933 | 10044 MB/s | 12824 MB/s | 32378 MB/s | 46322 MB/s | +| alice29.txt (20000B) | 12686 | 13516 | 7733 MB/s | 12160 MB/s | 30566 MB/s | 58969 MB/s | + Speed is generally at or above Snappy. Small blocks gets a significant speedup, although at the expense of size. @@ -543,42 +524,23 @@ So individual benchmarks should only be seen as a guideline and the overall pict | Absolute Perf | Snappy size | Better Size | Snappy Speed | Better Speed | Snappy dec | Better dec | |-----------------------|-------------|-------------|--------------|--------------|-------------|-------------| -| html | 22843 | 19833 | 16246 MB/s | 7731 MB/s | 40972 MB/s | 40292 MB/s | -| urls.10K | 335492 | 253529 | 7943 MB/s | 3980 MB/s | 22523 MB/s | 20981 MB/s | -| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 9760 MB/s | 718321 MB/s | 823698 MB/s | -| fireworks.jpeg (200B) | 146 | 142 | 8869 MB/s | 594 MB/s | 33691 MB/s | 30101 MB/s | -| paper-100k.pdf | 85304 | 82915 | 167546 MB/s | 7470 MB/s | 326905 MB/s | 198869 MB/s | -| html_x_4 | 92234 | 19841 | 15194 MB/s | 23403 MB/s | 30843 MB/s | 30937 MB/s | -| alice29.txt | 88034 | 73218 | 5936 MB/s | 2945 MB/s | 12882 MB/s | 16611 MB/s | -| asyoulik.txt | 77503 | 66844 | 5517 MB/s | 2739 MB/s | 12735 MB/s | 14975 MB/s | -| lcet10.txt | 234661 | 190589 | 6235 MB/s | 3099 MB/s | 14519 MB/s | 16634 MB/s | -| plrabn12.txt | 319267 | 270828 | 5159 MB/s | 2600 MB/s | 11923 MB/s | 13382 MB/s | -| geo.protodata | 23335 | 18278 | 21220 MB/s | 11208 MB/s | 56271 MB/s | 57961 MB/s | -| kppkn.gtb | 69526 | 61851 | 9732 MB/s | 4556 MB/s | 18491 MB/s | 16524 MB/s | -| alice29.txt (128B) | 80 | 81 | 6691 MB/s | 529 MB/s | 31883 MB/s | 34225 MB/s | -| alice29.txt (1000B) | 774 | 748 | 12204 MB/s | 1943 MB/s | 48056 MB/s | 42068 MB/s | -| alice29.txt (10000B) | 6648 | 6234 | 10044 MB/s | 2949 MB/s | 32378 MB/s | 28813 MB/s | -| alice29.txt (20000B) | 12686 | 11584 | 7733 MB/s | 2822 MB/s | 30566 MB/s | 27315 MB/s | - - -| Relative Perf | Snappy size | Better size | Better Speed | Better dec | -|-----------------------|-------------|-------------|--------------|------------| -| html | 22.31% | 13.18% | 0.48x | 0.98x | -| urls.10K | 47.78% | 24.43% | 0.50x | 0.93x | -| fireworks.jpeg | 99.95% | -0.05% | 0.03x | 1.15x | -| fireworks.jpeg (200B) | 73.00% | 2.74% | 0.07x | 0.89x | -| paper-100k.pdf | 83.30% | 2.80% | 0.07x | 0.61x | -| html_x_4 | 22.52% | 78.49% | 0.04x | 1.00x | -| alice29.txt | 57.88% | 16.83% | 1.54x | 1.29x | -| asyoulik.txt | 61.91% | 13.75% | 0.50x | 1.18x | -| lcet10.txt | 54.99% | 18.78% | 0.50x | 1.15x | -| plrabn12.txt | 66.26% | 15.17% | 0.50x | 1.12x | -| geo.protodata | 19.68% | 21.67% | 0.50x | 1.03x | -| kppkn.gtb | 37.72% | 11.04% | 0.53x | 0.89x | -| alice29.txt (128B) | 62.50% | -1.25% | 0.47x | 1.07x | -| alice29.txt (1000B) | 77.40% | 3.36% | 0.08x | 0.88x | -| alice29.txt (10000B) | 66.48% | 6.23% | 0.16x | 0.89x | -| alice29.txt (20000B) | 63.43% | 8.69% | 0.29x | 0.89x | +| html | 22843 | 18972 | 16246 MB/s | 8621 MB/s | 40972 MB/s | 40292 MB/s | +| urls.10K | 335492 | 248079 | 7943 MB/s | 5104 MB/s | 22523 MB/s | 20981 MB/s | +| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 84429 MB/s | 718321 MB/s | 823698 MB/s | +| fireworks.jpeg (200B) | 146 | 149 | 8869 MB/s | 7125 MB/s | 33691 MB/s | 30101 MB/s | +| paper-100k.pdf | 85304 | 82887 | 167546 MB/s | 11087 MB/s | 326905 MB/s | 198869 MB/s | +| html_x_4 | 92234 | 18982 | 15194 MB/s | 29316 MB/s | 30843 MB/s | 30937 MB/s | +| alice29.txt | 88034 | 71611 | 5936 MB/s | 3709 MB/s | 12882 MB/s | 16611 MB/s | +| asyoulik.txt | 77503 | 65941 | 5517 MB/s | 3380 MB/s | 12735 MB/s | 14975 MB/s | +| lcet10.txt | 234661 | 184939 | 6235 MB/s | 3537 MB/s | 14519 MB/s | 16634 MB/s | +| plrabn12.txt | 319267 | 264990 | 5159 MB/s | 2960 MB/s | 11923 MB/s | 13382 MB/s | +| geo.protodata | 23335 | 17689 | 21220 MB/s | 10859 MB/s | 56271 MB/s | 57961 MB/s | +| kppkn.gtb | 69526 | 55398 | 9732 MB/s | 5206 MB/s | 18491 MB/s | 16524 MB/s | +| alice29.txt (128B) | 80 | 78 | 6691 MB/s | 7422 MB/s | 31883 MB/s | 34225 MB/s | +| alice29.txt (1000B) | 774 | 746 | 12204 MB/s | 5734 MB/s | 48056 MB/s | 42068 MB/s | +| alice29.txt (10000B) | 6648 | 6218 | 10044 MB/s | 6055 MB/s | 32378 MB/s | 28813 MB/s | +| alice29.txt (20000B) | 12686 | 11492 | 7733 MB/s | 3143 MB/s | 30566 MB/s | 27315 MB/s | + Except for the mostly incompressible JPEG image compression is better and usually in the double digits in terms of percentage reduction over Snappy. @@ -605,29 +567,29 @@ Some examples compared on 16 core CPU, amd64 assembly used: ``` * enwik10 -Default... 10000000000 -> 4761467548 [47.61%]; 1.098s, 8685.6MB/s -Better... 10000000000 -> 4219438251 [42.19%]; 1.925s, 4954.2MB/s -Best... 10000000000 -> 3627364337 [36.27%]; 43.051s, 221.5MB/s +Default... 10000000000 -> 4759950115 [47.60%]; 1.03s, 9263.0MB/s +Better... 10000000000 -> 4084706676 [40.85%]; 2.16s, 4415.4MB/s +Best... 10000000000 -> 3615520079 [36.16%]; 42.259s, 225.7MB/s * github-june-2days-2019.json -Default... 6273951764 -> 1043196283 [16.63%]; 431ms, 13882.3MB/s -Better... 6273951764 -> 949146808 [15.13%]; 547ms, 10938.4MB/s -Best... 6273951764 -> 832855506 [13.27%]; 9.455s, 632.8MB/s +Default... 6273951764 -> 1041700255 [16.60%]; 431ms, 13882.3MB/s +Better... 6273951764 -> 945841238 [15.08%]; 547ms, 10938.4MB/s +Best... 6273951764 -> 826392576 [13.17%]; 9.455s, 632.8MB/s * nyc-taxi-data-10M.csv -Default... 3325605752 -> 1095998837 [32.96%]; 324ms, 9788.7MB/s -Better... 3325605752 -> 954776589 [28.71%]; 491ms, 6459.4MB/s -Best... 3325605752 -> 779098746 [23.43%]; 8.29s, 382.6MB/s +Default... 3325605752 -> 1093516949 [32.88%]; 324ms, 9788.7MB/s +Better... 3325605752 -> 885394158 [26.62%]; 491ms, 6459.4MB/s +Best... 3325605752 -> 773681257 [23.26%]; 8.29s, 412.0MB/s * 10gb.tar -Default... 10065157632 -> 5916578242 [58.78%]; 1.028s, 9337.4MB/s -Better... 10065157632 -> 5649207485 [56.13%]; 1.597s, 6010.6MB/s -Best... 10065157632 -> 5208719802 [51.75%]; 32.78s, 292.8MB/ +Default... 10065157632 -> 5915541066 [58.77%]; 1.028s, 9337.4MB/s +Better... 10065157632 -> 5453844650 [54.19%]; 1.597s, 4862.7MB/s +Best... 10065157632 -> 5192495021 [51.59%]; 32.78s, 308.2MB/ * consensus.db.10gb -Default... 10737418240 -> 4562648848 [42.49%]; 882ms, 11610.0MB/s -Better... 10737418240 -> 4542428129 [42.30%]; 1.533s, 6679.7MB/s -Best... 10737418240 -> 4244773384 [39.53%]; 42.96s, 238.4MB/s +Default... 10737418240 -> 4549762344 [42.37%]; 882ms, 12118.4MB/s +Better... 10737418240 -> 4438535064 [41.34%]; 1.533s, 3500.9MB/s +Best... 10737418240 -> 4210602774 [39.21%]; 42.96s, 254.4MB/s ``` Decompression speed should be around the same as using the 'better' compression mode. @@ -648,10 +610,10 @@ If you would like more control, you can use the s2 package as described below: Snappy compatible blocks can be generated with the S2 encoder. Compression and speed is typically a bit better `MaxEncodedLen` is also smaller for smaller memory usage. Replace -| Snappy | S2 replacement | -|----------------------------|-------------------------| -| snappy.Encode(...) | s2.EncodeSnappy(...) | -| snappy.MaxEncodedLen(...) | s2.MaxEncodedLen(...) | +| Snappy | S2 replacement | +|---------------------------|-----------------------| +| snappy.Encode(...) | s2.EncodeSnappy(...) | +| snappy.MaxEncodedLen(...) | s2.MaxEncodedLen(...) | `s2.EncodeSnappy` can be replaced with `s2.EncodeSnappyBetter` or `s2.EncodeSnappyBest` to get more efficiently compressed snappy compatible output. @@ -660,12 +622,12 @@ Compression and speed is typically a bit better `MaxEncodedLen` is also smaller Comparison of [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z), 53927 files, total input size: 4,014,735,833 bytes. amd64, single goroutine used: -| Encoder | Size | MB/s | Reduction | -|-----------------------|------------|------------|------------ -| snappy.Encode | 1128706759 | 725.59 | 71.89% | -| s2.EncodeSnappy | 1093823291 | **899.16** | 72.75% | -| s2.EncodeSnappyBetter | 1001158548 | 578.49 | 75.06% | -| s2.EncodeSnappyBest | 944507998 | 66.00 | **76.47%**| +| Encoder | Size | MB/s | Reduction | +|-----------------------|------------|------------|------------| +| snappy.Encode | 1128706759 | 725.59 | 71.89% | +| s2.EncodeSnappy | 1093823291 | **899.16** | 72.75% | +| s2.EncodeSnappyBetter | 1001158548 | 578.49 | 75.06% | +| s2.EncodeSnappyBest | 944507998 | 66.00 | **76.47%** | ## Streams @@ -835,6 +797,13 @@ This is done using the regular "Skip" function: This will ensure that we are at exactly the offset we want, and reading from `dec` will start at the requested offset. +# Compact storage + +For compact storage [RemoveIndexHeaders](https://pkg.go.dev/github.com/klauspost/compress/s2#RemoveIndexHeaders) can be used to remove any redundant info from +a serialized index. If you remove the header it must be restored before [Loading](https://pkg.go.dev/github.com/klauspost/compress/s2#Index.Load). + +This is expected to save 20 bytes. These can be restored using [RestoreIndexHeaders](https://pkg.go.dev/github.com/klauspost/compress/s2#RestoreIndexHeaders). This removes a layer of security, but is the most compact representation. Returns nil if headers contains errors. + ## Index Format: Each block is structured as a snappy skippable block, with the chunk ID 0x99. @@ -844,20 +813,20 @@ The block can be read from the front, but contains information so it can be read Numbers are stored as fixed size little endian values or [zigzag encoded](https://developers.google.com/protocol-buffers/docs/encoding#signed_integers) [base 128 varints](https://developers.google.com/protocol-buffers/docs/encoding), with un-encoded value length of 64 bits, unless other limits are specified. -| Content | Format | -|---------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------| -| ID, `[1]byte` | Always 0x99. | -| Data Length, `[3]byte` | 3 byte little-endian length of the chunk in bytes, following this. | -| Header `[6]byte` | Header, must be `[115, 50, 105, 100, 120, 0]` or in text: "s2idx\x00". | -| UncompressedSize, Varint | Total Uncompressed size. | -| CompressedSize, Varint | Total Compressed size if known. Should be -1 if unknown. | -| EstBlockSize, Varint | Block Size, used for guessing uncompressed offsets. Must be >= 0. | -| Entries, Varint | Number of Entries in index, must be < 65536 and >=0. | -| HasUncompressedOffsets `byte` | 0 if no uncompressed offsets are present, 1 if present. Other values are invalid. | -| UncompressedOffsets, [Entries]VarInt | Uncompressed offsets. See below how to decode. | -| CompressedOffsets, [Entries]VarInt | Compressed offsets. See below how to decode. | -| Block Size, `[4]byte` | Little Endian total encoded size (including header and trailer). Can be used for searching backwards to start of block. | -| Trailer `[6]byte` | Trailer, must be `[0, 120, 100, 105, 50, 115]` or in text: "\x00xdi2s". Can be used for identifying block from end of stream. | +| Content | Format | +|--------------------------------------|-------------------------------------------------------------------------------------------------------------------------------| +| ID, `[1]byte` | Always 0x99. | +| Data Length, `[3]byte` | 3 byte little-endian length of the chunk in bytes, following this. | +| Header `[6]byte` | Header, must be `[115, 50, 105, 100, 120, 0]` or in text: "s2idx\x00". | +| UncompressedSize, Varint | Total Uncompressed size. | +| CompressedSize, Varint | Total Compressed size if known. Should be -1 if unknown. | +| EstBlockSize, Varint | Block Size, used for guessing uncompressed offsets. Must be >= 0. | +| Entries, Varint | Number of Entries in index, must be < 65536 and >=0. | +| HasUncompressedOffsets `byte` | 0 if no uncompressed offsets are present, 1 if present. Other values are invalid. | +| UncompressedOffsets, [Entries]VarInt | Uncompressed offsets. See below how to decode. | +| CompressedOffsets, [Entries]VarInt | Compressed offsets. See below how to decode. | +| Block Size, `[4]byte` | Little Endian total encoded size (including header and trailer). Can be used for searching backwards to start of block. | +| Trailer `[6]byte` | Trailer, must be `[0, 120, 100, 105, 50, 115]` or in text: "\x00xdi2s". Can be used for identifying block from end of stream. | For regular streams the uncompressed offsets are fully predictable, so `HasUncompressedOffsets` allows to specify that compressed blocks all have @@ -929,6 +898,7 @@ To decode from any given uncompressed offset `(wantOffset)`: See [using indexes](https://github.com/klauspost/compress/tree/master/s2#using-indexes) for functions that perform the operations with a simpler interface. + # Format Extensions * Frame [Stream identifier](https://github.com/google/snappy/blob/master/framing_format.txt#L68) changed from `sNaPpY` to `S2sTwO`. @@ -951,10 +921,11 @@ The length is specified by reading the 3-bit length specified in the tag and dec | 7 | 65540 + read 3 bytes | This allows any repeat offset + length to be represented by 2 to 5 bytes. +It also allows to emit matches longer than 64 bytes with one copy + one repeat instead of several 64 byte copies. Lengths are stored as little endian values. -The first copy of a block cannot be a repeat offset and the offset is not carried across blocks in streams. +The first copy of a block cannot be a repeat offset and the offset is reset on every block in streams. Default streaming block size is 1MB. diff --git a/vendor/github.com/klauspost/compress/s2/decode.go b/vendor/github.com/klauspost/compress/s2/decode.go index 27c0f3c2c..00c5cc72c 100644 --- a/vendor/github.com/klauspost/compress/s2/decode.go +++ b/vendor/github.com/klauspost/compress/s2/decode.go @@ -952,7 +952,11 @@ func (r *Reader) ReadSeeker(random bool, index []byte) (*ReadSeeker, error) { // Seek allows seeking in compressed data. func (r *ReadSeeker) Seek(offset int64, whence int) (int64, error) { if r.err != nil { - return 0, r.err + if !errors.Is(r.err, io.EOF) { + return 0, r.err + } + // Reset on EOF + r.err = nil } if offset == 0 && whence == io.SeekCurrent { return r.blockStart + int64(r.i), nil diff --git a/vendor/github.com/klauspost/compress/s2/decode_other.go b/vendor/github.com/klauspost/compress/s2/decode_other.go index 1074ebd21..11300c3a8 100644 --- a/vendor/github.com/klauspost/compress/s2/decode_other.go +++ b/vendor/github.com/klauspost/compress/s2/decode_other.go @@ -28,6 +28,9 @@ func s2Decode(dst, src []byte) int { // As long as we can read at least 5 bytes... for s < len(src)-5 { + // Removing bounds checks is SLOWER, when if doing + // in := src[s:s+5] + // Checked on Go 1.18 switch src[s] & 0x03 { case tagLiteral: x := uint32(src[s] >> 2) @@ -38,14 +41,19 @@ func s2Decode(dst, src []byte) int { s += 2 x = uint32(src[s-1]) case x == 61: + in := src[s : s+3] + x = uint32(in[1]) | uint32(in[2])<<8 s += 3 - x = uint32(src[s-2]) | uint32(src[s-1])<<8 case x == 62: + in := src[s : s+4] + // Load as 32 bit and shift down. + x = uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24 + x >>= 8 s += 4 - x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 case x == 63: + in := src[s : s+5] + x = uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24 s += 5 - x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 } length = int(x) + 1 if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) { @@ -62,8 +70,8 @@ func s2Decode(dst, src []byte) int { case tagCopy1: s += 2 - length = int(src[s-2]) >> 2 & 0x7 toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) + length = int(src[s-2]) >> 2 & 0x7 if toffset == 0 { if debug { fmt.Print("(repeat) ") @@ -71,14 +79,16 @@ func s2Decode(dst, src []byte) int { // keep last offset switch length { case 5: + length = int(src[s]) + 4 s += 1 - length = int(uint32(src[s-1])) + 4 case 6: + in := src[s : s+2] + length = int(uint32(in[0])|(uint32(in[1])<<8)) + (1 << 8) s += 2 - length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8) case 7: + in := src[s : s+3] + length = int((uint32(in[2])<<16)|(uint32(in[1])<<8)|uint32(in[0])) + (1 << 16) s += 3 - length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16) default: // 0-> 4 } } else { @@ -86,14 +96,16 @@ func s2Decode(dst, src []byte) int { } length += 4 case tagCopy2: + in := src[s : s+3] + offset = int(uint32(in[1]) | uint32(in[2])<<8) + length = 1 + int(in[0])>>2 s += 3 - length = 1 + int(src[s-3])>>2 - offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) case tagCopy4: + in := src[s : s+5] + offset = int(uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24) + length = 1 + int(in[0])>>2 s += 5 - length = 1 + int(src[s-5])>>2 - offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) } if offset <= 0 || d < offset || length > len(dst)-d { diff --git a/vendor/github.com/klauspost/compress/s2/encode_all.go b/vendor/github.com/klauspost/compress/s2/encode_all.go index 8b16c38a6..54c71d3b5 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_all.go +++ b/vendor/github.com/klauspost/compress/s2/encode_all.go @@ -58,8 +58,9 @@ func encodeGo(dst, src []byte) []byte { // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize func encodeBlockGo(dst, src []byte) (d int) { // Initialize the hash table. const ( diff --git a/vendor/github.com/klauspost/compress/s2/encode_amd64.go b/vendor/github.com/klauspost/compress/s2/encode_amd64.go index e612225f4..6b93daa5a 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_amd64.go +++ b/vendor/github.com/klauspost/compress/s2/encode_amd64.go @@ -8,8 +8,9 @@ package s2 // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize func encodeBlock(dst, src []byte) (d int) { const ( // Use 12 bit table when less than... @@ -43,8 +44,9 @@ func encodeBlock(dst, src []byte) (d int) { // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize func encodeBlockBetter(dst, src []byte) (d int) { const ( // Use 12 bit table when less than... @@ -78,8 +80,9 @@ func encodeBlockBetter(dst, src []byte) (d int) { // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize func encodeBlockSnappy(dst, src []byte) (d int) { const ( // Use 12 bit table when less than... @@ -112,8 +115,9 @@ func encodeBlockSnappy(dst, src []byte) (d int) { // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize func encodeBlockBetterSnappy(dst, src []byte) (d int) { const ( // Use 12 bit table when less than... diff --git a/vendor/github.com/klauspost/compress/s2/encode_best.go b/vendor/github.com/klauspost/compress/s2/encode_best.go index 4bc80bc6a..1b7ea394f 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_best.go +++ b/vendor/github.com/klauspost/compress/s2/encode_best.go @@ -15,8 +15,9 @@ import ( // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize func encodeBlockBest(dst, src []byte) (d int) { // Initialize the hash tables. const ( @@ -176,14 +177,21 @@ func encodeBlockBest(dst, src []byte) (d int) { best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false)) } // Search for a match at best match end, see if that is better. - if sAt := best.s + best.length; sAt < sLimit { - sBack := best.s - backL := best.length + // Allow some bytes at the beginning to mismatch. + // Sweet spot is around 1-2 bytes, but depends on input. + // The skipped bytes are tested in Extend backwards, + // and still picked up as part of the match if they do. + const skipBeginning = 2 + const skipEnd = 1 + if sAt := best.s + best.length - skipEnd; sAt < sLimit { + + sBack := best.s + skipBeginning - skipEnd + backL := best.length - skipBeginning // Load initial values cv = load64(src, sBack) - // Search for mismatch + + // Grab candidates... next := lTable[hash8(load64(src, sAt), lTableBits)] - //next := sTable[hash4(load64(src, sAt), sTableBits)] if checkAt := getCur(next) - backL; checkAt > 0 { best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) @@ -191,6 +199,16 @@ func encodeBlockBest(dst, src []byte) (d int) { if checkAt := getPrev(next) - backL; checkAt > 0 { best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) } + // Disabled: Extremely small gain + if false { + next = sTable[hash4(load64(src, sAt), sTableBits)] + if checkAt := getCur(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) + } + if checkAt := getPrev(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) + } + } } } } @@ -288,8 +306,9 @@ emitRemainder: // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize func encodeBlockBestSnappy(dst, src []byte) (d int) { // Initialize the hash tables. const ( @@ -546,6 +565,7 @@ emitRemainder: // emitCopySize returns the size to encode the offset+length // // It assumes that: +// // 1 <= offset && offset <= math.MaxUint32 // 4 <= length && length <= 1 << 24 func emitCopySize(offset, length int) int { @@ -584,6 +604,7 @@ func emitCopySize(offset, length int) int { // emitCopyNoRepeatSize returns the size to encode the offset+length // // It assumes that: +// // 1 <= offset && offset <= math.MaxUint32 // 4 <= length && length <= 1 << 24 func emitCopyNoRepeatSize(offset, length int) int { diff --git a/vendor/github.com/klauspost/compress/s2/encode_better.go b/vendor/github.com/klauspost/compress/s2/encode_better.go index 943215b8a..3b66ba42b 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_better.go +++ b/vendor/github.com/klauspost/compress/s2/encode_better.go @@ -42,8 +42,9 @@ func hash8(u uint64, h uint8) uint32 { // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize func encodeBlockBetterGo(dst, src []byte) (d int) { // sLimit is when to stop looking for offset/length copies. The inputMargin // lets us use a fast path for emitLiteral in the main loop, while we are @@ -56,7 +57,7 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { // Initialize the hash tables. const ( // Long hash matches. - lTableBits = 16 + lTableBits = 17 maxLTableSize = 1 << lTableBits // Short hash matches. @@ -97,9 +98,26 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { lTable[hashL] = uint32(s) sTable[hashS] = uint32(s) + valLong := load64(src, candidateL) + valShort := load64(src, candidateS) + + // If long matches at least 8 bytes, use that. + if cv == valLong { + break + } + if cv == valShort { + candidateL = candidateS + break + } + // Check repeat at offset checkRep. const checkRep = 1 - if false && uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + // Minimum length of a repeat. Tested with various values. + // While 4-5 offers improvements in some, 6 reduces + // regressions significantly. + const wantRepeatBytes = 6 + const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep) + if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask { base := s + checkRep // Extend back for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { @@ -109,8 +127,8 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { d += emitLiteral(dst[d:], src[nextEmit:base]) // Extend forward - candidate := s - repeat + 4 + checkRep - s += 4 + checkRep + candidate := s - repeat + wantRepeatBytes + checkRep + s += wantRepeatBytes + checkRep for s < len(src) { if len(src)-s < 8 { if src[s] == src[candidate] { @@ -127,28 +145,40 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { s += 8 candidate += 8 } - if nextEmit > 0 { - // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. - d += emitRepeat(dst[d:], repeat, s-base) - } else { - // First match, cannot be repeat. - d += emitCopy(dst[d:], repeat, s-base) - } + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. + d += emitRepeat(dst[d:], repeat, s-base) nextEmit = s if s >= sLimit { goto emitRemainder } + // Index in-between + index0 := base + 1 + index1 := s - 2 + + cv = load64(src, s) + for index0 < index1 { + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint32(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 2 + index1 -= 2 + } cv = load64(src, s) continue } - if uint32(cv) == load32(src, candidateL) { + // Long likely matches 7, so take that. + if uint32(cv) == uint32(valLong) { break } // Check our short candidate - if uint32(cv) == load32(src, candidateS) { + if uint32(cv) == uint32(valShort) { // Try a long candidate at s+1 hashL = hash7(cv>>8, lTableBits) candidateL = int(lTable[hashL]) @@ -227,21 +257,29 @@ func encodeBlockBetterGo(dst, src []byte) (d int) { // Do we have space for more, if not bail. return 0 } - // Index match start+1 (long) and start+2 (short) + + // Index short & long index0 := base + 1 - // Index match end-2 (long) and end-1 (short) index1 := s - 2 cv0 := load64(src, index0) cv1 := load64(src, index1) - cv = load64(src, s) lTable[hash7(cv0, lTableBits)] = uint32(index0) - lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1) - lTable[hash7(cv1, lTableBits)] = uint32(index1) - lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1) sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) - sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // index every second long in between. + for index0 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0) + lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1) + index0 += 2 + index1 -= 2 + } } emitRemainder: @@ -260,8 +298,9 @@ emitRemainder: // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) && -// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize func encodeBlockBetterSnappyGo(dst, src []byte) (d int) { // sLimit is when to stop looking for offset/length copies. The inputMargin // lets us use a fast path for emitLiteral in the main loop, while we are @@ -402,21 +441,29 @@ func encodeBlockBetterSnappyGo(dst, src []byte) (d int) { // Do we have space for more, if not bail. return 0 } - // Index match start+1 (long) and start+2 (short) + + // Index short & long index0 := base + 1 - // Index match end-2 (long) and end-1 (short) index1 := s - 2 cv0 := load64(src, index0) cv1 := load64(src, index1) - cv = load64(src, s) lTable[hash7(cv0, lTableBits)] = uint32(index0) - lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1) - lTable[hash7(cv1, lTableBits)] = uint32(index1) - lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1) sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) - sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2) + + lTable[hash7(cv1, lTableBits)] = uint32(index1) sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // index every second long in between. + for index0 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0) + lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1) + index0 += 2 + index1 -= 2 + } } emitRemainder: diff --git a/vendor/github.com/klauspost/compress/s2/encode_go.go b/vendor/github.com/klauspost/compress/s2/encode_go.go index 94784b82a..db08fc355 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_go.go +++ b/vendor/github.com/klauspost/compress/s2/encode_go.go @@ -12,6 +12,7 @@ import ( // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) func encodeBlock(dst, src []byte) (d int) { if len(src) < minNonLiteralBlockSize { @@ -25,6 +26,7 @@ func encodeBlock(dst, src []byte) (d int) { // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) func encodeBlockBetter(dst, src []byte) (d int) { return encodeBlockBetterGo(dst, src) @@ -35,6 +37,7 @@ func encodeBlockBetter(dst, src []byte) (d int) { // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) func encodeBlockBetterSnappy(dst, src []byte) (d int) { return encodeBlockBetterSnappyGo(dst, src) @@ -45,6 +48,7 @@ func encodeBlockBetterSnappy(dst, src []byte) (d int) { // been written. // // It also assumes that: +// // len(dst) >= MaxEncodedLen(len(src)) func encodeBlockSnappy(dst, src []byte) (d int) { if len(src) < minNonLiteralBlockSize { @@ -56,6 +60,7 @@ func encodeBlockSnappy(dst, src []byte) (d int) { // emitLiteral writes a literal chunk and returns the number of bytes written. // // It assumes that: +// // dst is long enough to hold the encoded bytes // 0 <= len(lit) && len(lit) <= math.MaxUint32 func emitLiteral(dst, lit []byte) int { @@ -146,6 +151,7 @@ func emitRepeat(dst []byte, offset, length int) int { // emitCopy writes a copy chunk and returns the number of bytes written. // // It assumes that: +// // dst is long enough to hold the encoded bytes // 1 <= offset && offset <= math.MaxUint32 // 4 <= length && length <= 1 << 24 @@ -214,6 +220,7 @@ func emitCopy(dst []byte, offset, length int) int { // emitCopyNoRepeat writes a copy chunk and returns the number of bytes written. // // It assumes that: +// // dst is long enough to hold the encoded bytes // 1 <= offset && offset <= math.MaxUint32 // 4 <= length && length <= 1 << 24 @@ -273,8 +280,8 @@ func emitCopyNoRepeat(dst []byte, offset, length int) int { // matchLen returns how many bytes match in a and b // // It assumes that: -// len(a) <= len(b) // +// len(a) <= len(b) func matchLen(a []byte, b []byte) int { b = b[:len(a)] var checked int diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go index 88f27c099..7e00bac3e 100644 --- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go +++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go @@ -1,7 +1,6 @@ // Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. //go:build !appengine && !noasm && gc && !noasm -// +build !appengine,!noasm,gc,!noasm package s2 @@ -150,8 +149,9 @@ func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int // emitLiteral writes a literal chunk and returns the number of bytes written. // // It assumes that: -// dst is long enough to hold the encoded bytes with margin of 0 bytes -// 0 <= len(lit) && len(lit) <= math.MaxUint32 +// +// dst is long enough to hold the encoded bytes with margin of 0 bytes +// 0 <= len(lit) && len(lit) <= math.MaxUint32 // //go:noescape func emitLiteral(dst []byte, lit []byte) int @@ -165,9 +165,10 @@ func emitRepeat(dst []byte, offset int, length int) int // emitCopy writes a copy chunk and returns the number of bytes written. // // It assumes that: -// dst is long enough to hold the encoded bytes -// 1 <= offset && offset <= math.MaxUint32 -// 4 <= length && length <= 1 << 24 +// +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 // //go:noescape func emitCopy(dst []byte, offset int, length int) int @@ -175,9 +176,10 @@ func emitCopy(dst []byte, offset int, length int) int // emitCopyNoRepeat writes a copy chunk and returns the number of bytes written. // // It assumes that: -// dst is long enough to hold the encoded bytes -// 1 <= offset && offset <= math.MaxUint32 -// 4 <= length && length <= 1 << 24 +// +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 // //go:noescape func emitCopyNoRepeat(dst []byte, offset int, length int) int @@ -185,7 +187,8 @@ func emitCopyNoRepeat(dst []byte, offset int, length int) int // matchLen returns how many bytes match in a and b // // It assumes that: -// len(a) <= len(b) +// +// len(a) <= len(b) // //go:noescape func matchLen(a []byte, b []byte) int diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s index 36915d949..81a487d6d 100644 --- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s +++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s @@ -1,7 +1,6 @@ // Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. //go:build !appengine && !noasm && gc && !noasm -// +build !appengine,!noasm,gc,!noasm #include "textflag.h" @@ -5743,9 +5742,9 @@ emit_literal_done_emit_remainder_encodeBlockAsm8B: // func encodeBetterBlockAsm(dst []byte, src []byte) int // Requires: BMI, SSE2 -TEXT ·encodeBetterBlockAsm(SB), $327704-56 +TEXT ·encodeBetterBlockAsm(SB), $589848-56 MOVQ dst_base+0(FP), AX - MOVQ $0x00000a00, CX + MOVQ $0x00001200, CX LEAQ 24(SP), DX PXOR X0, X0 @@ -5797,27 +5796,37 @@ check_maxskip_cont_encodeBetterBlockAsm: MOVQ DI, R11 SHLQ $0x08, R10 IMULQ R9, R10 - SHRQ $0x30, R10 + SHRQ $0x2f, R10 SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x32, R11 MOVL 24(SP)(R10*4), SI - MOVL 262168(SP)(R11*4), R8 + MOVL 524312(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) - MOVL CX, 262168(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVL CX, 524312(SP)(R11*4) + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeBetterBlockAsm - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm + CMPQ R11, DI + JNE no_short_found_encodeBetterBlockAsm + MOVL R8, SI + JMP candidate_match_encodeBetterBlockAsm + +no_short_found_encodeBetterBlockAsm: + CMPL R10, DI + JEQ candidate_match_encodeBetterBlockAsm + CMPL R11, DI + JEQ candidateS_match_encodeBetterBlockAsm + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm candidateS_match_encodeBetterBlockAsm: SHRQ $0x08, DI MOVQ DI, R10 SHLQ $0x08, R10 IMULQ R9, R10 - SHRQ $0x30, R10 + SHRQ $0x2f, R10 MOVL 24(SP)(R10*4), SI INCL CX MOVL CX, 24(SP)(R10*4) @@ -6590,52 +6599,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm: match_nolit_dst_ok_encodeBetterBlockAsm: MOVQ $0x00cf1bbcdcbfa563, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x08, R10 IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 + SHRQ $0x2f, R10 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x32, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x32, R12 + SHLQ $0x08, R12 + IMULQ SI, R12 + SHRQ $0x2f, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x32, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 262168(SP)(R11*4) - MOVL R15, 262168(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 524312(SP)(R11*4) + MOVL R14, 524312(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeBetterBlockAsm: + CMPQ DI, R9 + JAE search_loop_encodeBetterBlockAsm + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x08, R8 + IMULQ SI, R8 + SHRQ $0x2f, R8 SHLQ $0x08, R10 IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 + SHRQ $0x2f, R10 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 262168(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeBetterBlockAsm emit_remainder_encodeBetterBlockAsm: MOVQ src_len+32(FP), CX @@ -6815,9 +6821,9 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm: // func encodeBetterBlockAsm4MB(dst []byte, src []byte) int // Requires: BMI, SSE2 -TEXT ·encodeBetterBlockAsm4MB(SB), $327704-56 +TEXT ·encodeBetterBlockAsm4MB(SB), $589848-56 MOVQ dst_base+0(FP), AX - MOVQ $0x00000a00, CX + MOVQ $0x00001200, CX LEAQ 24(SP), DX PXOR X0, X0 @@ -6869,27 +6875,37 @@ check_maxskip_cont_encodeBetterBlockAsm4MB: MOVQ DI, R11 SHLQ $0x08, R10 IMULQ R9, R10 - SHRQ $0x30, R10 + SHRQ $0x2f, R10 SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x32, R11 MOVL 24(SP)(R10*4), SI - MOVL 262168(SP)(R11*4), R8 + MOVL 524312(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) - MOVL CX, 262168(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVL CX, 524312(SP)(R11*4) + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeBetterBlockAsm4MB - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm4MB - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm4MB + CMPQ R11, DI + JNE no_short_found_encodeBetterBlockAsm4MB + MOVL R8, SI + JMP candidate_match_encodeBetterBlockAsm4MB + +no_short_found_encodeBetterBlockAsm4MB: + CMPL R10, DI + JEQ candidate_match_encodeBetterBlockAsm4MB + CMPL R11, DI + JEQ candidateS_match_encodeBetterBlockAsm4MB + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm4MB candidateS_match_encodeBetterBlockAsm4MB: SHRQ $0x08, DI MOVQ DI, R10 SHLQ $0x08, R10 IMULQ R9, R10 - SHRQ $0x30, R10 + SHRQ $0x2f, R10 MOVL 24(SP)(R10*4), SI INCL CX MOVL CX, 24(SP)(R10*4) @@ -7600,52 +7616,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB: match_nolit_dst_ok_encodeBetterBlockAsm4MB: MOVQ $0x00cf1bbcdcbfa563, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x08, R10 IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 + SHRQ $0x2f, R10 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x32, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x32, R12 + SHLQ $0x08, R12 + IMULQ SI, R12 + SHRQ $0x2f, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x32, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 262168(SP)(R11*4) - MOVL R15, 262168(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 524312(SP)(R11*4) + MOVL R14, 524312(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeBetterBlockAsm4MB: + CMPQ DI, R9 + JAE search_loop_encodeBetterBlockAsm4MB + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x08, R8 + IMULQ SI, R8 + SHRQ $0x2f, R8 SHLQ $0x08, R10 IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 + SHRQ $0x2f, R10 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 262168(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm4MB + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeBetterBlockAsm4MB emit_remainder_encodeBetterBlockAsm4MB: MOVQ src_len+32(FP), CX @@ -7871,12 +7884,22 @@ search_loop_encodeBetterBlockAsm12B: MOVL 65560(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) MOVL CX, 65560(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeBetterBlockAsm12B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm12B - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm12B + CMPQ R11, DI + JNE no_short_found_encodeBetterBlockAsm12B + MOVL R8, SI + JMP candidate_match_encodeBetterBlockAsm12B + +no_short_found_encodeBetterBlockAsm12B: + CMPL R10, DI + JEQ candidate_match_encodeBetterBlockAsm12B + CMPL R11, DI + JEQ candidateS_match_encodeBetterBlockAsm12B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm12B candidateS_match_encodeBetterBlockAsm12B: SHRQ $0x08, DI @@ -8447,52 +8470,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm12B: match_nolit_dst_ok_encodeBetterBlockAsm12B: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x32, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x32, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x34, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x34, R12 + SHLQ $0x10, R12 + IMULQ SI, R12 + SHRQ $0x32, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x34, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 65560(SP)(R11*4) - MOVL R15, 65560(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 65560(SP)(R11*4) + MOVL R14, 65560(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeBetterBlockAsm12B: + CMPQ DI, R9 + JAE search_loop_encodeBetterBlockAsm12B + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x32, R8 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x32, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x34, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x32, R13 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 65560(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm12B + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeBetterBlockAsm12B emit_remainder_encodeBetterBlockAsm12B: MOVQ src_len+32(FP), CX @@ -8707,12 +8727,22 @@ search_loop_encodeBetterBlockAsm10B: MOVL 16408(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) MOVL CX, 16408(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeBetterBlockAsm10B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm10B - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm10B + CMPQ R11, DI + JNE no_short_found_encodeBetterBlockAsm10B + MOVL R8, SI + JMP candidate_match_encodeBetterBlockAsm10B + +no_short_found_encodeBetterBlockAsm10B: + CMPL R10, DI + JEQ candidate_match_encodeBetterBlockAsm10B + CMPL R11, DI + JEQ candidateS_match_encodeBetterBlockAsm10B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm10B candidateS_match_encodeBetterBlockAsm10B: SHRQ $0x08, DI @@ -9283,52 +9313,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm10B: match_nolit_dst_ok_encodeBetterBlockAsm10B: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x34, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x34, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x36, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x36, R12 + SHLQ $0x10, R12 + IMULQ SI, R12 + SHRQ $0x34, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x36, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 16408(SP)(R11*4) - MOVL R15, 16408(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 16408(SP)(R11*4) + MOVL R14, 16408(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeBetterBlockAsm10B: + CMPQ DI, R9 + JAE search_loop_encodeBetterBlockAsm10B + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x34, R8 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x34, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x36, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x34, R13 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 16408(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm10B + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeBetterBlockAsm10B emit_remainder_encodeBetterBlockAsm10B: MOVQ src_len+32(FP), CX @@ -9543,12 +9570,22 @@ search_loop_encodeBetterBlockAsm8B: MOVL 4120(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) MOVL CX, 4120(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeBetterBlockAsm8B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeBetterBlockAsm8B - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm8B + CMPQ R11, DI + JNE no_short_found_encodeBetterBlockAsm8B + MOVL R8, SI + JMP candidate_match_encodeBetterBlockAsm8B + +no_short_found_encodeBetterBlockAsm8B: + CMPL R10, DI + JEQ candidate_match_encodeBetterBlockAsm8B + CMPL R11, DI + JEQ candidateS_match_encodeBetterBlockAsm8B + MOVL 20(SP), CX + JMP search_loop_encodeBetterBlockAsm8B candidateS_match_encodeBetterBlockAsm8B: SHRQ $0x08, DI @@ -10105,52 +10142,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm8B: match_nolit_dst_ok_encodeBetterBlockAsm8B: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x36, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x36, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x38, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x38, R12 + SHLQ $0x10, R12 + IMULQ SI, R12 + SHRQ $0x36, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x38, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 4120(SP)(R11*4) - MOVL R15, 4120(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 4120(SP)(R11*4) + MOVL R14, 4120(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeBetterBlockAsm8B: + CMPQ DI, R9 + JAE search_loop_encodeBetterBlockAsm8B + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x36, R8 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x36, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x38, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x36, R13 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 4120(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeBetterBlockAsm8B + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeBetterBlockAsm8B emit_remainder_encodeBetterBlockAsm8B: MOVQ src_len+32(FP), CX @@ -14287,9 +14321,9 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B: // func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int // Requires: BMI, SSE2 -TEXT ·encodeSnappyBetterBlockAsm(SB), $327704-56 +TEXT ·encodeSnappyBetterBlockAsm(SB), $589848-56 MOVQ dst_base+0(FP), AX - MOVQ $0x00000a00, CX + MOVQ $0x00001200, CX LEAQ 24(SP), DX PXOR X0, X0 @@ -14341,27 +14375,37 @@ check_maxskip_cont_encodeSnappyBetterBlockAsm: MOVQ DI, R11 SHLQ $0x08, R10 IMULQ R9, R10 - SHRQ $0x30, R10 + SHRQ $0x2f, R10 SHLQ $0x20, R11 IMULQ SI, R11 SHRQ $0x32, R11 MOVL 24(SP)(R10*4), SI - MOVL 262168(SP)(R11*4), R8 + MOVL 524312(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) - MOVL CX, 262168(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVL CX, 524312(SP)(R11*4) + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeSnappyBetterBlockAsm - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm + CMPQ R11, DI + JNE no_short_found_encodeSnappyBetterBlockAsm + MOVL R8, SI + JMP candidate_match_encodeSnappyBetterBlockAsm + +no_short_found_encodeSnappyBetterBlockAsm: + CMPL R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm + CMPL R11, DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm candidateS_match_encodeSnappyBetterBlockAsm: SHRQ $0x08, DI MOVQ DI, R10 SHLQ $0x08, R10 IMULQ R9, R10 - SHRQ $0x30, R10 + SHRQ $0x2f, R10 MOVL 24(SP)(R10*4), SI INCL CX MOVL CX, 24(SP)(R10*4) @@ -14685,52 +14729,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm: match_nolit_dst_ok_encodeSnappyBetterBlockAsm: MOVQ $0x00cf1bbcdcbfa563, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x08, R10 IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 + SHRQ $0x2f, R10 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x32, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x32, R12 + SHLQ $0x08, R12 + IMULQ SI, R12 + SHRQ $0x2f, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x32, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 262168(SP)(R11*4) - MOVL R15, 262168(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 524312(SP)(R11*4) + MOVL R14, 524312(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm: + CMPQ DI, R9 + JAE search_loop_encodeSnappyBetterBlockAsm + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x08, R8 + IMULQ SI, R8 + SHRQ $0x2f, R8 SHLQ $0x08, R10 IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 + SHRQ $0x2f, R10 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 262168(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeSnappyBetterBlockAsm emit_remainder_encodeSnappyBetterBlockAsm: MOVQ src_len+32(FP), CX @@ -14964,12 +15005,22 @@ search_loop_encodeSnappyBetterBlockAsm64K: MOVL 262168(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) MOVL CX, 262168(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeSnappyBetterBlockAsm64K - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm64K - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm64K + CMPQ R11, DI + JNE no_short_found_encodeSnappyBetterBlockAsm64K + MOVL R8, SI + JMP candidate_match_encodeSnappyBetterBlockAsm64K + +no_short_found_encodeSnappyBetterBlockAsm64K: + CMPL R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm64K + CMPL R11, DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm64K + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm64K candidateS_match_encodeSnappyBetterBlockAsm64K: SHRQ $0x08, DI @@ -15248,52 +15299,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K: match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K: MOVQ $0x00cf1bbcdcbfa563, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x08, R10 IMULQ SI, R10 SHRQ $0x30, R10 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x32, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x32, R12 + SHLQ $0x08, R12 + IMULQ SI, R12 + SHRQ $0x30, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x32, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 262168(SP)(R11*4) - MOVL R15, 262168(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 262168(SP)(R11*4) + MOVL R14, 262168(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm64K: + CMPQ DI, R9 + JAE search_loop_encodeSnappyBetterBlockAsm64K + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x08, R8 + IMULQ SI, R8 + SHRQ $0x30, R8 SHLQ $0x08, R10 IMULQ SI, R10 SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x08, R13 - IMULQ SI, R13 - SHRQ $0x30, R13 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 262168(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm64K + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeSnappyBetterBlockAsm64K emit_remainder_encodeSnappyBetterBlockAsm64K: MOVQ src_len+32(FP), CX @@ -15508,12 +15556,22 @@ search_loop_encodeSnappyBetterBlockAsm12B: MOVL 65560(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) MOVL CX, 65560(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeSnappyBetterBlockAsm12B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm12B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm12B + CMPQ R11, DI + JNE no_short_found_encodeSnappyBetterBlockAsm12B + MOVL R8, SI + JMP candidate_match_encodeSnappyBetterBlockAsm12B + +no_short_found_encodeSnappyBetterBlockAsm12B: + CMPL R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm12B + CMPL R11, DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm12B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm12B candidateS_match_encodeSnappyBetterBlockAsm12B: SHRQ $0x08, DI @@ -15792,52 +15850,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B: match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x32, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x32, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x34, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x34, R12 + SHLQ $0x10, R12 + IMULQ SI, R12 + SHRQ $0x32, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x34, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 65560(SP)(R11*4) - MOVL R15, 65560(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 65560(SP)(R11*4) + MOVL R14, 65560(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm12B: + CMPQ DI, R9 + JAE search_loop_encodeSnappyBetterBlockAsm12B + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x32, R8 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x32, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x34, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x32, R13 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 65560(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm12B + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeSnappyBetterBlockAsm12B emit_remainder_encodeSnappyBetterBlockAsm12B: MOVQ src_len+32(FP), CX @@ -16052,12 +16107,22 @@ search_loop_encodeSnappyBetterBlockAsm10B: MOVL 16408(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) MOVL CX, 16408(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeSnappyBetterBlockAsm10B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm10B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm10B + CMPQ R11, DI + JNE no_short_found_encodeSnappyBetterBlockAsm10B + MOVL R8, SI + JMP candidate_match_encodeSnappyBetterBlockAsm10B + +no_short_found_encodeSnappyBetterBlockAsm10B: + CMPL R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm10B + CMPL R11, DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm10B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm10B candidateS_match_encodeSnappyBetterBlockAsm10B: SHRQ $0x08, DI @@ -16336,52 +16401,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B: match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x34, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x34, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x36, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x36, R12 + SHLQ $0x10, R12 + IMULQ SI, R12 + SHRQ $0x34, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x36, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 16408(SP)(R11*4) - MOVL R15, 16408(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 16408(SP)(R11*4) + MOVL R14, 16408(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm10B: + CMPQ DI, R9 + JAE search_loop_encodeSnappyBetterBlockAsm10B + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x34, R8 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x34, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x36, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x34, R13 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 16408(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm10B + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeSnappyBetterBlockAsm10B emit_remainder_encodeSnappyBetterBlockAsm10B: MOVQ src_len+32(FP), CX @@ -16596,12 +16658,22 @@ search_loop_encodeSnappyBetterBlockAsm8B: MOVL 4120(SP)(R11*4), R8 MOVL CX, 24(SP)(R10*4) MOVL CX, 4120(SP)(R11*4) - CMPL (DX)(SI*1), DI + MOVQ (DX)(SI*1), R10 + MOVQ (DX)(R8*1), R11 + CMPQ R10, DI JEQ candidate_match_encodeSnappyBetterBlockAsm8B - CMPL (DX)(R8*1), DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm8B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm8B + CMPQ R11, DI + JNE no_short_found_encodeSnappyBetterBlockAsm8B + MOVL R8, SI + JMP candidate_match_encodeSnappyBetterBlockAsm8B + +no_short_found_encodeSnappyBetterBlockAsm8B: + CMPL R10, DI + JEQ candidate_match_encodeSnappyBetterBlockAsm8B + CMPL R11, DI + JEQ candidateS_match_encodeSnappyBetterBlockAsm8B + MOVL 20(SP), CX + JMP search_loop_encodeSnappyBetterBlockAsm8B candidateS_match_encodeSnappyBetterBlockAsm8B: SHRQ $0x08, DI @@ -16878,52 +16950,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B: match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B: MOVQ $0x0000cf1bbcdcbf9b, SI MOVQ $0x9e3779b1, R8 - INCL DI - MOVQ (DX)(DI*1), R9 - MOVQ R9, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R11 - MOVQ R11, R13 - SHRQ $0x10, R12 - LEAL 1(DI), R14 - LEAL 2(DI), R15 - MOVQ -2(DX)(CX*1), R9 + LEAQ 1(DI), DI + LEAQ -2(CX), R9 + MOVQ (DX)(DI*1), R10 + MOVQ 1(DX)(DI*1), R11 + MOVQ (DX)(R9*1), R12 + MOVQ 1(DX)(R9*1), R13 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x36, R10 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x36, R13 SHLQ $0x20, R11 IMULQ R8, R11 SHRQ $0x38, R11 - SHLQ $0x20, R12 - IMULQ R8, R12 - SHRQ $0x38, R12 + SHLQ $0x10, R12 + IMULQ SI, R12 + SHRQ $0x36, R12 + SHLQ $0x20, R13 + IMULQ R8, R13 + SHRQ $0x38, R13 + LEAQ 1(DI), R8 + LEAQ 1(R9), R14 MOVL DI, 24(SP)(R10*4) - MOVL R14, 24(SP)(R13*4) - MOVL R14, 4120(SP)(R11*4) - MOVL R15, 4120(SP)(R12*4) - MOVQ R9, R10 - MOVQ R9, R11 - SHRQ $0x08, R11 - MOVQ R11, R13 - LEAL -2(CX), R9 - LEAL -1(CX), DI + MOVL R9, 24(SP)(R12*4) + MOVL R8, 4120(SP)(R11*4) + MOVL R14, 4120(SP)(R13*4) + ADDQ $0x01, DI + SUBQ $0x01, R9 + +index_loop_encodeSnappyBetterBlockAsm8B: + CMPQ DI, R9 + JAE search_loop_encodeSnappyBetterBlockAsm8B + MOVQ (DX)(DI*1), R8 + MOVQ (DX)(R9*1), R10 + SHLQ $0x10, R8 + IMULQ SI, R8 + SHRQ $0x36, R8 SHLQ $0x10, R10 IMULQ SI, R10 SHRQ $0x36, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x38, R11 - SHLQ $0x10, R13 - IMULQ SI, R13 - SHRQ $0x36, R13 + MOVL DI, 24(SP)(R8*4) MOVL R9, 24(SP)(R10*4) - MOVL DI, 4120(SP)(R11*4) - MOVL DI, 24(SP)(R13*4) - JMP search_loop_encodeSnappyBetterBlockAsm8B + ADDQ $0x02, DI + SUBQ $0x02, R9 + JMP index_loop_encodeSnappyBetterBlockAsm8B emit_remainder_encodeSnappyBetterBlockAsm8B: MOVQ src_len+32(FP), CX |