diff options
| author | 2022-09-28 18:30:40 +0100 | |
|---|---|---|
| committer | 2022-09-28 18:30:40 +0100 | |
| commit | a156188b3eb5cb3da44aa1b7452265f5fa38a607 (patch) | |
| tree | 7097fa48d56fbabc7c2c8750b1f3bc9321d71c0f /vendor/github.com/klauspost/compress/s2 | |
| parent | [bugfix] Fix emphasis being added to emoji shortcodes with markdown parsing (... (diff) | |
| download | gotosocial-a156188b3eb5cb3da44aa1b7452265f5fa38a607.tar.xz | |
[chore] update dependencies, bump to Go 1.19.1 (#826)
* update dependencies, bump Go version to 1.19
* bump test image Go version
* update golangci-lint
* update gotosocial-drone-build
* sign
* linting, go fmt
* update swagger docs
* update swagger docs
* whitespace
* update contributing.md
* fuckin whoopsie doopsie
* linterino, linteroni
* fix followrequest test not starting processor
* fix other api/client tests not starting processor
* fix remaining tests where processor not started
* bump go-runners version
* don't check last-webfingered-at, processor may have updated this
* update swagger command
* update bun to latest version
* fix embed to work the same as before with new bun
Signed-off-by: kim <grufwub@gmail.com>
Co-authored-by: tsmethurst <tobi.smethurst@protonmail.com>
Diffstat (limited to 'vendor/github.com/klauspost/compress/s2')
8 files changed, 1946 insertions, 452 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/README.md b/vendor/github.com/klauspost/compress/s2/README.md index 119793456..73c0c462d 100644 --- a/vendor/github.com/klauspost/compress/s2/README.md +++ b/vendor/github.com/klauspost/compress/s2/README.md @@ -19,6 +19,7 @@ This is important, so you don't have to worry about spending CPU cycles on alrea  * Adjustable compression (3 levels)   * Concurrent stream compression  * Faster decompression, even for Snappy compatible content +* Concurrent Snappy/S2 stream decompression  * Ability to quickly skip forward in compressed stream  * Random seeking with indexes  * Compatible with reading Snappy compressed content @@ -415,6 +416,25 @@ Without assembly decompression is also very fast; single goroutine decompression  Even though S2 typically compresses better than Snappy, decompression speed is always better.  +### Concurrent Stream Decompression + +For full stream decompression S2 offers a [DecodeConcurrent](https://pkg.go.dev/github.com/klauspost/compress/s2#Reader.DecodeConcurrent)  +that will decode a full stream using multiple goroutines. 
+ +Example scaling, AMD Ryzen 3950X, 16 cores, decompression using `s2d -bench=3 <input>`, best of 3:  + +| Input                                     | `-cpu=1`   | `-cpu=2`   | `-cpu=4`   | `-cpu=8`   | `-cpu=16`   | +|-------------------------------------------|------------|------------|------------|------------|-------------| +| enwik10.snappy                            | 1098.6MB/s | 1819.8MB/s | 3625.6MB/s | 6910.6MB/s | 10818.2MB/s | +| enwik10.s2                                | 1303.5MB/s | 2606.1MB/s | 4847.9MB/s | 8878.4MB/s | 9592.1MB/s  | +| sofia-air-quality-dataset.tar.snappy      | 1302.0MB/s | 2165.0MB/s | 4244.5MB/s | 8241.0MB/s | 12920.5MB/s | +| sofia-air-quality-dataset.tar.s2          | 1399.2MB/s | 2463.2MB/s | 5196.5MB/s | 9639.8MB/s | 11439.5MB/s | +| sofia-air-quality-dataset.tar.s2 (no asm) | 837.5MB/s  | 1652.6MB/s | 3183.6MB/s | 5945.0MB/s | 9620.7MB/s  | + +Scaling can be expected to be pretty linear until memory bandwidth is saturated.  + +For now the DecodeConcurrent can only be used for full streams without seeking or combining with regular reads. +  ## Block compression @@ -873,7 +893,7 @@ for each entry {      }      // Uncompressed uses previous offset and adds EstBlockSize -    entry[entryNum].UncompressedOffset = entry[entryNum-1].UncompressedOffset + EstBlockSize +    entry[entryNum].UncompressedOffset = entry[entryNum-1].UncompressedOffset + EstBlockSize + uOff  } @@ -901,6 +921,14 @@ for each entry {  }  ``` +To decode from any given uncompressed offset `(wantOffset)`: + +* Iterate entries until `entry[n].UncompressedOffset > wantOffset`. +* Start decoding from `entry[n-1].CompressedOffset`. +* Discard `wantOffset - entry[n-1].UncompressedOffset` bytes from the decoded stream. + +See [using indexes](https://github.com/klauspost/compress/tree/master/s2#using-indexes) for functions that perform the operations with a simpler interface. 
+  # Format Extensions  * Frame [Stream identifier](https://github.com/google/snappy/blob/master/framing_format.txt#L68) changed from `sNaPpY` to `S2sTwO`. diff --git a/vendor/github.com/klauspost/compress/s2/decode.go b/vendor/github.com/klauspost/compress/s2/decode.go index 9e7fce885..27c0f3c2c 100644 --- a/vendor/github.com/klauspost/compress/s2/decode.go +++ b/vendor/github.com/klauspost/compress/s2/decode.go @@ -11,6 +11,9 @@ import (  	"fmt"  	"io"  	"io/ioutil" +	"math" +	"runtime" +	"sync"  )  var ( @@ -169,6 +172,14 @@ func ReaderSkippableCB(id uint8, fn func(r io.Reader) error) ReaderOption {  	}  } +// ReaderIgnoreCRC will make the reader skip CRC calculation and checks. +func ReaderIgnoreCRC() ReaderOption { +	return func(r *Reader) error { +		r.ignoreCRC = true +		return nil +	} +} +  // Reader is an io.Reader that can read Snappy-compressed bytes.  type Reader struct {  	r           io.Reader @@ -191,18 +202,19 @@ type Reader struct {  	paramsOK       bool  	snappyFrame    bool  	ignoreStreamID bool +	ignoreCRC      bool  }  // ensureBufferSize will ensure that the buffer can take at least n bytes.  // If false is returned the buffer exceeds maximum allowed size.  func (r *Reader) ensureBufferSize(n int) bool { -	if len(r.buf) >= n { -		return true -	}  	if n > r.maxBufSize {  		r.err = ErrCorrupt  		return false  	} +	if cap(r.buf) >= n { +		return true +	}  	// Realloc buffer.  	
r.buf = make([]byte, n)  	return true @@ -220,6 +232,7 @@ func (r *Reader) Reset(reader io.Reader) {  	r.err = nil  	r.i = 0  	r.j = 0 +	r.blockStart = 0  	r.readHeader = r.ignoreStreamID  } @@ -344,7 +357,7 @@ func (r *Reader) Read(p []byte) (int, error) {  				r.err = err  				return 0, r.err  			} -			if crc(r.decoded[:n]) != checksum { +			if !r.ignoreCRC && crc(r.decoded[:n]) != checksum {  				r.err = ErrCRC  				return 0, r.err  			} @@ -385,7 +398,7 @@ func (r *Reader) Read(p []byte) (int, error) {  			if !r.readFull(r.decoded[:n], false) {  				return 0, r.err  			} -			if crc(r.decoded[:n]) != checksum { +			if !r.ignoreCRC && crc(r.decoded[:n]) != checksum {  				r.err = ErrCRC  				return 0, r.err  			} @@ -435,6 +448,259 @@ func (r *Reader) Read(p []byte) (int, error) {  	}  } +// DecodeConcurrent will decode the full stream to w. +// This function should not be combined with reading, seeking or other operations. +// Up to 'concurrent' goroutines will be used. +// If <= 0, runtime.NumCPU will be used. +// On success the number of bytes decompressed and nil is returned. +// This is mainly intended for bigger streams. 
+func (r *Reader) DecodeConcurrent(w io.Writer, concurrent int) (written int64, err error) { +	if r.i > 0 || r.j > 0 || r.blockStart > 0 { +		return 0, errors.New("DecodeConcurrent called after ") +	} +	if concurrent <= 0 { +		concurrent = runtime.NumCPU() +	} + +	// Write to output +	var errMu sync.Mutex +	var aErr error +	setErr := func(e error) (ok bool) { +		errMu.Lock() +		defer errMu.Unlock() +		if e == nil { +			return aErr == nil +		} +		if aErr == nil { +			aErr = e +		} +		return false +	} +	hasErr := func() (ok bool) { +		errMu.Lock() +		v := aErr != nil +		errMu.Unlock() +		return v +	} + +	var aWritten int64 +	toRead := make(chan []byte, concurrent) +	writtenBlocks := make(chan []byte, concurrent) +	queue := make(chan chan []byte, concurrent) +	reUse := make(chan chan []byte, concurrent) +	for i := 0; i < concurrent; i++ { +		toRead <- make([]byte, 0, r.maxBufSize) +		writtenBlocks <- make([]byte, 0, r.maxBufSize) +		reUse <- make(chan []byte, 1) +	} +	// Writer +	var wg sync.WaitGroup +	wg.Add(1) +	go func() { +		defer wg.Done() +		for toWrite := range queue { +			entry := <-toWrite +			reUse <- toWrite +			if hasErr() { +				writtenBlocks <- entry +				continue +			} +			n, err := w.Write(entry) +			want := len(entry) +			writtenBlocks <- entry +			if err != nil { +				setErr(err) +				continue +			} +			if n != want { +				setErr(io.ErrShortWrite) +				continue +			} +			aWritten += int64(n) +		} +	}() + +	// Reader +	defer func() { +		close(queue) +		if r.err != nil { +			err = r.err +			setErr(r.err) +		} +		wg.Wait() +		if err == nil { +			err = aErr +		} +		written = aWritten +	}() + +	for !hasErr() { +		if !r.readFull(r.buf[:4], true) { +			if r.err == io.EOF { +				r.err = nil +			} +			return 0, r.err +		} +		chunkType := r.buf[0] +		if !r.readHeader { +			if chunkType != chunkTypeStreamIdentifier { +				r.err = ErrCorrupt +				return 0, r.err +			} +			r.readHeader = true +		} +		chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | 
int(r.buf[3])<<16 + +		// The chunk types are specified at +		// https://github.com/google/snappy/blob/master/framing_format.txt +		switch chunkType { +		case chunkTypeCompressedData: +			r.blockStart += int64(r.j) +			// Section 4.2. Compressed data (chunk type 0x00). +			if chunkLen < checksumSize { +				r.err = ErrCorrupt +				return 0, r.err +			} +			if chunkLen > r.maxBufSize { +				r.err = ErrCorrupt +				return 0, r.err +			} +			orgBuf := <-toRead +			buf := orgBuf[:chunkLen] + +			if !r.readFull(buf, false) { +				return 0, r.err +			} + +			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 +			buf = buf[checksumSize:] + +			n, err := DecodedLen(buf) +			if err != nil { +				r.err = err +				return 0, r.err +			} +			if r.snappyFrame && n > maxSnappyBlockSize { +				r.err = ErrCorrupt +				return 0, r.err +			} + +			if n > r.maxBlock { +				r.err = ErrCorrupt +				return 0, r.err +			} +			wg.Add(1) + +			decoded := <-writtenBlocks +			entry := <-reUse +			queue <- entry +			go func() { +				defer wg.Done() +				decoded = decoded[:n] +				_, err := Decode(decoded, buf) +				toRead <- orgBuf +				if err != nil { +					writtenBlocks <- decoded +					setErr(err) +					return +				} +				if !r.ignoreCRC && crc(decoded) != checksum { +					writtenBlocks <- decoded +					setErr(ErrCRC) +					return +				} +				entry <- decoded +			}() +			continue + +		case chunkTypeUncompressedData: + +			// Section 4.3. Uncompressed data (chunk type 0x01). +			if chunkLen < checksumSize { +				r.err = ErrCorrupt +				return 0, r.err +			} +			if chunkLen > r.maxBufSize { +				r.err = ErrCorrupt +				return 0, r.err +			} +			// Grab write buffer +			orgBuf := <-writtenBlocks +			buf := orgBuf[:checksumSize] +			if !r.readFull(buf, false) { +				return 0, r.err +			} +			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 +			// Read content. 
+			n := chunkLen - checksumSize + +			if r.snappyFrame && n > maxSnappyBlockSize { +				r.err = ErrCorrupt +				return 0, r.err +			} +			if n > r.maxBlock { +				r.err = ErrCorrupt +				return 0, r.err +			} +			// Read uncompressed +			buf = orgBuf[:n] +			if !r.readFull(buf, false) { +				return 0, r.err +			} + +			if !r.ignoreCRC && crc(buf) != checksum { +				r.err = ErrCRC +				return 0, r.err +			} +			entry := <-reUse +			queue <- entry +			entry <- buf +			continue + +		case chunkTypeStreamIdentifier: +			// Section 4.1. Stream identifier (chunk type 0xff). +			if chunkLen != len(magicBody) { +				r.err = ErrCorrupt +				return 0, r.err +			} +			if !r.readFull(r.buf[:len(magicBody)], false) { +				return 0, r.err +			} +			if string(r.buf[:len(magicBody)]) != magicBody { +				if string(r.buf[:len(magicBody)]) != magicBodySnappy { +					r.err = ErrCorrupt +					return 0, r.err +				} else { +					r.snappyFrame = true +				} +			} else { +				r.snappyFrame = false +			} +			continue +		} + +		if chunkType <= 0x7f { +			// Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f). +			// fmt.Printf("ERR chunktype: 0x%x\n", chunkType) +			r.err = ErrUnsupported +			return 0, r.err +		} +		// Section 4.4 Padding (chunk type 0xfe). +		// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd). +		if chunkLen > maxChunkSize { +			// fmt.Printf("ERR chunkLen: 0x%x\n", chunkLen) +			r.err = ErrUnsupported +			return 0, r.err +		} + +		// fmt.Printf("skippable: ID: 0x%x, len: 0x%x\n", chunkType, chunkLen) +		if !r.skippable(r.buf, chunkLen, false, chunkType) { +			return 0, r.err +		} +	} +	return 0, r.err +} +  // Skip will skip n bytes forward in the decompressed output.  // For larger skips this consumes less CPU and is faster than reading output and discarding it.  // CRC is not checked on skipped blocks. @@ -454,7 +720,11 @@ func (r *Reader) Skip(n int64) error {  			// decoded[i:j] contains decoded bytes that have not yet been passed on.  			
left := int64(r.j - r.i)  			if left >= n { -				r.i += int(n) +				tmp := int64(r.i) + n +				if tmp > math.MaxInt32 { +					return errors.New("s2: internal overflow in skip") +				} +				r.i = int(tmp)  				return nil  			}  			n -= int64(r.j - r.i) @@ -526,6 +796,7 @@ func (r *Reader) Skip(n int64) error {  			} else {  				// Skip block completely  				n -= int64(dLen) +				r.blockStart += int64(dLen)  				dLen = 0  			}  			r.i, r.j = 0, dLen @@ -656,6 +927,15 @@ func (r *Reader) ReadSeeker(random bool, index []byte) (*ReadSeeker, error) {  	err = r.index.LoadStream(rs)  	if err != nil {  		if err == ErrUnsupported { +			// If we don't require random seeking, reset input and return. +			if !random { +				_, err = rs.Seek(pos, io.SeekStart) +				if err != nil { +					return nil, ErrCantSeek{Reason: "resetting stream returned: " + err.Error()} +				} +				r.index = nil +				return &ReadSeeker{Reader: r}, nil +			}  			return nil, ErrCantSeek{Reason: "input stream does not contain an index"}  		}  		return nil, ErrCantSeek{Reason: "reading index returned: " + err.Error()} @@ -699,8 +979,16 @@ func (r *ReadSeeker) Seek(offset int64, whence int) (int64, error) {  	case io.SeekCurrent:  		offset += r.blockStart + int64(r.i)  	case io.SeekEnd: -		offset = -offset +		if offset > 0 { +			return 0, errors.New("seek after end of file") +		} +		offset = r.index.TotalUncompressed + offset +	} + +	if offset < 0 { +		return 0, errors.New("seek before start of file")  	} +  	c, u, err := r.index.Find(offset)  	if err != nil {  		return r.blockStart + int64(r.i), err @@ -712,10 +1000,6 @@ func (r *ReadSeeker) Seek(offset int64, whence int) (int64, error) {  		return 0, err  	} -	if offset < 0 { -		offset = r.index.TotalUncompressed + offset -	} -  	r.i = r.j // Remove rest of current block.  	
if u < offset {  		// Forward inside block diff --git a/vendor/github.com/klauspost/compress/s2/encode.go b/vendor/github.com/klauspost/compress/s2/encode.go index 59f992ca6..1aefabf31 100644 --- a/vendor/github.com/klauspost/compress/s2/encode.go +++ b/vendor/github.com/klauspost/compress/s2/encode.go @@ -1119,12 +1119,6 @@ func (w *Writer) closeIndex(idx bool) ([]byte, error) {  			if w.appendIndex {  				w.written += int64(len(index))  			} -			if true { -				_, err := w.index.Load(index) -				if err != nil { -					panic(err) -				} -			}  		}  		if w.pad > 1 { diff --git a/vendor/github.com/klauspost/compress/s2/encode_best.go b/vendor/github.com/klauspost/compress/s2/encode_best.go index 448034776..4bc80bc6a 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_best.go +++ b/vendor/github.com/klauspost/compress/s2/encode_best.go @@ -370,7 +370,7 @@ func encodeBlockBestSnappy(dst, src []byte) (d int) {  				}  				offset := m.s - m.offset -				return score - emitCopySize(offset, m.length) +				return score - emitCopyNoRepeatSize(offset, m.length)  			}  			matchAt := func(offset, s int, first uint32) match { @@ -567,6 +567,10 @@ func emitCopySize(offset, length int) int {  	// Offset no more than 2 bytes.  	if length > 64 { +		if offset < 2048 { +			// Emit 8 bytes, then rest as repeats... +			return 2 + emitRepeatSize(offset, length-8) +		}  		// Emit remaining as repeats, at least 4 bytes remain.  		return 3 + emitRepeatSize(offset, length-60)  	} @@ -577,6 +581,28 @@ func emitCopySize(offset, length int) int {  	return 2  } +// emitCopyNoRepeatSize returns the size to encode the offset+length +// +// It assumes that: +//	1 <= offset && offset <= math.MaxUint32 +//	4 <= length && length <= 1 << 24 +func emitCopyNoRepeatSize(offset, length int) int { +	if offset >= 65536 { +		return 5 + 5*(length/64) +	} + +	// Offset no more than 2 bytes. +	if length > 64 { +		// Emit remaining as repeats, at least 4 bytes remain. 
+		return 3 + 3*(length/60) +	} +	if length >= 12 || offset >= 2048 { +		return 3 +	} +	// Emit the remaining copy, encoded as 2 bytes. +	return 2 +} +  // emitRepeatSize returns the number of bytes required to encode a repeat.  // Length must be at least 4 and < 1<<24  func emitRepeatSize(offset, length int) int { diff --git a/vendor/github.com/klauspost/compress/s2/encode_go.go b/vendor/github.com/klauspost/compress/s2/encode_go.go index 43d43534e..94784b82a 100644 --- a/vendor/github.com/klauspost/compress/s2/encode_go.go +++ b/vendor/github.com/klauspost/compress/s2/encode_go.go @@ -180,14 +180,23 @@ func emitCopy(dst []byte, offset, length int) int {  	// Offset no more than 2 bytes.  	if length > 64 { -		// Emit a length 60 copy, encoded as 3 bytes. -		// Emit remaining as repeat value (minimum 4 bytes). -		dst[2] = uint8(offset >> 8) -		dst[1] = uint8(offset) -		dst[0] = 59<<2 | tagCopy2 -		length -= 60 +		off := 3 +		if offset < 2048 { +			// emit 8 bytes as tagCopy1, rest as repeats. +			dst[1] = uint8(offset) +			dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1 +			length -= 8 +			off = 2 +		} else { +			// Emit a length 60 copy, encoded as 3 bytes. +			// Emit remaining as repeat value (minimum 4 bytes). +			dst[2] = uint8(offset >> 8) +			dst[1] = uint8(offset) +			dst[0] = 59<<2 | tagCopy2 +			length -= 60 +		}  		// Emit remaining as repeats, at least 4 bytes remain. -		return 3 + emitRepeat(dst[3:], offset, length) +		return off + emitRepeat(dst[off:], offset, length)  	}  	if length >= 12 || offset >= 2048 {  		// Emit the remaining copy, encoded as 3 bytes. 
diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go index d9312e5b9..88f27c099 100644 --- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go +++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go @@ -5,6 +5,8 @@  package s2 +func _dummy_() +  // encodeBlockAsm encodes a non-empty src to a guaranteed-large-enough dst.  // Maximum input 4294967295 bytes.  // It assumes that the varint-encoded length of the decompressed bytes has already been written. diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s index 729dbf536..36915d949 100644 --- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s +++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s @@ -5,6 +5,15 @@  #include "textflag.h" +// func _dummy_() +TEXT ·_dummy_(SB), $0 +#ifdef GOAMD64_v4 +#ifndef GOAMD64_v3 +#define GOAMD64_v3 +#endif +#endif +	RET +  // func encodeBlockAsm(dst []byte, src []byte) int  // Requires: BMI, SSE2  TEXT ·encodeBlockAsm(SB), $65560-56 @@ -253,17 +262,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm:  #ifdef GOAMD64_v3  	TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R11, R11 @@ -476,6 +474,90 @@ four_bytes_remain_repeat_as_copy_encodeBlockAsm:  two_byte_offset_repeat_as_copy_encodeBlockAsm:  	CMPL SI, $0x40  	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm +	CMPL DI, $0x00000800 +	JAE  long_offset_short_repeat_as_copy_encodeBlockAsm +	MOVL $0x00000001, R8 +	LEAL 16(R8), R8 +	MOVB DI, 1(AX) +	MOVL DI, R9 +	SHRL $0x08, R9 +	SHLL $0x05, R9 +	ORL  R9, R8 +	MOVB R8, (AX) +	ADDQ $0x02, AX +	SUBL $0x08, SI + +	// emitRepeat +	LEAL -4(SI), SI +	JMP  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + 
+emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: +	MOVL SI, R8 +	LEAL -4(SI), SI +	CMPL R8, $0x08 +	JLE  repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b +	CMPL R8, $0x0c +	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b +	CMPL DI, $0x00000800 +	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: +	CMPL SI, $0x00000104 +	JLT  repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b +	CMPL SI, $0x00010100 +	JLT  repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b +	CMPL SI, $0x0100ffff +	JLT  repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b +	LEAL -16842747(SI), SI +	MOVW $0x001d, (AX) +	MOVW $0xfffb, 2(AX) +	MOVB $0xff, 4(AX) +	ADDQ $0x05, AX +	JMP  emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b + +repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: +	LEAL -65536(SI), SI +	MOVL SI, DI +	MOVW $0x001d, (AX) +	MOVW SI, 2(AX) +	SARL $0x10, DI +	MOVB DI, 4(AX) +	ADDQ $0x05, AX +	JMP  repeat_end_emit_encodeBlockAsm + +repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: +	LEAL -256(SI), SI +	MOVW $0x0019, (AX) +	MOVW SI, 2(AX) +	ADDQ $0x04, AX +	JMP  repeat_end_emit_encodeBlockAsm + +repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: +	LEAL -4(SI), SI +	MOVW $0x0015, (AX) +	MOVB SI, 2(AX) +	ADDQ $0x03, AX +	JMP  repeat_end_emit_encodeBlockAsm + +repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: +	SHLL $0x02, SI +	ORL  $0x01, SI +	MOVW SI, (AX) +	ADDQ $0x02, AX +	JMP  repeat_end_emit_encodeBlockAsm + +repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: +	XORQ R8, R8 +	LEAL 1(R8)(SI*4), SI +	MOVB DI, 1(AX) +	SARL $0x08, DI +	SHLL $0x05, DI +	ORL  DI, SI +	MOVB SI, (AX) +	ADDQ $0x02, AX +	JMP  repeat_end_emit_encodeBlockAsm + +long_offset_short_repeat_as_copy_encodeBlockAsm:  	MOVB $0xee, (AX)  	MOVW DI, 1(AX)  	LEAL 
-60(SI), SI @@ -791,17 +873,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm:  #ifdef GOAMD64_v3  	TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R9, R9 @@ -944,6 +1015,90 @@ four_bytes_remain_match_nolit_encodeBlockAsm:  two_byte_offset_match_nolit_encodeBlockAsm:  	CMPL R10, $0x40  	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm +	CMPL SI, $0x00000800 +	JAE  long_offset_short_match_nolit_encodeBlockAsm +	MOVL $0x00000001, DI +	LEAL 16(DI), DI +	MOVB SI, 1(AX) +	MOVL SI, R8 +	SHRL $0x08, R8 +	SHLL $0x05, R8 +	ORL  R8, DI +	MOVB DI, (AX) +	ADDQ $0x02, AX +	SUBL $0x08, R10 + +	// emitRepeat +	LEAL -4(R10), R10 +	JMP  cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b + +emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b: +	MOVL R10, DI +	LEAL -4(R10), R10 +	CMPL DI, $0x08 +	JLE  repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b +	CMPL DI, $0x0c +	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b +	CMPL SI, $0x00000800 +	JLT  repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b: +	CMPL R10, $0x00000104 +	JLT  repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b +	CMPL R10, $0x00010100 +	JLT  repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b +	CMPL R10, $0x0100ffff +	JLT  repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b +	LEAL -16842747(R10), R10 +	MOVW $0x001d, (AX) +	MOVW $0xfffb, 2(AX) +	MOVB $0xff, 4(AX) +	ADDQ $0x05, AX +	JMP  emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b + +repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b: +	LEAL -65536(R10), R10 +	MOVL R10, SI +	MOVW $0x001d, (AX) +	MOVW R10, 2(AX) +	SARL $0x10, SI +	MOVB SI, 4(AX) +	ADDQ $0x05, AX +	JMP  match_nolit_emitcopy_end_encodeBlockAsm + 
+repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b: +	LEAL -256(R10), R10 +	MOVW $0x0019, (AX) +	MOVW R10, 2(AX) +	ADDQ $0x04, AX +	JMP  match_nolit_emitcopy_end_encodeBlockAsm + +repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b: +	LEAL -4(R10), R10 +	MOVW $0x0015, (AX) +	MOVB R10, 2(AX) +	ADDQ $0x03, AX +	JMP  match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b: +	SHLL $0x02, R10 +	ORL  $0x01, R10 +	MOVW R10, (AX) +	ADDQ $0x02, AX +	JMP  match_nolit_emitcopy_end_encodeBlockAsm + +repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b: +	XORQ DI, DI +	LEAL 1(DI)(R10*4), R10 +	MOVB SI, 1(AX) +	SARL $0x08, SI +	SHLL $0x05, SI +	ORL  SI, R10 +	MOVB R10, (AX) +	ADDQ $0x02, AX +	JMP  match_nolit_emitcopy_end_encodeBlockAsm + +long_offset_short_match_nolit_encodeBlockAsm:  	MOVB $0xee, (AX)  	MOVW SI, 1(AX)  	LEAL -60(R10), R10 @@ -1134,17 +1289,36 @@ memmove_emit_remainder_encodeBlockAsm:  	MOVL SI, BX  	// genMemMoveShort +	CMPQ BX, $0x03 +	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2 +	JE   emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3  	CMPQ BX, $0x08 -	JLE  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8 +	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7  	CMPQ BX, $0x10  	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16  	CMPQ BX, $0x20  	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32  	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8: -	MOVQ (CX), SI -	MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2: +	MOVB (CX), SI +	MOVB -1(CX)(BX*1), CL +	MOVB SI, (AX) +	MOVB CL, -1(AX)(BX*1) +	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3: +	MOVW (CX), SI +	MOVB 2(CX), CL +	
MOVW SI, (AX) +	MOVB CL, 2(AX) +	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm + +emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7: +	MOVL (CX), SI +	MOVL -4(CX)(BX*1), CX +	MOVL SI, (AX) +	MOVL CX, -4(AX)(BX*1)  	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm  emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16: @@ -1466,17 +1640,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm4MB:  #ifdef GOAMD64_v3  	TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R11, R11 @@ -1667,6 +1830,77 @@ four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB:  two_byte_offset_repeat_as_copy_encodeBlockAsm4MB:  	CMPL SI, $0x40  	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB +	CMPL DI, $0x00000800 +	JAE  long_offset_short_repeat_as_copy_encodeBlockAsm4MB +	MOVL $0x00000001, R8 +	LEAL 16(R8), R8 +	MOVB DI, 1(AX) +	SHRL $0x08, DI +	SHLL $0x05, DI +	ORL  DI, R8 +	MOVB R8, (AX) +	ADDQ $0x02, AX +	SUBL $0x08, SI + +	// emitRepeat +	LEAL -4(SI), SI +	JMP  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b +	MOVL SI, R8 +	LEAL -4(SI), SI +	CMPL R8, $0x08 +	JLE  repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b +	CMPL R8, $0x0c +	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b +	CMPL DI, $0x00000800 +	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: +	CMPL SI, $0x00000104 +	JLT  repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b +	CMPL SI, $0x00010100 +	JLT  repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b +	LEAL -65536(SI), SI +	MOVL SI, DI +	MOVW $0x001d, (AX) +	MOVW SI, 2(AX) +	SARL $0x10, DI +	MOVB DI, 4(AX) +	ADDQ $0x05, AX +	JMP  repeat_end_emit_encodeBlockAsm4MB + 
+repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: +	LEAL -256(SI), SI +	MOVW $0x0019, (AX) +	MOVW SI, 2(AX) +	ADDQ $0x04, AX +	JMP  repeat_end_emit_encodeBlockAsm4MB + +repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: +	LEAL -4(SI), SI +	MOVW $0x0015, (AX) +	MOVB SI, 2(AX) +	ADDQ $0x03, AX +	JMP  repeat_end_emit_encodeBlockAsm4MB + +repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: +	SHLL $0x02, SI +	ORL  $0x01, SI +	MOVW SI, (AX) +	ADDQ $0x02, AX +	JMP  repeat_end_emit_encodeBlockAsm4MB + +repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: +	XORQ R8, R8 +	LEAL 1(R8)(SI*4), SI +	MOVB DI, 1(AX) +	SARL $0x08, DI +	SHLL $0x05, DI +	ORL  DI, SI +	MOVB SI, (AX) +	ADDQ $0x02, AX +	JMP  repeat_end_emit_encodeBlockAsm4MB + +long_offset_short_repeat_as_copy_encodeBlockAsm4MB:  	MOVB $0xee, (AX)  	MOVW DI, 1(AX)  	LEAL -60(SI), SI @@ -1963,17 +2197,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm4MB:  #ifdef GOAMD64_v3  	TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R9, R9 @@ -2105,6 +2328,77 @@ four_bytes_remain_match_nolit_encodeBlockAsm4MB:  two_byte_offset_match_nolit_encodeBlockAsm4MB:  	CMPL R10, $0x40  	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm4MB +	CMPL SI, $0x00000800 +	JAE  long_offset_short_match_nolit_encodeBlockAsm4MB +	MOVL $0x00000001, DI +	LEAL 16(DI), DI +	MOVB SI, 1(AX) +	SHRL $0x08, SI +	SHLL $0x05, SI +	ORL  SI, DI +	MOVB DI, (AX) +	ADDQ $0x02, AX +	SUBL $0x08, R10 + +	// emitRepeat +	LEAL -4(R10), R10 +	JMP  cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b +	MOVL R10, DI +	LEAL -4(R10), R10 +	CMPL DI, $0x08 +	JLE  repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b +	CMPL DI, $0x0c +	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b +	CMPL SI, $0x00000800 +	JLT  
repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: +	CMPL R10, $0x00000104 +	JLT  repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b +	CMPL R10, $0x00010100 +	JLT  repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b +	LEAL -65536(R10), R10 +	MOVL R10, SI +	MOVW $0x001d, (AX) +	MOVW R10, 2(AX) +	SARL $0x10, SI +	MOVB SI, 4(AX) +	ADDQ $0x05, AX +	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: +	LEAL -256(R10), R10 +	MOVW $0x0019, (AX) +	MOVW R10, 2(AX) +	ADDQ $0x04, AX +	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: +	LEAL -4(R10), R10 +	MOVW $0x0015, (AX) +	MOVB R10, 2(AX) +	ADDQ $0x03, AX +	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: +	SHLL $0x02, R10 +	ORL  $0x01, R10 +	MOVW R10, (AX) +	ADDQ $0x02, AX +	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: +	XORQ DI, DI +	LEAL 1(DI)(R10*4), R10 +	MOVB SI, 1(AX) +	SARL $0x08, SI +	SHLL $0x05, SI +	ORL  SI, R10 +	MOVB R10, (AX) +	ADDQ $0x02, AX +	JMP  match_nolit_emitcopy_end_encodeBlockAsm4MB + +long_offset_short_match_nolit_encodeBlockAsm4MB:  	MOVB $0xee, (AX)  	MOVW SI, 1(AX)  	LEAL -60(R10), R10 @@ -2276,17 +2570,36 @@ memmove_emit_remainder_encodeBlockAsm4MB:  	MOVL SI, BX  	// genMemMoveShort +	CMPQ BX, $0x03 +	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2 +	JE   emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3  	CMPQ BX, $0x08 -	JLE  emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8 +	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7  	CMPQ BX, $0x10  	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16  	CMPQ BX, $0x20  	JBE 
 emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32  	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8: -	MOVQ (CX), SI -	MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2: +	MOVB (CX), SI +	MOVB -1(CX)(BX*1), CL +	MOVB SI, (AX) +	MOVB CL, -1(AX)(BX*1) +	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3: +	MOVW (CX), SI +	MOVB 2(CX), CL +	MOVW SI, (AX) +	MOVB CL, 2(AX) +	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7: +	MOVL (CX), SI +	MOVL -4(CX)(BX*1), CX +	MOVL SI, (AX) +	MOVL CX, -4(AX)(BX*1)  	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm4MB  emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16: @@ -2597,17 +2910,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm12B:  #ifdef GOAMD64_v3  	TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R11, R11 @@ -2706,6 +3008,65 @@ repeat_as_copy_encodeBlockAsm12B:  two_byte_offset_repeat_as_copy_encodeBlockAsm12B:  	CMPL SI, $0x40  	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B +	CMPL DI, $0x00000800 +	JAE  long_offset_short_repeat_as_copy_encodeBlockAsm12B +	MOVL $0x00000001, R8 +	LEAL 16(R8), R8 +	MOVB DI, 1(AX) +	SHRL $0x08, DI +	SHLL $0x05, DI +	ORL  DI, R8 +	MOVB R8, (AX) +	ADDQ $0x02, AX +	SUBL $0x08, SI + +	// emitRepeat +	LEAL -4(SI), SI +	JMP  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b +	MOVL SI, R8 +	LEAL -4(SI), SI +	CMPL R8, $0x08 +	JLE  repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b +	CMPL R8, $0x0c +	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b +	
CMPL DI, $0x00000800 +	JLT  repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: +	CMPL SI, $0x00000104 +	JLT  repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b +	LEAL -256(SI), SI +	MOVW $0x0019, (AX) +	MOVW SI, 2(AX) +	ADDQ $0x04, AX +	JMP  repeat_end_emit_encodeBlockAsm12B + +repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: +	LEAL -4(SI), SI +	MOVW $0x0015, (AX) +	MOVB SI, 2(AX) +	ADDQ $0x03, AX +	JMP  repeat_end_emit_encodeBlockAsm12B + +repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: +	SHLL $0x02, SI +	ORL  $0x01, SI +	MOVW SI, (AX) +	ADDQ $0x02, AX +	JMP  repeat_end_emit_encodeBlockAsm12B + +repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: +	XORQ R8, R8 +	LEAL 1(R8)(SI*4), SI +	MOVB DI, 1(AX) +	SARL $0x08, DI +	SHLL $0x05, DI +	ORL  DI, SI +	MOVB SI, (AX) +	ADDQ $0x02, AX +	JMP  repeat_end_emit_encodeBlockAsm12B + +long_offset_short_repeat_as_copy_encodeBlockAsm12B:  	MOVB $0xee, (AX)  	MOVW DI, 1(AX)  	LEAL -60(SI), SI @@ -2979,17 +3340,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm12B:  #ifdef GOAMD64_v3  	TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R9, R9 @@ -3041,6 +3391,65 @@ match_nolit_end_encodeBlockAsm12B:  two_byte_offset_match_nolit_encodeBlockAsm12B:  	CMPL R10, $0x40  	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm12B +	CMPL SI, $0x00000800 +	JAE  long_offset_short_match_nolit_encodeBlockAsm12B +	MOVL $0x00000001, DI +	LEAL 16(DI), DI +	MOVB SI, 1(AX) +	SHRL $0x08, SI +	SHLL $0x05, SI +	ORL  SI, DI +	MOVB DI, (AX) +	ADDQ $0x02, AX +	SUBL $0x08, R10 + +	// emitRepeat +	LEAL -4(R10), R10 +	JMP  cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b +	MOVL R10, DI +	LEAL -4(R10), R10 +	CMPL DI, $0x08 +	JLE  
repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b +	CMPL DI, $0x0c +	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b +	CMPL SI, $0x00000800 +	JLT  repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: +	CMPL R10, $0x00000104 +	JLT  repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b +	LEAL -256(R10), R10 +	MOVW $0x0019, (AX) +	MOVW R10, 2(AX) +	ADDQ $0x04, AX +	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: +	LEAL -4(R10), R10 +	MOVW $0x0015, (AX) +	MOVB R10, 2(AX) +	ADDQ $0x03, AX +	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: +	SHLL $0x02, R10 +	ORL  $0x01, R10 +	MOVW R10, (AX) +	ADDQ $0x02, AX +	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B + +repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: +	XORQ DI, DI +	LEAL 1(DI)(R10*4), R10 +	MOVB SI, 1(AX) +	SARL $0x08, SI +	SHLL $0x05, SI +	ORL  SI, R10 +	MOVB R10, (AX) +	ADDQ $0x02, AX +	JMP  match_nolit_emitcopy_end_encodeBlockAsm12B + +long_offset_short_match_nolit_encodeBlockAsm12B:  	MOVB $0xee, (AX)  	MOVW SI, 1(AX)  	LEAL -60(R10), R10 @@ -3189,17 +3598,36 @@ memmove_emit_remainder_encodeBlockAsm12B:  	MOVL SI, BX  	// genMemMoveShort +	CMPQ BX, $0x03 +	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2 +	JE   emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3  	CMPQ BX, $0x08 -	JLE  emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8 +	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7  	CMPQ BX, $0x10  	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16  	CMPQ BX, $0x20  	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32  	JMP  
emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8: -	MOVQ (CX), SI -	MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2: +	MOVB (CX), SI +	MOVB -1(CX)(BX*1), CL +	MOVB SI, (AX) +	MOVB CL, -1(AX)(BX*1) +	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3: +	MOVW (CX), SI +	MOVB 2(CX), CL +	MOVW SI, (AX) +	MOVB CL, 2(AX) +	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7: +	MOVL (CX), SI +	MOVL -4(CX)(BX*1), CX +	MOVL SI, (AX) +	MOVL CX, -4(AX)(BX*1)  	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm12B  emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16: @@ -3510,17 +3938,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm10B:  #ifdef GOAMD64_v3  	TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R11, R11 @@ -3619,6 +4036,65 @@ repeat_as_copy_encodeBlockAsm10B:  two_byte_offset_repeat_as_copy_encodeBlockAsm10B:  	CMPL SI, $0x40  	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B +	CMPL DI, $0x00000800 +	JAE  long_offset_short_repeat_as_copy_encodeBlockAsm10B +	MOVL $0x00000001, R8 +	LEAL 16(R8), R8 +	MOVB DI, 1(AX) +	SHRL $0x08, DI +	SHLL $0x05, DI +	ORL  DI, R8 +	MOVB R8, (AX) +	ADDQ $0x02, AX +	SUBL $0x08, SI + +	// emitRepeat +	LEAL -4(SI), SI +	JMP  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b +	MOVL SI, R8 +	LEAL -4(SI), SI +	CMPL R8, $0x08 +	JLE  repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b +	CMPL R8, $0x0c +	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b +	CMPL DI, $0x00000800 +	JLT  
repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: +	CMPL SI, $0x00000104 +	JLT  repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b +	LEAL -256(SI), SI +	MOVW $0x0019, (AX) +	MOVW SI, 2(AX) +	ADDQ $0x04, AX +	JMP  repeat_end_emit_encodeBlockAsm10B + +repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: +	LEAL -4(SI), SI +	MOVW $0x0015, (AX) +	MOVB SI, 2(AX) +	ADDQ $0x03, AX +	JMP  repeat_end_emit_encodeBlockAsm10B + +repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: +	SHLL $0x02, SI +	ORL  $0x01, SI +	MOVW SI, (AX) +	ADDQ $0x02, AX +	JMP  repeat_end_emit_encodeBlockAsm10B + +repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: +	XORQ R8, R8 +	LEAL 1(R8)(SI*4), SI +	MOVB DI, 1(AX) +	SARL $0x08, DI +	SHLL $0x05, DI +	ORL  DI, SI +	MOVB SI, (AX) +	ADDQ $0x02, AX +	JMP  repeat_end_emit_encodeBlockAsm10B + +long_offset_short_repeat_as_copy_encodeBlockAsm10B:  	MOVB $0xee, (AX)  	MOVW DI, 1(AX)  	LEAL -60(SI), SI @@ -3892,17 +4368,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm10B:  #ifdef GOAMD64_v3  	TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R9, R9 @@ -3954,6 +4419,65 @@ match_nolit_end_encodeBlockAsm10B:  two_byte_offset_match_nolit_encodeBlockAsm10B:  	CMPL R10, $0x40  	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm10B +	CMPL SI, $0x00000800 +	JAE  long_offset_short_match_nolit_encodeBlockAsm10B +	MOVL $0x00000001, DI +	LEAL 16(DI), DI +	MOVB SI, 1(AX) +	SHRL $0x08, SI +	SHLL $0x05, SI +	ORL  SI, DI +	MOVB DI, (AX) +	ADDQ $0x02, AX +	SUBL $0x08, R10 + +	// emitRepeat +	LEAL -4(R10), R10 +	JMP  cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b +	MOVL R10, DI +	LEAL -4(R10), R10 +	CMPL DI, $0x08 +	JLE  
repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b +	CMPL DI, $0x0c +	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b +	CMPL SI, $0x00000800 +	JLT  repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: +	CMPL R10, $0x00000104 +	JLT  repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b +	LEAL -256(R10), R10 +	MOVW $0x0019, (AX) +	MOVW R10, 2(AX) +	ADDQ $0x04, AX +	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: +	LEAL -4(R10), R10 +	MOVW $0x0015, (AX) +	MOVB R10, 2(AX) +	ADDQ $0x03, AX +	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: +	SHLL $0x02, R10 +	ORL  $0x01, R10 +	MOVW R10, (AX) +	ADDQ $0x02, AX +	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B + +repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: +	XORQ DI, DI +	LEAL 1(DI)(R10*4), R10 +	MOVB SI, 1(AX) +	SARL $0x08, SI +	SHLL $0x05, SI +	ORL  SI, R10 +	MOVB R10, (AX) +	ADDQ $0x02, AX +	JMP  match_nolit_emitcopy_end_encodeBlockAsm10B + +long_offset_short_match_nolit_encodeBlockAsm10B:  	MOVB $0xee, (AX)  	MOVW SI, 1(AX)  	LEAL -60(R10), R10 @@ -4102,17 +4626,36 @@ memmove_emit_remainder_encodeBlockAsm10B:  	MOVL SI, BX  	// genMemMoveShort +	CMPQ BX, $0x03 +	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2 +	JE   emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3  	CMPQ BX, $0x08 -	JLE  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8 +	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7  	CMPQ BX, $0x10  	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16  	CMPQ BX, $0x20  	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32  	JMP  
emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8: -	MOVQ (CX), SI -	MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2: +	MOVB (CX), SI +	MOVB -1(CX)(BX*1), CL +	MOVB SI, (AX) +	MOVB CL, -1(AX)(BX*1) +	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3: +	MOVW (CX), SI +	MOVB 2(CX), CL +	MOVW SI, (AX) +	MOVB CL, 2(AX) +	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7: +	MOVL (CX), SI +	MOVL -4(CX)(BX*1), CX +	MOVL SI, (AX) +	MOVL CX, -4(AX)(BX*1)  	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm10B  emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16: @@ -4423,17 +4966,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm8B:  #ifdef GOAMD64_v3  	TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R11, R11 @@ -4528,6 +5060,61 @@ repeat_as_copy_encodeBlockAsm8B:  two_byte_offset_repeat_as_copy_encodeBlockAsm8B:  	CMPL SI, $0x40  	JLE  two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B +	CMPL DI, $0x00000800 +	JAE  long_offset_short_repeat_as_copy_encodeBlockAsm8B +	MOVL $0x00000001, R8 +	LEAL 16(R8), R8 +	MOVB DI, 1(AX) +	SHRL $0x08, DI +	SHLL $0x05, DI +	ORL  DI, R8 +	MOVB R8, (AX) +	ADDQ $0x02, AX +	SUBL $0x08, SI + +	// emitRepeat +	LEAL -4(SI), SI +	JMP  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b +	MOVL SI, DI +	LEAL -4(SI), SI +	CMPL DI, $0x08 +	JLE  repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b +	CMPL DI, $0x0c +	JGE  cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b + +cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: +	CMPL SI, 
$0x00000104 +	JLT  repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b +	LEAL -256(SI), SI +	MOVW $0x0019, (AX) +	MOVW SI, 2(AX) +	ADDQ $0x04, AX +	JMP  repeat_end_emit_encodeBlockAsm8B + +repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: +	LEAL -4(SI), SI +	MOVW $0x0015, (AX) +	MOVB SI, 2(AX) +	ADDQ $0x03, AX +	JMP  repeat_end_emit_encodeBlockAsm8B + +repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: +	SHLL $0x02, SI +	ORL  $0x01, SI +	MOVW SI, (AX) +	ADDQ $0x02, AX +	JMP  repeat_end_emit_encodeBlockAsm8B +	XORQ R8, R8 +	LEAL 1(R8)(SI*4), SI +	MOVB DI, 1(AX) +	SARL $0x08, DI +	SHLL $0x05, DI +	ORL  DI, SI +	MOVB SI, (AX) +	ADDQ $0x02, AX +	JMP  repeat_end_emit_encodeBlockAsm8B + +long_offset_short_repeat_as_copy_encodeBlockAsm8B:  	MOVB $0xee, (AX)  	MOVW DI, 1(AX)  	LEAL -60(SI), SI @@ -4795,17 +5382,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm8B:  #ifdef GOAMD64_v3  	TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R9, R9 @@ -4857,6 +5433,61 @@ match_nolit_end_encodeBlockAsm8B:  two_byte_offset_match_nolit_encodeBlockAsm8B:  	CMPL R10, $0x40  	JLE  two_byte_offset_short_match_nolit_encodeBlockAsm8B +	CMPL SI, $0x00000800 +	JAE  long_offset_short_match_nolit_encodeBlockAsm8B +	MOVL $0x00000001, DI +	LEAL 16(DI), DI +	MOVB SI, 1(AX) +	SHRL $0x08, SI +	SHLL $0x05, SI +	ORL  SI, DI +	MOVB DI, (AX) +	ADDQ $0x02, AX +	SUBL $0x08, R10 + +	// emitRepeat +	LEAL -4(R10), R10 +	JMP  cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b +	MOVL R10, SI +	LEAL -4(R10), R10 +	CMPL SI, $0x08 +	JLE  repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b +	CMPL SI, $0x0c +	JGE  cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: +	CMPL R10, $0x00000104 +	JLT  
repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b +	LEAL -256(R10), R10 +	MOVW $0x0019, (AX) +	MOVW R10, 2(AX) +	ADDQ $0x04, AX +	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B + +repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: +	LEAL -4(R10), R10 +	MOVW $0x0015, (AX) +	MOVB R10, 2(AX) +	ADDQ $0x03, AX +	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B + +repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: +	SHLL $0x02, R10 +	ORL  $0x01, R10 +	MOVW R10, (AX) +	ADDQ $0x02, AX +	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B +	XORQ DI, DI +	LEAL 1(DI)(R10*4), R10 +	MOVB SI, 1(AX) +	SARL $0x08, SI +	SHLL $0x05, SI +	ORL  SI, R10 +	MOVB R10, (AX) +	ADDQ $0x02, AX +	JMP  match_nolit_emitcopy_end_encodeBlockAsm8B + +long_offset_short_match_nolit_encodeBlockAsm8B:  	MOVB $0xee, (AX)  	MOVW SI, 1(AX)  	LEAL -60(R10), R10 @@ -4999,17 +5630,36 @@ memmove_emit_remainder_encodeBlockAsm8B:  	MOVL SI, BX  	// genMemMoveShort +	CMPQ BX, $0x03 +	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2 +	JE   emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3  	CMPQ BX, $0x08 -	JLE  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8 +	JB   emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7  	CMPQ BX, $0x10  	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16  	CMPQ BX, $0x20  	JBE  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32  	JMP  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8: -	MOVQ (CX), SI -	MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2: +	MOVB (CX), SI +	MOVB -1(CX)(BX*1), CL +	MOVB SI, (AX) +	MOVB CL, -1(AX)(BX*1) +	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3: +	MOVW (CX), SI +	MOVB 2(CX), CL +	MOVW SI, (AX) +	MOVB CL, 2(AX) +	
JMP  memmove_end_copy_emit_remainder_encodeBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7: +	MOVL (CX), SI +	MOVL -4(CX)(BX*1), CX +	MOVL SI, (AX) +	MOVL CX, -4(AX)(BX*1)  	JMP  memmove_end_copy_emit_remainder_encodeBlockAsm8B  emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16: @@ -5225,17 +5875,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm:  #ifdef GOAMD64_v3  	TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R11, R11 @@ -5541,6 +6180,90 @@ four_bytes_remain_match_nolit_encodeBetterBlockAsm:  two_byte_offset_match_nolit_encodeBetterBlockAsm:  	CMPL R12, $0x40  	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm +	CMPL R8, $0x00000800 +	JAE  long_offset_short_match_nolit_encodeBetterBlockAsm +	MOVL $0x00000001, SI +	LEAL 16(SI), SI +	MOVB R8, 1(AX) +	MOVL R8, R9 +	SHRL $0x08, R9 +	SHLL $0x05, R9 +	ORL  R9, SI +	MOVB SI, (AX) +	ADDQ $0x02, AX +	SUBL $0x08, R12 + +	// emitRepeat +	LEAL -4(R12), R12 +	JMP  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + +emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: +	MOVL R12, SI +	LEAL -4(R12), R12 +	CMPL SI, $0x08 +	JLE  repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b +	CMPL SI, $0x0c +	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b +	CMPL R8, $0x00000800 +	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: +	CMPL R12, $0x00000104 +	JLT  repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b +	CMPL R12, $0x00010100 +	JLT  repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b +	CMPL R12, $0x0100ffff +	JLT  repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b +	LEAL -16842747(R12), R12 +	
MOVW $0x001d, (AX) +	MOVW $0xfffb, 2(AX) +	MOVB $0xff, 4(AX) +	ADDQ $0x05, AX +	JMP  emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b + +repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: +	LEAL -65536(R12), R12 +	MOVL R12, R8 +	MOVW $0x001d, (AX) +	MOVW R12, 2(AX) +	SARL $0x10, R8 +	MOVB R8, 4(AX) +	ADDQ $0x05, AX +	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: +	LEAL -256(R12), R12 +	MOVW $0x0019, (AX) +	MOVW R12, 2(AX) +	ADDQ $0x04, AX +	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: +	LEAL -4(R12), R12 +	MOVW $0x0015, (AX) +	MOVB R12, 2(AX) +	ADDQ $0x03, AX +	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: +	SHLL $0x02, R12 +	ORL  $0x01, R12 +	MOVW R12, (AX) +	ADDQ $0x02, AX +	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm + +repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: +	XORQ SI, SI +	LEAL 1(SI)(R12*4), R12 +	MOVB R8, 1(AX) +	SARL $0x08, R8 +	SHLL $0x05, R8 +	ORL  R8, R12 +	MOVB R12, (AX) +	ADDQ $0x02, AX +	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm + +long_offset_short_match_nolit_encodeBetterBlockAsm:  	MOVB $0xee, (AX)  	MOVW R8, 1(AX)  	LEAL -60(R12), R12 @@ -5979,8 +6702,9 @@ memmove_emit_remainder_encodeBetterBlockAsm:  	MOVL SI, BX  	// genMemMoveShort -	CMPQ BX, $0x04 -	JLE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4 +	CMPQ BX, $0x03 +	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2 +	JE   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3  	CMPQ BX, $0x08  	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7  	CMPQ BX, $0x10 @@ -5989,9 +6713,18 @@ memmove_emit_remainder_encodeBetterBlockAsm:  	JBE  
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32  	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4: -	MOVL (CX), SI -	MOVL SI, (AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2: +	MOVB (CX), SI +	MOVB -1(CX)(BX*1), CL +	MOVB SI, (AX) +	MOVB CL, -1(AX)(BX*1) +	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3: +	MOVW (CX), SI +	MOVB 2(CX), CL +	MOVW SI, (AX) +	MOVB CL, 2(AX)  	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7: @@ -6214,17 +6947,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB:  #ifdef GOAMD64_v3  	TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R11, R11 @@ -6511,6 +7233,77 @@ four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB:  two_byte_offset_match_nolit_encodeBetterBlockAsm4MB:  	CMPL R12, $0x40  	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB +	CMPL R8, $0x00000800 +	JAE  long_offset_short_match_nolit_encodeBetterBlockAsm4MB +	MOVL $0x00000001, SI +	LEAL 16(SI), SI +	MOVB R8, 1(AX) +	SHRL $0x08, R8 +	SHLL $0x05, R8 +	ORL  R8, SI +	MOVB SI, (AX) +	ADDQ $0x02, AX +	SUBL $0x08, R12 + +	// emitRepeat +	LEAL -4(R12), R12 +	JMP  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b +	MOVL R12, SI +	LEAL -4(R12), R12 +	CMPL SI, $0x08 +	JLE  repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b +	CMPL SI, $0x0c +	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b +	CMPL R8, $0x00000800 +	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b + 
+cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: +	CMPL R12, $0x00000104 +	JLT  repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b +	CMPL R12, $0x00010100 +	JLT  repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b +	LEAL -65536(R12), R12 +	MOVL R12, R8 +	MOVW $0x001d, (AX) +	MOVW R12, 2(AX) +	SARL $0x10, R8 +	MOVB R8, 4(AX) +	ADDQ $0x05, AX +	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: +	LEAL -256(R12), R12 +	MOVW $0x0019, (AX) +	MOVW R12, 2(AX) +	ADDQ $0x04, AX +	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: +	LEAL -4(R12), R12 +	MOVW $0x0015, (AX) +	MOVB R12, 2(AX) +	ADDQ $0x03, AX +	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: +	SHLL $0x02, R12 +	ORL  $0x01, R12 +	MOVW R12, (AX) +	ADDQ $0x02, AX +	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: +	XORQ SI, SI +	LEAL 1(SI)(R12*4), R12 +	MOVB R8, 1(AX) +	SARL $0x08, R8 +	SHLL $0x05, R8 +	ORL  R8, R12 +	MOVB R12, (AX) +	ADDQ $0x02, AX +	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm4MB + +long_offset_short_match_nolit_encodeBetterBlockAsm4MB:  	MOVB $0xee, (AX)  	MOVW R8, 1(AX)  	LEAL -60(R12), R12 @@ -6911,8 +7704,9 @@ memmove_emit_remainder_encodeBetterBlockAsm4MB:  	MOVL SI, BX  	// genMemMoveShort -	CMPQ BX, $0x04 -	JLE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4 +	CMPQ BX, $0x03 +	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2 +	JE   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3  	CMPQ BX, $0x08  	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7  	CMPQ BX, $0x10 @@ -6921,9 +7715,18 @@ 
memmove_emit_remainder_encodeBetterBlockAsm4MB:  	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32  	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4: -	MOVL (CX), SI -	MOVL SI, (AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2: +	MOVB (CX), SI +	MOVB -1(CX)(BX*1), CL +	MOVB SI, (AX) +	MOVB CL, -1(AX)(BX*1) +	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3: +	MOVW (CX), SI +	MOVB 2(CX), CL +	MOVW SI, (AX) +	MOVB CL, 2(AX)  	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7: @@ -7138,17 +7941,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm12B:  #ifdef GOAMD64_v3  	TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R11, R11 @@ -7335,6 +8127,65 @@ emit_literal_done_match_emit_encodeBetterBlockAsm12B:  two_byte_offset_match_nolit_encodeBetterBlockAsm12B:  	CMPL R12, $0x40  	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B +	CMPL R8, $0x00000800 +	JAE  long_offset_short_match_nolit_encodeBetterBlockAsm12B +	MOVL $0x00000001, SI +	LEAL 16(SI), SI +	MOVB R8, 1(AX) +	SHRL $0x08, R8 +	SHLL $0x05, R8 +	ORL  R8, SI +	MOVB SI, (AX) +	ADDQ $0x02, AX +	SUBL $0x08, R12 + +	// emitRepeat +	LEAL -4(R12), R12 +	JMP  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b +	MOVL R12, SI +	LEAL -4(R12), R12 +	CMPL SI, $0x08 +	JLE  repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b +	CMPL SI, $0x0c +	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b +	CMPL R8, $0x00000800 +	JLT  
repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: +	CMPL R12, $0x00000104 +	JLT  repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b +	LEAL -256(R12), R12 +	MOVW $0x0019, (AX) +	MOVW R12, 2(AX) +	ADDQ $0x04, AX +	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: +	LEAL -4(R12), R12 +	MOVW $0x0015, (AX) +	MOVB R12, 2(AX) +	ADDQ $0x03, AX +	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: +	SHLL $0x02, R12 +	ORL  $0x01, R12 +	MOVW R12, (AX) +	ADDQ $0x02, AX +	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: +	XORQ SI, SI +	LEAL 1(SI)(R12*4), R12 +	MOVB R8, 1(AX) +	SARL $0x08, R8 +	SHLL $0x05, R8 +	ORL  R8, R12 +	MOVB R12, (AX) +	ADDQ $0x02, AX +	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm12B + +long_offset_short_match_nolit_encodeBetterBlockAsm12B:  	MOVB $0xee, (AX)  	MOVW R8, 1(AX)  	LEAL -60(R12), R12 @@ -7689,8 +8540,9 @@ memmove_emit_remainder_encodeBetterBlockAsm12B:  	MOVL SI, BX  	// genMemMoveShort -	CMPQ BX, $0x04 -	JLE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4 +	CMPQ BX, $0x03 +	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2 +	JE   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3  	CMPQ BX, $0x08  	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7  	CMPQ BX, $0x10 @@ -7699,9 +8551,18 @@ memmove_emit_remainder_encodeBetterBlockAsm12B:  	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32  	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4: -	MOVL 
(CX), SI -	MOVL SI, (AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2: +	MOVB (CX), SI +	MOVB -1(CX)(BX*1), CL +	MOVB SI, (AX) +	MOVB CL, -1(AX)(BX*1) +	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3: +	MOVW (CX), SI +	MOVB 2(CX), CL +	MOVW SI, (AX) +	MOVB CL, 2(AX)  	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7: @@ -7916,17 +8777,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm10B:  #ifdef GOAMD64_v3  	TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R11, R11 @@ -8113,6 +8963,65 @@ emit_literal_done_match_emit_encodeBetterBlockAsm10B:  two_byte_offset_match_nolit_encodeBetterBlockAsm10B:  	CMPL R12, $0x40  	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B +	CMPL R8, $0x00000800 +	JAE  long_offset_short_match_nolit_encodeBetterBlockAsm10B +	MOVL $0x00000001, SI +	LEAL 16(SI), SI +	MOVB R8, 1(AX) +	SHRL $0x08, R8 +	SHLL $0x05, R8 +	ORL  R8, SI +	MOVB SI, (AX) +	ADDQ $0x02, AX +	SUBL $0x08, R12 + +	// emitRepeat +	LEAL -4(R12), R12 +	JMP  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b +	MOVL R12, SI +	LEAL -4(R12), R12 +	CMPL SI, $0x08 +	JLE  repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b +	CMPL SI, $0x0c +	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b +	CMPL R8, $0x00000800 +	JLT  repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: +	CMPL R12, $0x00000104 +	JLT  repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b +	LEAL -256(R12), R12 +	MOVW $0x0019, (AX) +	MOVW R12, 2(AX) +	ADDQ $0x04, AX +	JMP  
match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: +	LEAL -4(R12), R12 +	MOVW $0x0015, (AX) +	MOVB R12, 2(AX) +	ADDQ $0x03, AX +	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: +	SHLL $0x02, R12 +	ORL  $0x01, R12 +	MOVW R12, (AX) +	ADDQ $0x02, AX +	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: +	XORQ SI, SI +	LEAL 1(SI)(R12*4), R12 +	MOVB R8, 1(AX) +	SARL $0x08, R8 +	SHLL $0x05, R8 +	ORL  R8, R12 +	MOVB R12, (AX) +	ADDQ $0x02, AX +	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm10B + +long_offset_short_match_nolit_encodeBetterBlockAsm10B:  	MOVB $0xee, (AX)  	MOVW R8, 1(AX)  	LEAL -60(R12), R12 @@ -8467,8 +9376,9 @@ memmove_emit_remainder_encodeBetterBlockAsm10B:  	MOVL SI, BX  	// genMemMoveShort -	CMPQ BX, $0x04 -	JLE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4 +	CMPQ BX, $0x03 +	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2 +	JE   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3  	CMPQ BX, $0x08  	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7  	CMPQ BX, $0x10 @@ -8477,9 +9387,18 @@ memmove_emit_remainder_encodeBetterBlockAsm10B:  	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32  	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4: -	MOVL (CX), SI -	MOVL SI, (AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2: +	MOVB (CX), SI +	MOVB -1(CX)(BX*1), CL +	MOVB SI, (AX) +	MOVB CL, -1(AX)(BX*1) +	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3: +	MOVW (CX), SI +	
MOVB 2(CX), CL +	MOVW SI, (AX) +	MOVB CL, 2(AX)  	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7: @@ -8694,17 +9613,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm8B:  #ifdef GOAMD64_v3  	TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R11, R11 @@ -8891,6 +9799,61 @@ emit_literal_done_match_emit_encodeBetterBlockAsm8B:  two_byte_offset_match_nolit_encodeBetterBlockAsm8B:  	CMPL R12, $0x40  	JLE  two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B +	CMPL R8, $0x00000800 +	JAE  long_offset_short_match_nolit_encodeBetterBlockAsm8B +	MOVL $0x00000001, SI +	LEAL 16(SI), SI +	MOVB R8, 1(AX) +	SHRL $0x08, R8 +	SHLL $0x05, R8 +	ORL  R8, SI +	MOVB SI, (AX) +	ADDQ $0x02, AX +	SUBL $0x08, R12 + +	// emitRepeat +	LEAL -4(R12), R12 +	JMP  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b +	MOVL R12, SI +	LEAL -4(R12), R12 +	CMPL SI, $0x08 +	JLE  repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b +	CMPL SI, $0x0c +	JGE  cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b + +cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: +	CMPL R12, $0x00000104 +	JLT  repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b +	LEAL -256(R12), R12 +	MOVW $0x0019, (AX) +	MOVW R12, 2(AX) +	ADDQ $0x04, AX +	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: +	LEAL -4(R12), R12 +	MOVW $0x0015, (AX) +	MOVB R12, 2(AX) +	ADDQ $0x03, AX +	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: +	SHLL $0x02, R12 +	ORL  $0x01, R12 +	MOVW R12, (AX) +	ADDQ $0x02, AX +	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B +	XORQ 
SI, SI +	LEAL 1(SI)(R12*4), R12 +	MOVB R8, 1(AX) +	SARL $0x08, R8 +	SHLL $0x05, R8 +	ORL  R8, R12 +	MOVB R12, (AX) +	ADDQ $0x02, AX +	JMP  match_nolit_emitcopy_end_encodeBetterBlockAsm8B + +long_offset_short_match_nolit_encodeBetterBlockAsm8B:  	MOVB $0xee, (AX)  	MOVW R8, 1(AX)  	LEAL -60(R12), R12 @@ -9235,8 +10198,9 @@ memmove_emit_remainder_encodeBetterBlockAsm8B:  	MOVL SI, BX  	// genMemMoveShort -	CMPQ BX, $0x04 -	JLE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4 +	CMPQ BX, $0x03 +	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2 +	JE   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3  	CMPQ BX, $0x08  	JB   emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7  	CMPQ BX, $0x10 @@ -9245,9 +10209,18 @@ memmove_emit_remainder_encodeBetterBlockAsm8B:  	JBE  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32  	JMP  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4: -	MOVL (CX), SI -	MOVL SI, (AX) +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2: +	MOVB (CX), SI +	MOVB -1(CX)(BX*1), CL +	MOVB SI, (AX) +	MOVB CL, -1(AX)(BX*1) +	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3: +	MOVW (CX), SI +	MOVB 2(CX), CL +	MOVW SI, (AX) +	MOVB CL, 2(AX)  	JMP  memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B  emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7: @@ -9584,17 +10557,6 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm:  #ifdef GOAMD64_v3  	TZCNTQ R10, R10 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R10, R10 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R10, R10 @@ -9918,17 +10880,6 @@ 
matchlen_loopback_match_nolit_encodeSnappyBlockAsm:  #ifdef GOAMD64_v3  	TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R9, R9 @@ -10127,17 +11078,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm:  	MOVL SI, BX  	// genMemMoveShort +	CMPQ BX, $0x03 +	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2 +	JE   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3  	CMPQ BX, $0x08 -	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8 +	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7  	CMPQ BX, $0x10  	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16  	CMPQ BX, $0x20  	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32  	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8: -	MOVQ (CX), SI -	MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2: +	MOVB (CX), SI +	MOVB -1(CX)(BX*1), CL +	MOVB SI, (AX) +	MOVB CL, -1(AX)(BX*1) +	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3: +	MOVW (CX), SI +	MOVB 2(CX), CL +	MOVW SI, (AX) +	MOVB CL, 2(AX) +	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7: +	MOVL (CX), SI +	MOVL -4(CX)(BX*1), CX +	MOVL SI, (AX) +	MOVL CX, -4(AX)(BX*1)  	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16: @@ -10448,17 +11418,6 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K:  #ifdef GOAMD64_v3  	TZCNTQ R10, R10 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ 
R10, R10 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R10, R10 @@ -10739,17 +11698,6 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K:  #ifdef GOAMD64_v3  	TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R9, R9 @@ -10905,17 +11853,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm64K:  	MOVL SI, BX  	// genMemMoveShort +	CMPQ BX, $0x03 +	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2 +	JE   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3  	CMPQ BX, $0x08 -	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8 +	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7  	CMPQ BX, $0x10  	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16  	CMPQ BX, $0x20  	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32  	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8: -	MOVQ (CX), SI -	MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2: +	MOVB (CX), SI +	MOVB -1(CX)(BX*1), CL +	MOVB SI, (AX) +	MOVB CL, -1(AX)(BX*1) +	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3: +	MOVW (CX), SI +	MOVB 2(CX), CL +	MOVW SI, (AX) +	MOVB CL, 2(AX) +	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7: +	MOVL (CX), SI +	MOVL -4(CX)(BX*1), CX +	MOVL SI, (AX) +	MOVL CX, -4(AX)(BX*1)  	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K  
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16: @@ -11226,17 +12193,6 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B:  #ifdef GOAMD64_v3  	TZCNTQ R10, R10 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R10, R10 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R10, R10 @@ -11517,17 +12473,6 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B:  #ifdef GOAMD64_v3  	TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R9, R9 @@ -11683,17 +12628,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm12B:  	MOVL SI, BX  	// genMemMoveShort +	CMPQ BX, $0x03 +	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2 +	JE   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3  	CMPQ BX, $0x08 -	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8 +	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7  	CMPQ BX, $0x10  	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16  	CMPQ BX, $0x20  	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32  	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8: -	MOVQ (CX), SI -	MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2: +	MOVB (CX), SI +	MOVB -1(CX)(BX*1), CL +	MOVB SI, (AX) +	MOVB CL, -1(AX)(BX*1) +	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3: +	MOVW (CX), SI +	MOVB 2(CX), CL +	MOVW SI, (AX) +	MOVB CL, 2(AX) +	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B + 
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7: +	MOVL (CX), SI +	MOVL -4(CX)(BX*1), CX +	MOVL SI, (AX) +	MOVL CX, -4(AX)(BX*1)  	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16: @@ -12004,17 +12968,6 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B:  #ifdef GOAMD64_v3  	TZCNTQ R10, R10 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R10, R10 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R10, R10 @@ -12295,17 +13248,6 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B:  #ifdef GOAMD64_v3  	TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R9, R9 @@ -12461,17 +13403,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm10B:  	MOVL SI, BX  	// genMemMoveShort +	CMPQ BX, $0x03 +	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2 +	JE   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3  	CMPQ BX, $0x08 -	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8 +	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7  	CMPQ BX, $0x10  	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16  	CMPQ BX, $0x20  	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32  	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8: -	MOVQ (CX), SI -	MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2: +	MOVB (CX), SI +	MOVB -1(CX)(BX*1), CL +	MOVB SI, (AX) +	MOVB CL, -1(AX)(BX*1) +	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + 
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3: +	MOVW (CX), SI +	MOVB 2(CX), CL +	MOVW SI, (AX) +	MOVB CL, 2(AX) +	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7: +	MOVL (CX), SI +	MOVL -4(CX)(BX*1), CX +	MOVL SI, (AX) +	MOVL CX, -4(AX)(BX*1)  	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16: @@ -12782,17 +13743,6 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B:  #ifdef GOAMD64_v3  	TZCNTQ R10, R10 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R10, R10 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R10, R10 @@ -13071,17 +14021,6 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B:  #ifdef GOAMD64_v3  	TZCNTQ R9, R9 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R9, R9 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R9, R9 @@ -13235,17 +14174,36 @@ memmove_emit_remainder_encodeSnappyBlockAsm8B:  	MOVL SI, BX  	// genMemMoveShort +	CMPQ BX, $0x03 +	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2 +	JE   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3  	CMPQ BX, $0x08 -	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8 +	JB   emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7  	CMPQ BX, $0x10  	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16  	CMPQ BX, $0x20  	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32  	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8: -	MOVQ (CX), SI -	MOVQ SI, (AX) 
+emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2: +	MOVB (CX), SI +	MOVB -1(CX)(BX*1), CL +	MOVB SI, (AX) +	MOVB CL, -1(AX)(BX*1) +	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3: +	MOVW (CX), SI +	MOVB 2(CX), CL +	MOVW SI, (AX) +	MOVB CL, 2(AX) +	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7: +	MOVL (CX), SI +	MOVL -4(CX)(BX*1), CX +	MOVL SI, (AX) +	MOVL CX, -4(AX)(BX*1)  	JMP  memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B  emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16: @@ -13461,17 +14419,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm:  #ifdef GOAMD64_v3  	TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R11, R11 @@ -13850,17 +14797,36 @@ memmove_emit_remainder_encodeSnappyBetterBlockAsm:  	MOVL SI, BX  	// genMemMoveShort +	CMPQ BX, $0x03 +	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2 +	JE   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3  	CMPQ BX, $0x08 -	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8 +	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7  	CMPQ BX, $0x10  	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16  	CMPQ BX, $0x20  	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32  	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8: -	MOVQ (CX), SI -	MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2: +	
MOVB (CX), SI +	MOVB -1(CX)(BX*1), CL +	MOVB SI, (AX) +	MOVB CL, -1(AX)(BX*1) +	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3: +	MOVW (CX), SI +	MOVB 2(CX), CL +	MOVW SI, (AX) +	MOVB CL, 2(AX) +	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7: +	MOVL (CX), SI +	MOVL -4(CX)(BX*1), CX +	MOVL SI, (AX) +	MOVL CX, -4(AX)(BX*1)  	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16: @@ -14068,17 +15034,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K:  #ifdef GOAMD64_v3  	TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R11, R11 @@ -14386,17 +15341,36 @@ memmove_emit_remainder_encodeSnappyBetterBlockAsm64K:  	MOVL SI, BX  	// genMemMoveShort +	CMPQ BX, $0x03 +	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2 +	JE   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3  	CMPQ BX, $0x08 -	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8 +	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7  	CMPQ BX, $0x10  	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16  	CMPQ BX, $0x20  	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32  	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8: -	MOVQ (CX), SI -	MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2: +	MOVB (CX), SI +	
MOVB -1(CX)(BX*1), CL +	MOVB SI, (AX) +	MOVB CL, -1(AX)(BX*1) +	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3: +	MOVW (CX), SI +	MOVB 2(CX), CL +	MOVW SI, (AX) +	MOVB CL, 2(AX) +	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7: +	MOVL (CX), SI +	MOVL -4(CX)(BX*1), CX +	MOVL SI, (AX) +	MOVL CX, -4(AX)(BX*1)  	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: @@ -14604,17 +15578,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B:  #ifdef GOAMD64_v3  	TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R11, R11 @@ -14922,17 +15885,36 @@ memmove_emit_remainder_encodeSnappyBetterBlockAsm12B:  	MOVL SI, BX  	// genMemMoveShort +	CMPQ BX, $0x03 +	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2 +	JE   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3  	CMPQ BX, $0x08 -	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8 +	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7  	CMPQ BX, $0x10  	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16  	CMPQ BX, $0x20  	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32  	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8: -	MOVQ (CX), SI -	MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2: +	MOVB (CX), SI +	
MOVB -1(CX)(BX*1), CL +	MOVB SI, (AX) +	MOVB CL, -1(AX)(BX*1) +	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3: +	MOVW (CX), SI +	MOVB 2(CX), CL +	MOVW SI, (AX) +	MOVB CL, 2(AX) +	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7: +	MOVL (CX), SI +	MOVL -4(CX)(BX*1), CX +	MOVL SI, (AX) +	MOVL CX, -4(AX)(BX*1)  	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: @@ -15140,17 +16122,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B:  #ifdef GOAMD64_v3  	TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R11, R11 @@ -15458,17 +16429,36 @@ memmove_emit_remainder_encodeSnappyBetterBlockAsm10B:  	MOVL SI, BX  	// genMemMoveShort +	CMPQ BX, $0x03 +	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2 +	JE   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3  	CMPQ BX, $0x08 -	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8 +	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7  	CMPQ BX, $0x10  	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16  	CMPQ BX, $0x20  	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32  	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8: -	MOVQ (CX), SI -	MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2: +	MOVB (CX), SI +	
MOVB -1(CX)(BX*1), CL +	MOVB SI, (AX) +	MOVB CL, -1(AX)(BX*1) +	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3: +	MOVW (CX), SI +	MOVB 2(CX), CL +	MOVW SI, (AX) +	MOVB CL, 2(AX) +	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7: +	MOVL (CX), SI +	MOVL -4(CX)(BX*1), CX +	MOVL SI, (AX) +	MOVL CX, -4(AX)(BX*1)  	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: @@ -15676,17 +16666,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B:  #ifdef GOAMD64_v3  	TZCNTQ R11, R11 -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ R11, R11 - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ R11, R11 @@ -15992,17 +16971,36 @@ memmove_emit_remainder_encodeSnappyBetterBlockAsm8B:  	MOVL SI, BX  	// genMemMoveShort +	CMPQ BX, $0x03 +	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2 +	JE   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3  	CMPQ BX, $0x08 -	JLE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8 +	JB   emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7  	CMPQ BX, $0x10  	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16  	CMPQ BX, $0x20  	JBE  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32  	JMP  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8: -	MOVQ (CX), SI -	MOVQ SI, (AX) +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2: +	MOVB (CX), SI +	MOVB 
-1(CX)(BX*1), CL +	MOVB SI, (AX) +	MOVB CL, -1(AX)(BX*1) +	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3: +	MOVW (CX), SI +	MOVB 2(CX), CL +	MOVW SI, (AX) +	MOVB CL, 2(AX) +	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B + +emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7: +	MOVL (CX), SI +	MOVL -4(CX)(BX*1), CX +	MOVL SI, (AX) +	MOVL CX, -4(AX)(BX*1)  	JMP  memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B  emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: @@ -16443,6 +17441,97 @@ four_bytes_remain_standalone:  two_byte_offset_standalone:  	CMPL DX, $0x40  	JLE  two_byte_offset_short_standalone +	CMPL CX, $0x00000800 +	JAE  long_offset_short_standalone +	MOVL $0x00000001, SI +	LEAL 16(SI), SI +	MOVB CL, 1(AX) +	MOVL CX, DI +	SHRL $0x08, DI +	SHLL $0x05, DI +	ORL  DI, SI +	MOVB SI, (AX) +	ADDQ $0x02, BX +	ADDQ $0x02, AX +	SUBL $0x08, DX + +	// emitRepeat +	LEAL -4(DX), DX +	JMP  cant_repeat_two_offset_standalone_emit_copy_short_2b + +emit_repeat_again_standalone_emit_copy_short_2b: +	MOVL DX, SI +	LEAL -4(DX), DX +	CMPL SI, $0x08 +	JLE  repeat_two_standalone_emit_copy_short_2b +	CMPL SI, $0x0c +	JGE  cant_repeat_two_offset_standalone_emit_copy_short_2b +	CMPL CX, $0x00000800 +	JLT  repeat_two_offset_standalone_emit_copy_short_2b + +cant_repeat_two_offset_standalone_emit_copy_short_2b: +	CMPL DX, $0x00000104 +	JLT  repeat_three_standalone_emit_copy_short_2b +	CMPL DX, $0x00010100 +	JLT  repeat_four_standalone_emit_copy_short_2b +	CMPL DX, $0x0100ffff +	JLT  repeat_five_standalone_emit_copy_short_2b +	LEAL -16842747(DX), DX +	MOVW $0x001d, (AX) +	MOVW $0xfffb, 2(AX) +	MOVB $0xff, 4(AX) +	ADDQ $0x05, AX +	ADDQ $0x05, BX +	JMP  emit_repeat_again_standalone_emit_copy_short_2b + +repeat_five_standalone_emit_copy_short_2b: +	LEAL -65536(DX), DX +	MOVL DX, CX +	MOVW $0x001d, 
(AX) +	MOVW DX, 2(AX) +	SARL $0x10, CX +	MOVB CL, 4(AX) +	ADDQ $0x05, BX +	ADDQ $0x05, AX +	JMP  gen_emit_copy_end + +repeat_four_standalone_emit_copy_short_2b: +	LEAL -256(DX), DX +	MOVW $0x0019, (AX) +	MOVW DX, 2(AX) +	ADDQ $0x04, BX +	ADDQ $0x04, AX +	JMP  gen_emit_copy_end + +repeat_three_standalone_emit_copy_short_2b: +	LEAL -4(DX), DX +	MOVW $0x0015, (AX) +	MOVB DL, 2(AX) +	ADDQ $0x03, BX +	ADDQ $0x03, AX +	JMP  gen_emit_copy_end + +repeat_two_standalone_emit_copy_short_2b: +	SHLL $0x02, DX +	ORL  $0x01, DX +	MOVW DX, (AX) +	ADDQ $0x02, BX +	ADDQ $0x02, AX +	JMP  gen_emit_copy_end + +repeat_two_offset_standalone_emit_copy_short_2b: +	XORQ SI, SI +	LEAL 1(SI)(DX*4), DX +	MOVB CL, 1(AX) +	SARL $0x08, CX +	SHLL $0x05, CX +	ORL  CX, DX +	MOVB DL, (AX) +	ADDQ $0x02, BX +	ADDQ $0x02, AX +	JMP  gen_emit_copy_end + +long_offset_short_standalone:  	MOVB $0xee, (AX)  	MOVW CX, 1(AX)  	LEAL -60(DX), DX @@ -16644,17 +17733,6 @@ matchlen_loopback_standalone:  #ifdef GOAMD64_v3  	TZCNTQ BX, BX -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef GOAMD64_v4 -	TZCNTQ BX, BX - -#define TZCNTQ_EMITTED 1 -#endif - -#ifdef TZCNTQ_EMITTED -#undef TZCNTQ_EMITTED  #else  	BSFQ BX, BX diff --git a/vendor/github.com/klauspost/compress/s2/index.go b/vendor/github.com/klauspost/compress/s2/index.go index fd857682e..dd9ecfe71 100644 --- a/vendor/github.com/klauspost/compress/s2/index.go +++ b/vendor/github.com/klauspost/compress/s2/index.go @@ -10,6 +10,7 @@ import (  	"encoding/json"  	"fmt"  	"io" +	"sort"  )  const ( @@ -100,6 +101,15 @@ func (i *Index) Find(offset int64) (compressedOff, uncompressedOff int64, err er  	if offset > i.TotalUncompressed {  		return 0, 0, io.ErrUnexpectedEOF  	} +	if len(i.info) > 200 { +		n := sort.Search(len(i.info), func(n int) bool { +			return i.info[n].uncompressedOffset > offset +		}) +		if n == 0 { +			n = 1 +		} +		return i.info[n-1].compressedOffset, i.info[n-1].uncompressedOffset, nil +	}  	for _, info := range i.info {  		if info.uncompressedOffset > 
offset {  			break @@ -523,3 +533,66 @@ func (i *Index) JSON() []byte {  	b, _ := json.MarshalIndent(x, "", "  ")  	return b  } + +// RemoveIndexHeaders will trim all headers and trailers from a given index. +// This is expected to save 20 bytes. +// These can be restored using RestoreIndexHeaders. +// This removes a layer of security, but is the most compact representation. +// Returns nil if headers contains errors. +// The returned slice references the provided slice. +func RemoveIndexHeaders(b []byte) []byte { +	const save = 4 + len(S2IndexHeader) + len(S2IndexTrailer) + 4 +	if len(b) <= save { +		return nil +	} +	if b[0] != ChunkTypeIndex { +		return nil +	} +	chunkLen := int(b[1]) | int(b[2])<<8 | int(b[3])<<16 +	b = b[4:] + +	// Validate we have enough... +	if len(b) < chunkLen { +		return nil +	} +	b = b[:chunkLen] + +	if !bytes.Equal(b[:len(S2IndexHeader)], []byte(S2IndexHeader)) { +		return nil +	} +	b = b[len(S2IndexHeader):] +	if !bytes.HasSuffix(b, []byte(S2IndexTrailer)) { +		return nil +	} +	b = bytes.TrimSuffix(b, []byte(S2IndexTrailer)) + +	if len(b) < 4 { +		return nil +	} +	return b[:len(b)-4] +} + +// RestoreIndexHeaders will index restore headers removed by RemoveIndexHeaders. +// No error checking is performed on the input. +// If a 0 length slice is sent, it is returned without modification. +func RestoreIndexHeaders(in []byte) []byte { +	if len(in) == 0 { +		return in +	} +	b := make([]byte, 0, 4+len(S2IndexHeader)+len(in)+len(S2IndexTrailer)+4) +	b = append(b, ChunkTypeIndex, 0, 0, 0) +	b = append(b, []byte(S2IndexHeader)...) +	b = append(b, in...) + +	var tmp [4]byte +	binary.LittleEndian.PutUint32(tmp[:], uint32(len(b)+4+len(S2IndexTrailer))) +	b = append(b, tmp[:4]...) +	// Trailer +	b = append(b, []byte(S2IndexTrailer)...) + +	chunkLen := len(b) - skippableFrameHeader +	b[1] = uint8(chunkLen >> 0) +	b[2] = uint8(chunkLen >> 8) +	b[3] = uint8(chunkLen >> 16) +	return b +}  | 
