diff options
Diffstat (limited to 'vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s')
-rw-r--r-- | vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s | 21303 |
1 files changed, 0 insertions, 21303 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s deleted file mode 100644 index df9be687b..000000000 --- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s +++ /dev/null @@ -1,21303 +0,0 @@ -// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. - -//go:build !appengine && !noasm && gc && !noasm - -#include "textflag.h" - -// func _dummy_() -TEXT ·_dummy_(SB), $0 -#ifdef GOAMD64_v4 -#ifndef GOAMD64_v3 -#define GOAMD64_v3 -#endif -#endif - RET - -// func encodeBlockAsm(dst []byte, src []byte, tmp *[65536]byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBlockAsm(SB), $24-64 - MOVQ tmp+48(FP), AX - MOVQ dst_base+0(FP), CX - MOVQ $0x00000200, DX - MOVQ AX, BX - PXOR X0, X0 - -zero_loop_encodeBlockAsm: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeBlockAsm - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), DX - LEAQ -9(DX), BX - LEAQ -8(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL DX, 16(SP) - MOVQ src_base+24(FP), BX - -search_loop_encodeBlockAsm: - MOVL DX, SI - SUBL 12(SP), SI - SHRL $0x06, SI - LEAL 4(DX)(SI*1), SI - CMPL SI, 8(SP) - JAE emit_remainder_encodeBlockAsm - MOVQ (BX)(DX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - SHLQ $0x10, R11 - IMULQ R9, R11 - SHRQ $0x32, R11 - MOVL (AX)(R10*4), SI - MOVL (AX)(R11*4), R8 - MOVL DX, (AX)(R10*4) - LEAL 1(DX), R10 - MOVL R10, (AX)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - MOVL DX, R9 - SUBL 16(SP), R9 - MOVL 1(BX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, 
R11 - JNE no_repeat_found_encodeBlockAsm - LEAL 1(DX), DI - MOVL 12(SP), R8 - MOVL DI, SI - SUBL 16(SP), SI - JZ repeat_extend_back_end_encodeBlockAsm - -repeat_extend_back_loop_encodeBlockAsm: - CMPL DI, R8 - JBE repeat_extend_back_end_encodeBlockAsm - MOVB -1(BX)(SI*1), R9 - MOVB -1(BX)(DI*1), R10 - CMPB R9, R10 - JNE repeat_extend_back_end_encodeBlockAsm - LEAL -1(DI), DI - DECL SI - JNZ repeat_extend_back_loop_encodeBlockAsm - -repeat_extend_back_end_encodeBlockAsm: - MOVL DI, SI - SUBL 12(SP), SI - LEAQ 5(CX)(SI*1), SI - CMPQ SI, (SP) - JB repeat_dst_size_check_encodeBlockAsm - MOVQ $0x00000000, ret+56(FP) - RET - -repeat_dst_size_check_encodeBlockAsm: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JB one_byte_repeat_emit_encodeBlockAsm - CMPL SI, $0x00000100 - JB two_bytes_repeat_emit_encodeBlockAsm - CMPL SI, $0x00010000 - JB three_bytes_repeat_emit_encodeBlockAsm - CMPL SI, $0x01000000 - JB four_bytes_repeat_emit_encodeBlockAsm - MOVB $0xfc, (CX) - MOVL SI, 1(CX) - ADDQ $0x05, CX - JMP memmove_long_repeat_emit_encodeBlockAsm - -four_bytes_repeat_emit_encodeBlockAsm: - MOVL SI, R11 - SHRL $0x10, R11 - MOVB $0xf8, (CX) - MOVW SI, 1(CX) - MOVB R11, 3(CX) - ADDQ $0x04, CX - JMP memmove_long_repeat_emit_encodeBlockAsm - -three_bytes_repeat_emit_encodeBlockAsm: - MOVB $0xf4, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_repeat_emit_encodeBlockAsm - -two_bytes_repeat_emit_encodeBlockAsm: - MOVB $0xf0, (CX) - MOVB SI, 1(CX) - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_repeat_emit_encodeBlockAsm - JMP memmove_long_repeat_emit_encodeBlockAsm - -one_byte_repeat_emit_encodeBlockAsm: - SHLB $0x02, SI - MOVB SI, (CX) - ADDQ $0x01, CX - -memmove_repeat_emit_encodeBlockAsm: - LEAQ (CX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8 - CMPQ R9, $0x10 - 
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (CX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm - -emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (CX) - MOVQ R10, -8(CX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm - -emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm - -emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_repeat_emit_encodeBlockAsm: - MOVQ SI, CX - JMP emit_literal_done_repeat_emit_encodeBlockAsm - -memmove_long_repeat_emit_encodeBlockAsm: - LEAQ (CX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R12 - SHRQ $0x05, R12 - MOVQ CX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R13 - SUBQ R11, R13 - DECQ R12 - JA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R10)(R13*1), R11 - LEAQ -32(CX)(R13*1), R14 - -emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R11 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R10)(R13*1), 
X4 - MOVOU -16(R10)(R13*1), X5 - MOVOA X4, -32(CX)(R13*1) - MOVOA X5, -16(CX)(R13*1) - ADDQ $0x20, R13 - CMPQ R9, R13 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ SI, CX - -emit_literal_done_repeat_emit_encodeBlockAsm: - ADDL $0x05, DX - MOVL DX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R9 - SUBL DX, R9 - LEAQ (BX)(DX*1), R10 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R12, R12 - -matchlen_loopback_16_repeat_extend_encodeBlockAsm: - CMPL R9, $0x10 - JB matchlen_match8_repeat_extend_encodeBlockAsm - MOVQ (R10)(R12*1), R11 - MOVQ 8(R10)(R12*1), R13 - XORQ (SI)(R12*1), R11 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm - XORQ 8(SI)(R12*1), R13 - JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm - LEAL -16(R9), R9 - LEAL 16(R12), R12 - JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm - -matchlen_bsf_16repeat_extend_encodeBlockAsm: -#ifdef GOAMD64_v3 - TZCNTQ R13, R13 - -#else - BSFQ R13, R13 - -#endif - SARQ $0x03, R13 - LEAL 8(R12)(R13*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm - -matchlen_match8_repeat_extend_encodeBlockAsm: - CMPL R9, $0x08 - JB matchlen_match4_repeat_extend_encodeBlockAsm - MOVQ (R10)(R12*1), R11 - XORQ (SI)(R12*1), R11 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm - LEAL -8(R9), R9 - LEAL 8(R12), R12 - JMP matchlen_match4_repeat_extend_encodeBlockAsm - -matchlen_bsf_8_repeat_extend_encodeBlockAsm: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm - -matchlen_match4_repeat_extend_encodeBlockAsm: - CMPL R9, $0x04 - JB matchlen_match2_repeat_extend_encodeBlockAsm - MOVL (R10)(R12*1), R11 - CMPL (SI)(R12*1), R11 - JNE matchlen_match2_repeat_extend_encodeBlockAsm - LEAL -4(R9), R9 - LEAL 4(R12), R12 - -matchlen_match2_repeat_extend_encodeBlockAsm: - CMPL R9, $0x01 - JE 
matchlen_match1_repeat_extend_encodeBlockAsm - JB repeat_extend_forward_end_encodeBlockAsm - MOVW (R10)(R12*1), R11 - CMPW (SI)(R12*1), R11 - JNE matchlen_match1_repeat_extend_encodeBlockAsm - LEAL 2(R12), R12 - SUBL $0x02, R9 - JZ repeat_extend_forward_end_encodeBlockAsm - -matchlen_match1_repeat_extend_encodeBlockAsm: - MOVB (R10)(R12*1), R11 - CMPB (SI)(R12*1), R11 - JNE repeat_extend_forward_end_encodeBlockAsm - LEAL 1(R12), R12 - -repeat_extend_forward_end_encodeBlockAsm: - ADDL R12, DX - MOVL DX, SI - SUBL DI, SI - MOVL 16(SP), DI - TESTL R8, R8 - JZ repeat_as_copy_encodeBlockAsm - - // emitRepeat -emit_repeat_again_match_repeat_encodeBlockAsm: - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JBE repeat_two_match_repeat_encodeBlockAsm - CMPL R8, $0x0c - JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm - CMPL DI, $0x00000800 - JB repeat_two_offset_match_repeat_encodeBlockAsm - -cant_repeat_two_offset_match_repeat_encodeBlockAsm: - CMPL SI, $0x00000104 - JB repeat_three_match_repeat_encodeBlockAsm - CMPL SI, $0x00010100 - JB repeat_four_match_repeat_encodeBlockAsm - CMPL SI, $0x0100ffff - JB repeat_five_match_repeat_encodeBlockAsm - LEAL -16842747(SI), SI - MOVL $0xfffb001d, (CX) - MOVB $0xff, 4(CX) - ADDQ $0x05, CX - JMP emit_repeat_again_match_repeat_encodeBlockAsm - -repeat_five_match_repeat_encodeBlockAsm: - LEAL -65536(SI), SI - MOVL SI, DI - MOVW $0x001d, (CX) - MOVW SI, 2(CX) - SARL $0x10, DI - MOVB DI, 4(CX) - ADDQ $0x05, CX - JMP repeat_end_emit_encodeBlockAsm - -repeat_four_match_repeat_encodeBlockAsm: - LEAL -256(SI), SI - MOVW $0x0019, (CX) - MOVW SI, 2(CX) - ADDQ $0x04, CX - JMP repeat_end_emit_encodeBlockAsm - -repeat_three_match_repeat_encodeBlockAsm: - LEAL -4(SI), SI - MOVW $0x0015, (CX) - MOVB SI, 2(CX) - ADDQ $0x03, CX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_match_repeat_encodeBlockAsm: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm - 
-repeat_two_offset_match_repeat_encodeBlockAsm: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(CX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm - -repeat_as_copy_encodeBlockAsm: - // emitCopy - CMPL DI, $0x00010000 - JB two_byte_offset_repeat_as_copy_encodeBlockAsm - CMPL SI, $0x40 - JBE four_bytes_remain_repeat_as_copy_encodeBlockAsm - MOVB $0xff, (CX) - MOVL DI, 1(CX) - LEAL -64(SI), SI - ADDQ $0x05, CX - CMPL SI, $0x04 - JB four_bytes_remain_repeat_as_copy_encodeBlockAsm - - // emitRepeat -emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy: - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL R8, $0x0c - JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL DI, $0x00000800 - JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: - CMPL SI, $0x00000104 - JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL SI, $0x00010100 - JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL SI, $0x0100ffff - JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy - LEAL -16842747(SI), SI - MOVL $0xfffb001d, (CX) - MOVB $0xff, 4(CX) - ADDQ $0x05, CX - JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy - -repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy: - LEAL -65536(SI), SI - MOVL SI, DI - MOVW $0x001d, (CX) - MOVW SI, 2(CX) - SARL $0x10, DI - MOVB DI, 4(CX) - ADDQ $0x05, CX - JMP repeat_end_emit_encodeBlockAsm - -repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy: - LEAL -256(SI), SI - MOVW $0x0019, (CX) - MOVW SI, 2(CX) - ADDQ $0x04, CX - JMP repeat_end_emit_encodeBlockAsm - -repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy: - LEAL -4(SI), SI - MOVW $0x0015, (CX) - MOVB SI, 2(CX) - ADDQ $0x03, CX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy: - SHLL $0x02, SI - ORL 
$0x01, SI - MOVW SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(CX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm - -four_bytes_remain_repeat_as_copy_encodeBlockAsm: - TESTL SI, SI - JZ repeat_end_emit_encodeBlockAsm - XORL R8, R8 - LEAL -1(R8)(SI*4), SI - MOVB SI, (CX) - MOVL DI, 1(CX) - ADDQ $0x05, CX - JMP repeat_end_emit_encodeBlockAsm - -two_byte_offset_repeat_as_copy_encodeBlockAsm: - CMPL SI, $0x40 - JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm - CMPL DI, $0x00000800 - JAE long_offset_short_repeat_as_copy_encodeBlockAsm - MOVL $0x00000001, R8 - LEAL 16(R8), R8 - MOVB DI, 1(CX) - MOVL DI, R9 - SHRL $0x08, R9 - SHLL $0x05, R9 - ORL R9, R8 - MOVB R8, (CX) - ADDQ $0x02, CX - SUBL $0x08, SI - - // emitRepeat - LEAL -4(SI), SI - JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b - -emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b - CMPL R8, $0x0c - JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b - CMPL DI, $0x00000800 - JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: - CMPL SI, $0x00000104 - JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b - CMPL SI, $0x00010100 - JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b - CMPL SI, $0x0100ffff - JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b - LEAL -16842747(SI), SI - MOVL $0xfffb001d, (CX) - MOVB $0xff, 4(CX) - ADDQ $0x05, CX - JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b - -repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: - LEAL -65536(SI), SI - MOVL 
SI, DI - MOVW $0x001d, (CX) - MOVW SI, 2(CX) - SARL $0x10, DI - MOVB DI, 4(CX) - ADDQ $0x05, CX - JMP repeat_end_emit_encodeBlockAsm - -repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: - LEAL -256(SI), SI - MOVW $0x0019, (CX) - MOVW SI, 2(CX) - ADDQ $0x04, CX - JMP repeat_end_emit_encodeBlockAsm - -repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: - LEAL -4(SI), SI - MOVW $0x0015, (CX) - MOVB SI, 2(CX) - ADDQ $0x03, CX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(CX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm - -long_offset_short_repeat_as_copy_encodeBlockAsm: - MOVB $0xee, (CX) - MOVW DI, 1(CX) - LEAL -60(SI), SI - ADDQ $0x03, CX - - // emitRepeat -emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short: - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL R8, $0x0c - JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL DI, $0x00000800 - JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: - CMPL SI, $0x00000104 - JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL SI, $0x00010100 - JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL SI, $0x0100ffff - JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short - LEAL -16842747(SI), SI - MOVL $0xfffb001d, (CX) - MOVB $0xff, 4(CX) - ADDQ $0x05, CX - JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short - -repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short: - LEAL -65536(SI), SI - MOVL SI, DI - MOVW 
$0x001d, (CX) - MOVW SI, 2(CX) - SARL $0x10, DI - MOVB DI, 4(CX) - ADDQ $0x05, CX - JMP repeat_end_emit_encodeBlockAsm - -repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short: - LEAL -256(SI), SI - MOVW $0x0019, (CX) - MOVW SI, 2(CX) - ADDQ $0x04, CX - JMP repeat_end_emit_encodeBlockAsm - -repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short: - LEAL -4(SI), SI - MOVW $0x0015, (CX) - MOVB SI, 2(CX) - ADDQ $0x03, CX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(CX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm - -two_byte_offset_short_repeat_as_copy_encodeBlockAsm: - MOVL SI, R8 - SHLL $0x02, R8 - CMPL SI, $0x0c - JAE emit_copy_three_repeat_as_copy_encodeBlockAsm - CMPL DI, $0x00000800 - JAE emit_copy_three_repeat_as_copy_encodeBlockAsm - LEAL -15(R8), R8 - MOVB DI, 1(CX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, R8 - MOVB R8, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm - -emit_copy_three_repeat_as_copy_encodeBlockAsm: - LEAL -2(R8), R8 - MOVB R8, (CX) - MOVW DI, 1(CX) - ADDQ $0x03, CX - -repeat_end_emit_encodeBlockAsm: - MOVL DX, 12(SP) - JMP search_loop_encodeBlockAsm - -no_repeat_found_encodeBlockAsm: - CMPL (BX)(SI*1), DI - JEQ candidate_match_encodeBlockAsm - SHRQ $0x08, DI - MOVL (AX)(R10*4), SI - LEAL 2(DX), R9 - CMPL (BX)(R8*1), DI - JEQ candidate2_match_encodeBlockAsm - MOVL R9, (AX)(R10*4) - SHRQ $0x08, DI - CMPL (BX)(SI*1), DI - JEQ candidate3_match_encodeBlockAsm - MOVL 20(SP), DX - JMP search_loop_encodeBlockAsm - -candidate3_match_encodeBlockAsm: - ADDL $0x02, DX - JMP candidate_match_encodeBlockAsm - -candidate2_match_encodeBlockAsm: - MOVL R9, (AX)(R10*4) - INCL DX - MOVL R8, SI - 
-candidate_match_encodeBlockAsm: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBlockAsm - -match_extend_back_loop_encodeBlockAsm: - CMPL DX, DI - JBE match_extend_back_end_encodeBlockAsm - MOVB -1(BX)(SI*1), R8 - MOVB -1(BX)(DX*1), R9 - CMPB R8, R9 - JNE match_extend_back_end_encodeBlockAsm - LEAL -1(DX), DX - DECL SI - JZ match_extend_back_end_encodeBlockAsm - JMP match_extend_back_loop_encodeBlockAsm - -match_extend_back_end_encodeBlockAsm: - MOVL DX, DI - SUBL 12(SP), DI - LEAQ 5(CX)(DI*1), DI - CMPQ DI, (SP) - JB match_dst_size_check_encodeBlockAsm - MOVQ $0x00000000, ret+56(FP) - RET - -match_dst_size_check_encodeBlockAsm: - MOVL DX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeBlockAsm - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JB one_byte_match_emit_encodeBlockAsm - CMPL R8, $0x00000100 - JB two_bytes_match_emit_encodeBlockAsm - CMPL R8, $0x00010000 - JB three_bytes_match_emit_encodeBlockAsm - CMPL R8, $0x01000000 - JB four_bytes_match_emit_encodeBlockAsm - MOVB $0xfc, (CX) - MOVL R8, 1(CX) - ADDQ $0x05, CX - JMP memmove_long_match_emit_encodeBlockAsm - -four_bytes_match_emit_encodeBlockAsm: - MOVL R8, R10 - SHRL $0x10, R10 - MOVB $0xf8, (CX) - MOVW R8, 1(CX) - MOVB R10, 3(CX) - ADDQ $0x04, CX - JMP memmove_long_match_emit_encodeBlockAsm - -three_bytes_match_emit_encodeBlockAsm: - MOVB $0xf4, (CX) - MOVW R8, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_encodeBlockAsm - -two_bytes_match_emit_encodeBlockAsm: - MOVB $0xf0, (CX) - MOVB R8, 1(CX) - ADDQ $0x02, CX - CMPL R8, $0x40 - JB memmove_match_emit_encodeBlockAsm - JMP memmove_long_match_emit_encodeBlockAsm - -one_byte_match_emit_encodeBlockAsm: - SHLB $0x02, R8 - MOVB R8, (CX) - ADDQ $0x01, CX - -memmove_match_emit_encodeBlockAsm: - LEAQ (CX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8 - CMPQ R9, $0x10 - JBE 
emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (CX) - JMP memmove_end_copy_match_emit_encodeBlockAsm - -emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (CX) - MOVQ DI, -8(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm - -emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm - -emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_match_emit_encodeBlockAsm: - MOVQ R8, CX - JMP emit_literal_done_match_emit_encodeBlockAsm - -memmove_long_match_emit_encodeBlockAsm: - LEAQ (CX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(CX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA 
X4, -32(CX)(R12*1) - MOVOA X5, -16(CX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ R8, CX - -emit_literal_done_match_emit_encodeBlockAsm: -match_nolit_loop_encodeBlockAsm: - MOVL DX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, DX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL DX, DI - LEAQ (BX)(DX*1), R8 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R10, R10 - -matchlen_loopback_16_match_nolit_encodeBlockAsm: - CMPL DI, $0x10 - JB matchlen_match8_match_nolit_encodeBlockAsm - MOVQ (R8)(R10*1), R9 - MOVQ 8(R8)(R10*1), R11 - XORQ (SI)(R10*1), R9 - JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm - XORQ 8(SI)(R10*1), R11 - JNZ matchlen_bsf_16match_nolit_encodeBlockAsm - LEAL -16(DI), DI - LEAL 16(R10), R10 - JMP matchlen_loopback_16_match_nolit_encodeBlockAsm - -matchlen_bsf_16match_nolit_encodeBlockAsm: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL 8(R10)(R11*1), R10 - JMP match_nolit_end_encodeBlockAsm - -matchlen_match8_match_nolit_encodeBlockAsm: - CMPL DI, $0x08 - JB matchlen_match4_match_nolit_encodeBlockAsm - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm - LEAL -8(DI), DI - LEAL 8(R10), R10 - JMP matchlen_match4_match_nolit_encodeBlockAsm - -matchlen_bsf_8_match_nolit_encodeBlockAsm: -#ifdef GOAMD64_v3 - TZCNTQ R9, R9 - -#else - BSFQ R9, R9 - -#endif - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm - -matchlen_match4_match_nolit_encodeBlockAsm: - CMPL DI, $0x04 - JB matchlen_match2_match_nolit_encodeBlockAsm - MOVL (R8)(R10*1), R9 - CMPL (SI)(R10*1), R9 - JNE matchlen_match2_match_nolit_encodeBlockAsm - LEAL -4(DI), DI - LEAL 4(R10), R10 - -matchlen_match2_match_nolit_encodeBlockAsm: - CMPL DI, $0x01 - JE matchlen_match1_match_nolit_encodeBlockAsm - JB 
match_nolit_end_encodeBlockAsm - MOVW (R8)(R10*1), R9 - CMPW (SI)(R10*1), R9 - JNE matchlen_match1_match_nolit_encodeBlockAsm - LEAL 2(R10), R10 - SUBL $0x02, DI - JZ match_nolit_end_encodeBlockAsm - -matchlen_match1_match_nolit_encodeBlockAsm: - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeBlockAsm - LEAL 1(R10), R10 - -match_nolit_end_encodeBlockAsm: - ADDL R10, DX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL DX, 12(SP) - - // emitCopy - CMPL SI, $0x00010000 - JB two_byte_offset_match_nolit_encodeBlockAsm - CMPL R10, $0x40 - JBE four_bytes_remain_match_nolit_encodeBlockAsm - MOVB $0xff, (CX) - MOVL SI, 1(CX) - LEAL -64(R10), R10 - ADDQ $0x05, CX - CMPL R10, $0x04 - JB four_bytes_remain_match_nolit_encodeBlockAsm - - // emitRepeat -emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy: - MOVL R10, DI - LEAL -4(R10), R10 - CMPL DI, $0x08 - JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy - CMPL DI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy - CMPL SI, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy - -cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: - CMPL R10, $0x00000104 - JB repeat_three_match_nolit_encodeBlockAsm_emit_copy - CMPL R10, $0x00010100 - JB repeat_four_match_nolit_encodeBlockAsm_emit_copy - CMPL R10, $0x0100ffff - JB repeat_five_match_nolit_encodeBlockAsm_emit_copy - LEAL -16842747(R10), R10 - MOVL $0xfffb001d, (CX) - MOVB $0xff, 4(CX) - ADDQ $0x05, CX - JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy - -repeat_five_match_nolit_encodeBlockAsm_emit_copy: - LEAL -65536(R10), R10 - MOVL R10, SI - MOVW $0x001d, (CX) - MOVW R10, 2(CX) - SARL $0x10, SI - MOVB SI, 4(CX) - ADDQ $0x05, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_four_match_nolit_encodeBlockAsm_emit_copy: - LEAL -256(R10), R10 - MOVW $0x0019, (CX) - MOVW R10, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm - 
-repeat_three_match_nolit_encodeBlockAsm_emit_copy: - LEAL -4(R10), R10 - MOVW $0x0015, (CX) - MOVB R10, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_two_match_nolit_encodeBlockAsm_emit_copy: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(CX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -four_bytes_remain_match_nolit_encodeBlockAsm: - TESTL R10, R10 - JZ match_nolit_emitcopy_end_encodeBlockAsm - XORL DI, DI - LEAL -1(DI)(R10*4), R10 - MOVB R10, (CX) - MOVL SI, 1(CX) - ADDQ $0x05, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -two_byte_offset_match_nolit_encodeBlockAsm: - CMPL R10, $0x40 - JBE two_byte_offset_short_match_nolit_encodeBlockAsm - CMPL SI, $0x00000800 - JAE long_offset_short_match_nolit_encodeBlockAsm - MOVL $0x00000001, DI - LEAL 16(DI), DI - MOVB SI, 1(CX) - MOVL SI, R8 - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, DI - MOVB DI, (CX) - ADDQ $0x02, CX - SUBL $0x08, R10 - - // emitRepeat - LEAL -4(R10), R10 - JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b - -emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b: - MOVL R10, DI - LEAL -4(R10), R10 - CMPL DI, $0x08 - JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b - CMPL DI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b - CMPL SI, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b - -cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b: - CMPL R10, $0x00000104 - JB repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b - CMPL R10, $0x00010100 - JB repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b - CMPL R10, $0x0100ffff - JB 
repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b - LEAL -16842747(R10), R10 - MOVL $0xfffb001d, (CX) - MOVB $0xff, 4(CX) - ADDQ $0x05, CX - JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b - -repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b: - LEAL -65536(R10), R10 - MOVL R10, SI - MOVW $0x001d, (CX) - MOVW R10, 2(CX) - SARL $0x10, SI - MOVB SI, 4(CX) - ADDQ $0x05, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b: - LEAL -256(R10), R10 - MOVW $0x0019, (CX) - MOVW R10, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b: - LEAL -4(R10), R10 - MOVW $0x0015, (CX) - MOVB R10, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b: - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(CX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -long_offset_short_match_nolit_encodeBlockAsm: - MOVB $0xee, (CX) - MOVW SI, 1(CX) - LEAL -60(R10), R10 - ADDQ $0x03, CX - - // emitRepeat -emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short: - MOVL R10, DI - LEAL -4(R10), R10 - CMPL DI, $0x08 - JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short - CMPL DI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short - CMPL SI, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: - CMPL R10, $0x00000104 - JB repeat_three_match_nolit_encodeBlockAsm_emit_copy_short - CMPL R10, $0x00010100 - JB repeat_four_match_nolit_encodeBlockAsm_emit_copy_short - CMPL R10, 
$0x0100ffff - JB repeat_five_match_nolit_encodeBlockAsm_emit_copy_short - LEAL -16842747(R10), R10 - MOVL $0xfffb001d, (CX) - MOVB $0xff, 4(CX) - ADDQ $0x05, CX - JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short - -repeat_five_match_nolit_encodeBlockAsm_emit_copy_short: - LEAL -65536(R10), R10 - MOVL R10, SI - MOVW $0x001d, (CX) - MOVW R10, 2(CX) - SARL $0x10, SI - MOVB SI, 4(CX) - ADDQ $0x05, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_four_match_nolit_encodeBlockAsm_emit_copy_short: - LEAL -256(R10), R10 - MOVW $0x0019, (CX) - MOVW R10, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_three_match_nolit_encodeBlockAsm_emit_copy_short: - LEAL -4(R10), R10 - MOVW $0x0015, (CX) - MOVB R10, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_two_match_nolit_encodeBlockAsm_emit_copy_short: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(CX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -two_byte_offset_short_match_nolit_encodeBlockAsm: - MOVL R10, DI - SHLL $0x02, DI - CMPL R10, $0x0c - JAE emit_copy_three_match_nolit_encodeBlockAsm - CMPL SI, $0x00000800 - JAE emit_copy_three_match_nolit_encodeBlockAsm - LEAL -15(DI), DI - MOVB SI, 1(CX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -emit_copy_three_match_nolit_encodeBlockAsm: - LEAL -2(DI), DI - MOVB DI, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - -match_nolit_emitcopy_end_encodeBlockAsm: - CMPL DX, 8(SP) - JAE emit_remainder_encodeBlockAsm - MOVQ -2(BX)(DX*1), DI - CMPQ CX, (SP) - JB match_nolit_dst_ok_encodeBlockAsm - MOVQ $0x00000000, ret+56(FP) - RET - 
-match_nolit_dst_ok_encodeBlockAsm: - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x10, R8 - IMULQ R9, R8 - SHRQ $0x32, R8 - SHLQ $0x10, SI - IMULQ R9, SI - SHRQ $0x32, SI - LEAL -2(DX), R9 - LEAQ (AX)(SI*4), R10 - MOVL (R10), SI - MOVL R9, (AX)(R8*4) - MOVL DX, (R10) - CMPL (BX)(SI*1), DI - JEQ match_nolit_loop_encodeBlockAsm - INCL DX - JMP search_loop_encodeBlockAsm - -emit_remainder_encodeBlockAsm: - MOVQ src_len+32(FP), AX - SUBL 12(SP), AX - LEAQ 5(CX)(AX*1), AX - CMPQ AX, (SP) - JB emit_remainder_ok_encodeBlockAsm - MOVQ $0x00000000, ret+56(FP) - RET - -emit_remainder_ok_encodeBlockAsm: - MOVQ src_len+32(FP), AX - MOVL 12(SP), DX - CMPL DX, AX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm - MOVL AX, SI - MOVL AX, 12(SP) - LEAQ (BX)(DX*1), AX - SUBL DX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeBlockAsm - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeBlockAsm - CMPL DX, $0x00010000 - JB three_bytes_emit_remainder_encodeBlockAsm - CMPL DX, $0x01000000 - JB four_bytes_emit_remainder_encodeBlockAsm - MOVB $0xfc, (CX) - MOVL DX, 1(CX) - ADDQ $0x05, CX - JMP memmove_long_emit_remainder_encodeBlockAsm - -four_bytes_emit_remainder_encodeBlockAsm: - MOVL DX, BX - SHRL $0x10, BX - MOVB $0xf8, (CX) - MOVW DX, 1(CX) - MOVB BL, 3(CX) - ADDQ $0x04, CX - JMP memmove_long_emit_remainder_encodeBlockAsm - -three_bytes_emit_remainder_encodeBlockAsm: - MOVB $0xf4, (CX) - MOVW DX, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_emit_remainder_encodeBlockAsm - -two_bytes_emit_remainder_encodeBlockAsm: - MOVB $0xf0, (CX) - MOVB DL, 1(CX) - ADDQ $0x02, CX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeBlockAsm - JMP memmove_long_emit_remainder_encodeBlockAsm - -one_byte_emit_remainder_encodeBlockAsm: - SHLB $0x02, DL - MOVB DL, (CX) - ADDQ $0x01, CX - -memmove_emit_remainder_encodeBlockAsm: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB 
emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2: - MOVB (AX), SI - MOVB -1(AX)(BX*1), AL - MOVB SI, (CX) - MOVB AL, -1(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm - -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3: - MOVW (AX), SI - MOVB 2(AX), AL - MOVW SI, (CX) - MOVB AL, 2(CX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm - -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7: - MOVL (AX), SI - MOVL -4(AX)(BX*1), AX - MOVL SI, (CX) - MOVL AX, -4(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm - -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16: - MOVQ (AX), SI - MOVQ -8(AX)(BX*1), AX - MOVQ SI, (CX) - MOVQ AX, -8(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm - -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32: - MOVOU (AX), X0 - MOVOU -16(AX)(BX*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm - -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64: - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBlockAsm: - MOVQ DX, CX - JMP emit_literal_done_emit_remainder_encodeBlockAsm - -memmove_long_emit_remainder_encodeBlockAsm: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (AX), 
X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(AX)(R8*1), X4 - MOVOU -16(AX)(R8*1), X5 - MOVOA X4, -32(CX)(R8*1) - MOVOA X5, -16(CX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - MOVQ DX, CX - -emit_literal_done_emit_remainder_encodeBlockAsm: - MOVQ dst_base+0(FP), AX - SUBQ AX, CX - MOVQ CX, ret+56(FP) - RET - -// func encodeBlockAsm4MB(dst []byte, src []byte, tmp *[65536]byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBlockAsm4MB(SB), $24-64 - MOVQ tmp+48(FP), AX - MOVQ dst_base+0(FP), CX - MOVQ $0x00000200, DX - MOVQ AX, BX - PXOR X0, X0 - -zero_loop_encodeBlockAsm4MB: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeBlockAsm4MB - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), DX - LEAQ -9(DX), BX - LEAQ -8(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL DX, 16(SP) - MOVQ src_base+24(FP), BX - -search_loop_encodeBlockAsm4MB: - MOVL DX, SI - SUBL 12(SP), SI - SHRL $0x06, SI - LEAL 4(DX)(SI*1), SI - CMPL SI, 8(SP) - JAE 
emit_remainder_encodeBlockAsm4MB - MOVQ (BX)(DX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - SHLQ $0x10, R11 - IMULQ R9, R11 - SHRQ $0x32, R11 - MOVL (AX)(R10*4), SI - MOVL (AX)(R11*4), R8 - MOVL DX, (AX)(R10*4) - LEAL 1(DX), R10 - MOVL R10, (AX)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - MOVL DX, R9 - SUBL 16(SP), R9 - MOVL 1(BX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeBlockAsm4MB - LEAL 1(DX), DI - MOVL 12(SP), R8 - MOVL DI, SI - SUBL 16(SP), SI - JZ repeat_extend_back_end_encodeBlockAsm4MB - -repeat_extend_back_loop_encodeBlockAsm4MB: - CMPL DI, R8 - JBE repeat_extend_back_end_encodeBlockAsm4MB - MOVB -1(BX)(SI*1), R9 - MOVB -1(BX)(DI*1), R10 - CMPB R9, R10 - JNE repeat_extend_back_end_encodeBlockAsm4MB - LEAL -1(DI), DI - DECL SI - JNZ repeat_extend_back_loop_encodeBlockAsm4MB - -repeat_extend_back_end_encodeBlockAsm4MB: - MOVL DI, SI - SUBL 12(SP), SI - LEAQ 4(CX)(SI*1), SI - CMPQ SI, (SP) - JB repeat_dst_size_check_encodeBlockAsm4MB - MOVQ $0x00000000, ret+56(FP) - RET - -repeat_dst_size_check_encodeBlockAsm4MB: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JB one_byte_repeat_emit_encodeBlockAsm4MB - CMPL SI, $0x00000100 - JB two_bytes_repeat_emit_encodeBlockAsm4MB - CMPL SI, $0x00010000 - JB three_bytes_repeat_emit_encodeBlockAsm4MB - MOVL SI, R11 - SHRL $0x10, R11 - MOVB $0xf8, (CX) - MOVW SI, 1(CX) - MOVB R11, 3(CX) - ADDQ $0x04, CX - JMP memmove_long_repeat_emit_encodeBlockAsm4MB - -three_bytes_repeat_emit_encodeBlockAsm4MB: - MOVB $0xf4, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_repeat_emit_encodeBlockAsm4MB - -two_bytes_repeat_emit_encodeBlockAsm4MB: - MOVB $0xf0, (CX) - MOVB SI, 1(CX) - 
ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_repeat_emit_encodeBlockAsm4MB - JMP memmove_long_repeat_emit_encodeBlockAsm4MB - -one_byte_repeat_emit_encodeBlockAsm4MB: - SHLB $0x02, SI - MOVB SI, (CX) - ADDQ $0x01, CX - -memmove_repeat_emit_encodeBlockAsm4MB: - LEAQ (CX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (CX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB - -emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (CX) - MOVQ R10, -8(CX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB - -emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB - -emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_repeat_emit_encodeBlockAsm4MB: - MOVQ SI, CX - JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB - -memmove_long_repeat_emit_encodeBlockAsm4MB: - LEAQ (CX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R12 - SHRQ $0x05, R12 - MOVQ CX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R13 - SUBQ R11, R13 - DECQ R12 - JA 
emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(R10)(R13*1), R11 - LEAQ -32(CX)(R13*1), R14 - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R11 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(R10)(R13*1), X4 - MOVOU -16(R10)(R13*1), X5 - MOVOA X4, -32(CX)(R13*1) - MOVOA X5, -16(CX)(R13*1) - ADDQ $0x20, R13 - CMPQ R9, R13 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ SI, CX - -emit_literal_done_repeat_emit_encodeBlockAsm4MB: - ADDL $0x05, DX - MOVL DX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R9 - SUBL DX, R9 - LEAQ (BX)(DX*1), R10 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R12, R12 - -matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB: - CMPL R9, $0x10 - JB matchlen_match8_repeat_extend_encodeBlockAsm4MB - MOVQ (R10)(R12*1), R11 - MOVQ 8(R10)(R12*1), R13 - XORQ (SI)(R12*1), R11 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB - XORQ 8(SI)(R12*1), R13 - JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm4MB - LEAL -16(R9), R9 - LEAL 16(R12), R12 - JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB - -matchlen_bsf_16repeat_extend_encodeBlockAsm4MB: -#ifdef GOAMD64_v3 - TZCNTQ R13, R13 - -#else - BSFQ R13, R13 - -#endif - SARQ $0x03, R13 - LEAL 8(R12)(R13*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm4MB - -matchlen_match8_repeat_extend_encodeBlockAsm4MB: - CMPL R9, $0x08 - JB matchlen_match4_repeat_extend_encodeBlockAsm4MB - MOVQ (R10)(R12*1), R11 - XORQ (SI)(R12*1), R11 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB - LEAL -8(R9), R9 - LEAL 8(R12), R12 - JMP 
matchlen_match4_repeat_extend_encodeBlockAsm4MB - -matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm4MB - -matchlen_match4_repeat_extend_encodeBlockAsm4MB: - CMPL R9, $0x04 - JB matchlen_match2_repeat_extend_encodeBlockAsm4MB - MOVL (R10)(R12*1), R11 - CMPL (SI)(R12*1), R11 - JNE matchlen_match2_repeat_extend_encodeBlockAsm4MB - LEAL -4(R9), R9 - LEAL 4(R12), R12 - -matchlen_match2_repeat_extend_encodeBlockAsm4MB: - CMPL R9, $0x01 - JE matchlen_match1_repeat_extend_encodeBlockAsm4MB - JB repeat_extend_forward_end_encodeBlockAsm4MB - MOVW (R10)(R12*1), R11 - CMPW (SI)(R12*1), R11 - JNE matchlen_match1_repeat_extend_encodeBlockAsm4MB - LEAL 2(R12), R12 - SUBL $0x02, R9 - JZ repeat_extend_forward_end_encodeBlockAsm4MB - -matchlen_match1_repeat_extend_encodeBlockAsm4MB: - MOVB (R10)(R12*1), R11 - CMPB (SI)(R12*1), R11 - JNE repeat_extend_forward_end_encodeBlockAsm4MB - LEAL 1(R12), R12 - -repeat_extend_forward_end_encodeBlockAsm4MB: - ADDL R12, DX - MOVL DX, SI - SUBL DI, SI - MOVL 16(SP), DI - TESTL R8, R8 - JZ repeat_as_copy_encodeBlockAsm4MB - - // emitRepeat - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JBE repeat_two_match_repeat_encodeBlockAsm4MB - CMPL R8, $0x0c - JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB - CMPL DI, $0x00000800 - JB repeat_two_offset_match_repeat_encodeBlockAsm4MB - -cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB: - CMPL SI, $0x00000104 - JB repeat_three_match_repeat_encodeBlockAsm4MB - CMPL SI, $0x00010100 - JB repeat_four_match_repeat_encodeBlockAsm4MB - LEAL -65536(SI), SI - MOVL SI, DI - MOVW $0x001d, (CX) - MOVW SI, 2(CX) - SARL $0x10, DI - MOVB DI, 4(CX) - ADDQ $0x05, CX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_four_match_repeat_encodeBlockAsm4MB: - LEAL -256(SI), SI - MOVW $0x0019, (CX) - MOVW SI, 2(CX) - ADDQ $0x04, CX - JMP 
repeat_end_emit_encodeBlockAsm4MB - -repeat_three_match_repeat_encodeBlockAsm4MB: - LEAL -4(SI), SI - MOVW $0x0015, (CX) - MOVB SI, 2(CX) - ADDQ $0x03, CX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_match_repeat_encodeBlockAsm4MB: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_offset_match_repeat_encodeBlockAsm4MB: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(CX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_as_copy_encodeBlockAsm4MB: - // emitCopy - CMPL DI, $0x00010000 - JB two_byte_offset_repeat_as_copy_encodeBlockAsm4MB - CMPL SI, $0x40 - JBE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB - MOVB $0xff, (CX) - MOVL DI, 1(CX) - LEAL -64(SI), SI - ADDQ $0x05, CX - CMPL SI, $0x04 - JB four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB - - // emitRepeat - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy - CMPL R8, $0x0c - JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy - CMPL DI, $0x00000800 - JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - CMPL SI, $0x00000104 - JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy - CMPL SI, $0x00010100 - JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy - LEAL -65536(SI), SI - MOVL SI, DI - MOVW $0x001d, (CX) - MOVW SI, 2(CX) - SARL $0x10, DI - MOVB DI, 4(CX) - ADDQ $0x05, CX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - LEAL -256(SI), SI - MOVW $0x0019, (CX) - MOVW SI, 2(CX) - ADDQ $0x04, CX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - LEAL -4(SI), SI - MOVW $0x0015, (CX) - MOVB SI, 2(CX) - ADDQ $0x03, CX - JMP repeat_end_emit_encodeBlockAsm4MB - 
-repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(CX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm4MB - -four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB: - TESTL SI, SI - JZ repeat_end_emit_encodeBlockAsm4MB - XORL R8, R8 - LEAL -1(R8)(SI*4), SI - MOVB SI, (CX) - MOVL DI, 1(CX) - ADDQ $0x05, CX - JMP repeat_end_emit_encodeBlockAsm4MB - -two_byte_offset_repeat_as_copy_encodeBlockAsm4MB: - CMPL SI, $0x40 - JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB - CMPL DI, $0x00000800 - JAE long_offset_short_repeat_as_copy_encodeBlockAsm4MB - MOVL $0x00000001, R8 - LEAL 16(R8), R8 - MOVB DI, 1(CX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, R8 - MOVB R8, (CX) - ADDQ $0x02, CX - SUBL $0x08, SI - - // emitRepeat - LEAL -4(SI), SI - JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b - CMPL R8, $0x0c - JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b - CMPL DI, $0x00000800 - JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: - CMPL SI, $0x00000104 - JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b - CMPL SI, $0x00010100 - JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b - LEAL -65536(SI), SI - MOVL SI, DI - MOVW $0x001d, (CX) - MOVW SI, 2(CX) - SARL $0x10, DI - MOVB DI, 4(CX) - ADDQ $0x05, CX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: - LEAL -256(SI), SI - MOVW $0x0019, (CX) - MOVW SI, 2(CX) - ADDQ 
$0x04, CX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: - LEAL -4(SI), SI - MOVW $0x0015, (CX) - MOVB SI, 2(CX) - ADDQ $0x03, CX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(CX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm4MB - -long_offset_short_repeat_as_copy_encodeBlockAsm4MB: - MOVB $0xee, (CX) - MOVW DI, 1(CX) - LEAL -60(SI), SI - ADDQ $0x03, CX - - // emitRepeat - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short - CMPL R8, $0x0c - JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short - CMPL DI, $0x00000800 - JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - CMPL SI, $0x00000104 - JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short - CMPL SI, $0x00010100 - JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short - LEAL -65536(SI), SI - MOVL SI, DI - MOVW $0x001d, (CX) - MOVW SI, 2(CX) - SARL $0x10, DI - MOVB DI, 4(CX) - ADDQ $0x05, CX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - LEAL -256(SI), SI - MOVW $0x0019, (CX) - MOVW SI, 2(CX) - ADDQ $0x04, CX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - LEAL -4(SI), SI - MOVW $0x0015, (CX) - MOVB SI, 2(CX) - ADDQ $0x03, CX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (CX) - 
ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(CX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm4MB - -two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB: - MOVL SI, R8 - SHLL $0x02, R8 - CMPL SI, $0x0c - JAE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB - CMPL DI, $0x00000800 - JAE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB - LEAL -15(R8), R8 - MOVB DI, 1(CX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, R8 - MOVB R8, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm4MB - -emit_copy_three_repeat_as_copy_encodeBlockAsm4MB: - LEAL -2(R8), R8 - MOVB R8, (CX) - MOVW DI, 1(CX) - ADDQ $0x03, CX - -repeat_end_emit_encodeBlockAsm4MB: - MOVL DX, 12(SP) - JMP search_loop_encodeBlockAsm4MB - -no_repeat_found_encodeBlockAsm4MB: - CMPL (BX)(SI*1), DI - JEQ candidate_match_encodeBlockAsm4MB - SHRQ $0x08, DI - MOVL (AX)(R10*4), SI - LEAL 2(DX), R9 - CMPL (BX)(R8*1), DI - JEQ candidate2_match_encodeBlockAsm4MB - MOVL R9, (AX)(R10*4) - SHRQ $0x08, DI - CMPL (BX)(SI*1), DI - JEQ candidate3_match_encodeBlockAsm4MB - MOVL 20(SP), DX - JMP search_loop_encodeBlockAsm4MB - -candidate3_match_encodeBlockAsm4MB: - ADDL $0x02, DX - JMP candidate_match_encodeBlockAsm4MB - -candidate2_match_encodeBlockAsm4MB: - MOVL R9, (AX)(R10*4) - INCL DX - MOVL R8, SI - -candidate_match_encodeBlockAsm4MB: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBlockAsm4MB - -match_extend_back_loop_encodeBlockAsm4MB: - CMPL DX, DI - JBE match_extend_back_end_encodeBlockAsm4MB - MOVB -1(BX)(SI*1), R8 - MOVB -1(BX)(DX*1), R9 - CMPB R8, R9 - JNE match_extend_back_end_encodeBlockAsm4MB - LEAL -1(DX), DX - DECL SI - JZ match_extend_back_end_encodeBlockAsm4MB - JMP match_extend_back_loop_encodeBlockAsm4MB - -match_extend_back_end_encodeBlockAsm4MB: - MOVL DX, DI - SUBL 12(SP), DI - LEAQ 
4(CX)(DI*1), DI - CMPQ DI, (SP) - JB match_dst_size_check_encodeBlockAsm4MB - MOVQ $0x00000000, ret+56(FP) - RET - -match_dst_size_check_encodeBlockAsm4MB: - MOVL DX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeBlockAsm4MB - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JB one_byte_match_emit_encodeBlockAsm4MB - CMPL R8, $0x00000100 - JB two_bytes_match_emit_encodeBlockAsm4MB - CMPL R8, $0x00010000 - JB three_bytes_match_emit_encodeBlockAsm4MB - MOVL R8, R10 - SHRL $0x10, R10 - MOVB $0xf8, (CX) - MOVW R8, 1(CX) - MOVB R10, 3(CX) - ADDQ $0x04, CX - JMP memmove_long_match_emit_encodeBlockAsm4MB - -three_bytes_match_emit_encodeBlockAsm4MB: - MOVB $0xf4, (CX) - MOVW R8, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_encodeBlockAsm4MB - -two_bytes_match_emit_encodeBlockAsm4MB: - MOVB $0xf0, (CX) - MOVB R8, 1(CX) - ADDQ $0x02, CX - CMPL R8, $0x40 - JB memmove_match_emit_encodeBlockAsm4MB - JMP memmove_long_match_emit_encodeBlockAsm4MB - -one_byte_match_emit_encodeBlockAsm4MB: - SHLB $0x02, R8 - MOVB R8, (CX) - ADDQ $0x01, CX - -memmove_match_emit_encodeBlockAsm4MB: - LEAQ (CX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (CX) - JMP memmove_end_copy_match_emit_encodeBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (CX) - MOVQ DI, -8(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32: - 
MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_match_emit_encodeBlockAsm4MB: - MOVQ R8, CX - JMP emit_literal_done_match_emit_encodeBlockAsm4MB - -memmove_long_match_emit_encodeBlockAsm4MB: - LEAQ (CX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(CX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(CX)(R12*1) - MOVOA X5, -16(CX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ R8, CX - -emit_literal_done_match_emit_encodeBlockAsm4MB: -match_nolit_loop_encodeBlockAsm4MB: - MOVL DX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, DX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL DX, DI - LEAQ (BX)(DX*1), R8 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R10, R10 - -matchlen_loopback_16_match_nolit_encodeBlockAsm4MB: - CMPL DI, $0x10 - JB 
matchlen_match8_match_nolit_encodeBlockAsm4MB - MOVQ (R8)(R10*1), R9 - MOVQ 8(R8)(R10*1), R11 - XORQ (SI)(R10*1), R9 - JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4MB - XORQ 8(SI)(R10*1), R11 - JNZ matchlen_bsf_16match_nolit_encodeBlockAsm4MB - LEAL -16(DI), DI - LEAL 16(R10), R10 - JMP matchlen_loopback_16_match_nolit_encodeBlockAsm4MB - -matchlen_bsf_16match_nolit_encodeBlockAsm4MB: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL 8(R10)(R11*1), R10 - JMP match_nolit_end_encodeBlockAsm4MB - -matchlen_match8_match_nolit_encodeBlockAsm4MB: - CMPL DI, $0x08 - JB matchlen_match4_match_nolit_encodeBlockAsm4MB - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4MB - LEAL -8(DI), DI - LEAL 8(R10), R10 - JMP matchlen_match4_match_nolit_encodeBlockAsm4MB - -matchlen_bsf_8_match_nolit_encodeBlockAsm4MB: -#ifdef GOAMD64_v3 - TZCNTQ R9, R9 - -#else - BSFQ R9, R9 - -#endif - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm4MB - -matchlen_match4_match_nolit_encodeBlockAsm4MB: - CMPL DI, $0x04 - JB matchlen_match2_match_nolit_encodeBlockAsm4MB - MOVL (R8)(R10*1), R9 - CMPL (SI)(R10*1), R9 - JNE matchlen_match2_match_nolit_encodeBlockAsm4MB - LEAL -4(DI), DI - LEAL 4(R10), R10 - -matchlen_match2_match_nolit_encodeBlockAsm4MB: - CMPL DI, $0x01 - JE matchlen_match1_match_nolit_encodeBlockAsm4MB - JB match_nolit_end_encodeBlockAsm4MB - MOVW (R8)(R10*1), R9 - CMPW (SI)(R10*1), R9 - JNE matchlen_match1_match_nolit_encodeBlockAsm4MB - LEAL 2(R10), R10 - SUBL $0x02, DI - JZ match_nolit_end_encodeBlockAsm4MB - -matchlen_match1_match_nolit_encodeBlockAsm4MB: - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeBlockAsm4MB - LEAL 1(R10), R10 - -match_nolit_end_encodeBlockAsm4MB: - ADDL R10, DX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL DX, 12(SP) - - // emitCopy - CMPL SI, $0x00010000 - JB two_byte_offset_match_nolit_encodeBlockAsm4MB - CMPL R10, $0x40 
- JBE four_bytes_remain_match_nolit_encodeBlockAsm4MB - MOVB $0xff, (CX) - MOVL SI, 1(CX) - LEAL -64(R10), R10 - ADDQ $0x05, CX - CMPL R10, $0x04 - JB four_bytes_remain_match_nolit_encodeBlockAsm4MB - - // emitRepeat - MOVL R10, DI - LEAL -4(R10), R10 - CMPL DI, $0x08 - JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy - CMPL DI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy - CMPL SI, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy - -cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: - CMPL R10, $0x00000104 - JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy - CMPL R10, $0x00010100 - JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy - LEAL -65536(R10), R10 - MOVL R10, SI - MOVW $0x001d, (CX) - MOVW R10, 2(CX) - SARL $0x10, SI - MOVB SI, 4(CX) - ADDQ $0x05, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy: - LEAL -256(R10), R10 - MOVW $0x0019, (CX) - MOVW R10, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy: - LEAL -4(R10), R10 - MOVW $0x0015, (CX) - MOVB R10, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(CX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -four_bytes_remain_match_nolit_encodeBlockAsm4MB: - TESTL R10, R10 - JZ match_nolit_emitcopy_end_encodeBlockAsm4MB - XORL DI, DI - LEAL -1(DI)(R10*4), R10 - MOVB R10, (CX) - MOVL SI, 1(CX) - ADDQ $0x05, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -two_byte_offset_match_nolit_encodeBlockAsm4MB: 
- CMPL R10, $0x40 - JBE two_byte_offset_short_match_nolit_encodeBlockAsm4MB - CMPL SI, $0x00000800 - JAE long_offset_short_match_nolit_encodeBlockAsm4MB - MOVL $0x00000001, DI - LEAL 16(DI), DI - MOVB SI, 1(CX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (CX) - ADDQ $0x02, CX - SUBL $0x08, R10 - - // emitRepeat - LEAL -4(R10), R10 - JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b - MOVL R10, DI - LEAL -4(R10), R10 - CMPL DI, $0x08 - JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b - CMPL DI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b - CMPL SI, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b - -cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: - CMPL R10, $0x00000104 - JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b - CMPL R10, $0x00010100 - JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b - LEAL -65536(R10), R10 - MOVL R10, SI - MOVW $0x001d, (CX) - MOVW R10, 2(CX) - SARL $0x10, SI - MOVB SI, 4(CX) - ADDQ $0x05, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: - LEAL -256(R10), R10 - MOVW $0x0019, (CX) - MOVW R10, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: - LEAL -4(R10), R10 - MOVW $0x0015, (CX) - MOVB R10, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(CX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - 
-long_offset_short_match_nolit_encodeBlockAsm4MB: - MOVB $0xee, (CX) - MOVW SI, 1(CX) - LEAL -60(R10), R10 - ADDQ $0x03, CX - - // emitRepeat - MOVL R10, DI - LEAL -4(R10), R10 - CMPL DI, $0x08 - JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short - CMPL DI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short - CMPL SI, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: - CMPL R10, $0x00000104 - JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short - CMPL R10, $0x00010100 - JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short - LEAL -65536(R10), R10 - MOVL R10, SI - MOVW $0x001d, (CX) - MOVW R10, 2(CX) - SARL $0x10, SI - MOVB SI, 4(CX) - ADDQ $0x05, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short: - LEAL -256(R10), R10 - MOVW $0x0019, (CX) - MOVW R10, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short: - LEAL -4(R10), R10 - MOVW $0x0015, (CX) - MOVB R10, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(CX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -two_byte_offset_short_match_nolit_encodeBlockAsm4MB: - MOVL R10, DI - SHLL $0x02, DI - CMPL R10, $0x0c - JAE emit_copy_three_match_nolit_encodeBlockAsm4MB - CMPL SI, $0x00000800 - JAE emit_copy_three_match_nolit_encodeBlockAsm4MB - LEAL -15(DI), DI - MOVB SI, 1(CX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - 
MOVB DI, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -emit_copy_three_match_nolit_encodeBlockAsm4MB: - LEAL -2(DI), DI - MOVB DI, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - -match_nolit_emitcopy_end_encodeBlockAsm4MB: - CMPL DX, 8(SP) - JAE emit_remainder_encodeBlockAsm4MB - MOVQ -2(BX)(DX*1), DI - CMPQ CX, (SP) - JB match_nolit_dst_ok_encodeBlockAsm4MB - MOVQ $0x00000000, ret+56(FP) - RET - -match_nolit_dst_ok_encodeBlockAsm4MB: - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x10, R8 - IMULQ R9, R8 - SHRQ $0x32, R8 - SHLQ $0x10, SI - IMULQ R9, SI - SHRQ $0x32, SI - LEAL -2(DX), R9 - LEAQ (AX)(SI*4), R10 - MOVL (R10), SI - MOVL R9, (AX)(R8*4) - MOVL DX, (R10) - CMPL (BX)(SI*1), DI - JEQ match_nolit_loop_encodeBlockAsm4MB - INCL DX - JMP search_loop_encodeBlockAsm4MB - -emit_remainder_encodeBlockAsm4MB: - MOVQ src_len+32(FP), AX - SUBL 12(SP), AX - LEAQ 4(CX)(AX*1), AX - CMPQ AX, (SP) - JB emit_remainder_ok_encodeBlockAsm4MB - MOVQ $0x00000000, ret+56(FP) - RET - -emit_remainder_ok_encodeBlockAsm4MB: - MOVQ src_len+32(FP), AX - MOVL 12(SP), DX - CMPL DX, AX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm4MB - MOVL AX, SI - MOVL AX, 12(SP) - LEAQ (BX)(DX*1), AX - SUBL DX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeBlockAsm4MB - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeBlockAsm4MB - CMPL DX, $0x00010000 - JB three_bytes_emit_remainder_encodeBlockAsm4MB - MOVL DX, BX - SHRL $0x10, BX - MOVB $0xf8, (CX) - MOVW DX, 1(CX) - MOVB BL, 3(CX) - ADDQ $0x04, CX - JMP memmove_long_emit_remainder_encodeBlockAsm4MB - -three_bytes_emit_remainder_encodeBlockAsm4MB: - MOVB $0xf4, (CX) - MOVW DX, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_emit_remainder_encodeBlockAsm4MB - -two_bytes_emit_remainder_encodeBlockAsm4MB: - MOVB $0xf0, (CX) - MOVB DL, 1(CX) - ADDQ $0x02, CX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeBlockAsm4MB - JMP 
memmove_long_emit_remainder_encodeBlockAsm4MB - -one_byte_emit_remainder_encodeBlockAsm4MB: - SHLB $0x02, DL - MOVB DL, (CX) - ADDQ $0x01, CX - -memmove_emit_remainder_encodeBlockAsm4MB: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2: - MOVB (AX), SI - MOVB -1(AX)(BX*1), AL - MOVB SI, (CX) - MOVB AL, -1(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3: - MOVW (AX), SI - MOVB 2(AX), AL - MOVW SI, (CX) - MOVB AL, 2(CX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7: - MOVL (AX), SI - MOVL -4(AX)(BX*1), AX - MOVL SI, (CX) - MOVL AX, -4(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16: - MOVQ (AX), SI - MOVQ -8(AX)(BX*1), AX - MOVQ SI, (CX) - MOVQ AX, -8(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32: - MOVOU (AX), X0 - MOVOU -16(AX)(BX*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64: - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVOU 
X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBlockAsm4MB: - MOVQ DX, CX - JMP emit_literal_done_emit_remainder_encodeBlockAsm4MB - -memmove_long_emit_remainder_encodeBlockAsm4MB: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(AX)(R8*1), X4 - MOVOU -16(AX)(R8*1), X5 - MOVOA X4, -32(CX)(R8*1) - MOVOA X5, -16(CX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - MOVQ DX, CX - -emit_literal_done_emit_remainder_encodeBlockAsm4MB: - MOVQ dst_base+0(FP), AX - SUBQ AX, CX - MOVQ CX, ret+56(FP) - RET - -// func encodeBlockAsm12B(dst []byte, src []byte, tmp *[16384]byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBlockAsm12B(SB), $24-64 - MOVQ tmp+48(FP), AX - MOVQ dst_base+0(FP), CX - MOVQ $0x00000080, DX - MOVQ AX, BX - PXOR X0, X0 - -zero_loop_encodeBlockAsm12B: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeBlockAsm12B - MOVL $0x00000000, 12(SP) - MOVQ 
src_len+32(FP), DX - LEAQ -9(DX), BX - LEAQ -8(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL DX, 16(SP) - MOVQ src_base+24(FP), BX - -search_loop_encodeBlockAsm12B: - MOVL DX, SI - SUBL 12(SP), SI - SHRL $0x05, SI - LEAL 4(DX)(SI*1), SI - CMPL SI, 8(SP) - JAE emit_remainder_encodeBlockAsm12B - MOVQ (BX)(DX*1), DI - MOVL SI, 20(SP) - MOVQ $0x000000cf1bbcdcbb, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x18, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - SHLQ $0x18, R11 - IMULQ R9, R11 - SHRQ $0x34, R11 - MOVL (AX)(R10*4), SI - MOVL (AX)(R11*4), R8 - MOVL DX, (AX)(R10*4) - LEAL 1(DX), R10 - MOVL R10, (AX)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x18, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - MOVL DX, R9 - SUBL 16(SP), R9 - MOVL 1(BX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeBlockAsm12B - LEAL 1(DX), DI - MOVL 12(SP), R8 - MOVL DI, SI - SUBL 16(SP), SI - JZ repeat_extend_back_end_encodeBlockAsm12B - -repeat_extend_back_loop_encodeBlockAsm12B: - CMPL DI, R8 - JBE repeat_extend_back_end_encodeBlockAsm12B - MOVB -1(BX)(SI*1), R9 - MOVB -1(BX)(DI*1), R10 - CMPB R9, R10 - JNE repeat_extend_back_end_encodeBlockAsm12B - LEAL -1(DI), DI - DECL SI - JNZ repeat_extend_back_loop_encodeBlockAsm12B - -repeat_extend_back_end_encodeBlockAsm12B: - MOVL DI, SI - SUBL 12(SP), SI - LEAQ 3(CX)(SI*1), SI - CMPQ SI, (SP) - JB repeat_dst_size_check_encodeBlockAsm12B - MOVQ $0x00000000, ret+56(FP) - RET - -repeat_dst_size_check_encodeBlockAsm12B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JB one_byte_repeat_emit_encodeBlockAsm12B - CMPL SI, $0x00000100 - JB two_bytes_repeat_emit_encodeBlockAsm12B - JB three_bytes_repeat_emit_encodeBlockAsm12B - -three_bytes_repeat_emit_encodeBlockAsm12B: - MOVB $0xf4, (CX) - 
MOVW SI, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_repeat_emit_encodeBlockAsm12B - -two_bytes_repeat_emit_encodeBlockAsm12B: - MOVB $0xf0, (CX) - MOVB SI, 1(CX) - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_repeat_emit_encodeBlockAsm12B - JMP memmove_long_repeat_emit_encodeBlockAsm12B - -one_byte_repeat_emit_encodeBlockAsm12B: - SHLB $0x02, SI - MOVB SI, (CX) - ADDQ $0x01, CX - -memmove_repeat_emit_encodeBlockAsm12B: - LEAQ (CX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (CX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (CX) - MOVQ R10, -8(CX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_repeat_emit_encodeBlockAsm12B: - MOVQ SI, CX - JMP emit_literal_done_repeat_emit_encodeBlockAsm12B - -memmove_long_repeat_emit_encodeBlockAsm12B: - LEAQ (CX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R12 - SHRQ $0x05, R12 - 
MOVQ CX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R13 - SUBQ R11, R13 - DECQ R12 - JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R13*1), R11 - LEAQ -32(CX)(R13*1), R14 - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R11 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R13*1), X4 - MOVOU -16(R10)(R13*1), X5 - MOVOA X4, -32(CX)(R13*1) - MOVOA X5, -16(CX)(R13*1) - ADDQ $0x20, R13 - CMPQ R9, R13 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ SI, CX - -emit_literal_done_repeat_emit_encodeBlockAsm12B: - ADDL $0x05, DX - MOVL DX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R9 - SUBL DX, R9 - LEAQ (BX)(DX*1), R10 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R12, R12 - -matchlen_loopback_16_repeat_extend_encodeBlockAsm12B: - CMPL R9, $0x10 - JB matchlen_match8_repeat_extend_encodeBlockAsm12B - MOVQ (R10)(R12*1), R11 - MOVQ 8(R10)(R12*1), R13 - XORQ (SI)(R12*1), R11 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm12B - XORQ 8(SI)(R12*1), R13 - JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm12B - LEAL -16(R9), R9 - LEAL 16(R12), R12 - JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm12B - -matchlen_bsf_16repeat_extend_encodeBlockAsm12B: -#ifdef GOAMD64_v3 - TZCNTQ R13, R13 - -#else - BSFQ R13, R13 - -#endif - SARQ $0x03, R13 - LEAL 8(R12)(R13*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm12B - -matchlen_match8_repeat_extend_encodeBlockAsm12B: - CMPL R9, $0x08 - JB matchlen_match4_repeat_extend_encodeBlockAsm12B - MOVQ (R10)(R12*1), R11 - XORQ (SI)(R12*1), R11 - JNZ 
matchlen_bsf_8_repeat_extend_encodeBlockAsm12B - LEAL -8(R9), R9 - LEAL 8(R12), R12 - JMP matchlen_match4_repeat_extend_encodeBlockAsm12B - -matchlen_bsf_8_repeat_extend_encodeBlockAsm12B: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm12B - -matchlen_match4_repeat_extend_encodeBlockAsm12B: - CMPL R9, $0x04 - JB matchlen_match2_repeat_extend_encodeBlockAsm12B - MOVL (R10)(R12*1), R11 - CMPL (SI)(R12*1), R11 - JNE matchlen_match2_repeat_extend_encodeBlockAsm12B - LEAL -4(R9), R9 - LEAL 4(R12), R12 - -matchlen_match2_repeat_extend_encodeBlockAsm12B: - CMPL R9, $0x01 - JE matchlen_match1_repeat_extend_encodeBlockAsm12B - JB repeat_extend_forward_end_encodeBlockAsm12B - MOVW (R10)(R12*1), R11 - CMPW (SI)(R12*1), R11 - JNE matchlen_match1_repeat_extend_encodeBlockAsm12B - LEAL 2(R12), R12 - SUBL $0x02, R9 - JZ repeat_extend_forward_end_encodeBlockAsm12B - -matchlen_match1_repeat_extend_encodeBlockAsm12B: - MOVB (R10)(R12*1), R11 - CMPB (SI)(R12*1), R11 - JNE repeat_extend_forward_end_encodeBlockAsm12B - LEAL 1(R12), R12 - -repeat_extend_forward_end_encodeBlockAsm12B: - ADDL R12, DX - MOVL DX, SI - SUBL DI, SI - MOVL 16(SP), DI - TESTL R8, R8 - JZ repeat_as_copy_encodeBlockAsm12B - - // emitRepeat - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JBE repeat_two_match_repeat_encodeBlockAsm12B - CMPL R8, $0x0c - JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B - CMPL DI, $0x00000800 - JB repeat_two_offset_match_repeat_encodeBlockAsm12B - -cant_repeat_two_offset_match_repeat_encodeBlockAsm12B: - CMPL SI, $0x00000104 - JB repeat_three_match_repeat_encodeBlockAsm12B - LEAL -256(SI), SI - MOVW $0x0019, (CX) - MOVW SI, 2(CX) - ADDQ $0x04, CX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_three_match_repeat_encodeBlockAsm12B: - LEAL -4(SI), SI - MOVW $0x0015, (CX) - MOVB SI, 2(CX) - ADDQ $0x03, CX - JMP repeat_end_emit_encodeBlockAsm12B - 
-repeat_two_match_repeat_encodeBlockAsm12B: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_two_offset_match_repeat_encodeBlockAsm12B: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(CX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_as_copy_encodeBlockAsm12B: - // emitCopy - CMPL SI, $0x40 - JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B - CMPL DI, $0x00000800 - JAE long_offset_short_repeat_as_copy_encodeBlockAsm12B - MOVL $0x00000001, R8 - LEAL 16(R8), R8 - MOVB DI, 1(CX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, R8 - MOVB R8, (CX) - ADDQ $0x02, CX - SUBL $0x08, SI - - // emitRepeat - LEAL -4(SI), SI - JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JBE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b - CMPL R8, $0x0c - JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b - CMPL DI, $0x00000800 - JB repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: - CMPL SI, $0x00000104 - JB repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b - LEAL -256(SI), SI - MOVW $0x0019, (CX) - MOVW SI, 2(CX) - ADDQ $0x04, CX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: - LEAL -4(SI), SI - MOVW $0x0015, (CX) - MOVB SI, 2(CX) - ADDQ $0x03, CX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(CX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, 
(CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm12B - -long_offset_short_repeat_as_copy_encodeBlockAsm12B: - MOVB $0xee, (CX) - MOVW DI, 1(CX) - LEAL -60(SI), SI - ADDQ $0x03, CX - - // emitRepeat - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JBE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - CMPL R8, $0x0c - JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - CMPL DI, $0x00000800 - JB repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - CMPL SI, $0x00000104 - JB repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - LEAL -256(SI), SI - MOVW $0x0019, (CX) - MOVW SI, 2(CX) - ADDQ $0x04, CX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - LEAL -4(SI), SI - MOVW $0x0015, (CX) - MOVB SI, 2(CX) - ADDQ $0x03, CX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(CX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm12B - -two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: - MOVL SI, R8 - SHLL $0x02, R8 - CMPL SI, $0x0c - JAE emit_copy_three_repeat_as_copy_encodeBlockAsm12B - CMPL DI, $0x00000800 - JAE emit_copy_three_repeat_as_copy_encodeBlockAsm12B - LEAL -15(R8), R8 - MOVB DI, 1(CX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, R8 - MOVB R8, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm12B - -emit_copy_three_repeat_as_copy_encodeBlockAsm12B: - LEAL -2(R8), R8 - MOVB R8, (CX) - MOVW DI, 1(CX) - ADDQ $0x03, CX - -repeat_end_emit_encodeBlockAsm12B: - MOVL DX, 12(SP) - JMP search_loop_encodeBlockAsm12B - 
-no_repeat_found_encodeBlockAsm12B: - CMPL (BX)(SI*1), DI - JEQ candidate_match_encodeBlockAsm12B - SHRQ $0x08, DI - MOVL (AX)(R10*4), SI - LEAL 2(DX), R9 - CMPL (BX)(R8*1), DI - JEQ candidate2_match_encodeBlockAsm12B - MOVL R9, (AX)(R10*4) - SHRQ $0x08, DI - CMPL (BX)(SI*1), DI - JEQ candidate3_match_encodeBlockAsm12B - MOVL 20(SP), DX - JMP search_loop_encodeBlockAsm12B - -candidate3_match_encodeBlockAsm12B: - ADDL $0x02, DX - JMP candidate_match_encodeBlockAsm12B - -candidate2_match_encodeBlockAsm12B: - MOVL R9, (AX)(R10*4) - INCL DX - MOVL R8, SI - -candidate_match_encodeBlockAsm12B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBlockAsm12B - -match_extend_back_loop_encodeBlockAsm12B: - CMPL DX, DI - JBE match_extend_back_end_encodeBlockAsm12B - MOVB -1(BX)(SI*1), R8 - MOVB -1(BX)(DX*1), R9 - CMPB R8, R9 - JNE match_extend_back_end_encodeBlockAsm12B - LEAL -1(DX), DX - DECL SI - JZ match_extend_back_end_encodeBlockAsm12B - JMP match_extend_back_loop_encodeBlockAsm12B - -match_extend_back_end_encodeBlockAsm12B: - MOVL DX, DI - SUBL 12(SP), DI - LEAQ 3(CX)(DI*1), DI - CMPQ DI, (SP) - JB match_dst_size_check_encodeBlockAsm12B - MOVQ $0x00000000, ret+56(FP) - RET - -match_dst_size_check_encodeBlockAsm12B: - MOVL DX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeBlockAsm12B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JB one_byte_match_emit_encodeBlockAsm12B - CMPL R8, $0x00000100 - JB two_bytes_match_emit_encodeBlockAsm12B - JB three_bytes_match_emit_encodeBlockAsm12B - -three_bytes_match_emit_encodeBlockAsm12B: - MOVB $0xf4, (CX) - MOVW R8, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_encodeBlockAsm12B - -two_bytes_match_emit_encodeBlockAsm12B: - MOVB $0xf0, (CX) - MOVB R8, 1(CX) - ADDQ $0x02, CX - CMPL R8, $0x40 - JB memmove_match_emit_encodeBlockAsm12B - JMP memmove_long_match_emit_encodeBlockAsm12B - -one_byte_match_emit_encodeBlockAsm12B: - 
SHLB $0x02, R8 - MOVB R8, (CX) - ADDQ $0x01, CX - -memmove_match_emit_encodeBlockAsm12B: - LEAQ (CX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (CX) - JMP memmove_end_copy_match_emit_encodeBlockAsm12B - -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (CX) - MOVQ DI, -8(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm12B - -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm12B - -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_match_emit_encodeBlockAsm12B: - MOVQ R8, CX - JMP emit_literal_done_match_emit_encodeBlockAsm12B - -memmove_long_match_emit_encodeBlockAsm12B: - LEAQ (CX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(CX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 
16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(CX)(R12*1) - MOVOA X5, -16(CX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ R8, CX - -emit_literal_done_match_emit_encodeBlockAsm12B: -match_nolit_loop_encodeBlockAsm12B: - MOVL DX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, DX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL DX, DI - LEAQ (BX)(DX*1), R8 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R10, R10 - -matchlen_loopback_16_match_nolit_encodeBlockAsm12B: - CMPL DI, $0x10 - JB matchlen_match8_match_nolit_encodeBlockAsm12B - MOVQ (R8)(R10*1), R9 - MOVQ 8(R8)(R10*1), R11 - XORQ (SI)(R10*1), R9 - JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm12B - XORQ 8(SI)(R10*1), R11 - JNZ matchlen_bsf_16match_nolit_encodeBlockAsm12B - LEAL -16(DI), DI - LEAL 16(R10), R10 - JMP matchlen_loopback_16_match_nolit_encodeBlockAsm12B - -matchlen_bsf_16match_nolit_encodeBlockAsm12B: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL 8(R10)(R11*1), R10 - JMP match_nolit_end_encodeBlockAsm12B - -matchlen_match8_match_nolit_encodeBlockAsm12B: - CMPL DI, $0x08 - JB matchlen_match4_match_nolit_encodeBlockAsm12B - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm12B - LEAL -8(DI), DI - LEAL 8(R10), R10 - JMP matchlen_match4_match_nolit_encodeBlockAsm12B - -matchlen_bsf_8_match_nolit_encodeBlockAsm12B: -#ifdef GOAMD64_v3 - TZCNTQ R9, R9 - -#else - BSFQ R9, R9 - -#endif - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm12B - 
-matchlen_match4_match_nolit_encodeBlockAsm12B: - CMPL DI, $0x04 - JB matchlen_match2_match_nolit_encodeBlockAsm12B - MOVL (R8)(R10*1), R9 - CMPL (SI)(R10*1), R9 - JNE matchlen_match2_match_nolit_encodeBlockAsm12B - LEAL -4(DI), DI - LEAL 4(R10), R10 - -matchlen_match2_match_nolit_encodeBlockAsm12B: - CMPL DI, $0x01 - JE matchlen_match1_match_nolit_encodeBlockAsm12B - JB match_nolit_end_encodeBlockAsm12B - MOVW (R8)(R10*1), R9 - CMPW (SI)(R10*1), R9 - JNE matchlen_match1_match_nolit_encodeBlockAsm12B - LEAL 2(R10), R10 - SUBL $0x02, DI - JZ match_nolit_end_encodeBlockAsm12B - -matchlen_match1_match_nolit_encodeBlockAsm12B: - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeBlockAsm12B - LEAL 1(R10), R10 - -match_nolit_end_encodeBlockAsm12B: - ADDL R10, DX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL DX, 12(SP) - - // emitCopy - CMPL R10, $0x40 - JBE two_byte_offset_short_match_nolit_encodeBlockAsm12B - CMPL SI, $0x00000800 - JAE long_offset_short_match_nolit_encodeBlockAsm12B - MOVL $0x00000001, DI - LEAL 16(DI), DI - MOVB SI, 1(CX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (CX) - ADDQ $0x02, CX - SUBL $0x08, R10 - - // emitRepeat - LEAL -4(R10), R10 - JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b - MOVL R10, DI - LEAL -4(R10), R10 - CMPL DI, $0x08 - JBE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b - CMPL DI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b - CMPL SI, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b - -cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: - CMPL R10, $0x00000104 - JB repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b - LEAL -256(R10), R10 - MOVW $0x0019, (CX) - MOVW R10, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: - LEAL -4(R10), R10 - MOVW $0x0015, (CX) - MOVB 
R10, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(CX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -long_offset_short_match_nolit_encodeBlockAsm12B: - MOVB $0xee, (CX) - MOVW SI, 1(CX) - LEAL -60(R10), R10 - ADDQ $0x03, CX - - // emitRepeat - MOVL R10, DI - LEAL -4(R10), R10 - CMPL DI, $0x08 - JBE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short - CMPL DI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short - CMPL SI, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: - CMPL R10, $0x00000104 - JB repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short - LEAL -256(R10), R10 - MOVW $0x0019, (CX) - MOVW R10, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short: - LEAL -4(R10), R10 - MOVW $0x0015, (CX) - MOVB R10, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(CX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -two_byte_offset_short_match_nolit_encodeBlockAsm12B: - MOVL R10, DI - SHLL $0x02, DI - CMPL R10, $0x0c - JAE 
emit_copy_three_match_nolit_encodeBlockAsm12B - CMPL SI, $0x00000800 - JAE emit_copy_three_match_nolit_encodeBlockAsm12B - LEAL -15(DI), DI - MOVB SI, 1(CX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -emit_copy_three_match_nolit_encodeBlockAsm12B: - LEAL -2(DI), DI - MOVB DI, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - -match_nolit_emitcopy_end_encodeBlockAsm12B: - CMPL DX, 8(SP) - JAE emit_remainder_encodeBlockAsm12B - MOVQ -2(BX)(DX*1), DI - CMPQ CX, (SP) - JB match_nolit_dst_ok_encodeBlockAsm12B - MOVQ $0x00000000, ret+56(FP) - RET - -match_nolit_dst_ok_encodeBlockAsm12B: - MOVQ $0x000000cf1bbcdcbb, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x18, R8 - IMULQ R9, R8 - SHRQ $0x34, R8 - SHLQ $0x18, SI - IMULQ R9, SI - SHRQ $0x34, SI - LEAL -2(DX), R9 - LEAQ (AX)(SI*4), R10 - MOVL (R10), SI - MOVL R9, (AX)(R8*4) - MOVL DX, (R10) - CMPL (BX)(SI*1), DI - JEQ match_nolit_loop_encodeBlockAsm12B - INCL DX - JMP search_loop_encodeBlockAsm12B - -emit_remainder_encodeBlockAsm12B: - MOVQ src_len+32(FP), AX - SUBL 12(SP), AX - LEAQ 3(CX)(AX*1), AX - CMPQ AX, (SP) - JB emit_remainder_ok_encodeBlockAsm12B - MOVQ $0x00000000, ret+56(FP) - RET - -emit_remainder_ok_encodeBlockAsm12B: - MOVQ src_len+32(FP), AX - MOVL 12(SP), DX - CMPL DX, AX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B - MOVL AX, SI - MOVL AX, 12(SP) - LEAQ (BX)(DX*1), AX - SUBL DX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeBlockAsm12B - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeBlockAsm12B - JB three_bytes_emit_remainder_encodeBlockAsm12B - -three_bytes_emit_remainder_encodeBlockAsm12B: - MOVB $0xf4, (CX) - MOVW DX, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_emit_remainder_encodeBlockAsm12B - -two_bytes_emit_remainder_encodeBlockAsm12B: - MOVB $0xf0, (CX) - MOVB DL, 1(CX) - ADDQ $0x02, CX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeBlockAsm12B - JMP 
memmove_long_emit_remainder_encodeBlockAsm12B - -one_byte_emit_remainder_encodeBlockAsm12B: - SHLB $0x02, DL - MOVB DL, (CX) - ADDQ $0x01, CX - -memmove_emit_remainder_encodeBlockAsm12B: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2: - MOVB (AX), SI - MOVB -1(AX)(BX*1), AL - MOVB SI, (CX) - MOVB AL, -1(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3: - MOVW (AX), SI - MOVB 2(AX), AL - MOVW SI, (CX) - MOVB AL, 2(CX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7: - MOVL (AX), SI - MOVL -4(AX)(BX*1), AX - MOVL SI, (CX) - MOVL AX, -4(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16: - MOVQ (AX), SI - MOVQ -8(AX)(BX*1), AX - MOVQ SI, (CX) - MOVQ AX, -8(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32: - MOVOU (AX), X0 - MOVOU -16(AX)(BX*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64: - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVOU 
X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBlockAsm12B: - MOVQ DX, CX - JMP emit_literal_done_emit_remainder_encodeBlockAsm12B - -memmove_long_emit_remainder_encodeBlockAsm12B: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(AX)(R8*1), X4 - MOVOU -16(AX)(R8*1), X5 - MOVOA X4, -32(CX)(R8*1) - MOVOA X5, -16(CX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - MOVQ DX, CX - -emit_literal_done_emit_remainder_encodeBlockAsm12B: - MOVQ dst_base+0(FP), AX - SUBQ AX, CX - MOVQ CX, ret+56(FP) - RET - -// func encodeBlockAsm10B(dst []byte, src []byte, tmp *[4096]byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBlockAsm10B(SB), $24-64 - MOVQ tmp+48(FP), AX - MOVQ dst_base+0(FP), CX - MOVQ $0x00000020, DX - MOVQ AX, BX - PXOR X0, X0 - -zero_loop_encodeBlockAsm10B: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeBlockAsm10B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), 
DX - LEAQ -9(DX), BX - LEAQ -8(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL DX, 16(SP) - MOVQ src_base+24(FP), BX - -search_loop_encodeBlockAsm10B: - MOVL DX, SI - SUBL 12(SP), SI - SHRL $0x05, SI - LEAL 4(DX)(SI*1), SI - CMPL SI, 8(SP) - JAE emit_remainder_encodeBlockAsm10B - MOVQ (BX)(DX*1), DI - MOVL SI, 20(SP) - MOVQ $0x9e3779b1, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x36, R10 - SHLQ $0x20, R11 - IMULQ R9, R11 - SHRQ $0x36, R11 - MOVL (AX)(R10*4), SI - MOVL (AX)(R11*4), R8 - MOVL DX, (AX)(R10*4) - LEAL 1(DX), R10 - MOVL R10, (AX)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x36, R10 - MOVL DX, R9 - SUBL 16(SP), R9 - MOVL 1(BX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeBlockAsm10B - LEAL 1(DX), DI - MOVL 12(SP), R8 - MOVL DI, SI - SUBL 16(SP), SI - JZ repeat_extend_back_end_encodeBlockAsm10B - -repeat_extend_back_loop_encodeBlockAsm10B: - CMPL DI, R8 - JBE repeat_extend_back_end_encodeBlockAsm10B - MOVB -1(BX)(SI*1), R9 - MOVB -1(BX)(DI*1), R10 - CMPB R9, R10 - JNE repeat_extend_back_end_encodeBlockAsm10B - LEAL -1(DI), DI - DECL SI - JNZ repeat_extend_back_loop_encodeBlockAsm10B - -repeat_extend_back_end_encodeBlockAsm10B: - MOVL DI, SI - SUBL 12(SP), SI - LEAQ 3(CX)(SI*1), SI - CMPQ SI, (SP) - JB repeat_dst_size_check_encodeBlockAsm10B - MOVQ $0x00000000, ret+56(FP) - RET - -repeat_dst_size_check_encodeBlockAsm10B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JB one_byte_repeat_emit_encodeBlockAsm10B - CMPL SI, $0x00000100 - JB two_bytes_repeat_emit_encodeBlockAsm10B - JB three_bytes_repeat_emit_encodeBlockAsm10B - -three_bytes_repeat_emit_encodeBlockAsm10B: - MOVB $0xf4, (CX) - MOVW SI, 1(CX) - ADDQ 
$0x03, CX - JMP memmove_long_repeat_emit_encodeBlockAsm10B - -two_bytes_repeat_emit_encodeBlockAsm10B: - MOVB $0xf0, (CX) - MOVB SI, 1(CX) - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_repeat_emit_encodeBlockAsm10B - JMP memmove_long_repeat_emit_encodeBlockAsm10B - -one_byte_repeat_emit_encodeBlockAsm10B: - SHLB $0x02, SI - MOVB SI, (CX) - ADDQ $0x01, CX - -memmove_repeat_emit_encodeBlockAsm10B: - LEAQ (CX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (CX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (CX) - MOVQ R10, -8(CX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_repeat_emit_encodeBlockAsm10B: - MOVQ SI, CX - JMP emit_literal_done_repeat_emit_encodeBlockAsm10B - -memmove_long_repeat_emit_encodeBlockAsm10B: - LEAQ (CX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R12 - SHRQ $0x05, R12 - MOVQ CX, R11 - ANDL 
$0x0000001f, R11 - MOVQ $0x00000040, R13 - SUBQ R11, R13 - DECQ R12 - JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R13*1), R11 - LEAQ -32(CX)(R13*1), R14 - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R11 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R13*1), X4 - MOVOU -16(R10)(R13*1), X5 - MOVOA X4, -32(CX)(R13*1) - MOVOA X5, -16(CX)(R13*1) - ADDQ $0x20, R13 - CMPQ R9, R13 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ SI, CX - -emit_literal_done_repeat_emit_encodeBlockAsm10B: - ADDL $0x05, DX - MOVL DX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R9 - SUBL DX, R9 - LEAQ (BX)(DX*1), R10 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R12, R12 - -matchlen_loopback_16_repeat_extend_encodeBlockAsm10B: - CMPL R9, $0x10 - JB matchlen_match8_repeat_extend_encodeBlockAsm10B - MOVQ (R10)(R12*1), R11 - MOVQ 8(R10)(R12*1), R13 - XORQ (SI)(R12*1), R11 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm10B - XORQ 8(SI)(R12*1), R13 - JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm10B - LEAL -16(R9), R9 - LEAL 16(R12), R12 - JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm10B - -matchlen_bsf_16repeat_extend_encodeBlockAsm10B: -#ifdef GOAMD64_v3 - TZCNTQ R13, R13 - -#else - BSFQ R13, R13 - -#endif - SARQ $0x03, R13 - LEAL 8(R12)(R13*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm10B - -matchlen_match8_repeat_extend_encodeBlockAsm10B: - CMPL R9, $0x08 - JB matchlen_match4_repeat_extend_encodeBlockAsm10B - MOVQ (R10)(R12*1), R11 - XORQ (SI)(R12*1), R11 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm10B - 
LEAL -8(R9), R9 - LEAL 8(R12), R12 - JMP matchlen_match4_repeat_extend_encodeBlockAsm10B - -matchlen_bsf_8_repeat_extend_encodeBlockAsm10B: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm10B - -matchlen_match4_repeat_extend_encodeBlockAsm10B: - CMPL R9, $0x04 - JB matchlen_match2_repeat_extend_encodeBlockAsm10B - MOVL (R10)(R12*1), R11 - CMPL (SI)(R12*1), R11 - JNE matchlen_match2_repeat_extend_encodeBlockAsm10B - LEAL -4(R9), R9 - LEAL 4(R12), R12 - -matchlen_match2_repeat_extend_encodeBlockAsm10B: - CMPL R9, $0x01 - JE matchlen_match1_repeat_extend_encodeBlockAsm10B - JB repeat_extend_forward_end_encodeBlockAsm10B - MOVW (R10)(R12*1), R11 - CMPW (SI)(R12*1), R11 - JNE matchlen_match1_repeat_extend_encodeBlockAsm10B - LEAL 2(R12), R12 - SUBL $0x02, R9 - JZ repeat_extend_forward_end_encodeBlockAsm10B - -matchlen_match1_repeat_extend_encodeBlockAsm10B: - MOVB (R10)(R12*1), R11 - CMPB (SI)(R12*1), R11 - JNE repeat_extend_forward_end_encodeBlockAsm10B - LEAL 1(R12), R12 - -repeat_extend_forward_end_encodeBlockAsm10B: - ADDL R12, DX - MOVL DX, SI - SUBL DI, SI - MOVL 16(SP), DI - TESTL R8, R8 - JZ repeat_as_copy_encodeBlockAsm10B - - // emitRepeat - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JBE repeat_two_match_repeat_encodeBlockAsm10B - CMPL R8, $0x0c - JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B - CMPL DI, $0x00000800 - JB repeat_two_offset_match_repeat_encodeBlockAsm10B - -cant_repeat_two_offset_match_repeat_encodeBlockAsm10B: - CMPL SI, $0x00000104 - JB repeat_three_match_repeat_encodeBlockAsm10B - LEAL -256(SI), SI - MOVW $0x0019, (CX) - MOVW SI, 2(CX) - ADDQ $0x04, CX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_three_match_repeat_encodeBlockAsm10B: - LEAL -4(SI), SI - MOVW $0x0015, (CX) - MOVB SI, 2(CX) - ADDQ $0x03, CX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_two_match_repeat_encodeBlockAsm10B: - SHLL $0x02, SI - 
ORL $0x01, SI - MOVW SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_two_offset_match_repeat_encodeBlockAsm10B: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(CX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_as_copy_encodeBlockAsm10B: - // emitCopy - CMPL SI, $0x40 - JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B - CMPL DI, $0x00000800 - JAE long_offset_short_repeat_as_copy_encodeBlockAsm10B - MOVL $0x00000001, R8 - LEAL 16(R8), R8 - MOVB DI, 1(CX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, R8 - MOVB R8, (CX) - ADDQ $0x02, CX - SUBL $0x08, SI - - // emitRepeat - LEAL -4(SI), SI - JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JBE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b - CMPL R8, $0x0c - JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b - CMPL DI, $0x00000800 - JB repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: - CMPL SI, $0x00000104 - JB repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b - LEAL -256(SI), SI - MOVW $0x0019, (CX) - MOVW SI, 2(CX) - ADDQ $0x04, CX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: - LEAL -4(SI), SI - MOVW $0x0015, (CX) - MOVB SI, 2(CX) - ADDQ $0x03, CX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(CX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm10B 
- -long_offset_short_repeat_as_copy_encodeBlockAsm10B: - MOVB $0xee, (CX) - MOVW DI, 1(CX) - LEAL -60(SI), SI - ADDQ $0x03, CX - - // emitRepeat - MOVL SI, R8 - LEAL -4(SI), SI - CMPL R8, $0x08 - JBE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short - CMPL R8, $0x0c - JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short - CMPL DI, $0x00000800 - JB repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - CMPL SI, $0x00000104 - JB repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short - LEAL -256(SI), SI - MOVW $0x0019, (CX) - MOVW SI, 2(CX) - ADDQ $0x04, CX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - LEAL -4(SI), SI - MOVW $0x0015, (CX) - MOVB SI, 2(CX) - ADDQ $0x03, CX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(CX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm10B - -two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B: - MOVL SI, R8 - SHLL $0x02, R8 - CMPL SI, $0x0c - JAE emit_copy_three_repeat_as_copy_encodeBlockAsm10B - CMPL DI, $0x00000800 - JAE emit_copy_three_repeat_as_copy_encodeBlockAsm10B - LEAL -15(R8), R8 - MOVB DI, 1(CX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, R8 - MOVB R8, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm10B - -emit_copy_three_repeat_as_copy_encodeBlockAsm10B: - LEAL -2(R8), R8 - MOVB R8, (CX) - MOVW DI, 1(CX) - ADDQ $0x03, CX - -repeat_end_emit_encodeBlockAsm10B: - MOVL DX, 12(SP) - JMP search_loop_encodeBlockAsm10B - -no_repeat_found_encodeBlockAsm10B: - CMPL (BX)(SI*1), DI - JEQ 
candidate_match_encodeBlockAsm10B - SHRQ $0x08, DI - MOVL (AX)(R10*4), SI - LEAL 2(DX), R9 - CMPL (BX)(R8*1), DI - JEQ candidate2_match_encodeBlockAsm10B - MOVL R9, (AX)(R10*4) - SHRQ $0x08, DI - CMPL (BX)(SI*1), DI - JEQ candidate3_match_encodeBlockAsm10B - MOVL 20(SP), DX - JMP search_loop_encodeBlockAsm10B - -candidate3_match_encodeBlockAsm10B: - ADDL $0x02, DX - JMP candidate_match_encodeBlockAsm10B - -candidate2_match_encodeBlockAsm10B: - MOVL R9, (AX)(R10*4) - INCL DX - MOVL R8, SI - -candidate_match_encodeBlockAsm10B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBlockAsm10B - -match_extend_back_loop_encodeBlockAsm10B: - CMPL DX, DI - JBE match_extend_back_end_encodeBlockAsm10B - MOVB -1(BX)(SI*1), R8 - MOVB -1(BX)(DX*1), R9 - CMPB R8, R9 - JNE match_extend_back_end_encodeBlockAsm10B - LEAL -1(DX), DX - DECL SI - JZ match_extend_back_end_encodeBlockAsm10B - JMP match_extend_back_loop_encodeBlockAsm10B - -match_extend_back_end_encodeBlockAsm10B: - MOVL DX, DI - SUBL 12(SP), DI - LEAQ 3(CX)(DI*1), DI - CMPQ DI, (SP) - JB match_dst_size_check_encodeBlockAsm10B - MOVQ $0x00000000, ret+56(FP) - RET - -match_dst_size_check_encodeBlockAsm10B: - MOVL DX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeBlockAsm10B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JB one_byte_match_emit_encodeBlockAsm10B - CMPL R8, $0x00000100 - JB two_bytes_match_emit_encodeBlockAsm10B - JB three_bytes_match_emit_encodeBlockAsm10B - -three_bytes_match_emit_encodeBlockAsm10B: - MOVB $0xf4, (CX) - MOVW R8, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_encodeBlockAsm10B - -two_bytes_match_emit_encodeBlockAsm10B: - MOVB $0xf0, (CX) - MOVB R8, 1(CX) - ADDQ $0x02, CX - CMPL R8, $0x40 - JB memmove_match_emit_encodeBlockAsm10B - JMP memmove_long_match_emit_encodeBlockAsm10B - -one_byte_match_emit_encodeBlockAsm10B: - SHLB $0x02, R8 - MOVB R8, (CX) - ADDQ $0x01, CX - 
-memmove_match_emit_encodeBlockAsm10B: - LEAQ (CX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (CX) - JMP memmove_end_copy_match_emit_encodeBlockAsm10B - -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (CX) - MOVQ DI, -8(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm10B - -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm10B - -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_match_emit_encodeBlockAsm10B: - MOVQ R8, CX - JMP emit_literal_done_match_emit_encodeBlockAsm10B - -memmove_long_match_emit_encodeBlockAsm10B: - LEAQ (CX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(CX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - 
ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(CX)(R12*1) - MOVOA X5, -16(CX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ R8, CX - -emit_literal_done_match_emit_encodeBlockAsm10B: -match_nolit_loop_encodeBlockAsm10B: - MOVL DX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, DX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL DX, DI - LEAQ (BX)(DX*1), R8 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R10, R10 - -matchlen_loopback_16_match_nolit_encodeBlockAsm10B: - CMPL DI, $0x10 - JB matchlen_match8_match_nolit_encodeBlockAsm10B - MOVQ (R8)(R10*1), R9 - MOVQ 8(R8)(R10*1), R11 - XORQ (SI)(R10*1), R9 - JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm10B - XORQ 8(SI)(R10*1), R11 - JNZ matchlen_bsf_16match_nolit_encodeBlockAsm10B - LEAL -16(DI), DI - LEAL 16(R10), R10 - JMP matchlen_loopback_16_match_nolit_encodeBlockAsm10B - -matchlen_bsf_16match_nolit_encodeBlockAsm10B: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL 8(R10)(R11*1), R10 - JMP match_nolit_end_encodeBlockAsm10B - -matchlen_match8_match_nolit_encodeBlockAsm10B: - CMPL DI, $0x08 - JB matchlen_match4_match_nolit_encodeBlockAsm10B - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm10B - LEAL -8(DI), DI - LEAL 8(R10), R10 - JMP matchlen_match4_match_nolit_encodeBlockAsm10B - -matchlen_bsf_8_match_nolit_encodeBlockAsm10B: -#ifdef GOAMD64_v3 - TZCNTQ R9, R9 - -#else - BSFQ R9, R9 - -#endif - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm10B - -matchlen_match4_match_nolit_encodeBlockAsm10B: - CMPL DI, $0x04 - JB 
matchlen_match2_match_nolit_encodeBlockAsm10B - MOVL (R8)(R10*1), R9 - CMPL (SI)(R10*1), R9 - JNE matchlen_match2_match_nolit_encodeBlockAsm10B - LEAL -4(DI), DI - LEAL 4(R10), R10 - -matchlen_match2_match_nolit_encodeBlockAsm10B: - CMPL DI, $0x01 - JE matchlen_match1_match_nolit_encodeBlockAsm10B - JB match_nolit_end_encodeBlockAsm10B - MOVW (R8)(R10*1), R9 - CMPW (SI)(R10*1), R9 - JNE matchlen_match1_match_nolit_encodeBlockAsm10B - LEAL 2(R10), R10 - SUBL $0x02, DI - JZ match_nolit_end_encodeBlockAsm10B - -matchlen_match1_match_nolit_encodeBlockAsm10B: - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeBlockAsm10B - LEAL 1(R10), R10 - -match_nolit_end_encodeBlockAsm10B: - ADDL R10, DX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL DX, 12(SP) - - // emitCopy - CMPL R10, $0x40 - JBE two_byte_offset_short_match_nolit_encodeBlockAsm10B - CMPL SI, $0x00000800 - JAE long_offset_short_match_nolit_encodeBlockAsm10B - MOVL $0x00000001, DI - LEAL 16(DI), DI - MOVB SI, 1(CX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (CX) - ADDQ $0x02, CX - SUBL $0x08, R10 - - // emitRepeat - LEAL -4(R10), R10 - JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b - MOVL R10, DI - LEAL -4(R10), R10 - CMPL DI, $0x08 - JBE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b - CMPL DI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b - CMPL SI, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b - -cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: - CMPL R10, $0x00000104 - JB repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b - LEAL -256(R10), R10 - MOVW $0x0019, (CX) - MOVW R10, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: - LEAL -4(R10), R10 - MOVW $0x0015, (CX) - MOVB R10, 2(CX) - ADDQ $0x03, CX - JMP 
match_nolit_emitcopy_end_encodeBlockAsm10B - -repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(CX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -long_offset_short_match_nolit_encodeBlockAsm10B: - MOVB $0xee, (CX) - MOVW SI, 1(CX) - LEAL -60(R10), R10 - ADDQ $0x03, CX - - // emitRepeat - MOVL R10, DI - LEAL -4(R10), R10 - CMPL DI, $0x08 - JBE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short - CMPL DI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short - CMPL SI, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: - CMPL R10, $0x00000104 - JB repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short - LEAL -256(R10), R10 - MOVW $0x0019, (CX) - MOVW R10, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short: - LEAL -4(R10), R10 - MOVW $0x0015, (CX) - MOVB R10, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(CX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -two_byte_offset_short_match_nolit_encodeBlockAsm10B: - MOVL R10, DI - SHLL $0x02, DI - CMPL R10, $0x0c - JAE emit_copy_three_match_nolit_encodeBlockAsm10B - 
CMPL SI, $0x00000800 - JAE emit_copy_three_match_nolit_encodeBlockAsm10B - LEAL -15(DI), DI - MOVB SI, 1(CX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -emit_copy_three_match_nolit_encodeBlockAsm10B: - LEAL -2(DI), DI - MOVB DI, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - -match_nolit_emitcopy_end_encodeBlockAsm10B: - CMPL DX, 8(SP) - JAE emit_remainder_encodeBlockAsm10B - MOVQ -2(BX)(DX*1), DI - CMPQ CX, (SP) - JB match_nolit_dst_ok_encodeBlockAsm10B - MOVQ $0x00000000, ret+56(FP) - RET - -match_nolit_dst_ok_encodeBlockAsm10B: - MOVQ $0x9e3779b1, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x20, R8 - IMULQ R9, R8 - SHRQ $0x36, R8 - SHLQ $0x20, SI - IMULQ R9, SI - SHRQ $0x36, SI - LEAL -2(DX), R9 - LEAQ (AX)(SI*4), R10 - MOVL (R10), SI - MOVL R9, (AX)(R8*4) - MOVL DX, (R10) - CMPL (BX)(SI*1), DI - JEQ match_nolit_loop_encodeBlockAsm10B - INCL DX - JMP search_loop_encodeBlockAsm10B - -emit_remainder_encodeBlockAsm10B: - MOVQ src_len+32(FP), AX - SUBL 12(SP), AX - LEAQ 3(CX)(AX*1), AX - CMPQ AX, (SP) - JB emit_remainder_ok_encodeBlockAsm10B - MOVQ $0x00000000, ret+56(FP) - RET - -emit_remainder_ok_encodeBlockAsm10B: - MOVQ src_len+32(FP), AX - MOVL 12(SP), DX - CMPL DX, AX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B - MOVL AX, SI - MOVL AX, 12(SP) - LEAQ (BX)(DX*1), AX - SUBL DX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeBlockAsm10B - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeBlockAsm10B - JB three_bytes_emit_remainder_encodeBlockAsm10B - -three_bytes_emit_remainder_encodeBlockAsm10B: - MOVB $0xf4, (CX) - MOVW DX, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_emit_remainder_encodeBlockAsm10B - -two_bytes_emit_remainder_encodeBlockAsm10B: - MOVB $0xf0, (CX) - MOVB DL, 1(CX) - ADDQ $0x02, CX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeBlockAsm10B - JMP memmove_long_emit_remainder_encodeBlockAsm10B - 
-one_byte_emit_remainder_encodeBlockAsm10B: - SHLB $0x02, DL - MOVB DL, (CX) - ADDQ $0x01, CX - -memmove_emit_remainder_encodeBlockAsm10B: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2: - MOVB (AX), SI - MOVB -1(AX)(BX*1), AL - MOVB SI, (CX) - MOVB AL, -1(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3: - MOVW (AX), SI - MOVB 2(AX), AL - MOVW SI, (CX) - MOVB AL, 2(CX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7: - MOVL (AX), SI - MOVL -4(AX)(BX*1), AX - MOVL SI, (CX) - MOVL AX, -4(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16: - MOVQ (AX), SI - MOVQ -8(AX)(BX*1), AX - MOVQ SI, (CX) - MOVQ AX, -8(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32: - MOVOU (AX), X0 - MOVOU -16(AX)(BX*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64: - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, 
-32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBlockAsm10B: - MOVQ DX, CX - JMP emit_literal_done_emit_remainder_encodeBlockAsm10B - -memmove_long_emit_remainder_encodeBlockAsm10B: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(AX)(R8*1), X4 - MOVOU -16(AX)(R8*1), X5 - MOVOA X4, -32(CX)(R8*1) - MOVOA X5, -16(CX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - MOVQ DX, CX - -emit_literal_done_emit_remainder_encodeBlockAsm10B: - MOVQ dst_base+0(FP), AX - SUBQ AX, CX - MOVQ CX, ret+56(FP) - RET - -// func encodeBlockAsm8B(dst []byte, src []byte, tmp *[1024]byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBlockAsm8B(SB), $24-64 - MOVQ tmp+48(FP), AX - MOVQ dst_base+0(FP), CX - MOVQ $0x00000008, DX - MOVQ AX, BX - PXOR X0, X0 - -zero_loop_encodeBlockAsm8B: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeBlockAsm8B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), DX - LEAQ -9(DX), BX - LEAQ -8(DX), SI - 
MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL DX, 16(SP) - MOVQ src_base+24(FP), BX - -search_loop_encodeBlockAsm8B: - MOVL DX, SI - SUBL 12(SP), SI - SHRL $0x04, SI - LEAL 4(DX)(SI*1), SI - CMPL SI, 8(SP) - JAE emit_remainder_encodeBlockAsm8B - MOVQ (BX)(DX*1), DI - MOVL SI, 20(SP) - MOVQ $0x9e3779b1, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x38, R10 - SHLQ $0x20, R11 - IMULQ R9, R11 - SHRQ $0x38, R11 - MOVL (AX)(R10*4), SI - MOVL (AX)(R11*4), R8 - MOVL DX, (AX)(R10*4) - LEAL 1(DX), R10 - MOVL R10, (AX)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x38, R10 - MOVL DX, R9 - SUBL 16(SP), R9 - MOVL 1(BX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeBlockAsm8B - LEAL 1(DX), DI - MOVL 12(SP), R8 - MOVL DI, SI - SUBL 16(SP), SI - JZ repeat_extend_back_end_encodeBlockAsm8B - -repeat_extend_back_loop_encodeBlockAsm8B: - CMPL DI, R8 - JBE repeat_extend_back_end_encodeBlockAsm8B - MOVB -1(BX)(SI*1), R9 - MOVB -1(BX)(DI*1), R10 - CMPB R9, R10 - JNE repeat_extend_back_end_encodeBlockAsm8B - LEAL -1(DI), DI - DECL SI - JNZ repeat_extend_back_loop_encodeBlockAsm8B - -repeat_extend_back_end_encodeBlockAsm8B: - MOVL DI, SI - SUBL 12(SP), SI - LEAQ 3(CX)(SI*1), SI - CMPQ SI, (SP) - JB repeat_dst_size_check_encodeBlockAsm8B - MOVQ $0x00000000, ret+56(FP) - RET - -repeat_dst_size_check_encodeBlockAsm8B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JB one_byte_repeat_emit_encodeBlockAsm8B - CMPL SI, $0x00000100 - JB two_bytes_repeat_emit_encodeBlockAsm8B - JB three_bytes_repeat_emit_encodeBlockAsm8B - -three_bytes_repeat_emit_encodeBlockAsm8B: - MOVB $0xf4, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_repeat_emit_encodeBlockAsm8B - 
-two_bytes_repeat_emit_encodeBlockAsm8B: - MOVB $0xf0, (CX) - MOVB SI, 1(CX) - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_repeat_emit_encodeBlockAsm8B - JMP memmove_long_repeat_emit_encodeBlockAsm8B - -one_byte_repeat_emit_encodeBlockAsm8B: - SHLB $0x02, SI - MOVB SI, (CX) - ADDQ $0x01, CX - -memmove_repeat_emit_encodeBlockAsm8B: - LEAQ (CX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (CX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (CX) - MOVQ R10, -8(CX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_repeat_emit_encodeBlockAsm8B: - MOVQ SI, CX - JMP emit_literal_done_repeat_emit_encodeBlockAsm8B - -memmove_long_repeat_emit_encodeBlockAsm8B: - LEAQ (CX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R12 - SHRQ $0x05, R12 - MOVQ CX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R13 - SUBQ R11, R13 - DECQ R12 - JA 
emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R13*1), R11 - LEAQ -32(CX)(R13*1), R14 - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R11 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R13*1), X4 - MOVOU -16(R10)(R13*1), X5 - MOVOA X4, -32(CX)(R13*1) - MOVOA X5, -16(CX)(R13*1) - ADDQ $0x20, R13 - CMPQ R9, R13 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ SI, CX - -emit_literal_done_repeat_emit_encodeBlockAsm8B: - ADDL $0x05, DX - MOVL DX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R9 - SUBL DX, R9 - LEAQ (BX)(DX*1), R10 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R12, R12 - -matchlen_loopback_16_repeat_extend_encodeBlockAsm8B: - CMPL R9, $0x10 - JB matchlen_match8_repeat_extend_encodeBlockAsm8B - MOVQ (R10)(R12*1), R11 - MOVQ 8(R10)(R12*1), R13 - XORQ (SI)(R12*1), R11 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm8B - XORQ 8(SI)(R12*1), R13 - JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm8B - LEAL -16(R9), R9 - LEAL 16(R12), R12 - JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm8B - -matchlen_bsf_16repeat_extend_encodeBlockAsm8B: -#ifdef GOAMD64_v3 - TZCNTQ R13, R13 - -#else - BSFQ R13, R13 - -#endif - SARQ $0x03, R13 - LEAL 8(R12)(R13*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm8B - -matchlen_match8_repeat_extend_encodeBlockAsm8B: - CMPL R9, $0x08 - JB matchlen_match4_repeat_extend_encodeBlockAsm8B - MOVQ (R10)(R12*1), R11 - XORQ (SI)(R12*1), R11 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm8B - LEAL -8(R9), R9 - LEAL 8(R12), R12 - JMP matchlen_match4_repeat_extend_encodeBlockAsm8B - 
-matchlen_bsf_8_repeat_extend_encodeBlockAsm8B: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP repeat_extend_forward_end_encodeBlockAsm8B - -matchlen_match4_repeat_extend_encodeBlockAsm8B: - CMPL R9, $0x04 - JB matchlen_match2_repeat_extend_encodeBlockAsm8B - MOVL (R10)(R12*1), R11 - CMPL (SI)(R12*1), R11 - JNE matchlen_match2_repeat_extend_encodeBlockAsm8B - LEAL -4(R9), R9 - LEAL 4(R12), R12 - -matchlen_match2_repeat_extend_encodeBlockAsm8B: - CMPL R9, $0x01 - JE matchlen_match1_repeat_extend_encodeBlockAsm8B - JB repeat_extend_forward_end_encodeBlockAsm8B - MOVW (R10)(R12*1), R11 - CMPW (SI)(R12*1), R11 - JNE matchlen_match1_repeat_extend_encodeBlockAsm8B - LEAL 2(R12), R12 - SUBL $0x02, R9 - JZ repeat_extend_forward_end_encodeBlockAsm8B - -matchlen_match1_repeat_extend_encodeBlockAsm8B: - MOVB (R10)(R12*1), R11 - CMPB (SI)(R12*1), R11 - JNE repeat_extend_forward_end_encodeBlockAsm8B - LEAL 1(R12), R12 - -repeat_extend_forward_end_encodeBlockAsm8B: - ADDL R12, DX - MOVL DX, SI - SUBL DI, SI - MOVL 16(SP), DI - TESTL R8, R8 - JZ repeat_as_copy_encodeBlockAsm8B - - // emitRepeat - MOVL SI, DI - LEAL -4(SI), SI - CMPL DI, $0x08 - JBE repeat_two_match_repeat_encodeBlockAsm8B - CMPL DI, $0x0c - JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B - -cant_repeat_two_offset_match_repeat_encodeBlockAsm8B: - CMPL SI, $0x00000104 - JB repeat_three_match_repeat_encodeBlockAsm8B - LEAL -256(SI), SI - MOVW $0x0019, (CX) - MOVW SI, 2(CX) - ADDQ $0x04, CX - JMP repeat_end_emit_encodeBlockAsm8B - -repeat_three_match_repeat_encodeBlockAsm8B: - LEAL -4(SI), SI - MOVW $0x0015, (CX) - MOVB SI, 2(CX) - ADDQ $0x03, CX - JMP repeat_end_emit_encodeBlockAsm8B - -repeat_two_match_repeat_encodeBlockAsm8B: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm8B - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(CX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - 
MOVB SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm8B - -repeat_as_copy_encodeBlockAsm8B: - // emitCopy - CMPL SI, $0x40 - JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B - CMPL DI, $0x00000800 - JAE long_offset_short_repeat_as_copy_encodeBlockAsm8B - MOVL $0x00000001, R8 - LEAL 16(R8), R8 - MOVB DI, 1(CX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, R8 - MOVB R8, (CX) - ADDQ $0x02, CX - SUBL $0x08, SI - - // emitRepeat - LEAL -4(SI), SI - JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b - MOVL SI, DI - LEAL -4(SI), SI - CMPL DI, $0x08 - JBE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b - CMPL DI, $0x0c - JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: - CMPL SI, $0x00000104 - JB repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b - LEAL -256(SI), SI - MOVW $0x0019, (CX) - MOVW SI, 2(CX) - ADDQ $0x04, CX - JMP repeat_end_emit_encodeBlockAsm8B - -repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: - LEAL -4(SI), SI - MOVW $0x0015, (CX) - MOVB SI, 2(CX) - ADDQ $0x03, CX - JMP repeat_end_emit_encodeBlockAsm8B - -repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm8B - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(CX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm8B - -long_offset_short_repeat_as_copy_encodeBlockAsm8B: - MOVB $0xee, (CX) - MOVW DI, 1(CX) - LEAL -60(SI), SI - ADDQ $0x03, CX - - // emitRepeat - MOVL SI, DI - LEAL -4(SI), SI - CMPL DI, $0x08 - JBE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short - CMPL DI, $0x0c - JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: - 
CMPL SI, $0x00000104 - JB repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short - LEAL -256(SI), SI - MOVW $0x0019, (CX) - MOVW SI, 2(CX) - ADDQ $0x04, CX - JMP repeat_end_emit_encodeBlockAsm8B - -repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: - LEAL -4(SI), SI - MOVW $0x0015, (CX) - MOVB SI, 2(CX) - ADDQ $0x03, CX - JMP repeat_end_emit_encodeBlockAsm8B - -repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: - SHLL $0x02, SI - ORL $0x01, SI - MOVW SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm8B - XORQ R8, R8 - LEAL 1(R8)(SI*4), SI - MOVB DI, 1(CX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm8B - -two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B: - MOVL SI, R8 - SHLL $0x02, R8 - CMPL SI, $0x0c - JAE emit_copy_three_repeat_as_copy_encodeBlockAsm8B - LEAL -15(R8), R8 - MOVB DI, 1(CX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, R8 - MOVB R8, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeBlockAsm8B - -emit_copy_three_repeat_as_copy_encodeBlockAsm8B: - LEAL -2(R8), R8 - MOVB R8, (CX) - MOVW DI, 1(CX) - ADDQ $0x03, CX - -repeat_end_emit_encodeBlockAsm8B: - MOVL DX, 12(SP) - JMP search_loop_encodeBlockAsm8B - -no_repeat_found_encodeBlockAsm8B: - CMPL (BX)(SI*1), DI - JEQ candidate_match_encodeBlockAsm8B - SHRQ $0x08, DI - MOVL (AX)(R10*4), SI - LEAL 2(DX), R9 - CMPL (BX)(R8*1), DI - JEQ candidate2_match_encodeBlockAsm8B - MOVL R9, (AX)(R10*4) - SHRQ $0x08, DI - CMPL (BX)(SI*1), DI - JEQ candidate3_match_encodeBlockAsm8B - MOVL 20(SP), DX - JMP search_loop_encodeBlockAsm8B - -candidate3_match_encodeBlockAsm8B: - ADDL $0x02, DX - JMP candidate_match_encodeBlockAsm8B - -candidate2_match_encodeBlockAsm8B: - MOVL R9, (AX)(R10*4) - INCL DX - MOVL R8, SI - -candidate_match_encodeBlockAsm8B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBlockAsm8B - -match_extend_back_loop_encodeBlockAsm8B: - CMPL DX, DI - JBE 
match_extend_back_end_encodeBlockAsm8B - MOVB -1(BX)(SI*1), R8 - MOVB -1(BX)(DX*1), R9 - CMPB R8, R9 - JNE match_extend_back_end_encodeBlockAsm8B - LEAL -1(DX), DX - DECL SI - JZ match_extend_back_end_encodeBlockAsm8B - JMP match_extend_back_loop_encodeBlockAsm8B - -match_extend_back_end_encodeBlockAsm8B: - MOVL DX, DI - SUBL 12(SP), DI - LEAQ 3(CX)(DI*1), DI - CMPQ DI, (SP) - JB match_dst_size_check_encodeBlockAsm8B - MOVQ $0x00000000, ret+56(FP) - RET - -match_dst_size_check_encodeBlockAsm8B: - MOVL DX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeBlockAsm8B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JB one_byte_match_emit_encodeBlockAsm8B - CMPL R8, $0x00000100 - JB two_bytes_match_emit_encodeBlockAsm8B - JB three_bytes_match_emit_encodeBlockAsm8B - -three_bytes_match_emit_encodeBlockAsm8B: - MOVB $0xf4, (CX) - MOVW R8, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_encodeBlockAsm8B - -two_bytes_match_emit_encodeBlockAsm8B: - MOVB $0xf0, (CX) - MOVB R8, 1(CX) - ADDQ $0x02, CX - CMPL R8, $0x40 - JB memmove_match_emit_encodeBlockAsm8B - JMP memmove_long_match_emit_encodeBlockAsm8B - -one_byte_match_emit_encodeBlockAsm8B: - SHLB $0x02, R8 - MOVB R8, (CX) - ADDQ $0x01, CX - -memmove_match_emit_encodeBlockAsm8B: - LEAQ (CX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (CX) - JMP memmove_end_copy_match_emit_encodeBlockAsm8B - -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (CX) - 
MOVQ DI, -8(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm8B - -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm8B - -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_match_emit_encodeBlockAsm8B: - MOVQ R8, CX - JMP emit_literal_done_match_emit_encodeBlockAsm8B - -memmove_long_match_emit_encodeBlockAsm8B: - LEAQ (CX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(CX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(CX)(R12*1) - MOVOA X5, -16(CX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ R8, CX - -emit_literal_done_match_emit_encodeBlockAsm8B: -match_nolit_loop_encodeBlockAsm8B: - MOVL DX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, DX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL DX, DI - LEAQ (BX)(DX*1), 
R8 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R10, R10 - -matchlen_loopback_16_match_nolit_encodeBlockAsm8B: - CMPL DI, $0x10 - JB matchlen_match8_match_nolit_encodeBlockAsm8B - MOVQ (R8)(R10*1), R9 - MOVQ 8(R8)(R10*1), R11 - XORQ (SI)(R10*1), R9 - JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm8B - XORQ 8(SI)(R10*1), R11 - JNZ matchlen_bsf_16match_nolit_encodeBlockAsm8B - LEAL -16(DI), DI - LEAL 16(R10), R10 - JMP matchlen_loopback_16_match_nolit_encodeBlockAsm8B - -matchlen_bsf_16match_nolit_encodeBlockAsm8B: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL 8(R10)(R11*1), R10 - JMP match_nolit_end_encodeBlockAsm8B - -matchlen_match8_match_nolit_encodeBlockAsm8B: - CMPL DI, $0x08 - JB matchlen_match4_match_nolit_encodeBlockAsm8B - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm8B - LEAL -8(DI), DI - LEAL 8(R10), R10 - JMP matchlen_match4_match_nolit_encodeBlockAsm8B - -matchlen_bsf_8_match_nolit_encodeBlockAsm8B: -#ifdef GOAMD64_v3 - TZCNTQ R9, R9 - -#else - BSFQ R9, R9 - -#endif - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeBlockAsm8B - -matchlen_match4_match_nolit_encodeBlockAsm8B: - CMPL DI, $0x04 - JB matchlen_match2_match_nolit_encodeBlockAsm8B - MOVL (R8)(R10*1), R9 - CMPL (SI)(R10*1), R9 - JNE matchlen_match2_match_nolit_encodeBlockAsm8B - LEAL -4(DI), DI - LEAL 4(R10), R10 - -matchlen_match2_match_nolit_encodeBlockAsm8B: - CMPL DI, $0x01 - JE matchlen_match1_match_nolit_encodeBlockAsm8B - JB match_nolit_end_encodeBlockAsm8B - MOVW (R8)(R10*1), R9 - CMPW (SI)(R10*1), R9 - JNE matchlen_match1_match_nolit_encodeBlockAsm8B - LEAL 2(R10), R10 - SUBL $0x02, DI - JZ match_nolit_end_encodeBlockAsm8B - -matchlen_match1_match_nolit_encodeBlockAsm8B: - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeBlockAsm8B - LEAL 1(R10), R10 - -match_nolit_end_encodeBlockAsm8B: - ADDL R10, DX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL DX, 
12(SP) - - // emitCopy - CMPL R10, $0x40 - JBE two_byte_offset_short_match_nolit_encodeBlockAsm8B - CMPL SI, $0x00000800 - JAE long_offset_short_match_nolit_encodeBlockAsm8B - MOVL $0x00000001, DI - LEAL 16(DI), DI - MOVB SI, 1(CX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (CX) - ADDQ $0x02, CX - SUBL $0x08, R10 - - // emitRepeat - LEAL -4(R10), R10 - JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b - MOVL R10, SI - LEAL -4(R10), R10 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b - -cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: - CMPL R10, $0x00000104 - JB repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b - LEAL -256(R10), R10 - MOVW $0x0019, (CX) - MOVW R10, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - -repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: - LEAL -4(R10), R10 - MOVW $0x0015, (CX) - MOVB R10, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - -repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(CX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - -long_offset_short_match_nolit_encodeBlockAsm8B: - MOVB $0xee, (CX) - MOVW SI, 1(CX) - LEAL -60(R10), R10 - ADDQ $0x03, CX - - // emitRepeat - MOVL R10, SI - LEAL -4(R10), R10 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short: - CMPL R10, $0x00000104 - JB 
repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short - LEAL -256(R10), R10 - MOVW $0x0019, (CX) - MOVW R10, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - -repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short: - LEAL -4(R10), R10 - MOVW $0x0015, (CX) - MOVB R10, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - -repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - XORQ DI, DI - LEAL 1(DI)(R10*4), R10 - MOVB SI, 1(CX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, R10 - MOVB R10, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - -two_byte_offset_short_match_nolit_encodeBlockAsm8B: - MOVL R10, DI - SHLL $0x02, DI - CMPL R10, $0x0c - JAE emit_copy_three_match_nolit_encodeBlockAsm8B - LEAL -15(DI), DI - MOVB SI, 1(CX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - -emit_copy_three_match_nolit_encodeBlockAsm8B: - LEAL -2(DI), DI - MOVB DI, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - -match_nolit_emitcopy_end_encodeBlockAsm8B: - CMPL DX, 8(SP) - JAE emit_remainder_encodeBlockAsm8B - MOVQ -2(BX)(DX*1), DI - CMPQ CX, (SP) - JB match_nolit_dst_ok_encodeBlockAsm8B - MOVQ $0x00000000, ret+56(FP) - RET - -match_nolit_dst_ok_encodeBlockAsm8B: - MOVQ $0x9e3779b1, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x20, R8 - IMULQ R9, R8 - SHRQ $0x38, R8 - SHLQ $0x20, SI - IMULQ R9, SI - SHRQ $0x38, SI - LEAL -2(DX), R9 - LEAQ (AX)(SI*4), R10 - MOVL (R10), SI - MOVL R9, (AX)(R8*4) - MOVL DX, (R10) - CMPL (BX)(SI*1), DI - JEQ match_nolit_loop_encodeBlockAsm8B - INCL DX - JMP search_loop_encodeBlockAsm8B - -emit_remainder_encodeBlockAsm8B: - MOVQ src_len+32(FP), AX - SUBL 12(SP), AX - LEAQ 3(CX)(AX*1), AX - CMPQ AX, (SP) - JB emit_remainder_ok_encodeBlockAsm8B - MOVQ $0x00000000, ret+56(FP) - RET - 
-emit_remainder_ok_encodeBlockAsm8B: - MOVQ src_len+32(FP), AX - MOVL 12(SP), DX - CMPL DX, AX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B - MOVL AX, SI - MOVL AX, 12(SP) - LEAQ (BX)(DX*1), AX - SUBL DX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeBlockAsm8B - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeBlockAsm8B - JB three_bytes_emit_remainder_encodeBlockAsm8B - -three_bytes_emit_remainder_encodeBlockAsm8B: - MOVB $0xf4, (CX) - MOVW DX, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_emit_remainder_encodeBlockAsm8B - -two_bytes_emit_remainder_encodeBlockAsm8B: - MOVB $0xf0, (CX) - MOVB DL, 1(CX) - ADDQ $0x02, CX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeBlockAsm8B - JMP memmove_long_emit_remainder_encodeBlockAsm8B - -one_byte_emit_remainder_encodeBlockAsm8B: - SHLB $0x02, DL - MOVB DL, (CX) - ADDQ $0x01, CX - -memmove_emit_remainder_encodeBlockAsm8B: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2: - MOVB (AX), SI - MOVB -1(AX)(BX*1), AL - MOVB SI, (CX) - MOVB AL, -1(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3: - MOVW (AX), SI - MOVB 2(AX), AL - MOVW SI, (CX) - MOVB AL, 2(CX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7: - MOVL (AX), SI - MOVL 
-4(AX)(BX*1), AX - MOVL SI, (CX) - MOVL AX, -4(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16: - MOVQ (AX), SI - MOVQ -8(AX)(BX*1), AX - MOVQ SI, (CX) - MOVQ AX, -8(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32: - MOVOU (AX), X0 - MOVOU -16(AX)(BX*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64: - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBlockAsm8B: - MOVQ DX, CX - JMP emit_literal_done_emit_remainder_encodeBlockAsm8B - -memmove_long_emit_remainder_encodeBlockAsm8B: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(AX)(R8*1), X4 - MOVOU -16(AX)(R8*1), X5 - MOVOA X4, -32(CX)(R8*1) - MOVOA X5, -16(CX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU 
X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - MOVQ DX, CX - -emit_literal_done_emit_remainder_encodeBlockAsm8B: - MOVQ dst_base+0(FP), AX - SUBQ AX, CX - MOVQ CX, ret+56(FP) - RET - -// func encodeBetterBlockAsm(dst []byte, src []byte, tmp *[589824]byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBetterBlockAsm(SB), $24-64 - MOVQ tmp+48(FP), AX - MOVQ dst_base+0(FP), CX - MOVQ $0x00001200, DX - MOVQ AX, BX - PXOR X0, X0 - -zero_loop_encodeBetterBlockAsm: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeBetterBlockAsm - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), DX - LEAQ -6(DX), BX - LEAQ -8(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), BX - -search_loop_encodeBetterBlockAsm: - MOVL DX, SI - SUBL 12(SP), SI - SHRL $0x07, SI - CMPL SI, $0x63 - JBE check_maxskip_ok_encodeBetterBlockAsm - LEAL 100(DX), SI - JMP check_maxskip_cont_encodeBetterBlockAsm - -check_maxskip_ok_encodeBetterBlockAsm: - LEAL 1(DX)(SI*1), SI - -check_maxskip_cont_encodeBetterBlockAsm: - CMPL SI, 8(SP) - JAE emit_remainder_encodeBetterBlockAsm - MOVQ (BX)(DX*1), DI - MOVL SI, 20(SP) - MOVQ $0x00cf1bbcdcbfa563, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x2f, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x32, R11 - MOVL (AX)(R10*4), SI - MOVL 524288(AX)(R11*4), R8 - MOVL DX, (AX)(R10*4) - MOVL DX, 524288(AX)(R11*4) - MOVQ (BX)(SI*1), R10 - MOVQ (BX)(R8*1), R11 - CMPQ R10, DI - JEQ candidate_match_encodeBetterBlockAsm - CMPQ R11, DI - JNE no_short_found_encodeBetterBlockAsm - MOVL R8, SI - JMP candidate_match_encodeBetterBlockAsm - -no_short_found_encodeBetterBlockAsm: - CMPL R10, DI - JEQ candidate_match_encodeBetterBlockAsm - CMPL R11, DI - JEQ 
candidateS_match_encodeBetterBlockAsm - MOVL 20(SP), DX - JMP search_loop_encodeBetterBlockAsm - -candidateS_match_encodeBetterBlockAsm: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x2f, R10 - MOVL (AX)(R10*4), SI - INCL DX - MOVL DX, (AX)(R10*4) - CMPL (BX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm - DECL DX - MOVL R8, SI - -candidate_match_encodeBetterBlockAsm: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBetterBlockAsm - -match_extend_back_loop_encodeBetterBlockAsm: - CMPL DX, DI - JBE match_extend_back_end_encodeBetterBlockAsm - MOVB -1(BX)(SI*1), R8 - MOVB -1(BX)(DX*1), R9 - CMPB R8, R9 - JNE match_extend_back_end_encodeBetterBlockAsm - LEAL -1(DX), DX - DECL SI - JZ match_extend_back_end_encodeBetterBlockAsm - JMP match_extend_back_loop_encodeBetterBlockAsm - -match_extend_back_end_encodeBetterBlockAsm: - MOVL DX, DI - SUBL 12(SP), DI - LEAQ 5(CX)(DI*1), DI - CMPQ DI, (SP) - JB match_dst_size_check_encodeBetterBlockAsm - MOVQ $0x00000000, ret+56(FP) - RET - -match_dst_size_check_encodeBetterBlockAsm: - MOVL DX, DI - ADDL $0x04, DX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL DX, R8 - LEAQ (BX)(DX*1), R9 - LEAQ (BX)(SI*1), R10 - - // matchLen - XORL R12, R12 - -matchlen_loopback_16_match_nolit_encodeBetterBlockAsm: - CMPL R8, $0x10 - JB matchlen_match8_match_nolit_encodeBetterBlockAsm - MOVQ (R9)(R12*1), R11 - MOVQ 8(R9)(R12*1), R13 - XORQ (R10)(R12*1), R11 - JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm - XORQ 8(R10)(R12*1), R13 - JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm - LEAL -16(R8), R8 - LEAL 16(R12), R12 - JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm - -matchlen_bsf_16match_nolit_encodeBetterBlockAsm: -#ifdef GOAMD64_v3 - TZCNTQ R13, R13 - -#else - BSFQ R13, R13 - -#endif - SARQ $0x03, R13 - LEAL 8(R12)(R13*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm - -matchlen_match8_match_nolit_encodeBetterBlockAsm: - CMPL R8, $0x08 - JB 
matchlen_match4_match_nolit_encodeBetterBlockAsm - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm - LEAL -8(R8), R8 - LEAL 8(R12), R12 - JMP matchlen_match4_match_nolit_encodeBetterBlockAsm - -matchlen_bsf_8_match_nolit_encodeBetterBlockAsm: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm - -matchlen_match4_match_nolit_encodeBetterBlockAsm: - CMPL R8, $0x04 - JB matchlen_match2_match_nolit_encodeBetterBlockAsm - MOVL (R9)(R12*1), R11 - CMPL (R10)(R12*1), R11 - JNE matchlen_match2_match_nolit_encodeBetterBlockAsm - LEAL -4(R8), R8 - LEAL 4(R12), R12 - -matchlen_match2_match_nolit_encodeBetterBlockAsm: - CMPL R8, $0x01 - JE matchlen_match1_match_nolit_encodeBetterBlockAsm - JB match_nolit_end_encodeBetterBlockAsm - MOVW (R9)(R12*1), R11 - CMPW (R10)(R12*1), R11 - JNE matchlen_match1_match_nolit_encodeBetterBlockAsm - LEAL 2(R12), R12 - SUBL $0x02, R8 - JZ match_nolit_end_encodeBetterBlockAsm - -matchlen_match1_match_nolit_encodeBetterBlockAsm: - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeBetterBlockAsm - LEAL 1(R12), R12 - -match_nolit_end_encodeBetterBlockAsm: - MOVL DX, R8 - SUBL SI, R8 - - // Check if repeat - CMPL 16(SP), R8 - JEQ match_is_repeat_encodeBetterBlockAsm - CMPL R12, $0x01 - JA match_length_ok_encodeBetterBlockAsm - CMPL R8, $0x0000ffff - JBE match_length_ok_encodeBetterBlockAsm - MOVL 20(SP), DX - INCL DX - JMP search_loop_encodeBetterBlockAsm - -match_length_ok_encodeBetterBlockAsm: - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeBetterBlockAsm - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JB one_byte_match_emit_encodeBetterBlockAsm - CMPL SI, $0x00000100 - JB two_bytes_match_emit_encodeBetterBlockAsm - CMPL SI, $0x00010000 - JB 
three_bytes_match_emit_encodeBetterBlockAsm - CMPL SI, $0x01000000 - JB four_bytes_match_emit_encodeBetterBlockAsm - MOVB $0xfc, (CX) - MOVL SI, 1(CX) - ADDQ $0x05, CX - JMP memmove_long_match_emit_encodeBetterBlockAsm - -four_bytes_match_emit_encodeBetterBlockAsm: - MOVL SI, R11 - SHRL $0x10, R11 - MOVB $0xf8, (CX) - MOVW SI, 1(CX) - MOVB R11, 3(CX) - ADDQ $0x04, CX - JMP memmove_long_match_emit_encodeBetterBlockAsm - -three_bytes_match_emit_encodeBetterBlockAsm: - MOVB $0xf4, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_encodeBetterBlockAsm - -two_bytes_match_emit_encodeBetterBlockAsm: - MOVB $0xf0, (CX) - MOVB SI, 1(CX) - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_match_emit_encodeBetterBlockAsm - JMP memmove_long_match_emit_encodeBetterBlockAsm - -one_byte_match_emit_encodeBetterBlockAsm: - SHLB $0x02, SI - MOVB SI, (CX) - ADDQ $0x01, CX - -memmove_match_emit_encodeBetterBlockAsm: - LEAQ (CX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (CX) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (CX) - MOVL R10, -4(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (CX) - MOVQ R10, -8(CX)(R9*1) - JMP 
memmove_end_copy_match_emit_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_match_emit_encodeBetterBlockAsm: - MOVQ SI, CX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm - -memmove_long_match_emit_encodeBetterBlockAsm: - LEAQ (CX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ CX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(CX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(CX)(R14*1) - MOVOA X5, -16(CX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ SI, CX - -emit_literal_done_match_emit_encodeBetterBlockAsm: - ADDL R12, DX - ADDL $0x04, R12 - MOVL DX, 12(SP) - - // emitCopy - CMPL R8, $0x00010000 - JB 
two_byte_offset_match_nolit_encodeBetterBlockAsm - CMPL R12, $0x40 - JBE four_bytes_remain_match_nolit_encodeBetterBlockAsm - MOVB $0xff, (CX) - MOVL R8, 1(CX) - LEAL -64(R12), R12 - ADDQ $0x05, CX - CMPL R12, $0x04 - JB four_bytes_remain_match_nolit_encodeBetterBlockAsm - - // emitRepeat -emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy: - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy - CMPL R8, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: - CMPL R12, $0x00000104 - JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy - CMPL R12, $0x00010100 - JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy - CMPL R12, $0x0100ffff - JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy - LEAL -16842747(R12), R12 - MOVL $0xfffb001d, (CX) - MOVB $0xff, 4(CX) - ADDQ $0x05, CX - JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy - -repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy: - LEAL -65536(R12), R12 - MOVL R12, R8 - MOVW $0x001d, (CX) - MOVW R12, 2(CX) - SARL $0x10, R8 - MOVB R8, 4(CX) - ADDQ $0x05, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy: - LEAL -256(R12), R12 - MOVW $0x0019, (CX) - MOVW R12, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy: - LEAL -4(R12), R12 - MOVW $0x0015, (CX) - MOVB R12, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: - XORQ SI, SI - 
LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(CX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -four_bytes_remain_match_nolit_encodeBetterBlockAsm: - TESTL R12, R12 - JZ match_nolit_emitcopy_end_encodeBetterBlockAsm - XORL SI, SI - LEAL -1(SI)(R12*4), R12 - MOVB R12, (CX) - MOVL R8, 1(CX) - ADDQ $0x05, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -two_byte_offset_match_nolit_encodeBetterBlockAsm: - CMPL R12, $0x40 - JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm - CMPL R8, $0x00000800 - JAE long_offset_short_match_nolit_encodeBetterBlockAsm - MOVL $0x00000001, SI - LEAL 16(SI), SI - MOVB R8, 1(CX) - MOVL R8, R9 - SHRL $0x08, R9 - SHLL $0x05, R9 - ORL R9, SI - MOVB SI, (CX) - ADDQ $0x02, CX - SUBL $0x08, R12 - - // emitRepeat - LEAL -4(R12), R12 - JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b - -emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b - CMPL R8, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: - CMPL R12, $0x00000104 - JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b - CMPL R12, $0x00010100 - JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b - CMPL R12, $0x0100ffff - JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b - LEAL -16842747(R12), R12 - MOVL $0xfffb001d, (CX) - MOVB $0xff, 4(CX) - ADDQ $0x05, CX - JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b - -repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: - LEAL -65536(R12), R12 - MOVL R12, R8 - MOVW $0x001d, (CX) - MOVW R12, 
2(CX) - SARL $0x10, R8 - MOVB R8, 4(CX) - ADDQ $0x05, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: - LEAL -256(R12), R12 - MOVW $0x0019, (CX) - MOVW R12, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: - LEAL -4(R12), R12 - MOVW $0x0015, (CX) - MOVB R12, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(CX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -long_offset_short_match_nolit_encodeBetterBlockAsm: - MOVB $0xee, (CX) - MOVW R8, 1(CX) - LEAL -60(R12), R12 - ADDQ $0x03, CX - - // emitRepeat -emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short: - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short - CMPL R8, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: - CMPL R12, $0x00000104 - JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short - CMPL R12, $0x00010100 - JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short - CMPL R12, $0x0100ffff - JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short - LEAL -16842747(R12), R12 - MOVL $0xfffb001d, (CX) - MOVB $0xff, 4(CX) - ADDQ $0x05, CX - JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short - 
-repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short: - LEAL -65536(R12), R12 - MOVL R12, R8 - MOVW $0x001d, (CX) - MOVW R12, 2(CX) - SARL $0x10, R8 - MOVB R8, 4(CX) - ADDQ $0x05, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short: - LEAL -256(R12), R12 - MOVW $0x0019, (CX) - MOVW R12, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short: - LEAL -4(R12), R12 - MOVW $0x0015, (CX) - MOVB R12, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(CX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -two_byte_offset_short_match_nolit_encodeBetterBlockAsm: - MOVL R12, SI - SHLL $0x02, SI - CMPL R12, $0x0c - JAE emit_copy_three_match_nolit_encodeBetterBlockAsm - CMPL R8, $0x00000800 - JAE emit_copy_three_match_nolit_encodeBetterBlockAsm - LEAL -15(SI), SI - MOVB R8, 1(CX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -emit_copy_three_match_nolit_encodeBetterBlockAsm: - LEAL -2(SI), SI - MOVB SI, (CX) - MOVW R8, 1(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -match_is_repeat_encodeBetterBlockAsm: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JB one_byte_match_emit_repeat_encodeBetterBlockAsm - CMPL SI, $0x00000100 - JB 
two_bytes_match_emit_repeat_encodeBetterBlockAsm - CMPL SI, $0x00010000 - JB three_bytes_match_emit_repeat_encodeBetterBlockAsm - CMPL SI, $0x01000000 - JB four_bytes_match_emit_repeat_encodeBetterBlockAsm - MOVB $0xfc, (CX) - MOVL SI, 1(CX) - ADDQ $0x05, CX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm - -four_bytes_match_emit_repeat_encodeBetterBlockAsm: - MOVL SI, R11 - SHRL $0x10, R11 - MOVB $0xf8, (CX) - MOVW SI, 1(CX) - MOVB R11, 3(CX) - ADDQ $0x04, CX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm - -three_bytes_match_emit_repeat_encodeBetterBlockAsm: - MOVB $0xf4, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm - -two_bytes_match_emit_repeat_encodeBetterBlockAsm: - MOVB $0xf0, (CX) - MOVB SI, 1(CX) - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_match_emit_repeat_encodeBetterBlockAsm - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm - -one_byte_match_emit_repeat_encodeBetterBlockAsm: - SHLB $0x02, SI - MOVB SI, (CX) - ADDQ $0x01, CX - -memmove_match_emit_repeat_encodeBetterBlockAsm: - LEAQ (CX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64 - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (CX) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (CX) - MOVL R10, -4(CX)(R9*1) - JMP 
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (CX) - MOVQ R10, -8(CX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm: - MOVQ SI, CX - JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm - -memmove_long_match_emit_repeat_encodeBetterBlockAsm: - LEAQ (CX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ CX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(CX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(CX)(R14*1) - MOVOA X5, -16(CX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE 
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ SI, CX - -emit_literal_done_match_emit_repeat_encodeBetterBlockAsm: - ADDL R12, DX - ADDL $0x04, R12 - MOVL DX, 12(SP) - - // emitRepeat -emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm: - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm - CMPL R8, $0x00000800 - JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm - -cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: - CMPL R12, $0x00000104 - JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm - CMPL R12, $0x00010100 - JB repeat_four_match_nolit_repeat_encodeBetterBlockAsm - CMPL R12, $0x0100ffff - JB repeat_five_match_nolit_repeat_encodeBetterBlockAsm - LEAL -16842747(R12), R12 - MOVL $0xfffb001d, (CX) - MOVB $0xff, 4(CX) - ADDQ $0x05, CX - JMP emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm - -repeat_five_match_nolit_repeat_encodeBetterBlockAsm: - LEAL -65536(R12), R12 - MOVL R12, R8 - MOVW $0x001d, (CX) - MOVW R12, 2(CX) - SARL $0x10, R8 - MOVB R8, 4(CX) - ADDQ $0x05, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_four_match_nolit_repeat_encodeBetterBlockAsm: - LEAL -256(R12), R12 - MOVW $0x0019, (CX) - MOVW R12, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_three_match_nolit_repeat_encodeBetterBlockAsm: - LEAL -4(R12), R12 - MOVW $0x0015, (CX) - MOVB R12, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_match_nolit_repeat_encodeBetterBlockAsm: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: - XORQ SI, SI - LEAL 
1(SI)(R12*4), R12 - MOVB R8, 1(CX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (CX) - ADDQ $0x02, CX - -match_nolit_emitcopy_end_encodeBetterBlockAsm: - CMPL DX, 8(SP) - JAE emit_remainder_encodeBetterBlockAsm - CMPQ CX, (SP) - JB match_nolit_dst_ok_encodeBetterBlockAsm - MOVQ $0x00000000, ret+56(FP) - RET - -match_nolit_dst_ok_encodeBetterBlockAsm: - MOVQ $0x00cf1bbcdcbfa563, SI - MOVQ $0x9e3779b1, R8 - LEAQ 1(DI), DI - LEAQ -2(DX), R9 - MOVQ (BX)(DI*1), R10 - MOVQ 1(BX)(DI*1), R11 - MOVQ (BX)(R9*1), R12 - MOVQ 1(BX)(R9*1), R13 - SHLQ $0x08, R10 - IMULQ SI, R10 - SHRQ $0x2f, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x08, R12 - IMULQ SI, R12 - SHRQ $0x2f, R12 - SHLQ $0x20, R13 - IMULQ R8, R13 - SHRQ $0x32, R13 - LEAQ 1(DI), R8 - LEAQ 1(R9), R14 - MOVL DI, (AX)(R10*4) - MOVL R9, (AX)(R12*4) - MOVL R8, 524288(AX)(R11*4) - MOVL R14, 524288(AX)(R13*4) - LEAQ 1(R9)(DI*1), R8 - SHRQ $0x01, R8 - ADDQ $0x01, DI - SUBQ $0x01, R9 - -index_loop_encodeBetterBlockAsm: - CMPQ R8, R9 - JAE search_loop_encodeBetterBlockAsm - MOVQ (BX)(DI*1), R10 - MOVQ (BX)(R8*1), R11 - SHLQ $0x08, R10 - IMULQ SI, R10 - SHRQ $0x2f, R10 - SHLQ $0x08, R11 - IMULQ SI, R11 - SHRQ $0x2f, R11 - MOVL DI, (AX)(R10*4) - MOVL R8, (AX)(R11*4) - ADDQ $0x02, DI - ADDQ $0x02, R8 - JMP index_loop_encodeBetterBlockAsm - -emit_remainder_encodeBetterBlockAsm: - MOVQ src_len+32(FP), AX - SUBL 12(SP), AX - LEAQ 5(CX)(AX*1), AX - CMPQ AX, (SP) - JB emit_remainder_ok_encodeBetterBlockAsm - MOVQ $0x00000000, ret+56(FP) - RET - -emit_remainder_ok_encodeBetterBlockAsm: - MOVQ src_len+32(FP), AX - MOVL 12(SP), DX - CMPL DX, AX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm - MOVL AX, SI - MOVL AX, 12(SP) - LEAQ (BX)(DX*1), AX - SUBL DX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeBetterBlockAsm - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeBetterBlockAsm - CMPL DX, $0x00010000 - JB three_bytes_emit_remainder_encodeBetterBlockAsm - 
CMPL DX, $0x01000000 - JB four_bytes_emit_remainder_encodeBetterBlockAsm - MOVB $0xfc, (CX) - MOVL DX, 1(CX) - ADDQ $0x05, CX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm - -four_bytes_emit_remainder_encodeBetterBlockAsm: - MOVL DX, BX - SHRL $0x10, BX - MOVB $0xf8, (CX) - MOVW DX, 1(CX) - MOVB BL, 3(CX) - ADDQ $0x04, CX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm - -three_bytes_emit_remainder_encodeBetterBlockAsm: - MOVB $0xf4, (CX) - MOVW DX, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm - -two_bytes_emit_remainder_encodeBetterBlockAsm: - MOVB $0xf0, (CX) - MOVB DL, 1(CX) - ADDQ $0x02, CX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeBetterBlockAsm - JMP memmove_long_emit_remainder_encodeBetterBlockAsm - -one_byte_emit_remainder_encodeBetterBlockAsm: - SHLB $0x02, DL - MOVB DL, (CX) - ADDQ $0x01, CX - -memmove_emit_remainder_encodeBetterBlockAsm: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2: - MOVB (AX), SI - MOVB -1(AX)(BX*1), AL - MOVB SI, (CX) - MOVB AL, -1(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3: - MOVW (AX), SI - MOVB 2(AX), AL - MOVW SI, (CX) - MOVB AL, 2(CX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm - 
-emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7: - MOVL (AX), SI - MOVL -4(AX)(BX*1), AX - MOVL SI, (CX) - MOVL AX, -4(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16: - MOVQ (AX), SI - MOVQ -8(AX)(BX*1), AX - MOVQ SI, (CX) - MOVQ AX, -8(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (AX), X0 - MOVOU -16(AX)(BX*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBetterBlockAsm: - MOVQ DX, CX - JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm - -memmove_long_emit_remainder_encodeBetterBlockAsm: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(AX)(R8*1), X4 - MOVOU -16(AX)(R8*1), X5 - MOVOA X4, -32(CX)(R8*1) - MOVOA X5, -16(CX)(R8*1) - ADDQ 
$0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - MOVQ DX, CX - -emit_literal_done_emit_remainder_encodeBetterBlockAsm: - MOVQ dst_base+0(FP), AX - SUBQ AX, CX - MOVQ CX, ret+56(FP) - RET - -// func encodeBetterBlockAsm4MB(dst []byte, src []byte, tmp *[589824]byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBetterBlockAsm4MB(SB), $24-64 - MOVQ tmp+48(FP), AX - MOVQ dst_base+0(FP), CX - MOVQ $0x00001200, DX - MOVQ AX, BX - PXOR X0, X0 - -zero_loop_encodeBetterBlockAsm4MB: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeBetterBlockAsm4MB - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), DX - LEAQ -6(DX), BX - LEAQ -8(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), BX - -search_loop_encodeBetterBlockAsm4MB: - MOVL DX, SI - SUBL 12(SP), SI - SHRL $0x07, SI - CMPL SI, $0x63 - JBE check_maxskip_ok_encodeBetterBlockAsm4MB - LEAL 100(DX), SI - JMP check_maxskip_cont_encodeBetterBlockAsm4MB - -check_maxskip_ok_encodeBetterBlockAsm4MB: - LEAL 1(DX)(SI*1), SI - -check_maxskip_cont_encodeBetterBlockAsm4MB: - CMPL SI, 8(SP) - JAE emit_remainder_encodeBetterBlockAsm4MB - MOVQ (BX)(DX*1), DI - MOVL SI, 20(SP) - MOVQ $0x00cf1bbcdcbfa563, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x2f, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x32, R11 - MOVL (AX)(R10*4), SI - MOVL 524288(AX)(R11*4), R8 - MOVL DX, (AX)(R10*4) - MOVL DX, 524288(AX)(R11*4) - MOVQ (BX)(SI*1), R10 - MOVQ (BX)(R8*1), R11 - CMPQ R10, DI - JEQ candidate_match_encodeBetterBlockAsm4MB - CMPQ R11, DI - JNE no_short_found_encodeBetterBlockAsm4MB - 
MOVL R8, SI - JMP candidate_match_encodeBetterBlockAsm4MB - -no_short_found_encodeBetterBlockAsm4MB: - CMPL R10, DI - JEQ candidate_match_encodeBetterBlockAsm4MB - CMPL R11, DI - JEQ candidateS_match_encodeBetterBlockAsm4MB - MOVL 20(SP), DX - JMP search_loop_encodeBetterBlockAsm4MB - -candidateS_match_encodeBetterBlockAsm4MB: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x2f, R10 - MOVL (AX)(R10*4), SI - INCL DX - MOVL DX, (AX)(R10*4) - CMPL (BX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm4MB - DECL DX - MOVL R8, SI - -candidate_match_encodeBetterBlockAsm4MB: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBetterBlockAsm4MB - -match_extend_back_loop_encodeBetterBlockAsm4MB: - CMPL DX, DI - JBE match_extend_back_end_encodeBetterBlockAsm4MB - MOVB -1(BX)(SI*1), R8 - MOVB -1(BX)(DX*1), R9 - CMPB R8, R9 - JNE match_extend_back_end_encodeBetterBlockAsm4MB - LEAL -1(DX), DX - DECL SI - JZ match_extend_back_end_encodeBetterBlockAsm4MB - JMP match_extend_back_loop_encodeBetterBlockAsm4MB - -match_extend_back_end_encodeBetterBlockAsm4MB: - MOVL DX, DI - SUBL 12(SP), DI - LEAQ 4(CX)(DI*1), DI - CMPQ DI, (SP) - JB match_dst_size_check_encodeBetterBlockAsm4MB - MOVQ $0x00000000, ret+56(FP) - RET - -match_dst_size_check_encodeBetterBlockAsm4MB: - MOVL DX, DI - ADDL $0x04, DX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL DX, R8 - LEAQ (BX)(DX*1), R9 - LEAQ (BX)(SI*1), R10 - - // matchLen - XORL R12, R12 - -matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB: - CMPL R8, $0x10 - JB matchlen_match8_match_nolit_encodeBetterBlockAsm4MB - MOVQ (R9)(R12*1), R11 - MOVQ 8(R9)(R12*1), R13 - XORQ (R10)(R12*1), R11 - JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB - XORQ 8(R10)(R12*1), R13 - JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB - LEAL -16(R8), R8 - LEAL 16(R12), R12 - JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB - -matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB: -#ifdef GOAMD64_v3 
- TZCNTQ R13, R13 - -#else - BSFQ R13, R13 - -#endif - SARQ $0x03, R13 - LEAL 8(R12)(R13*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm4MB - -matchlen_match8_match_nolit_encodeBetterBlockAsm4MB: - CMPL R8, $0x08 - JB matchlen_match4_match_nolit_encodeBetterBlockAsm4MB - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB - LEAL -8(R8), R8 - LEAL 8(R12), R12 - JMP matchlen_match4_match_nolit_encodeBetterBlockAsm4MB - -matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm4MB - -matchlen_match4_match_nolit_encodeBetterBlockAsm4MB: - CMPL R8, $0x04 - JB matchlen_match2_match_nolit_encodeBetterBlockAsm4MB - MOVL (R9)(R12*1), R11 - CMPL (R10)(R12*1), R11 - JNE matchlen_match2_match_nolit_encodeBetterBlockAsm4MB - LEAL -4(R8), R8 - LEAL 4(R12), R12 - -matchlen_match2_match_nolit_encodeBetterBlockAsm4MB: - CMPL R8, $0x01 - JE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB - JB match_nolit_end_encodeBetterBlockAsm4MB - MOVW (R9)(R12*1), R11 - CMPW (R10)(R12*1), R11 - JNE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB - LEAL 2(R12), R12 - SUBL $0x02, R8 - JZ match_nolit_end_encodeBetterBlockAsm4MB - -matchlen_match1_match_nolit_encodeBetterBlockAsm4MB: - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeBetterBlockAsm4MB - LEAL 1(R12), R12 - -match_nolit_end_encodeBetterBlockAsm4MB: - MOVL DX, R8 - SUBL SI, R8 - - // Check if repeat - CMPL 16(SP), R8 - JEQ match_is_repeat_encodeBetterBlockAsm4MB - CMPL R12, $0x01 - JA match_length_ok_encodeBetterBlockAsm4MB - CMPL R8, $0x0000ffff - JBE match_length_ok_encodeBetterBlockAsm4MB - MOVL 20(SP), DX - INCL DX - JMP search_loop_encodeBetterBlockAsm4MB - -match_length_ok_encodeBetterBlockAsm4MB: - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ 
emit_literal_done_match_emit_encodeBetterBlockAsm4MB - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JB one_byte_match_emit_encodeBetterBlockAsm4MB - CMPL SI, $0x00000100 - JB two_bytes_match_emit_encodeBetterBlockAsm4MB - CMPL SI, $0x00010000 - JB three_bytes_match_emit_encodeBetterBlockAsm4MB - MOVL SI, R11 - SHRL $0x10, R11 - MOVB $0xf8, (CX) - MOVW SI, 1(CX) - MOVB R11, 3(CX) - ADDQ $0x04, CX - JMP memmove_long_match_emit_encodeBetterBlockAsm4MB - -three_bytes_match_emit_encodeBetterBlockAsm4MB: - MOVB $0xf4, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_encodeBetterBlockAsm4MB - -two_bytes_match_emit_encodeBetterBlockAsm4MB: - MOVB $0xf0, (CX) - MOVB SI, 1(CX) - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_match_emit_encodeBetterBlockAsm4MB - JMP memmove_long_match_emit_encodeBetterBlockAsm4MB - -one_byte_match_emit_encodeBetterBlockAsm4MB: - SHLB $0x02, SI - MOVB SI, (CX) - ADDQ $0x01, CX - -memmove_match_emit_encodeBetterBlockAsm4MB: - LEAQ (CX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (CX) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (CX) - MOVL R10, -4(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB - 
-emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (CX) - MOVQ R10, -8(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_match_emit_encodeBetterBlockAsm4MB: - MOVQ SI, CX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB - -memmove_long_match_emit_encodeBetterBlockAsm4MB: - LEAQ (CX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ CX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(CX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(CX)(R14*1) - MOVOA X5, -16(CX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) 
- MOVQ SI, CX - -emit_literal_done_match_emit_encodeBetterBlockAsm4MB: - ADDL R12, DX - ADDL $0x04, R12 - MOVL DX, 12(SP) - - // emitCopy - CMPL R8, $0x00010000 - JB two_byte_offset_match_nolit_encodeBetterBlockAsm4MB - CMPL R12, $0x40 - JBE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB - MOVB $0xff, (CX) - MOVL R8, 1(CX) - LEAL -64(R12), R12 - ADDQ $0x05, CX - CMPL R12, $0x04 - JB four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB - - // emitRepeat - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy - CMPL R8, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - CMPL R12, $0x00000104 - JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy - CMPL R12, $0x00010100 - JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy - LEAL -65536(R12), R12 - MOVL R12, R8 - MOVW $0x001d, (CX) - MOVW R12, 2(CX) - SARL $0x10, R8 - MOVB R8, 4(CX) - ADDQ $0x05, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - LEAL -256(R12), R12 - MOVW $0x0019, (CX) - MOVW R12, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - LEAL -4(R12), R12 - MOVW $0x0015, (CX) - MOVB R12, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(CX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (CX) - ADDQ $0x02, CX - JMP 
match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB: - TESTL R12, R12 - JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - XORL SI, SI - LEAL -1(SI)(R12*4), R12 - MOVB R12, (CX) - MOVL R8, 1(CX) - ADDQ $0x05, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -two_byte_offset_match_nolit_encodeBetterBlockAsm4MB: - CMPL R12, $0x40 - JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB - CMPL R8, $0x00000800 - JAE long_offset_short_match_nolit_encodeBetterBlockAsm4MB - MOVL $0x00000001, SI - LEAL 16(SI), SI - MOVB R8, 1(CX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, SI - MOVB SI, (CX) - ADDQ $0x02, CX - SUBL $0x08, R12 - - // emitRepeat - LEAL -4(R12), R12 - JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b - CMPL R8, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: - CMPL R12, $0x00000104 - JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b - CMPL R12, $0x00010100 - JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b - LEAL -65536(R12), R12 - MOVL R12, R8 - MOVW $0x001d, (CX) - MOVW R12, 2(CX) - SARL $0x10, R8 - MOVB R8, 4(CX) - ADDQ $0x05, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: - LEAL -256(R12), R12 - MOVW $0x0019, (CX) - MOVW R12, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: - LEAL -4(R12), R12 - MOVW $0x0015, (CX) - MOVB R12, 2(CX) - ADDQ $0x03, CX - JMP 
match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(CX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -long_offset_short_match_nolit_encodeBetterBlockAsm4MB: - MOVB $0xee, (CX) - MOVW R8, 1(CX) - LEAL -60(R12), R12 - ADDQ $0x03, CX - - // emitRepeat - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short - CMPL R8, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - CMPL R12, $0x00000104 - JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short - CMPL R12, $0x00010100 - JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short - LEAL -65536(R12), R12 - MOVL R12, R8 - MOVW $0x001d, (CX) - MOVW R12, 2(CX) - SARL $0x10, R8 - MOVB R8, 4(CX) - ADDQ $0x05, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - LEAL -256(R12), R12 - MOVW $0x0019, (CX) - MOVW R12, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - LEAL -4(R12), R12 - MOVW $0x0015, (CX) - MOVB R12, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (CX) - ADDQ $0x02, CX - JMP 
match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(CX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB: - MOVL R12, SI - SHLL $0x02, SI - CMPL R12, $0x0c - JAE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB - CMPL R8, $0x00000800 - JAE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB - LEAL -15(SI), SI - MOVB R8, 1(CX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -emit_copy_three_match_nolit_encodeBetterBlockAsm4MB: - LEAL -2(SI), SI - MOVB SI, (CX) - MOVW R8, 1(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -match_is_repeat_encodeBetterBlockAsm4MB: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JB one_byte_match_emit_repeat_encodeBetterBlockAsm4MB - CMPL SI, $0x00000100 - JB two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB - CMPL SI, $0x00010000 - JB three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB - MOVL SI, R11 - SHRL $0x10, R11 - MOVB $0xf8, (CX) - MOVW SI, 1(CX) - MOVB R11, 3(CX) - ADDQ $0x04, CX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB - -three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB: - MOVB $0xf4, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB - -two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB: - MOVB $0xf0, (CX) - MOVB SI, 1(CX) - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_match_emit_repeat_encodeBetterBlockAsm4MB - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB - 
-one_byte_match_emit_repeat_encodeBetterBlockAsm4MB: - SHLB $0x02, SI - MOVB SI, (CX) - ADDQ $0x01, CX - -memmove_match_emit_repeat_encodeBetterBlockAsm4MB: - LEAQ (CX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64 - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (CX) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (CX) - MOVL R10, -4(CX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (CX) - MOVQ R10, -8(CX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB: - MOVQ SI, CX - JMP 
emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB - -memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB: - LEAQ (CX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ CX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(CX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(CX)(R14*1) - MOVOA X5, -16(CX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ SI, CX - -emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB: - ADDL R12, DX - ADDL $0x04, R12 - MOVL DX, 12(SP) - - // emitRepeat - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB - CMPL R8, $0x00000800 - JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB - -cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB: - CMPL R12, $0x00000104 - JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB - CMPL R12, $0x00010100 - JB repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB - LEAL -65536(R12), R12 - MOVL R12, R8 - MOVW $0x001d, (CX) 
- MOVW R12, 2(CX) - SARL $0x10, R8 - MOVB R8, 4(CX) - ADDQ $0x05, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB: - LEAL -256(R12), R12 - MOVW $0x0019, (CX) - MOVW R12, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB: - LEAL -4(R12), R12 - MOVW $0x0015, (CX) - MOVB R12, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(CX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (CX) - ADDQ $0x02, CX - -match_nolit_emitcopy_end_encodeBetterBlockAsm4MB: - CMPL DX, 8(SP) - JAE emit_remainder_encodeBetterBlockAsm4MB - CMPQ CX, (SP) - JB match_nolit_dst_ok_encodeBetterBlockAsm4MB - MOVQ $0x00000000, ret+56(FP) - RET - -match_nolit_dst_ok_encodeBetterBlockAsm4MB: - MOVQ $0x00cf1bbcdcbfa563, SI - MOVQ $0x9e3779b1, R8 - LEAQ 1(DI), DI - LEAQ -2(DX), R9 - MOVQ (BX)(DI*1), R10 - MOVQ 1(BX)(DI*1), R11 - MOVQ (BX)(R9*1), R12 - MOVQ 1(BX)(R9*1), R13 - SHLQ $0x08, R10 - IMULQ SI, R10 - SHRQ $0x2f, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x08, R12 - IMULQ SI, R12 - SHRQ $0x2f, R12 - SHLQ $0x20, R13 - IMULQ R8, R13 - SHRQ $0x32, R13 - LEAQ 1(DI), R8 - LEAQ 1(R9), R14 - MOVL DI, (AX)(R10*4) - MOVL R9, (AX)(R12*4) - MOVL R8, 524288(AX)(R11*4) - MOVL R14, 524288(AX)(R13*4) - LEAQ 1(R9)(DI*1), R8 - SHRQ $0x01, R8 - ADDQ $0x01, DI - SUBQ $0x01, R9 - -index_loop_encodeBetterBlockAsm4MB: - CMPQ R8, R9 - JAE search_loop_encodeBetterBlockAsm4MB - MOVQ (BX)(DI*1), R10 - MOVQ (BX)(R8*1), R11 - SHLQ $0x08, R10 - IMULQ SI, R10 - SHRQ $0x2f, R10 - SHLQ $0x08, R11 - IMULQ SI, R11 - SHRQ 
$0x2f, R11 - MOVL DI, (AX)(R10*4) - MOVL R8, (AX)(R11*4) - ADDQ $0x02, DI - ADDQ $0x02, R8 - JMP index_loop_encodeBetterBlockAsm4MB - -emit_remainder_encodeBetterBlockAsm4MB: - MOVQ src_len+32(FP), AX - SUBL 12(SP), AX - LEAQ 4(CX)(AX*1), AX - CMPQ AX, (SP) - JB emit_remainder_ok_encodeBetterBlockAsm4MB - MOVQ $0x00000000, ret+56(FP) - RET - -emit_remainder_ok_encodeBetterBlockAsm4MB: - MOVQ src_len+32(FP), AX - MOVL 12(SP), DX - CMPL DX, AX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB - MOVL AX, SI - MOVL AX, 12(SP) - LEAQ (BX)(DX*1), AX - SUBL DX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeBetterBlockAsm4MB - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeBetterBlockAsm4MB - CMPL DX, $0x00010000 - JB three_bytes_emit_remainder_encodeBetterBlockAsm4MB - MOVL DX, BX - SHRL $0x10, BX - MOVB $0xf8, (CX) - MOVW DX, 1(CX) - MOVB BL, 3(CX) - ADDQ $0x04, CX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB - -three_bytes_emit_remainder_encodeBetterBlockAsm4MB: - MOVB $0xf4, (CX) - MOVW DX, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB - -two_bytes_emit_remainder_encodeBetterBlockAsm4MB: - MOVB $0xf0, (CX) - MOVB DL, 1(CX) - ADDQ $0x02, CX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeBetterBlockAsm4MB - JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB - -one_byte_emit_remainder_encodeBetterBlockAsm4MB: - SHLB $0x02, DL - MOVB DL, (CX) - ADDQ $0x01, CX - -memmove_emit_remainder_encodeBetterBlockAsm4MB: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16 - CMPQ BX, $0x20 - JBE 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2: - MOVB (AX), SI - MOVB -1(AX)(BX*1), AL - MOVB SI, (CX) - MOVB AL, -1(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3: - MOVW (AX), SI - MOVB 2(AX), AL - MOVW SI, (CX) - MOVB AL, 2(CX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7: - MOVL (AX), SI - MOVL -4(AX)(BX*1), AX - MOVL SI, (CX) - MOVL AX, -4(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16: - MOVQ (AX), SI - MOVQ -8(AX)(BX*1), AX - MOVQ SI, (CX) - MOVQ AX, -8(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32: - MOVOU (AX), X0 - MOVOU -16(AX)(BX*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64: - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB: - MOVQ DX, CX - JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB - -memmove_long_emit_remainder_encodeBetterBlockAsm4MB: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA 
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(AX)(R8*1), X4 - MOVOU -16(AX)(R8*1), X5 - MOVOA X4, -32(CX)(R8*1) - MOVOA X5, -16(CX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - MOVQ DX, CX - -emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB: - MOVQ dst_base+0(FP), AX - SUBQ AX, CX - MOVQ CX, ret+56(FP) - RET - -// func encodeBetterBlockAsm12B(dst []byte, src []byte, tmp *[81920]byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBetterBlockAsm12B(SB), $24-64 - MOVQ tmp+48(FP), AX - MOVQ dst_base+0(FP), CX - MOVQ $0x00000280, DX - MOVQ AX, BX - PXOR X0, X0 - -zero_loop_encodeBetterBlockAsm12B: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeBetterBlockAsm12B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), DX - LEAQ -6(DX), BX - LEAQ -8(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), BX - -search_loop_encodeBetterBlockAsm12B: - MOVL DX, SI - SUBL 12(SP), SI - SHRL $0x06, SI - LEAL 1(DX)(SI*1), SI - CMPL SI, 8(SP) - JAE emit_remainder_encodeBetterBlockAsm12B - MOVQ (BX)(DX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ 
$0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x34, R11 - MOVL (AX)(R10*4), SI - MOVL 65536(AX)(R11*4), R8 - MOVL DX, (AX)(R10*4) - MOVL DX, 65536(AX)(R11*4) - MOVQ (BX)(SI*1), R10 - MOVQ (BX)(R8*1), R11 - CMPQ R10, DI - JEQ candidate_match_encodeBetterBlockAsm12B - CMPQ R11, DI - JNE no_short_found_encodeBetterBlockAsm12B - MOVL R8, SI - JMP candidate_match_encodeBetterBlockAsm12B - -no_short_found_encodeBetterBlockAsm12B: - CMPL R10, DI - JEQ candidate_match_encodeBetterBlockAsm12B - CMPL R11, DI - JEQ candidateS_match_encodeBetterBlockAsm12B - MOVL 20(SP), DX - JMP search_loop_encodeBetterBlockAsm12B - -candidateS_match_encodeBetterBlockAsm12B: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - MOVL (AX)(R10*4), SI - INCL DX - MOVL DX, (AX)(R10*4) - CMPL (BX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm12B - DECL DX - MOVL R8, SI - -candidate_match_encodeBetterBlockAsm12B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBetterBlockAsm12B - -match_extend_back_loop_encodeBetterBlockAsm12B: - CMPL DX, DI - JBE match_extend_back_end_encodeBetterBlockAsm12B - MOVB -1(BX)(SI*1), R8 - MOVB -1(BX)(DX*1), R9 - CMPB R8, R9 - JNE match_extend_back_end_encodeBetterBlockAsm12B - LEAL -1(DX), DX - DECL SI - JZ match_extend_back_end_encodeBetterBlockAsm12B - JMP match_extend_back_loop_encodeBetterBlockAsm12B - -match_extend_back_end_encodeBetterBlockAsm12B: - MOVL DX, DI - SUBL 12(SP), DI - LEAQ 3(CX)(DI*1), DI - CMPQ DI, (SP) - JB match_dst_size_check_encodeBetterBlockAsm12B - MOVQ $0x00000000, ret+56(FP) - RET - -match_dst_size_check_encodeBetterBlockAsm12B: - MOVL DX, DI - ADDL $0x04, DX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL DX, R8 - LEAQ (BX)(DX*1), R9 - LEAQ (BX)(SI*1), R10 - - // matchLen - XORL R12, R12 - -matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B: - CMPL R8, $0x10 - JB 
matchlen_match8_match_nolit_encodeBetterBlockAsm12B - MOVQ (R9)(R12*1), R11 - MOVQ 8(R9)(R12*1), R13 - XORQ (R10)(R12*1), R11 - JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B - XORQ 8(R10)(R12*1), R13 - JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B - LEAL -16(R8), R8 - LEAL 16(R12), R12 - JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B - -matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B: -#ifdef GOAMD64_v3 - TZCNTQ R13, R13 - -#else - BSFQ R13, R13 - -#endif - SARQ $0x03, R13 - LEAL 8(R12)(R13*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm12B - -matchlen_match8_match_nolit_encodeBetterBlockAsm12B: - CMPL R8, $0x08 - JB matchlen_match4_match_nolit_encodeBetterBlockAsm12B - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B - LEAL -8(R8), R8 - LEAL 8(R12), R12 - JMP matchlen_match4_match_nolit_encodeBetterBlockAsm12B - -matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm12B - -matchlen_match4_match_nolit_encodeBetterBlockAsm12B: - CMPL R8, $0x04 - JB matchlen_match2_match_nolit_encodeBetterBlockAsm12B - MOVL (R9)(R12*1), R11 - CMPL (R10)(R12*1), R11 - JNE matchlen_match2_match_nolit_encodeBetterBlockAsm12B - LEAL -4(R8), R8 - LEAL 4(R12), R12 - -matchlen_match2_match_nolit_encodeBetterBlockAsm12B: - CMPL R8, $0x01 - JE matchlen_match1_match_nolit_encodeBetterBlockAsm12B - JB match_nolit_end_encodeBetterBlockAsm12B - MOVW (R9)(R12*1), R11 - CMPW (R10)(R12*1), R11 - JNE matchlen_match1_match_nolit_encodeBetterBlockAsm12B - LEAL 2(R12), R12 - SUBL $0x02, R8 - JZ match_nolit_end_encodeBetterBlockAsm12B - -matchlen_match1_match_nolit_encodeBetterBlockAsm12B: - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeBetterBlockAsm12B - LEAL 1(R12), R12 - -match_nolit_end_encodeBetterBlockAsm12B: - MOVL DX, R8 - 
SUBL SI, R8 - - // Check if repeat - CMPL 16(SP), R8 - JEQ match_is_repeat_encodeBetterBlockAsm12B - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JB one_byte_match_emit_encodeBetterBlockAsm12B - CMPL SI, $0x00000100 - JB two_bytes_match_emit_encodeBetterBlockAsm12B - JB three_bytes_match_emit_encodeBetterBlockAsm12B - -three_bytes_match_emit_encodeBetterBlockAsm12B: - MOVB $0xf4, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_encodeBetterBlockAsm12B - -two_bytes_match_emit_encodeBetterBlockAsm12B: - MOVB $0xf0, (CX) - MOVB SI, 1(CX) - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_match_emit_encodeBetterBlockAsm12B - JMP memmove_long_match_emit_encodeBetterBlockAsm12B - -one_byte_match_emit_encodeBetterBlockAsm12B: - SHLB $0x02, SI - MOVB SI, (CX) - ADDQ $0x01, CX - -memmove_match_emit_encodeBetterBlockAsm12B: - LEAQ (CX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (CX) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (CX) - MOVL R10, -4(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16: - MOVQ 
(R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (CX) - MOVQ R10, -8(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_match_emit_encodeBetterBlockAsm12B: - MOVQ SI, CX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B - -memmove_long_match_emit_encodeBetterBlockAsm12B: - LEAQ (CX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ CX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(CX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(CX)(R14*1) - MOVOA X5, -16(CX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ SI, CX - -emit_literal_done_match_emit_encodeBetterBlockAsm12B: - ADDL R12, 
DX - ADDL $0x04, R12 - MOVL DX, 12(SP) - - // emitCopy - CMPL R12, $0x40 - JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B - CMPL R8, $0x00000800 - JAE long_offset_short_match_nolit_encodeBetterBlockAsm12B - MOVL $0x00000001, SI - LEAL 16(SI), SI - MOVB R8, 1(CX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, SI - MOVB SI, (CX) - ADDQ $0x02, CX - SUBL $0x08, R12 - - // emitRepeat - LEAL -4(R12), R12 - JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b - CMPL R8, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: - CMPL R12, $0x00000104 - JB repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b - LEAL -256(R12), R12 - MOVW $0x0019, (CX) - MOVW R12, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: - LEAL -4(R12), R12 - MOVW $0x0015, (CX) - MOVB R12, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(CX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -long_offset_short_match_nolit_encodeBetterBlockAsm12B: - MOVB $0xee, (CX) - MOVW R8, 1(CX) - LEAL -60(R12), R12 - ADDQ $0x03, CX - - // emitRepeat - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, 
$0x08 - JBE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short - CMPL R8, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: - CMPL R12, $0x00000104 - JB repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short - LEAL -256(R12), R12 - MOVW $0x0019, (CX) - MOVW R12, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: - LEAL -4(R12), R12 - MOVW $0x0015, (CX) - MOVB R12, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(CX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B: - MOVL R12, SI - SHLL $0x02, SI - CMPL R12, $0x0c - JAE emit_copy_three_match_nolit_encodeBetterBlockAsm12B - CMPL R8, $0x00000800 - JAE emit_copy_three_match_nolit_encodeBetterBlockAsm12B - LEAL -15(SI), SI - MOVB R8, 1(CX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -emit_copy_three_match_nolit_encodeBetterBlockAsm12B: - LEAL -2(SI), SI - MOVB SI, (CX) - MOVW R8, 1(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -match_is_repeat_encodeBetterBlockAsm12B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B - MOVL DI, R9 - MOVL 
DI, 12(SP) - LEAQ (BX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JB one_byte_match_emit_repeat_encodeBetterBlockAsm12B - CMPL SI, $0x00000100 - JB two_bytes_match_emit_repeat_encodeBetterBlockAsm12B - JB three_bytes_match_emit_repeat_encodeBetterBlockAsm12B - -three_bytes_match_emit_repeat_encodeBetterBlockAsm12B: - MOVB $0xf4, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B - -two_bytes_match_emit_repeat_encodeBetterBlockAsm12B: - MOVB $0xf0, (CX) - MOVB SI, 1(CX) - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_match_emit_repeat_encodeBetterBlockAsm12B - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B - -one_byte_match_emit_repeat_encodeBetterBlockAsm12B: - SHLB $0x02, SI - MOVB SI, (CX) - ADDQ $0x01, CX - -memmove_match_emit_repeat_encodeBetterBlockAsm12B: - LEAQ (CX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (CX) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (CX) - MOVL R10, -4(CX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (CX) - MOVQ R10, -8(CX)(R9*1) - JMP 
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B: - MOVQ SI, CX - JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B - -memmove_long_match_emit_repeat_encodeBetterBlockAsm12B: - LEAQ (CX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ CX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(CX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(CX)(R14*1) - MOVOA X5, -16(CX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ SI, CX - -emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B: - ADDL 
R12, DX - ADDL $0x04, R12 - MOVL DX, 12(SP) - - // emitRepeat - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B - CMPL R8, $0x00000800 - JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B - -cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: - CMPL R12, $0x00000104 - JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B - LEAL -256(R12), R12 - MOVW $0x0019, (CX) - MOVW R12, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B: - LEAL -4(R12), R12 - MOVW $0x0015, (CX) - MOVB R12, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(CX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (CX) - ADDQ $0x02, CX - -match_nolit_emitcopy_end_encodeBetterBlockAsm12B: - CMPL DX, 8(SP) - JAE emit_remainder_encodeBetterBlockAsm12B - CMPQ CX, (SP) - JB match_nolit_dst_ok_encodeBetterBlockAsm12B - MOVQ $0x00000000, ret+56(FP) - RET - -match_nolit_dst_ok_encodeBetterBlockAsm12B: - MOVQ $0x0000cf1bbcdcbf9b, SI - MOVQ $0x9e3779b1, R8 - LEAQ 1(DI), DI - LEAQ -2(DX), R9 - MOVQ (BX)(DI*1), R10 - MOVQ 1(BX)(DI*1), R11 - MOVQ (BX)(R9*1), R12 - MOVQ 1(BX)(R9*1), R13 - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x32, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x34, R11 - SHLQ $0x10, R12 - IMULQ SI, R12 - SHRQ $0x32, R12 - SHLQ $0x20, R13 - IMULQ R8, R13 - SHRQ $0x34, R13 - LEAQ 1(DI), R8 - LEAQ 1(R9), R14 - MOVL DI, (AX)(R10*4) - MOVL R9, (AX)(R12*4) - MOVL R8, 65536(AX)(R11*4) - MOVL R14, 
65536(AX)(R13*4) - LEAQ 1(R9)(DI*1), R8 - SHRQ $0x01, R8 - ADDQ $0x01, DI - SUBQ $0x01, R9 - -index_loop_encodeBetterBlockAsm12B: - CMPQ R8, R9 - JAE search_loop_encodeBetterBlockAsm12B - MOVQ (BX)(DI*1), R10 - MOVQ (BX)(R8*1), R11 - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x32, R10 - SHLQ $0x10, R11 - IMULQ SI, R11 - SHRQ $0x32, R11 - MOVL DI, (AX)(R10*4) - MOVL R8, (AX)(R11*4) - ADDQ $0x02, DI - ADDQ $0x02, R8 - JMP index_loop_encodeBetterBlockAsm12B - -emit_remainder_encodeBetterBlockAsm12B: - MOVQ src_len+32(FP), AX - SUBL 12(SP), AX - LEAQ 3(CX)(AX*1), AX - CMPQ AX, (SP) - JB emit_remainder_ok_encodeBetterBlockAsm12B - MOVQ $0x00000000, ret+56(FP) - RET - -emit_remainder_ok_encodeBetterBlockAsm12B: - MOVQ src_len+32(FP), AX - MOVL 12(SP), DX - CMPL DX, AX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B - MOVL AX, SI - MOVL AX, 12(SP) - LEAQ (BX)(DX*1), AX - SUBL DX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeBetterBlockAsm12B - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeBetterBlockAsm12B - JB three_bytes_emit_remainder_encodeBetterBlockAsm12B - -three_bytes_emit_remainder_encodeBetterBlockAsm12B: - MOVB $0xf4, (CX) - MOVW DX, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B - -two_bytes_emit_remainder_encodeBetterBlockAsm12B: - MOVB $0xf0, (CX) - MOVB DL, 1(CX) - ADDQ $0x02, CX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeBetterBlockAsm12B - JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B - -one_byte_emit_remainder_encodeBetterBlockAsm12B: - SHLB $0x02, DL - MOVB DL, (CX) - ADDQ $0x01, CX - -memmove_emit_remainder_encodeBetterBlockAsm12B: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3 - CMPQ BX, $0x08 - JB 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2: - MOVB (AX), SI - MOVB -1(AX)(BX*1), AL - MOVB SI, (CX) - MOVB AL, -1(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3: - MOVW (AX), SI - MOVB 2(AX), AL - MOVW SI, (CX) - MOVB AL, 2(CX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7: - MOVL (AX), SI - MOVL -4(AX)(BX*1), AX - MOVL SI, (CX) - MOVL AX, -4(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16: - MOVQ (AX), SI - MOVQ -8(AX)(BX*1), AX - MOVQ SI, (CX) - MOVQ AX, -8(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32: - MOVOU (AX), X0 - MOVOU -16(AX)(BX*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64: - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B: - MOVQ DX, CX - JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B - -memmove_long_emit_remainder_encodeBetterBlockAsm12B: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU 
(AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(AX)(R8*1), X4 - MOVOU -16(AX)(R8*1), X5 - MOVOA X4, -32(CX)(R8*1) - MOVOA X5, -16(CX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - MOVQ DX, CX - -emit_literal_done_emit_remainder_encodeBetterBlockAsm12B: - MOVQ dst_base+0(FP), AX - SUBQ AX, CX - MOVQ CX, ret+56(FP) - RET - -// func encodeBetterBlockAsm10B(dst []byte, src []byte, tmp *[20480]byte) int -// Requires: BMI, SSE2 -// encodeBetterBlockAsm10B encodes src into dst and stores in ret the number of bytes written (CX minus dst_base), or 0 when the output would reach the limit kept in (SP). -// Registers: AX = tmp, BX = src_base, CX = dst write cursor, DX = current source offset. -// Stack: (SP) = dst limit pointer, 8(SP) = src_len minus 8, 12(SP) = source offset of the first byte not yet emitted, 16(SP) = offset of the previous match, 20(SP) = next search position. -// tmp holds two tables of 32-bit source offsets: 4096 entries at tmp+0 indexed by a 12-bit hash of 6 source bytes, and 1024 entries at tmp+16384 indexed by a 10-bit hash of 4 source bytes. -// The search loop probes both tables and accepts a candidate whose next 8 bytes equal the current 8 bytes, else one whose next 4 bytes do (a 4-byte hit from the second table first tries the first table at DX+1); on a miss DX is reloaded from 20(SP). -// A match is extended backwards, measured by matchLen, and written as pending literals plus a copy or repeat element. emit_remainder writes src[12(SP):src_len] as a final literal run. -TEXT ·encodeBetterBlockAsm10B(SB), $24-64 - MOVQ tmp+48(FP), AX - MOVQ dst_base+0(FP), CX - MOVQ $0x000000a0, DX - MOVQ AX, BX - PXOR X0, X0 - -zero_loop_encodeBetterBlockAsm10B: - // Each pass stores 8 x 16 zero bytes; 0xa0 passes clear all 20480 bytes of tmp - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeBetterBlockAsm10B - // Set up the stack slots: 12(SP) = 0, 8(SP) = src_len minus 8, (SP) = dst_base + (src_len minus 6 minus src_len/32), 16(SP) = 0; DX starts at 1 and BX becomes src_base - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), DX - LEAQ -6(DX), BX - LEAQ -8(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), BX - -search_loop_encodeBetterBlockAsm10B: - // SI = DX + 1 + (DX minus 12(SP))/32 is the next position to try and is saved in 20(SP); the search ends at emit_remainder once it reaches 8(SP) - MOVL DX, SI - 
SUBL 12(SP), SI - SHRL $0x05, SI - LEAL 1(DX)(SI*1), SI - CMPL SI, 8(SP) - JAE emit_remainder_encodeBetterBlockAsm10B - MOVQ (BX)(DX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x36, R11 - MOVL (AX)(R10*4), SI - MOVL 16384(AX)(R11*4), R8 - MOVL DX, (AX)(R10*4) - MOVL DX, 16384(AX)(R11*4) - MOVQ (BX)(SI*1), R10 - MOVQ (BX)(R8*1), R11 - CMPQ R10, DI - JEQ candidate_match_encodeBetterBlockAsm10B - CMPQ R11, DI - JNE no_short_found_encodeBetterBlockAsm10B - MOVL R8, SI - JMP candidate_match_encodeBetterBlockAsm10B - -no_short_found_encodeBetterBlockAsm10B: - CMPL R10, DI - JEQ candidate_match_encodeBetterBlockAsm10B - CMPL R11, DI - JEQ candidateS_match_encodeBetterBlockAsm10B - MOVL 20(SP), DX - JMP search_loop_encodeBetterBlockAsm10B - -candidateS_match_encodeBetterBlockAsm10B: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - MOVL (AX)(R10*4), SI - INCL DX - MOVL DX, (AX)(R10*4) - CMPL (BX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm10B - DECL DX - MOVL R8, SI - -candidate_match_encodeBetterBlockAsm10B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBetterBlockAsm10B - -match_extend_back_loop_encodeBetterBlockAsm10B: - CMPL DX, DI - JBE match_extend_back_end_encodeBetterBlockAsm10B - MOVB -1(BX)(SI*1), R8 - MOVB -1(BX)(DX*1), R9 - CMPB R8, R9 - JNE match_extend_back_end_encodeBetterBlockAsm10B - LEAL -1(DX), DX - DECL SI - JZ match_extend_back_end_encodeBetterBlockAsm10B - JMP match_extend_back_loop_encodeBetterBlockAsm10B - -match_extend_back_end_encodeBetterBlockAsm10B: - MOVL DX, DI - SUBL 12(SP), DI - LEAQ 3(CX)(DI*1), DI - CMPQ DI, (SP) - JB match_dst_size_check_encodeBetterBlockAsm10B - MOVQ $0x00000000, ret+56(FP) - RET - -match_dst_size_check_encodeBetterBlockAsm10B: - MOVL DX, DI - ADDL $0x04, DX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 
- SUBL DX, R8 - LEAQ (BX)(DX*1), R9 - LEAQ (BX)(SI*1), R10 - - // matchLen - // R9 and R10 point 4 bytes past the current position and the candidate; R12 counts further equal bytes and R8 is the number of source bytes left at R9 - XORL R12, R12 - -matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B: - CMPL R8, $0x10 - JB matchlen_match8_match_nolit_encodeBetterBlockAsm10B - MOVQ (R9)(R12*1), R11 - MOVQ 8(R9)(R12*1), R13 - XORQ (R10)(R12*1), R11 - JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B - XORQ 8(R10)(R12*1), R13 - JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B - LEAL -16(R8), R8 - LEAL 16(R12), R12 - JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B - -matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B: -#ifdef GOAMD64_v3 - TZCNTQ R13, R13 - -#else - BSFQ R13, R13 - -#endif - // Trailing zero bits of the XOR divided by 8 = number of equal bytes in this 8-byte word - SARQ $0x03, R13 - LEAL 8(R12)(R13*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm10B - -matchlen_match8_match_nolit_encodeBetterBlockAsm10B: - CMPL R8, $0x08 - JB matchlen_match4_match_nolit_encodeBetterBlockAsm10B - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B - LEAL -8(R8), R8 - LEAL 8(R12), R12 - JMP matchlen_match4_match_nolit_encodeBetterBlockAsm10B - -matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm10B - -matchlen_match4_match_nolit_encodeBetterBlockAsm10B: - CMPL R8, $0x04 - JB matchlen_match2_match_nolit_encodeBetterBlockAsm10B - MOVL (R9)(R12*1), R11 - CMPL (R10)(R12*1), R11 - JNE matchlen_match2_match_nolit_encodeBetterBlockAsm10B - LEAL -4(R8), R8 - LEAL 4(R12), R12 - -matchlen_match2_match_nolit_encodeBetterBlockAsm10B: - CMPL R8, $0x01 - JE matchlen_match1_match_nolit_encodeBetterBlockAsm10B - JB match_nolit_end_encodeBetterBlockAsm10B - MOVW (R9)(R12*1), R11 - CMPW (R10)(R12*1), R11 - JNE matchlen_match1_match_nolit_encodeBetterBlockAsm10B - LEAL 2(R12), R12 - SUBL $0x02, R8 - JZ match_nolit_end_encodeBetterBlockAsm10B - -matchlen_match1_match_nolit_encodeBetterBlockAsm10B: 
- MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeBetterBlockAsm10B - LEAL 1(R12), R12 - -match_nolit_end_encodeBetterBlockAsm10B: - MOVL DX, R8 - SUBL SI, R8 - - // Check if repeat - // R8 = DX minus SI is the match offset; when it equals the previous offset in 16(SP) the match_is_repeat path emits the literals followed by a repeat element - CMPL 16(SP), R8 - JEQ match_is_repeat_encodeBetterBlockAsm10B - MOVL R8, 16(SP) - // Emit src[12(SP):DI] as literals (skipped when empty): R9 = byte count, SI = R9 minus 1 selects a header of one, two or three bytes - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JB one_byte_match_emit_encodeBetterBlockAsm10B - CMPL SI, $0x00000100 - JB two_bytes_match_emit_encodeBetterBlockAsm10B - JB three_bytes_match_emit_encodeBetterBlockAsm10B - // This JB tests the same flags as the JB before it and is never taken; control falls through to the label below - -three_bytes_match_emit_encodeBetterBlockAsm10B: - MOVB $0xf4, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_encodeBetterBlockAsm10B - -two_bytes_match_emit_encodeBetterBlockAsm10B: - MOVB $0xf0, (CX) - MOVB SI, 1(CX) - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_match_emit_encodeBetterBlockAsm10B - JMP memmove_long_match_emit_encodeBetterBlockAsm10B - -one_byte_match_emit_encodeBetterBlockAsm10B: - SHLB $0x02, SI - MOVB SI, (CX) - ADDQ $0x01, CX - -memmove_match_emit_encodeBetterBlockAsm10B: - LEAQ (CX)(R9*1), SI - - // genMemMoveShort - // Copies R9 bytes (at most 64 on this path) from R10 to CX; 4 or fewer are copied as one 4-byte move, larger sizes as overlapping head and tail moves - CMPQ R9, $0x04 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (CX) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, 
(CX) - MOVL R10, -4(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (CX) - MOVQ R10, -8(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_match_emit_encodeBetterBlockAsm10B: - MOVQ SI, CX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B - -memmove_long_match_emit_encodeBetterBlockAsm10B: - LEAQ (CX)(R9*1), SI - - // genMemMoveLong - // Copies R9 bytes from R10 to CX: the first and last 32 bytes are loaded into X0..X3 now and stored last, while the loops fill the middle in 32-byte steps whose MOVOA stores land on 32-byte aligned addresses - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ CX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(CX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(CX)(R14*1) - MOVOA X5, -16(CX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE 
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ SI, CX - -emit_literal_done_match_emit_encodeBetterBlockAsm10B: - ADDL R12, DX - ADDL $0x04, R12 - MOVL DX, 12(SP) - - // emitCopy - // DX now points just past the match, R12 holds the full match length and R8 the offset - CMPL R12, $0x40 - JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B - CMPL R8, $0x00000800 - JAE long_offset_short_match_nolit_encodeBetterBlockAsm10B - // Length above 64 with offset below 2048: a 2-byte element is written and 8 is subtracted from R12 before the rest goes out as a repeat - MOVL $0x00000001, SI - LEAL 16(SI), SI - MOVB R8, 1(CX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, SI - MOVB SI, (CX) - ADDQ $0x02, CX - SUBL $0x08, R12 - - // emitRepeat - LEAL -4(R12), R12 - JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b - // The unlabeled instructions from here to the next label follow an unconditional JMP and are never executed - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b - CMPL R8, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: - CMPL R12, $0x00000104 - JB repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b - LEAL -256(R12), R12 - MOVW $0x0019, (CX) - MOVW R12, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: - LEAL -4(R12), R12 - MOVW $0x0015, (CX) - MOVB R12, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(CX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (CX) - ADDQ $0x02, 
CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -long_offset_short_match_nolit_encodeBetterBlockAsm10B: - MOVB $0xee, (CX) - MOVW R8, 1(CX) - LEAL -60(R12), R12 - ADDQ $0x03, CX - - // emitRepeat - // A 3-byte element was written above and 60 subtracted from R12; the remaining length is encoded by the repeat forms below - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short - CMPL R8, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: - CMPL R12, $0x00000104 - JB repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short - LEAL -256(R12), R12 - MOVW $0x0019, (CX) - MOVW R12, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: - LEAL -4(R12), R12 - MOVW $0x0015, (CX) - MOVB R12, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(CX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B: - // Reached when R12 is 64 or less: the 2-byte element is used when R12 is below 12 and the offset R8 below 2048, else the 3-byte element - MOVL R12, SI - SHLL $0x02, SI - CMPL R12, $0x0c - JAE emit_copy_three_match_nolit_encodeBetterBlockAsm10B - CMPL R8, $0x00000800 - JAE emit_copy_three_match_nolit_encodeBetterBlockAsm10B - LEAL -15(SI), SI - MOVB R8, 1(CX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -emit_copy_three_match_nolit_encodeBetterBlockAsm10B: - LEAL -2(SI), SI - MOVB SI, 
(CX) - MOVW R8, 1(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -match_is_repeat_encodeBetterBlockAsm10B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JB one_byte_match_emit_repeat_encodeBetterBlockAsm10B - CMPL SI, $0x00000100 - JB two_bytes_match_emit_repeat_encodeBetterBlockAsm10B - JB three_bytes_match_emit_repeat_encodeBetterBlockAsm10B - -three_bytes_match_emit_repeat_encodeBetterBlockAsm10B: - MOVB $0xf4, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B - -two_bytes_match_emit_repeat_encodeBetterBlockAsm10B: - MOVB $0xf0, (CX) - MOVB SI, 1(CX) - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_match_emit_repeat_encodeBetterBlockAsm10B - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B - -one_byte_match_emit_repeat_encodeBetterBlockAsm10B: - SHLB $0x02, SI - MOVB SI, (CX) - ADDQ $0x01, CX - -memmove_match_emit_repeat_encodeBetterBlockAsm10B: - LEAQ (CX)(R9*1), SI - - // genMemMoveShort - // Copies R9 bytes (at most 64 on this path) from R10 to CX - CMPQ R9, $0x04 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (CX) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (CX) - MOVL R10, 
-4(CX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (CX) - MOVQ R10, -8(CX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B: - MOVQ SI, CX - JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B - -memmove_long_match_emit_repeat_encodeBetterBlockAsm10B: - LEAQ (CX)(R9*1), SI - - // genMemMoveLong - // Long copy of R9 bytes from R10 to CX; X0..X3 keep the first and last 32 bytes and are stored after the loops - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ CX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(CX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(CX)(R14*1) - MOVOA X5, -16(CX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE 
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ SI, CX - -emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B: - ADDL R12, DX - ADDL $0x04, R12 - MOVL DX, 12(SP) - - // emitRepeat - // SI keeps the full length while R12 becomes length minus 4. A 2-byte element is written when SI is 8 or less, or when SI is below 12 and the offset R8 below 2048; otherwise 3 bytes while R12 is below 0x104, else 4 bytes - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B - CMPL R8, $0x00000800 - JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B - -cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B: - CMPL R12, $0x00000104 - JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B - LEAL -256(R12), R12 - MOVW $0x0019, (CX) - MOVW R12, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B: - LEAL -4(R12), R12 - MOVW $0x0015, (CX) - MOVB R12, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B: - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(CX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (CX) - ADDQ $0x02, CX - -match_nolit_emitcopy_end_encodeBetterBlockAsm10B: - // Go to emit_remainder once DX has reached 8(SP); return 0 when CX has reached the dst limit in (SP) - CMPL DX, 8(SP) - JAE emit_remainder_encodeBetterBlockAsm10B - CMPQ CX, (SP) - JB match_nolit_dst_ok_encodeBetterBlockAsm10B - MOVQ $0x00000000, ret+56(FP) - RET - -match_nolit_dst_ok_encodeBetterBlockAsm10B: - // Index the matched region, with DI and DX as they are at this label: DI+1 and DX minus 2 go into the first table and the positions one byte after each into the second table; index_loop then stores two cursors (DI and the midpoint R8, each stepping by 2) into the first table until R8 reaches R9, and the search resumes - MOVQ $0x0000cf1bbcdcbf9b, SI - MOVQ $0x9e3779b1, R8 - LEAQ 1(DI), DI - LEAQ -2(DX), R9 - MOVQ (BX)(DI*1), R10 - MOVQ 1(BX)(DI*1), R11 - MOVQ (BX)(R9*1), R12 - MOVQ 1(BX)(R9*1), R13 - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x34, R10 - SHLQ $0x20, R11 - 
IMULQ R8, R11 - SHRQ $0x36, R11 - SHLQ $0x10, R12 - IMULQ SI, R12 - SHRQ $0x34, R12 - SHLQ $0x20, R13 - IMULQ R8, R13 - SHRQ $0x36, R13 - LEAQ 1(DI), R8 - LEAQ 1(R9), R14 - MOVL DI, (AX)(R10*4) - MOVL R9, (AX)(R12*4) - MOVL R8, 16384(AX)(R11*4) - MOVL R14, 16384(AX)(R13*4) - LEAQ 1(R9)(DI*1), R8 - SHRQ $0x01, R8 - ADDQ $0x01, DI - SUBQ $0x01, R9 - -index_loop_encodeBetterBlockAsm10B: - CMPQ R8, R9 - JAE search_loop_encodeBetterBlockAsm10B - MOVQ (BX)(DI*1), R10 - MOVQ (BX)(R8*1), R11 - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x34, R10 - SHLQ $0x10, R11 - IMULQ SI, R11 - SHRQ $0x34, R11 - MOVL DI, (AX)(R10*4) - MOVL R8, (AX)(R11*4) - ADDQ $0x02, DI - ADDQ $0x02, R8 - JMP index_loop_encodeBetterBlockAsm10B - -emit_remainder_encodeBetterBlockAsm10B: - MOVQ src_len+32(FP), AX - SUBL 12(SP), AX - LEAQ 3(CX)(AX*1), AX - CMPQ AX, (SP) - JB emit_remainder_ok_encodeBetterBlockAsm10B - MOVQ $0x00000000, ret+56(FP) - RET - -emit_remainder_ok_encodeBetterBlockAsm10B: - MOVQ src_len+32(FP), AX - MOVL 12(SP), DX - CMPL DX, AX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B - MOVL AX, SI - MOVL AX, 12(SP) - LEAQ (BX)(DX*1), AX - SUBL DX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeBetterBlockAsm10B - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeBetterBlockAsm10B - JB three_bytes_emit_remainder_encodeBetterBlockAsm10B - -three_bytes_emit_remainder_encodeBetterBlockAsm10B: - MOVB $0xf4, (CX) - MOVW DX, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B - -two_bytes_emit_remainder_encodeBetterBlockAsm10B: - MOVB $0xf0, (CX) - MOVB DL, 1(CX) - ADDQ $0x02, CX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeBetterBlockAsm10B - JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B - -one_byte_emit_remainder_encodeBetterBlockAsm10B: - SHLB $0x02, DL - MOVB DL, (CX) - ADDQ $0x01, CX - -memmove_emit_remainder_encodeBetterBlockAsm10B: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - // Copies BX bytes (1 to 64 on this path) from AX to CX - CMPQ 
BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2: - MOVB (AX), SI - MOVB -1(AX)(BX*1), AL - MOVB SI, (CX) - MOVB AL, -1(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3: - MOVW (AX), SI - MOVB 2(AX), AL - MOVW SI, (CX) - MOVB AL, 2(CX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7: - MOVL (AX), SI - MOVL -4(AX)(BX*1), AX - MOVL SI, (CX) - MOVL AX, -4(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16: - MOVQ (AX), SI - MOVQ -8(AX)(BX*1), AX - MOVQ SI, (CX) - MOVQ AX, -8(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32: - MOVOU (AX), X0 - MOVOU -16(AX)(BX*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64: - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B: - MOVQ DX, 
CX - JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B - -memmove_long_emit_remainder_encodeBetterBlockAsm10B: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - // Long copy of BX bytes from AX to CX; X0..X3 keep the first and last 32 bytes and are stored after the loops - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(AX)(R8*1), X4 - MOVOU -16(AX)(R8*1), X5 - MOVOA X4, -32(CX)(R8*1) - MOVOA X5, -16(CX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - MOVQ DX, CX - -emit_literal_done_emit_remainder_encodeBetterBlockAsm10B: - // ret = CX minus dst_base, the number of bytes written to dst - MOVQ dst_base+0(FP), AX - SUBQ AX, CX - MOVQ CX, ret+56(FP) - RET - -// func encodeBetterBlockAsm8B(dst []byte, src []byte, tmp *[5120]byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBetterBlockAsm8B(SB), $24-64 - MOVQ tmp+48(FP), AX - MOVQ dst_base+0(FP), CX - MOVQ $0x00000028, DX - MOVQ AX, BX - PXOR X0, X0 - -zero_loop_encodeBetterBlockAsm8B: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeBetterBlockAsm8B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), DX - LEAQ -6(DX), BX - LEAQ -8(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - 
SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), BX - -search_loop_encodeBetterBlockAsm8B: - MOVL DX, SI - SUBL 12(SP), SI - SHRL $0x04, SI - LEAL 1(DX)(SI*1), SI - CMPL SI, 8(SP) - JAE emit_remainder_encodeBetterBlockAsm8B - MOVQ (BX)(DX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x36, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x38, R11 - MOVL (AX)(R10*4), SI - MOVL 4096(AX)(R11*4), R8 - MOVL DX, (AX)(R10*4) - MOVL DX, 4096(AX)(R11*4) - MOVQ (BX)(SI*1), R10 - MOVQ (BX)(R8*1), R11 - CMPQ R10, DI - JEQ candidate_match_encodeBetterBlockAsm8B - CMPQ R11, DI - JNE no_short_found_encodeBetterBlockAsm8B - MOVL R8, SI - JMP candidate_match_encodeBetterBlockAsm8B - -no_short_found_encodeBetterBlockAsm8B: - CMPL R10, DI - JEQ candidate_match_encodeBetterBlockAsm8B - CMPL R11, DI - JEQ candidateS_match_encodeBetterBlockAsm8B - MOVL 20(SP), DX - JMP search_loop_encodeBetterBlockAsm8B - -candidateS_match_encodeBetterBlockAsm8B: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x36, R10 - MOVL (AX)(R10*4), SI - INCL DX - MOVL DX, (AX)(R10*4) - CMPL (BX)(SI*1), DI - JEQ candidate_match_encodeBetterBlockAsm8B - DECL DX - MOVL R8, SI - -candidate_match_encodeBetterBlockAsm8B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeBetterBlockAsm8B - -match_extend_back_loop_encodeBetterBlockAsm8B: - CMPL DX, DI - JBE match_extend_back_end_encodeBetterBlockAsm8B - MOVB -1(BX)(SI*1), R8 - MOVB -1(BX)(DX*1), R9 - CMPB R8, R9 - JNE match_extend_back_end_encodeBetterBlockAsm8B - LEAL -1(DX), DX - DECL SI - JZ match_extend_back_end_encodeBetterBlockAsm8B - JMP match_extend_back_loop_encodeBetterBlockAsm8B - -match_extend_back_end_encodeBetterBlockAsm8B: - MOVL DX, DI - SUBL 12(SP), DI - LEAQ 3(CX)(DI*1), DI - CMPQ DI, (SP) - JB 
match_dst_size_check_encodeBetterBlockAsm8B - MOVQ $0x00000000, ret+56(FP) - RET - -match_dst_size_check_encodeBetterBlockAsm8B: - MOVL DX, DI - ADDL $0x04, DX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL DX, R8 - LEAQ (BX)(DX*1), R9 - LEAQ (BX)(SI*1), R10 - - // matchLen - XORL R12, R12 - -matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B: - CMPL R8, $0x10 - JB matchlen_match8_match_nolit_encodeBetterBlockAsm8B - MOVQ (R9)(R12*1), R11 - MOVQ 8(R9)(R12*1), R13 - XORQ (R10)(R12*1), R11 - JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B - XORQ 8(R10)(R12*1), R13 - JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B - LEAL -16(R8), R8 - LEAL 16(R12), R12 - JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B - -matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B: -#ifdef GOAMD64_v3 - TZCNTQ R13, R13 - -#else - BSFQ R13, R13 - -#endif - SARQ $0x03, R13 - LEAL 8(R12)(R13*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm8B - -matchlen_match8_match_nolit_encodeBetterBlockAsm8B: - CMPL R8, $0x08 - JB matchlen_match4_match_nolit_encodeBetterBlockAsm8B - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B - LEAL -8(R8), R8 - LEAL 8(R12), R12 - JMP matchlen_match4_match_nolit_encodeBetterBlockAsm8B - -matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeBetterBlockAsm8B - -matchlen_match4_match_nolit_encodeBetterBlockAsm8B: - CMPL R8, $0x04 - JB matchlen_match2_match_nolit_encodeBetterBlockAsm8B - MOVL (R9)(R12*1), R11 - CMPL (R10)(R12*1), R11 - JNE matchlen_match2_match_nolit_encodeBetterBlockAsm8B - LEAL -4(R8), R8 - LEAL 4(R12), R12 - -matchlen_match2_match_nolit_encodeBetterBlockAsm8B: - CMPL R8, $0x01 - JE matchlen_match1_match_nolit_encodeBetterBlockAsm8B - JB match_nolit_end_encodeBetterBlockAsm8B - MOVW (R9)(R12*1), R11 - CMPW (R10)(R12*1), R11 - JNE 
matchlen_match1_match_nolit_encodeBetterBlockAsm8B - LEAL 2(R12), R12 - SUBL $0x02, R8 - JZ match_nolit_end_encodeBetterBlockAsm8B - -matchlen_match1_match_nolit_encodeBetterBlockAsm8B: - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeBetterBlockAsm8B - LEAL 1(R12), R12 - -match_nolit_end_encodeBetterBlockAsm8B: - MOVL DX, R8 - SUBL SI, R8 - - // Check if repeat - CMPL 16(SP), R8 - JEQ match_is_repeat_encodeBetterBlockAsm8B - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JB one_byte_match_emit_encodeBetterBlockAsm8B - CMPL SI, $0x00000100 - JB two_bytes_match_emit_encodeBetterBlockAsm8B - JB three_bytes_match_emit_encodeBetterBlockAsm8B - -three_bytes_match_emit_encodeBetterBlockAsm8B: - MOVB $0xf4, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_encodeBetterBlockAsm8B - -two_bytes_match_emit_encodeBetterBlockAsm8B: - MOVB $0xf0, (CX) - MOVB SI, 1(CX) - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_match_emit_encodeBetterBlockAsm8B - JMP memmove_long_match_emit_encodeBetterBlockAsm8B - -one_byte_match_emit_encodeBetterBlockAsm8B: - SHLB $0x02, SI - MOVB SI, (CX) - ADDQ $0x01, CX - -memmove_match_emit_encodeBetterBlockAsm8B: - LEAQ (CX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x04 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4 - CMPQ R9, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4: - MOVL (R10), R11 - MOVL R11, (CX) - JMP 
memmove_end_copy_match_emit_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7: - MOVL (R10), R11 - MOVL -4(R10)(R9*1), R10 - MOVL R11, (CX) - MOVL R10, -4(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (CX) - MOVQ R10, -8(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_match_emit_encodeBetterBlockAsm8B: - MOVQ SI, CX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B - -memmove_long_match_emit_encodeBetterBlockAsm8B: - LEAQ (CX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ CX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(CX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 
- MOVOA X4, -32(CX)(R14*1) - MOVOA X5, -16(CX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ SI, CX - -emit_literal_done_match_emit_encodeBetterBlockAsm8B: - ADDL R12, DX - ADDL $0x04, R12 - MOVL DX, 12(SP) - - // emitCopy - CMPL R12, $0x40 - JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B - CMPL R8, $0x00000800 - JAE long_offset_short_match_nolit_encodeBetterBlockAsm8B - MOVL $0x00000001, SI - LEAL 16(SI), SI - MOVB R8, 1(CX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, SI - MOVB SI, (CX) - ADDQ $0x02, CX - SUBL $0x08, R12 - - // emitRepeat - LEAL -4(R12), R12 - JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: - CMPL R12, $0x00000104 - JB repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b - LEAL -256(R12), R12 - MOVW $0x0019, (CX) - MOVW R12, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: - LEAL -4(R12), R12 - MOVW $0x0015, (CX) - MOVB R12, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(CX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - 
-long_offset_short_match_nolit_encodeBetterBlockAsm8B: - MOVB $0xee, (CX) - MOVW R8, 1(CX) - LEAL -60(R12), R12 - ADDQ $0x03, CX - - // emitRepeat - MOVL R12, SI - LEAL -4(R12), R12 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: - CMPL R12, $0x00000104 - JB repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short - LEAL -256(R12), R12 - MOVW $0x0019, (CX) - MOVW R12, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: - LEAL -4(R12), R12 - MOVW $0x0015, (CX) - MOVB R12, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(CX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B: - MOVL R12, SI - SHLL $0x02, SI - CMPL R12, $0x0c - JAE emit_copy_three_match_nolit_encodeBetterBlockAsm8B - LEAL -15(SI), SI - MOVB R8, 1(CX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -emit_copy_three_match_nolit_encodeBetterBlockAsm8B: - LEAL -2(SI), SI - MOVB SI, (CX) - MOVW R8, 1(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -match_is_repeat_encodeBetterBlockAsm8B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, 
$0x3c - JB one_byte_match_emit_repeat_encodeBetterBlockAsm8B - CMPL SI, $0x00000100 - JB two_bytes_match_emit_repeat_encodeBetterBlockAsm8B - JB three_bytes_match_emit_repeat_encodeBetterBlockAsm8B - -three_bytes_match_emit_repeat_encodeBetterBlockAsm8B: - MOVB $0xf4, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B - -two_bytes_match_emit_repeat_encodeBetterBlockAsm8B: - MOVB $0xf0, (CX) - MOVB SI, 1(CX) - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_match_emit_repeat_encodeBetterBlockAsm8B - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B - -one_byte_match_emit_repeat_encodeBetterBlockAsm8B: - SHLB $0x02, SI - MOVB SI, (CX) - ADDQ $0x01, CX - -memmove_match_emit_repeat_encodeBetterBlockAsm8B: - LEAQ (CX)(R8*1), SI - - // genMemMoveShort - CMPQ R8, $0x04 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4: - MOVL (R9), R10 - MOVL R10, (CX) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (CX) - MOVL R9, -4(CX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (CX) - MOVQ R9, -8(CX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B - 
-emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R8*1) - MOVOU X3, -16(CX)(R8*1) - -memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B: - MOVQ SI, CX - JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B - -memmove_long_match_emit_repeat_encodeBetterBlockAsm8B: - LEAQ (CX)(R8*1), SI - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R11 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R13*1), R10 - LEAQ -32(CX)(R13*1), R14 - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R10 - ADDQ $0x20, R13 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R13*1), X4 - MOVOU -16(R9)(R13*1), X5 - MOVOA X4, -32(CX)(R13*1) - MOVOA X5, -16(CX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 - JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R8*1) - MOVOU X3, -16(CX)(R8*1) - MOVQ SI, CX - -emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B: - ADDL R12, DX - ADDL $0x04, R12 - MOVL DX, 12(SP) - - // emitRepeat - MOVL R12, SI - LEAL 
-4(R12), R12 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B - -cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B: - CMPL R12, $0x00000104 - JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B - LEAL -256(R12), R12 - MOVW $0x0019, (CX) - MOVW R12, 2(CX) - ADDQ $0x04, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B: - LEAL -4(R12), R12 - MOVW $0x0015, (CX) - MOVB R12, 2(CX) - ADDQ $0x03, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B: - SHLL $0x02, R12 - ORL $0x01, R12 - MOVW R12, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - XORQ SI, SI - LEAL 1(SI)(R12*4), R12 - MOVB R8, 1(CX) - SARL $0x08, R8 - SHLL $0x05, R8 - ORL R8, R12 - MOVB R12, (CX) - ADDQ $0x02, CX - -match_nolit_emitcopy_end_encodeBetterBlockAsm8B: - CMPL DX, 8(SP) - JAE emit_remainder_encodeBetterBlockAsm8B - CMPQ CX, (SP) - JB match_nolit_dst_ok_encodeBetterBlockAsm8B - MOVQ $0x00000000, ret+56(FP) - RET - -match_nolit_dst_ok_encodeBetterBlockAsm8B: - MOVQ $0x0000cf1bbcdcbf9b, SI - MOVQ $0x9e3779b1, R8 - LEAQ 1(DI), DI - LEAQ -2(DX), R9 - MOVQ (BX)(DI*1), R10 - MOVQ 1(BX)(DI*1), R11 - MOVQ (BX)(R9*1), R12 - MOVQ 1(BX)(R9*1), R13 - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x36, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x38, R11 - SHLQ $0x10, R12 - IMULQ SI, R12 - SHRQ $0x36, R12 - SHLQ $0x20, R13 - IMULQ R8, R13 - SHRQ $0x38, R13 - LEAQ 1(DI), R8 - LEAQ 1(R9), R14 - MOVL DI, (AX)(R10*4) - MOVL R9, (AX)(R12*4) - MOVL R8, 4096(AX)(R11*4) - MOVL R14, 4096(AX)(R13*4) - LEAQ 1(R9)(DI*1), R8 - SHRQ $0x01, R8 - ADDQ $0x01, DI - SUBQ $0x01, R9 - -index_loop_encodeBetterBlockAsm8B: - CMPQ R8, R9 - JAE search_loop_encodeBetterBlockAsm8B - MOVQ (BX)(DI*1), R10 - MOVQ (BX)(R8*1), R11 - SHLQ $0x10, R10 - IMULQ SI, 
R10 - SHRQ $0x36, R10 - SHLQ $0x10, R11 - IMULQ SI, R11 - SHRQ $0x36, R11 - MOVL DI, (AX)(R10*4) - MOVL R8, (AX)(R11*4) - ADDQ $0x02, DI - ADDQ $0x02, R8 - JMP index_loop_encodeBetterBlockAsm8B - -emit_remainder_encodeBetterBlockAsm8B: - MOVQ src_len+32(FP), AX - SUBL 12(SP), AX - LEAQ 3(CX)(AX*1), AX - CMPQ AX, (SP) - JB emit_remainder_ok_encodeBetterBlockAsm8B - MOVQ $0x00000000, ret+56(FP) - RET - -emit_remainder_ok_encodeBetterBlockAsm8B: - MOVQ src_len+32(FP), AX - MOVL 12(SP), DX - CMPL DX, AX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B - MOVL AX, SI - MOVL AX, 12(SP) - LEAQ (BX)(DX*1), AX - SUBL DX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeBetterBlockAsm8B - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeBetterBlockAsm8B - JB three_bytes_emit_remainder_encodeBetterBlockAsm8B - -three_bytes_emit_remainder_encodeBetterBlockAsm8B: - MOVB $0xf4, (CX) - MOVW DX, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B - -two_bytes_emit_remainder_encodeBetterBlockAsm8B: - MOVB $0xf0, (CX) - MOVB DL, 1(CX) - ADDQ $0x02, CX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeBetterBlockAsm8B - JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B - -one_byte_emit_remainder_encodeBetterBlockAsm8B: - SHLB $0x02, DL - MOVB DL, (CX) - ADDQ $0x01, CX - -memmove_emit_remainder_encodeBetterBlockAsm8B: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32 - JMP 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2: - MOVB (AX), SI - MOVB -1(AX)(BX*1), AL - MOVB SI, (CX) - MOVB AL, -1(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3: - MOVW (AX), SI - MOVB 2(AX), AL - MOVW SI, (CX) - MOVB AL, 2(CX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7: - MOVL (AX), SI - MOVL -4(AX)(BX*1), AX - MOVL SI, (CX) - MOVL AX, -4(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16: - MOVQ (AX), SI - MOVQ -8(AX)(BX*1), AX - MOVQ SI, (CX) - MOVQ AX, -8(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32: - MOVOU (AX), X0 - MOVOU -16(AX)(BX*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64: - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B: - MOVQ DX, CX - JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm8B - -memmove_long_emit_remainder_encodeBetterBlockAsm8B: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), 
SI - LEAQ -32(CX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(AX)(R8*1), X4 - MOVOU -16(AX)(R8*1), X5 - MOVOA X4, -32(CX)(R8*1) - MOVOA X5, -16(CX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - MOVQ DX, CX - -emit_literal_done_emit_remainder_encodeBetterBlockAsm8B: - MOVQ dst_base+0(FP), AX - SUBQ AX, CX - MOVQ CX, ret+56(FP) - RET - -// func encodeSnappyBlockAsm(dst []byte, src []byte, tmp *[65536]byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBlockAsm(SB), $24-64 - MOVQ tmp+48(FP), AX - MOVQ dst_base+0(FP), CX - MOVQ $0x00000200, DX - MOVQ AX, BX - PXOR X0, X0 - -zero_loop_encodeSnappyBlockAsm: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeSnappyBlockAsm - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), DX - LEAQ -9(DX), BX - LEAQ -8(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL DX, 16(SP) - MOVQ src_base+24(FP), BX - -search_loop_encodeSnappyBlockAsm: - MOVL DX, SI - SUBL 12(SP), SI - SHRL $0x06, SI - LEAL 4(DX)(SI*1), SI - CMPL SI, 8(SP) - JAE emit_remainder_encodeSnappyBlockAsm - MOVQ (BX)(DX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - SHLQ $0x10, R11 - IMULQ R9, R11 - SHRQ $0x32, R11 
- MOVL (AX)(R10*4), SI - MOVL (AX)(R11*4), R8 - MOVL DX, (AX)(R10*4) - LEAL 1(DX), R10 - MOVL R10, (AX)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - MOVL DX, R9 - SUBL 16(SP), R9 - MOVL 1(BX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeSnappyBlockAsm - LEAL 1(DX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeSnappyBlockAsm - -repeat_extend_back_loop_encodeSnappyBlockAsm: - CMPL DI, SI - JBE repeat_extend_back_end_encodeSnappyBlockAsm - MOVB -1(BX)(R8*1), R9 - MOVB -1(BX)(DI*1), R10 - CMPB R9, R10 - JNE repeat_extend_back_end_encodeSnappyBlockAsm - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeSnappyBlockAsm - -repeat_extend_back_end_encodeSnappyBlockAsm: - MOVL DI, SI - SUBL 12(SP), SI - LEAQ 5(CX)(SI*1), SI - CMPQ SI, (SP) - JB repeat_dst_size_check_encodeSnappyBlockAsm - MOVQ $0x00000000, ret+56(FP) - RET - -repeat_dst_size_check_encodeSnappyBlockAsm: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JB one_byte_repeat_emit_encodeSnappyBlockAsm - CMPL SI, $0x00000100 - JB two_bytes_repeat_emit_encodeSnappyBlockAsm - CMPL SI, $0x00010000 - JB three_bytes_repeat_emit_encodeSnappyBlockAsm - CMPL SI, $0x01000000 - JB four_bytes_repeat_emit_encodeSnappyBlockAsm - MOVB $0xfc, (CX) - MOVL SI, 1(CX) - ADDQ $0x05, CX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm - -four_bytes_repeat_emit_encodeSnappyBlockAsm: - MOVL SI, R10 - SHRL $0x10, R10 - MOVB $0xf8, (CX) - MOVW SI, 1(CX) - MOVB R10, 3(CX) - ADDQ $0x04, CX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm - -three_bytes_repeat_emit_encodeSnappyBlockAsm: - MOVB $0xf4, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm - -two_bytes_repeat_emit_encodeSnappyBlockAsm: - MOVB $0xf0, (CX) - 
MOVB SI, 1(CX) - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_repeat_emit_encodeSnappyBlockAsm - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm - -one_byte_repeat_emit_encodeSnappyBlockAsm: - SHLB $0x02, SI - MOVB SI, (CX) - ADDQ $0x01, CX - -memmove_repeat_emit_encodeSnappyBlockAsm: - LEAQ (CX)(R8*1), SI - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8: - MOVQ (R9), R10 - MOVQ R10, (CX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (CX) - MOVQ R9, -8(CX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R8*1) - MOVOU X3, -16(CX)(R8*1) - -memmove_end_copy_repeat_emit_encodeSnappyBlockAsm: - MOVQ SI, CX - JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm - -memmove_long_repeat_emit_encodeSnappyBlockAsm: - LEAQ (CX)(R8*1), SI - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA 
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(CX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(CX)(R12*1) - MOVOA X5, -16(CX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R8*1) - MOVOU X3, -16(CX)(R8*1) - MOVQ SI, CX - -emit_literal_done_repeat_emit_encodeSnappyBlockAsm: - ADDL $0x05, DX - MOVL DX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL DX, R8 - LEAQ (BX)(DX*1), R9 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R11, R11 - -matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm: - CMPL R8, $0x10 - JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm - MOVQ (R9)(R11*1), R10 - MOVQ 8(R9)(R11*1), R12 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm - XORQ 8(SI)(R11*1), R12 - JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm - LEAL -16(R8), R8 - LEAL 16(R11), R11 - JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm - -matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm: -#ifdef GOAMD64_v3 - TZCNTQ R12, R12 - -#else - BSFQ R12, R12 - -#endif - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm - -matchlen_match8_repeat_extend_encodeSnappyBlockAsm: - CMPL R8, $0x08 - JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm - LEAL -8(R8), R8 - LEAL 8(R11), 
R11 - JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm - -matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm - -matchlen_match4_repeat_extend_encodeSnappyBlockAsm: - CMPL R8, $0x04 - JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm - MOVL (R9)(R11*1), R10 - CMPL (SI)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm - LEAL -4(R8), R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeSnappyBlockAsm: - CMPL R8, $0x01 - JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm - JB repeat_extend_forward_end_encodeSnappyBlockAsm - MOVW (R9)(R11*1), R10 - CMPW (SI)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm - LEAL 2(R11), R11 - SUBL $0x02, R8 - JZ repeat_extend_forward_end_encodeSnappyBlockAsm - -matchlen_match1_repeat_extend_encodeSnappyBlockAsm: - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeSnappyBlockAsm - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeSnappyBlockAsm: - ADDL R11, DX - MOVL DX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitCopy - CMPL DI, $0x00010000 - JB two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm - -four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm: - CMPL SI, $0x40 - JBE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm - MOVB $0xff, (CX) - MOVL DI, 1(CX) - LEAL -64(SI), SI - ADDQ $0x05, CX - CMPL SI, $0x04 - JB four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm - JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm - -four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm: - TESTL SI, SI - JZ repeat_end_emit_encodeSnappyBlockAsm - XORL R8, R8 - LEAL -1(R8)(SI*4), SI - MOVB SI, (CX) - MOVL DI, 1(CX) - ADDQ $0x05, CX - JMP repeat_end_emit_encodeSnappyBlockAsm - -two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm: - CMPL SI, $0x40 - JBE 
two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm - MOVB $0xee, (CX) - MOVW DI, 1(CX) - LEAL -60(SI), SI - ADDQ $0x03, CX - JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm - -two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm: - MOVL SI, R8 - SHLL $0x02, R8 - CMPL SI, $0x0c - JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm - CMPL DI, $0x00000800 - JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm - LEAL -15(R8), R8 - MOVB DI, 1(CX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, R8 - MOVB R8, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeSnappyBlockAsm - -emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm: - LEAL -2(R8), R8 - MOVB R8, (CX) - MOVW DI, 1(CX) - ADDQ $0x03, CX - -repeat_end_emit_encodeSnappyBlockAsm: - MOVL DX, 12(SP) - JMP search_loop_encodeSnappyBlockAsm - -no_repeat_found_encodeSnappyBlockAsm: - CMPL (BX)(SI*1), DI - JEQ candidate_match_encodeSnappyBlockAsm - SHRQ $0x08, DI - MOVL (AX)(R10*4), SI - LEAL 2(DX), R9 - CMPL (BX)(R8*1), DI - JEQ candidate2_match_encodeSnappyBlockAsm - MOVL R9, (AX)(R10*4) - SHRQ $0x08, DI - CMPL (BX)(SI*1), DI - JEQ candidate3_match_encodeSnappyBlockAsm - MOVL 20(SP), DX - JMP search_loop_encodeSnappyBlockAsm - -candidate3_match_encodeSnappyBlockAsm: - ADDL $0x02, DX - JMP candidate_match_encodeSnappyBlockAsm - -candidate2_match_encodeSnappyBlockAsm: - MOVL R9, (AX)(R10*4) - INCL DX - MOVL R8, SI - -candidate_match_encodeSnappyBlockAsm: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBlockAsm - -match_extend_back_loop_encodeSnappyBlockAsm: - CMPL DX, DI - JBE match_extend_back_end_encodeSnappyBlockAsm - MOVB -1(BX)(SI*1), R8 - MOVB -1(BX)(DX*1), R9 - CMPB R8, R9 - JNE match_extend_back_end_encodeSnappyBlockAsm - LEAL -1(DX), DX - DECL SI - JZ match_extend_back_end_encodeSnappyBlockAsm - JMP match_extend_back_loop_encodeSnappyBlockAsm - -match_extend_back_end_encodeSnappyBlockAsm: - MOVL DX, DI - SUBL 12(SP), DI - LEAQ 5(CX)(DI*1), DI - CMPQ DI, (SP) - JB 
match_dst_size_check_encodeSnappyBlockAsm - MOVQ $0x00000000, ret+56(FP) - RET - -match_dst_size_check_encodeSnappyBlockAsm: - MOVL DX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JB one_byte_match_emit_encodeSnappyBlockAsm - CMPL R8, $0x00000100 - JB two_bytes_match_emit_encodeSnappyBlockAsm - CMPL R8, $0x00010000 - JB three_bytes_match_emit_encodeSnappyBlockAsm - CMPL R8, $0x01000000 - JB four_bytes_match_emit_encodeSnappyBlockAsm - MOVB $0xfc, (CX) - MOVL R8, 1(CX) - ADDQ $0x05, CX - JMP memmove_long_match_emit_encodeSnappyBlockAsm - -four_bytes_match_emit_encodeSnappyBlockAsm: - MOVL R8, R10 - SHRL $0x10, R10 - MOVB $0xf8, (CX) - MOVW R8, 1(CX) - MOVB R10, 3(CX) - ADDQ $0x04, CX - JMP memmove_long_match_emit_encodeSnappyBlockAsm - -three_bytes_match_emit_encodeSnappyBlockAsm: - MOVB $0xf4, (CX) - MOVW R8, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_encodeSnappyBlockAsm - -two_bytes_match_emit_encodeSnappyBlockAsm: - MOVB $0xf0, (CX) - MOVB R8, 1(CX) - ADDQ $0x02, CX - CMPL R8, $0x40 - JB memmove_match_emit_encodeSnappyBlockAsm - JMP memmove_long_match_emit_encodeSnappyBlockAsm - -one_byte_match_emit_encodeSnappyBlockAsm: - SHLB $0x02, R8 - MOVB R8, (CX) - ADDQ $0x01, CX - -memmove_match_emit_encodeSnappyBlockAsm: - LEAQ (CX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (CX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm - 
-emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (CX) - MOVQ DI, -8(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBlockAsm: - MOVQ R8, CX - JMP emit_literal_done_match_emit_encodeSnappyBlockAsm - -memmove_long_match_emit_encodeSnappyBlockAsm: - LEAQ (CX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(CX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(CX)(R12*1) - MOVOA X5, -16(CX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ R8, CX - 
-emit_literal_done_match_emit_encodeSnappyBlockAsm: -match_nolit_loop_encodeSnappyBlockAsm: - MOVL DX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, DX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL DX, DI - LEAQ (BX)(DX*1), R8 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R10, R10 - -matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm: - CMPL DI, $0x10 - JB matchlen_match8_match_nolit_encodeSnappyBlockAsm - MOVQ (R8)(R10*1), R9 - MOVQ 8(R8)(R10*1), R11 - XORQ (SI)(R10*1), R9 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm - XORQ 8(SI)(R10*1), R11 - JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm - LEAL -16(DI), DI - LEAL 16(R10), R10 - JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm - -matchlen_bsf_16match_nolit_encodeSnappyBlockAsm: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL 8(R10)(R11*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm - -matchlen_match8_match_nolit_encodeSnappyBlockAsm: - CMPL DI, $0x08 - JB matchlen_match4_match_nolit_encodeSnappyBlockAsm - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm - LEAL -8(DI), DI - LEAL 8(R10), R10 - JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm - -matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm: -#ifdef GOAMD64_v3 - TZCNTQ R9, R9 - -#else - BSFQ R9, R9 - -#endif - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm - -matchlen_match4_match_nolit_encodeSnappyBlockAsm: - CMPL DI, $0x04 - JB matchlen_match2_match_nolit_encodeSnappyBlockAsm - MOVL (R8)(R10*1), R9 - CMPL (SI)(R10*1), R9 - JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm - LEAL -4(DI), DI - LEAL 4(R10), R10 - -matchlen_match2_match_nolit_encodeSnappyBlockAsm: - CMPL DI, $0x01 - JE matchlen_match1_match_nolit_encodeSnappyBlockAsm - JB match_nolit_end_encodeSnappyBlockAsm - MOVW (R8)(R10*1), R9 - CMPW (SI)(R10*1), R9 - JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm - LEAL 2(R10), R10 
- SUBL $0x02, DI - JZ match_nolit_end_encodeSnappyBlockAsm - -matchlen_match1_match_nolit_encodeSnappyBlockAsm: - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeSnappyBlockAsm - LEAL 1(R10), R10 - -match_nolit_end_encodeSnappyBlockAsm: - ADDL R10, DX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL DX, 12(SP) - - // emitCopy - CMPL SI, $0x00010000 - JB two_byte_offset_match_nolit_encodeSnappyBlockAsm - -four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm: - CMPL R10, $0x40 - JBE four_bytes_remain_match_nolit_encodeSnappyBlockAsm - MOVB $0xff, (CX) - MOVL SI, 1(CX) - LEAL -64(R10), R10 - ADDQ $0x05, CX - CMPL R10, $0x04 - JB four_bytes_remain_match_nolit_encodeSnappyBlockAsm - JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm - -four_bytes_remain_match_nolit_encodeSnappyBlockAsm: - TESTL R10, R10 - JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm - XORL DI, DI - LEAL -1(DI)(R10*4), R10 - MOVB R10, (CX) - MOVL SI, 1(CX) - ADDQ $0x05, CX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm - -two_byte_offset_match_nolit_encodeSnappyBlockAsm: - CMPL R10, $0x40 - JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm - MOVB $0xee, (CX) - MOVW SI, 1(CX) - LEAL -60(R10), R10 - ADDQ $0x03, CX - JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm - -two_byte_offset_short_match_nolit_encodeSnappyBlockAsm: - MOVL R10, DI - SHLL $0x02, DI - CMPL R10, $0x0c - JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm - CMPL SI, $0x00000800 - JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm - LEAL -15(DI), DI - MOVB SI, 1(CX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm - -emit_copy_three_match_nolit_encodeSnappyBlockAsm: - LEAL -2(DI), DI - MOVB DI, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - -match_nolit_emitcopy_end_encodeSnappyBlockAsm: - CMPL DX, 8(SP) - JAE emit_remainder_encodeSnappyBlockAsm - MOVQ -2(BX)(DX*1), DI - CMPQ CX, (SP) - JB 
match_nolit_dst_ok_encodeSnappyBlockAsm - MOVQ $0x00000000, ret+56(FP) - RET - -match_nolit_dst_ok_encodeSnappyBlockAsm: - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x10, R8 - IMULQ R9, R8 - SHRQ $0x32, R8 - SHLQ $0x10, SI - IMULQ R9, SI - SHRQ $0x32, SI - LEAL -2(DX), R9 - LEAQ (AX)(SI*4), R10 - MOVL (R10), SI - MOVL R9, (AX)(R8*4) - MOVL DX, (R10) - CMPL (BX)(SI*1), DI - JEQ match_nolit_loop_encodeSnappyBlockAsm - INCL DX - JMP search_loop_encodeSnappyBlockAsm - -emit_remainder_encodeSnappyBlockAsm: - MOVQ src_len+32(FP), AX - SUBL 12(SP), AX - LEAQ 5(CX)(AX*1), AX - CMPQ AX, (SP) - JB emit_remainder_ok_encodeSnappyBlockAsm - MOVQ $0x00000000, ret+56(FP) - RET - -emit_remainder_ok_encodeSnappyBlockAsm: - MOVQ src_len+32(FP), AX - MOVL 12(SP), DX - CMPL DX, AX - JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm - MOVL AX, SI - MOVL AX, 12(SP) - LEAQ (BX)(DX*1), AX - SUBL DX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeSnappyBlockAsm - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeSnappyBlockAsm - CMPL DX, $0x00010000 - JB three_bytes_emit_remainder_encodeSnappyBlockAsm - CMPL DX, $0x01000000 - JB four_bytes_emit_remainder_encodeSnappyBlockAsm - MOVB $0xfc, (CX) - MOVL DX, 1(CX) - ADDQ $0x05, CX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm - -four_bytes_emit_remainder_encodeSnappyBlockAsm: - MOVL DX, BX - SHRL $0x10, BX - MOVB $0xf8, (CX) - MOVW DX, 1(CX) - MOVB BL, 3(CX) - ADDQ $0x04, CX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm - -three_bytes_emit_remainder_encodeSnappyBlockAsm: - MOVB $0xf4, (CX) - MOVW DX, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm - -two_bytes_emit_remainder_encodeSnappyBlockAsm: - MOVB $0xf0, (CX) - MOVB DL, 1(CX) - ADDQ $0x02, CX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeSnappyBlockAsm - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm - -one_byte_emit_remainder_encodeSnappyBlockAsm: - 
SHLB $0x02, DL - MOVB DL, (CX) - ADDQ $0x01, CX - -memmove_emit_remainder_encodeSnappyBlockAsm: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2: - MOVB (AX), SI - MOVB -1(AX)(BX*1), AL - MOVB SI, (CX) - MOVB AL, -1(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3: - MOVW (AX), SI - MOVB 2(AX), AL - MOVW SI, (CX) - MOVB AL, 2(CX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7: - MOVL (AX), SI - MOVL -4(AX)(BX*1), AX - MOVL SI, (CX) - MOVL AX, -4(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16: - MOVQ (AX), SI - MOVQ -8(AX)(BX*1), AX - MOVQ SI, (CX) - MOVQ AX, -8(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32: - MOVOU (AX), X0 - MOVOU -16(AX)(BX*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64: - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, 
-32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBlockAsm: - MOVQ DX, CX - JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm - -memmove_long_emit_remainder_encodeSnappyBlockAsm: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(AX)(R8*1), X4 - MOVOU -16(AX)(R8*1), X5 - MOVOA X4, -32(CX)(R8*1) - MOVOA X5, -16(CX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - MOVQ DX, CX - -emit_literal_done_emit_remainder_encodeSnappyBlockAsm: - MOVQ dst_base+0(FP), AX - SUBQ AX, CX - MOVQ CX, ret+56(FP) - RET - -// func encodeSnappyBlockAsm64K(dst []byte, src []byte, tmp *[65536]byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBlockAsm64K(SB), $24-64 - MOVQ tmp+48(FP), AX - MOVQ dst_base+0(FP), CX - MOVQ $0x00000200, DX - MOVQ AX, BX - PXOR X0, X0 - -zero_loop_encodeSnappyBlockAsm64K: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeSnappyBlockAsm64K - MOVL $0x00000000, 12(SP) - MOVQ 
src_len+32(FP), DX - LEAQ -9(DX), BX - LEAQ -8(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL DX, 16(SP) - MOVQ src_base+24(FP), BX - -search_loop_encodeSnappyBlockAsm64K: - MOVL DX, SI - SUBL 12(SP), SI - SHRL $0x06, SI - LEAL 4(DX)(SI*1), SI - CMPL SI, 8(SP) - JAE emit_remainder_encodeSnappyBlockAsm64K - MOVQ (BX)(DX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - SHLQ $0x10, R11 - IMULQ R9, R11 - SHRQ $0x32, R11 - MOVL (AX)(R10*4), SI - MOVL (AX)(R11*4), R8 - MOVL DX, (AX)(R10*4) - LEAL 1(DX), R10 - MOVL R10, (AX)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - MOVL DX, R9 - SUBL 16(SP), R9 - MOVL 1(BX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeSnappyBlockAsm64K - LEAL 1(DX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeSnappyBlockAsm64K - -repeat_extend_back_loop_encodeSnappyBlockAsm64K: - CMPL DI, SI - JBE repeat_extend_back_end_encodeSnappyBlockAsm64K - MOVB -1(BX)(R8*1), R9 - MOVB -1(BX)(DI*1), R10 - CMPB R9, R10 - JNE repeat_extend_back_end_encodeSnappyBlockAsm64K - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeSnappyBlockAsm64K - -repeat_extend_back_end_encodeSnappyBlockAsm64K: - MOVL DI, SI - SUBL 12(SP), SI - LEAQ 3(CX)(SI*1), SI - CMPQ SI, (SP) - JB repeat_dst_size_check_encodeSnappyBlockAsm64K - MOVQ $0x00000000, ret+56(FP) - RET - -repeat_dst_size_check_encodeSnappyBlockAsm64K: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JB one_byte_repeat_emit_encodeSnappyBlockAsm64K - CMPL SI, $0x00000100 - JB two_bytes_repeat_emit_encodeSnappyBlockAsm64K - JB 
three_bytes_repeat_emit_encodeSnappyBlockAsm64K - -three_bytes_repeat_emit_encodeSnappyBlockAsm64K: - MOVB $0xf4, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K - -two_bytes_repeat_emit_encodeSnappyBlockAsm64K: - MOVB $0xf0, (CX) - MOVB SI, 1(CX) - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_repeat_emit_encodeSnappyBlockAsm64K - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K - -one_byte_repeat_emit_encodeSnappyBlockAsm64K: - SHLB $0x02, SI - MOVB SI, (CX) - ADDQ $0x01, CX - -memmove_repeat_emit_encodeSnappyBlockAsm64K: - LEAQ (CX)(R8*1), SI - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8: - MOVQ (R9), R10 - MOVQ R10, (CX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (CX) - MOVQ R9, -8(CX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R8*1) - MOVOU X3, -16(CX)(R8*1) - -memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K: - MOVQ SI, CX - JMP 
emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K - -memmove_long_repeat_emit_encodeSnappyBlockAsm64K: - LEAQ (CX)(R8*1), SI - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(CX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(CX)(R12*1) - MOVOA X5, -16(CX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R8*1) - MOVOU X3, -16(CX)(R8*1) - MOVQ SI, CX - -emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K: - ADDL $0x05, DX - MOVL DX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL DX, R8 - LEAQ (BX)(DX*1), R9 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R11, R11 - -matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K: - CMPL R8, $0x10 - JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K - MOVQ (R9)(R11*1), R10 - MOVQ 8(R9)(R11*1), R12 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K - XORQ 8(SI)(R11*1), R12 - JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K - LEAL -16(R8), R8 - LEAL 16(R11), R11 - JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K - -matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K: -#ifdef GOAMD64_v3 - TZCNTQ R12, 
R12 - -#else - BSFQ R12, R12 - -#endif - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K - -matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K: - CMPL R8, $0x08 - JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K - LEAL -8(R8), R8 - LEAL 8(R11), R11 - JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K - -matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K - -matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K: - CMPL R8, $0x04 - JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K - MOVL (R9)(R11*1), R10 - CMPL (SI)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K - LEAL -4(R8), R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K: - CMPL R8, $0x01 - JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K - JB repeat_extend_forward_end_encodeSnappyBlockAsm64K - MOVW (R9)(R11*1), R10 - CMPW (SI)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K - LEAL 2(R11), R11 - SUBL $0x02, R8 - JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K - -matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K: - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeSnappyBlockAsm64K: - ADDL R11, DX - MOVL DX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitCopy -two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K: - CMPL SI, $0x40 - JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K - MOVB $0xee, (CX) - MOVW DI, 1(CX) - LEAL -60(SI), SI - ADDQ $0x03, CX - JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K - 
-two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K: - MOVL SI, R8 - SHLL $0x02, R8 - CMPL SI, $0x0c - JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K - CMPL DI, $0x00000800 - JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K - LEAL -15(R8), R8 - MOVB DI, 1(CX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, R8 - MOVB R8, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeSnappyBlockAsm64K - -emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K: - LEAL -2(R8), R8 - MOVB R8, (CX) - MOVW DI, 1(CX) - ADDQ $0x03, CX - -repeat_end_emit_encodeSnappyBlockAsm64K: - MOVL DX, 12(SP) - JMP search_loop_encodeSnappyBlockAsm64K - -no_repeat_found_encodeSnappyBlockAsm64K: - CMPL (BX)(SI*1), DI - JEQ candidate_match_encodeSnappyBlockAsm64K - SHRQ $0x08, DI - MOVL (AX)(R10*4), SI - LEAL 2(DX), R9 - CMPL (BX)(R8*1), DI - JEQ candidate2_match_encodeSnappyBlockAsm64K - MOVL R9, (AX)(R10*4) - SHRQ $0x08, DI - CMPL (BX)(SI*1), DI - JEQ candidate3_match_encodeSnappyBlockAsm64K - MOVL 20(SP), DX - JMP search_loop_encodeSnappyBlockAsm64K - -candidate3_match_encodeSnappyBlockAsm64K: - ADDL $0x02, DX - JMP candidate_match_encodeSnappyBlockAsm64K - -candidate2_match_encodeSnappyBlockAsm64K: - MOVL R9, (AX)(R10*4) - INCL DX - MOVL R8, SI - -candidate_match_encodeSnappyBlockAsm64K: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBlockAsm64K - -match_extend_back_loop_encodeSnappyBlockAsm64K: - CMPL DX, DI - JBE match_extend_back_end_encodeSnappyBlockAsm64K - MOVB -1(BX)(SI*1), R8 - MOVB -1(BX)(DX*1), R9 - CMPB R8, R9 - JNE match_extend_back_end_encodeSnappyBlockAsm64K - LEAL -1(DX), DX - DECL SI - JZ match_extend_back_end_encodeSnappyBlockAsm64K - JMP match_extend_back_loop_encodeSnappyBlockAsm64K - -match_extend_back_end_encodeSnappyBlockAsm64K: - MOVL DX, DI - SUBL 12(SP), DI - LEAQ 3(CX)(DI*1), DI - CMPQ DI, (SP) - JB match_dst_size_check_encodeSnappyBlockAsm64K - MOVQ $0x00000000, ret+56(FP) - RET - 
-match_dst_size_check_encodeSnappyBlockAsm64K: - MOVL DX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm64K - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JB one_byte_match_emit_encodeSnappyBlockAsm64K - CMPL R8, $0x00000100 - JB two_bytes_match_emit_encodeSnappyBlockAsm64K - JB three_bytes_match_emit_encodeSnappyBlockAsm64K - -three_bytes_match_emit_encodeSnappyBlockAsm64K: - MOVB $0xf4, (CX) - MOVW R8, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_encodeSnappyBlockAsm64K - -two_bytes_match_emit_encodeSnappyBlockAsm64K: - MOVB $0xf0, (CX) - MOVB R8, 1(CX) - ADDQ $0x02, CX - CMPL R8, $0x40 - JB memmove_match_emit_encodeSnappyBlockAsm64K - JMP memmove_long_match_emit_encodeSnappyBlockAsm64K - -one_byte_match_emit_encodeSnappyBlockAsm64K: - SHLB $0x02, R8 - MOVB R8, (CX) - ADDQ $0x01, CX - -memmove_match_emit_encodeSnappyBlockAsm64K: - LEAQ (CX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (CX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (CX) - MOVQ DI, -8(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K - 
-emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBlockAsm64K: - MOVQ R8, CX - JMP emit_literal_done_match_emit_encodeSnappyBlockAsm64K - -memmove_long_match_emit_encodeSnappyBlockAsm64K: - LEAQ (CX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(CX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(CX)(R12*1) - MOVOA X5, -16(CX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ R8, CX - -emit_literal_done_match_emit_encodeSnappyBlockAsm64K: -match_nolit_loop_encodeSnappyBlockAsm64K: - MOVL DX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, DX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL DX, DI - LEAQ (BX)(DX*1), R8 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R10, R10 - -matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K: - CMPL DI, $0x10 - JB matchlen_match8_match_nolit_encodeSnappyBlockAsm64K - MOVQ 
(R8)(R10*1), R9 - MOVQ 8(R8)(R10*1), R11 - XORQ (SI)(R10*1), R9 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K - XORQ 8(SI)(R10*1), R11 - JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K - LEAL -16(DI), DI - LEAL 16(R10), R10 - JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K - -matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL 8(R10)(R11*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm64K - -matchlen_match8_match_nolit_encodeSnappyBlockAsm64K: - CMPL DI, $0x08 - JB matchlen_match4_match_nolit_encodeSnappyBlockAsm64K - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K - LEAL -8(DI), DI - LEAL 8(R10), R10 - JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm64K - -matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K: -#ifdef GOAMD64_v3 - TZCNTQ R9, R9 - -#else - BSFQ R9, R9 - -#endif - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm64K - -matchlen_match4_match_nolit_encodeSnappyBlockAsm64K: - CMPL DI, $0x04 - JB matchlen_match2_match_nolit_encodeSnappyBlockAsm64K - MOVL (R8)(R10*1), R9 - CMPL (SI)(R10*1), R9 - JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm64K - LEAL -4(DI), DI - LEAL 4(R10), R10 - -matchlen_match2_match_nolit_encodeSnappyBlockAsm64K: - CMPL DI, $0x01 - JE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K - JB match_nolit_end_encodeSnappyBlockAsm64K - MOVW (R8)(R10*1), R9 - CMPW (SI)(R10*1), R9 - JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K - LEAL 2(R10), R10 - SUBL $0x02, DI - JZ match_nolit_end_encodeSnappyBlockAsm64K - -matchlen_match1_match_nolit_encodeSnappyBlockAsm64K: - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeSnappyBlockAsm64K - LEAL 1(R10), R10 - -match_nolit_end_encodeSnappyBlockAsm64K: - ADDL R10, DX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL DX, 12(SP) - - // emitCopy 
-two_byte_offset_match_nolit_encodeSnappyBlockAsm64K: - CMPL R10, $0x40 - JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K - MOVB $0xee, (CX) - MOVW SI, 1(CX) - LEAL -60(R10), R10 - ADDQ $0x03, CX - JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm64K - -two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K: - MOVL R10, DI - SHLL $0x02, DI - CMPL R10, $0x0c - JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K - CMPL SI, $0x00000800 - JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K - LEAL -15(DI), DI - MOVB SI, 1(CX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm64K - -emit_copy_three_match_nolit_encodeSnappyBlockAsm64K: - LEAL -2(DI), DI - MOVB DI, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - -match_nolit_emitcopy_end_encodeSnappyBlockAsm64K: - CMPL DX, 8(SP) - JAE emit_remainder_encodeSnappyBlockAsm64K - MOVQ -2(BX)(DX*1), DI - CMPQ CX, (SP) - JB match_nolit_dst_ok_encodeSnappyBlockAsm64K - MOVQ $0x00000000, ret+56(FP) - RET - -match_nolit_dst_ok_encodeSnappyBlockAsm64K: - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x10, R8 - IMULQ R9, R8 - SHRQ $0x32, R8 - SHLQ $0x10, SI - IMULQ R9, SI - SHRQ $0x32, SI - LEAL -2(DX), R9 - LEAQ (AX)(SI*4), R10 - MOVL (R10), SI - MOVL R9, (AX)(R8*4) - MOVL DX, (R10) - CMPL (BX)(SI*1), DI - JEQ match_nolit_loop_encodeSnappyBlockAsm64K - INCL DX - JMP search_loop_encodeSnappyBlockAsm64K - -emit_remainder_encodeSnappyBlockAsm64K: - MOVQ src_len+32(FP), AX - SUBL 12(SP), AX - LEAQ 3(CX)(AX*1), AX - CMPQ AX, (SP) - JB emit_remainder_ok_encodeSnappyBlockAsm64K - MOVQ $0x00000000, ret+56(FP) - RET - -emit_remainder_ok_encodeSnappyBlockAsm64K: - MOVQ src_len+32(FP), AX - MOVL 12(SP), DX - CMPL DX, AX - JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K - MOVL AX, SI - MOVL AX, 12(SP) - LEAQ (BX)(DX*1), AX - SUBL DX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB 
one_byte_emit_remainder_encodeSnappyBlockAsm64K - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeSnappyBlockAsm64K - JB three_bytes_emit_remainder_encodeSnappyBlockAsm64K - -three_bytes_emit_remainder_encodeSnappyBlockAsm64K: - MOVB $0xf4, (CX) - MOVW DX, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K - -two_bytes_emit_remainder_encodeSnappyBlockAsm64K: - MOVB $0xf0, (CX) - MOVB DL, 1(CX) - ADDQ $0x02, CX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeSnappyBlockAsm64K - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K - -one_byte_emit_remainder_encodeSnappyBlockAsm64K: - SHLB $0x02, DL - MOVB DL, (CX) - ADDQ $0x01, CX - -memmove_emit_remainder_encodeSnappyBlockAsm64K: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2: - MOVB (AX), SI - MOVB -1(AX)(BX*1), AL - MOVB SI, (CX) - MOVB AL, -1(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3: - MOVW (AX), SI - MOVB 2(AX), AL - MOVW SI, (CX) - MOVB AL, 2(CX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7: - MOVL (AX), SI - MOVL -4(AX)(BX*1), AX - MOVL SI, (CX) - MOVL AX, -4(CX)(BX*1) - JMP 
memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16: - MOVQ (AX), SI - MOVQ -8(AX)(BX*1), AX - MOVQ SI, (CX) - MOVQ AX, -8(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32: - MOVOU (AX), X0 - MOVOU -16(AX)(BX*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64: - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K: - MOVQ DX, CX - JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K - -memmove_long_emit_remainder_encodeSnappyBlockAsm64K: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: - MOVOU -32(AX)(R8*1), X4 - MOVOU -16(AX)(R8*1), X5 - MOVOA X4, -32(CX)(R8*1) - MOVOA X5, -16(CX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 - MOVOU 
X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - MOVQ DX, CX - -emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K: - MOVQ dst_base+0(FP), AX - SUBQ AX, CX - MOVQ CX, ret+56(FP) - RET - -// func encodeSnappyBlockAsm12B(dst []byte, src []byte, tmp *[16384]byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBlockAsm12B(SB), $24-64 - MOVQ tmp+48(FP), AX - MOVQ dst_base+0(FP), CX - MOVQ $0x00000080, DX - MOVQ AX, BX - PXOR X0, X0 - -zero_loop_encodeSnappyBlockAsm12B: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeSnappyBlockAsm12B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), DX - LEAQ -9(DX), BX - LEAQ -8(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL DX, 16(SP) - MOVQ src_base+24(FP), BX - -search_loop_encodeSnappyBlockAsm12B: - MOVL DX, SI - SUBL 12(SP), SI - SHRL $0x05, SI - LEAL 4(DX)(SI*1), SI - CMPL SI, 8(SP) - JAE emit_remainder_encodeSnappyBlockAsm12B - MOVQ (BX)(DX*1), DI - MOVL SI, 20(SP) - MOVQ $0x000000cf1bbcdcbb, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x18, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - SHLQ $0x18, R11 - IMULQ R9, R11 - SHRQ $0x34, R11 - MOVL (AX)(R10*4), SI - MOVL (AX)(R11*4), R8 - MOVL DX, (AX)(R10*4) - LEAL 1(DX), R10 - MOVL R10, (AX)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x18, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - MOVL DX, R9 - SUBL 16(SP), R9 - MOVL 1(BX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeSnappyBlockAsm12B - LEAL 1(DX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeSnappyBlockAsm12B - -repeat_extend_back_loop_encodeSnappyBlockAsm12B: - CMPL DI, SI - JBE repeat_extend_back_end_encodeSnappyBlockAsm12B - MOVB -1(BX)(R8*1), R9 - MOVB -1(BX)(DI*1), R10 - 
CMPB R9, R10 - JNE repeat_extend_back_end_encodeSnappyBlockAsm12B - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B - -repeat_extend_back_end_encodeSnappyBlockAsm12B: - MOVL DI, SI - SUBL 12(SP), SI - LEAQ 3(CX)(SI*1), SI - CMPQ SI, (SP) - JB repeat_dst_size_check_encodeSnappyBlockAsm12B - MOVQ $0x00000000, ret+56(FP) - RET - -repeat_dst_size_check_encodeSnappyBlockAsm12B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JB one_byte_repeat_emit_encodeSnappyBlockAsm12B - CMPL SI, $0x00000100 - JB two_bytes_repeat_emit_encodeSnappyBlockAsm12B - JB three_bytes_repeat_emit_encodeSnappyBlockAsm12B - -three_bytes_repeat_emit_encodeSnappyBlockAsm12B: - MOVB $0xf4, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B - -two_bytes_repeat_emit_encodeSnappyBlockAsm12B: - MOVB $0xf0, (CX) - MOVB SI, 1(CX) - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_repeat_emit_encodeSnappyBlockAsm12B - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B - -one_byte_repeat_emit_encodeSnappyBlockAsm12B: - SHLB $0x02, SI - MOVB SI, (CX) - ADDQ $0x01, CX - -memmove_repeat_emit_encodeSnappyBlockAsm12B: - LEAQ (CX)(R8*1), SI - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8: - MOVQ (R9), R10 - MOVQ R10, (CX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: - MOVQ (R9), R10 - 
MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (CX) - MOVQ R9, -8(CX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R8*1) - MOVOU X3, -16(CX)(R8*1) - -memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B: - MOVQ SI, CX - JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B - -memmove_long_repeat_emit_encodeSnappyBlockAsm12B: - LEAQ (CX)(R8*1), SI - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(CX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(CX)(R12*1) - MOVOA X5, -16(CX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R8*1) - MOVOU X3, -16(CX)(R8*1) - MOVQ SI, CX - -emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B: - ADDL $0x05, DX - MOVL DX, 
SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL DX, R8 - LEAQ (BX)(DX*1), R9 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R11, R11 - -matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B: - CMPL R8, $0x10 - JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B - MOVQ (R9)(R11*1), R10 - MOVQ 8(R9)(R11*1), R12 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B - XORQ 8(SI)(R11*1), R12 - JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B - LEAL -16(R8), R8 - LEAL 16(R11), R11 - JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B - -matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B: -#ifdef GOAMD64_v3 - TZCNTQ R12, R12 - -#else - BSFQ R12, R12 - -#endif - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B - -matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B: - CMPL R8, $0x08 - JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B - LEAL -8(R8), R8 - LEAL 8(R11), R11 - JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B - -matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B - -matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B: - CMPL R8, $0x04 - JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B - MOVL (R9)(R11*1), R10 - CMPL (SI)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B - LEAL -4(R8), R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B: - CMPL R8, $0x01 - JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B - JB repeat_extend_forward_end_encodeSnappyBlockAsm12B - MOVW (R9)(R11*1), R10 - CMPW (SI)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B - LEAL 2(R11), R11 - SUBL $0x02, 
R8 - JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B - -matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B: - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeSnappyBlockAsm12B: - ADDL R11, DX - MOVL DX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitCopy -two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B: - CMPL SI, $0x40 - JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B - MOVB $0xee, (CX) - MOVW DI, 1(CX) - LEAL -60(SI), SI - ADDQ $0x03, CX - JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B - -two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B: - MOVL SI, R8 - SHLL $0x02, R8 - CMPL SI, $0x0c - JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B - CMPL DI, $0x00000800 - JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B - LEAL -15(R8), R8 - MOVB DI, 1(CX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, R8 - MOVB R8, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeSnappyBlockAsm12B - -emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B: - LEAL -2(R8), R8 - MOVB R8, (CX) - MOVW DI, 1(CX) - ADDQ $0x03, CX - -repeat_end_emit_encodeSnappyBlockAsm12B: - MOVL DX, 12(SP) - JMP search_loop_encodeSnappyBlockAsm12B - -no_repeat_found_encodeSnappyBlockAsm12B: - CMPL (BX)(SI*1), DI - JEQ candidate_match_encodeSnappyBlockAsm12B - SHRQ $0x08, DI - MOVL (AX)(R10*4), SI - LEAL 2(DX), R9 - CMPL (BX)(R8*1), DI - JEQ candidate2_match_encodeSnappyBlockAsm12B - MOVL R9, (AX)(R10*4) - SHRQ $0x08, DI - CMPL (BX)(SI*1), DI - JEQ candidate3_match_encodeSnappyBlockAsm12B - MOVL 20(SP), DX - JMP search_loop_encodeSnappyBlockAsm12B - -candidate3_match_encodeSnappyBlockAsm12B: - ADDL $0x02, DX - JMP candidate_match_encodeSnappyBlockAsm12B - -candidate2_match_encodeSnappyBlockAsm12B: - MOVL R9, (AX)(R10*4) - INCL DX - MOVL R8, SI - -candidate_match_encodeSnappyBlockAsm12B: - MOVL 12(SP), DI - TESTL SI, SI - JZ 
match_extend_back_end_encodeSnappyBlockAsm12B - -match_extend_back_loop_encodeSnappyBlockAsm12B: - CMPL DX, DI - JBE match_extend_back_end_encodeSnappyBlockAsm12B - MOVB -1(BX)(SI*1), R8 - MOVB -1(BX)(DX*1), R9 - CMPB R8, R9 - JNE match_extend_back_end_encodeSnappyBlockAsm12B - LEAL -1(DX), DX - DECL SI - JZ match_extend_back_end_encodeSnappyBlockAsm12B - JMP match_extend_back_loop_encodeSnappyBlockAsm12B - -match_extend_back_end_encodeSnappyBlockAsm12B: - MOVL DX, DI - SUBL 12(SP), DI - LEAQ 3(CX)(DI*1), DI - CMPQ DI, (SP) - JB match_dst_size_check_encodeSnappyBlockAsm12B - MOVQ $0x00000000, ret+56(FP) - RET - -match_dst_size_check_encodeSnappyBlockAsm12B: - MOVL DX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JB one_byte_match_emit_encodeSnappyBlockAsm12B - CMPL R8, $0x00000100 - JB two_bytes_match_emit_encodeSnappyBlockAsm12B - JB three_bytes_match_emit_encodeSnappyBlockAsm12B - -three_bytes_match_emit_encodeSnappyBlockAsm12B: - MOVB $0xf4, (CX) - MOVW R8, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_encodeSnappyBlockAsm12B - -two_bytes_match_emit_encodeSnappyBlockAsm12B: - MOVB $0xf0, (CX) - MOVB R8, 1(CX) - ADDQ $0x02, CX - CMPL R8, $0x40 - JB memmove_match_emit_encodeSnappyBlockAsm12B - JMP memmove_long_match_emit_encodeSnappyBlockAsm12B - -one_byte_match_emit_encodeSnappyBlockAsm12B: - SHLB $0x02, R8 - MOVB R8, (CX) - ADDQ $0x01, CX - -memmove_match_emit_encodeSnappyBlockAsm12B: - LEAQ (CX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 - 
-emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (CX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (CX) - MOVQ DI, -8(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBlockAsm12B: - MOVQ R8, CX - JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B - -memmove_long_match_emit_encodeSnappyBlockAsm12B: - LEAQ (CX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(CX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(CX)(R12*1) - MOVOA X5, -16(CX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE 
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ R8, CX - -emit_literal_done_match_emit_encodeSnappyBlockAsm12B: -match_nolit_loop_encodeSnappyBlockAsm12B: - MOVL DX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, DX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL DX, DI - LEAQ (BX)(DX*1), R8 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R10, R10 - -matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B: - CMPL DI, $0x10 - JB matchlen_match8_match_nolit_encodeSnappyBlockAsm12B - MOVQ (R8)(R10*1), R9 - MOVQ 8(R8)(R10*1), R11 - XORQ (SI)(R10*1), R9 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B - XORQ 8(SI)(R10*1), R11 - JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B - LEAL -16(DI), DI - LEAL 16(R10), R10 - JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B - -matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL 8(R10)(R11*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm12B - -matchlen_match8_match_nolit_encodeSnappyBlockAsm12B: - CMPL DI, $0x08 - JB matchlen_match4_match_nolit_encodeSnappyBlockAsm12B - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B - LEAL -8(DI), DI - LEAL 8(R10), R10 - JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm12B - -matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B: -#ifdef GOAMD64_v3 - TZCNTQ R9, R9 - -#else - BSFQ R9, R9 - -#endif - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm12B - -matchlen_match4_match_nolit_encodeSnappyBlockAsm12B: - CMPL DI, $0x04 - JB matchlen_match2_match_nolit_encodeSnappyBlockAsm12B - MOVL (R8)(R10*1), R9 - CMPL (SI)(R10*1), R9 - JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm12B - LEAL -4(DI), DI - LEAL 4(R10), R10 - 
-matchlen_match2_match_nolit_encodeSnappyBlockAsm12B: - CMPL DI, $0x01 - JE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B - JB match_nolit_end_encodeSnappyBlockAsm12B - MOVW (R8)(R10*1), R9 - CMPW (SI)(R10*1), R9 - JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B - LEAL 2(R10), R10 - SUBL $0x02, DI - JZ match_nolit_end_encodeSnappyBlockAsm12B - -matchlen_match1_match_nolit_encodeSnappyBlockAsm12B: - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeSnappyBlockAsm12B - LEAL 1(R10), R10 - -match_nolit_end_encodeSnappyBlockAsm12B: - ADDL R10, DX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL DX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBlockAsm12B: - CMPL R10, $0x40 - JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B - MOVB $0xee, (CX) - MOVW SI, 1(CX) - LEAL -60(R10), R10 - ADDQ $0x03, CX - JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B - -two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B: - MOVL R10, DI - SHLL $0x02, DI - CMPL R10, $0x0c - JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B - CMPL SI, $0x00000800 - JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B - LEAL -15(DI), DI - MOVB SI, 1(CX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B - -emit_copy_three_match_nolit_encodeSnappyBlockAsm12B: - LEAL -2(DI), DI - MOVB DI, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - -match_nolit_emitcopy_end_encodeSnappyBlockAsm12B: - CMPL DX, 8(SP) - JAE emit_remainder_encodeSnappyBlockAsm12B - MOVQ -2(BX)(DX*1), DI - CMPQ CX, (SP) - JB match_nolit_dst_ok_encodeSnappyBlockAsm12B - MOVQ $0x00000000, ret+56(FP) - RET - -match_nolit_dst_ok_encodeSnappyBlockAsm12B: - MOVQ $0x000000cf1bbcdcbb, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x18, R8 - IMULQ R9, R8 - SHRQ $0x34, R8 - SHLQ $0x18, SI - IMULQ R9, SI - SHRQ $0x34, SI - LEAL -2(DX), R9 - LEAQ (AX)(SI*4), R10 - MOVL (R10), SI - MOVL R9, 
(AX)(R8*4) - MOVL DX, (R10) - CMPL (BX)(SI*1), DI - JEQ match_nolit_loop_encodeSnappyBlockAsm12B - INCL DX - JMP search_loop_encodeSnappyBlockAsm12B - -emit_remainder_encodeSnappyBlockAsm12B: - MOVQ src_len+32(FP), AX - SUBL 12(SP), AX - LEAQ 3(CX)(AX*1), AX - CMPQ AX, (SP) - JB emit_remainder_ok_encodeSnappyBlockAsm12B - MOVQ $0x00000000, ret+56(FP) - RET - -emit_remainder_ok_encodeSnappyBlockAsm12B: - MOVQ src_len+32(FP), AX - MOVL 12(SP), DX - CMPL DX, AX - JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B - MOVL AX, SI - MOVL AX, 12(SP) - LEAQ (BX)(DX*1), AX - SUBL DX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeSnappyBlockAsm12B - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeSnappyBlockAsm12B - JB three_bytes_emit_remainder_encodeSnappyBlockAsm12B - -three_bytes_emit_remainder_encodeSnappyBlockAsm12B: - MOVB $0xf4, (CX) - MOVW DX, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B - -two_bytes_emit_remainder_encodeSnappyBlockAsm12B: - MOVB $0xf0, (CX) - MOVB DL, 1(CX) - ADDQ $0x02, CX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeSnappyBlockAsm12B - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B - -one_byte_emit_remainder_encodeSnappyBlockAsm12B: - SHLB $0x02, DL - MOVB DL, (CX) - ADDQ $0x01, CX - -memmove_emit_remainder_encodeSnappyBlockAsm12B: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64 - 
-emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2: - MOVB (AX), SI - MOVB -1(AX)(BX*1), AL - MOVB SI, (CX) - MOVB AL, -1(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3: - MOVW (AX), SI - MOVB 2(AX), AL - MOVW SI, (CX) - MOVB AL, 2(CX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7: - MOVL (AX), SI - MOVL -4(AX)(BX*1), AX - MOVL SI, (CX) - MOVL AX, -4(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16: - MOVQ (AX), SI - MOVQ -8(AX)(BX*1), AX - MOVQ SI, (CX) - MOVQ AX, -8(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32: - MOVOU (AX), X0 - MOVOU -16(AX)(BX*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64: - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B: - MOVQ DX, CX - JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B - -memmove_long_emit_remainder_encodeSnappyBlockAsm12B: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 - 
-emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(AX)(R8*1), X4 - MOVOU -16(AX)(R8*1), X5 - MOVOA X4, -32(CX)(R8*1) - MOVOA X5, -16(CX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - MOVQ DX, CX - -emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B: - MOVQ dst_base+0(FP), AX - SUBQ AX, CX - MOVQ CX, ret+56(FP) - RET - -// func encodeSnappyBlockAsm10B(dst []byte, src []byte, tmp *[4096]byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBlockAsm10B(SB), $24-64 - MOVQ tmp+48(FP), AX - MOVQ dst_base+0(FP), CX - MOVQ $0x00000020, DX - MOVQ AX, BX - PXOR X0, X0 - -zero_loop_encodeSnappyBlockAsm10B: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeSnappyBlockAsm10B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), DX - LEAQ -9(DX), BX - LEAQ -8(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL DX, 16(SP) - MOVQ src_base+24(FP), BX - -search_loop_encodeSnappyBlockAsm10B: - MOVL DX, SI - SUBL 12(SP), SI - SHRL $0x05, SI - LEAL 4(DX)(SI*1), SI - CMPL SI, 8(SP) - JAE emit_remainder_encodeSnappyBlockAsm10B - MOVQ (BX)(DX*1), DI - MOVL SI, 20(SP) - MOVQ $0x9e3779b1, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x36, R10 - SHLQ $0x20, R11 - IMULQ R9, R11 - SHRQ $0x36, R11 - MOVL 
(AX)(R10*4), SI - MOVL (AX)(R11*4), R8 - MOVL DX, (AX)(R10*4) - LEAL 1(DX), R10 - MOVL R10, (AX)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x36, R10 - MOVL DX, R9 - SUBL 16(SP), R9 - MOVL 1(BX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeSnappyBlockAsm10B - LEAL 1(DX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeSnappyBlockAsm10B - -repeat_extend_back_loop_encodeSnappyBlockAsm10B: - CMPL DI, SI - JBE repeat_extend_back_end_encodeSnappyBlockAsm10B - MOVB -1(BX)(R8*1), R9 - MOVB -1(BX)(DI*1), R10 - CMPB R9, R10 - JNE repeat_extend_back_end_encodeSnappyBlockAsm10B - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B - -repeat_extend_back_end_encodeSnappyBlockAsm10B: - MOVL DI, SI - SUBL 12(SP), SI - LEAQ 3(CX)(SI*1), SI - CMPQ SI, (SP) - JB repeat_dst_size_check_encodeSnappyBlockAsm10B - MOVQ $0x00000000, ret+56(FP) - RET - -repeat_dst_size_check_encodeSnappyBlockAsm10B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JB one_byte_repeat_emit_encodeSnappyBlockAsm10B - CMPL SI, $0x00000100 - JB two_bytes_repeat_emit_encodeSnappyBlockAsm10B - JB three_bytes_repeat_emit_encodeSnappyBlockAsm10B - -three_bytes_repeat_emit_encodeSnappyBlockAsm10B: - MOVB $0xf4, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B - -two_bytes_repeat_emit_encodeSnappyBlockAsm10B: - MOVB $0xf0, (CX) - MOVB SI, 1(CX) - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_repeat_emit_encodeSnappyBlockAsm10B - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B - -one_byte_repeat_emit_encodeSnappyBlockAsm10B: - SHLB $0x02, SI - MOVB SI, (CX) - ADDQ $0x01, CX - -memmove_repeat_emit_encodeSnappyBlockAsm10B: - LEAQ (CX)(R8*1), SI - - // genMemMoveShort - CMPQ R8, $0x08 
- JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8: - MOVQ (R9), R10 - MOVQ R10, (CX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (CX) - MOVQ R9, -8(CX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R8*1) - MOVOU X3, -16(CX)(R8*1) - -memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B: - MOVQ SI, CX - JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B - -memmove_long_repeat_emit_encodeSnappyBlockAsm10B: - LEAQ (CX)(R8*1), SI - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(CX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ 
$0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(CX)(R12*1) - MOVOA X5, -16(CX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R8*1) - MOVOU X3, -16(CX)(R8*1) - MOVQ SI, CX - -emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B: - ADDL $0x05, DX - MOVL DX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL DX, R8 - LEAQ (BX)(DX*1), R9 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R11, R11 - -matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B: - CMPL R8, $0x10 - JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B - MOVQ (R9)(R11*1), R10 - MOVQ 8(R9)(R11*1), R12 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B - XORQ 8(SI)(R11*1), R12 - JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B - LEAL -16(R8), R8 - LEAL 16(R11), R11 - JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B - -matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B: -#ifdef GOAMD64_v3 - TZCNTQ R12, R12 - -#else - BSFQ R12, R12 - -#endif - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B - -matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B: - CMPL R8, $0x08 - JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B - LEAL -8(R8), R8 - LEAL 8(R11), R11 - JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B - -matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP 
repeat_extend_forward_end_encodeSnappyBlockAsm10B - -matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B: - CMPL R8, $0x04 - JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B - MOVL (R9)(R11*1), R10 - CMPL (SI)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B - LEAL -4(R8), R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B: - CMPL R8, $0x01 - JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B - JB repeat_extend_forward_end_encodeSnappyBlockAsm10B - MOVW (R9)(R11*1), R10 - CMPW (SI)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B - LEAL 2(R11), R11 - SUBL $0x02, R8 - JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B - -matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B: - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeSnappyBlockAsm10B: - ADDL R11, DX - MOVL DX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitCopy -two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B: - CMPL SI, $0x40 - JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B - MOVB $0xee, (CX) - MOVW DI, 1(CX) - LEAL -60(SI), SI - ADDQ $0x03, CX - JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B - -two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B: - MOVL SI, R8 - SHLL $0x02, R8 - CMPL SI, $0x0c - JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B - CMPL DI, $0x00000800 - JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B - LEAL -15(R8), R8 - MOVB DI, 1(CX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, R8 - MOVB R8, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeSnappyBlockAsm10B - -emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B: - LEAL -2(R8), R8 - MOVB R8, (CX) - MOVW DI, 1(CX) - ADDQ $0x03, CX - -repeat_end_emit_encodeSnappyBlockAsm10B: - MOVL DX, 12(SP) - JMP search_loop_encodeSnappyBlockAsm10B - -no_repeat_found_encodeSnappyBlockAsm10B: - 
CMPL (BX)(SI*1), DI - JEQ candidate_match_encodeSnappyBlockAsm10B - SHRQ $0x08, DI - MOVL (AX)(R10*4), SI - LEAL 2(DX), R9 - CMPL (BX)(R8*1), DI - JEQ candidate2_match_encodeSnappyBlockAsm10B - MOVL R9, (AX)(R10*4) - SHRQ $0x08, DI - CMPL (BX)(SI*1), DI - JEQ candidate3_match_encodeSnappyBlockAsm10B - MOVL 20(SP), DX - JMP search_loop_encodeSnappyBlockAsm10B - -candidate3_match_encodeSnappyBlockAsm10B: - ADDL $0x02, DX - JMP candidate_match_encodeSnappyBlockAsm10B - -candidate2_match_encodeSnappyBlockAsm10B: - MOVL R9, (AX)(R10*4) - INCL DX - MOVL R8, SI - -candidate_match_encodeSnappyBlockAsm10B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBlockAsm10B - -match_extend_back_loop_encodeSnappyBlockAsm10B: - CMPL DX, DI - JBE match_extend_back_end_encodeSnappyBlockAsm10B - MOVB -1(BX)(SI*1), R8 - MOVB -1(BX)(DX*1), R9 - CMPB R8, R9 - JNE match_extend_back_end_encodeSnappyBlockAsm10B - LEAL -1(DX), DX - DECL SI - JZ match_extend_back_end_encodeSnappyBlockAsm10B - JMP match_extend_back_loop_encodeSnappyBlockAsm10B - -match_extend_back_end_encodeSnappyBlockAsm10B: - MOVL DX, DI - SUBL 12(SP), DI - LEAQ 3(CX)(DI*1), DI - CMPQ DI, (SP) - JB match_dst_size_check_encodeSnappyBlockAsm10B - MOVQ $0x00000000, ret+56(FP) - RET - -match_dst_size_check_encodeSnappyBlockAsm10B: - MOVL DX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JB one_byte_match_emit_encodeSnappyBlockAsm10B - CMPL R8, $0x00000100 - JB two_bytes_match_emit_encodeSnappyBlockAsm10B - JB three_bytes_match_emit_encodeSnappyBlockAsm10B - -three_bytes_match_emit_encodeSnappyBlockAsm10B: - MOVB $0xf4, (CX) - MOVW R8, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_encodeSnappyBlockAsm10B - -two_bytes_match_emit_encodeSnappyBlockAsm10B: - MOVB $0xf0, (CX) - MOVB R8, 1(CX) - ADDQ $0x02, CX - CMPL R8, $0x40 - JB 
memmove_match_emit_encodeSnappyBlockAsm10B - JMP memmove_long_match_emit_encodeSnappyBlockAsm10B - -one_byte_match_emit_encodeSnappyBlockAsm10B: - SHLB $0x02, R8 - MOVB R8, (CX) - ADDQ $0x01, CX - -memmove_match_emit_encodeSnappyBlockAsm10B: - LEAQ (CX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (CX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (CX) - MOVQ DI, -8(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBlockAsm10B: - MOVQ R8, CX - JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B - -memmove_long_match_emit_encodeSnappyBlockAsm10B: - LEAQ (CX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA 
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(CX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(CX)(R12*1) - MOVOA X5, -16(CX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ R8, CX - -emit_literal_done_match_emit_encodeSnappyBlockAsm10B: -match_nolit_loop_encodeSnappyBlockAsm10B: - MOVL DX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, DX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL DX, DI - LEAQ (BX)(DX*1), R8 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R10, R10 - -matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B: - CMPL DI, $0x10 - JB matchlen_match8_match_nolit_encodeSnappyBlockAsm10B - MOVQ (R8)(R10*1), R9 - MOVQ 8(R8)(R10*1), R11 - XORQ (SI)(R10*1), R9 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B - XORQ 8(SI)(R10*1), R11 - JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B - LEAL -16(DI), DI - LEAL 16(R10), R10 - JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B - -matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL 8(R10)(R11*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm10B - -matchlen_match8_match_nolit_encodeSnappyBlockAsm10B: - CMPL DI, $0x08 - JB matchlen_match4_match_nolit_encodeSnappyBlockAsm10B - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - JNZ 
matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B - LEAL -8(DI), DI - LEAL 8(R10), R10 - JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm10B - -matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B: -#ifdef GOAMD64_v3 - TZCNTQ R9, R9 - -#else - BSFQ R9, R9 - -#endif - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm10B - -matchlen_match4_match_nolit_encodeSnappyBlockAsm10B: - CMPL DI, $0x04 - JB matchlen_match2_match_nolit_encodeSnappyBlockAsm10B - MOVL (R8)(R10*1), R9 - CMPL (SI)(R10*1), R9 - JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm10B - LEAL -4(DI), DI - LEAL 4(R10), R10 - -matchlen_match2_match_nolit_encodeSnappyBlockAsm10B: - CMPL DI, $0x01 - JE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B - JB match_nolit_end_encodeSnappyBlockAsm10B - MOVW (R8)(R10*1), R9 - CMPW (SI)(R10*1), R9 - JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B - LEAL 2(R10), R10 - SUBL $0x02, DI - JZ match_nolit_end_encodeSnappyBlockAsm10B - -matchlen_match1_match_nolit_encodeSnappyBlockAsm10B: - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeSnappyBlockAsm10B - LEAL 1(R10), R10 - -match_nolit_end_encodeSnappyBlockAsm10B: - ADDL R10, DX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL DX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBlockAsm10B: - CMPL R10, $0x40 - JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B - MOVB $0xee, (CX) - MOVW SI, 1(CX) - LEAL -60(R10), R10 - ADDQ $0x03, CX - JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B - -two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B: - MOVL R10, DI - SHLL $0x02, DI - CMPL R10, $0x0c - JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B - CMPL SI, $0x00000800 - JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B - LEAL -15(DI), DI - MOVB SI, 1(CX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B - 
-emit_copy_three_match_nolit_encodeSnappyBlockAsm10B: - LEAL -2(DI), DI - MOVB DI, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - -match_nolit_emitcopy_end_encodeSnappyBlockAsm10B: - CMPL DX, 8(SP) - JAE emit_remainder_encodeSnappyBlockAsm10B - MOVQ -2(BX)(DX*1), DI - CMPQ CX, (SP) - JB match_nolit_dst_ok_encodeSnappyBlockAsm10B - MOVQ $0x00000000, ret+56(FP) - RET - -match_nolit_dst_ok_encodeSnappyBlockAsm10B: - MOVQ $0x9e3779b1, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x20, R8 - IMULQ R9, R8 - SHRQ $0x36, R8 - SHLQ $0x20, SI - IMULQ R9, SI - SHRQ $0x36, SI - LEAL -2(DX), R9 - LEAQ (AX)(SI*4), R10 - MOVL (R10), SI - MOVL R9, (AX)(R8*4) - MOVL DX, (R10) - CMPL (BX)(SI*1), DI - JEQ match_nolit_loop_encodeSnappyBlockAsm10B - INCL DX - JMP search_loop_encodeSnappyBlockAsm10B - -emit_remainder_encodeSnappyBlockAsm10B: - MOVQ src_len+32(FP), AX - SUBL 12(SP), AX - LEAQ 3(CX)(AX*1), AX - CMPQ AX, (SP) - JB emit_remainder_ok_encodeSnappyBlockAsm10B - MOVQ $0x00000000, ret+56(FP) - RET - -emit_remainder_ok_encodeSnappyBlockAsm10B: - MOVQ src_len+32(FP), AX - MOVL 12(SP), DX - CMPL DX, AX - JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B - MOVL AX, SI - MOVL AX, 12(SP) - LEAQ (BX)(DX*1), AX - SUBL DX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeSnappyBlockAsm10B - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeSnappyBlockAsm10B - JB three_bytes_emit_remainder_encodeSnappyBlockAsm10B - -three_bytes_emit_remainder_encodeSnappyBlockAsm10B: - MOVB $0xf4, (CX) - MOVW DX, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B - -two_bytes_emit_remainder_encodeSnappyBlockAsm10B: - MOVB $0xf0, (CX) - MOVB DL, 1(CX) - ADDQ $0x02, CX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeSnappyBlockAsm10B - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B - -one_byte_emit_remainder_encodeSnappyBlockAsm10B: - SHLB $0x02, DL - MOVB DL, (CX) - ADDQ $0x01, CX - 
-memmove_emit_remainder_encodeSnappyBlockAsm10B: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2: - MOVB (AX), SI - MOVB -1(AX)(BX*1), AL - MOVB SI, (CX) - MOVB AL, -1(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3: - MOVW (AX), SI - MOVB 2(AX), AL - MOVW SI, (CX) - MOVB AL, 2(CX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7: - MOVL (AX), SI - MOVL -4(AX)(BX*1), AX - MOVL SI, (CX) - MOVL AX, -4(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16: - MOVQ (AX), SI - MOVQ -8(AX)(BX*1), AX - MOVQ SI, (CX) - MOVQ AX, -8(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32: - MOVOU (AX), X0 - MOVOU -16(AX)(BX*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64: - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU 
X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B: - MOVQ DX, CX - JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B - -memmove_long_emit_remainder_encodeSnappyBlockAsm10B: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(AX)(R8*1), X4 - MOVOU -16(AX)(R8*1), X5 - MOVOA X4, -32(CX)(R8*1) - MOVOA X5, -16(CX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - MOVQ DX, CX - -emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B: - MOVQ dst_base+0(FP), AX - SUBQ AX, CX - MOVQ CX, ret+56(FP) - RET - -// func encodeSnappyBlockAsm8B(dst []byte, src []byte, tmp *[1024]byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBlockAsm8B(SB), $24-64 - MOVQ tmp+48(FP), AX - MOVQ dst_base+0(FP), CX - MOVQ $0x00000008, DX - MOVQ AX, BX - PXOR X0, X0 - -zero_loop_encodeSnappyBlockAsm8B: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeSnappyBlockAsm8B - MOVL 
$0x00000000, 12(SP) - MOVQ src_len+32(FP), DX - LEAQ -9(DX), BX - LEAQ -8(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL DX, 16(SP) - MOVQ src_base+24(FP), BX - -search_loop_encodeSnappyBlockAsm8B: - MOVL DX, SI - SUBL 12(SP), SI - SHRL $0x04, SI - LEAL 4(DX)(SI*1), SI - CMPL SI, 8(SP) - JAE emit_remainder_encodeSnappyBlockAsm8B - MOVQ (BX)(DX*1), DI - MOVL SI, 20(SP) - MOVQ $0x9e3779b1, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x38, R10 - SHLQ $0x20, R11 - IMULQ R9, R11 - SHRQ $0x38, R11 - MOVL (AX)(R10*4), SI - MOVL (AX)(R11*4), R8 - MOVL DX, (AX)(R10*4) - LEAL 1(DX), R10 - MOVL R10, (AX)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x38, R10 - MOVL DX, R9 - SUBL 16(SP), R9 - MOVL 1(BX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_encodeSnappyBlockAsm8B - LEAL 1(DX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_encodeSnappyBlockAsm8B - -repeat_extend_back_loop_encodeSnappyBlockAsm8B: - CMPL DI, SI - JBE repeat_extend_back_end_encodeSnappyBlockAsm8B - MOVB -1(BX)(R8*1), R9 - MOVB -1(BX)(DI*1), R10 - CMPB R9, R10 - JNE repeat_extend_back_end_encodeSnappyBlockAsm8B - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B - -repeat_extend_back_end_encodeSnappyBlockAsm8B: - MOVL DI, SI - SUBL 12(SP), SI - LEAQ 3(CX)(SI*1), SI - CMPQ SI, (SP) - JB repeat_dst_size_check_encodeSnappyBlockAsm8B - MOVQ $0x00000000, ret+56(FP) - RET - -repeat_dst_size_check_encodeSnappyBlockAsm8B: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JB one_byte_repeat_emit_encodeSnappyBlockAsm8B - CMPL SI, $0x00000100 - JB two_bytes_repeat_emit_encodeSnappyBlockAsm8B - JB 
three_bytes_repeat_emit_encodeSnappyBlockAsm8B - -three_bytes_repeat_emit_encodeSnappyBlockAsm8B: - MOVB $0xf4, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B - -two_bytes_repeat_emit_encodeSnappyBlockAsm8B: - MOVB $0xf0, (CX) - MOVB SI, 1(CX) - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_repeat_emit_encodeSnappyBlockAsm8B - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B - -one_byte_repeat_emit_encodeSnappyBlockAsm8B: - SHLB $0x02, SI - MOVB SI, (CX) - ADDQ $0x01, CX - -memmove_repeat_emit_encodeSnappyBlockAsm8B: - LEAQ (CX)(R8*1), SI - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8: - MOVQ (R9), R10 - MOVQ R10, (CX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (CX) - MOVQ R9, -8(CX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R8*1) - MOVOU X3, -16(CX)(R8*1) - -memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B: - MOVQ SI, CX - JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B - 
-memmove_long_repeat_emit_encodeSnappyBlockAsm8B: - LEAQ (CX)(R8*1), SI - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(CX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(CX)(R12*1) - MOVOA X5, -16(CX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R8*1) - MOVOU X3, -16(CX)(R8*1) - MOVQ SI, CX - -emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B: - ADDL $0x05, DX - MOVL DX, SI - SUBL 16(SP), SI - MOVQ src_len+32(FP), R8 - SUBL DX, R8 - LEAQ (BX)(DX*1), R9 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R11, R11 - -matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B: - CMPL R8, $0x10 - JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B - MOVQ (R9)(R11*1), R10 - MOVQ 8(R9)(R11*1), R12 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B - XORQ 8(SI)(R11*1), R12 - JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B - LEAL -16(R8), R8 - LEAL 16(R11), R11 - JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B - -matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B: -#ifdef GOAMD64_v3 - TZCNTQ R12, R12 - -#else - BSFQ R12, R12 - -#endif - SARQ $0x03, R12 - LEAL 
8(R11)(R12*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B - -matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B: - CMPL R8, $0x08 - JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B - LEAL -8(R8), R8 - LEAL 8(R11), R11 - JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B - -matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B - -matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B: - CMPL R8, $0x04 - JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B - MOVL (R9)(R11*1), R10 - CMPL (SI)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B - LEAL -4(R8), R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B: - CMPL R8, $0x01 - JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B - JB repeat_extend_forward_end_encodeSnappyBlockAsm8B - MOVW (R9)(R11*1), R10 - CMPW (SI)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B - LEAL 2(R11), R11 - SUBL $0x02, R8 - JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B - -matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B: - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeSnappyBlockAsm8B: - ADDL R11, DX - MOVL DX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitCopy -two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B: - CMPL SI, $0x40 - JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B - MOVB $0xee, (CX) - MOVW DI, 1(CX) - LEAL -60(SI), SI - ADDQ $0x03, CX - JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B - -two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B: - MOVL SI, R8 - SHLL $0x02, R8 - CMPL SI, $0x0c - JAE 
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B - LEAL -15(R8), R8 - MOVB DI, 1(CX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, R8 - MOVB R8, (CX) - ADDQ $0x02, CX - JMP repeat_end_emit_encodeSnappyBlockAsm8B - -emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B: - LEAL -2(R8), R8 - MOVB R8, (CX) - MOVW DI, 1(CX) - ADDQ $0x03, CX - -repeat_end_emit_encodeSnappyBlockAsm8B: - MOVL DX, 12(SP) - JMP search_loop_encodeSnappyBlockAsm8B - -no_repeat_found_encodeSnappyBlockAsm8B: - CMPL (BX)(SI*1), DI - JEQ candidate_match_encodeSnappyBlockAsm8B - SHRQ $0x08, DI - MOVL (AX)(R10*4), SI - LEAL 2(DX), R9 - CMPL (BX)(R8*1), DI - JEQ candidate2_match_encodeSnappyBlockAsm8B - MOVL R9, (AX)(R10*4) - SHRQ $0x08, DI - CMPL (BX)(SI*1), DI - JEQ candidate3_match_encodeSnappyBlockAsm8B - MOVL 20(SP), DX - JMP search_loop_encodeSnappyBlockAsm8B - -candidate3_match_encodeSnappyBlockAsm8B: - ADDL $0x02, DX - JMP candidate_match_encodeSnappyBlockAsm8B - -candidate2_match_encodeSnappyBlockAsm8B: - MOVL R9, (AX)(R10*4) - INCL DX - MOVL R8, SI - -candidate_match_encodeSnappyBlockAsm8B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBlockAsm8B - -match_extend_back_loop_encodeSnappyBlockAsm8B: - CMPL DX, DI - JBE match_extend_back_end_encodeSnappyBlockAsm8B - MOVB -1(BX)(SI*1), R8 - MOVB -1(BX)(DX*1), R9 - CMPB R8, R9 - JNE match_extend_back_end_encodeSnappyBlockAsm8B - LEAL -1(DX), DX - DECL SI - JZ match_extend_back_end_encodeSnappyBlockAsm8B - JMP match_extend_back_loop_encodeSnappyBlockAsm8B - -match_extend_back_end_encodeSnappyBlockAsm8B: - MOVL DX, DI - SUBL 12(SP), DI - LEAQ 3(CX)(DI*1), DI - CMPQ DI, (SP) - JB match_dst_size_check_encodeSnappyBlockAsm8B - MOVQ $0x00000000, ret+56(FP) - RET - -match_dst_size_check_encodeSnappyBlockAsm8B: - MOVL DX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), R8 - CMPL R8, $0x3c - JB 
one_byte_match_emit_encodeSnappyBlockAsm8B - CMPL R8, $0x00000100 - JB two_bytes_match_emit_encodeSnappyBlockAsm8B - JB three_bytes_match_emit_encodeSnappyBlockAsm8B - -three_bytes_match_emit_encodeSnappyBlockAsm8B: - MOVB $0xf4, (CX) - MOVW R8, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_encodeSnappyBlockAsm8B - -two_bytes_match_emit_encodeSnappyBlockAsm8B: - MOVB $0xf0, (CX) - MOVB R8, 1(CX) - ADDQ $0x02, CX - CMPL R8, $0x40 - JB memmove_match_emit_encodeSnappyBlockAsm8B - JMP memmove_long_match_emit_encodeSnappyBlockAsm8B - -one_byte_match_emit_encodeSnappyBlockAsm8B: - SHLB $0x02, R8 - MOVB R8, (CX) - ADDQ $0x01, CX - -memmove_match_emit_encodeSnappyBlockAsm8B: - LEAQ (CX)(R9*1), R8 - - // genMemMoveShort - CMPQ R9, $0x08 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8: - MOVQ (DI), R10 - MOVQ R10, (CX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: - MOVQ (DI), R10 - MOVQ -8(DI)(R9*1), DI - MOVQ R10, (CX) - MOVQ DI, -8(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: - MOVOU (DI), X0 - MOVOU -16(DI)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - 
-memmove_end_copy_match_emit_encodeSnappyBlockAsm8B: - MOVQ R8, CX - JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B - -memmove_long_match_emit_encodeSnappyBlockAsm8B: - LEAQ (CX)(R9*1), R8 - - // genMemMoveLong - MOVOU (DI), X0 - MOVOU 16(DI), X1 - MOVOU -32(DI)(R9*1), X2 - MOVOU -16(DI)(R9*1), X3 - MOVQ R9, R11 - SHRQ $0x05, R11 - MOVQ CX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(DI)(R12*1), R10 - LEAQ -32(CX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(DI)(R12*1), X4 - MOVOU -16(DI)(R12*1), X5 - MOVOA X4, -32(CX)(R12*1) - MOVOA X5, -16(CX)(R12*1) - ADDQ $0x20, R12 - CMPQ R9, R12 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ R8, CX - -emit_literal_done_match_emit_encodeSnappyBlockAsm8B: -match_nolit_loop_encodeSnappyBlockAsm8B: - MOVL DX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, DX - ADDL $0x04, SI - MOVQ src_len+32(FP), DI - SUBL DX, DI - LEAQ (BX)(DX*1), R8 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R10, R10 - -matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B: - CMPL DI, $0x10 - JB matchlen_match8_match_nolit_encodeSnappyBlockAsm8B - MOVQ (R8)(R10*1), R9 - MOVQ 8(R8)(R10*1), R11 - XORQ (SI)(R10*1), R9 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B - XORQ 8(SI)(R10*1), R11 - JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B - LEAL -16(DI), DI - LEAL 16(R10), R10 - JMP 
matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B - -matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL 8(R10)(R11*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm8B - -matchlen_match8_match_nolit_encodeSnappyBlockAsm8B: - CMPL DI, $0x08 - JB matchlen_match4_match_nolit_encodeSnappyBlockAsm8B - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B - LEAL -8(DI), DI - LEAL 8(R10), R10 - JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm8B - -matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B: -#ifdef GOAMD64_v3 - TZCNTQ R9, R9 - -#else - BSFQ R9, R9 - -#endif - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_encodeSnappyBlockAsm8B - -matchlen_match4_match_nolit_encodeSnappyBlockAsm8B: - CMPL DI, $0x04 - JB matchlen_match2_match_nolit_encodeSnappyBlockAsm8B - MOVL (R8)(R10*1), R9 - CMPL (SI)(R10*1), R9 - JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm8B - LEAL -4(DI), DI - LEAL 4(R10), R10 - -matchlen_match2_match_nolit_encodeSnappyBlockAsm8B: - CMPL DI, $0x01 - JE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B - JB match_nolit_end_encodeSnappyBlockAsm8B - MOVW (R8)(R10*1), R9 - CMPW (SI)(R10*1), R9 - JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B - LEAL 2(R10), R10 - SUBL $0x02, DI - JZ match_nolit_end_encodeSnappyBlockAsm8B - -matchlen_match1_match_nolit_encodeSnappyBlockAsm8B: - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_encodeSnappyBlockAsm8B - LEAL 1(R10), R10 - -match_nolit_end_encodeSnappyBlockAsm8B: - ADDL R10, DX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL DX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBlockAsm8B: - CMPL R10, $0x40 - JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B - MOVB $0xee, (CX) - MOVW SI, 1(CX) - LEAL -60(R10), R10 - ADDQ $0x03, CX - JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B - 
-two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B: - MOVL R10, DI - SHLL $0x02, DI - CMPL R10, $0x0c - JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B - LEAL -15(DI), DI - MOVB SI, 1(CX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B - -emit_copy_three_match_nolit_encodeSnappyBlockAsm8B: - LEAL -2(DI), DI - MOVB DI, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - -match_nolit_emitcopy_end_encodeSnappyBlockAsm8B: - CMPL DX, 8(SP) - JAE emit_remainder_encodeSnappyBlockAsm8B - MOVQ -2(BX)(DX*1), DI - CMPQ CX, (SP) - JB match_nolit_dst_ok_encodeSnappyBlockAsm8B - MOVQ $0x00000000, ret+56(FP) - RET - -match_nolit_dst_ok_encodeSnappyBlockAsm8B: - MOVQ $0x9e3779b1, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x20, R8 - IMULQ R9, R8 - SHRQ $0x38, R8 - SHLQ $0x20, SI - IMULQ R9, SI - SHRQ $0x38, SI - LEAL -2(DX), R9 - LEAQ (AX)(SI*4), R10 - MOVL (R10), SI - MOVL R9, (AX)(R8*4) - MOVL DX, (R10) - CMPL (BX)(SI*1), DI - JEQ match_nolit_loop_encodeSnappyBlockAsm8B - INCL DX - JMP search_loop_encodeSnappyBlockAsm8B - -emit_remainder_encodeSnappyBlockAsm8B: - MOVQ src_len+32(FP), AX - SUBL 12(SP), AX - LEAQ 3(CX)(AX*1), AX - CMPQ AX, (SP) - JB emit_remainder_ok_encodeSnappyBlockAsm8B - MOVQ $0x00000000, ret+56(FP) - RET - -emit_remainder_ok_encodeSnappyBlockAsm8B: - MOVQ src_len+32(FP), AX - MOVL 12(SP), DX - CMPL DX, AX - JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B - MOVL AX, SI - MOVL AX, 12(SP) - LEAQ (BX)(DX*1), AX - SUBL DX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeSnappyBlockAsm8B - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeSnappyBlockAsm8B - JB three_bytes_emit_remainder_encodeSnappyBlockAsm8B - -three_bytes_emit_remainder_encodeSnappyBlockAsm8B: - MOVB $0xf4, (CX) - MOVW DX, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B - -two_bytes_emit_remainder_encodeSnappyBlockAsm8B: 
- MOVB $0xf0, (CX) - MOVB DL, 1(CX) - ADDQ $0x02, CX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeSnappyBlockAsm8B - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B - -one_byte_emit_remainder_encodeSnappyBlockAsm8B: - SHLB $0x02, DL - MOVB DL, (CX) - ADDQ $0x01, CX - -memmove_emit_remainder_encodeSnappyBlockAsm8B: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2: - MOVB (AX), SI - MOVB -1(AX)(BX*1), AL - MOVB SI, (CX) - MOVB AL, -1(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3: - MOVW (AX), SI - MOVB 2(AX), AL - MOVW SI, (CX) - MOVB AL, 2(CX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7: - MOVL (AX), SI - MOVL -4(AX)(BX*1), AX - MOVL SI, (CX) - MOVL AX, -4(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16: - MOVQ (AX), SI - MOVQ -8(AX)(BX*1), AX - MOVQ SI, (CX) - MOVQ AX, -8(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32: - MOVOU (AX), X0 - MOVOU -16(AX)(BX*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(BX*1) - JMP 
memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64: - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B: - MOVQ DX, CX - JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B - -memmove_long_emit_remainder_encodeSnappyBlockAsm8B: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(AX)(R8*1), X4 - MOVOU -16(AX)(R8*1), X5 - MOVOA X4, -32(CX)(R8*1) - MOVOA X5, -16(CX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - MOVQ DX, CX - -emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B: - MOVQ dst_base+0(FP), AX - SUBQ AX, CX - MOVQ CX, ret+56(FP) - RET - -// func encodeSnappyBetterBlockAsm(dst []byte, src []byte, tmp *[589824]byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBetterBlockAsm(SB), $24-64 - MOVQ tmp+48(FP), AX - MOVQ dst_base+0(FP), CX - MOVQ $0x00001200, DX - MOVQ AX, BX - PXOR 
X0, X0 - -zero_loop_encodeSnappyBetterBlockAsm: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeSnappyBetterBlockAsm - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), DX - LEAQ -9(DX), BX - LEAQ -8(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), BX - -search_loop_encodeSnappyBetterBlockAsm: - MOVL DX, SI - SUBL 12(SP), SI - SHRL $0x07, SI - CMPL SI, $0x63 - JBE check_maxskip_ok_encodeSnappyBetterBlockAsm - LEAL 100(DX), SI - JMP check_maxskip_cont_encodeSnappyBetterBlockAsm - -check_maxskip_ok_encodeSnappyBetterBlockAsm: - LEAL 1(DX)(SI*1), SI - -check_maxskip_cont_encodeSnappyBetterBlockAsm: - CMPL SI, 8(SP) - JAE emit_remainder_encodeSnappyBetterBlockAsm - MOVQ (BX)(DX*1), DI - MOVL SI, 20(SP) - MOVQ $0x00cf1bbcdcbfa563, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x2f, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x32, R11 - MOVL (AX)(R10*4), SI - MOVL 524288(AX)(R11*4), R8 - MOVL DX, (AX)(R10*4) - MOVL DX, 524288(AX)(R11*4) - MOVQ (BX)(SI*1), R10 - MOVQ (BX)(R8*1), R11 - CMPQ R10, DI - JEQ candidate_match_encodeSnappyBetterBlockAsm - CMPQ R11, DI - JNE no_short_found_encodeSnappyBetterBlockAsm - MOVL R8, SI - JMP candidate_match_encodeSnappyBetterBlockAsm - -no_short_found_encodeSnappyBetterBlockAsm: - CMPL R10, DI - JEQ candidate_match_encodeSnappyBetterBlockAsm - CMPL R11, DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm - MOVL 20(SP), DX - JMP search_loop_encodeSnappyBetterBlockAsm - -candidateS_match_encodeSnappyBetterBlockAsm: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x2f, R10 - MOVL (AX)(R10*4), SI - INCL DX - MOVL DX, (AX)(R10*4) - CMPL (BX)(SI*1), DI - JEQ 
candidate_match_encodeSnappyBetterBlockAsm - DECL DX - MOVL R8, SI - -candidate_match_encodeSnappyBetterBlockAsm: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm - -match_extend_back_loop_encodeSnappyBetterBlockAsm: - CMPL DX, DI - JBE match_extend_back_end_encodeSnappyBetterBlockAsm - MOVB -1(BX)(SI*1), R8 - MOVB -1(BX)(DX*1), R9 - CMPB R8, R9 - JNE match_extend_back_end_encodeSnappyBetterBlockAsm - LEAL -1(DX), DX - DECL SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm - JMP match_extend_back_loop_encodeSnappyBetterBlockAsm - -match_extend_back_end_encodeSnappyBetterBlockAsm: - MOVL DX, DI - SUBL 12(SP), DI - LEAQ 5(CX)(DI*1), DI - CMPQ DI, (SP) - JB match_dst_size_check_encodeSnappyBetterBlockAsm - MOVQ $0x00000000, ret+56(FP) - RET - -match_dst_size_check_encodeSnappyBetterBlockAsm: - MOVL DX, DI - ADDL $0x04, DX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL DX, R8 - LEAQ (BX)(DX*1), R9 - LEAQ (BX)(SI*1), R10 - - // matchLen - XORL R12, R12 - -matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm: - CMPL R8, $0x10 - JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm - MOVQ (R9)(R12*1), R11 - MOVQ 8(R9)(R12*1), R13 - XORQ (R10)(R12*1), R11 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm - XORQ 8(R10)(R12*1), R13 - JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm - LEAL -16(R8), R8 - LEAL 16(R12), R12 - JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm - -matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm: -#ifdef GOAMD64_v3 - TZCNTQ R13, R13 - -#else - BSFQ R13, R13 - -#endif - SARQ $0x03, R13 - LEAL 8(R12)(R13*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm - -matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm: - CMPL R8, $0x08 - JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm - LEAL -8(R8), R8 - LEAL 8(R12), R12 - JMP 
matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm - -matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm - -matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm: - CMPL R8, $0x04 - JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm - MOVL (R9)(R12*1), R11 - CMPL (R10)(R12*1), R11 - JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm - LEAL -4(R8), R8 - LEAL 4(R12), R12 - -matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm: - CMPL R8, $0x01 - JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm - JB match_nolit_end_encodeSnappyBetterBlockAsm - MOVW (R9)(R12*1), R11 - CMPW (R10)(R12*1), R11 - JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm - LEAL 2(R12), R12 - SUBL $0x02, R8 - JZ match_nolit_end_encodeSnappyBetterBlockAsm - -matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm: - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeSnappyBetterBlockAsm - LEAL 1(R12), R12 - -match_nolit_end_encodeSnappyBetterBlockAsm: - MOVL DX, R8 - SUBL SI, R8 - - // Check if repeat - CMPL R12, $0x01 - JA match_length_ok_encodeSnappyBetterBlockAsm - CMPL R8, $0x0000ffff - JBE match_length_ok_encodeSnappyBetterBlockAsm - MOVL 20(SP), DX - INCL DX - JMP search_loop_encodeSnappyBetterBlockAsm - -match_length_ok_encodeSnappyBetterBlockAsm: - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JB one_byte_match_emit_encodeSnappyBetterBlockAsm - CMPL SI, $0x00000100 - JB two_bytes_match_emit_encodeSnappyBetterBlockAsm - CMPL SI, $0x00010000 - JB three_bytes_match_emit_encodeSnappyBetterBlockAsm - CMPL SI, $0x01000000 - JB four_bytes_match_emit_encodeSnappyBetterBlockAsm - MOVB $0xfc, (CX) - MOVL SI, 
1(CX) - ADDQ $0x05, CX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm - -four_bytes_match_emit_encodeSnappyBetterBlockAsm: - MOVL SI, R11 - SHRL $0x10, R11 - MOVB $0xf8, (CX) - MOVW SI, 1(CX) - MOVB R11, 3(CX) - ADDQ $0x04, CX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm - -three_bytes_match_emit_encodeSnappyBetterBlockAsm: - MOVB $0xf4, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm - -two_bytes_match_emit_encodeSnappyBetterBlockAsm: - MOVB $0xf0, (CX) - MOVB SI, 1(CX) - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_match_emit_encodeSnappyBetterBlockAsm - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm - -one_byte_match_emit_encodeSnappyBetterBlockAsm: - SHLB $0x02, SI - MOVB SI, (CX) - ADDQ $0x01, CX - -memmove_match_emit_encodeSnappyBetterBlockAsm: - LEAQ (CX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (CX) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (CX) - MOVQ R10, -8(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64: - MOVOU (R10), 
X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm: - MOVQ SI, CX - JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm - -memmove_long_match_emit_encodeSnappyBetterBlockAsm: - LEAQ (CX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ CX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(CX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(CX)(R14*1) - MOVOA X5, -16(CX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ SI, CX - -emit_literal_done_match_emit_encodeSnappyBetterBlockAsm: - ADDL R12, DX - ADDL $0x04, R12 - MOVL DX, 12(SP) - - // emitCopy - CMPL R8, $0x00010000 - JB two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm - -four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm: - CMPL R12, $0x40 - JBE four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm - MOVB $0xff, (CX) - MOVL R8, 1(CX) - LEAL -64(R12), R12 - ADDQ $0x05, CX - CMPL R12, $0x04 - JB 
four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm - JMP four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm - -four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm: - TESTL R12, R12 - JZ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm - XORL SI, SI - LEAL -1(SI)(R12*4), R12 - MOVB R12, (CX) - MOVL R8, 1(CX) - ADDQ $0x05, CX - JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm - -two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm: - CMPL R12, $0x40 - JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm - MOVB $0xee, (CX) - MOVW R8, 1(CX) - LEAL -60(R12), R12 - ADDQ $0x03, CX - JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm - -two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm: - MOVL R12, SI - SHLL $0x02, SI - CMPL R12, $0x0c - JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm - CMPL R8, $0x00000800 - JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm - LEAL -15(SI), SI - MOVB R8, 1(CX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm - -emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm: - LEAL -2(SI), SI - MOVB SI, (CX) - MOVW R8, 1(CX) - ADDQ $0x03, CX - -match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm: - CMPL DX, 8(SP) - JAE emit_remainder_encodeSnappyBetterBlockAsm - CMPQ CX, (SP) - JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm - MOVQ $0x00000000, ret+56(FP) - RET - -match_nolit_dst_ok_encodeSnappyBetterBlockAsm: - MOVQ $0x00cf1bbcdcbfa563, SI - MOVQ $0x9e3779b1, R8 - LEAQ 1(DI), DI - LEAQ -2(DX), R9 - MOVQ (BX)(DI*1), R10 - MOVQ 1(BX)(DI*1), R11 - MOVQ (BX)(R9*1), R12 - MOVQ 1(BX)(R9*1), R13 - SHLQ $0x08, R10 - IMULQ SI, R10 - SHRQ $0x2f, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x32, R11 - SHLQ $0x08, R12 - IMULQ SI, R12 - SHRQ $0x2f, R12 - SHLQ $0x20, R13 - IMULQ R8, R13 - SHRQ $0x32, R13 - LEAQ 1(DI), R8 - LEAQ 1(R9), R14 - MOVL DI, (AX)(R10*4) - MOVL R9, (AX)(R12*4) - 
MOVL R8, 524288(AX)(R11*4) - MOVL R14, 524288(AX)(R13*4) - LEAQ 1(R9)(DI*1), R8 - SHRQ $0x01, R8 - ADDQ $0x01, DI - SUBQ $0x01, R9 - -index_loop_encodeSnappyBetterBlockAsm: - CMPQ R8, R9 - JAE search_loop_encodeSnappyBetterBlockAsm - MOVQ (BX)(DI*1), R10 - MOVQ (BX)(R8*1), R11 - SHLQ $0x08, R10 - IMULQ SI, R10 - SHRQ $0x2f, R10 - SHLQ $0x08, R11 - IMULQ SI, R11 - SHRQ $0x2f, R11 - MOVL DI, (AX)(R10*4) - MOVL R8, (AX)(R11*4) - ADDQ $0x02, DI - ADDQ $0x02, R8 - JMP index_loop_encodeSnappyBetterBlockAsm - -emit_remainder_encodeSnappyBetterBlockAsm: - MOVQ src_len+32(FP), AX - SUBL 12(SP), AX - LEAQ 5(CX)(AX*1), AX - CMPQ AX, (SP) - JB emit_remainder_ok_encodeSnappyBetterBlockAsm - MOVQ $0x00000000, ret+56(FP) - RET - -emit_remainder_ok_encodeSnappyBetterBlockAsm: - MOVQ src_len+32(FP), AX - MOVL 12(SP), DX - CMPL DX, AX - JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm - MOVL AX, SI - MOVL AX, 12(SP) - LEAQ (BX)(DX*1), AX - SUBL DX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm - CMPL DX, $0x00010000 - JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm - CMPL DX, $0x01000000 - JB four_bytes_emit_remainder_encodeSnappyBetterBlockAsm - MOVB $0xfc, (CX) - MOVL DX, 1(CX) - ADDQ $0x05, CX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm - -four_bytes_emit_remainder_encodeSnappyBetterBlockAsm: - MOVL DX, BX - SHRL $0x10, BX - MOVB $0xf8, (CX) - MOVW DX, 1(CX) - MOVB BL, 3(CX) - ADDQ $0x04, CX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm - -three_bytes_emit_remainder_encodeSnappyBetterBlockAsm: - MOVB $0xf4, (CX) - MOVW DX, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm - -two_bytes_emit_remainder_encodeSnappyBetterBlockAsm: - MOVB $0xf0, (CX) - MOVB DL, 1(CX) - ADDQ $0x02, CX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeSnappyBetterBlockAsm - JMP 
memmove_long_emit_remainder_encodeSnappyBetterBlockAsm - -one_byte_emit_remainder_encodeSnappyBetterBlockAsm: - SHLB $0x02, DL - MOVB DL, (CX) - ADDQ $0x01, CX - -memmove_emit_remainder_encodeSnappyBetterBlockAsm: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2: - MOVB (AX), SI - MOVB -1(AX)(BX*1), AL - MOVB SI, (CX) - MOVB AL, -1(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3: - MOVW (AX), SI - MOVB 2(AX), AL - MOVW SI, (CX) - MOVB AL, 2(CX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7: - MOVL (AX), SI - MOVL -4(AX)(BX*1), AX - MOVL SI, (CX) - MOVL AX, -4(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16: - MOVQ (AX), SI - MOVQ -8(AX)(BX*1), AX - MOVQ SI, (CX) - MOVQ AX, -8(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32: - MOVOU (AX), X0 - MOVOU -16(AX)(BX*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm - 
-emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64: - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm: - MOVQ DX, CX - JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm - -memmove_long_emit_remainder_encodeSnappyBetterBlockAsm: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(AX)(R8*1), X4 - MOVOU -16(AX)(R8*1), X5 - MOVOA X4, -32(CX)(R8*1) - MOVOA X5, -16(CX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - MOVQ DX, CX - -emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm: - MOVQ dst_base+0(FP), AX - SUBQ AX, CX - MOVQ CX, ret+56(FP) - RET - -// func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte, tmp *[294912]byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBetterBlockAsm64K(SB), $24-64 - MOVQ tmp+48(FP), AX - MOVQ dst_base+0(FP), CX - MOVQ $0x00000900, DX - MOVQ AX, BX - PXOR X0, X0 - 
-zero_loop_encodeSnappyBetterBlockAsm64K: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeSnappyBetterBlockAsm64K - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), DX - LEAQ -9(DX), BX - LEAQ -8(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), BX - -search_loop_encodeSnappyBetterBlockAsm64K: - MOVL DX, SI - SUBL 12(SP), SI - SHRL $0x07, SI - LEAL 1(DX)(SI*1), SI - CMPL SI, 8(SP) - JAE emit_remainder_encodeSnappyBetterBlockAsm64K - MOVQ (BX)(DX*1), DI - MOVL SI, 20(SP) - MOVQ $0x00cf1bbcdcbfa563, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x33, R11 - MOVL (AX)(R10*4), SI - MOVL 262144(AX)(R11*4), R8 - MOVL DX, (AX)(R10*4) - MOVL DX, 262144(AX)(R11*4) - MOVQ (BX)(SI*1), R10 - MOVQ (BX)(R8*1), R11 - CMPQ R10, DI - JEQ candidate_match_encodeSnappyBetterBlockAsm64K - CMPQ R11, DI - JNE no_short_found_encodeSnappyBetterBlockAsm64K - MOVL R8, SI - JMP candidate_match_encodeSnappyBetterBlockAsm64K - -no_short_found_encodeSnappyBetterBlockAsm64K: - CMPL R10, DI - JEQ candidate_match_encodeSnappyBetterBlockAsm64K - CMPL R11, DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm64K - MOVL 20(SP), DX - JMP search_loop_encodeSnappyBetterBlockAsm64K - -candidateS_match_encodeSnappyBetterBlockAsm64K: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x08, R10 - IMULQ R9, R10 - SHRQ $0x30, R10 - MOVL (AX)(R10*4), SI - INCL DX - MOVL DX, (AX)(R10*4) - CMPL (BX)(SI*1), DI - JEQ candidate_match_encodeSnappyBetterBlockAsm64K - DECL DX - MOVL R8, SI - -candidate_match_encodeSnappyBetterBlockAsm64K: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K - 
-match_extend_back_loop_encodeSnappyBetterBlockAsm64K: - CMPL DX, DI - JBE match_extend_back_end_encodeSnappyBetterBlockAsm64K - MOVB -1(BX)(SI*1), R8 - MOVB -1(BX)(DX*1), R9 - CMPB R8, R9 - JNE match_extend_back_end_encodeSnappyBetterBlockAsm64K - LEAL -1(DX), DX - DECL SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K - JMP match_extend_back_loop_encodeSnappyBetterBlockAsm64K - -match_extend_back_end_encodeSnappyBetterBlockAsm64K: - MOVL DX, DI - SUBL 12(SP), DI - LEAQ 3(CX)(DI*1), DI - CMPQ DI, (SP) - JB match_dst_size_check_encodeSnappyBetterBlockAsm64K - MOVQ $0x00000000, ret+56(FP) - RET - -match_dst_size_check_encodeSnappyBetterBlockAsm64K: - MOVL DX, DI - ADDL $0x04, DX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL DX, R8 - LEAQ (BX)(DX*1), R9 - LEAQ (BX)(SI*1), R10 - - // matchLen - XORL R12, R12 - -matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K: - CMPL R8, $0x10 - JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K - MOVQ (R9)(R12*1), R11 - MOVQ 8(R9)(R12*1), R13 - XORQ (R10)(R12*1), R11 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K - XORQ 8(R10)(R12*1), R13 - JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K - LEAL -16(R8), R8 - LEAL 16(R12), R12 - JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K - -matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K: -#ifdef GOAMD64_v3 - TZCNTQ R13, R13 - -#else - BSFQ R13, R13 - -#endif - SARQ $0x03, R13 - LEAL 8(R12)(R13*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm64K - -matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K: - CMPL R8, $0x08 - JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K - LEAL -8(R8), R8 - LEAL 8(R12), R12 - JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K - -matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - 
-#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm64K - -matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K: - CMPL R8, $0x04 - JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K - MOVL (R9)(R12*1), R11 - CMPL (R10)(R12*1), R11 - JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K - LEAL -4(R8), R8 - LEAL 4(R12), R12 - -matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K: - CMPL R8, $0x01 - JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K - JB match_nolit_end_encodeSnappyBetterBlockAsm64K - MOVW (R9)(R12*1), R11 - CMPW (R10)(R12*1), R11 - JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K - LEAL 2(R12), R12 - SUBL $0x02, R8 - JZ match_nolit_end_encodeSnappyBetterBlockAsm64K - -matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K: - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeSnappyBetterBlockAsm64K - LEAL 1(R12), R12 - -match_nolit_end_encodeSnappyBetterBlockAsm64K: - MOVL DX, R8 - SUBL SI, R8 - - // Check if repeat - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JB one_byte_match_emit_encodeSnappyBetterBlockAsm64K - CMPL SI, $0x00000100 - JB two_bytes_match_emit_encodeSnappyBetterBlockAsm64K - JB three_bytes_match_emit_encodeSnappyBetterBlockAsm64K - -three_bytes_match_emit_encodeSnappyBetterBlockAsm64K: - MOVB $0xf4, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K - -two_bytes_match_emit_encodeSnappyBetterBlockAsm64K: - MOVB $0xf0, (CX) - MOVB SI, 1(CX) - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_match_emit_encodeSnappyBetterBlockAsm64K - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K - -one_byte_match_emit_encodeSnappyBetterBlockAsm64K: - SHLB $0x02, SI - MOVB SI, 
(CX) - ADDQ $0x01, CX - -memmove_match_emit_encodeSnappyBetterBlockAsm64K: - LEAQ (CX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (CX) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (CX) - MOVQ R10, -8(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K: - MOVQ SI, CX - JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K - -memmove_long_match_emit_encodeSnappyBetterBlockAsm64K: - LEAQ (CX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ CX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 - LEAQ 
-32(R10)(R14*1), R11 - LEAQ -32(CX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(CX)(R14*1) - MOVOA X5, -16(CX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ SI, CX - -emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K: - ADDL R12, DX - ADDL $0x04, R12 - MOVL DX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K: - CMPL R12, $0x40 - JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K - MOVB $0xee, (CX) - MOVW R8, 1(CX) - LEAL -60(R12), R12 - ADDQ $0x03, CX - JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K - -two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K: - MOVL R12, SI - SHLL $0x02, SI - CMPL R12, $0x0c - JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K - CMPL R8, $0x00000800 - JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K - LEAL -15(SI), SI - MOVB R8, 1(CX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K - -emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K: - LEAL -2(SI), SI - MOVB SI, (CX) - MOVW R8, 1(CX) - ADDQ $0x03, CX - -match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K: - CMPL DX, 8(SP) - JAE emit_remainder_encodeSnappyBetterBlockAsm64K - CMPQ CX, (SP) - JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K - MOVQ 
$0x00000000, ret+56(FP) - RET - -match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K: - MOVQ $0x00cf1bbcdcbfa563, SI - MOVQ $0x9e3779b1, R8 - LEAQ 1(DI), DI - LEAQ -2(DX), R9 - MOVQ (BX)(DI*1), R10 - MOVQ 1(BX)(DI*1), R11 - MOVQ (BX)(R9*1), R12 - MOVQ 1(BX)(R9*1), R13 - SHLQ $0x08, R10 - IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x33, R11 - SHLQ $0x08, R12 - IMULQ SI, R12 - SHRQ $0x30, R12 - SHLQ $0x20, R13 - IMULQ R8, R13 - SHRQ $0x33, R13 - LEAQ 1(DI), R8 - LEAQ 1(R9), R14 - MOVL DI, (AX)(R10*4) - MOVL R9, (AX)(R12*4) - MOVL R8, 262144(AX)(R11*4) - MOVL R14, 262144(AX)(R13*4) - LEAQ 1(R9)(DI*1), R8 - SHRQ $0x01, R8 - ADDQ $0x01, DI - SUBQ $0x01, R9 - -index_loop_encodeSnappyBetterBlockAsm64K: - CMPQ R8, R9 - JAE search_loop_encodeSnappyBetterBlockAsm64K - MOVQ (BX)(DI*1), R10 - MOVQ (BX)(R8*1), R11 - SHLQ $0x08, R10 - IMULQ SI, R10 - SHRQ $0x30, R10 - SHLQ $0x08, R11 - IMULQ SI, R11 - SHRQ $0x30, R11 - MOVL DI, (AX)(R10*4) - MOVL R8, (AX)(R11*4) - ADDQ $0x02, DI - ADDQ $0x02, R8 - JMP index_loop_encodeSnappyBetterBlockAsm64K - -emit_remainder_encodeSnappyBetterBlockAsm64K: - MOVQ src_len+32(FP), AX - SUBL 12(SP), AX - LEAQ 3(CX)(AX*1), AX - CMPQ AX, (SP) - JB emit_remainder_ok_encodeSnappyBetterBlockAsm64K - MOVQ $0x00000000, ret+56(FP) - RET - -emit_remainder_ok_encodeSnappyBetterBlockAsm64K: - MOVQ src_len+32(FP), AX - MOVL 12(SP), DX - CMPL DX, AX - JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K - MOVL AX, SI - MOVL AX, 12(SP) - LEAQ (BX)(DX*1), AX - SUBL DX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K - JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K - -three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K: - MOVB $0xf4, (CX) - MOVW DX, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K - 
-two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K: - MOVB $0xf0, (CX) - MOVB DL, 1(CX) - ADDQ $0x02, CX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeSnappyBetterBlockAsm64K - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K - -one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K: - SHLB $0x02, DL - MOVB DL, (CX) - ADDQ $0x01, CX - -memmove_emit_remainder_encodeSnappyBetterBlockAsm64K: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2: - MOVB (AX), SI - MOVB -1(AX)(BX*1), AL - MOVB SI, (CX) - MOVB AL, -1(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3: - MOVW (AX), SI - MOVB 2(AX), AL - MOVW SI, (CX) - MOVB AL, 2(CX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7: - MOVL (AX), SI - MOVL -4(AX)(BX*1), AX - MOVL SI, (CX) - MOVL AX, -4(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: - MOVQ (AX), SI - MOVQ -8(AX)(BX*1), AX - MOVQ SI, (CX) - MOVQ AX, -8(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K - 
-emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32: - MOVOU (AX), X0 - MOVOU -16(AX)(BX*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64: - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K: - MOVQ DX, CX - JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K - -memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32: - MOVOU -32(AX)(R8*1), X4 - MOVOU -16(AX)(R8*1), X5 - MOVOA X4, -32(CX)(R8*1) - MOVOA X5, -16(CX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - MOVQ DX, CX - -emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K: - MOVQ dst_base+0(FP), AX - SUBQ AX, CX - MOVQ CX, 
ret+56(FP) - RET - -// func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte, tmp *[81920]byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBetterBlockAsm12B(SB), $24-64 - MOVQ tmp+48(FP), AX - MOVQ dst_base+0(FP), CX - MOVQ $0x00000280, DX - MOVQ AX, BX - PXOR X0, X0 - -zero_loop_encodeSnappyBetterBlockAsm12B: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeSnappyBetterBlockAsm12B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), DX - LEAQ -9(DX), BX - LEAQ -8(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), BX - -search_loop_encodeSnappyBetterBlockAsm12B: - MOVL DX, SI - SUBL 12(SP), SI - SHRL $0x06, SI - LEAL 1(DX)(SI*1), SI - CMPL SI, 8(SP) - JAE emit_remainder_encodeSnappyBetterBlockAsm12B - MOVQ (BX)(DX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x34, R11 - MOVL (AX)(R10*4), SI - MOVL 65536(AX)(R11*4), R8 - MOVL DX, (AX)(R10*4) - MOVL DX, 65536(AX)(R11*4) - MOVQ (BX)(SI*1), R10 - MOVQ (BX)(R8*1), R11 - CMPQ R10, DI - JEQ candidate_match_encodeSnappyBetterBlockAsm12B - CMPQ R11, DI - JNE no_short_found_encodeSnappyBetterBlockAsm12B - MOVL R8, SI - JMP candidate_match_encodeSnappyBetterBlockAsm12B - -no_short_found_encodeSnappyBetterBlockAsm12B: - CMPL R10, DI - JEQ candidate_match_encodeSnappyBetterBlockAsm12B - CMPL R11, DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm12B - MOVL 20(SP), DX - JMP search_loop_encodeSnappyBetterBlockAsm12B - -candidateS_match_encodeSnappyBetterBlockAsm12B: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x32, R10 - MOVL (AX)(R10*4), SI - INCL DX - MOVL DX, (AX)(R10*4) - 
CMPL (BX)(SI*1), DI - JEQ candidate_match_encodeSnappyBetterBlockAsm12B - DECL DX - MOVL R8, SI - -candidate_match_encodeSnappyBetterBlockAsm12B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B - -match_extend_back_loop_encodeSnappyBetterBlockAsm12B: - CMPL DX, DI - JBE match_extend_back_end_encodeSnappyBetterBlockAsm12B - MOVB -1(BX)(SI*1), R8 - MOVB -1(BX)(DX*1), R9 - CMPB R8, R9 - JNE match_extend_back_end_encodeSnappyBetterBlockAsm12B - LEAL -1(DX), DX - DECL SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B - JMP match_extend_back_loop_encodeSnappyBetterBlockAsm12B - -match_extend_back_end_encodeSnappyBetterBlockAsm12B: - MOVL DX, DI - SUBL 12(SP), DI - LEAQ 3(CX)(DI*1), DI - CMPQ DI, (SP) - JB match_dst_size_check_encodeSnappyBetterBlockAsm12B - MOVQ $0x00000000, ret+56(FP) - RET - -match_dst_size_check_encodeSnappyBetterBlockAsm12B: - MOVL DX, DI - ADDL $0x04, DX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL DX, R8 - LEAQ (BX)(DX*1), R9 - LEAQ (BX)(SI*1), R10 - - // matchLen - XORL R12, R12 - -matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B: - CMPL R8, $0x10 - JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B - MOVQ (R9)(R12*1), R11 - MOVQ 8(R9)(R12*1), R13 - XORQ (R10)(R12*1), R11 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B - XORQ 8(R10)(R12*1), R13 - JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B - LEAL -16(R8), R8 - LEAL 16(R12), R12 - JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B - -matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B: -#ifdef GOAMD64_v3 - TZCNTQ R13, R13 - -#else - BSFQ R13, R13 - -#endif - SARQ $0x03, R13 - LEAL 8(R12)(R13*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm12B - -matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B: - CMPL R8, $0x08 - JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - JNZ 
matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B - LEAL -8(R8), R8 - LEAL 8(R12), R12 - JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B - -matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm12B - -matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B: - CMPL R8, $0x04 - JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B - MOVL (R9)(R12*1), R11 - CMPL (R10)(R12*1), R11 - JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B - LEAL -4(R8), R8 - LEAL 4(R12), R12 - -matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B: - CMPL R8, $0x01 - JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B - JB match_nolit_end_encodeSnappyBetterBlockAsm12B - MOVW (R9)(R12*1), R11 - CMPW (R10)(R12*1), R11 - JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B - LEAL 2(R12), R12 - SUBL $0x02, R8 - JZ match_nolit_end_encodeSnappyBetterBlockAsm12B - -matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B: - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeSnappyBetterBlockAsm12B - LEAL 1(R12), R12 - -match_nolit_end_encodeSnappyBetterBlockAsm12B: - MOVL DX, R8 - SUBL SI, R8 - - // Check if repeat - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JB one_byte_match_emit_encodeSnappyBetterBlockAsm12B - CMPL SI, $0x00000100 - JB two_bytes_match_emit_encodeSnappyBetterBlockAsm12B - JB three_bytes_match_emit_encodeSnappyBetterBlockAsm12B - -three_bytes_match_emit_encodeSnappyBetterBlockAsm12B: - MOVB $0xf4, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B - -two_bytes_match_emit_encodeSnappyBetterBlockAsm12B: - MOVB 
$0xf0, (CX) - MOVB SI, 1(CX) - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_match_emit_encodeSnappyBetterBlockAsm12B - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B - -one_byte_match_emit_encodeSnappyBetterBlockAsm12B: - SHLB $0x02, SI - MOVB SI, (CX) - ADDQ $0x01, CX - -memmove_match_emit_encodeSnappyBetterBlockAsm12B: - LEAQ (CX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (CX) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (CX) - MOVQ R10, -8(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B: - MOVQ SI, CX - JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B - -memmove_long_match_emit_encodeSnappyBetterBlockAsm12B: - LEAQ (CX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), 
X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ CX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(CX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(CX)(R14*1) - MOVOA X5, -16(CX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ SI, CX - -emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B: - ADDL R12, DX - ADDL $0x04, R12 - MOVL DX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B: - CMPL R12, $0x40 - JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B - MOVB $0xee, (CX) - MOVW R8, 1(CX) - LEAL -60(R12), R12 - ADDQ $0x03, CX - JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B - -two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B: - MOVL R12, SI - SHLL $0x02, SI - CMPL R12, $0x0c - JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B - CMPL R8, $0x00000800 - JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B - LEAL -15(SI), SI - MOVB R8, 1(CX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B - -emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B: - LEAL -2(SI), SI - MOVB SI, 
(CX) - MOVW R8, 1(CX) - ADDQ $0x03, CX - -match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B: - CMPL DX, 8(SP) - JAE emit_remainder_encodeSnappyBetterBlockAsm12B - CMPQ CX, (SP) - JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B - MOVQ $0x00000000, ret+56(FP) - RET - -match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B: - MOVQ $0x0000cf1bbcdcbf9b, SI - MOVQ $0x9e3779b1, R8 - LEAQ 1(DI), DI - LEAQ -2(DX), R9 - MOVQ (BX)(DI*1), R10 - MOVQ 1(BX)(DI*1), R11 - MOVQ (BX)(R9*1), R12 - MOVQ 1(BX)(R9*1), R13 - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x32, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x34, R11 - SHLQ $0x10, R12 - IMULQ SI, R12 - SHRQ $0x32, R12 - SHLQ $0x20, R13 - IMULQ R8, R13 - SHRQ $0x34, R13 - LEAQ 1(DI), R8 - LEAQ 1(R9), R14 - MOVL DI, (AX)(R10*4) - MOVL R9, (AX)(R12*4) - MOVL R8, 65536(AX)(R11*4) - MOVL R14, 65536(AX)(R13*4) - LEAQ 1(R9)(DI*1), R8 - SHRQ $0x01, R8 - ADDQ $0x01, DI - SUBQ $0x01, R9 - -index_loop_encodeSnappyBetterBlockAsm12B: - CMPQ R8, R9 - JAE search_loop_encodeSnappyBetterBlockAsm12B - MOVQ (BX)(DI*1), R10 - MOVQ (BX)(R8*1), R11 - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x32, R10 - SHLQ $0x10, R11 - IMULQ SI, R11 - SHRQ $0x32, R11 - MOVL DI, (AX)(R10*4) - MOVL R8, (AX)(R11*4) - ADDQ $0x02, DI - ADDQ $0x02, R8 - JMP index_loop_encodeSnappyBetterBlockAsm12B - -emit_remainder_encodeSnappyBetterBlockAsm12B: - MOVQ src_len+32(FP), AX - SUBL 12(SP), AX - LEAQ 3(CX)(AX*1), AX - CMPQ AX, (SP) - JB emit_remainder_ok_encodeSnappyBetterBlockAsm12B - MOVQ $0x00000000, ret+56(FP) - RET - -emit_remainder_ok_encodeSnappyBetterBlockAsm12B: - MOVQ src_len+32(FP), AX - MOVL 12(SP), DX - CMPL DX, AX - JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B - MOVL AX, SI - MOVL AX, 12(SP) - LEAQ (BX)(DX*1), AX - SUBL DX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B - JB 
three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B - -three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B: - MOVB $0xf4, (CX) - MOVW DX, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B - -two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B: - MOVB $0xf0, (CX) - MOVB DL, 1(CX) - ADDQ $0x02, CX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeSnappyBetterBlockAsm12B - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B - -one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B: - SHLB $0x02, DL - MOVB DL, (CX) - ADDQ $0x01, CX - -memmove_emit_remainder_encodeSnappyBetterBlockAsm12B: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2: - MOVB (AX), SI - MOVB -1(AX)(BX*1), AL - MOVB SI, (CX) - MOVB AL, -1(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3: - MOVW (AX), SI - MOVB 2(AX), AL - MOVW SI, (CX) - MOVB AL, 2(CX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7: - MOVL (AX), SI - MOVL -4(AX)(BX*1), AX - MOVL SI, (CX) - MOVL AX, -4(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B - 
-emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: - MOVQ (AX), SI - MOVQ -8(AX)(BX*1), AX - MOVQ SI, (CX) - MOVQ AX, -8(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: - MOVOU (AX), X0 - MOVOU -16(AX)(BX*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B: - MOVQ DX, CX - JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B - -memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(AX)(R8*1), X4 - MOVOU -16(AX)(R8*1), X5 - MOVOA X4, -32(CX)(R8*1) - MOVOA X5, -16(CX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE 
emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - MOVQ DX, CX - -emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B: - MOVQ dst_base+0(FP), AX - SUBQ AX, CX - MOVQ CX, ret+56(FP) - RET - -// func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte, tmp *[20480]byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBetterBlockAsm10B(SB), $24-64 - MOVQ tmp+48(FP), AX - MOVQ dst_base+0(FP), CX - MOVQ $0x000000a0, DX - MOVQ AX, BX - PXOR X0, X0 - -zero_loop_encodeSnappyBetterBlockAsm10B: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeSnappyBetterBlockAsm10B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), DX - LEAQ -9(DX), BX - LEAQ -8(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), BX - -search_loop_encodeSnappyBetterBlockAsm10B: - MOVL DX, SI - SUBL 12(SP), SI - SHRL $0x05, SI - LEAL 1(DX)(SI*1), SI - CMPL SI, 8(SP) - JAE emit_remainder_encodeSnappyBetterBlockAsm10B - MOVQ (BX)(DX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x36, R11 - MOVL (AX)(R10*4), SI - MOVL 16384(AX)(R11*4), R8 - MOVL DX, (AX)(R10*4) - MOVL DX, 16384(AX)(R11*4) - MOVQ (BX)(SI*1), R10 - MOVQ (BX)(R8*1), R11 - CMPQ R10, DI - JEQ candidate_match_encodeSnappyBetterBlockAsm10B - CMPQ R11, DI - JNE no_short_found_encodeSnappyBetterBlockAsm10B - MOVL R8, SI - JMP candidate_match_encodeSnappyBetterBlockAsm10B - -no_short_found_encodeSnappyBetterBlockAsm10B: - CMPL R10, DI - JEQ candidate_match_encodeSnappyBetterBlockAsm10B - CMPL R11, DI 
- JEQ candidateS_match_encodeSnappyBetterBlockAsm10B - MOVL 20(SP), DX - JMP search_loop_encodeSnappyBetterBlockAsm10B - -candidateS_match_encodeSnappyBetterBlockAsm10B: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x34, R10 - MOVL (AX)(R10*4), SI - INCL DX - MOVL DX, (AX)(R10*4) - CMPL (BX)(SI*1), DI - JEQ candidate_match_encodeSnappyBetterBlockAsm10B - DECL DX - MOVL R8, SI - -candidate_match_encodeSnappyBetterBlockAsm10B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B - -match_extend_back_loop_encodeSnappyBetterBlockAsm10B: - CMPL DX, DI - JBE match_extend_back_end_encodeSnappyBetterBlockAsm10B - MOVB -1(BX)(SI*1), R8 - MOVB -1(BX)(DX*1), R9 - CMPB R8, R9 - JNE match_extend_back_end_encodeSnappyBetterBlockAsm10B - LEAL -1(DX), DX - DECL SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B - JMP match_extend_back_loop_encodeSnappyBetterBlockAsm10B - -match_extend_back_end_encodeSnappyBetterBlockAsm10B: - MOVL DX, DI - SUBL 12(SP), DI - LEAQ 3(CX)(DI*1), DI - CMPQ DI, (SP) - JB match_dst_size_check_encodeSnappyBetterBlockAsm10B - MOVQ $0x00000000, ret+56(FP) - RET - -match_dst_size_check_encodeSnappyBetterBlockAsm10B: - MOVL DX, DI - ADDL $0x04, DX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL DX, R8 - LEAQ (BX)(DX*1), R9 - LEAQ (BX)(SI*1), R10 - - // matchLen - XORL R12, R12 - -matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B: - CMPL R8, $0x10 - JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B - MOVQ (R9)(R12*1), R11 - MOVQ 8(R9)(R12*1), R13 - XORQ (R10)(R12*1), R11 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B - XORQ 8(R10)(R12*1), R13 - JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B - LEAL -16(R8), R8 - LEAL 16(R12), R12 - JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B - -matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B: -#ifdef GOAMD64_v3 - TZCNTQ R13, R13 - -#else - BSFQ R13, R13 - -#endif - 
SARQ $0x03, R13 - LEAL 8(R12)(R13*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm10B - -matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B: - CMPL R8, $0x08 - JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B - LEAL -8(R8), R8 - LEAL 8(R12), R12 - JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B - -matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm10B - -matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B: - CMPL R8, $0x04 - JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B - MOVL (R9)(R12*1), R11 - CMPL (R10)(R12*1), R11 - JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B - LEAL -4(R8), R8 - LEAL 4(R12), R12 - -matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B: - CMPL R8, $0x01 - JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B - JB match_nolit_end_encodeSnappyBetterBlockAsm10B - MOVW (R9)(R12*1), R11 - CMPW (R10)(R12*1), R11 - JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B - LEAL 2(R12), R12 - SUBL $0x02, R8 - JZ match_nolit_end_encodeSnappyBetterBlockAsm10B - -matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B: - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeSnappyBetterBlockAsm10B - LEAL 1(R12), R12 - -match_nolit_end_encodeSnappyBetterBlockAsm10B: - MOVL DX, R8 - SUBL SI, R8 - - // Check if repeat - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JB one_byte_match_emit_encodeSnappyBetterBlockAsm10B - CMPL SI, $0x00000100 - JB two_bytes_match_emit_encodeSnappyBetterBlockAsm10B - 
JB three_bytes_match_emit_encodeSnappyBetterBlockAsm10B - -three_bytes_match_emit_encodeSnappyBetterBlockAsm10B: - MOVB $0xf4, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B - -two_bytes_match_emit_encodeSnappyBetterBlockAsm10B: - MOVB $0xf0, (CX) - MOVB SI, 1(CX) - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_match_emit_encodeSnappyBetterBlockAsm10B - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B - -one_byte_match_emit_encodeSnappyBetterBlockAsm10B: - SHLB $0x02, SI - MOVB SI, (CX) - ADDQ $0x01, CX - -memmove_match_emit_encodeSnappyBetterBlockAsm10B: - LEAQ (CX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (CX) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (CX) - MOVQ R10, -8(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64: - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - 
-memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B: - MOVQ SI, CX - JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B - -memmove_long_match_emit_encodeSnappyBetterBlockAsm10B: - LEAQ (CX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ CX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(CX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(CX)(R14*1) - MOVOA X5, -16(CX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ SI, CX - -emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B: - ADDL R12, DX - ADDL $0x04, R12 - MOVL DX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B: - CMPL R12, $0x40 - JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B - MOVB $0xee, (CX) - MOVW R8, 1(CX) - LEAL -60(R12), R12 - ADDQ $0x03, CX - JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B - -two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B: - MOVL R12, SI - SHLL $0x02, SI - CMPL R12, $0x0c - JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B - CMPL R8, $0x00000800 - JAE 
emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B - LEAL -15(SI), SI - MOVB R8, 1(CX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B - -emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B: - LEAL -2(SI), SI - MOVB SI, (CX) - MOVW R8, 1(CX) - ADDQ $0x03, CX - -match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B: - CMPL DX, 8(SP) - JAE emit_remainder_encodeSnappyBetterBlockAsm10B - CMPQ CX, (SP) - JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B - MOVQ $0x00000000, ret+56(FP) - RET - -match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B: - MOVQ $0x0000cf1bbcdcbf9b, SI - MOVQ $0x9e3779b1, R8 - LEAQ 1(DI), DI - LEAQ -2(DX), R9 - MOVQ (BX)(DI*1), R10 - MOVQ 1(BX)(DI*1), R11 - MOVQ (BX)(R9*1), R12 - MOVQ 1(BX)(R9*1), R13 - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x34, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x36, R11 - SHLQ $0x10, R12 - IMULQ SI, R12 - SHRQ $0x34, R12 - SHLQ $0x20, R13 - IMULQ R8, R13 - SHRQ $0x36, R13 - LEAQ 1(DI), R8 - LEAQ 1(R9), R14 - MOVL DI, (AX)(R10*4) - MOVL R9, (AX)(R12*4) - MOVL R8, 16384(AX)(R11*4) - MOVL R14, 16384(AX)(R13*4) - LEAQ 1(R9)(DI*1), R8 - SHRQ $0x01, R8 - ADDQ $0x01, DI - SUBQ $0x01, R9 - -index_loop_encodeSnappyBetterBlockAsm10B: - CMPQ R8, R9 - JAE search_loop_encodeSnappyBetterBlockAsm10B - MOVQ (BX)(DI*1), R10 - MOVQ (BX)(R8*1), R11 - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x34, R10 - SHLQ $0x10, R11 - IMULQ SI, R11 - SHRQ $0x34, R11 - MOVL DI, (AX)(R10*4) - MOVL R8, (AX)(R11*4) - ADDQ $0x02, DI - ADDQ $0x02, R8 - JMP index_loop_encodeSnappyBetterBlockAsm10B - -emit_remainder_encodeSnappyBetterBlockAsm10B: - MOVQ src_len+32(FP), AX - SUBL 12(SP), AX - LEAQ 3(CX)(AX*1), AX - CMPQ AX, (SP) - JB emit_remainder_ok_encodeSnappyBetterBlockAsm10B - MOVQ $0x00000000, ret+56(FP) - RET - -emit_remainder_ok_encodeSnappyBetterBlockAsm10B: - MOVQ src_len+32(FP), AX - MOVL 12(SP), DX - CMPL DX, AX - JEQ 
emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B - MOVL AX, SI - MOVL AX, 12(SP) - LEAQ (BX)(DX*1), AX - SUBL DX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B - JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B - -three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B: - MOVB $0xf4, (CX) - MOVW DX, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B - -two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B: - MOVB $0xf0, (CX) - MOVB DL, 1(CX) - ADDQ $0x02, CX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeSnappyBetterBlockAsm10B - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B - -one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B: - SHLB $0x02, DL - MOVB DL, (CX) - ADDQ $0x01, CX - -memmove_emit_remainder_encodeSnappyBetterBlockAsm10B: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2: - MOVB (AX), SI - MOVB -1(AX)(BX*1), AL - MOVB SI, (CX) - MOVB AL, -1(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3: - MOVW (AX), SI - MOVB 2(AX), AL - MOVW SI, (CX) - MOVB AL, 
2(CX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7: - MOVL (AX), SI - MOVL -4(AX)(BX*1), AX - MOVL SI, (CX) - MOVL AX, -4(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: - MOVQ (AX), SI - MOVQ -8(AX)(BX*1), AX - MOVQ SI, (CX) - MOVQ AX, -8(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32: - MOVOU (AX), X0 - MOVOU -16(AX)(BX*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64: - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B: - MOVQ DX, CX - JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B - -memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back - 
-emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(AX)(R8*1), X4 - MOVOU -16(AX)(R8*1), X5 - MOVOA X4, -32(CX)(R8*1) - MOVOA X5, -16(CX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - MOVQ DX, CX - -emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B: - MOVQ dst_base+0(FP), AX - SUBQ AX, CX - MOVQ CX, ret+56(FP) - RET - -// func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte, tmp *[5120]byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBetterBlockAsm8B(SB), $24-64 - MOVQ tmp+48(FP), AX - MOVQ dst_base+0(FP), CX - MOVQ $0x00000028, DX - MOVQ AX, BX - PXOR X0, X0 - -zero_loop_encodeSnappyBetterBlockAsm8B: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_encodeSnappyBetterBlockAsm8B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), DX - LEAQ -9(DX), BX - LEAQ -8(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), BX - -search_loop_encodeSnappyBetterBlockAsm8B: - MOVL DX, SI - SUBL 12(SP), SI - SHRL $0x04, SI - LEAL 1(DX)(SI*1), SI - CMPL SI, 8(SP) - JAE emit_remainder_encodeSnappyBetterBlockAsm8B - MOVQ (BX)(DX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ $0x9e3779b1, SI - MOVQ DI, R10 - MOVQ DI, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x36, R10 - SHLQ $0x20, R11 - IMULQ SI, R11 - SHRQ $0x38, R11 - MOVL (AX)(R10*4), SI - MOVL 4096(AX)(R11*4), R8 - MOVL DX, (AX)(R10*4) - MOVL DX, 4096(AX)(R11*4) - MOVQ (BX)(SI*1), R10 - MOVQ (BX)(R8*1), R11 - CMPQ R10, DI - JEQ candidate_match_encodeSnappyBetterBlockAsm8B - CMPQ R11, DI - JNE 
no_short_found_encodeSnappyBetterBlockAsm8B - MOVL R8, SI - JMP candidate_match_encodeSnappyBetterBlockAsm8B - -no_short_found_encodeSnappyBetterBlockAsm8B: - CMPL R10, DI - JEQ candidate_match_encodeSnappyBetterBlockAsm8B - CMPL R11, DI - JEQ candidateS_match_encodeSnappyBetterBlockAsm8B - MOVL 20(SP), DX - JMP search_loop_encodeSnappyBetterBlockAsm8B - -candidateS_match_encodeSnappyBetterBlockAsm8B: - SHRQ $0x08, DI - MOVQ DI, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x36, R10 - MOVL (AX)(R10*4), SI - INCL DX - MOVL DX, (AX)(R10*4) - CMPL (BX)(SI*1), DI - JEQ candidate_match_encodeSnappyBetterBlockAsm8B - DECL DX - MOVL R8, SI - -candidate_match_encodeSnappyBetterBlockAsm8B: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B - -match_extend_back_loop_encodeSnappyBetterBlockAsm8B: - CMPL DX, DI - JBE match_extend_back_end_encodeSnappyBetterBlockAsm8B - MOVB -1(BX)(SI*1), R8 - MOVB -1(BX)(DX*1), R9 - CMPB R8, R9 - JNE match_extend_back_end_encodeSnappyBetterBlockAsm8B - LEAL -1(DX), DX - DECL SI - JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B - JMP match_extend_back_loop_encodeSnappyBetterBlockAsm8B - -match_extend_back_end_encodeSnappyBetterBlockAsm8B: - MOVL DX, DI - SUBL 12(SP), DI - LEAQ 3(CX)(DI*1), DI - CMPQ DI, (SP) - JB match_dst_size_check_encodeSnappyBetterBlockAsm8B - MOVQ $0x00000000, ret+56(FP) - RET - -match_dst_size_check_encodeSnappyBetterBlockAsm8B: - MOVL DX, DI - ADDL $0x04, DX - ADDL $0x04, SI - MOVQ src_len+32(FP), R8 - SUBL DX, R8 - LEAQ (BX)(DX*1), R9 - LEAQ (BX)(SI*1), R10 - - // matchLen - XORL R12, R12 - -matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B: - CMPL R8, $0x10 - JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B - MOVQ (R9)(R12*1), R11 - MOVQ 8(R9)(R12*1), R13 - XORQ (R10)(R12*1), R11 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B - XORQ 8(R10)(R12*1), R13 - JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B - LEAL -16(R8), R8 - LEAL 
16(R12), R12 - JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B - -matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B: -#ifdef GOAMD64_v3 - TZCNTQ R13, R13 - -#else - BSFQ R13, R13 - -#endif - SARQ $0x03, R13 - LEAL 8(R12)(R13*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm8B - -matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B: - CMPL R8, $0x08 - JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B - MOVQ (R9)(R12*1), R11 - XORQ (R10)(R12*1), R11 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B - LEAL -8(R8), R8 - LEAL 8(R12), R12 - JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B - -matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL (R12)(R11*1), R12 - JMP match_nolit_end_encodeSnappyBetterBlockAsm8B - -matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B: - CMPL R8, $0x04 - JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B - MOVL (R9)(R12*1), R11 - CMPL (R10)(R12*1), R11 - JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B - LEAL -4(R8), R8 - LEAL 4(R12), R12 - -matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B: - CMPL R8, $0x01 - JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B - JB match_nolit_end_encodeSnappyBetterBlockAsm8B - MOVW (R9)(R12*1), R11 - CMPW (R10)(R12*1), R11 - JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B - LEAL 2(R12), R12 - SUBL $0x02, R8 - JZ match_nolit_end_encodeSnappyBetterBlockAsm8B - -matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B: - MOVB (R9)(R12*1), R11 - CMPB (R10)(R12*1), R11 - JNE match_nolit_end_encodeSnappyBetterBlockAsm8B - LEAL 1(R12), R12 - -match_nolit_end_encodeSnappyBetterBlockAsm8B: - MOVL DX, R8 - SUBL SI, R8 - - // Check if repeat - MOVL R8, 16(SP) - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ 
(BX)(SI*1), R10 - SUBL SI, R9 - LEAL -1(R9), SI - CMPL SI, $0x3c - JB one_byte_match_emit_encodeSnappyBetterBlockAsm8B - CMPL SI, $0x00000100 - JB two_bytes_match_emit_encodeSnappyBetterBlockAsm8B - JB three_bytes_match_emit_encodeSnappyBetterBlockAsm8B - -three_bytes_match_emit_encodeSnappyBetterBlockAsm8B: - MOVB $0xf4, (CX) - MOVW SI, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B - -two_bytes_match_emit_encodeSnappyBetterBlockAsm8B: - MOVB $0xf0, (CX) - MOVB SI, 1(CX) - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_match_emit_encodeSnappyBetterBlockAsm8B - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B - -one_byte_match_emit_encodeSnappyBetterBlockAsm8B: - SHLB $0x02, SI - MOVB SI, (CX) - ADDQ $0x01, CX - -memmove_match_emit_encodeSnappyBetterBlockAsm8B: - LEAQ (CX)(R9*1), SI - - // genMemMoveShort - CMPQ R9, $0x08 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8: - MOVQ (R10), R11 - MOVQ R11, (CX) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: - MOVQ (R10), R11 - MOVQ -8(R10)(R9*1), R10 - MOVQ R11, (CX) - MOVQ R10, -8(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: - MOVOU (R10), X0 - MOVOU -16(R10)(R9*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(R9*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: - MOVOU (R10), X0 
- MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - -memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B: - MOVQ SI, CX - JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B - -memmove_long_match_emit_encodeSnappyBetterBlockAsm8B: - LEAQ (CX)(R9*1), SI - - // genMemMoveLong - MOVOU (R10), X0 - MOVOU 16(R10), X1 - MOVOU -32(R10)(R9*1), X2 - MOVOU -16(R10)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ CX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R14 - SUBQ R11, R14 - DECQ R13 - JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(R10)(R14*1), R11 - LEAQ -32(CX)(R14*1), R15 - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R11 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R10)(R14*1), X4 - MOVOU -16(R10)(R14*1), X5 - MOVOA X4, -32(CX)(R14*1) - MOVOA X5, -16(CX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(R9*1) - MOVOU X3, -16(CX)(R9*1) - MOVQ SI, CX - -emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B: - ADDL R12, DX - ADDL $0x04, R12 - MOVL DX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B: - CMPL R12, $0x40 - JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B - MOVB $0xee, (CX) - MOVW R8, 1(CX) - LEAL -60(R12), R12 - ADDQ $0x03, CX - JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B - -two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B: - MOVL R12, SI 
- SHLL $0x02, SI - CMPL R12, $0x0c - JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B - LEAL -15(SI), SI - MOVB R8, 1(CX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, SI - MOVB SI, (CX) - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B - -emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B: - LEAL -2(SI), SI - MOVB SI, (CX) - MOVW R8, 1(CX) - ADDQ $0x03, CX - -match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B: - CMPL DX, 8(SP) - JAE emit_remainder_encodeSnappyBetterBlockAsm8B - CMPQ CX, (SP) - JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B - MOVQ $0x00000000, ret+56(FP) - RET - -match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B: - MOVQ $0x0000cf1bbcdcbf9b, SI - MOVQ $0x9e3779b1, R8 - LEAQ 1(DI), DI - LEAQ -2(DX), R9 - MOVQ (BX)(DI*1), R10 - MOVQ 1(BX)(DI*1), R11 - MOVQ (BX)(R9*1), R12 - MOVQ 1(BX)(R9*1), R13 - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x36, R10 - SHLQ $0x20, R11 - IMULQ R8, R11 - SHRQ $0x38, R11 - SHLQ $0x10, R12 - IMULQ SI, R12 - SHRQ $0x36, R12 - SHLQ $0x20, R13 - IMULQ R8, R13 - SHRQ $0x38, R13 - LEAQ 1(DI), R8 - LEAQ 1(R9), R14 - MOVL DI, (AX)(R10*4) - MOVL R9, (AX)(R12*4) - MOVL R8, 4096(AX)(R11*4) - MOVL R14, 4096(AX)(R13*4) - LEAQ 1(R9)(DI*1), R8 - SHRQ $0x01, R8 - ADDQ $0x01, DI - SUBQ $0x01, R9 - -index_loop_encodeSnappyBetterBlockAsm8B: - CMPQ R8, R9 - JAE search_loop_encodeSnappyBetterBlockAsm8B - MOVQ (BX)(DI*1), R10 - MOVQ (BX)(R8*1), R11 - SHLQ $0x10, R10 - IMULQ SI, R10 - SHRQ $0x36, R10 - SHLQ $0x10, R11 - IMULQ SI, R11 - SHRQ $0x36, R11 - MOVL DI, (AX)(R10*4) - MOVL R8, (AX)(R11*4) - ADDQ $0x02, DI - ADDQ $0x02, R8 - JMP index_loop_encodeSnappyBetterBlockAsm8B - -emit_remainder_encodeSnappyBetterBlockAsm8B: - MOVQ src_len+32(FP), AX - SUBL 12(SP), AX - LEAQ 3(CX)(AX*1), AX - CMPQ AX, (SP) - JB emit_remainder_ok_encodeSnappyBetterBlockAsm8B - MOVQ $0x00000000, ret+56(FP) - RET - -emit_remainder_ok_encodeSnappyBetterBlockAsm8B: - MOVQ src_len+32(FP), AX - MOVL 12(SP), DX - CMPL DX, AX - 
JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B - MOVL AX, SI - MOVL AX, 12(SP) - LEAQ (BX)(DX*1), AX - SUBL DX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B - JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B - -three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B: - MOVB $0xf4, (CX) - MOVW DX, 1(CX) - ADDQ $0x03, CX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B - -two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B: - MOVB $0xf0, (CX) - MOVB DL, 1(CX) - ADDQ $0x02, CX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeSnappyBetterBlockAsm8B - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B - -one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B: - SHLB $0x02, DL - MOVB DL, (CX) - ADDQ $0x01, CX - -memmove_emit_remainder_encodeSnappyBetterBlockAsm8B: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2: - MOVB (AX), SI - MOVB -1(AX)(BX*1), AL - MOVB SI, (CX) - MOVB AL, -1(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3: - MOVW (AX), SI - MOVB 2(AX), AL - MOVW SI, (CX) - MOVB AL, 2(CX) - JMP 
memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7: - MOVL (AX), SI - MOVL -4(AX)(BX*1), AX - MOVL SI, (CX) - MOVL AX, -4(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: - MOVQ (AX), SI - MOVQ -8(AX)(BX*1), AX - MOVQ SI, (CX) - MOVQ AX, -8(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: - MOVOU (AX), X0 - MOVOU -16(AX)(BX*1), X1 - MOVOU X0, (CX) - MOVOU X1, -16(CX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B: - MOVQ DX, CX - JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B - -memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B: - LEAQ (CX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (AX), X0 - MOVOU 16(AX), X1 - MOVOU -32(AX)(BX*1), X2 - MOVOU -16(AX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ CX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(AX)(R8*1), SI - LEAQ -32(CX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back - 
-emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(AX)(R8*1), X4 - MOVOU -16(AX)(R8*1), X5 - MOVOA X4, -32(CX)(R8*1) - MOVOA X5, -16(CX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (CX) - MOVOU X1, 16(CX) - MOVOU X2, -32(CX)(BX*1) - MOVOU X3, -16(CX)(BX*1) - MOVQ DX, CX - -emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B: - MOVQ dst_base+0(FP), AX - SUBQ AX, CX - MOVQ CX, ret+56(FP) - RET - -// func calcBlockSize(src []byte, tmp *[32768]byte) int -// Requires: BMI, SSE2 -TEXT ·calcBlockSize(SB), $24-40 - MOVQ tmp+24(FP), AX - XORQ CX, CX - MOVQ $0x00000100, DX - MOVQ AX, BX - PXOR X0, X0 - -zero_loop_calcBlockSize: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_calcBlockSize - MOVL $0x00000000, 12(SP) - MOVQ src_len+8(FP), DX - LEAQ -9(DX), BX - LEAQ -8(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL DX, 16(SP) - MOVQ src_base+0(FP), BX - -search_loop_calcBlockSize: - MOVL DX, SI - SUBL 12(SP), SI - SHRL $0x05, SI - LEAL 4(DX)(SI*1), SI - CMPL SI, 8(SP) - JAE emit_remainder_calcBlockSize - MOVQ (BX)(DX*1), DI - MOVL SI, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x33, R10 - SHLQ $0x10, R11 - IMULQ R9, R11 - SHRQ $0x33, R11 - MOVL (AX)(R10*4), SI - MOVL (AX)(R11*4), R8 - MOVL DX, (AX)(R10*4) - LEAL 1(DX), R10 - MOVL R10, (AX)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x10, R10 - IMULQ R9, R10 - SHRQ $0x33, R10 - MOVL DX, R9 - SUBL 16(SP), R9 - MOVL 1(BX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_calcBlockSize - LEAL 1(DX), DI - MOVL 12(SP), SI - MOVL DI, 
R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_calcBlockSize - -repeat_extend_back_loop_calcBlockSize: - CMPL DI, SI - JBE repeat_extend_back_end_calcBlockSize - MOVB -1(BX)(R8*1), R9 - MOVB -1(BX)(DI*1), R10 - CMPB R9, R10 - JNE repeat_extend_back_end_calcBlockSize - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_calcBlockSize - -repeat_extend_back_end_calcBlockSize: - MOVL DI, SI - SUBL 12(SP), SI - LEAQ 5(CX)(SI*1), SI - CMPQ SI, (SP) - JB repeat_dst_size_check_calcBlockSize - MOVQ $0x00000000, ret+32(FP) - RET - -repeat_dst_size_check_calcBlockSize: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_calcBlockSize - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JB one_byte_repeat_emit_calcBlockSize - CMPL SI, $0x00000100 - JB two_bytes_repeat_emit_calcBlockSize - CMPL SI, $0x00010000 - JB three_bytes_repeat_emit_calcBlockSize - CMPL SI, $0x01000000 - JB four_bytes_repeat_emit_calcBlockSize - ADDQ $0x05, CX - JMP memmove_long_repeat_emit_calcBlockSize - -four_bytes_repeat_emit_calcBlockSize: - ADDQ $0x04, CX - JMP memmove_long_repeat_emit_calcBlockSize - -three_bytes_repeat_emit_calcBlockSize: - ADDQ $0x03, CX - JMP memmove_long_repeat_emit_calcBlockSize - -two_bytes_repeat_emit_calcBlockSize: - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_repeat_emit_calcBlockSize - JMP memmove_long_repeat_emit_calcBlockSize - -one_byte_repeat_emit_calcBlockSize: - ADDQ $0x01, CX - -memmove_repeat_emit_calcBlockSize: - LEAQ (CX)(R8*1), CX - JMP emit_literal_done_repeat_emit_calcBlockSize - -memmove_long_repeat_emit_calcBlockSize: - LEAQ (CX)(R8*1), CX - -emit_literal_done_repeat_emit_calcBlockSize: - ADDL $0x05, DX - MOVL DX, SI - SUBL 16(SP), SI - MOVQ src_len+8(FP), R8 - SUBL DX, R8 - LEAQ (BX)(DX*1), R9 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R11, R11 - -matchlen_loopback_16_repeat_extend_calcBlockSize: - CMPL R8, $0x10 - JB matchlen_match8_repeat_extend_calcBlockSize - MOVQ (R9)(R11*1), 
R10 - MOVQ 8(R9)(R11*1), R12 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_calcBlockSize - XORQ 8(SI)(R11*1), R12 - JNZ matchlen_bsf_16repeat_extend_calcBlockSize - LEAL -16(R8), R8 - LEAL 16(R11), R11 - JMP matchlen_loopback_16_repeat_extend_calcBlockSize - -matchlen_bsf_16repeat_extend_calcBlockSize: -#ifdef GOAMD64_v3 - TZCNTQ R12, R12 - -#else - BSFQ R12, R12 - -#endif - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP repeat_extend_forward_end_calcBlockSize - -matchlen_match8_repeat_extend_calcBlockSize: - CMPL R8, $0x08 - JB matchlen_match4_repeat_extend_calcBlockSize - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_calcBlockSize - LEAL -8(R8), R8 - LEAL 8(R11), R11 - JMP matchlen_match4_repeat_extend_calcBlockSize - -matchlen_bsf_8_repeat_extend_calcBlockSize: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_calcBlockSize - -matchlen_match4_repeat_extend_calcBlockSize: - CMPL R8, $0x04 - JB matchlen_match2_repeat_extend_calcBlockSize - MOVL (R9)(R11*1), R10 - CMPL (SI)(R11*1), R10 - JNE matchlen_match2_repeat_extend_calcBlockSize - LEAL -4(R8), R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_calcBlockSize: - CMPL R8, $0x01 - JE matchlen_match1_repeat_extend_calcBlockSize - JB repeat_extend_forward_end_calcBlockSize - MOVW (R9)(R11*1), R10 - CMPW (SI)(R11*1), R10 - JNE matchlen_match1_repeat_extend_calcBlockSize - LEAL 2(R11), R11 - SUBL $0x02, R8 - JZ repeat_extend_forward_end_calcBlockSize - -matchlen_match1_repeat_extend_calcBlockSize: - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_calcBlockSize - LEAL 1(R11), R11 - -repeat_extend_forward_end_calcBlockSize: - ADDL R11, DX - MOVL DX, SI - SUBL DI, SI - MOVL 16(SP), DI - - // emitCopy - CMPL DI, $0x00010000 - JB two_byte_offset_repeat_as_copy_calcBlockSize - -four_bytes_loop_back_repeat_as_copy_calcBlockSize: - CMPL SI, $0x40 - 
JBE four_bytes_remain_repeat_as_copy_calcBlockSize - LEAL -64(SI), SI - ADDQ $0x05, CX - CMPL SI, $0x04 - JB four_bytes_remain_repeat_as_copy_calcBlockSize - JMP four_bytes_loop_back_repeat_as_copy_calcBlockSize - -four_bytes_remain_repeat_as_copy_calcBlockSize: - TESTL SI, SI - JZ repeat_end_emit_calcBlockSize - XORL SI, SI - ADDQ $0x05, CX - JMP repeat_end_emit_calcBlockSize - -two_byte_offset_repeat_as_copy_calcBlockSize: - CMPL SI, $0x40 - JBE two_byte_offset_short_repeat_as_copy_calcBlockSize - LEAL -60(SI), SI - ADDQ $0x03, CX - JMP two_byte_offset_repeat_as_copy_calcBlockSize - -two_byte_offset_short_repeat_as_copy_calcBlockSize: - MOVL SI, R8 - SHLL $0x02, R8 - CMPL SI, $0x0c - JAE emit_copy_three_repeat_as_copy_calcBlockSize - CMPL DI, $0x00000800 - JAE emit_copy_three_repeat_as_copy_calcBlockSize - ADDQ $0x02, CX - JMP repeat_end_emit_calcBlockSize - -emit_copy_three_repeat_as_copy_calcBlockSize: - ADDQ $0x03, CX - -repeat_end_emit_calcBlockSize: - MOVL DX, 12(SP) - JMP search_loop_calcBlockSize - -no_repeat_found_calcBlockSize: - CMPL (BX)(SI*1), DI - JEQ candidate_match_calcBlockSize - SHRQ $0x08, DI - MOVL (AX)(R10*4), SI - LEAL 2(DX), R9 - CMPL (BX)(R8*1), DI - JEQ candidate2_match_calcBlockSize - MOVL R9, (AX)(R10*4) - SHRQ $0x08, DI - CMPL (BX)(SI*1), DI - JEQ candidate3_match_calcBlockSize - MOVL 20(SP), DX - JMP search_loop_calcBlockSize - -candidate3_match_calcBlockSize: - ADDL $0x02, DX - JMP candidate_match_calcBlockSize - -candidate2_match_calcBlockSize: - MOVL R9, (AX)(R10*4) - INCL DX - MOVL R8, SI - -candidate_match_calcBlockSize: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_calcBlockSize - -match_extend_back_loop_calcBlockSize: - CMPL DX, DI - JBE match_extend_back_end_calcBlockSize - MOVB -1(BX)(SI*1), R8 - MOVB -1(BX)(DX*1), R9 - CMPB R8, R9 - JNE match_extend_back_end_calcBlockSize - LEAL -1(DX), DX - DECL SI - JZ match_extend_back_end_calcBlockSize - JMP match_extend_back_loop_calcBlockSize - 
-match_extend_back_end_calcBlockSize: - MOVL DX, DI - SUBL 12(SP), DI - LEAQ 5(CX)(DI*1), DI - CMPQ DI, (SP) - JB match_dst_size_check_calcBlockSize - MOVQ $0x00000000, ret+32(FP) - RET - -match_dst_size_check_calcBlockSize: - MOVL DX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ emit_literal_done_match_emit_calcBlockSize - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), DI - CMPL DI, $0x3c - JB one_byte_match_emit_calcBlockSize - CMPL DI, $0x00000100 - JB two_bytes_match_emit_calcBlockSize - CMPL DI, $0x00010000 - JB three_bytes_match_emit_calcBlockSize - CMPL DI, $0x01000000 - JB four_bytes_match_emit_calcBlockSize - ADDQ $0x05, CX - JMP memmove_long_match_emit_calcBlockSize - -four_bytes_match_emit_calcBlockSize: - ADDQ $0x04, CX - JMP memmove_long_match_emit_calcBlockSize - -three_bytes_match_emit_calcBlockSize: - ADDQ $0x03, CX - JMP memmove_long_match_emit_calcBlockSize - -two_bytes_match_emit_calcBlockSize: - ADDQ $0x02, CX - CMPL DI, $0x40 - JB memmove_match_emit_calcBlockSize - JMP memmove_long_match_emit_calcBlockSize - -one_byte_match_emit_calcBlockSize: - ADDQ $0x01, CX - -memmove_match_emit_calcBlockSize: - LEAQ (CX)(R9*1), CX - JMP emit_literal_done_match_emit_calcBlockSize - -memmove_long_match_emit_calcBlockSize: - LEAQ (CX)(R9*1), CX - -emit_literal_done_match_emit_calcBlockSize: -match_nolit_loop_calcBlockSize: - MOVL DX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, DX - ADDL $0x04, SI - MOVQ src_len+8(FP), DI - SUBL DX, DI - LEAQ (BX)(DX*1), R8 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R10, R10 - -matchlen_loopback_16_match_nolit_calcBlockSize: - CMPL DI, $0x10 - JB matchlen_match8_match_nolit_calcBlockSize - MOVQ (R8)(R10*1), R9 - MOVQ 8(R8)(R10*1), R11 - XORQ (SI)(R10*1), R9 - JNZ matchlen_bsf_8_match_nolit_calcBlockSize - XORQ 8(SI)(R10*1), R11 - JNZ matchlen_bsf_16match_nolit_calcBlockSize - LEAL -16(DI), DI - LEAL 16(R10), R10 - JMP matchlen_loopback_16_match_nolit_calcBlockSize - 
-matchlen_bsf_16match_nolit_calcBlockSize: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL 8(R10)(R11*1), R10 - JMP match_nolit_end_calcBlockSize - -matchlen_match8_match_nolit_calcBlockSize: - CMPL DI, $0x08 - JB matchlen_match4_match_nolit_calcBlockSize - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - JNZ matchlen_bsf_8_match_nolit_calcBlockSize - LEAL -8(DI), DI - LEAL 8(R10), R10 - JMP matchlen_match4_match_nolit_calcBlockSize - -matchlen_bsf_8_match_nolit_calcBlockSize: -#ifdef GOAMD64_v3 - TZCNTQ R9, R9 - -#else - BSFQ R9, R9 - -#endif - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_calcBlockSize - -matchlen_match4_match_nolit_calcBlockSize: - CMPL DI, $0x04 - JB matchlen_match2_match_nolit_calcBlockSize - MOVL (R8)(R10*1), R9 - CMPL (SI)(R10*1), R9 - JNE matchlen_match2_match_nolit_calcBlockSize - LEAL -4(DI), DI - LEAL 4(R10), R10 - -matchlen_match2_match_nolit_calcBlockSize: - CMPL DI, $0x01 - JE matchlen_match1_match_nolit_calcBlockSize - JB match_nolit_end_calcBlockSize - MOVW (R8)(R10*1), R9 - CMPW (SI)(R10*1), R9 - JNE matchlen_match1_match_nolit_calcBlockSize - LEAL 2(R10), R10 - SUBL $0x02, DI - JZ match_nolit_end_calcBlockSize - -matchlen_match1_match_nolit_calcBlockSize: - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_calcBlockSize - LEAL 1(R10), R10 - -match_nolit_end_calcBlockSize: - ADDL R10, DX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL DX, 12(SP) - - // emitCopy - CMPL SI, $0x00010000 - JB two_byte_offset_match_nolit_calcBlockSize - -four_bytes_loop_back_match_nolit_calcBlockSize: - CMPL R10, $0x40 - JBE four_bytes_remain_match_nolit_calcBlockSize - LEAL -64(R10), R10 - ADDQ $0x05, CX - CMPL R10, $0x04 - JB four_bytes_remain_match_nolit_calcBlockSize - JMP four_bytes_loop_back_match_nolit_calcBlockSize - -four_bytes_remain_match_nolit_calcBlockSize: - TESTL R10, R10 - JZ match_nolit_emitcopy_end_calcBlockSize - XORL SI, SI - ADDQ $0x05, CX - JMP 
match_nolit_emitcopy_end_calcBlockSize - -two_byte_offset_match_nolit_calcBlockSize: - CMPL R10, $0x40 - JBE two_byte_offset_short_match_nolit_calcBlockSize - LEAL -60(R10), R10 - ADDQ $0x03, CX - JMP two_byte_offset_match_nolit_calcBlockSize - -two_byte_offset_short_match_nolit_calcBlockSize: - MOVL R10, DI - SHLL $0x02, DI - CMPL R10, $0x0c - JAE emit_copy_three_match_nolit_calcBlockSize - CMPL SI, $0x00000800 - JAE emit_copy_three_match_nolit_calcBlockSize - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_calcBlockSize - -emit_copy_three_match_nolit_calcBlockSize: - ADDQ $0x03, CX - -match_nolit_emitcopy_end_calcBlockSize: - CMPL DX, 8(SP) - JAE emit_remainder_calcBlockSize - MOVQ -2(BX)(DX*1), DI - CMPQ CX, (SP) - JB match_nolit_dst_ok_calcBlockSize - MOVQ $0x00000000, ret+32(FP) - RET - -match_nolit_dst_ok_calcBlockSize: - MOVQ $0x0000cf1bbcdcbf9b, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x10, R8 - IMULQ R9, R8 - SHRQ $0x33, R8 - SHLQ $0x10, SI - IMULQ R9, SI - SHRQ $0x33, SI - LEAL -2(DX), R9 - LEAQ (AX)(SI*4), R10 - MOVL (R10), SI - MOVL R9, (AX)(R8*4) - MOVL DX, (R10) - CMPL (BX)(SI*1), DI - JEQ match_nolit_loop_calcBlockSize - INCL DX - JMP search_loop_calcBlockSize - -emit_remainder_calcBlockSize: - MOVQ src_len+8(FP), AX - SUBL 12(SP), AX - LEAQ 5(CX)(AX*1), AX - CMPQ AX, (SP) - JB emit_remainder_ok_calcBlockSize - MOVQ $0x00000000, ret+32(FP) - RET - -emit_remainder_ok_calcBlockSize: - MOVQ src_len+8(FP), AX - MOVL 12(SP), DX - CMPL DX, AX - JEQ emit_literal_done_emit_remainder_calcBlockSize - MOVL AX, SI - MOVL AX, 12(SP) - LEAQ (BX)(DX*1), AX - SUBL DX, SI - LEAL -1(SI), AX - CMPL AX, $0x3c - JB one_byte_emit_remainder_calcBlockSize - CMPL AX, $0x00000100 - JB two_bytes_emit_remainder_calcBlockSize - CMPL AX, $0x00010000 - JB three_bytes_emit_remainder_calcBlockSize - CMPL AX, $0x01000000 - JB four_bytes_emit_remainder_calcBlockSize - ADDQ $0x05, CX - JMP memmove_long_emit_remainder_calcBlockSize - 
-four_bytes_emit_remainder_calcBlockSize: - ADDQ $0x04, CX - JMP memmove_long_emit_remainder_calcBlockSize - -three_bytes_emit_remainder_calcBlockSize: - ADDQ $0x03, CX - JMP memmove_long_emit_remainder_calcBlockSize - -two_bytes_emit_remainder_calcBlockSize: - ADDQ $0x02, CX - CMPL AX, $0x40 - JB memmove_emit_remainder_calcBlockSize - JMP memmove_long_emit_remainder_calcBlockSize - -one_byte_emit_remainder_calcBlockSize: - ADDQ $0x01, CX - -memmove_emit_remainder_calcBlockSize: - LEAQ (CX)(SI*1), AX - MOVQ AX, CX - JMP emit_literal_done_emit_remainder_calcBlockSize - -memmove_long_emit_remainder_calcBlockSize: - LEAQ (CX)(SI*1), AX - MOVQ AX, CX - -emit_literal_done_emit_remainder_calcBlockSize: - MOVQ CX, ret+32(FP) - RET - -// func calcBlockSizeSmall(src []byte, tmp *[2048]byte) int -// Requires: BMI, SSE2 -TEXT ·calcBlockSizeSmall(SB), $24-40 - MOVQ tmp+24(FP), AX - XORQ CX, CX - MOVQ $0x00000010, DX - MOVQ AX, BX - PXOR X0, X0 - -zero_loop_calcBlockSizeSmall: - MOVOU X0, (BX) - MOVOU X0, 16(BX) - MOVOU X0, 32(BX) - MOVOU X0, 48(BX) - MOVOU X0, 64(BX) - MOVOU X0, 80(BX) - MOVOU X0, 96(BX) - MOVOU X0, 112(BX) - ADDQ $0x80, BX - DECQ DX - JNZ zero_loop_calcBlockSizeSmall - MOVL $0x00000000, 12(SP) - MOVQ src_len+8(FP), DX - LEAQ -9(DX), BX - LEAQ -8(DX), SI - MOVL SI, 8(SP) - SHRQ $0x05, DX - SUBL DX, BX - LEAQ (CX)(BX*1), BX - MOVQ BX, (SP) - MOVL $0x00000001, DX - MOVL DX, 16(SP) - MOVQ src_base+0(FP), BX - -search_loop_calcBlockSizeSmall: - MOVL DX, SI - SUBL 12(SP), SI - SHRL $0x04, SI - LEAL 4(DX)(SI*1), SI - CMPL SI, 8(SP) - JAE emit_remainder_calcBlockSizeSmall - MOVQ (BX)(DX*1), DI - MOVL SI, 20(SP) - MOVQ $0x9e3779b1, R9 - MOVQ DI, R10 - MOVQ DI, R11 - SHRQ $0x08, R11 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ $0x37, R10 - SHLQ $0x20, R11 - IMULQ R9, R11 - SHRQ $0x37, R11 - MOVL (AX)(R10*4), SI - MOVL (AX)(R11*4), R8 - MOVL DX, (AX)(R10*4) - LEAL 1(DX), R10 - MOVL R10, (AX)(R11*4) - MOVQ DI, R10 - SHRQ $0x10, R10 - SHLQ $0x20, R10 - IMULQ R9, R10 - SHRQ 
$0x37, R10 - MOVL DX, R9 - SUBL 16(SP), R9 - MOVL 1(BX)(R9*1), R11 - MOVQ DI, R9 - SHRQ $0x08, R9 - CMPL R9, R11 - JNE no_repeat_found_calcBlockSizeSmall - LEAL 1(DX), DI - MOVL 12(SP), SI - MOVL DI, R8 - SUBL 16(SP), R8 - JZ repeat_extend_back_end_calcBlockSizeSmall - -repeat_extend_back_loop_calcBlockSizeSmall: - CMPL DI, SI - JBE repeat_extend_back_end_calcBlockSizeSmall - MOVB -1(BX)(R8*1), R9 - MOVB -1(BX)(DI*1), R10 - CMPB R9, R10 - JNE repeat_extend_back_end_calcBlockSizeSmall - LEAL -1(DI), DI - DECL R8 - JNZ repeat_extend_back_loop_calcBlockSizeSmall - -repeat_extend_back_end_calcBlockSizeSmall: - MOVL DI, SI - SUBL 12(SP), SI - LEAQ 3(CX)(SI*1), SI - CMPQ SI, (SP) - JB repeat_dst_size_check_calcBlockSizeSmall - MOVQ $0x00000000, ret+32(FP) - RET - -repeat_dst_size_check_calcBlockSizeSmall: - MOVL 12(SP), SI - CMPL SI, DI - JEQ emit_literal_done_repeat_emit_calcBlockSizeSmall - MOVL DI, R8 - MOVL DI, 12(SP) - LEAQ (BX)(SI*1), R9 - SUBL SI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JB one_byte_repeat_emit_calcBlockSizeSmall - CMPL SI, $0x00000100 - JB two_bytes_repeat_emit_calcBlockSizeSmall - JB three_bytes_repeat_emit_calcBlockSizeSmall - -three_bytes_repeat_emit_calcBlockSizeSmall: - ADDQ $0x03, CX - JMP memmove_long_repeat_emit_calcBlockSizeSmall - -two_bytes_repeat_emit_calcBlockSizeSmall: - ADDQ $0x02, CX - CMPL SI, $0x40 - JB memmove_repeat_emit_calcBlockSizeSmall - JMP memmove_long_repeat_emit_calcBlockSizeSmall - -one_byte_repeat_emit_calcBlockSizeSmall: - ADDQ $0x01, CX - -memmove_repeat_emit_calcBlockSizeSmall: - LEAQ (CX)(R8*1), CX - JMP emit_literal_done_repeat_emit_calcBlockSizeSmall - -memmove_long_repeat_emit_calcBlockSizeSmall: - LEAQ (CX)(R8*1), CX - -emit_literal_done_repeat_emit_calcBlockSizeSmall: - ADDL $0x05, DX - MOVL DX, SI - SUBL 16(SP), SI - MOVQ src_len+8(FP), R8 - SUBL DX, R8 - LEAQ (BX)(DX*1), R9 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R11, R11 - -matchlen_loopback_16_repeat_extend_calcBlockSizeSmall: - CMPL R8, $0x10 - JB 
matchlen_match8_repeat_extend_calcBlockSizeSmall - MOVQ (R9)(R11*1), R10 - MOVQ 8(R9)(R11*1), R12 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_calcBlockSizeSmall - XORQ 8(SI)(R11*1), R12 - JNZ matchlen_bsf_16repeat_extend_calcBlockSizeSmall - LEAL -16(R8), R8 - LEAL 16(R11), R11 - JMP matchlen_loopback_16_repeat_extend_calcBlockSizeSmall - -matchlen_bsf_16repeat_extend_calcBlockSizeSmall: -#ifdef GOAMD64_v3 - TZCNTQ R12, R12 - -#else - BSFQ R12, R12 - -#endif - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP repeat_extend_forward_end_calcBlockSizeSmall - -matchlen_match8_repeat_extend_calcBlockSizeSmall: - CMPL R8, $0x08 - JB matchlen_match4_repeat_extend_calcBlockSizeSmall - MOVQ (R9)(R11*1), R10 - XORQ (SI)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_calcBlockSizeSmall - LEAL -8(R8), R8 - LEAL 8(R11), R11 - JMP matchlen_match4_repeat_extend_calcBlockSizeSmall - -matchlen_bsf_8_repeat_extend_calcBlockSizeSmall: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_calcBlockSizeSmall - -matchlen_match4_repeat_extend_calcBlockSizeSmall: - CMPL R8, $0x04 - JB matchlen_match2_repeat_extend_calcBlockSizeSmall - MOVL (R9)(R11*1), R10 - CMPL (SI)(R11*1), R10 - JNE matchlen_match2_repeat_extend_calcBlockSizeSmall - LEAL -4(R8), R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_calcBlockSizeSmall: - CMPL R8, $0x01 - JE matchlen_match1_repeat_extend_calcBlockSizeSmall - JB repeat_extend_forward_end_calcBlockSizeSmall - MOVW (R9)(R11*1), R10 - CMPW (SI)(R11*1), R10 - JNE matchlen_match1_repeat_extend_calcBlockSizeSmall - LEAL 2(R11), R11 - SUBL $0x02, R8 - JZ repeat_extend_forward_end_calcBlockSizeSmall - -matchlen_match1_repeat_extend_calcBlockSizeSmall: - MOVB (R9)(R11*1), R10 - CMPB (SI)(R11*1), R10 - JNE repeat_extend_forward_end_calcBlockSizeSmall - LEAL 1(R11), R11 - -repeat_extend_forward_end_calcBlockSizeSmall: - ADDL R11, DX - MOVL DX, SI - SUBL DI, SI 
- MOVL 16(SP), DI - - // emitCopy -two_byte_offset_repeat_as_copy_calcBlockSizeSmall: - CMPL SI, $0x40 - JBE two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall - LEAL -60(SI), SI - ADDQ $0x03, CX - JMP two_byte_offset_repeat_as_copy_calcBlockSizeSmall - -two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall: - MOVL SI, DI - SHLL $0x02, DI - CMPL SI, $0x0c - JAE emit_copy_three_repeat_as_copy_calcBlockSizeSmall - ADDQ $0x02, CX - JMP repeat_end_emit_calcBlockSizeSmall - -emit_copy_three_repeat_as_copy_calcBlockSizeSmall: - ADDQ $0x03, CX - -repeat_end_emit_calcBlockSizeSmall: - MOVL DX, 12(SP) - JMP search_loop_calcBlockSizeSmall - -no_repeat_found_calcBlockSizeSmall: - CMPL (BX)(SI*1), DI - JEQ candidate_match_calcBlockSizeSmall - SHRQ $0x08, DI - MOVL (AX)(R10*4), SI - LEAL 2(DX), R9 - CMPL (BX)(R8*1), DI - JEQ candidate2_match_calcBlockSizeSmall - MOVL R9, (AX)(R10*4) - SHRQ $0x08, DI - CMPL (BX)(SI*1), DI - JEQ candidate3_match_calcBlockSizeSmall - MOVL 20(SP), DX - JMP search_loop_calcBlockSizeSmall - -candidate3_match_calcBlockSizeSmall: - ADDL $0x02, DX - JMP candidate_match_calcBlockSizeSmall - -candidate2_match_calcBlockSizeSmall: - MOVL R9, (AX)(R10*4) - INCL DX - MOVL R8, SI - -candidate_match_calcBlockSizeSmall: - MOVL 12(SP), DI - TESTL SI, SI - JZ match_extend_back_end_calcBlockSizeSmall - -match_extend_back_loop_calcBlockSizeSmall: - CMPL DX, DI - JBE match_extend_back_end_calcBlockSizeSmall - MOVB -1(BX)(SI*1), R8 - MOVB -1(BX)(DX*1), R9 - CMPB R8, R9 - JNE match_extend_back_end_calcBlockSizeSmall - LEAL -1(DX), DX - DECL SI - JZ match_extend_back_end_calcBlockSizeSmall - JMP match_extend_back_loop_calcBlockSizeSmall - -match_extend_back_end_calcBlockSizeSmall: - MOVL DX, DI - SUBL 12(SP), DI - LEAQ 3(CX)(DI*1), DI - CMPQ DI, (SP) - JB match_dst_size_check_calcBlockSizeSmall - MOVQ $0x00000000, ret+32(FP) - RET - -match_dst_size_check_calcBlockSizeSmall: - MOVL DX, DI - MOVL 12(SP), R8 - CMPL R8, DI - JEQ 
emit_literal_done_match_emit_calcBlockSizeSmall - MOVL DI, R9 - MOVL DI, 12(SP) - LEAQ (BX)(R8*1), DI - SUBL R8, R9 - LEAL -1(R9), DI - CMPL DI, $0x3c - JB one_byte_match_emit_calcBlockSizeSmall - CMPL DI, $0x00000100 - JB two_bytes_match_emit_calcBlockSizeSmall - JB three_bytes_match_emit_calcBlockSizeSmall - -three_bytes_match_emit_calcBlockSizeSmall: - ADDQ $0x03, CX - JMP memmove_long_match_emit_calcBlockSizeSmall - -two_bytes_match_emit_calcBlockSizeSmall: - ADDQ $0x02, CX - CMPL DI, $0x40 - JB memmove_match_emit_calcBlockSizeSmall - JMP memmove_long_match_emit_calcBlockSizeSmall - -one_byte_match_emit_calcBlockSizeSmall: - ADDQ $0x01, CX - -memmove_match_emit_calcBlockSizeSmall: - LEAQ (CX)(R9*1), CX - JMP emit_literal_done_match_emit_calcBlockSizeSmall - -memmove_long_match_emit_calcBlockSizeSmall: - LEAQ (CX)(R9*1), CX - -emit_literal_done_match_emit_calcBlockSizeSmall: -match_nolit_loop_calcBlockSizeSmall: - MOVL DX, DI - SUBL SI, DI - MOVL DI, 16(SP) - ADDL $0x04, DX - ADDL $0x04, SI - MOVQ src_len+8(FP), DI - SUBL DX, DI - LEAQ (BX)(DX*1), R8 - LEAQ (BX)(SI*1), SI - - // matchLen - XORL R10, R10 - -matchlen_loopback_16_match_nolit_calcBlockSizeSmall: - CMPL DI, $0x10 - JB matchlen_match8_match_nolit_calcBlockSizeSmall - MOVQ (R8)(R10*1), R9 - MOVQ 8(R8)(R10*1), R11 - XORQ (SI)(R10*1), R9 - JNZ matchlen_bsf_8_match_nolit_calcBlockSizeSmall - XORQ 8(SI)(R10*1), R11 - JNZ matchlen_bsf_16match_nolit_calcBlockSizeSmall - LEAL -16(DI), DI - LEAL 16(R10), R10 - JMP matchlen_loopback_16_match_nolit_calcBlockSizeSmall - -matchlen_bsf_16match_nolit_calcBlockSizeSmall: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL 8(R10)(R11*1), R10 - JMP match_nolit_end_calcBlockSizeSmall - -matchlen_match8_match_nolit_calcBlockSizeSmall: - CMPL DI, $0x08 - JB matchlen_match4_match_nolit_calcBlockSizeSmall - MOVQ (R8)(R10*1), R9 - XORQ (SI)(R10*1), R9 - JNZ matchlen_bsf_8_match_nolit_calcBlockSizeSmall - LEAL -8(DI), DI - LEAL 
8(R10), R10 - JMP matchlen_match4_match_nolit_calcBlockSizeSmall - -matchlen_bsf_8_match_nolit_calcBlockSizeSmall: -#ifdef GOAMD64_v3 - TZCNTQ R9, R9 - -#else - BSFQ R9, R9 - -#endif - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP match_nolit_end_calcBlockSizeSmall - -matchlen_match4_match_nolit_calcBlockSizeSmall: - CMPL DI, $0x04 - JB matchlen_match2_match_nolit_calcBlockSizeSmall - MOVL (R8)(R10*1), R9 - CMPL (SI)(R10*1), R9 - JNE matchlen_match2_match_nolit_calcBlockSizeSmall - LEAL -4(DI), DI - LEAL 4(R10), R10 - -matchlen_match2_match_nolit_calcBlockSizeSmall: - CMPL DI, $0x01 - JE matchlen_match1_match_nolit_calcBlockSizeSmall - JB match_nolit_end_calcBlockSizeSmall - MOVW (R8)(R10*1), R9 - CMPW (SI)(R10*1), R9 - JNE matchlen_match1_match_nolit_calcBlockSizeSmall - LEAL 2(R10), R10 - SUBL $0x02, DI - JZ match_nolit_end_calcBlockSizeSmall - -matchlen_match1_match_nolit_calcBlockSizeSmall: - MOVB (R8)(R10*1), R9 - CMPB (SI)(R10*1), R9 - JNE match_nolit_end_calcBlockSizeSmall - LEAL 1(R10), R10 - -match_nolit_end_calcBlockSizeSmall: - ADDL R10, DX - MOVL 16(SP), SI - ADDL $0x04, R10 - MOVL DX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_calcBlockSizeSmall: - CMPL R10, $0x40 - JBE two_byte_offset_short_match_nolit_calcBlockSizeSmall - LEAL -60(R10), R10 - ADDQ $0x03, CX - JMP two_byte_offset_match_nolit_calcBlockSizeSmall - -two_byte_offset_short_match_nolit_calcBlockSizeSmall: - MOVL R10, SI - SHLL $0x02, SI - CMPL R10, $0x0c - JAE emit_copy_three_match_nolit_calcBlockSizeSmall - ADDQ $0x02, CX - JMP match_nolit_emitcopy_end_calcBlockSizeSmall - -emit_copy_three_match_nolit_calcBlockSizeSmall: - ADDQ $0x03, CX - -match_nolit_emitcopy_end_calcBlockSizeSmall: - CMPL DX, 8(SP) - JAE emit_remainder_calcBlockSizeSmall - MOVQ -2(BX)(DX*1), DI - CMPQ CX, (SP) - JB match_nolit_dst_ok_calcBlockSizeSmall - MOVQ $0x00000000, ret+32(FP) - RET - -match_nolit_dst_ok_calcBlockSizeSmall: - MOVQ $0x9e3779b1, R9 - MOVQ DI, R8 - SHRQ $0x10, DI - MOVQ DI, SI - SHLQ $0x20, 
R8 - IMULQ R9, R8 - SHRQ $0x37, R8 - SHLQ $0x20, SI - IMULQ R9, SI - SHRQ $0x37, SI - LEAL -2(DX), R9 - LEAQ (AX)(SI*4), R10 - MOVL (R10), SI - MOVL R9, (AX)(R8*4) - MOVL DX, (R10) - CMPL (BX)(SI*1), DI - JEQ match_nolit_loop_calcBlockSizeSmall - INCL DX - JMP search_loop_calcBlockSizeSmall - -emit_remainder_calcBlockSizeSmall: - MOVQ src_len+8(FP), AX - SUBL 12(SP), AX - LEAQ 3(CX)(AX*1), AX - CMPQ AX, (SP) - JB emit_remainder_ok_calcBlockSizeSmall - MOVQ $0x00000000, ret+32(FP) - RET - -emit_remainder_ok_calcBlockSizeSmall: - MOVQ src_len+8(FP), AX - MOVL 12(SP), DX - CMPL DX, AX - JEQ emit_literal_done_emit_remainder_calcBlockSizeSmall - MOVL AX, SI - MOVL AX, 12(SP) - LEAQ (BX)(DX*1), AX - SUBL DX, SI - LEAL -1(SI), AX - CMPL AX, $0x3c - JB one_byte_emit_remainder_calcBlockSizeSmall - CMPL AX, $0x00000100 - JB two_bytes_emit_remainder_calcBlockSizeSmall - JB three_bytes_emit_remainder_calcBlockSizeSmall - -three_bytes_emit_remainder_calcBlockSizeSmall: - ADDQ $0x03, CX - JMP memmove_long_emit_remainder_calcBlockSizeSmall - -two_bytes_emit_remainder_calcBlockSizeSmall: - ADDQ $0x02, CX - CMPL AX, $0x40 - JB memmove_emit_remainder_calcBlockSizeSmall - JMP memmove_long_emit_remainder_calcBlockSizeSmall - -one_byte_emit_remainder_calcBlockSizeSmall: - ADDQ $0x01, CX - -memmove_emit_remainder_calcBlockSizeSmall: - LEAQ (CX)(SI*1), AX - MOVQ AX, CX - JMP emit_literal_done_emit_remainder_calcBlockSizeSmall - -memmove_long_emit_remainder_calcBlockSizeSmall: - LEAQ (CX)(SI*1), AX - MOVQ AX, CX - -emit_literal_done_emit_remainder_calcBlockSizeSmall: - MOVQ CX, ret+32(FP) - RET - -// func emitLiteral(dst []byte, lit []byte) int -// Requires: SSE2 -TEXT ·emitLiteral(SB), NOSPLIT, $0-56 - MOVQ lit_len+32(FP), DX - MOVQ dst_base+0(FP), AX - MOVQ lit_base+24(FP), CX - TESTQ DX, DX - JZ emit_literal_end_standalone_skip - MOVL DX, BX - LEAL -1(DX), SI - CMPL SI, $0x3c - JB one_byte_standalone - CMPL SI, $0x00000100 - JB two_bytes_standalone - CMPL SI, $0x00010000 - JB 
three_bytes_standalone - CMPL SI, $0x01000000 - JB four_bytes_standalone - MOVB $0xfc, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP memmove_long_standalone - -four_bytes_standalone: - MOVL SI, DI - SHRL $0x10, DI - MOVB $0xf8, (AX) - MOVW SI, 1(AX) - MOVB DI, 3(AX) - ADDQ $0x04, BX - ADDQ $0x04, AX - JMP memmove_long_standalone - -three_bytes_standalone: - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - JMP memmove_long_standalone - -two_bytes_standalone: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - CMPL SI, $0x40 - JB memmove_standalone - JMP memmove_long_standalone - -one_byte_standalone: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, BX - ADDQ $0x01, AX - -memmove_standalone: - // genMemMoveShort - CMPQ DX, $0x03 - JB emit_lit_memmove_standalone_memmove_move_1or2 - JE emit_lit_memmove_standalone_memmove_move_3 - CMPQ DX, $0x08 - JB emit_lit_memmove_standalone_memmove_move_4through7 - CMPQ DX, $0x10 - JBE emit_lit_memmove_standalone_memmove_move_8through16 - CMPQ DX, $0x20 - JBE emit_lit_memmove_standalone_memmove_move_17through32 - JMP emit_lit_memmove_standalone_memmove_move_33through64 - -emit_lit_memmove_standalone_memmove_move_1or2: - MOVB (CX), SI - MOVB -1(CX)(DX*1), CL - MOVB SI, (AX) - MOVB CL, -1(AX)(DX*1) - JMP emit_literal_end_standalone - -emit_lit_memmove_standalone_memmove_move_3: - MOVW (CX), SI - MOVB 2(CX), CL - MOVW SI, (AX) - MOVB CL, 2(AX) - JMP emit_literal_end_standalone - -emit_lit_memmove_standalone_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(DX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(DX*1) - JMP emit_literal_end_standalone - -emit_lit_memmove_standalone_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(DX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(DX*1) - JMP emit_literal_end_standalone - -emit_lit_memmove_standalone_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(DX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(DX*1) - JMP 
emit_literal_end_standalone - -emit_lit_memmove_standalone_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(DX*1), X2 - MOVOU -16(CX)(DX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DX*1) - MOVOU X3, -16(AX)(DX*1) - JMP emit_literal_end_standalone - JMP emit_literal_end_standalone - -memmove_long_standalone: - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(DX*1), X2 - MOVOU -16(CX)(DX*1), X3 - MOVQ DX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_standalonelarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_standalonelarge_big_loop_back - -emit_lit_memmove_long_standalonelarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ DX, R8 - JAE emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DX*1) - MOVOU X3, -16(AX)(DX*1) - JMP emit_literal_end_standalone - JMP emit_literal_end_standalone - -emit_literal_end_standalone_skip: - XORQ BX, BX - -emit_literal_end_standalone: - MOVQ BX, ret+48(FP) - RET - -// func emitRepeat(dst []byte, offset int, length int) int -TEXT ·emitRepeat(SB), NOSPLIT, $0-48 - XORQ BX, BX - MOVQ dst_base+0(FP), AX - MOVQ offset+24(FP), CX - MOVQ length+32(FP), DX - - // emitRepeat -emit_repeat_again_standalone: - MOVL DX, SI - LEAL -4(DX), DX - CMPL SI, $0x08 - JBE repeat_two_standalone - CMPL SI, $0x0c - JAE cant_repeat_two_offset_standalone - CMPL CX, $0x00000800 - JB repeat_two_offset_standalone - -cant_repeat_two_offset_standalone: - CMPL DX, $0x00000104 - JB repeat_three_standalone - CMPL DX, 
$0x00010100 - JB repeat_four_standalone - CMPL DX, $0x0100ffff - JB repeat_five_standalone - LEAL -16842747(DX), DX - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - ADDQ $0x05, BX - JMP emit_repeat_again_standalone - -repeat_five_standalone: - LEAL -65536(DX), DX - MOVL DX, CX - MOVW $0x001d, (AX) - MOVW DX, 2(AX) - SARL $0x10, CX - MOVB CL, 4(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP gen_emit_repeat_end - -repeat_four_standalone: - LEAL -256(DX), DX - MOVW $0x0019, (AX) - MOVW DX, 2(AX) - ADDQ $0x04, BX - ADDQ $0x04, AX - JMP gen_emit_repeat_end - -repeat_three_standalone: - LEAL -4(DX), DX - MOVW $0x0015, (AX) - MOVB DL, 2(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - JMP gen_emit_repeat_end - -repeat_two_standalone: - SHLL $0x02, DX - ORL $0x01, DX - MOVW DX, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_repeat_end - -repeat_two_offset_standalone: - XORQ SI, SI - LEAL 1(SI)(DX*4), DX - MOVB CL, 1(AX) - SARL $0x08, CX - SHLL $0x05, CX - ORL CX, DX - MOVB DL, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - -gen_emit_repeat_end: - MOVQ BX, ret+40(FP) - RET - -// func emitCopy(dst []byte, offset int, length int) int -TEXT ·emitCopy(SB), NOSPLIT, $0-48 - XORQ BX, BX - MOVQ dst_base+0(FP), AX - MOVQ offset+24(FP), CX - MOVQ length+32(FP), DX - - // emitCopy - CMPL CX, $0x00010000 - JB two_byte_offset_standalone - CMPL DX, $0x40 - JBE four_bytes_remain_standalone - MOVB $0xff, (AX) - MOVL CX, 1(AX) - LEAL -64(DX), DX - ADDQ $0x05, BX - ADDQ $0x05, AX - CMPL DX, $0x04 - JB four_bytes_remain_standalone - - // emitRepeat -emit_repeat_again_standalone_emit_copy: - MOVL DX, SI - LEAL -4(DX), DX - CMPL SI, $0x08 - JBE repeat_two_standalone_emit_copy - CMPL SI, $0x0c - JAE cant_repeat_two_offset_standalone_emit_copy - CMPL CX, $0x00000800 - JB repeat_two_offset_standalone_emit_copy - -cant_repeat_two_offset_standalone_emit_copy: - CMPL DX, $0x00000104 - JB repeat_three_standalone_emit_copy - CMPL DX, $0x00010100 - JB repeat_four_standalone_emit_copy - CMPL DX, 
$0x0100ffff - JB repeat_five_standalone_emit_copy - LEAL -16842747(DX), DX - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - ADDQ $0x05, BX - JMP emit_repeat_again_standalone_emit_copy - -repeat_five_standalone_emit_copy: - LEAL -65536(DX), DX - MOVL DX, CX - MOVW $0x001d, (AX) - MOVW DX, 2(AX) - SARL $0x10, CX - MOVB CL, 4(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP gen_emit_copy_end - -repeat_four_standalone_emit_copy: - LEAL -256(DX), DX - MOVW $0x0019, (AX) - MOVW DX, 2(AX) - ADDQ $0x04, BX - ADDQ $0x04, AX - JMP gen_emit_copy_end - -repeat_three_standalone_emit_copy: - LEAL -4(DX), DX - MOVW $0x0015, (AX) - MOVB DL, 2(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - JMP gen_emit_copy_end - -repeat_two_standalone_emit_copy: - SHLL $0x02, DX - ORL $0x01, DX - MOVW DX, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end - -repeat_two_offset_standalone_emit_copy: - XORQ SI, SI - LEAL 1(SI)(DX*4), DX - MOVB CL, 1(AX) - SARL $0x08, CX - SHLL $0x05, CX - ORL CX, DX - MOVB DL, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end - -four_bytes_remain_standalone: - TESTL DX, DX - JZ gen_emit_copy_end - XORL SI, SI - LEAL -1(SI)(DX*4), DX - MOVB DL, (AX) - MOVL CX, 1(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP gen_emit_copy_end - -two_byte_offset_standalone: - CMPL DX, $0x40 - JBE two_byte_offset_short_standalone - CMPL CX, $0x00000800 - JAE long_offset_short_standalone - MOVL $0x00000001, SI - LEAL 16(SI), SI - MOVB CL, 1(AX) - MOVL CX, DI - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - SUBL $0x08, DX - - // emitRepeat - LEAL -4(DX), DX - JMP cant_repeat_two_offset_standalone_emit_copy_short_2b - -emit_repeat_again_standalone_emit_copy_short_2b: - MOVL DX, SI - LEAL -4(DX), DX - CMPL SI, $0x08 - JBE repeat_two_standalone_emit_copy_short_2b - CMPL SI, $0x0c - JAE cant_repeat_two_offset_standalone_emit_copy_short_2b - CMPL CX, $0x00000800 - JB repeat_two_offset_standalone_emit_copy_short_2b - 
-cant_repeat_two_offset_standalone_emit_copy_short_2b: - CMPL DX, $0x00000104 - JB repeat_three_standalone_emit_copy_short_2b - CMPL DX, $0x00010100 - JB repeat_four_standalone_emit_copy_short_2b - CMPL DX, $0x0100ffff - JB repeat_five_standalone_emit_copy_short_2b - LEAL -16842747(DX), DX - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - ADDQ $0x05, BX - JMP emit_repeat_again_standalone_emit_copy_short_2b - -repeat_five_standalone_emit_copy_short_2b: - LEAL -65536(DX), DX - MOVL DX, CX - MOVW $0x001d, (AX) - MOVW DX, 2(AX) - SARL $0x10, CX - MOVB CL, 4(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP gen_emit_copy_end - -repeat_four_standalone_emit_copy_short_2b: - LEAL -256(DX), DX - MOVW $0x0019, (AX) - MOVW DX, 2(AX) - ADDQ $0x04, BX - ADDQ $0x04, AX - JMP gen_emit_copy_end - -repeat_three_standalone_emit_copy_short_2b: - LEAL -4(DX), DX - MOVW $0x0015, (AX) - MOVB DL, 2(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - JMP gen_emit_copy_end - -repeat_two_standalone_emit_copy_short_2b: - SHLL $0x02, DX - ORL $0x01, DX - MOVW DX, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end - -repeat_two_offset_standalone_emit_copy_short_2b: - XORQ SI, SI - LEAL 1(SI)(DX*4), DX - MOVB CL, 1(AX) - SARL $0x08, CX - SHLL $0x05, CX - ORL CX, DX - MOVB DL, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end - -long_offset_short_standalone: - MOVB $0xee, (AX) - MOVW CX, 1(AX) - LEAL -60(DX), DX - ADDQ $0x03, AX - ADDQ $0x03, BX - - // emitRepeat -emit_repeat_again_standalone_emit_copy_short: - MOVL DX, SI - LEAL -4(DX), DX - CMPL SI, $0x08 - JBE repeat_two_standalone_emit_copy_short - CMPL SI, $0x0c - JAE cant_repeat_two_offset_standalone_emit_copy_short - CMPL CX, $0x00000800 - JB repeat_two_offset_standalone_emit_copy_short - -cant_repeat_two_offset_standalone_emit_copy_short: - CMPL DX, $0x00000104 - JB repeat_three_standalone_emit_copy_short - CMPL DX, $0x00010100 - JB repeat_four_standalone_emit_copy_short - CMPL DX, $0x0100ffff - JB 
repeat_five_standalone_emit_copy_short - LEAL -16842747(DX), DX - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - ADDQ $0x05, BX - JMP emit_repeat_again_standalone_emit_copy_short - -repeat_five_standalone_emit_copy_short: - LEAL -65536(DX), DX - MOVL DX, CX - MOVW $0x001d, (AX) - MOVW DX, 2(AX) - SARL $0x10, CX - MOVB CL, 4(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP gen_emit_copy_end - -repeat_four_standalone_emit_copy_short: - LEAL -256(DX), DX - MOVW $0x0019, (AX) - MOVW DX, 2(AX) - ADDQ $0x04, BX - ADDQ $0x04, AX - JMP gen_emit_copy_end - -repeat_three_standalone_emit_copy_short: - LEAL -4(DX), DX - MOVW $0x0015, (AX) - MOVB DL, 2(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - JMP gen_emit_copy_end - -repeat_two_standalone_emit_copy_short: - SHLL $0x02, DX - ORL $0x01, DX - MOVW DX, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end - -repeat_two_offset_standalone_emit_copy_short: - XORQ SI, SI - LEAL 1(SI)(DX*4), DX - MOVB CL, 1(AX) - SARL $0x08, CX - SHLL $0x05, CX - ORL CX, DX - MOVB DL, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end - -two_byte_offset_short_standalone: - MOVL DX, SI - SHLL $0x02, SI - CMPL DX, $0x0c - JAE emit_copy_three_standalone - CMPL CX, $0x00000800 - JAE emit_copy_three_standalone - LEAL -15(SI), SI - MOVB CL, 1(AX) - SHRL $0x08, CX - SHLL $0x05, CX - ORL CX, SI - MOVB SI, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end - -emit_copy_three_standalone: - LEAL -2(SI), SI - MOVB SI, (AX) - MOVW CX, 1(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - -gen_emit_copy_end: - MOVQ BX, ret+40(FP) - RET - -// func emitCopyNoRepeat(dst []byte, offset int, length int) int -TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48 - XORQ BX, BX - MOVQ dst_base+0(FP), AX - MOVQ offset+24(FP), CX - MOVQ length+32(FP), DX - - // emitCopy - CMPL CX, $0x00010000 - JB two_byte_offset_standalone_snappy - -four_bytes_loop_back_standalone_snappy: - CMPL DX, $0x40 - JBE four_bytes_remain_standalone_snappy - MOVB $0xff, (AX) - MOVL 
CX, 1(AX) - LEAL -64(DX), DX - ADDQ $0x05, BX - ADDQ $0x05, AX - CMPL DX, $0x04 - JB four_bytes_remain_standalone_snappy - JMP four_bytes_loop_back_standalone_snappy - -four_bytes_remain_standalone_snappy: - TESTL DX, DX - JZ gen_emit_copy_end_snappy - XORL SI, SI - LEAL -1(SI)(DX*4), DX - MOVB DL, (AX) - MOVL CX, 1(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP gen_emit_copy_end_snappy - -two_byte_offset_standalone_snappy: - CMPL DX, $0x40 - JBE two_byte_offset_short_standalone_snappy - MOVB $0xee, (AX) - MOVW CX, 1(AX) - LEAL -60(DX), DX - ADDQ $0x03, AX - ADDQ $0x03, BX - JMP two_byte_offset_standalone_snappy - -two_byte_offset_short_standalone_snappy: - MOVL DX, SI - SHLL $0x02, SI - CMPL DX, $0x0c - JAE emit_copy_three_standalone_snappy - CMPL CX, $0x00000800 - JAE emit_copy_three_standalone_snappy - LEAL -15(SI), SI - MOVB CL, 1(AX) - SHRL $0x08, CX - SHLL $0x05, CX - ORL CX, SI - MOVB SI, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end_snappy - -emit_copy_three_standalone_snappy: - LEAL -2(SI), SI - MOVB SI, (AX) - MOVW CX, 1(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - -gen_emit_copy_end_snappy: - MOVQ BX, ret+40(FP) - RET - -// func matchLen(a []byte, b []byte) int -// Requires: BMI -TEXT ·matchLen(SB), NOSPLIT, $0-56 - MOVQ a_base+0(FP), AX - MOVQ b_base+24(FP), CX - MOVQ a_len+8(FP), DX - - // matchLen - XORL SI, SI - -matchlen_loopback_16_standalone: - CMPL DX, $0x10 - JB matchlen_match8_standalone - MOVQ (AX)(SI*1), BX - MOVQ 8(AX)(SI*1), DI - XORQ (CX)(SI*1), BX - JNZ matchlen_bsf_8_standalone - XORQ 8(CX)(SI*1), DI - JNZ matchlen_bsf_16standalone - LEAL -16(DX), DX - LEAL 16(SI), SI - JMP matchlen_loopback_16_standalone - -matchlen_bsf_16standalone: -#ifdef GOAMD64_v3 - TZCNTQ DI, DI - -#else - BSFQ DI, DI - -#endif - SARQ $0x03, DI - LEAL 8(SI)(DI*1), SI - JMP gen_match_len_end - -matchlen_match8_standalone: - CMPL DX, $0x08 - JB matchlen_match4_standalone - MOVQ (AX)(SI*1), BX - XORQ (CX)(SI*1), BX - JNZ matchlen_bsf_8_standalone - LEAL 
-8(DX), DX - LEAL 8(SI), SI - JMP matchlen_match4_standalone - -matchlen_bsf_8_standalone: -#ifdef GOAMD64_v3 - TZCNTQ BX, BX - -#else - BSFQ BX, BX - -#endif - SARQ $0x03, BX - LEAL (SI)(BX*1), SI - JMP gen_match_len_end - -matchlen_match4_standalone: - CMPL DX, $0x04 - JB matchlen_match2_standalone - MOVL (AX)(SI*1), BX - CMPL (CX)(SI*1), BX - JNE matchlen_match2_standalone - LEAL -4(DX), DX - LEAL 4(SI), SI - -matchlen_match2_standalone: - CMPL DX, $0x01 - JE matchlen_match1_standalone - JB gen_match_len_end - MOVW (AX)(SI*1), BX - CMPW (CX)(SI*1), BX - JNE matchlen_match1_standalone - LEAL 2(SI), SI - SUBL $0x02, DX - JZ gen_match_len_end - -matchlen_match1_standalone: - MOVB (AX)(SI*1), BL - CMPB (CX)(SI*1), BL - JNE gen_match_len_end - LEAL 1(SI), SI - -gen_match_len_end: - MOVQ SI, ret+48(FP) - RET - -// func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) -// Requires: SSE2 -TEXT ·cvtLZ4BlockAsm(SB), NOSPLIT, $0-64 - XORQ SI, SI - MOVQ dst_base+0(FP), AX - MOVQ dst_len+8(FP), CX - MOVQ src_base+24(FP), DX - MOVQ src_len+32(FP), BX - LEAQ (DX)(BX*1), BX - LEAQ -8(AX)(CX*1), CX - XORQ DI, DI - -lz4_s2_loop: - CMPQ DX, BX - JAE lz4_s2_corrupt - CMPQ AX, CX - JAE lz4_s2_dstfull - MOVBQZX (DX), R8 - MOVQ R8, R9 - MOVQ R8, R10 - SHRQ $0x04, R9 - ANDQ $0x0f, R10 - CMPQ R8, $0xf0 - JB lz4_s2_ll_end - -lz4_s2_ll_loop: - INCQ DX - CMPQ DX, BX - JAE lz4_s2_corrupt - MOVBQZX (DX), R8 - ADDQ R8, R9 - CMPQ R8, $0xff - JEQ lz4_s2_ll_loop - -lz4_s2_ll_end: - LEAQ (DX)(R9*1), R8 - ADDQ $0x04, R10 - CMPQ R8, BX - JAE lz4_s2_corrupt - INCQ DX - INCQ R8 - TESTQ R9, R9 - JZ lz4_s2_lits_done - LEAQ (AX)(R9*1), R11 - CMPQ R11, CX - JAE lz4_s2_dstfull - ADDQ R9, SI - LEAL -1(R9), R11 - CMPL R11, $0x3c - JB one_byte_lz4_s2 - CMPL R11, $0x00000100 - JB two_bytes_lz4_s2 - CMPL R11, $0x00010000 - JB three_bytes_lz4_s2 - CMPL R11, $0x01000000 - JB four_bytes_lz4_s2 - MOVB $0xfc, (AX) - MOVL R11, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_lz4_s2 - 
-four_bytes_lz4_s2: - MOVL R11, R12 - SHRL $0x10, R12 - MOVB $0xf8, (AX) - MOVW R11, 1(AX) - MOVB R12, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_lz4_s2 - -three_bytes_lz4_s2: - MOVB $0xf4, (AX) - MOVW R11, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_lz4_s2 - -two_bytes_lz4_s2: - MOVB $0xf0, (AX) - MOVB R11, 1(AX) - ADDQ $0x02, AX - CMPL R11, $0x40 - JB memmove_lz4_s2 - JMP memmove_long_lz4_s2 - -one_byte_lz4_s2: - SHLB $0x02, R11 - MOVB R11, (AX) - ADDQ $0x01, AX - -memmove_lz4_s2: - LEAQ (AX)(R9*1), R11 - - // genMemMoveShort - CMPQ R9, $0x08 - JBE emit_lit_memmove_lz4_s2_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_lz4_s2_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_lz4_s2_memmove_move_17through32 - JMP emit_lit_memmove_lz4_s2_memmove_move_33through64 - -emit_lit_memmove_lz4_s2_memmove_move_8: - MOVQ (DX), R12 - MOVQ R12, (AX) - JMP memmove_end_copy_lz4_s2 - -emit_lit_memmove_lz4_s2_memmove_move_8through16: - MOVQ (DX), R12 - MOVQ -8(DX)(R9*1), DX - MOVQ R12, (AX) - MOVQ DX, -8(AX)(R9*1) - JMP memmove_end_copy_lz4_s2 - -emit_lit_memmove_lz4_s2_memmove_move_17through32: - MOVOU (DX), X0 - MOVOU -16(DX)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_lz4_s2 - -emit_lit_memmove_lz4_s2_memmove_move_33through64: - MOVOU (DX), X0 - MOVOU 16(DX), X1 - MOVOU -32(DX)(R9*1), X2 - MOVOU -16(DX)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_lz4_s2: - MOVQ R11, AX - JMP lz4_s2_lits_emit_done - -memmove_long_lz4_s2: - LEAQ (AX)(R9*1), R11 - - // genMemMoveLong - MOVOU (DX), X0 - MOVOU 16(DX), X1 - MOVOU -32(DX)(R9*1), X2 - MOVOU -16(DX)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R12 - ANDL $0x0000001f, R12 - MOVQ $0x00000040, R14 - SUBQ R12, R14 - DECQ R13 - JA emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32 - LEAQ -32(DX)(R14*1), R12 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_lz4_s2large_big_loop_back: - MOVOU (R12), X4 - 
MOVOU 16(R12), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R12 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_lz4_s2large_big_loop_back - -emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32: - MOVOU -32(DX)(R14*1), X4 - MOVOU -16(DX)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ R11, AX - -lz4_s2_lits_emit_done: - MOVQ R8, DX - -lz4_s2_lits_done: - CMPQ DX, BX - JNE lz4_s2_match - CMPQ R10, $0x04 - JEQ lz4_s2_done - JMP lz4_s2_corrupt - -lz4_s2_match: - LEAQ 2(DX), R8 - CMPQ R8, BX - JAE lz4_s2_corrupt - MOVWQZX (DX), R9 - MOVQ R8, DX - TESTQ R9, R9 - JZ lz4_s2_corrupt - CMPQ R9, SI - JA lz4_s2_corrupt - CMPQ R10, $0x13 - JNE lz4_s2_ml_done - -lz4_s2_ml_loop: - MOVBQZX (DX), R8 - INCQ DX - ADDQ R8, R10 - CMPQ DX, BX - JAE lz4_s2_corrupt - CMPQ R8, $0xff - JEQ lz4_s2_ml_loop - -lz4_s2_ml_done: - ADDQ R10, SI - CMPQ R9, DI - JNE lz4_s2_docopy - - // emitRepeat -emit_repeat_again_lz4_s2: - MOVL R10, R8 - LEAL -4(R10), R10 - CMPL R8, $0x08 - JBE repeat_two_lz4_s2 - CMPL R8, $0x0c - JAE cant_repeat_two_offset_lz4_s2 - CMPL R9, $0x00000800 - JB repeat_two_offset_lz4_s2 - -cant_repeat_two_offset_lz4_s2: - CMPL R10, $0x00000104 - JB repeat_three_lz4_s2 - CMPL R10, $0x00010100 - JB repeat_four_lz4_s2 - CMPL R10, $0x0100ffff - JB repeat_five_lz4_s2 - LEAL -16842747(R10), R10 - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_lz4_s2 - -repeat_five_lz4_s2: - LEAL -65536(R10), R10 - MOVL R10, R9 - MOVW $0x001d, (AX) - MOVW R10, 2(AX) - SARL $0x10, R9 - MOVB R9, 4(AX) - ADDQ $0x05, AX - JMP lz4_s2_loop - -repeat_four_lz4_s2: - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP lz4_s2_loop - -repeat_three_lz4_s2: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) 
- ADDQ $0x03, AX - JMP lz4_s2_loop - -repeat_two_lz4_s2: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP lz4_s2_loop - -repeat_two_offset_lz4_s2: - XORQ R8, R8 - LEAL 1(R8)(R10*4), R10 - MOVB R9, 1(AX) - SARL $0x08, R9 - SHLL $0x05, R9 - ORL R9, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP lz4_s2_loop - -lz4_s2_docopy: - MOVQ R9, DI - - // emitCopy - CMPL R10, $0x40 - JBE two_byte_offset_short_lz4_s2 - CMPL R9, $0x00000800 - JAE long_offset_short_lz4_s2 - MOVL $0x00000001, R8 - LEAL 16(R8), R8 - MOVB R9, 1(AX) - MOVL R9, R11 - SHRL $0x08, R11 - SHLL $0x05, R11 - ORL R11, R8 - MOVB R8, (AX) - ADDQ $0x02, AX - SUBL $0x08, R10 - - // emitRepeat - LEAL -4(R10), R10 - JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b - -emit_repeat_again_lz4_s2_emit_copy_short_2b: - MOVL R10, R8 - LEAL -4(R10), R10 - CMPL R8, $0x08 - JBE repeat_two_lz4_s2_emit_copy_short_2b - CMPL R8, $0x0c - JAE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b - CMPL R9, $0x00000800 - JB repeat_two_offset_lz4_s2_emit_copy_short_2b - -cant_repeat_two_offset_lz4_s2_emit_copy_short_2b: - CMPL R10, $0x00000104 - JB repeat_three_lz4_s2_emit_copy_short_2b - CMPL R10, $0x00010100 - JB repeat_four_lz4_s2_emit_copy_short_2b - CMPL R10, $0x0100ffff - JB repeat_five_lz4_s2_emit_copy_short_2b - LEAL -16842747(R10), R10 - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_lz4_s2_emit_copy_short_2b - -repeat_five_lz4_s2_emit_copy_short_2b: - LEAL -65536(R10), R10 - MOVL R10, R9 - MOVW $0x001d, (AX) - MOVW R10, 2(AX) - SARL $0x10, R9 - MOVB R9, 4(AX) - ADDQ $0x05, AX - JMP lz4_s2_loop - -repeat_four_lz4_s2_emit_copy_short_2b: - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP lz4_s2_loop - -repeat_three_lz4_s2_emit_copy_short_2b: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP lz4_s2_loop - -repeat_two_lz4_s2_emit_copy_short_2b: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ 
$0x02, AX - JMP lz4_s2_loop - -repeat_two_offset_lz4_s2_emit_copy_short_2b: - XORQ R8, R8 - LEAL 1(R8)(R10*4), R10 - MOVB R9, 1(AX) - SARL $0x08, R9 - SHLL $0x05, R9 - ORL R9, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP lz4_s2_loop - -long_offset_short_lz4_s2: - MOVB $0xee, (AX) - MOVW R9, 1(AX) - LEAL -60(R10), R10 - ADDQ $0x03, AX - - // emitRepeat -emit_repeat_again_lz4_s2_emit_copy_short: - MOVL R10, R8 - LEAL -4(R10), R10 - CMPL R8, $0x08 - JBE repeat_two_lz4_s2_emit_copy_short - CMPL R8, $0x0c - JAE cant_repeat_two_offset_lz4_s2_emit_copy_short - CMPL R9, $0x00000800 - JB repeat_two_offset_lz4_s2_emit_copy_short - -cant_repeat_two_offset_lz4_s2_emit_copy_short: - CMPL R10, $0x00000104 - JB repeat_three_lz4_s2_emit_copy_short - CMPL R10, $0x00010100 - JB repeat_four_lz4_s2_emit_copy_short - CMPL R10, $0x0100ffff - JB repeat_five_lz4_s2_emit_copy_short - LEAL -16842747(R10), R10 - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_lz4_s2_emit_copy_short - -repeat_five_lz4_s2_emit_copy_short: - LEAL -65536(R10), R10 - MOVL R10, R9 - MOVW $0x001d, (AX) - MOVW R10, 2(AX) - SARL $0x10, R9 - MOVB R9, 4(AX) - ADDQ $0x05, AX - JMP lz4_s2_loop - -repeat_four_lz4_s2_emit_copy_short: - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP lz4_s2_loop - -repeat_three_lz4_s2_emit_copy_short: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP lz4_s2_loop - -repeat_two_lz4_s2_emit_copy_short: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP lz4_s2_loop - -repeat_two_offset_lz4_s2_emit_copy_short: - XORQ R8, R8 - LEAL 1(R8)(R10*4), R10 - MOVB R9, 1(AX) - SARL $0x08, R9 - SHLL $0x05, R9 - ORL R9, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP lz4_s2_loop - -two_byte_offset_short_lz4_s2: - MOVL R10, R8 - SHLL $0x02, R8 - CMPL R10, $0x0c - JAE emit_copy_three_lz4_s2 - CMPL R9, $0x00000800 - JAE emit_copy_three_lz4_s2 - LEAL -15(R8), R8 - MOVB R9, 1(AX) - SHRL $0x08, 
R9 - SHLL $0x05, R9 - ORL R9, R8 - MOVB R8, (AX) - ADDQ $0x02, AX - JMP lz4_s2_loop - -emit_copy_three_lz4_s2: - LEAL -2(R8), R8 - MOVB R8, (AX) - MOVW R9, 1(AX) - ADDQ $0x03, AX - JMP lz4_s2_loop - -lz4_s2_done: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ SI, uncompressed+48(FP) - MOVQ AX, dstUsed+56(FP) - RET - -lz4_s2_corrupt: - XORQ AX, AX - LEAQ -1(AX), SI - MOVQ SI, uncompressed+48(FP) - RET - -lz4_s2_dstfull: - XORQ AX, AX - LEAQ -2(AX), SI - MOVQ SI, uncompressed+48(FP) - RET - -// func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) -// Requires: SSE2 -TEXT ·cvtLZ4sBlockAsm(SB), NOSPLIT, $0-64 - XORQ SI, SI - MOVQ dst_base+0(FP), AX - MOVQ dst_len+8(FP), CX - MOVQ src_base+24(FP), DX - MOVQ src_len+32(FP), BX - LEAQ (DX)(BX*1), BX - LEAQ -8(AX)(CX*1), CX - XORQ DI, DI - -lz4s_s2_loop: - CMPQ DX, BX - JAE lz4s_s2_corrupt - CMPQ AX, CX - JAE lz4s_s2_dstfull - MOVBQZX (DX), R8 - MOVQ R8, R9 - MOVQ R8, R10 - SHRQ $0x04, R9 - ANDQ $0x0f, R10 - CMPQ R8, $0xf0 - JB lz4s_s2_ll_end - -lz4s_s2_ll_loop: - INCQ DX - CMPQ DX, BX - JAE lz4s_s2_corrupt - MOVBQZX (DX), R8 - ADDQ R8, R9 - CMPQ R8, $0xff - JEQ lz4s_s2_ll_loop - -lz4s_s2_ll_end: - LEAQ (DX)(R9*1), R8 - ADDQ $0x03, R10 - CMPQ R8, BX - JAE lz4s_s2_corrupt - INCQ DX - INCQ R8 - TESTQ R9, R9 - JZ lz4s_s2_lits_done - LEAQ (AX)(R9*1), R11 - CMPQ R11, CX - JAE lz4s_s2_dstfull - ADDQ R9, SI - LEAL -1(R9), R11 - CMPL R11, $0x3c - JB one_byte_lz4s_s2 - CMPL R11, $0x00000100 - JB two_bytes_lz4s_s2 - CMPL R11, $0x00010000 - JB three_bytes_lz4s_s2 - CMPL R11, $0x01000000 - JB four_bytes_lz4s_s2 - MOVB $0xfc, (AX) - MOVL R11, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_lz4s_s2 - -four_bytes_lz4s_s2: - MOVL R11, R12 - SHRL $0x10, R12 - MOVB $0xf8, (AX) - MOVW R11, 1(AX) - MOVB R12, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_lz4s_s2 - -three_bytes_lz4s_s2: - MOVB $0xf4, (AX) - MOVW R11, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_lz4s_s2 - -two_bytes_lz4s_s2: - MOVB $0xf0, (AX) - MOVB R11, 1(AX) - ADDQ 
$0x02, AX - CMPL R11, $0x40 - JB memmove_lz4s_s2 - JMP memmove_long_lz4s_s2 - -one_byte_lz4s_s2: - SHLB $0x02, R11 - MOVB R11, (AX) - ADDQ $0x01, AX - -memmove_lz4s_s2: - LEAQ (AX)(R9*1), R11 - - // genMemMoveShort - CMPQ R9, $0x08 - JBE emit_lit_memmove_lz4s_s2_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_lz4s_s2_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_lz4s_s2_memmove_move_17through32 - JMP emit_lit_memmove_lz4s_s2_memmove_move_33through64 - -emit_lit_memmove_lz4s_s2_memmove_move_8: - MOVQ (DX), R12 - MOVQ R12, (AX) - JMP memmove_end_copy_lz4s_s2 - -emit_lit_memmove_lz4s_s2_memmove_move_8through16: - MOVQ (DX), R12 - MOVQ -8(DX)(R9*1), DX - MOVQ R12, (AX) - MOVQ DX, -8(AX)(R9*1) - JMP memmove_end_copy_lz4s_s2 - -emit_lit_memmove_lz4s_s2_memmove_move_17through32: - MOVOU (DX), X0 - MOVOU -16(DX)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_lz4s_s2 - -emit_lit_memmove_lz4s_s2_memmove_move_33through64: - MOVOU (DX), X0 - MOVOU 16(DX), X1 - MOVOU -32(DX)(R9*1), X2 - MOVOU -16(DX)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_lz4s_s2: - MOVQ R11, AX - JMP lz4s_s2_lits_emit_done - -memmove_long_lz4s_s2: - LEAQ (AX)(R9*1), R11 - - // genMemMoveLong - MOVOU (DX), X0 - MOVOU 16(DX), X1 - MOVOU -32(DX)(R9*1), X2 - MOVOU -16(DX)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R12 - ANDL $0x0000001f, R12 - MOVQ $0x00000040, R14 - SUBQ R12, R14 - DECQ R13 - JA emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32 - LEAQ -32(DX)(R14*1), R12 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_lz4s_s2large_big_loop_back: - MOVOU (R12), X4 - MOVOU 16(R12), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R12 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_lz4s_s2large_big_loop_back - -emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32: - MOVOU -32(DX)(R14*1), X4 - MOVOU -16(DX)(R14*1), X5 - MOVOA X4, 
-32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ R11, AX - -lz4s_s2_lits_emit_done: - MOVQ R8, DX - -lz4s_s2_lits_done: - CMPQ DX, BX - JNE lz4s_s2_match - CMPQ R10, $0x03 - JEQ lz4s_s2_done - JMP lz4s_s2_corrupt - -lz4s_s2_match: - CMPQ R10, $0x03 - JEQ lz4s_s2_loop - LEAQ 2(DX), R8 - CMPQ R8, BX - JAE lz4s_s2_corrupt - MOVWQZX (DX), R9 - MOVQ R8, DX - TESTQ R9, R9 - JZ lz4s_s2_corrupt - CMPQ R9, SI - JA lz4s_s2_corrupt - CMPQ R10, $0x12 - JNE lz4s_s2_ml_done - -lz4s_s2_ml_loop: - MOVBQZX (DX), R8 - INCQ DX - ADDQ R8, R10 - CMPQ DX, BX - JAE lz4s_s2_corrupt - CMPQ R8, $0xff - JEQ lz4s_s2_ml_loop - -lz4s_s2_ml_done: - ADDQ R10, SI - CMPQ R9, DI - JNE lz4s_s2_docopy - - // emitRepeat -emit_repeat_again_lz4_s2: - MOVL R10, R8 - LEAL -4(R10), R10 - CMPL R8, $0x08 - JBE repeat_two_lz4_s2 - CMPL R8, $0x0c - JAE cant_repeat_two_offset_lz4_s2 - CMPL R9, $0x00000800 - JB repeat_two_offset_lz4_s2 - -cant_repeat_two_offset_lz4_s2: - CMPL R10, $0x00000104 - JB repeat_three_lz4_s2 - CMPL R10, $0x00010100 - JB repeat_four_lz4_s2 - CMPL R10, $0x0100ffff - JB repeat_five_lz4_s2 - LEAL -16842747(R10), R10 - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_lz4_s2 - -repeat_five_lz4_s2: - LEAL -65536(R10), R10 - MOVL R10, R9 - MOVW $0x001d, (AX) - MOVW R10, 2(AX) - SARL $0x10, R9 - MOVB R9, 4(AX) - ADDQ $0x05, AX - JMP lz4s_s2_loop - -repeat_four_lz4_s2: - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP lz4s_s2_loop - -repeat_three_lz4_s2: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP lz4s_s2_loop - -repeat_two_lz4_s2: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP lz4s_s2_loop - -repeat_two_offset_lz4_s2: - XORQ R8, R8 - LEAL 1(R8)(R10*4), R10 - MOVB R9, 1(AX) - SARL 
$0x08, R9 - SHLL $0x05, R9 - ORL R9, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP lz4s_s2_loop - -lz4s_s2_docopy: - MOVQ R9, DI - - // emitCopy - CMPL R10, $0x40 - JBE two_byte_offset_short_lz4_s2 - CMPL R9, $0x00000800 - JAE long_offset_short_lz4_s2 - MOVL $0x00000001, R8 - LEAL 16(R8), R8 - MOVB R9, 1(AX) - MOVL R9, R11 - SHRL $0x08, R11 - SHLL $0x05, R11 - ORL R11, R8 - MOVB R8, (AX) - ADDQ $0x02, AX - SUBL $0x08, R10 - - // emitRepeat - LEAL -4(R10), R10 - JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b - -emit_repeat_again_lz4_s2_emit_copy_short_2b: - MOVL R10, R8 - LEAL -4(R10), R10 - CMPL R8, $0x08 - JBE repeat_two_lz4_s2_emit_copy_short_2b - CMPL R8, $0x0c - JAE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b - CMPL R9, $0x00000800 - JB repeat_two_offset_lz4_s2_emit_copy_short_2b - -cant_repeat_two_offset_lz4_s2_emit_copy_short_2b: - CMPL R10, $0x00000104 - JB repeat_three_lz4_s2_emit_copy_short_2b - CMPL R10, $0x00010100 - JB repeat_four_lz4_s2_emit_copy_short_2b - CMPL R10, $0x0100ffff - JB repeat_five_lz4_s2_emit_copy_short_2b - LEAL -16842747(R10), R10 - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_lz4_s2_emit_copy_short_2b - -repeat_five_lz4_s2_emit_copy_short_2b: - LEAL -65536(R10), R10 - MOVL R10, R9 - MOVW $0x001d, (AX) - MOVW R10, 2(AX) - SARL $0x10, R9 - MOVB R9, 4(AX) - ADDQ $0x05, AX - JMP lz4s_s2_loop - -repeat_four_lz4_s2_emit_copy_short_2b: - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP lz4s_s2_loop - -repeat_three_lz4_s2_emit_copy_short_2b: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP lz4s_s2_loop - -repeat_two_lz4_s2_emit_copy_short_2b: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP lz4s_s2_loop - -repeat_two_offset_lz4_s2_emit_copy_short_2b: - XORQ R8, R8 - LEAL 1(R8)(R10*4), R10 - MOVB R9, 1(AX) - SARL $0x08, R9 - SHLL $0x05, R9 - ORL R9, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP lz4s_s2_loop 
- -long_offset_short_lz4_s2: - MOVB $0xee, (AX) - MOVW R9, 1(AX) - LEAL -60(R10), R10 - ADDQ $0x03, AX - - // emitRepeat -emit_repeat_again_lz4_s2_emit_copy_short: - MOVL R10, R8 - LEAL -4(R10), R10 - CMPL R8, $0x08 - JBE repeat_two_lz4_s2_emit_copy_short - CMPL R8, $0x0c - JAE cant_repeat_two_offset_lz4_s2_emit_copy_short - CMPL R9, $0x00000800 - JB repeat_two_offset_lz4_s2_emit_copy_short - -cant_repeat_two_offset_lz4_s2_emit_copy_short: - CMPL R10, $0x00000104 - JB repeat_three_lz4_s2_emit_copy_short - CMPL R10, $0x00010100 - JB repeat_four_lz4_s2_emit_copy_short - CMPL R10, $0x0100ffff - JB repeat_five_lz4_s2_emit_copy_short - LEAL -16842747(R10), R10 - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_lz4_s2_emit_copy_short - -repeat_five_lz4_s2_emit_copy_short: - LEAL -65536(R10), R10 - MOVL R10, R9 - MOVW $0x001d, (AX) - MOVW R10, 2(AX) - SARL $0x10, R9 - MOVB R9, 4(AX) - ADDQ $0x05, AX - JMP lz4s_s2_loop - -repeat_four_lz4_s2_emit_copy_short: - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP lz4s_s2_loop - -repeat_three_lz4_s2_emit_copy_short: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP lz4s_s2_loop - -repeat_two_lz4_s2_emit_copy_short: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP lz4s_s2_loop - -repeat_two_offset_lz4_s2_emit_copy_short: - XORQ R8, R8 - LEAL 1(R8)(R10*4), R10 - MOVB R9, 1(AX) - SARL $0x08, R9 - SHLL $0x05, R9 - ORL R9, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP lz4s_s2_loop - -two_byte_offset_short_lz4_s2: - MOVL R10, R8 - SHLL $0x02, R8 - CMPL R10, $0x0c - JAE emit_copy_three_lz4_s2 - CMPL R9, $0x00000800 - JAE emit_copy_three_lz4_s2 - LEAL -15(R8), R8 - MOVB R9, 1(AX) - SHRL $0x08, R9 - SHLL $0x05, R9 - ORL R9, R8 - MOVB R8, (AX) - ADDQ $0x02, AX - JMP lz4s_s2_loop - -emit_copy_three_lz4_s2: - LEAL -2(R8), R8 - MOVB R8, (AX) - MOVW R9, 1(AX) - ADDQ $0x03, AX - JMP lz4s_s2_loop - -lz4s_s2_done: - MOVQ 
dst_base+0(FP), CX - SUBQ CX, AX - MOVQ SI, uncompressed+48(FP) - MOVQ AX, dstUsed+56(FP) - RET - -lz4s_s2_corrupt: - XORQ AX, AX - LEAQ -1(AX), SI - MOVQ SI, uncompressed+48(FP) - RET - -lz4s_s2_dstfull: - XORQ AX, AX - LEAQ -2(AX), SI - MOVQ SI, uncompressed+48(FP) - RET - -// func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) -// Requires: SSE2 -TEXT ·cvtLZ4BlockSnappyAsm(SB), NOSPLIT, $0-64 - XORQ SI, SI - MOVQ dst_base+0(FP), AX - MOVQ dst_len+8(FP), CX - MOVQ src_base+24(FP), DX - MOVQ src_len+32(FP), BX - LEAQ (DX)(BX*1), BX - LEAQ -8(AX)(CX*1), CX - -lz4_snappy_loop: - CMPQ DX, BX - JAE lz4_snappy_corrupt - CMPQ AX, CX - JAE lz4_snappy_dstfull - MOVBQZX (DX), DI - MOVQ DI, R8 - MOVQ DI, R9 - SHRQ $0x04, R8 - ANDQ $0x0f, R9 - CMPQ DI, $0xf0 - JB lz4_snappy_ll_end - -lz4_snappy_ll_loop: - INCQ DX - CMPQ DX, BX - JAE lz4_snappy_corrupt - MOVBQZX (DX), DI - ADDQ DI, R8 - CMPQ DI, $0xff - JEQ lz4_snappy_ll_loop - -lz4_snappy_ll_end: - LEAQ (DX)(R8*1), DI - ADDQ $0x04, R9 - CMPQ DI, BX - JAE lz4_snappy_corrupt - INCQ DX - INCQ DI - TESTQ R8, R8 - JZ lz4_snappy_lits_done - LEAQ (AX)(R8*1), R10 - CMPQ R10, CX - JAE lz4_snappy_dstfull - ADDQ R8, SI - LEAL -1(R8), R10 - CMPL R10, $0x3c - JB one_byte_lz4_snappy - CMPL R10, $0x00000100 - JB two_bytes_lz4_snappy - CMPL R10, $0x00010000 - JB three_bytes_lz4_snappy - CMPL R10, $0x01000000 - JB four_bytes_lz4_snappy - MOVB $0xfc, (AX) - MOVL R10, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_lz4_snappy - -four_bytes_lz4_snappy: - MOVL R10, R11 - SHRL $0x10, R11 - MOVB $0xf8, (AX) - MOVW R10, 1(AX) - MOVB R11, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_lz4_snappy - -three_bytes_lz4_snappy: - MOVB $0xf4, (AX) - MOVW R10, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_lz4_snappy - -two_bytes_lz4_snappy: - MOVB $0xf0, (AX) - MOVB R10, 1(AX) - ADDQ $0x02, AX - CMPL R10, $0x40 - JB memmove_lz4_snappy - JMP memmove_long_lz4_snappy - -one_byte_lz4_snappy: - SHLB $0x02, R10 - MOVB R10, (AX) - ADDQ $0x01, AX - 
-memmove_lz4_snappy: - LEAQ (AX)(R8*1), R10 - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_lz4_snappy_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_lz4_snappy_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_lz4_snappy_memmove_move_17through32 - JMP emit_lit_memmove_lz4_snappy_memmove_move_33through64 - -emit_lit_memmove_lz4_snappy_memmove_move_8: - MOVQ (DX), R11 - MOVQ R11, (AX) - JMP memmove_end_copy_lz4_snappy - -emit_lit_memmove_lz4_snappy_memmove_move_8through16: - MOVQ (DX), R11 - MOVQ -8(DX)(R8*1), DX - MOVQ R11, (AX) - MOVQ DX, -8(AX)(R8*1) - JMP memmove_end_copy_lz4_snappy - -emit_lit_memmove_lz4_snappy_memmove_move_17through32: - MOVOU (DX), X0 - MOVOU -16(DX)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_lz4_snappy - -emit_lit_memmove_lz4_snappy_memmove_move_33through64: - MOVOU (DX), X0 - MOVOU 16(DX), X1 - MOVOU -32(DX)(R8*1), X2 - MOVOU -16(DX)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_lz4_snappy: - MOVQ R10, AX - JMP lz4_snappy_lits_emit_done - -memmove_long_lz4_snappy: - LEAQ (AX)(R8*1), R10 - - // genMemMoveLong - MOVOU (DX), X0 - MOVOU 16(DX), X1 - MOVOU -32(DX)(R8*1), X2 - MOVOU -16(DX)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R13 - SUBQ R11, R13 - DECQ R12 - JA emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32 - LEAQ -32(DX)(R13*1), R11 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_lz4_snappylarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R11 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_lz4_snappylarge_big_loop_back - -emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32: - MOVOU -32(DX)(R13*1), X4 - MOVOU -16(DX)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 - JAE 
emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ R10, AX - -lz4_snappy_lits_emit_done: - MOVQ DI, DX - -lz4_snappy_lits_done: - CMPQ DX, BX - JNE lz4_snappy_match - CMPQ R9, $0x04 - JEQ lz4_snappy_done - JMP lz4_snappy_corrupt - -lz4_snappy_match: - LEAQ 2(DX), DI - CMPQ DI, BX - JAE lz4_snappy_corrupt - MOVWQZX (DX), R8 - MOVQ DI, DX - TESTQ R8, R8 - JZ lz4_snappy_corrupt - CMPQ R8, SI - JA lz4_snappy_corrupt - CMPQ R9, $0x13 - JNE lz4_snappy_ml_done - -lz4_snappy_ml_loop: - MOVBQZX (DX), DI - INCQ DX - ADDQ DI, R9 - CMPQ DX, BX - JAE lz4_snappy_corrupt - CMPQ DI, $0xff - JEQ lz4_snappy_ml_loop - -lz4_snappy_ml_done: - ADDQ R9, SI - - // emitCopy -two_byte_offset_lz4_s2: - CMPL R9, $0x40 - JBE two_byte_offset_short_lz4_s2 - MOVB $0xee, (AX) - MOVW R8, 1(AX) - LEAL -60(R9), R9 - ADDQ $0x03, AX - CMPQ AX, CX - JAE lz4_snappy_loop - JMP two_byte_offset_lz4_s2 - -two_byte_offset_short_lz4_s2: - MOVL R9, DI - SHLL $0x02, DI - CMPL R9, $0x0c - JAE emit_copy_three_lz4_s2 - CMPL R8, $0x00000800 - JAE emit_copy_three_lz4_s2 - LEAL -15(DI), DI - MOVB R8, 1(AX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, DI - MOVB DI, (AX) - ADDQ $0x02, AX - JMP lz4_snappy_loop - -emit_copy_three_lz4_s2: - LEAL -2(DI), DI - MOVB DI, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP lz4_snappy_loop - -lz4_snappy_done: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ SI, uncompressed+48(FP) - MOVQ AX, dstUsed+56(FP) - RET - -lz4_snappy_corrupt: - XORQ AX, AX - LEAQ -1(AX), SI - MOVQ SI, uncompressed+48(FP) - RET - -lz4_snappy_dstfull: - XORQ AX, AX - LEAQ -2(AX), SI - MOVQ SI, uncompressed+48(FP) - RET - -// func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) -// Requires: SSE2 -TEXT ·cvtLZ4sBlockSnappyAsm(SB), NOSPLIT, $0-64 - XORQ SI, SI - MOVQ dst_base+0(FP), AX - MOVQ dst_len+8(FP), CX - MOVQ src_base+24(FP), DX - MOVQ src_len+32(FP), BX - LEAQ (DX)(BX*1), BX - 
LEAQ -8(AX)(CX*1), CX - -lz4s_snappy_loop: - CMPQ DX, BX - JAE lz4s_snappy_corrupt - CMPQ AX, CX - JAE lz4s_snappy_dstfull - MOVBQZX (DX), DI - MOVQ DI, R8 - MOVQ DI, R9 - SHRQ $0x04, R8 - ANDQ $0x0f, R9 - CMPQ DI, $0xf0 - JB lz4s_snappy_ll_end - -lz4s_snappy_ll_loop: - INCQ DX - CMPQ DX, BX - JAE lz4s_snappy_corrupt - MOVBQZX (DX), DI - ADDQ DI, R8 - CMPQ DI, $0xff - JEQ lz4s_snappy_ll_loop - -lz4s_snappy_ll_end: - LEAQ (DX)(R8*1), DI - ADDQ $0x03, R9 - CMPQ DI, BX - JAE lz4s_snappy_corrupt - INCQ DX - INCQ DI - TESTQ R8, R8 - JZ lz4s_snappy_lits_done - LEAQ (AX)(R8*1), R10 - CMPQ R10, CX - JAE lz4s_snappy_dstfull - ADDQ R8, SI - LEAL -1(R8), R10 - CMPL R10, $0x3c - JB one_byte_lz4s_snappy - CMPL R10, $0x00000100 - JB two_bytes_lz4s_snappy - CMPL R10, $0x00010000 - JB three_bytes_lz4s_snappy - CMPL R10, $0x01000000 - JB four_bytes_lz4s_snappy - MOVB $0xfc, (AX) - MOVL R10, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_lz4s_snappy - -four_bytes_lz4s_snappy: - MOVL R10, R11 - SHRL $0x10, R11 - MOVB $0xf8, (AX) - MOVW R10, 1(AX) - MOVB R11, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_lz4s_snappy - -three_bytes_lz4s_snappy: - MOVB $0xf4, (AX) - MOVW R10, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_lz4s_snappy - -two_bytes_lz4s_snappy: - MOVB $0xf0, (AX) - MOVB R10, 1(AX) - ADDQ $0x02, AX - CMPL R10, $0x40 - JB memmove_lz4s_snappy - JMP memmove_long_lz4s_snappy - -one_byte_lz4s_snappy: - SHLB $0x02, R10 - MOVB R10, (AX) - ADDQ $0x01, AX - -memmove_lz4s_snappy: - LEAQ (AX)(R8*1), R10 - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_lz4s_snappy_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_lz4s_snappy_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_lz4s_snappy_memmove_move_17through32 - JMP emit_lit_memmove_lz4s_snappy_memmove_move_33through64 - -emit_lit_memmove_lz4s_snappy_memmove_move_8: - MOVQ (DX), R11 - MOVQ R11, (AX) - JMP memmove_end_copy_lz4s_snappy - -emit_lit_memmove_lz4s_snappy_memmove_move_8through16: - MOVQ (DX), R11 - MOVQ 
-8(DX)(R8*1), DX - MOVQ R11, (AX) - MOVQ DX, -8(AX)(R8*1) - JMP memmove_end_copy_lz4s_snappy - -emit_lit_memmove_lz4s_snappy_memmove_move_17through32: - MOVOU (DX), X0 - MOVOU -16(DX)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_lz4s_snappy - -emit_lit_memmove_lz4s_snappy_memmove_move_33through64: - MOVOU (DX), X0 - MOVOU 16(DX), X1 - MOVOU -32(DX)(R8*1), X2 - MOVOU -16(DX)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_lz4s_snappy: - MOVQ R10, AX - JMP lz4s_snappy_lits_emit_done - -memmove_long_lz4s_snappy: - LEAQ (AX)(R8*1), R10 - - // genMemMoveLong - MOVOU (DX), X0 - MOVOU 16(DX), X1 - MOVOU -32(DX)(R8*1), X2 - MOVOU -16(DX)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R13 - SUBQ R11, R13 - DECQ R12 - JA emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32 - LEAQ -32(DX)(R13*1), R11 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_lz4s_snappylarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R11 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_lz4s_snappylarge_big_loop_back - -emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32: - MOVOU -32(DX)(R13*1), X4 - MOVOU -16(DX)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 - JAE emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ R10, AX - -lz4s_snappy_lits_emit_done: - MOVQ DI, DX - -lz4s_snappy_lits_done: - CMPQ DX, BX - JNE lz4s_snappy_match - CMPQ R9, $0x03 - JEQ lz4s_snappy_done - JMP lz4s_snappy_corrupt - -lz4s_snappy_match: - CMPQ R9, $0x03 - JEQ lz4s_snappy_loop - LEAQ 2(DX), DI - CMPQ DI, BX - JAE lz4s_snappy_corrupt - MOVWQZX (DX), R8 - MOVQ DI, DX - TESTQ R8, R8 - JZ lz4s_snappy_corrupt - CMPQ R8, SI 
- JA lz4s_snappy_corrupt - CMPQ R9, $0x12 - JNE lz4s_snappy_ml_done - -lz4s_snappy_ml_loop: - MOVBQZX (DX), DI - INCQ DX - ADDQ DI, R9 - CMPQ DX, BX - JAE lz4s_snappy_corrupt - CMPQ DI, $0xff - JEQ lz4s_snappy_ml_loop - -lz4s_snappy_ml_done: - ADDQ R9, SI - - // emitCopy -two_byte_offset_lz4_s2: - CMPL R9, $0x40 - JBE two_byte_offset_short_lz4_s2 - MOVB $0xee, (AX) - MOVW R8, 1(AX) - LEAL -60(R9), R9 - ADDQ $0x03, AX - CMPQ AX, CX - JAE lz4s_snappy_loop - JMP two_byte_offset_lz4_s2 - -two_byte_offset_short_lz4_s2: - MOVL R9, DI - SHLL $0x02, DI - CMPL R9, $0x0c - JAE emit_copy_three_lz4_s2 - CMPL R8, $0x00000800 - JAE emit_copy_three_lz4_s2 - LEAL -15(DI), DI - MOVB R8, 1(AX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, DI - MOVB DI, (AX) - ADDQ $0x02, AX - JMP lz4s_snappy_loop - -emit_copy_three_lz4_s2: - LEAL -2(DI), DI - MOVB DI, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP lz4s_snappy_loop - -lz4s_snappy_done: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ SI, uncompressed+48(FP) - MOVQ AX, dstUsed+56(FP) - RET - -lz4s_snappy_corrupt: - XORQ AX, AX - LEAQ -1(AX), SI - MOVQ SI, uncompressed+48(FP) - RET - -lz4s_snappy_dstfull: - XORQ AX, AX - LEAQ -2(AX), SI - MOVQ SI, uncompressed+48(FP) - RET |