summary | refs | log | tree | commit | diff
path: root/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s')
-rw-r--r-- vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s 16599
1 files changed, 9521 insertions, 7078 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
index 81a487d6d..12a4de3be 100644
--- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
+++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
@@ -36,8 +36,8 @@ zero_loop_encodeBlockAsm:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -47,609 +47,601 @@ zero_loop_encodeBlockAsm:
MOVQ src_base+24(FP), DX
search_loop_encodeBlockAsm:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x06, SI
- LEAL 4(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x06, BX
+ LEAL 4(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeBlockAsm
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ DI, R10
- MOVQ DI, R11
- SHRQ $0x08, R11
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHRQ $0x08, R10
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x32, R9
SHLQ $0x10, R10
- IMULQ R9, R10
+ IMULQ R8, R10
SHRQ $0x32, R10
- SHLQ $0x10, R11
- IMULQ R9, R11
- SHRQ $0x32, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 24(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- LEAL 1(CX), R10
- MOVL R10, 24(SP)(R11*4)
- MOVQ DI, R10
- SHRQ $0x10, R10
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x32, R10
- MOVL CX, R9
- SUBL 16(SP), R9
- MOVL 1(DX)(R9*1), R11
- MOVQ DI, R9
- SHRQ $0x08, R9
- CMPL R9, R11
+ MOVL 24(SP)(R9*4), BX
+ MOVL 24(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ LEAL 1(CX), R9
+ MOVL R9, 24(SP)(R10*4)
+ MOVQ SI, R9
+ SHRQ $0x10, R9
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x32, R9
+ MOVL CX, R8
+ SUBL 16(SP), R8
+ MOVL 1(DX)(R8*1), R10
+ MOVQ SI, R8
+ SHRQ $0x08, R8
+ CMPL R8, R10
JNE no_repeat_found_encodeBlockAsm
- LEAL 1(CX), DI
- MOVL 12(SP), R8
- MOVL DI, SI
- SUBL 16(SP), SI
+ LEAL 1(CX), SI
+ MOVL 12(SP), DI
+ MOVL SI, BX
+ SUBL 16(SP), BX
JZ repeat_extend_back_end_encodeBlockAsm
repeat_extend_back_loop_encodeBlockAsm:
- CMPL DI, R8
+ CMPL SI, DI
JLE repeat_extend_back_end_encodeBlockAsm
- MOVB -1(DX)(SI*1), BL
- MOVB -1(DX)(DI*1), R9
- CMPB BL, R9
+ MOVB -1(DX)(BX*1), R8
+ MOVB -1(DX)(SI*1), R9
+ CMPB R8, R9
JNE repeat_extend_back_end_encodeBlockAsm
- LEAL -1(DI), DI
- DECL SI
+ LEAL -1(SI), SI
+ DECL BX
JNZ repeat_extend_back_loop_encodeBlockAsm
repeat_extend_back_end_encodeBlockAsm:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_repeat_emit_encodeBlockAsm
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_repeat_emit_encodeBlockAsm
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_repeat_emit_encodeBlockAsm
- CMPL SI, $0x00010000
+ CMPL BX, $0x00010000
JLT three_bytes_repeat_emit_encodeBlockAsm
- CMPL SI, $0x01000000
+ CMPL BX, $0x01000000
JLT four_bytes_repeat_emit_encodeBlockAsm
MOVB $0xfc, (AX)
- MOVL SI, 1(AX)
+ MOVL BX, 1(AX)
ADDQ $0x05, AX
JMP memmove_long_repeat_emit_encodeBlockAsm
four_bytes_repeat_emit_encodeBlockAsm:
- MOVL SI, R11
- SHRL $0x10, R11
+ MOVL BX, R10
+ SHRL $0x10, R10
MOVB $0xf8, (AX)
- MOVW SI, 1(AX)
- MOVB R11, 3(AX)
+ MOVW BX, 1(AX)
+ MOVB R10, 3(AX)
ADDQ $0x04, AX
JMP memmove_long_repeat_emit_encodeBlockAsm
three_bytes_repeat_emit_encodeBlockAsm:
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_repeat_emit_encodeBlockAsm
two_bytes_repeat_emit_encodeBlockAsm:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_repeat_emit_encodeBlockAsm
JMP memmove_long_repeat_emit_encodeBlockAsm
one_byte_repeat_emit_encodeBlockAsm:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_repeat_emit_encodeBlockAsm:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32
JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64
emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8:
- MOVQ (R10), R11
- MOVQ R11, (AX)
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm
emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm
emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm
emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_repeat_emit_encodeBlockAsm:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_repeat_emit_encodeBlockAsm
memmove_long_repeat_emit_encodeBlockAsm:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R12
- SHRQ $0x05, R12
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R13
- SUBQ R11, R13
- DECQ R12
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
JA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
- LEAQ -32(R10)(R13*1), R11
- LEAQ -32(AX)(R13*1), R14
+ LEAQ -32(R9)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R14)
- MOVOA X5, 16(R14)
- ADDQ $0x20, R14
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
ADDQ $0x20, R13
- DECQ R12
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back
emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32:
- MOVOU -32(R10)(R13*1), X4
- MOVOU -16(R10)(R13*1), X5
- MOVOA X4, -32(AX)(R13*1)
- MOVOA X5, -16(AX)(R13*1)
- ADDQ $0x20, R13
- CMPQ R9, R13
+ MOVOU -32(R9)(R12*1), X4
+ MOVOU -16(R9)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R8, R12
JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_repeat_emit_encodeBlockAsm:
ADDL $0x05, CX
- MOVL CX, SI
- SUBL 16(SP), SI
- MOVQ src_len+32(FP), R9
- SUBL CX, R9
- LEAQ (DX)(CX*1), R10
- LEAQ (DX)(SI*1), SI
+ MOVL CX, BX
+ SUBL 16(SP), BX
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R12, R12
- CMPL R9, $0x08
+ XORL R11, R11
+ CMPL R8, $0x08
JL matchlen_match4_repeat_extend_encodeBlockAsm
matchlen_loopback_repeat_extend_encodeBlockAsm:
- MOVQ (R10)(R12*1), R11
- XORQ (SI)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R9)(R11*1), R10
+ XORQ (BX)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_repeat_extend_encodeBlockAsm
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP repeat_extend_forward_end_encodeBlockAsm
matchlen_loop_repeat_extend_encodeBlockAsm:
- LEAL -8(R9), R9
- LEAL 8(R12), R12
- CMPL R9, $0x08
+ LEAL -8(R8), R8
+ LEAL 8(R11), R11
+ CMPL R8, $0x08
JGE matchlen_loopback_repeat_extend_encodeBlockAsm
JZ repeat_extend_forward_end_encodeBlockAsm
matchlen_match4_repeat_extend_encodeBlockAsm:
- CMPL R9, $0x04
+ CMPL R8, $0x04
JL matchlen_match2_repeat_extend_encodeBlockAsm
- MOVL (R10)(R12*1), R11
- CMPL (SI)(R12*1), R11
+ MOVL (R9)(R11*1), R10
+ CMPL (BX)(R11*1), R10
JNE matchlen_match2_repeat_extend_encodeBlockAsm
- SUBL $0x04, R9
- LEAL 4(R12), R12
+ SUBL $0x04, R8
+ LEAL 4(R11), R11
matchlen_match2_repeat_extend_encodeBlockAsm:
- CMPL R9, $0x02
+ CMPL R8, $0x02
JL matchlen_match1_repeat_extend_encodeBlockAsm
- MOVW (R10)(R12*1), R11
- CMPW (SI)(R12*1), R11
+ MOVW (R9)(R11*1), R10
+ CMPW (BX)(R11*1), R10
JNE matchlen_match1_repeat_extend_encodeBlockAsm
- SUBL $0x02, R9
- LEAL 2(R12), R12
+ SUBL $0x02, R8
+ LEAL 2(R11), R11
matchlen_match1_repeat_extend_encodeBlockAsm:
- CMPL R9, $0x01
+ CMPL R8, $0x01
JL repeat_extend_forward_end_encodeBlockAsm
- MOVB (R10)(R12*1), R11
- CMPB (SI)(R12*1), R11
+ MOVB (R9)(R11*1), R10
+ CMPB (BX)(R11*1), R10
JNE repeat_extend_forward_end_encodeBlockAsm
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
repeat_extend_forward_end_encodeBlockAsm:
- ADDL R12, CX
- MOVL CX, SI
- SUBL DI, SI
- MOVL 16(SP), DI
- TESTL R8, R8
+ ADDL R11, CX
+ MOVL CX, BX
+ SUBL SI, BX
+ MOVL 16(SP), SI
+ TESTL DI, DI
JZ repeat_as_copy_encodeBlockAsm
// emitRepeat
emit_repeat_again_match_repeat_encodeBlockAsm:
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_match_repeat_encodeBlockAsm
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_match_repeat_encodeBlockAsm
cant_repeat_two_offset_match_repeat_encodeBlockAsm:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_match_repeat_encodeBlockAsm
- CMPL SI, $0x00010100
+ CMPL BX, $0x00010100
JLT repeat_four_match_repeat_encodeBlockAsm
- CMPL SI, $0x0100ffff
+ CMPL BX, $0x0100ffff
JLT repeat_five_match_repeat_encodeBlockAsm
- LEAL -16842747(SI), SI
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ LEAL -16842747(BX), BX
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_match_repeat_encodeBlockAsm
repeat_five_match_repeat_encodeBlockAsm:
- LEAL -65536(SI), SI
- MOVL SI, DI
+ LEAL -65536(BX), BX
+ MOVL BX, SI
MOVW $0x001d, (AX)
- MOVW SI, 2(AX)
- SARL $0x10, DI
- MOVB DI, 4(AX)
+ MOVW BX, 2(AX)
+ SARL $0x10, SI
+ MOVB SI, 4(AX)
ADDQ $0x05, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_four_match_repeat_encodeBlockAsm:
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_three_match_repeat_encodeBlockAsm:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_two_match_repeat_encodeBlockAsm:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_two_offset_match_repeat_encodeBlockAsm:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_as_copy_encodeBlockAsm:
// emitCopy
- CMPL DI, $0x00010000
+ CMPL SI, $0x00010000
JL two_byte_offset_repeat_as_copy_encodeBlockAsm
-
-four_bytes_loop_back_repeat_as_copy_encodeBlockAsm:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm
MOVB $0xff, (AX)
- MOVL DI, 1(AX)
- LEAL -64(SI), SI
+ MOVL SI, 1(AX)
+ LEAL -64(BX), BX
ADDQ $0x05, AX
- CMPL SI, $0x04
+ CMPL BX, $0x04
JL four_bytes_remain_repeat_as_copy_encodeBlockAsm
// emitRepeat
emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy:
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy
- CMPL SI, $0x00010100
+ CMPL BX, $0x00010100
JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy
- CMPL SI, $0x0100ffff
+ CMPL BX, $0x0100ffff
JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy
- LEAL -16842747(SI), SI
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ LEAL -16842747(BX), BX
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy
repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy:
- LEAL -65536(SI), SI
- MOVL SI, DI
+ LEAL -65536(BX), BX
+ MOVL BX, SI
MOVW $0x001d, (AX)
- MOVW SI, 2(AX)
- SARL $0x10, DI
- MOVB DI, 4(AX)
+ MOVW BX, 2(AX)
+ SARL $0x10, SI
+ MOVB SI, 4(AX)
ADDQ $0x05, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy:
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm
- JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm
four_bytes_remain_repeat_as_copy_encodeBlockAsm:
- TESTL SI, SI
+ TESTL BX, BX
JZ repeat_end_emit_encodeBlockAsm
- MOVB $0x03, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVL DI, 1(AX)
+ XORL DI, DI
+ LEAL -1(DI)(BX*4), BX
+ MOVB BL, (AX)
+ MOVL SI, 1(AX)
ADDQ $0x05, AX
JMP repeat_end_emit_encodeBlockAsm
two_byte_offset_repeat_as_copy_encodeBlockAsm:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JAE long_offset_short_repeat_as_copy_encodeBlockAsm
- MOVL $0x00000001, R8
- LEAL 16(R8), R8
- MOVB DI, 1(AX)
- MOVL DI, R9
- SHRL $0x08, R9
- SHLL $0x05, R9
- ORL R9, R8
- MOVB R8, (AX)
+ MOVL $0x00000001, DI
+ LEAL 16(DI), DI
+ MOVB SI, 1(AX)
+ MOVL SI, R8
+ SHRL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
- SUBL $0x08, SI
+ SUBL $0x08, BX
// emitRepeat
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
- CMPL SI, $0x00010100
+ CMPL BX, $0x00010100
JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
- CMPL SI, $0x0100ffff
+ CMPL BX, $0x0100ffff
JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
- LEAL -16842747(SI), SI
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ LEAL -16842747(BX), BX
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
- LEAL -65536(SI), SI
- MOVL SI, DI
+ LEAL -65536(BX), BX
+ MOVL BX, SI
MOVW $0x001d, (AX)
- MOVW SI, 2(AX)
- SARL $0x10, DI
- MOVB DI, 4(AX)
+ MOVW BX, 2(AX)
+ SARL $0x10, SI
+ MOVB SI, 4(AX)
ADDQ $0x05, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm
long_offset_short_repeat_as_copy_encodeBlockAsm:
MOVB $0xee, (AX)
- MOVW DI, 1(AX)
- LEAL -60(SI), SI
+ MOVW SI, 1(AX)
+ LEAL -60(BX), BX
ADDQ $0x03, AX
// emitRepeat
emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short:
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short
- CMPL SI, $0x00010100
+ CMPL BX, $0x00010100
JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short
- CMPL SI, $0x0100ffff
+ CMPL BX, $0x0100ffff
JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short
- LEAL -16842747(SI), SI
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ LEAL -16842747(BX), BX
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short
repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short:
- LEAL -65536(SI), SI
- MOVL SI, DI
+ LEAL -65536(BX), BX
+ MOVL BX, SI
MOVW $0x001d, (AX)
- MOVW SI, 2(AX)
- SARL $0x10, DI
- MOVB DI, 4(AX)
+ MOVW BX, 2(AX)
+ SARL $0x10, SI
+ MOVB SI, 4(AX)
ADDQ $0x05, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short:
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm
repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm
- JMP two_byte_offset_repeat_as_copy_encodeBlockAsm
two_byte_offset_short_repeat_as_copy_encodeBlockAsm:
- CMPL SI, $0x0c
+ MOVL BX, DI
+ SHLL $0x02, DI
+ CMPL BX, $0x0c
JGE emit_copy_three_repeat_as_copy_encodeBlockAsm
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JGE emit_copy_three_repeat_as_copy_encodeBlockAsm
- MOVB $0x01, BL
- LEAL -16(BX)(SI*4), SI
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ LEAL -15(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm
emit_copy_three_repeat_as_copy_encodeBlockAsm:
- MOVB $0x02, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVW DI, 1(AX)
+ LEAL -2(DI), DI
+ MOVB DI, (AX)
+ MOVW SI, 1(AX)
ADDQ $0x03, AX
repeat_end_emit_encodeBlockAsm:
@@ -657,16 +649,16 @@ repeat_end_emit_encodeBlockAsm:
JMP search_loop_encodeBlockAsm
no_repeat_found_encodeBlockAsm:
- CMPL (DX)(SI*1), DI
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeBlockAsm
- SHRQ $0x08, DI
- MOVL 24(SP)(R10*4), SI
- LEAL 2(CX), R9
- CMPL (DX)(R8*1), DI
+ SHRQ $0x08, SI
+ MOVL 24(SP)(R9*4), BX
+ LEAL 2(CX), R8
+ CMPL (DX)(DI*1), SI
JEQ candidate2_match_encodeBlockAsm
- MOVL R9, 24(SP)(R10*4)
- SHRQ $0x08, DI
- CMPL (DX)(SI*1), DI
+ MOVL R8, 24(SP)(R9*4)
+ SHRQ $0x08, SI
+ CMPL (DX)(BX*1), SI
JEQ candidate3_match_encodeBlockAsm
MOVL 20(SP), CX
JMP search_loop_encodeBlockAsm
@@ -676,549 +668,542 @@ candidate3_match_encodeBlockAsm:
JMP candidate_match_encodeBlockAsm
candidate2_match_encodeBlockAsm:
- MOVL R9, 24(SP)(R10*4)
+ MOVL R8, 24(SP)(R9*4)
INCL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeBlockAsm:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeBlockAsm
match_extend_back_loop_encodeBlockAsm:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeBlockAsm
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeBlockAsm
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeBlockAsm
JMP match_extend_back_loop_encodeBlockAsm
match_extend_back_end_encodeBlockAsm:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 5(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 5(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeBlockAsm
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeBlockAsm:
- MOVL CX, DI
- MOVL 12(SP), R8
- CMPL R8, DI
+ MOVL CX, SI
+ MOVL 12(SP), DI
+ CMPL DI, SI
JEQ emit_literal_done_match_emit_encodeBlockAsm
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(R8*1), DI
- SUBL R8, R9
- LEAL -1(R9), R8
- CMPL R8, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(DI*1), SI
+ SUBL DI, R8
+ LEAL -1(R8), DI
+ CMPL DI, $0x3c
JLT one_byte_match_emit_encodeBlockAsm
- CMPL R8, $0x00000100
+ CMPL DI, $0x00000100
JLT two_bytes_match_emit_encodeBlockAsm
- CMPL R8, $0x00010000
+ CMPL DI, $0x00010000
JLT three_bytes_match_emit_encodeBlockAsm
- CMPL R8, $0x01000000
+ CMPL DI, $0x01000000
JLT four_bytes_match_emit_encodeBlockAsm
MOVB $0xfc, (AX)
- MOVL R8, 1(AX)
+ MOVL DI, 1(AX)
ADDQ $0x05, AX
JMP memmove_long_match_emit_encodeBlockAsm
four_bytes_match_emit_encodeBlockAsm:
- MOVL R8, R10
- SHRL $0x10, R10
+ MOVL DI, R9
+ SHRL $0x10, R9
MOVB $0xf8, (AX)
- MOVW R8, 1(AX)
- MOVB R10, 3(AX)
+ MOVW DI, 1(AX)
+ MOVB R9, 3(AX)
ADDQ $0x04, AX
JMP memmove_long_match_emit_encodeBlockAsm
three_bytes_match_emit_encodeBlockAsm:
MOVB $0xf4, (AX)
- MOVW R8, 1(AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeBlockAsm
two_bytes_match_emit_encodeBlockAsm:
MOVB $0xf0, (AX)
- MOVB R8, 1(AX)
+ MOVB DI, 1(AX)
ADDQ $0x02, AX
- CMPL R8, $0x40
+ CMPL DI, $0x40
JL memmove_match_emit_encodeBlockAsm
JMP memmove_long_match_emit_encodeBlockAsm
one_byte_match_emit_encodeBlockAsm:
- SHLB $0x02, R8
- MOVB R8, (AX)
+ SHLB $0x02, DI
+ MOVB DI, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeBlockAsm:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64
emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8:
- MOVQ (DI), R10
- MOVQ R10, (AX)
+ MOVQ (SI), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_match_emit_encodeBlockAsm
emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16:
- MOVQ (DI), R10
- MOVQ -8(DI)(R9*1), DI
- MOVQ R10, (AX)
- MOVQ DI, -8(AX)(R9*1)
+ MOVQ (SI), R9
+ MOVQ -8(SI)(R8*1), SI
+ MOVQ R9, (AX)
+ MOVQ SI, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBlockAsm
emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32:
- MOVOU (DI), X0
- MOVOU -16(DI)(R9*1), X1
+ MOVOU (SI), X0
+ MOVOU -16(SI)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBlockAsm
emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64:
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeBlockAsm:
- MOVQ R8, AX
+ MOVQ DI, AX
JMP emit_literal_done_match_emit_encodeBlockAsm
memmove_long_match_emit_encodeBlockAsm:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveLong
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
- MOVQ R9, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
+ MOVQ R8, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
- LEAQ -32(DI)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(SI)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32:
- MOVOU -32(DI)(R12*1), X4
- MOVOU -16(DI)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R9, R12
+ MOVOU -32(SI)(R11*1), X4
+ MOVOU -16(SI)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ R8, R11
JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ R8, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ DI, AX
emit_literal_done_match_emit_encodeBlockAsm:
match_nolit_loop_encodeBlockAsm:
- MOVL CX, DI
- SUBL SI, DI
- MOVL DI, 16(SP)
+ MOVL CX, SI
+ SUBL BX, SI
+ MOVL SI, 16(SP)
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), DI
- SUBL CX, DI
- LEAQ (DX)(CX*1), R8
- LEAQ (DX)(SI*1), SI
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), SI
+ SUBL CX, SI
+ LEAQ (DX)(CX*1), DI
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R10, R10
- CMPL DI, $0x08
+ XORL R9, R9
+ CMPL SI, $0x08
JL matchlen_match4_match_nolit_encodeBlockAsm
matchlen_loopback_match_nolit_encodeBlockAsm:
- MOVQ (R8)(R10*1), R9
- XORQ (SI)(R10*1), R9
- TESTQ R9, R9
+ MOVQ (DI)(R9*1), R8
+ XORQ (BX)(R9*1), R8
+ TESTQ R8, R8
JZ matchlen_loop_match_nolit_encodeBlockAsm
#ifdef GOAMD64_v3
- TZCNTQ R9, R9
+ TZCNTQ R8, R8
#else
- BSFQ R9, R9
+ BSFQ R8, R8
#endif
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
+ SARQ $0x03, R8
+ LEAL (R9)(R8*1), R9
JMP match_nolit_end_encodeBlockAsm
matchlen_loop_match_nolit_encodeBlockAsm:
- LEAL -8(DI), DI
- LEAL 8(R10), R10
- CMPL DI, $0x08
+ LEAL -8(SI), SI
+ LEAL 8(R9), R9
+ CMPL SI, $0x08
JGE matchlen_loopback_match_nolit_encodeBlockAsm
JZ match_nolit_end_encodeBlockAsm
matchlen_match4_match_nolit_encodeBlockAsm:
- CMPL DI, $0x04
+ CMPL SI, $0x04
JL matchlen_match2_match_nolit_encodeBlockAsm
- MOVL (R8)(R10*1), R9
- CMPL (SI)(R10*1), R9
+ MOVL (DI)(R9*1), R8
+ CMPL (BX)(R9*1), R8
JNE matchlen_match2_match_nolit_encodeBlockAsm
- SUBL $0x04, DI
- LEAL 4(R10), R10
+ SUBL $0x04, SI
+ LEAL 4(R9), R9
matchlen_match2_match_nolit_encodeBlockAsm:
- CMPL DI, $0x02
+ CMPL SI, $0x02
JL matchlen_match1_match_nolit_encodeBlockAsm
- MOVW (R8)(R10*1), R9
- CMPW (SI)(R10*1), R9
+ MOVW (DI)(R9*1), R8
+ CMPW (BX)(R9*1), R8
JNE matchlen_match1_match_nolit_encodeBlockAsm
- SUBL $0x02, DI
- LEAL 2(R10), R10
+ SUBL $0x02, SI
+ LEAL 2(R9), R9
matchlen_match1_match_nolit_encodeBlockAsm:
- CMPL DI, $0x01
+ CMPL SI, $0x01
JL match_nolit_end_encodeBlockAsm
- MOVB (R8)(R10*1), R9
- CMPB (SI)(R10*1), R9
+ MOVB (DI)(R9*1), R8
+ CMPB (BX)(R9*1), R8
JNE match_nolit_end_encodeBlockAsm
- LEAL 1(R10), R10
+ LEAL 1(R9), R9
match_nolit_end_encodeBlockAsm:
- ADDL R10, CX
- MOVL 16(SP), SI
- ADDL $0x04, R10
+ ADDL R9, CX
+ MOVL 16(SP), BX
+ ADDL $0x04, R9
MOVL CX, 12(SP)
// emitCopy
- CMPL SI, $0x00010000
+ CMPL BX, $0x00010000
JL two_byte_offset_match_nolit_encodeBlockAsm
-
-four_bytes_loop_back_match_nolit_encodeBlockAsm:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE four_bytes_remain_match_nolit_encodeBlockAsm
MOVB $0xff, (AX)
- MOVL SI, 1(AX)
- LEAL -64(R10), R10
+ MOVL BX, 1(AX)
+ LEAL -64(R9), R9
ADDQ $0x05, AX
- CMPL R10, $0x04
+ CMPL R9, $0x04
JL four_bytes_remain_match_nolit_encodeBlockAsm
// emitRepeat
emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy:
- MOVL R10, DI
- LEAL -4(R10), R10
- CMPL DI, $0x08
+ MOVL R9, SI
+ LEAL -4(R9), R9
+ CMPL SI, $0x08
JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
- CMPL R10, $0x00000104
+ CMPL R9, $0x00000104
JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy
- CMPL R10, $0x00010100
+ CMPL R9, $0x00010100
JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy
- CMPL R10, $0x0100ffff
+ CMPL R9, $0x0100ffff
JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy
- LEAL -16842747(R10), R10
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ LEAL -16842747(R9), R9
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy
repeat_five_match_nolit_encodeBlockAsm_emit_copy:
- LEAL -65536(R10), R10
- MOVL R10, SI
+ LEAL -65536(R9), R9
+ MOVL R9, BX
MOVW $0x001d, (AX)
- MOVW R10, 2(AX)
- SARL $0x10, SI
- MOVB SI, 4(AX)
+ MOVW R9, 2(AX)
+ SARL $0x10, BX
+ MOVB BL, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
repeat_four_match_nolit_encodeBlockAsm_emit_copy:
- LEAL -256(R10), R10
+ LEAL -256(R9), R9
MOVW $0x0019, (AX)
- MOVW R10, 2(AX)
+ MOVW R9, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
repeat_three_match_nolit_encodeBlockAsm_emit_copy:
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
MOVW $0x0015, (AX)
- MOVB R10, 2(AX)
+ MOVB R9, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
repeat_two_match_nolit_encodeBlockAsm_emit_copy:
- SHLL $0x02, R10
- ORL $0x01, R10
- MOVW R10, (AX)
+ SHLL $0x02, R9
+ ORL $0x01, R9
+ MOVW R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
- XORQ DI, DI
- LEAL 1(DI)(R10*4), R10
- MOVB SI, 1(AX)
- SARL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ XORQ SI, SI
+ LEAL 1(SI)(R9*4), R9
+ MOVB BL, 1(AX)
+ SARL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, R9
+ MOVB R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
- JMP four_bytes_loop_back_match_nolit_encodeBlockAsm
four_bytes_remain_match_nolit_encodeBlockAsm:
- TESTL R10, R10
+ TESTL R9, R9
JZ match_nolit_emitcopy_end_encodeBlockAsm
- MOVB $0x03, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVL SI, 1(AX)
+ XORL SI, SI
+ LEAL -1(SI)(R9*4), R9
+ MOVB R9, (AX)
+ MOVL BX, 1(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
two_byte_offset_match_nolit_encodeBlockAsm:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE two_byte_offset_short_match_nolit_encodeBlockAsm
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JAE long_offset_short_match_nolit_encodeBlockAsm
- MOVL $0x00000001, DI
- LEAL 16(DI), DI
- MOVB SI, 1(AX)
- MOVL SI, R8
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, DI
- MOVB DI, (AX)
+ MOVL $0x00000001, SI
+ LEAL 16(SI), SI
+ MOVB BL, 1(AX)
+ MOVL BX, DI
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
- SUBL $0x08, R10
+ SUBL $0x08, R9
// emitRepeat
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b:
- MOVL R10, DI
- LEAL -4(R10), R10
- CMPL DI, $0x08
+ MOVL R9, SI
+ LEAL -4(R9), R9
+ CMPL SI, $0x08
JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b:
- CMPL R10, $0x00000104
+ CMPL R9, $0x00000104
JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b
- CMPL R10, $0x00010100
+ CMPL R9, $0x00010100
JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b
- CMPL R10, $0x0100ffff
+ CMPL R9, $0x0100ffff
JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b
- LEAL -16842747(R10), R10
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ LEAL -16842747(R9), R9
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b
repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b:
- LEAL -65536(R10), R10
- MOVL R10, SI
+ LEAL -65536(R9), R9
+ MOVL R9, BX
MOVW $0x001d, (AX)
- MOVW R10, 2(AX)
- SARL $0x10, SI
- MOVB SI, 4(AX)
+ MOVW R9, 2(AX)
+ SARL $0x10, BX
+ MOVB BL, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b:
- LEAL -256(R10), R10
+ LEAL -256(R9), R9
MOVW $0x0019, (AX)
- MOVW R10, 2(AX)
+ MOVW R9, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b:
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
MOVW $0x0015, (AX)
- MOVB R10, 2(AX)
+ MOVB R9, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b:
- SHLL $0x02, R10
- ORL $0x01, R10
- MOVW R10, (AX)
+ SHLL $0x02, R9
+ ORL $0x01, R9
+ MOVW R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b:
- XORQ DI, DI
- LEAL 1(DI)(R10*4), R10
- MOVB SI, 1(AX)
- SARL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ XORQ SI, SI
+ LEAL 1(SI)(R9*4), R9
+ MOVB BL, 1(AX)
+ SARL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, R9
+ MOVB R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
long_offset_short_match_nolit_encodeBlockAsm:
MOVB $0xee, (AX)
- MOVW SI, 1(AX)
- LEAL -60(R10), R10
+ MOVW BX, 1(AX)
+ LEAL -60(R9), R9
ADDQ $0x03, AX
// emitRepeat
emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short:
- MOVL R10, DI
- LEAL -4(R10), R10
- CMPL DI, $0x08
+ MOVL R9, SI
+ LEAL -4(R9), R9
+ CMPL SI, $0x08
JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
- CMPL R10, $0x00000104
+ CMPL R9, $0x00000104
JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy_short
- CMPL R10, $0x00010100
+ CMPL R9, $0x00010100
JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy_short
- CMPL R10, $0x0100ffff
+ CMPL R9, $0x0100ffff
JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy_short
- LEAL -16842747(R10), R10
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ LEAL -16842747(R9), R9
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short
repeat_five_match_nolit_encodeBlockAsm_emit_copy_short:
- LEAL -65536(R10), R10
- MOVL R10, SI
+ LEAL -65536(R9), R9
+ MOVL R9, BX
MOVW $0x001d, (AX)
- MOVW R10, 2(AX)
- SARL $0x10, SI
- MOVB SI, 4(AX)
+ MOVW R9, 2(AX)
+ SARL $0x10, BX
+ MOVB BL, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
repeat_four_match_nolit_encodeBlockAsm_emit_copy_short:
- LEAL -256(R10), R10
+ LEAL -256(R9), R9
MOVW $0x0019, (AX)
- MOVW R10, 2(AX)
+ MOVW R9, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
repeat_three_match_nolit_encodeBlockAsm_emit_copy_short:
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
MOVW $0x0015, (AX)
- MOVB R10, 2(AX)
+ MOVB R9, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
repeat_two_match_nolit_encodeBlockAsm_emit_copy_short:
- SHLL $0x02, R10
- ORL $0x01, R10
- MOVW R10, (AX)
+ SHLL $0x02, R9
+ ORL $0x01, R9
+ MOVW R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
- XORQ DI, DI
- LEAL 1(DI)(R10*4), R10
- MOVB SI, 1(AX)
- SARL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ XORQ SI, SI
+ LEAL 1(SI)(R9*4), R9
+ MOVB BL, 1(AX)
+ SARL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, R9
+ MOVB R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
- JMP two_byte_offset_match_nolit_encodeBlockAsm
two_byte_offset_short_match_nolit_encodeBlockAsm:
- CMPL R10, $0x0c
+ MOVL R9, SI
+ SHLL $0x02, SI
+ CMPL R9, $0x0c
JGE emit_copy_three_match_nolit_encodeBlockAsm
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JGE emit_copy_three_match_nolit_encodeBlockAsm
- MOVB $0x01, BL
- LEAL -16(BX)(R10*4), R10
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ LEAL -15(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm
emit_copy_three_match_nolit_encodeBlockAsm:
- MOVB $0x02, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVW SI, 1(AX)
+ LEAL -2(SI), SI
+ MOVB SI, (AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeBlockAsm:
CMPL CX, 8(SP)
JGE emit_remainder_encodeBlockAsm
- MOVQ -2(DX)(CX*1), DI
+ MOVQ -2(DX)(CX*1), SI
CMPQ AX, (SP)
JL match_nolit_dst_ok_encodeBlockAsm
MOVQ $0x00000000, ret+48(FP)
RET
match_nolit_dst_ok_encodeBlockAsm:
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ DI, R8
- SHRQ $0x10, DI
- MOVQ DI, SI
- SHLQ $0x10, R8
- IMULQ R9, R8
- SHRQ $0x32, R8
- SHLQ $0x10, SI
- IMULQ R9, SI
- SHRQ $0x32, SI
- LEAL -2(CX), R9
- LEAQ 24(SP)(SI*4), R10
- MOVL (R10), SI
- MOVL R9, 24(SP)(R8*4)
- MOVL CX, (R10)
- CMPL (DX)(SI*1), DI
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ SI, DI
+ SHRQ $0x10, SI
+ MOVQ SI, BX
+ SHLQ $0x10, DI
+ IMULQ R8, DI
+ SHRQ $0x32, DI
+ SHLQ $0x10, BX
+ IMULQ R8, BX
+ SHRQ $0x32, BX
+ LEAL -2(CX), R8
+ LEAQ 24(SP)(BX*4), R9
+ MOVL (R9), BX
+ MOVL R8, 24(SP)(DI*4)
+ MOVL CX, (R9)
+ CMPL (DX)(BX*1), SI
JEQ match_nolit_loop_encodeBlockAsm
INCL CX
JMP search_loop_encodeBlockAsm
@@ -1422,8 +1407,8 @@ zero_loop_encodeBlockAsm4MB:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -1433,555 +1418,551 @@ zero_loop_encodeBlockAsm4MB:
MOVQ src_base+24(FP), DX
search_loop_encodeBlockAsm4MB:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x06, SI
- LEAL 4(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x06, BX
+ LEAL 4(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeBlockAsm4MB
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ DI, R10
- MOVQ DI, R11
- SHRQ $0x08, R11
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x32, R10
- SHLQ $0x10, R11
- IMULQ R9, R11
- SHRQ $0x32, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 24(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- LEAL 1(CX), R10
- MOVL R10, 24(SP)(R11*4)
- MOVQ DI, R10
- SHRQ $0x10, R10
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHRQ $0x08, R10
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x32, R9
SHLQ $0x10, R10
- IMULQ R9, R10
+ IMULQ R8, R10
SHRQ $0x32, R10
- MOVL CX, R9
- SUBL 16(SP), R9
- MOVL 1(DX)(R9*1), R11
- MOVQ DI, R9
- SHRQ $0x08, R9
- CMPL R9, R11
+ MOVL 24(SP)(R9*4), BX
+ MOVL 24(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ LEAL 1(CX), R9
+ MOVL R9, 24(SP)(R10*4)
+ MOVQ SI, R9
+ SHRQ $0x10, R9
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x32, R9
+ MOVL CX, R8
+ SUBL 16(SP), R8
+ MOVL 1(DX)(R8*1), R10
+ MOVQ SI, R8
+ SHRQ $0x08, R8
+ CMPL R8, R10
JNE no_repeat_found_encodeBlockAsm4MB
- LEAL 1(CX), DI
- MOVL 12(SP), R8
- MOVL DI, SI
- SUBL 16(SP), SI
+ LEAL 1(CX), SI
+ MOVL 12(SP), DI
+ MOVL SI, BX
+ SUBL 16(SP), BX
JZ repeat_extend_back_end_encodeBlockAsm4MB
repeat_extend_back_loop_encodeBlockAsm4MB:
- CMPL DI, R8
+ CMPL SI, DI
JLE repeat_extend_back_end_encodeBlockAsm4MB
- MOVB -1(DX)(SI*1), BL
- MOVB -1(DX)(DI*1), R9
- CMPB BL, R9
+ MOVB -1(DX)(BX*1), R8
+ MOVB -1(DX)(SI*1), R9
+ CMPB R8, R9
JNE repeat_extend_back_end_encodeBlockAsm4MB
- LEAL -1(DI), DI
- DECL SI
+ LEAL -1(SI), SI
+ DECL BX
JNZ repeat_extend_back_loop_encodeBlockAsm4MB
repeat_extend_back_end_encodeBlockAsm4MB:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_repeat_emit_encodeBlockAsm4MB
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_repeat_emit_encodeBlockAsm4MB
- CMPL SI, $0x00010000
+ CMPL BX, $0x00010000
JLT three_bytes_repeat_emit_encodeBlockAsm4MB
- MOVL SI, R11
- SHRL $0x10, R11
+ MOVL BX, R10
+ SHRL $0x10, R10
MOVB $0xf8, (AX)
- MOVW SI, 1(AX)
- MOVB R11, 3(AX)
+ MOVW BX, 1(AX)
+ MOVB R10, 3(AX)
ADDQ $0x04, AX
JMP memmove_long_repeat_emit_encodeBlockAsm4MB
three_bytes_repeat_emit_encodeBlockAsm4MB:
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_repeat_emit_encodeBlockAsm4MB
two_bytes_repeat_emit_encodeBlockAsm4MB:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_repeat_emit_encodeBlockAsm4MB
JMP memmove_long_repeat_emit_encodeBlockAsm4MB
one_byte_repeat_emit_encodeBlockAsm4MB:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_repeat_emit_encodeBlockAsm4MB:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32
JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64
emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8:
- MOVQ (R10), R11
- MOVQ R11, (AX)
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_repeat_emit_encodeBlockAsm4MB:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB
memmove_long_repeat_emit_encodeBlockAsm4MB:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R12
- SHRQ $0x05, R12
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R13
- SUBQ R11, R13
- DECQ R12
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
- LEAQ -32(R10)(R13*1), R11
- LEAQ -32(AX)(R13*1), R14
+ LEAQ -32(R9)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R14)
- MOVOA X5, 16(R14)
- ADDQ $0x20, R14
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
ADDQ $0x20, R13
- DECQ R12
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back
emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
- MOVOU -32(R10)(R13*1), X4
- MOVOU -16(R10)(R13*1), X5
- MOVOA X4, -32(AX)(R13*1)
- MOVOA X5, -16(AX)(R13*1)
- ADDQ $0x20, R13
- CMPQ R9, R13
+ MOVOU -32(R9)(R12*1), X4
+ MOVOU -16(R9)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R8, R12
JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_repeat_emit_encodeBlockAsm4MB:
ADDL $0x05, CX
- MOVL CX, SI
- SUBL 16(SP), SI
- MOVQ src_len+32(FP), R9
- SUBL CX, R9
- LEAQ (DX)(CX*1), R10
- LEAQ (DX)(SI*1), SI
+ MOVL CX, BX
+ SUBL 16(SP), BX
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R12, R12
- CMPL R9, $0x08
+ XORL R11, R11
+ CMPL R8, $0x08
JL matchlen_match4_repeat_extend_encodeBlockAsm4MB
matchlen_loopback_repeat_extend_encodeBlockAsm4MB:
- MOVQ (R10)(R12*1), R11
- XORQ (SI)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R9)(R11*1), R10
+ XORQ (BX)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_repeat_extend_encodeBlockAsm4MB
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP repeat_extend_forward_end_encodeBlockAsm4MB
matchlen_loop_repeat_extend_encodeBlockAsm4MB:
- LEAL -8(R9), R9
- LEAL 8(R12), R12
- CMPL R9, $0x08
+ LEAL -8(R8), R8
+ LEAL 8(R11), R11
+ CMPL R8, $0x08
JGE matchlen_loopback_repeat_extend_encodeBlockAsm4MB
JZ repeat_extend_forward_end_encodeBlockAsm4MB
matchlen_match4_repeat_extend_encodeBlockAsm4MB:
- CMPL R9, $0x04
+ CMPL R8, $0x04
JL matchlen_match2_repeat_extend_encodeBlockAsm4MB
- MOVL (R10)(R12*1), R11
- CMPL (SI)(R12*1), R11
+ MOVL (R9)(R11*1), R10
+ CMPL (BX)(R11*1), R10
JNE matchlen_match2_repeat_extend_encodeBlockAsm4MB
- SUBL $0x04, R9
- LEAL 4(R12), R12
+ SUBL $0x04, R8
+ LEAL 4(R11), R11
matchlen_match2_repeat_extend_encodeBlockAsm4MB:
- CMPL R9, $0x02
+ CMPL R8, $0x02
JL matchlen_match1_repeat_extend_encodeBlockAsm4MB
- MOVW (R10)(R12*1), R11
- CMPW (SI)(R12*1), R11
+ MOVW (R9)(R11*1), R10
+ CMPW (BX)(R11*1), R10
JNE matchlen_match1_repeat_extend_encodeBlockAsm4MB
- SUBL $0x02, R9
- LEAL 2(R12), R12
+ SUBL $0x02, R8
+ LEAL 2(R11), R11
matchlen_match1_repeat_extend_encodeBlockAsm4MB:
- CMPL R9, $0x01
+ CMPL R8, $0x01
JL repeat_extend_forward_end_encodeBlockAsm4MB
- MOVB (R10)(R12*1), R11
- CMPB (SI)(R12*1), R11
+ MOVB (R9)(R11*1), R10
+ CMPB (BX)(R11*1), R10
JNE repeat_extend_forward_end_encodeBlockAsm4MB
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
repeat_extend_forward_end_encodeBlockAsm4MB:
- ADDL R12, CX
- MOVL CX, SI
- SUBL DI, SI
- MOVL 16(SP), DI
- TESTL R8, R8
+ ADDL R11, CX
+ MOVL CX, BX
+ SUBL SI, BX
+ MOVL 16(SP), SI
+ TESTL DI, DI
JZ repeat_as_copy_encodeBlockAsm4MB
// emitRepeat
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_match_repeat_encodeBlockAsm4MB
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_match_repeat_encodeBlockAsm4MB
cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_match_repeat_encodeBlockAsm4MB
- CMPL SI, $0x00010100
+ CMPL BX, $0x00010100
JLT repeat_four_match_repeat_encodeBlockAsm4MB
- LEAL -65536(SI), SI
- MOVL SI, DI
+ LEAL -65536(BX), BX
+ MOVL BX, SI
MOVW $0x001d, (AX)
- MOVW SI, 2(AX)
- SARL $0x10, DI
- MOVB DI, 4(AX)
+ MOVW BX, 2(AX)
+ SARL $0x10, SI
+ MOVB SI, 4(AX)
ADDQ $0x05, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_four_match_repeat_encodeBlockAsm4MB:
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_three_match_repeat_encodeBlockAsm4MB:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_two_match_repeat_encodeBlockAsm4MB:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_two_offset_match_repeat_encodeBlockAsm4MB:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_as_copy_encodeBlockAsm4MB:
// emitCopy
- CMPL DI, $0x00010000
+ CMPL SI, $0x00010000
JL two_byte_offset_repeat_as_copy_encodeBlockAsm4MB
-
-four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
MOVB $0xff, (AX)
- MOVL DI, 1(AX)
- LEAL -64(SI), SI
+ MOVL SI, 1(AX)
+ LEAL -64(BX), BX
ADDQ $0x05, AX
- CMPL SI, $0x04
+ CMPL BX, $0x04
JL four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
// emitRepeat
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy
- CMPL SI, $0x00010100
+ CMPL BX, $0x00010100
JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy
- LEAL -65536(SI), SI
- MOVL SI, DI
+ LEAL -65536(BX), BX
+ MOVL BX, SI
MOVW $0x001d, (AX)
- MOVW SI, 2(AX)
- SARL $0x10, DI
- MOVB DI, 4(AX)
+ MOVW BX, 2(AX)
+ SARL $0x10, SI
+ MOVB SI, 4(AX)
ADDQ $0x05, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm4MB
- JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB
four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB:
- TESTL SI, SI
+ TESTL BX, BX
JZ repeat_end_emit_encodeBlockAsm4MB
- MOVB $0x03, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVL DI, 1(AX)
+ XORL DI, DI
+ LEAL -1(DI)(BX*4), BX
+ MOVB BL, (AX)
+ MOVL SI, 1(AX)
ADDQ $0x05, AX
JMP repeat_end_emit_encodeBlockAsm4MB
two_byte_offset_repeat_as_copy_encodeBlockAsm4MB:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JAE long_offset_short_repeat_as_copy_encodeBlockAsm4MB
- MOVL $0x00000001, R8
- LEAL 16(R8), R8
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, R8
- MOVB R8, (AX)
+ MOVL $0x00000001, DI
+ LEAL 16(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
- SUBL $0x08, SI
+ SUBL $0x08, BX
// emitRepeat
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
- CMPL SI, $0x00010100
+ CMPL BX, $0x00010100
JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
- LEAL -65536(SI), SI
- MOVL SI, DI
+ LEAL -65536(BX), BX
+ MOVL BX, SI
MOVW $0x001d, (AX)
- MOVW SI, 2(AX)
- SARL $0x10, DI
- MOVB DI, 4(AX)
+ MOVW BX, 2(AX)
+ SARL $0x10, SI
+ MOVB SI, 4(AX)
ADDQ $0x05, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm4MB
long_offset_short_repeat_as_copy_encodeBlockAsm4MB:
MOVB $0xee, (AX)
- MOVW DI, 1(AX)
- LEAL -60(SI), SI
+ MOVW SI, 1(AX)
+ LEAL -60(BX), BX
ADDQ $0x03, AX
// emitRepeat
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
- CMPL SI, $0x00010100
+ CMPL BX, $0x00010100
JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
- LEAL -65536(SI), SI
- MOVL SI, DI
+ LEAL -65536(BX), BX
+ MOVL BX, SI
MOVW $0x001d, (AX)
- MOVW SI, 2(AX)
- SARL $0x10, DI
- MOVB DI, 4(AX)
+ MOVW BX, 2(AX)
+ SARL $0x10, SI
+ MOVB SI, 4(AX)
ADDQ $0x05, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm4MB
repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm4MB
- JMP two_byte_offset_repeat_as_copy_encodeBlockAsm4MB
two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB:
- CMPL SI, $0x0c
+ MOVL BX, DI
+ SHLL $0x02, DI
+ CMPL BX, $0x0c
JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
- MOVB $0x01, BL
- LEAL -16(BX)(SI*4), SI
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ LEAL -15(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm4MB
emit_copy_three_repeat_as_copy_encodeBlockAsm4MB:
- MOVB $0x02, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVW DI, 1(AX)
+ LEAL -2(DI), DI
+ MOVB DI, (AX)
+ MOVW SI, 1(AX)
ADDQ $0x03, AX
repeat_end_emit_encodeBlockAsm4MB:
@@ -1989,16 +1970,16 @@ repeat_end_emit_encodeBlockAsm4MB:
JMP search_loop_encodeBlockAsm4MB
no_repeat_found_encodeBlockAsm4MB:
- CMPL (DX)(SI*1), DI
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeBlockAsm4MB
- SHRQ $0x08, DI
- MOVL 24(SP)(R10*4), SI
- LEAL 2(CX), R9
- CMPL (DX)(R8*1), DI
+ SHRQ $0x08, SI
+ MOVL 24(SP)(R9*4), BX
+ LEAL 2(CX), R8
+ CMPL (DX)(DI*1), SI
JEQ candidate2_match_encodeBlockAsm4MB
- MOVL R9, 24(SP)(R10*4)
- SHRQ $0x08, DI
- CMPL (DX)(SI*1), DI
+ MOVL R8, 24(SP)(R9*4)
+ SHRQ $0x08, SI
+ CMPL (DX)(BX*1), SI
JEQ candidate3_match_encodeBlockAsm4MB
MOVL 20(SP), CX
JMP search_loop_encodeBlockAsm4MB
@@ -2008,506 +1989,502 @@ candidate3_match_encodeBlockAsm4MB:
JMP candidate_match_encodeBlockAsm4MB
candidate2_match_encodeBlockAsm4MB:
- MOVL R9, 24(SP)(R10*4)
+ MOVL R8, 24(SP)(R9*4)
INCL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeBlockAsm4MB:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeBlockAsm4MB
match_extend_back_loop_encodeBlockAsm4MB:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeBlockAsm4MB
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeBlockAsm4MB
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeBlockAsm4MB
JMP match_extend_back_loop_encodeBlockAsm4MB
match_extend_back_end_encodeBlockAsm4MB:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 4(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 4(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeBlockAsm4MB
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeBlockAsm4MB:
- MOVL CX, DI
- MOVL 12(SP), R8
- CMPL R8, DI
+ MOVL CX, SI
+ MOVL 12(SP), DI
+ CMPL DI, SI
JEQ emit_literal_done_match_emit_encodeBlockAsm4MB
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(R8*1), DI
- SUBL R8, R9
- LEAL -1(R9), R8
- CMPL R8, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(DI*1), SI
+ SUBL DI, R8
+ LEAL -1(R8), DI
+ CMPL DI, $0x3c
JLT one_byte_match_emit_encodeBlockAsm4MB
- CMPL R8, $0x00000100
+ CMPL DI, $0x00000100
JLT two_bytes_match_emit_encodeBlockAsm4MB
- CMPL R8, $0x00010000
+ CMPL DI, $0x00010000
JLT three_bytes_match_emit_encodeBlockAsm4MB
- MOVL R8, R10
- SHRL $0x10, R10
+ MOVL DI, R9
+ SHRL $0x10, R9
MOVB $0xf8, (AX)
- MOVW R8, 1(AX)
- MOVB R10, 3(AX)
+ MOVW DI, 1(AX)
+ MOVB R9, 3(AX)
ADDQ $0x04, AX
JMP memmove_long_match_emit_encodeBlockAsm4MB
three_bytes_match_emit_encodeBlockAsm4MB:
MOVB $0xf4, (AX)
- MOVW R8, 1(AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeBlockAsm4MB
two_bytes_match_emit_encodeBlockAsm4MB:
MOVB $0xf0, (AX)
- MOVB R8, 1(AX)
+ MOVB DI, 1(AX)
ADDQ $0x02, AX
- CMPL R8, $0x40
+ CMPL DI, $0x40
JL memmove_match_emit_encodeBlockAsm4MB
JMP memmove_long_match_emit_encodeBlockAsm4MB
one_byte_match_emit_encodeBlockAsm4MB:
- SHLB $0x02, R8
- MOVB R8, (AX)
+ SHLB $0x02, DI
+ MOVB DI, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeBlockAsm4MB:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64
emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8:
- MOVQ (DI), R10
- MOVQ R10, (AX)
+ MOVQ (SI), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16:
- MOVQ (DI), R10
- MOVQ -8(DI)(R9*1), DI
- MOVQ R10, (AX)
- MOVQ DI, -8(AX)(R9*1)
+ MOVQ (SI), R9
+ MOVQ -8(SI)(R8*1), SI
+ MOVQ R9, (AX)
+ MOVQ SI, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32:
- MOVOU (DI), X0
- MOVOU -16(DI)(R9*1), X1
+ MOVOU (SI), X0
+ MOVOU -16(SI)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64:
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeBlockAsm4MB:
- MOVQ R8, AX
+ MOVQ DI, AX
JMP emit_literal_done_match_emit_encodeBlockAsm4MB
memmove_long_match_emit_encodeBlockAsm4MB:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveLong
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
- MOVQ R9, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
+ MOVQ R8, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
- LEAQ -32(DI)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(SI)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
- MOVOU -32(DI)(R12*1), X4
- MOVOU -16(DI)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R9, R12
+ MOVOU -32(SI)(R11*1), X4
+ MOVOU -16(SI)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ R8, R11
JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ R8, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ DI, AX
emit_literal_done_match_emit_encodeBlockAsm4MB:
match_nolit_loop_encodeBlockAsm4MB:
- MOVL CX, DI
- SUBL SI, DI
- MOVL DI, 16(SP)
+ MOVL CX, SI
+ SUBL BX, SI
+ MOVL SI, 16(SP)
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), DI
- SUBL CX, DI
- LEAQ (DX)(CX*1), R8
- LEAQ (DX)(SI*1), SI
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), SI
+ SUBL CX, SI
+ LEAQ (DX)(CX*1), DI
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R10, R10
- CMPL DI, $0x08
+ XORL R9, R9
+ CMPL SI, $0x08
JL matchlen_match4_match_nolit_encodeBlockAsm4MB
matchlen_loopback_match_nolit_encodeBlockAsm4MB:
- MOVQ (R8)(R10*1), R9
- XORQ (SI)(R10*1), R9
- TESTQ R9, R9
+ MOVQ (DI)(R9*1), R8
+ XORQ (BX)(R9*1), R8
+ TESTQ R8, R8
JZ matchlen_loop_match_nolit_encodeBlockAsm4MB
#ifdef GOAMD64_v3
- TZCNTQ R9, R9
+ TZCNTQ R8, R8
#else
- BSFQ R9, R9
+ BSFQ R8, R8
#endif
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
+ SARQ $0x03, R8
+ LEAL (R9)(R8*1), R9
JMP match_nolit_end_encodeBlockAsm4MB
matchlen_loop_match_nolit_encodeBlockAsm4MB:
- LEAL -8(DI), DI
- LEAL 8(R10), R10
- CMPL DI, $0x08
+ LEAL -8(SI), SI
+ LEAL 8(R9), R9
+ CMPL SI, $0x08
JGE matchlen_loopback_match_nolit_encodeBlockAsm4MB
JZ match_nolit_end_encodeBlockAsm4MB
matchlen_match4_match_nolit_encodeBlockAsm4MB:
- CMPL DI, $0x04
+ CMPL SI, $0x04
JL matchlen_match2_match_nolit_encodeBlockAsm4MB
- MOVL (R8)(R10*1), R9
- CMPL (SI)(R10*1), R9
+ MOVL (DI)(R9*1), R8
+ CMPL (BX)(R9*1), R8
JNE matchlen_match2_match_nolit_encodeBlockAsm4MB
- SUBL $0x04, DI
- LEAL 4(R10), R10
+ SUBL $0x04, SI
+ LEAL 4(R9), R9
matchlen_match2_match_nolit_encodeBlockAsm4MB:
- CMPL DI, $0x02
+ CMPL SI, $0x02
JL matchlen_match1_match_nolit_encodeBlockAsm4MB
- MOVW (R8)(R10*1), R9
- CMPW (SI)(R10*1), R9
+ MOVW (DI)(R9*1), R8
+ CMPW (BX)(R9*1), R8
JNE matchlen_match1_match_nolit_encodeBlockAsm4MB
- SUBL $0x02, DI
- LEAL 2(R10), R10
+ SUBL $0x02, SI
+ LEAL 2(R9), R9
matchlen_match1_match_nolit_encodeBlockAsm4MB:
- CMPL DI, $0x01
+ CMPL SI, $0x01
JL match_nolit_end_encodeBlockAsm4MB
- MOVB (R8)(R10*1), R9
- CMPB (SI)(R10*1), R9
+ MOVB (DI)(R9*1), R8
+ CMPB (BX)(R9*1), R8
JNE match_nolit_end_encodeBlockAsm4MB
- LEAL 1(R10), R10
+ LEAL 1(R9), R9
match_nolit_end_encodeBlockAsm4MB:
- ADDL R10, CX
- MOVL 16(SP), SI
- ADDL $0x04, R10
+ ADDL R9, CX
+ MOVL 16(SP), BX
+ ADDL $0x04, R9
MOVL CX, 12(SP)
// emitCopy
- CMPL SI, $0x00010000
+ CMPL BX, $0x00010000
JL two_byte_offset_match_nolit_encodeBlockAsm4MB
-
-four_bytes_loop_back_match_nolit_encodeBlockAsm4MB:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE four_bytes_remain_match_nolit_encodeBlockAsm4MB
MOVB $0xff, (AX)
- MOVL SI, 1(AX)
- LEAL -64(R10), R10
+ MOVL BX, 1(AX)
+ LEAL -64(R9), R9
ADDQ $0x05, AX
- CMPL R10, $0x04
+ CMPL R9, $0x04
JL four_bytes_remain_match_nolit_encodeBlockAsm4MB
// emitRepeat
- MOVL R10, DI
- LEAL -4(R10), R10
- CMPL DI, $0x08
+ MOVL R9, SI
+ LEAL -4(R9), R9
+ CMPL SI, $0x08
JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
- CMPL R10, $0x00000104
+ CMPL R9, $0x00000104
JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy
- CMPL R10, $0x00010100
+ CMPL R9, $0x00010100
JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy
- LEAL -65536(R10), R10
- MOVL R10, SI
+ LEAL -65536(R9), R9
+ MOVL R9, BX
MOVW $0x001d, (AX)
- MOVW R10, 2(AX)
- SARL $0x10, SI
- MOVB SI, 4(AX)
+ MOVW R9, 2(AX)
+ SARL $0x10, BX
+ MOVB BL, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy:
- LEAL -256(R10), R10
+ LEAL -256(R9), R9
MOVW $0x0019, (AX)
- MOVW R10, 2(AX)
+ MOVW R9, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy:
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
MOVW $0x0015, (AX)
- MOVB R10, 2(AX)
+ MOVB R9, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy:
- SHLL $0x02, R10
- ORL $0x01, R10
- MOVW R10, (AX)
+ SHLL $0x02, R9
+ ORL $0x01, R9
+ MOVW R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
- XORQ DI, DI
- LEAL 1(DI)(R10*4), R10
- MOVB SI, 1(AX)
- SARL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ XORQ SI, SI
+ LEAL 1(SI)(R9*4), R9
+ MOVB BL, 1(AX)
+ SARL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, R9
+ MOVB R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
- JMP four_bytes_loop_back_match_nolit_encodeBlockAsm4MB
four_bytes_remain_match_nolit_encodeBlockAsm4MB:
- TESTL R10, R10
+ TESTL R9, R9
JZ match_nolit_emitcopy_end_encodeBlockAsm4MB
- MOVB $0x03, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVL SI, 1(AX)
+ XORL SI, SI
+ LEAL -1(SI)(R9*4), R9
+ MOVB R9, (AX)
+ MOVL BX, 1(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
two_byte_offset_match_nolit_encodeBlockAsm4MB:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE two_byte_offset_short_match_nolit_encodeBlockAsm4MB
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JAE long_offset_short_match_nolit_encodeBlockAsm4MB
- MOVL $0x00000001, DI
- LEAL 16(DI), DI
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, DI
- MOVB DI, (AX)
+ MOVL $0x00000001, SI
+ LEAL 16(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
- SUBL $0x08, R10
+ SUBL $0x08, R9
// emitRepeat
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
- MOVL R10, DI
- LEAL -4(R10), R10
- CMPL DI, $0x08
+ MOVL R9, SI
+ LEAL -4(R9), R9
+ CMPL SI, $0x08
JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
- CMPL R10, $0x00000104
+ CMPL R9, $0x00000104
JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
- CMPL R10, $0x00010100
+ CMPL R9, $0x00010100
JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
- LEAL -65536(R10), R10
- MOVL R10, SI
+ LEAL -65536(R9), R9
+ MOVL R9, BX
MOVW $0x001d, (AX)
- MOVW R10, 2(AX)
- SARL $0x10, SI
- MOVB SI, 4(AX)
+ MOVW R9, 2(AX)
+ SARL $0x10, BX
+ MOVB BL, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
- LEAL -256(R10), R10
+ LEAL -256(R9), R9
MOVW $0x0019, (AX)
- MOVW R10, 2(AX)
+ MOVW R9, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
MOVW $0x0015, (AX)
- MOVB R10, 2(AX)
+ MOVB R9, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
- SHLL $0x02, R10
- ORL $0x01, R10
- MOVW R10, (AX)
+ SHLL $0x02, R9
+ ORL $0x01, R9
+ MOVW R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
- XORQ DI, DI
- LEAL 1(DI)(R10*4), R10
- MOVB SI, 1(AX)
- SARL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ XORQ SI, SI
+ LEAL 1(SI)(R9*4), R9
+ MOVB BL, 1(AX)
+ SARL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, R9
+ MOVB R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
long_offset_short_match_nolit_encodeBlockAsm4MB:
MOVB $0xee, (AX)
- MOVW SI, 1(AX)
- LEAL -60(R10), R10
+ MOVW BX, 1(AX)
+ LEAL -60(R9), R9
ADDQ $0x03, AX
// emitRepeat
- MOVL R10, DI
- LEAL -4(R10), R10
- CMPL DI, $0x08
+ MOVL R9, SI
+ LEAL -4(R9), R9
+ CMPL SI, $0x08
JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
- CMPL R10, $0x00000104
+ CMPL R9, $0x00000104
JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short
- CMPL R10, $0x00010100
+ CMPL R9, $0x00010100
JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short
- LEAL -65536(R10), R10
- MOVL R10, SI
+ LEAL -65536(R9), R9
+ MOVL R9, BX
MOVW $0x001d, (AX)
- MOVW R10, 2(AX)
- SARL $0x10, SI
- MOVB SI, 4(AX)
+ MOVW R9, 2(AX)
+ SARL $0x10, BX
+ MOVB BL, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short:
- LEAL -256(R10), R10
+ LEAL -256(R9), R9
MOVW $0x0019, (AX)
- MOVW R10, 2(AX)
+ MOVW R9, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short:
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
MOVW $0x0015, (AX)
- MOVB R10, 2(AX)
+ MOVB R9, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short:
- SHLL $0x02, R10
- ORL $0x01, R10
- MOVW R10, (AX)
+ SHLL $0x02, R9
+ ORL $0x01, R9
+ MOVW R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
- XORQ DI, DI
- LEAL 1(DI)(R10*4), R10
- MOVB SI, 1(AX)
- SARL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ XORQ SI, SI
+ LEAL 1(SI)(R9*4), R9
+ MOVB BL, 1(AX)
+ SARL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, R9
+ MOVB R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
- JMP two_byte_offset_match_nolit_encodeBlockAsm4MB
two_byte_offset_short_match_nolit_encodeBlockAsm4MB:
- CMPL R10, $0x0c
+ MOVL R9, SI
+ SHLL $0x02, SI
+ CMPL R9, $0x0c
JGE emit_copy_three_match_nolit_encodeBlockAsm4MB
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JGE emit_copy_three_match_nolit_encodeBlockAsm4MB
- MOVB $0x01, BL
- LEAL -16(BX)(R10*4), R10
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ LEAL -15(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
emit_copy_three_match_nolit_encodeBlockAsm4MB:
- MOVB $0x02, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVW SI, 1(AX)
+ LEAL -2(SI), SI
+ MOVB SI, (AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeBlockAsm4MB:
CMPL CX, 8(SP)
JGE emit_remainder_encodeBlockAsm4MB
- MOVQ -2(DX)(CX*1), DI
+ MOVQ -2(DX)(CX*1), SI
CMPQ AX, (SP)
JL match_nolit_dst_ok_encodeBlockAsm4MB
MOVQ $0x00000000, ret+48(FP)
RET
match_nolit_dst_ok_encodeBlockAsm4MB:
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ DI, R8
- SHRQ $0x10, DI
- MOVQ DI, SI
- SHLQ $0x10, R8
- IMULQ R9, R8
- SHRQ $0x32, R8
- SHLQ $0x10, SI
- IMULQ R9, SI
- SHRQ $0x32, SI
- LEAL -2(CX), R9
- LEAQ 24(SP)(SI*4), R10
- MOVL (R10), SI
- MOVL R9, 24(SP)(R8*4)
- MOVL CX, (R10)
- CMPL (DX)(SI*1), DI
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ SI, DI
+ SHRQ $0x10, SI
+ MOVQ SI, BX
+ SHLQ $0x10, DI
+ IMULQ R8, DI
+ SHRQ $0x32, DI
+ SHLQ $0x10, BX
+ IMULQ R8, BX
+ SHRQ $0x32, BX
+ LEAL -2(CX), R8
+ LEAQ 24(SP)(BX*4), R9
+ MOVL (R9), BX
+ MOVL R8, 24(SP)(DI*4)
+ MOVL CX, (R9)
+ CMPL (DX)(BX*1), SI
JEQ match_nolit_loop_encodeBlockAsm4MB
INCL CX
JMP search_loop_encodeBlockAsm4MB
@@ -2703,8 +2680,8 @@ zero_loop_encodeBlockAsm12B:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -2714,428 +2691,426 @@ zero_loop_encodeBlockAsm12B:
MOVQ src_base+24(FP), DX
search_loop_encodeBlockAsm12B:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x05, SI
- LEAL 4(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x05, BX
+ LEAL 4(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeBlockAsm12B
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x000000cf1bbcdcbb, R9
- MOVQ DI, R10
- MOVQ DI, R11
- SHRQ $0x08, R11
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x000000cf1bbcdcbb, R8
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHRQ $0x08, R10
+ SHLQ $0x18, R9
+ IMULQ R8, R9
+ SHRQ $0x34, R9
SHLQ $0x18, R10
- IMULQ R9, R10
+ IMULQ R8, R10
SHRQ $0x34, R10
- SHLQ $0x18, R11
- IMULQ R9, R11
- SHRQ $0x34, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 24(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- LEAL 1(CX), R10
- MOVL R10, 24(SP)(R11*4)
- MOVQ DI, R10
- SHRQ $0x10, R10
- SHLQ $0x18, R10
- IMULQ R9, R10
- SHRQ $0x34, R10
- MOVL CX, R9
- SUBL 16(SP), R9
- MOVL 1(DX)(R9*1), R11
- MOVQ DI, R9
- SHRQ $0x08, R9
- CMPL R9, R11
+ MOVL 24(SP)(R9*4), BX
+ MOVL 24(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ LEAL 1(CX), R9
+ MOVL R9, 24(SP)(R10*4)
+ MOVQ SI, R9
+ SHRQ $0x10, R9
+ SHLQ $0x18, R9
+ IMULQ R8, R9
+ SHRQ $0x34, R9
+ MOVL CX, R8
+ SUBL 16(SP), R8
+ MOVL 1(DX)(R8*1), R10
+ MOVQ SI, R8
+ SHRQ $0x08, R8
+ CMPL R8, R10
JNE no_repeat_found_encodeBlockAsm12B
- LEAL 1(CX), DI
- MOVL 12(SP), R8
- MOVL DI, SI
- SUBL 16(SP), SI
+ LEAL 1(CX), SI
+ MOVL 12(SP), DI
+ MOVL SI, BX
+ SUBL 16(SP), BX
JZ repeat_extend_back_end_encodeBlockAsm12B
repeat_extend_back_loop_encodeBlockAsm12B:
- CMPL DI, R8
+ CMPL SI, DI
JLE repeat_extend_back_end_encodeBlockAsm12B
- MOVB -1(DX)(SI*1), BL
- MOVB -1(DX)(DI*1), R9
- CMPB BL, R9
+ MOVB -1(DX)(BX*1), R8
+ MOVB -1(DX)(SI*1), R9
+ CMPB R8, R9
JNE repeat_extend_back_end_encodeBlockAsm12B
- LEAL -1(DI), DI
- DECL SI
+ LEAL -1(SI), SI
+ DECL BX
JNZ repeat_extend_back_loop_encodeBlockAsm12B
repeat_extend_back_end_encodeBlockAsm12B:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_repeat_emit_encodeBlockAsm12B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_repeat_emit_encodeBlockAsm12B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_repeat_emit_encodeBlockAsm12B
two_bytes_repeat_emit_encodeBlockAsm12B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_repeat_emit_encodeBlockAsm12B
JMP memmove_long_repeat_emit_encodeBlockAsm12B
one_byte_repeat_emit_encodeBlockAsm12B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_repeat_emit_encodeBlockAsm12B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32
JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64
emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8:
- MOVQ (R10), R11
- MOVQ R11, (AX)
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_repeat_emit_encodeBlockAsm12B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_repeat_emit_encodeBlockAsm12B
memmove_long_repeat_emit_encodeBlockAsm12B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R12
- SHRQ $0x05, R12
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R13
- SUBQ R11, R13
- DECQ R12
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
- LEAQ -32(R10)(R13*1), R11
- LEAQ -32(AX)(R13*1), R14
+ LEAQ -32(R9)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R14)
- MOVOA X5, 16(R14)
- ADDQ $0x20, R14
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
ADDQ $0x20, R13
- DECQ R12
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back
emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
- MOVOU -32(R10)(R13*1), X4
- MOVOU -16(R10)(R13*1), X5
- MOVOA X4, -32(AX)(R13*1)
- MOVOA X5, -16(AX)(R13*1)
- ADDQ $0x20, R13
- CMPQ R9, R13
+ MOVOU -32(R9)(R12*1), X4
+ MOVOU -16(R9)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R8, R12
JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_repeat_emit_encodeBlockAsm12B:
ADDL $0x05, CX
- MOVL CX, SI
- SUBL 16(SP), SI
- MOVQ src_len+32(FP), R9
- SUBL CX, R9
- LEAQ (DX)(CX*1), R10
- LEAQ (DX)(SI*1), SI
+ MOVL CX, BX
+ SUBL 16(SP), BX
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R12, R12
- CMPL R9, $0x08
+ XORL R11, R11
+ CMPL R8, $0x08
JL matchlen_match4_repeat_extend_encodeBlockAsm12B
matchlen_loopback_repeat_extend_encodeBlockAsm12B:
- MOVQ (R10)(R12*1), R11
- XORQ (SI)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R9)(R11*1), R10
+ XORQ (BX)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_repeat_extend_encodeBlockAsm12B
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP repeat_extend_forward_end_encodeBlockAsm12B
matchlen_loop_repeat_extend_encodeBlockAsm12B:
- LEAL -8(R9), R9
- LEAL 8(R12), R12
- CMPL R9, $0x08
+ LEAL -8(R8), R8
+ LEAL 8(R11), R11
+ CMPL R8, $0x08
JGE matchlen_loopback_repeat_extend_encodeBlockAsm12B
JZ repeat_extend_forward_end_encodeBlockAsm12B
matchlen_match4_repeat_extend_encodeBlockAsm12B:
- CMPL R9, $0x04
+ CMPL R8, $0x04
JL matchlen_match2_repeat_extend_encodeBlockAsm12B
- MOVL (R10)(R12*1), R11
- CMPL (SI)(R12*1), R11
+ MOVL (R9)(R11*1), R10
+ CMPL (BX)(R11*1), R10
JNE matchlen_match2_repeat_extend_encodeBlockAsm12B
- SUBL $0x04, R9
- LEAL 4(R12), R12
+ SUBL $0x04, R8
+ LEAL 4(R11), R11
matchlen_match2_repeat_extend_encodeBlockAsm12B:
- CMPL R9, $0x02
+ CMPL R8, $0x02
JL matchlen_match1_repeat_extend_encodeBlockAsm12B
- MOVW (R10)(R12*1), R11
- CMPW (SI)(R12*1), R11
+ MOVW (R9)(R11*1), R10
+ CMPW (BX)(R11*1), R10
JNE matchlen_match1_repeat_extend_encodeBlockAsm12B
- SUBL $0x02, R9
- LEAL 2(R12), R12
+ SUBL $0x02, R8
+ LEAL 2(R11), R11
matchlen_match1_repeat_extend_encodeBlockAsm12B:
- CMPL R9, $0x01
+ CMPL R8, $0x01
JL repeat_extend_forward_end_encodeBlockAsm12B
- MOVB (R10)(R12*1), R11
- CMPB (SI)(R12*1), R11
+ MOVB (R9)(R11*1), R10
+ CMPB (BX)(R11*1), R10
JNE repeat_extend_forward_end_encodeBlockAsm12B
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
repeat_extend_forward_end_encodeBlockAsm12B:
- ADDL R12, CX
- MOVL CX, SI
- SUBL DI, SI
- MOVL 16(SP), DI
- TESTL R8, R8
+ ADDL R11, CX
+ MOVL CX, BX
+ SUBL SI, BX
+ MOVL 16(SP), SI
+ TESTL DI, DI
JZ repeat_as_copy_encodeBlockAsm12B
// emitRepeat
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_match_repeat_encodeBlockAsm12B
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_match_repeat_encodeBlockAsm12B
cant_repeat_two_offset_match_repeat_encodeBlockAsm12B:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_match_repeat_encodeBlockAsm12B
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm12B
repeat_three_match_repeat_encodeBlockAsm12B:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm12B
repeat_two_match_repeat_encodeBlockAsm12B:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm12B
repeat_two_offset_match_repeat_encodeBlockAsm12B:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm12B
repeat_as_copy_encodeBlockAsm12B:
// emitCopy
-two_byte_offset_repeat_as_copy_encodeBlockAsm12B:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JAE long_offset_short_repeat_as_copy_encodeBlockAsm12B
- MOVL $0x00000001, R8
- LEAL 16(R8), R8
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, R8
- MOVB R8, (AX)
+ MOVL $0x00000001, DI
+ LEAL 16(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
- SUBL $0x08, SI
+ SUBL $0x08, BX
// emitRepeat
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm12B
repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm12B
repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm12B
repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm12B
long_offset_short_repeat_as_copy_encodeBlockAsm12B:
MOVB $0xee, (AX)
- MOVW DI, 1(AX)
- LEAL -60(SI), SI
+ MOVW SI, 1(AX)
+ LEAL -60(BX), BX
ADDQ $0x03, AX
// emitRepeat
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm12B
repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm12B
repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm12B
repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm12B
- JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12B
two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B:
- CMPL SI, $0x0c
+ MOVL BX, DI
+ SHLL $0x02, DI
+ CMPL BX, $0x0c
JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B
- MOVB $0x01, BL
- LEAL -16(BX)(SI*4), SI
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ LEAL -15(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm12B
emit_copy_three_repeat_as_copy_encodeBlockAsm12B:
- MOVB $0x02, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVW DI, 1(AX)
+ LEAL -2(DI), DI
+ MOVB DI, (AX)
+ MOVW SI, 1(AX)
ADDQ $0x03, AX
repeat_end_emit_encodeBlockAsm12B:
@@ -3143,16 +3118,16 @@ repeat_end_emit_encodeBlockAsm12B:
JMP search_loop_encodeBlockAsm12B
no_repeat_found_encodeBlockAsm12B:
- CMPL (DX)(SI*1), DI
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeBlockAsm12B
- SHRQ $0x08, DI
- MOVL 24(SP)(R10*4), SI
- LEAL 2(CX), R9
- CMPL (DX)(R8*1), DI
+ SHRQ $0x08, SI
+ MOVL 24(SP)(R9*4), BX
+ LEAL 2(CX), R8
+ CMPL (DX)(DI*1), SI
JEQ candidate2_match_encodeBlockAsm12B
- MOVL R9, 24(SP)(R10*4)
- SHRQ $0x08, DI
- CMPL (DX)(SI*1), DI
+ MOVL R8, 24(SP)(R9*4)
+ SHRQ $0x08, SI
+ CMPL (DX)(BX*1), SI
JEQ candidate3_match_encodeBlockAsm12B
MOVL 20(SP), CX
JMP search_loop_encodeBlockAsm12B
@@ -3162,391 +3137,389 @@ candidate3_match_encodeBlockAsm12B:
JMP candidate_match_encodeBlockAsm12B
candidate2_match_encodeBlockAsm12B:
- MOVL R9, 24(SP)(R10*4)
+ MOVL R8, 24(SP)(R9*4)
INCL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeBlockAsm12B:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeBlockAsm12B
match_extend_back_loop_encodeBlockAsm12B:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeBlockAsm12B
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeBlockAsm12B
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeBlockAsm12B
JMP match_extend_back_loop_encodeBlockAsm12B
match_extend_back_end_encodeBlockAsm12B:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeBlockAsm12B
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeBlockAsm12B:
- MOVL CX, DI
- MOVL 12(SP), R8
- CMPL R8, DI
+ MOVL CX, SI
+ MOVL 12(SP), DI
+ CMPL DI, SI
JEQ emit_literal_done_match_emit_encodeBlockAsm12B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(R8*1), DI
- SUBL R8, R9
- LEAL -1(R9), R8
- CMPL R8, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(DI*1), SI
+ SUBL DI, R8
+ LEAL -1(R8), DI
+ CMPL DI, $0x3c
JLT one_byte_match_emit_encodeBlockAsm12B
- CMPL R8, $0x00000100
+ CMPL DI, $0x00000100
JLT two_bytes_match_emit_encodeBlockAsm12B
MOVB $0xf4, (AX)
- MOVW R8, 1(AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeBlockAsm12B
two_bytes_match_emit_encodeBlockAsm12B:
MOVB $0xf0, (AX)
- MOVB R8, 1(AX)
+ MOVB DI, 1(AX)
ADDQ $0x02, AX
- CMPL R8, $0x40
+ CMPL DI, $0x40
JL memmove_match_emit_encodeBlockAsm12B
JMP memmove_long_match_emit_encodeBlockAsm12B
one_byte_match_emit_encodeBlockAsm12B:
- SHLB $0x02, R8
- MOVB R8, (AX)
+ SHLB $0x02, DI
+ MOVB DI, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeBlockAsm12B:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64
emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8:
- MOVQ (DI), R10
- MOVQ R10, (AX)
+ MOVQ (SI), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_match_emit_encodeBlockAsm12B
emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16:
- MOVQ (DI), R10
- MOVQ -8(DI)(R9*1), DI
- MOVQ R10, (AX)
- MOVQ DI, -8(AX)(R9*1)
+ MOVQ (SI), R9
+ MOVQ -8(SI)(R8*1), SI
+ MOVQ R9, (AX)
+ MOVQ SI, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBlockAsm12B
emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32:
- MOVOU (DI), X0
- MOVOU -16(DI)(R9*1), X1
+ MOVOU (SI), X0
+ MOVOU -16(SI)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBlockAsm12B
emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64:
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeBlockAsm12B:
- MOVQ R8, AX
+ MOVQ DI, AX
JMP emit_literal_done_match_emit_encodeBlockAsm12B
memmove_long_match_emit_encodeBlockAsm12B:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveLong
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
- MOVQ R9, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
+ MOVQ R8, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
- LEAQ -32(DI)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(SI)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
- MOVOU -32(DI)(R12*1), X4
- MOVOU -16(DI)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R9, R12
+ MOVOU -32(SI)(R11*1), X4
+ MOVOU -16(SI)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ R8, R11
JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ R8, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ DI, AX
emit_literal_done_match_emit_encodeBlockAsm12B:
match_nolit_loop_encodeBlockAsm12B:
- MOVL CX, DI
- SUBL SI, DI
- MOVL DI, 16(SP)
+ MOVL CX, SI
+ SUBL BX, SI
+ MOVL SI, 16(SP)
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), DI
- SUBL CX, DI
- LEAQ (DX)(CX*1), R8
- LEAQ (DX)(SI*1), SI
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), SI
+ SUBL CX, SI
+ LEAQ (DX)(CX*1), DI
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R10, R10
- CMPL DI, $0x08
+ XORL R9, R9
+ CMPL SI, $0x08
JL matchlen_match4_match_nolit_encodeBlockAsm12B
matchlen_loopback_match_nolit_encodeBlockAsm12B:
- MOVQ (R8)(R10*1), R9
- XORQ (SI)(R10*1), R9
- TESTQ R9, R9
+ MOVQ (DI)(R9*1), R8
+ XORQ (BX)(R9*1), R8
+ TESTQ R8, R8
JZ matchlen_loop_match_nolit_encodeBlockAsm12B
#ifdef GOAMD64_v3
- TZCNTQ R9, R9
+ TZCNTQ R8, R8
#else
- BSFQ R9, R9
+ BSFQ R8, R8
#endif
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
+ SARQ $0x03, R8
+ LEAL (R9)(R8*1), R9
JMP match_nolit_end_encodeBlockAsm12B
matchlen_loop_match_nolit_encodeBlockAsm12B:
- LEAL -8(DI), DI
- LEAL 8(R10), R10
- CMPL DI, $0x08
+ LEAL -8(SI), SI
+ LEAL 8(R9), R9
+ CMPL SI, $0x08
JGE matchlen_loopback_match_nolit_encodeBlockAsm12B
JZ match_nolit_end_encodeBlockAsm12B
matchlen_match4_match_nolit_encodeBlockAsm12B:
- CMPL DI, $0x04
+ CMPL SI, $0x04
JL matchlen_match2_match_nolit_encodeBlockAsm12B
- MOVL (R8)(R10*1), R9
- CMPL (SI)(R10*1), R9
+ MOVL (DI)(R9*1), R8
+ CMPL (BX)(R9*1), R8
JNE matchlen_match2_match_nolit_encodeBlockAsm12B
- SUBL $0x04, DI
- LEAL 4(R10), R10
+ SUBL $0x04, SI
+ LEAL 4(R9), R9
matchlen_match2_match_nolit_encodeBlockAsm12B:
- CMPL DI, $0x02
+ CMPL SI, $0x02
JL matchlen_match1_match_nolit_encodeBlockAsm12B
- MOVW (R8)(R10*1), R9
- CMPW (SI)(R10*1), R9
+ MOVW (DI)(R9*1), R8
+ CMPW (BX)(R9*1), R8
JNE matchlen_match1_match_nolit_encodeBlockAsm12B
- SUBL $0x02, DI
- LEAL 2(R10), R10
+ SUBL $0x02, SI
+ LEAL 2(R9), R9
matchlen_match1_match_nolit_encodeBlockAsm12B:
- CMPL DI, $0x01
+ CMPL SI, $0x01
JL match_nolit_end_encodeBlockAsm12B
- MOVB (R8)(R10*1), R9
- CMPB (SI)(R10*1), R9
+ MOVB (DI)(R9*1), R8
+ CMPB (BX)(R9*1), R8
JNE match_nolit_end_encodeBlockAsm12B
- LEAL 1(R10), R10
+ LEAL 1(R9), R9
match_nolit_end_encodeBlockAsm12B:
- ADDL R10, CX
- MOVL 16(SP), SI
- ADDL $0x04, R10
+ ADDL R9, CX
+ MOVL 16(SP), BX
+ ADDL $0x04, R9
MOVL CX, 12(SP)
// emitCopy
-two_byte_offset_match_nolit_encodeBlockAsm12B:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JAE long_offset_short_match_nolit_encodeBlockAsm12B
- MOVL $0x00000001, DI
- LEAL 16(DI), DI
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, DI
- MOVB DI, (AX)
+ MOVL $0x00000001, SI
+ LEAL 16(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
- SUBL $0x08, R10
+ SUBL $0x08, R9
// emitRepeat
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
- MOVL R10, DI
- LEAL -4(R10), R10
- CMPL DI, $0x08
+ MOVL R9, SI
+ LEAL -4(R9), R9
+ CMPL SI, $0x08
JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
- CMPL R10, $0x00000104
+ CMPL R9, $0x00000104
JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
- LEAL -256(R10), R10
+ LEAL -256(R9), R9
MOVW $0x0019, (AX)
- MOVW R10, 2(AX)
+ MOVW R9, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm12B
repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
MOVW $0x0015, (AX)
- MOVB R10, 2(AX)
+ MOVB R9, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm12B
repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
- SHLL $0x02, R10
- ORL $0x01, R10
- MOVW R10, (AX)
+ SHLL $0x02, R9
+ ORL $0x01, R9
+ MOVW R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm12B
repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
- XORQ DI, DI
- LEAL 1(DI)(R10*4), R10
- MOVB SI, 1(AX)
- SARL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ XORQ SI, SI
+ LEAL 1(SI)(R9*4), R9
+ MOVB BL, 1(AX)
+ SARL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, R9
+ MOVB R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm12B
long_offset_short_match_nolit_encodeBlockAsm12B:
MOVB $0xee, (AX)
- MOVW SI, 1(AX)
- LEAL -60(R10), R10
+ MOVW BX, 1(AX)
+ LEAL -60(R9), R9
ADDQ $0x03, AX
// emitRepeat
- MOVL R10, DI
- LEAL -4(R10), R10
- CMPL DI, $0x08
+ MOVL R9, SI
+ LEAL -4(R9), R9
+ CMPL SI, $0x08
JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
- CMPL R10, $0x00000104
+ CMPL R9, $0x00000104
JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short
- LEAL -256(R10), R10
+ LEAL -256(R9), R9
MOVW $0x0019, (AX)
- MOVW R10, 2(AX)
+ MOVW R9, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm12B
repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short:
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
MOVW $0x0015, (AX)
- MOVB R10, 2(AX)
+ MOVB R9, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm12B
repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short:
- SHLL $0x02, R10
- ORL $0x01, R10
- MOVW R10, (AX)
+ SHLL $0x02, R9
+ ORL $0x01, R9
+ MOVW R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm12B
repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
- XORQ DI, DI
- LEAL 1(DI)(R10*4), R10
- MOVB SI, 1(AX)
- SARL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ XORQ SI, SI
+ LEAL 1(SI)(R9*4), R9
+ MOVB BL, 1(AX)
+ SARL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, R9
+ MOVB R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm12B
- JMP two_byte_offset_match_nolit_encodeBlockAsm12B
two_byte_offset_short_match_nolit_encodeBlockAsm12B:
- CMPL R10, $0x0c
+ MOVL R9, SI
+ SHLL $0x02, SI
+ CMPL R9, $0x0c
JGE emit_copy_three_match_nolit_encodeBlockAsm12B
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JGE emit_copy_three_match_nolit_encodeBlockAsm12B
- MOVB $0x01, BL
- LEAL -16(BX)(R10*4), R10
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ LEAL -15(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm12B
emit_copy_three_match_nolit_encodeBlockAsm12B:
- MOVB $0x02, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVW SI, 1(AX)
+ LEAL -2(SI), SI
+ MOVB SI, (AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeBlockAsm12B:
CMPL CX, 8(SP)
JGE emit_remainder_encodeBlockAsm12B
- MOVQ -2(DX)(CX*1), DI
+ MOVQ -2(DX)(CX*1), SI
CMPQ AX, (SP)
JL match_nolit_dst_ok_encodeBlockAsm12B
MOVQ $0x00000000, ret+48(FP)
RET
match_nolit_dst_ok_encodeBlockAsm12B:
- MOVQ $0x000000cf1bbcdcbb, R9
- MOVQ DI, R8
- SHRQ $0x10, DI
- MOVQ DI, SI
- SHLQ $0x18, R8
- IMULQ R9, R8
- SHRQ $0x34, R8
- SHLQ $0x18, SI
- IMULQ R9, SI
- SHRQ $0x34, SI
- LEAL -2(CX), R9
- LEAQ 24(SP)(SI*4), R10
- MOVL (R10), SI
- MOVL R9, 24(SP)(R8*4)
- MOVL CX, (R10)
- CMPL (DX)(SI*1), DI
+ MOVQ $0x000000cf1bbcdcbb, R8
+ MOVQ SI, DI
+ SHRQ $0x10, SI
+ MOVQ SI, BX
+ SHLQ $0x18, DI
+ IMULQ R8, DI
+ SHRQ $0x34, DI
+ SHLQ $0x18, BX
+ IMULQ R8, BX
+ SHRQ $0x34, BX
+ LEAL -2(CX), R8
+ LEAQ 24(SP)(BX*4), R9
+ MOVL (R9), BX
+ MOVL R8, 24(SP)(DI*4)
+ MOVL CX, (R9)
+ CMPL (DX)(BX*1), SI
JEQ match_nolit_loop_encodeBlockAsm12B
INCL CX
JMP search_loop_encodeBlockAsm12B
@@ -3731,8 +3704,8 @@ zero_loop_encodeBlockAsm10B:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -3742,428 +3715,426 @@ zero_loop_encodeBlockAsm10B:
MOVQ src_base+24(FP), DX
search_loop_encodeBlockAsm10B:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x05, SI
- LEAL 4(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x05, BX
+ LEAL 4(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeBlockAsm10B
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x9e3779b1, R9
- MOVQ DI, R10
- MOVQ DI, R11
- SHRQ $0x08, R11
- SHLQ $0x20, R10
- IMULQ R9, R10
- SHRQ $0x36, R10
- SHLQ $0x20, R11
- IMULQ R9, R11
- SHRQ $0x36, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 24(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- LEAL 1(CX), R10
- MOVL R10, 24(SP)(R11*4)
- MOVQ DI, R10
- SHRQ $0x10, R10
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x9e3779b1, R8
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHRQ $0x08, R10
+ SHLQ $0x20, R9
+ IMULQ R8, R9
+ SHRQ $0x36, R9
SHLQ $0x20, R10
- IMULQ R9, R10
+ IMULQ R8, R10
SHRQ $0x36, R10
- MOVL CX, R9
- SUBL 16(SP), R9
- MOVL 1(DX)(R9*1), R11
- MOVQ DI, R9
- SHRQ $0x08, R9
- CMPL R9, R11
+ MOVL 24(SP)(R9*4), BX
+ MOVL 24(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ LEAL 1(CX), R9
+ MOVL R9, 24(SP)(R10*4)
+ MOVQ SI, R9
+ SHRQ $0x10, R9
+ SHLQ $0x20, R9
+ IMULQ R8, R9
+ SHRQ $0x36, R9
+ MOVL CX, R8
+ SUBL 16(SP), R8
+ MOVL 1(DX)(R8*1), R10
+ MOVQ SI, R8
+ SHRQ $0x08, R8
+ CMPL R8, R10
JNE no_repeat_found_encodeBlockAsm10B
- LEAL 1(CX), DI
- MOVL 12(SP), R8
- MOVL DI, SI
- SUBL 16(SP), SI
+ LEAL 1(CX), SI
+ MOVL 12(SP), DI
+ MOVL SI, BX
+ SUBL 16(SP), BX
JZ repeat_extend_back_end_encodeBlockAsm10B
repeat_extend_back_loop_encodeBlockAsm10B:
- CMPL DI, R8
+ CMPL SI, DI
JLE repeat_extend_back_end_encodeBlockAsm10B
- MOVB -1(DX)(SI*1), BL
- MOVB -1(DX)(DI*1), R9
- CMPB BL, R9
+ MOVB -1(DX)(BX*1), R8
+ MOVB -1(DX)(SI*1), R9
+ CMPB R8, R9
JNE repeat_extend_back_end_encodeBlockAsm10B
- LEAL -1(DI), DI
- DECL SI
+ LEAL -1(SI), SI
+ DECL BX
JNZ repeat_extend_back_loop_encodeBlockAsm10B
repeat_extend_back_end_encodeBlockAsm10B:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_repeat_emit_encodeBlockAsm10B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_repeat_emit_encodeBlockAsm10B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_repeat_emit_encodeBlockAsm10B
two_bytes_repeat_emit_encodeBlockAsm10B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_repeat_emit_encodeBlockAsm10B
JMP memmove_long_repeat_emit_encodeBlockAsm10B
one_byte_repeat_emit_encodeBlockAsm10B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_repeat_emit_encodeBlockAsm10B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32
JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64
emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8:
- MOVQ (R10), R11
- MOVQ R11, (AX)
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_repeat_emit_encodeBlockAsm10B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_repeat_emit_encodeBlockAsm10B
memmove_long_repeat_emit_encodeBlockAsm10B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R12
- SHRQ $0x05, R12
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R13
- SUBQ R11, R13
- DECQ R12
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
- LEAQ -32(R10)(R13*1), R11
- LEAQ -32(AX)(R13*1), R14
+ LEAQ -32(R9)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R14)
- MOVOA X5, 16(R14)
- ADDQ $0x20, R14
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
ADDQ $0x20, R13
- DECQ R12
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back
emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
- MOVOU -32(R10)(R13*1), X4
- MOVOU -16(R10)(R13*1), X5
- MOVOA X4, -32(AX)(R13*1)
- MOVOA X5, -16(AX)(R13*1)
- ADDQ $0x20, R13
- CMPQ R9, R13
+ MOVOU -32(R9)(R12*1), X4
+ MOVOU -16(R9)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R8, R12
JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_repeat_emit_encodeBlockAsm10B:
ADDL $0x05, CX
- MOVL CX, SI
- SUBL 16(SP), SI
- MOVQ src_len+32(FP), R9
- SUBL CX, R9
- LEAQ (DX)(CX*1), R10
- LEAQ (DX)(SI*1), SI
+ MOVL CX, BX
+ SUBL 16(SP), BX
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R12, R12
- CMPL R9, $0x08
+ XORL R11, R11
+ CMPL R8, $0x08
JL matchlen_match4_repeat_extend_encodeBlockAsm10B
matchlen_loopback_repeat_extend_encodeBlockAsm10B:
- MOVQ (R10)(R12*1), R11
- XORQ (SI)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R9)(R11*1), R10
+ XORQ (BX)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_repeat_extend_encodeBlockAsm10B
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP repeat_extend_forward_end_encodeBlockAsm10B
matchlen_loop_repeat_extend_encodeBlockAsm10B:
- LEAL -8(R9), R9
- LEAL 8(R12), R12
- CMPL R9, $0x08
+ LEAL -8(R8), R8
+ LEAL 8(R11), R11
+ CMPL R8, $0x08
JGE matchlen_loopback_repeat_extend_encodeBlockAsm10B
JZ repeat_extend_forward_end_encodeBlockAsm10B
matchlen_match4_repeat_extend_encodeBlockAsm10B:
- CMPL R9, $0x04
+ CMPL R8, $0x04
JL matchlen_match2_repeat_extend_encodeBlockAsm10B
- MOVL (R10)(R12*1), R11
- CMPL (SI)(R12*1), R11
+ MOVL (R9)(R11*1), R10
+ CMPL (BX)(R11*1), R10
JNE matchlen_match2_repeat_extend_encodeBlockAsm10B
- SUBL $0x04, R9
- LEAL 4(R12), R12
+ SUBL $0x04, R8
+ LEAL 4(R11), R11
matchlen_match2_repeat_extend_encodeBlockAsm10B:
- CMPL R9, $0x02
+ CMPL R8, $0x02
JL matchlen_match1_repeat_extend_encodeBlockAsm10B
- MOVW (R10)(R12*1), R11
- CMPW (SI)(R12*1), R11
+ MOVW (R9)(R11*1), R10
+ CMPW (BX)(R11*1), R10
JNE matchlen_match1_repeat_extend_encodeBlockAsm10B
- SUBL $0x02, R9
- LEAL 2(R12), R12
+ SUBL $0x02, R8
+ LEAL 2(R11), R11
matchlen_match1_repeat_extend_encodeBlockAsm10B:
- CMPL R9, $0x01
+ CMPL R8, $0x01
JL repeat_extend_forward_end_encodeBlockAsm10B
- MOVB (R10)(R12*1), R11
- CMPB (SI)(R12*1), R11
+ MOVB (R9)(R11*1), R10
+ CMPB (BX)(R11*1), R10
JNE repeat_extend_forward_end_encodeBlockAsm10B
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
repeat_extend_forward_end_encodeBlockAsm10B:
- ADDL R12, CX
- MOVL CX, SI
- SUBL DI, SI
- MOVL 16(SP), DI
- TESTL R8, R8
+ ADDL R11, CX
+ MOVL CX, BX
+ SUBL SI, BX
+ MOVL 16(SP), SI
+ TESTL DI, DI
JZ repeat_as_copy_encodeBlockAsm10B
// emitRepeat
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_match_repeat_encodeBlockAsm10B
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_match_repeat_encodeBlockAsm10B
cant_repeat_two_offset_match_repeat_encodeBlockAsm10B:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_match_repeat_encodeBlockAsm10B
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm10B
repeat_three_match_repeat_encodeBlockAsm10B:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm10B
repeat_two_match_repeat_encodeBlockAsm10B:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm10B
repeat_two_offset_match_repeat_encodeBlockAsm10B:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm10B
repeat_as_copy_encodeBlockAsm10B:
// emitCopy
-two_byte_offset_repeat_as_copy_encodeBlockAsm10B:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JAE long_offset_short_repeat_as_copy_encodeBlockAsm10B
- MOVL $0x00000001, R8
- LEAL 16(R8), R8
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, R8
- MOVB R8, (AX)
+ MOVL $0x00000001, DI
+ LEAL 16(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
- SUBL $0x08, SI
+ SUBL $0x08, BX
// emitRepeat
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm10B
repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm10B
repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm10B
repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm10B
long_offset_short_repeat_as_copy_encodeBlockAsm10B:
MOVB $0xee, (AX)
- MOVW DI, 1(AX)
- LEAL -60(SI), SI
+ MOVW SI, 1(AX)
+ LEAL -60(BX), BX
ADDQ $0x03, AX
// emitRepeat
- MOVL SI, R8
- LEAL -4(SI), SI
- CMPL R8, $0x08
+ MOVL BX, DI
+ LEAL -4(BX), BX
+ CMPL DI, $0x08
JLE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
- CMPL R8, $0x0c
+ CMPL DI, $0x0c
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm10B
repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm10B
repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm10B
repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm10B
- JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10B
two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B:
- CMPL SI, $0x0c
+ MOVL BX, DI
+ SHLL $0x02, DI
+ CMPL BX, $0x0c
JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B
- MOVB $0x01, BL
- LEAL -16(BX)(SI*4), SI
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ LEAL -15(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm10B
emit_copy_three_repeat_as_copy_encodeBlockAsm10B:
- MOVB $0x02, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVW DI, 1(AX)
+ LEAL -2(DI), DI
+ MOVB DI, (AX)
+ MOVW SI, 1(AX)
ADDQ $0x03, AX
repeat_end_emit_encodeBlockAsm10B:
@@ -4171,16 +4142,16 @@ repeat_end_emit_encodeBlockAsm10B:
JMP search_loop_encodeBlockAsm10B
no_repeat_found_encodeBlockAsm10B:
- CMPL (DX)(SI*1), DI
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeBlockAsm10B
- SHRQ $0x08, DI
- MOVL 24(SP)(R10*4), SI
- LEAL 2(CX), R9
- CMPL (DX)(R8*1), DI
+ SHRQ $0x08, SI
+ MOVL 24(SP)(R9*4), BX
+ LEAL 2(CX), R8
+ CMPL (DX)(DI*1), SI
JEQ candidate2_match_encodeBlockAsm10B
- MOVL R9, 24(SP)(R10*4)
- SHRQ $0x08, DI
- CMPL (DX)(SI*1), DI
+ MOVL R8, 24(SP)(R9*4)
+ SHRQ $0x08, SI
+ CMPL (DX)(BX*1), SI
JEQ candidate3_match_encodeBlockAsm10B
MOVL 20(SP), CX
JMP search_loop_encodeBlockAsm10B
@@ -4190,391 +4161,389 @@ candidate3_match_encodeBlockAsm10B:
JMP candidate_match_encodeBlockAsm10B
candidate2_match_encodeBlockAsm10B:
- MOVL R9, 24(SP)(R10*4)
+ MOVL R8, 24(SP)(R9*4)
INCL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeBlockAsm10B:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeBlockAsm10B
match_extend_back_loop_encodeBlockAsm10B:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeBlockAsm10B
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeBlockAsm10B
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeBlockAsm10B
JMP match_extend_back_loop_encodeBlockAsm10B
match_extend_back_end_encodeBlockAsm10B:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeBlockAsm10B
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeBlockAsm10B:
- MOVL CX, DI
- MOVL 12(SP), R8
- CMPL R8, DI
+ MOVL CX, SI
+ MOVL 12(SP), DI
+ CMPL DI, SI
JEQ emit_literal_done_match_emit_encodeBlockAsm10B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(R8*1), DI
- SUBL R8, R9
- LEAL -1(R9), R8
- CMPL R8, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(DI*1), SI
+ SUBL DI, R8
+ LEAL -1(R8), DI
+ CMPL DI, $0x3c
JLT one_byte_match_emit_encodeBlockAsm10B
- CMPL R8, $0x00000100
+ CMPL DI, $0x00000100
JLT two_bytes_match_emit_encodeBlockAsm10B
MOVB $0xf4, (AX)
- MOVW R8, 1(AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeBlockAsm10B
two_bytes_match_emit_encodeBlockAsm10B:
MOVB $0xf0, (AX)
- MOVB R8, 1(AX)
+ MOVB DI, 1(AX)
ADDQ $0x02, AX
- CMPL R8, $0x40
+ CMPL DI, $0x40
JL memmove_match_emit_encodeBlockAsm10B
JMP memmove_long_match_emit_encodeBlockAsm10B
one_byte_match_emit_encodeBlockAsm10B:
- SHLB $0x02, R8
- MOVB R8, (AX)
+ SHLB $0x02, DI
+ MOVB DI, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeBlockAsm10B:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64
emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8:
- MOVQ (DI), R10
- MOVQ R10, (AX)
+ MOVQ (SI), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_match_emit_encodeBlockAsm10B
emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16:
- MOVQ (DI), R10
- MOVQ -8(DI)(R9*1), DI
- MOVQ R10, (AX)
- MOVQ DI, -8(AX)(R9*1)
+ MOVQ (SI), R9
+ MOVQ -8(SI)(R8*1), SI
+ MOVQ R9, (AX)
+ MOVQ SI, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBlockAsm10B
emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32:
- MOVOU (DI), X0
- MOVOU -16(DI)(R9*1), X1
+ MOVOU (SI), X0
+ MOVOU -16(SI)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBlockAsm10B
emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64:
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeBlockAsm10B:
- MOVQ R8, AX
+ MOVQ DI, AX
JMP emit_literal_done_match_emit_encodeBlockAsm10B
memmove_long_match_emit_encodeBlockAsm10B:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveLong
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
- MOVQ R9, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
+ MOVQ R8, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
- LEAQ -32(DI)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(SI)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
- MOVOU -32(DI)(R12*1), X4
- MOVOU -16(DI)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R9, R12
+ MOVOU -32(SI)(R11*1), X4
+ MOVOU -16(SI)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ R8, R11
JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ R8, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ DI, AX
emit_literal_done_match_emit_encodeBlockAsm10B:
match_nolit_loop_encodeBlockAsm10B:
- MOVL CX, DI
- SUBL SI, DI
- MOVL DI, 16(SP)
+ MOVL CX, SI
+ SUBL BX, SI
+ MOVL SI, 16(SP)
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), DI
- SUBL CX, DI
- LEAQ (DX)(CX*1), R8
- LEAQ (DX)(SI*1), SI
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), SI
+ SUBL CX, SI
+ LEAQ (DX)(CX*1), DI
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R10, R10
- CMPL DI, $0x08
+ XORL R9, R9
+ CMPL SI, $0x08
JL matchlen_match4_match_nolit_encodeBlockAsm10B
matchlen_loopback_match_nolit_encodeBlockAsm10B:
- MOVQ (R8)(R10*1), R9
- XORQ (SI)(R10*1), R9
- TESTQ R9, R9
+ MOVQ (DI)(R9*1), R8
+ XORQ (BX)(R9*1), R8
+ TESTQ R8, R8
JZ matchlen_loop_match_nolit_encodeBlockAsm10B
#ifdef GOAMD64_v3
- TZCNTQ R9, R9
+ TZCNTQ R8, R8
#else
- BSFQ R9, R9
+ BSFQ R8, R8
#endif
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
+ SARQ $0x03, R8
+ LEAL (R9)(R8*1), R9
JMP match_nolit_end_encodeBlockAsm10B
matchlen_loop_match_nolit_encodeBlockAsm10B:
- LEAL -8(DI), DI
- LEAL 8(R10), R10
- CMPL DI, $0x08
+ LEAL -8(SI), SI
+ LEAL 8(R9), R9
+ CMPL SI, $0x08
JGE matchlen_loopback_match_nolit_encodeBlockAsm10B
JZ match_nolit_end_encodeBlockAsm10B
matchlen_match4_match_nolit_encodeBlockAsm10B:
- CMPL DI, $0x04
+ CMPL SI, $0x04
JL matchlen_match2_match_nolit_encodeBlockAsm10B
- MOVL (R8)(R10*1), R9
- CMPL (SI)(R10*1), R9
+ MOVL (DI)(R9*1), R8
+ CMPL (BX)(R9*1), R8
JNE matchlen_match2_match_nolit_encodeBlockAsm10B
- SUBL $0x04, DI
- LEAL 4(R10), R10
+ SUBL $0x04, SI
+ LEAL 4(R9), R9
matchlen_match2_match_nolit_encodeBlockAsm10B:
- CMPL DI, $0x02
+ CMPL SI, $0x02
JL matchlen_match1_match_nolit_encodeBlockAsm10B
- MOVW (R8)(R10*1), R9
- CMPW (SI)(R10*1), R9
+ MOVW (DI)(R9*1), R8
+ CMPW (BX)(R9*1), R8
JNE matchlen_match1_match_nolit_encodeBlockAsm10B
- SUBL $0x02, DI
- LEAL 2(R10), R10
+ SUBL $0x02, SI
+ LEAL 2(R9), R9
matchlen_match1_match_nolit_encodeBlockAsm10B:
- CMPL DI, $0x01
+ CMPL SI, $0x01
JL match_nolit_end_encodeBlockAsm10B
- MOVB (R8)(R10*1), R9
- CMPB (SI)(R10*1), R9
+ MOVB (DI)(R9*1), R8
+ CMPB (BX)(R9*1), R8
JNE match_nolit_end_encodeBlockAsm10B
- LEAL 1(R10), R10
+ LEAL 1(R9), R9
match_nolit_end_encodeBlockAsm10B:
- ADDL R10, CX
- MOVL 16(SP), SI
- ADDL $0x04, R10
+ ADDL R9, CX
+ MOVL 16(SP), BX
+ ADDL $0x04, R9
MOVL CX, 12(SP)
// emitCopy
-two_byte_offset_match_nolit_encodeBlockAsm10B:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE two_byte_offset_short_match_nolit_encodeBlockAsm10B
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JAE long_offset_short_match_nolit_encodeBlockAsm10B
- MOVL $0x00000001, DI
- LEAL 16(DI), DI
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, DI
- MOVB DI, (AX)
+ MOVL $0x00000001, SI
+ LEAL 16(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
- SUBL $0x08, R10
+ SUBL $0x08, R9
// emitRepeat
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
- MOVL R10, DI
- LEAL -4(R10), R10
- CMPL DI, $0x08
+ MOVL R9, SI
+ LEAL -4(R9), R9
+ CMPL SI, $0x08
JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
- CMPL R10, $0x00000104
+ CMPL R9, $0x00000104
JLT repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
- LEAL -256(R10), R10
+ LEAL -256(R9), R9
MOVW $0x0019, (AX)
- MOVW R10, 2(AX)
+ MOVW R9, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm10B
repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
MOVW $0x0015, (AX)
- MOVB R10, 2(AX)
+ MOVB R9, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm10B
repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
- SHLL $0x02, R10
- ORL $0x01, R10
- MOVW R10, (AX)
+ SHLL $0x02, R9
+ ORL $0x01, R9
+ MOVW R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm10B
repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
- XORQ DI, DI
- LEAL 1(DI)(R10*4), R10
- MOVB SI, 1(AX)
- SARL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ XORQ SI, SI
+ LEAL 1(SI)(R9*4), R9
+ MOVB BL, 1(AX)
+ SARL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, R9
+ MOVB R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm10B
long_offset_short_match_nolit_encodeBlockAsm10B:
MOVB $0xee, (AX)
- MOVW SI, 1(AX)
- LEAL -60(R10), R10
+ MOVW BX, 1(AX)
+ LEAL -60(R9), R9
ADDQ $0x03, AX
// emitRepeat
- MOVL R10, DI
- LEAL -4(R10), R10
- CMPL DI, $0x08
+ MOVL R9, SI
+ LEAL -4(R9), R9
+ CMPL SI, $0x08
JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
- CMPL R10, $0x00000104
+ CMPL R9, $0x00000104
JLT repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short
- LEAL -256(R10), R10
+ LEAL -256(R9), R9
MOVW $0x0019, (AX)
- MOVW R10, 2(AX)
+ MOVW R9, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm10B
repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short:
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
MOVW $0x0015, (AX)
- MOVB R10, 2(AX)
+ MOVB R9, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm10B
repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short:
- SHLL $0x02, R10
- ORL $0x01, R10
- MOVW R10, (AX)
+ SHLL $0x02, R9
+ ORL $0x01, R9
+ MOVW R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm10B
repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
- XORQ DI, DI
- LEAL 1(DI)(R10*4), R10
- MOVB SI, 1(AX)
- SARL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ XORQ SI, SI
+ LEAL 1(SI)(R9*4), R9
+ MOVB BL, 1(AX)
+ SARL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, R9
+ MOVB R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm10B
- JMP two_byte_offset_match_nolit_encodeBlockAsm10B
two_byte_offset_short_match_nolit_encodeBlockAsm10B:
- CMPL R10, $0x0c
+ MOVL R9, SI
+ SHLL $0x02, SI
+ CMPL R9, $0x0c
JGE emit_copy_three_match_nolit_encodeBlockAsm10B
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JGE emit_copy_three_match_nolit_encodeBlockAsm10B
- MOVB $0x01, BL
- LEAL -16(BX)(R10*4), R10
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ LEAL -15(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm10B
emit_copy_three_match_nolit_encodeBlockAsm10B:
- MOVB $0x02, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVW SI, 1(AX)
+ LEAL -2(SI), SI
+ MOVB SI, (AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeBlockAsm10B:
CMPL CX, 8(SP)
JGE emit_remainder_encodeBlockAsm10B
- MOVQ -2(DX)(CX*1), DI
+ MOVQ -2(DX)(CX*1), SI
CMPQ AX, (SP)
JL match_nolit_dst_ok_encodeBlockAsm10B
MOVQ $0x00000000, ret+48(FP)
RET
match_nolit_dst_ok_encodeBlockAsm10B:
- MOVQ $0x9e3779b1, R9
- MOVQ DI, R8
- SHRQ $0x10, DI
- MOVQ DI, SI
- SHLQ $0x20, R8
- IMULQ R9, R8
- SHRQ $0x36, R8
- SHLQ $0x20, SI
- IMULQ R9, SI
- SHRQ $0x36, SI
- LEAL -2(CX), R9
- LEAQ 24(SP)(SI*4), R10
- MOVL (R10), SI
- MOVL R9, 24(SP)(R8*4)
- MOVL CX, (R10)
- CMPL (DX)(SI*1), DI
+ MOVQ $0x9e3779b1, R8
+ MOVQ SI, DI
+ SHRQ $0x10, SI
+ MOVQ SI, BX
+ SHLQ $0x20, DI
+ IMULQ R8, DI
+ SHRQ $0x36, DI
+ SHLQ $0x20, BX
+ IMULQ R8, BX
+ SHRQ $0x36, BX
+ LEAL -2(CX), R8
+ LEAQ 24(SP)(BX*4), R9
+ MOVL (R9), BX
+ MOVL R8, 24(SP)(DI*4)
+ MOVL CX, (R9)
+ CMPL (DX)(BX*1), SI
JEQ match_nolit_loop_encodeBlockAsm10B
INCL CX
JMP search_loop_encodeBlockAsm10B
@@ -4759,8 +4728,8 @@ zero_loop_encodeBlockAsm8B:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -4770,414 +4739,412 @@ zero_loop_encodeBlockAsm8B:
MOVQ src_base+24(FP), DX
search_loop_encodeBlockAsm8B:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x04, SI
- LEAL 4(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x04, BX
+ LEAL 4(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeBlockAsm8B
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x9e3779b1, R9
- MOVQ DI, R10
- MOVQ DI, R11
- SHRQ $0x08, R11
- SHLQ $0x20, R10
- IMULQ R9, R10
- SHRQ $0x38, R10
- SHLQ $0x20, R11
- IMULQ R9, R11
- SHRQ $0x38, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 24(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- LEAL 1(CX), R10
- MOVL R10, 24(SP)(R11*4)
- MOVQ DI, R10
- SHRQ $0x10, R10
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x9e3779b1, R8
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHRQ $0x08, R10
+ SHLQ $0x20, R9
+ IMULQ R8, R9
+ SHRQ $0x38, R9
SHLQ $0x20, R10
- IMULQ R9, R10
+ IMULQ R8, R10
SHRQ $0x38, R10
- MOVL CX, R9
- SUBL 16(SP), R9
- MOVL 1(DX)(R9*1), R11
- MOVQ DI, R9
- SHRQ $0x08, R9
- CMPL R9, R11
+ MOVL 24(SP)(R9*4), BX
+ MOVL 24(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ LEAL 1(CX), R9
+ MOVL R9, 24(SP)(R10*4)
+ MOVQ SI, R9
+ SHRQ $0x10, R9
+ SHLQ $0x20, R9
+ IMULQ R8, R9
+ SHRQ $0x38, R9
+ MOVL CX, R8
+ SUBL 16(SP), R8
+ MOVL 1(DX)(R8*1), R10
+ MOVQ SI, R8
+ SHRQ $0x08, R8
+ CMPL R8, R10
JNE no_repeat_found_encodeBlockAsm8B
- LEAL 1(CX), DI
- MOVL 12(SP), R8
- MOVL DI, SI
- SUBL 16(SP), SI
+ LEAL 1(CX), SI
+ MOVL 12(SP), DI
+ MOVL SI, BX
+ SUBL 16(SP), BX
JZ repeat_extend_back_end_encodeBlockAsm8B
repeat_extend_back_loop_encodeBlockAsm8B:
- CMPL DI, R8
+ CMPL SI, DI
JLE repeat_extend_back_end_encodeBlockAsm8B
- MOVB -1(DX)(SI*1), BL
- MOVB -1(DX)(DI*1), R9
- CMPB BL, R9
+ MOVB -1(DX)(BX*1), R8
+ MOVB -1(DX)(SI*1), R9
+ CMPB R8, R9
JNE repeat_extend_back_end_encodeBlockAsm8B
- LEAL -1(DI), DI
- DECL SI
+ LEAL -1(SI), SI
+ DECL BX
JNZ repeat_extend_back_loop_encodeBlockAsm8B
repeat_extend_back_end_encodeBlockAsm8B:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_repeat_emit_encodeBlockAsm8B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_repeat_emit_encodeBlockAsm8B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_repeat_emit_encodeBlockAsm8B
two_bytes_repeat_emit_encodeBlockAsm8B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_repeat_emit_encodeBlockAsm8B
JMP memmove_long_repeat_emit_encodeBlockAsm8B
one_byte_repeat_emit_encodeBlockAsm8B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_repeat_emit_encodeBlockAsm8B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32
JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64
emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8:
- MOVQ (R10), R11
- MOVQ R11, (AX)
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_repeat_emit_encodeBlockAsm8B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_repeat_emit_encodeBlockAsm8B
memmove_long_repeat_emit_encodeBlockAsm8B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R12
- SHRQ $0x05, R12
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R13
- SUBQ R11, R13
- DECQ R12
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R11
+ SHRQ $0x05, R11
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R12
+ SUBQ R10, R12
+ DECQ R11
JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
- LEAQ -32(R10)(R13*1), R11
- LEAQ -32(AX)(R13*1), R14
+ LEAQ -32(R9)(R12*1), R10
+ LEAQ -32(AX)(R12*1), R13
emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R14)
- MOVOA X5, 16(R14)
- ADDQ $0x20, R14
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
ADDQ $0x20, R13
- DECQ R12
+ ADDQ $0x20, R10
+ ADDQ $0x20, R12
+ DECQ R11
JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back
emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
- MOVOU -32(R10)(R13*1), X4
- MOVOU -16(R10)(R13*1), X5
- MOVOA X4, -32(AX)(R13*1)
- MOVOA X5, -16(AX)(R13*1)
- ADDQ $0x20, R13
- CMPQ R9, R13
+ MOVOU -32(R9)(R12*1), X4
+ MOVOU -16(R9)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ R8, R12
JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_repeat_emit_encodeBlockAsm8B:
ADDL $0x05, CX
- MOVL CX, SI
- SUBL 16(SP), SI
- MOVQ src_len+32(FP), R9
- SUBL CX, R9
- LEAQ (DX)(CX*1), R10
- LEAQ (DX)(SI*1), SI
+ MOVL CX, BX
+ SUBL 16(SP), BX
+ MOVQ src_len+32(FP), R8
+ SUBL CX, R8
+ LEAQ (DX)(CX*1), R9
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R12, R12
- CMPL R9, $0x08
+ XORL R11, R11
+ CMPL R8, $0x08
JL matchlen_match4_repeat_extend_encodeBlockAsm8B
matchlen_loopback_repeat_extend_encodeBlockAsm8B:
- MOVQ (R10)(R12*1), R11
- XORQ (SI)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R9)(R11*1), R10
+ XORQ (BX)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_repeat_extend_encodeBlockAsm8B
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP repeat_extend_forward_end_encodeBlockAsm8B
matchlen_loop_repeat_extend_encodeBlockAsm8B:
- LEAL -8(R9), R9
- LEAL 8(R12), R12
- CMPL R9, $0x08
+ LEAL -8(R8), R8
+ LEAL 8(R11), R11
+ CMPL R8, $0x08
JGE matchlen_loopback_repeat_extend_encodeBlockAsm8B
JZ repeat_extend_forward_end_encodeBlockAsm8B
matchlen_match4_repeat_extend_encodeBlockAsm8B:
- CMPL R9, $0x04
+ CMPL R8, $0x04
JL matchlen_match2_repeat_extend_encodeBlockAsm8B
- MOVL (R10)(R12*1), R11
- CMPL (SI)(R12*1), R11
+ MOVL (R9)(R11*1), R10
+ CMPL (BX)(R11*1), R10
JNE matchlen_match2_repeat_extend_encodeBlockAsm8B
- SUBL $0x04, R9
- LEAL 4(R12), R12
+ SUBL $0x04, R8
+ LEAL 4(R11), R11
matchlen_match2_repeat_extend_encodeBlockAsm8B:
- CMPL R9, $0x02
+ CMPL R8, $0x02
JL matchlen_match1_repeat_extend_encodeBlockAsm8B
- MOVW (R10)(R12*1), R11
- CMPW (SI)(R12*1), R11
+ MOVW (R9)(R11*1), R10
+ CMPW (BX)(R11*1), R10
JNE matchlen_match1_repeat_extend_encodeBlockAsm8B
- SUBL $0x02, R9
- LEAL 2(R12), R12
+ SUBL $0x02, R8
+ LEAL 2(R11), R11
matchlen_match1_repeat_extend_encodeBlockAsm8B:
- CMPL R9, $0x01
+ CMPL R8, $0x01
JL repeat_extend_forward_end_encodeBlockAsm8B
- MOVB (R10)(R12*1), R11
- CMPB (SI)(R12*1), R11
+ MOVB (R9)(R11*1), R10
+ CMPB (BX)(R11*1), R10
JNE repeat_extend_forward_end_encodeBlockAsm8B
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
repeat_extend_forward_end_encodeBlockAsm8B:
- ADDL R12, CX
- MOVL CX, SI
- SUBL DI, SI
- MOVL 16(SP), DI
- TESTL R8, R8
+ ADDL R11, CX
+ MOVL CX, BX
+ SUBL SI, BX
+ MOVL 16(SP), SI
+ TESTL DI, DI
JZ repeat_as_copy_encodeBlockAsm8B
// emitRepeat
- MOVL SI, DI
- LEAL -4(SI), SI
- CMPL DI, $0x08
+ MOVL BX, SI
+ LEAL -4(BX), BX
+ CMPL SI, $0x08
JLE repeat_two_match_repeat_encodeBlockAsm8B
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B
cant_repeat_two_offset_match_repeat_encodeBlockAsm8B:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_match_repeat_encodeBlockAsm8B
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm8B
repeat_three_match_repeat_encodeBlockAsm8B:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm8B
repeat_two_match_repeat_encodeBlockAsm8B:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm8B
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm8B
repeat_as_copy_encodeBlockAsm8B:
// emitCopy
-two_byte_offset_repeat_as_copy_encodeBlockAsm8B:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JAE long_offset_short_repeat_as_copy_encodeBlockAsm8B
- MOVL $0x00000001, R8
- LEAL 16(R8), R8
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, R8
- MOVB R8, (AX)
+ MOVL $0x00000001, DI
+ LEAL 16(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
- SUBL $0x08, SI
+ SUBL $0x08, BX
// emitRepeat
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
- MOVL SI, DI
- LEAL -4(SI), SI
- CMPL DI, $0x08
+ MOVL BX, SI
+ LEAL -4(BX), BX
+ CMPL SI, $0x08
JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm8B
repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm8B
repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm8B
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm8B
long_offset_short_repeat_as_copy_encodeBlockAsm8B:
MOVB $0xee, (AX)
- MOVW DI, 1(AX)
- LEAL -60(SI), SI
+ MOVW SI, 1(AX)
+ LEAL -60(BX), BX
ADDQ $0x03, AX
// emitRepeat
- MOVL SI, DI
- LEAL -4(SI), SI
- CMPL DI, $0x08
+ MOVL BX, SI
+ LEAL -4(BX), BX
+ CMPL SI, $0x08
JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
- CMPL DI, $0x0c
+ CMPL SI, $0x0c
JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
- CMPL SI, $0x00000104
+ CMPL BX, $0x00000104
JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
- LEAL -256(SI), SI
+ LEAL -256(BX), BX
MOVW $0x0019, (AX)
- MOVW SI, 2(AX)
+ MOVW BX, 2(AX)
ADDQ $0x04, AX
JMP repeat_end_emit_encodeBlockAsm8B
repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
- LEAL -4(SI), SI
+ LEAL -4(BX), BX
MOVW $0x0015, (AX)
- MOVB SI, 2(AX)
+ MOVB BL, 2(AX)
ADDQ $0x03, AX
JMP repeat_end_emit_encodeBlockAsm8B
repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
- SHLL $0x02, SI
- ORL $0x01, SI
- MOVW SI, (AX)
+ SHLL $0x02, BX
+ ORL $0x01, BX
+ MOVW BX, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm8B
- XORQ R8, R8
- LEAL 1(R8)(SI*4), SI
- MOVB DI, 1(AX)
- SARL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ XORQ DI, DI
+ LEAL 1(DI)(BX*4), BX
+ MOVB SI, 1(AX)
+ SARL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm8B
- JMP two_byte_offset_repeat_as_copy_encodeBlockAsm8B
two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B:
- CMPL SI, $0x0c
+ MOVL BX, DI
+ SHLL $0x02, DI
+ CMPL BX, $0x0c
JGE emit_copy_three_repeat_as_copy_encodeBlockAsm8B
- MOVB $0x01, BL
- LEAL -16(BX)(SI*4), SI
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ LEAL -15(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeBlockAsm8B
emit_copy_three_repeat_as_copy_encodeBlockAsm8B:
- MOVB $0x02, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVW DI, 1(AX)
+ LEAL -2(DI), DI
+ MOVB DI, (AX)
+ MOVW SI, 1(AX)
ADDQ $0x03, AX
repeat_end_emit_encodeBlockAsm8B:
@@ -5185,16 +5152,16 @@ repeat_end_emit_encodeBlockAsm8B:
JMP search_loop_encodeBlockAsm8B
no_repeat_found_encodeBlockAsm8B:
- CMPL (DX)(SI*1), DI
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeBlockAsm8B
- SHRQ $0x08, DI
- MOVL 24(SP)(R10*4), SI
- LEAL 2(CX), R9
- CMPL (DX)(R8*1), DI
+ SHRQ $0x08, SI
+ MOVL 24(SP)(R9*4), BX
+ LEAL 2(CX), R8
+ CMPL (DX)(DI*1), SI
JEQ candidate2_match_encodeBlockAsm8B
- MOVL R9, 24(SP)(R10*4)
- SHRQ $0x08, DI
- CMPL (DX)(SI*1), DI
+ MOVL R8, 24(SP)(R9*4)
+ SHRQ $0x08, SI
+ CMPL (DX)(BX*1), SI
JEQ candidate3_match_encodeBlockAsm8B
MOVL 20(SP), CX
JMP search_loop_encodeBlockAsm8B
@@ -5204,381 +5171,379 @@ candidate3_match_encodeBlockAsm8B:
JMP candidate_match_encodeBlockAsm8B
candidate2_match_encodeBlockAsm8B:
- MOVL R9, 24(SP)(R10*4)
+ MOVL R8, 24(SP)(R9*4)
INCL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeBlockAsm8B:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeBlockAsm8B
match_extend_back_loop_encodeBlockAsm8B:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeBlockAsm8B
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeBlockAsm8B
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeBlockAsm8B
JMP match_extend_back_loop_encodeBlockAsm8B
match_extend_back_end_encodeBlockAsm8B:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeBlockAsm8B
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeBlockAsm8B:
- MOVL CX, DI
- MOVL 12(SP), R8
- CMPL R8, DI
+ MOVL CX, SI
+ MOVL 12(SP), DI
+ CMPL DI, SI
JEQ emit_literal_done_match_emit_encodeBlockAsm8B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(R8*1), DI
- SUBL R8, R9
- LEAL -1(R9), R8
- CMPL R8, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(DI*1), SI
+ SUBL DI, R8
+ LEAL -1(R8), DI
+ CMPL DI, $0x3c
JLT one_byte_match_emit_encodeBlockAsm8B
- CMPL R8, $0x00000100
+ CMPL DI, $0x00000100
JLT two_bytes_match_emit_encodeBlockAsm8B
MOVB $0xf4, (AX)
- MOVW R8, 1(AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeBlockAsm8B
two_bytes_match_emit_encodeBlockAsm8B:
MOVB $0xf0, (AX)
- MOVB R8, 1(AX)
+ MOVB DI, 1(AX)
ADDQ $0x02, AX
- CMPL R8, $0x40
+ CMPL DI, $0x40
JL memmove_match_emit_encodeBlockAsm8B
JMP memmove_long_match_emit_encodeBlockAsm8B
one_byte_match_emit_encodeBlockAsm8B:
- SHLB $0x02, R8
- MOVB R8, (AX)
+ SHLB $0x02, DI
+ MOVB DI, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeBlockAsm8B:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64
emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8:
- MOVQ (DI), R10
- MOVQ R10, (AX)
+ MOVQ (SI), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_match_emit_encodeBlockAsm8B
emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16:
- MOVQ (DI), R10
- MOVQ -8(DI)(R9*1), DI
- MOVQ R10, (AX)
- MOVQ DI, -8(AX)(R9*1)
+ MOVQ (SI), R9
+ MOVQ -8(SI)(R8*1), SI
+ MOVQ R9, (AX)
+ MOVQ SI, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBlockAsm8B
emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32:
- MOVOU (DI), X0
- MOVOU -16(DI)(R9*1), X1
+ MOVOU (SI), X0
+ MOVOU -16(SI)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBlockAsm8B
emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64:
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeBlockAsm8B:
- MOVQ R8, AX
+ MOVQ DI, AX
JMP emit_literal_done_match_emit_encodeBlockAsm8B
memmove_long_match_emit_encodeBlockAsm8B:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveLong
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
- MOVQ R9, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
+ MOVQ R8, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
- LEAQ -32(DI)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(SI)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
- MOVOU -32(DI)(R12*1), X4
- MOVOU -16(DI)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R9, R12
+ MOVOU -32(SI)(R11*1), X4
+ MOVOU -16(SI)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ R8, R11
JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ R8, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ DI, AX
emit_literal_done_match_emit_encodeBlockAsm8B:
match_nolit_loop_encodeBlockAsm8B:
- MOVL CX, DI
- SUBL SI, DI
- MOVL DI, 16(SP)
+ MOVL CX, SI
+ SUBL BX, SI
+ MOVL SI, 16(SP)
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), DI
- SUBL CX, DI
- LEAQ (DX)(CX*1), R8
- LEAQ (DX)(SI*1), SI
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), SI
+ SUBL CX, SI
+ LEAQ (DX)(CX*1), DI
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R10, R10
- CMPL DI, $0x08
+ XORL R9, R9
+ CMPL SI, $0x08
JL matchlen_match4_match_nolit_encodeBlockAsm8B
matchlen_loopback_match_nolit_encodeBlockAsm8B:
- MOVQ (R8)(R10*1), R9
- XORQ (SI)(R10*1), R9
- TESTQ R9, R9
+ MOVQ (DI)(R9*1), R8
+ XORQ (BX)(R9*1), R8
+ TESTQ R8, R8
JZ matchlen_loop_match_nolit_encodeBlockAsm8B
#ifdef GOAMD64_v3
- TZCNTQ R9, R9
+ TZCNTQ R8, R8
#else
- BSFQ R9, R9
+ BSFQ R8, R8
#endif
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
+ SARQ $0x03, R8
+ LEAL (R9)(R8*1), R9
JMP match_nolit_end_encodeBlockAsm8B
matchlen_loop_match_nolit_encodeBlockAsm8B:
- LEAL -8(DI), DI
- LEAL 8(R10), R10
- CMPL DI, $0x08
+ LEAL -8(SI), SI
+ LEAL 8(R9), R9
+ CMPL SI, $0x08
JGE matchlen_loopback_match_nolit_encodeBlockAsm8B
JZ match_nolit_end_encodeBlockAsm8B
matchlen_match4_match_nolit_encodeBlockAsm8B:
- CMPL DI, $0x04
+ CMPL SI, $0x04
JL matchlen_match2_match_nolit_encodeBlockAsm8B
- MOVL (R8)(R10*1), R9
- CMPL (SI)(R10*1), R9
+ MOVL (DI)(R9*1), R8
+ CMPL (BX)(R9*1), R8
JNE matchlen_match2_match_nolit_encodeBlockAsm8B
- SUBL $0x04, DI
- LEAL 4(R10), R10
+ SUBL $0x04, SI
+ LEAL 4(R9), R9
matchlen_match2_match_nolit_encodeBlockAsm8B:
- CMPL DI, $0x02
+ CMPL SI, $0x02
JL matchlen_match1_match_nolit_encodeBlockAsm8B
- MOVW (R8)(R10*1), R9
- CMPW (SI)(R10*1), R9
+ MOVW (DI)(R9*1), R8
+ CMPW (BX)(R9*1), R8
JNE matchlen_match1_match_nolit_encodeBlockAsm8B
- SUBL $0x02, DI
- LEAL 2(R10), R10
+ SUBL $0x02, SI
+ LEAL 2(R9), R9
matchlen_match1_match_nolit_encodeBlockAsm8B:
- CMPL DI, $0x01
+ CMPL SI, $0x01
JL match_nolit_end_encodeBlockAsm8B
- MOVB (R8)(R10*1), R9
- CMPB (SI)(R10*1), R9
+ MOVB (DI)(R9*1), R8
+ CMPB (BX)(R9*1), R8
JNE match_nolit_end_encodeBlockAsm8B
- LEAL 1(R10), R10
+ LEAL 1(R9), R9
match_nolit_end_encodeBlockAsm8B:
- ADDL R10, CX
- MOVL 16(SP), SI
- ADDL $0x04, R10
+ ADDL R9, CX
+ MOVL 16(SP), BX
+ ADDL $0x04, R9
MOVL CX, 12(SP)
// emitCopy
-two_byte_offset_match_nolit_encodeBlockAsm8B:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE two_byte_offset_short_match_nolit_encodeBlockAsm8B
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JAE long_offset_short_match_nolit_encodeBlockAsm8B
- MOVL $0x00000001, DI
- LEAL 16(DI), DI
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, DI
- MOVB DI, (AX)
+ MOVL $0x00000001, SI
+ LEAL 16(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
- SUBL $0x08, R10
+ SUBL $0x08, R9
// emitRepeat
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
- MOVL R10, SI
- LEAL -4(R10), R10
- CMPL SI, $0x08
+ MOVL R9, BX
+ LEAL -4(R9), R9
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
- CMPL R10, $0x00000104
+ CMPL R9, $0x00000104
JLT repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
- LEAL -256(R10), R10
+ LEAL -256(R9), R9
MOVW $0x0019, (AX)
- MOVW R10, 2(AX)
+ MOVW R9, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm8B
repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
MOVW $0x0015, (AX)
- MOVB R10, 2(AX)
+ MOVB R9, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm8B
repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
- SHLL $0x02, R10
- ORL $0x01, R10
- MOVW R10, (AX)
+ SHLL $0x02, R9
+ ORL $0x01, R9
+ MOVW R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm8B
- XORQ DI, DI
- LEAL 1(DI)(R10*4), R10
- MOVB SI, 1(AX)
- SARL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ XORQ SI, SI
+ LEAL 1(SI)(R9*4), R9
+ MOVB BL, 1(AX)
+ SARL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, R9
+ MOVB R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm8B
long_offset_short_match_nolit_encodeBlockAsm8B:
MOVB $0xee, (AX)
- MOVW SI, 1(AX)
- LEAL -60(R10), R10
+ MOVW BX, 1(AX)
+ LEAL -60(R9), R9
ADDQ $0x03, AX
// emitRepeat
- MOVL R10, SI
- LEAL -4(R10), R10
- CMPL SI, $0x08
+ MOVL R9, BX
+ LEAL -4(R9), R9
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short
cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short:
- CMPL R10, $0x00000104
+ CMPL R9, $0x00000104
JLT repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short
- LEAL -256(R10), R10
+ LEAL -256(R9), R9
MOVW $0x0019, (AX)
- MOVW R10, 2(AX)
+ MOVW R9, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm8B
repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short:
- LEAL -4(R10), R10
+ LEAL -4(R9), R9
MOVW $0x0015, (AX)
- MOVB R10, 2(AX)
+ MOVB R9, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm8B
repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short:
- SHLL $0x02, R10
- ORL $0x01, R10
- MOVW R10, (AX)
+ SHLL $0x02, R9
+ ORL $0x01, R9
+ MOVW R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm8B
- XORQ DI, DI
- LEAL 1(DI)(R10*4), R10
- MOVB SI, 1(AX)
- SARL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ XORQ SI, SI
+ LEAL 1(SI)(R9*4), R9
+ MOVB BL, 1(AX)
+ SARL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, R9
+ MOVB R9, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm8B
- JMP two_byte_offset_match_nolit_encodeBlockAsm8B
two_byte_offset_short_match_nolit_encodeBlockAsm8B:
- CMPL R10, $0x0c
+ MOVL R9, SI
+ SHLL $0x02, SI
+ CMPL R9, $0x0c
JGE emit_copy_three_match_nolit_encodeBlockAsm8B
- MOVB $0x01, BL
- LEAL -16(BX)(R10*4), R10
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ LEAL -15(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBlockAsm8B
emit_copy_three_match_nolit_encodeBlockAsm8B:
- MOVB $0x02, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVW SI, 1(AX)
+ LEAL -2(SI), SI
+ MOVB SI, (AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeBlockAsm8B:
CMPL CX, 8(SP)
JGE emit_remainder_encodeBlockAsm8B
- MOVQ -2(DX)(CX*1), DI
+ MOVQ -2(DX)(CX*1), SI
CMPQ AX, (SP)
JL match_nolit_dst_ok_encodeBlockAsm8B
MOVQ $0x00000000, ret+48(FP)
RET
match_nolit_dst_ok_encodeBlockAsm8B:
- MOVQ $0x9e3779b1, R9
- MOVQ DI, R8
- SHRQ $0x10, DI
- MOVQ DI, SI
- SHLQ $0x20, R8
- IMULQ R9, R8
- SHRQ $0x38, R8
- SHLQ $0x20, SI
- IMULQ R9, SI
- SHRQ $0x38, SI
- LEAL -2(CX), R9
- LEAQ 24(SP)(SI*4), R10
- MOVL (R10), SI
- MOVL R9, 24(SP)(R8*4)
- MOVL CX, (R10)
- CMPL (DX)(SI*1), DI
+ MOVQ $0x9e3779b1, R8
+ MOVQ SI, DI
+ SHRQ $0x10, SI
+ MOVQ SI, BX
+ SHLQ $0x20, DI
+ IMULQ R8, DI
+ SHRQ $0x38, DI
+ SHLQ $0x20, BX
+ IMULQ R8, BX
+ SHRQ $0x38, BX
+ LEAL -2(CX), R8
+ LEAQ 24(SP)(BX*4), R9
+ MOVL (R9), BX
+ MOVL R8, 24(SP)(DI*4)
+ MOVL CX, (R9)
+ CMPL (DX)(BX*1), SI
JEQ match_nolit_loop_encodeBlockAsm8B
INCL CX
JMP search_loop_encodeBlockAsm8B
@@ -5763,8 +5728,8 @@ zero_loop_encodeBetterBlockAsm:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -6(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -5774,818 +5739,810 @@ zero_loop_encodeBetterBlockAsm:
MOVQ src_base+24(FP), DX
search_loop_encodeBetterBlockAsm:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x07, SI
- CMPL SI, $0x63
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x07, BX
+ CMPL BX, $0x63
JLE check_maxskip_ok_encodeBetterBlockAsm
- LEAL 100(CX), SI
+ LEAL 100(CX), BX
JMP check_maxskip_cont_encodeBetterBlockAsm
check_maxskip_ok_encodeBetterBlockAsm:
- LEAL 1(CX)(SI*1), SI
+ LEAL 1(CX)(BX*1), BX
check_maxskip_cont_encodeBetterBlockAsm:
- CMPL SI, 8(SP)
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeBetterBlockAsm
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x00cf1bbcdcbfa563, R9
- MOVQ $0x9e3779b1, SI
- MOVQ DI, R10
- MOVQ DI, R11
- SHLQ $0x08, R10
- IMULQ R9, R10
- SHRQ $0x2f, R10
- SHLQ $0x20, R11
- IMULQ SI, R11
- SHRQ $0x32, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 524312(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- MOVL CX, 524312(SP)(R11*4)
- MOVQ (DX)(SI*1), R10
- MOVQ (DX)(R8*1), R11
- CMPQ R10, DI
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x00cf1bbcdcbfa563, R8
+ MOVQ $0x9e3779b1, BX
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHLQ $0x08, R9
+ IMULQ R8, R9
+ SHRQ $0x2f, R9
+ SHLQ $0x20, R10
+ IMULQ BX, R10
+ SHRQ $0x32, R10
+ MOVL 24(SP)(R9*4), BX
+ MOVL 524312(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ MOVL CX, 524312(SP)(R10*4)
+ MOVQ (DX)(BX*1), R9
+ MOVQ (DX)(DI*1), R10
+ CMPQ R9, SI
JEQ candidate_match_encodeBetterBlockAsm
- CMPQ R11, DI
+ CMPQ R10, SI
JNE no_short_found_encodeBetterBlockAsm
- MOVL R8, SI
+ MOVL DI, BX
JMP candidate_match_encodeBetterBlockAsm
no_short_found_encodeBetterBlockAsm:
- CMPL R10, DI
+ CMPL R9, SI
JEQ candidate_match_encodeBetterBlockAsm
- CMPL R11, DI
+ CMPL R10, SI
JEQ candidateS_match_encodeBetterBlockAsm
MOVL 20(SP), CX
JMP search_loop_encodeBetterBlockAsm
candidateS_match_encodeBetterBlockAsm:
- SHRQ $0x08, DI
- MOVQ DI, R10
- SHLQ $0x08, R10
- IMULQ R9, R10
- SHRQ $0x2f, R10
- MOVL 24(SP)(R10*4), SI
+ SHRQ $0x08, SI
+ MOVQ SI, R9
+ SHLQ $0x08, R9
+ IMULQ R8, R9
+ SHRQ $0x2f, R9
+ MOVL 24(SP)(R9*4), BX
INCL CX
- MOVL CX, 24(SP)(R10*4)
- CMPL (DX)(SI*1), DI
+ MOVL CX, 24(SP)(R9*4)
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeBetterBlockAsm
DECL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeBetterBlockAsm:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeBetterBlockAsm
match_extend_back_loop_encodeBetterBlockAsm:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeBetterBlockAsm
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeBetterBlockAsm
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeBetterBlockAsm
JMP match_extend_back_loop_encodeBetterBlockAsm
match_extend_back_end_encodeBetterBlockAsm:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 5(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 5(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeBetterBlockAsm
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeBetterBlockAsm:
- MOVL CX, DI
+ MOVL CX, SI
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), R10
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), R9
// matchLen
- XORL R12, R12
- CMPL R8, $0x08
+ XORL R11, R11
+ CMPL DI, $0x08
JL matchlen_match4_match_nolit_encodeBetterBlockAsm
matchlen_loopback_match_nolit_encodeBetterBlockAsm:
- MOVQ (R9)(R12*1), R11
- XORQ (R10)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R8)(R11*1), R10
+ XORQ (R9)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_match_nolit_encodeBetterBlockAsm
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP match_nolit_end_encodeBetterBlockAsm
matchlen_loop_match_nolit_encodeBetterBlockAsm:
- LEAL -8(R8), R8
- LEAL 8(R12), R12
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R11), R11
+ CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm
JZ match_nolit_end_encodeBetterBlockAsm
matchlen_match4_match_nolit_encodeBetterBlockAsm:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_match_nolit_encodeBetterBlockAsm
- MOVL (R9)(R12*1), R11
- CMPL (R10)(R12*1), R11
+ MOVL (R8)(R11*1), R10
+ CMPL (R9)(R11*1), R10
JNE matchlen_match2_match_nolit_encodeBetterBlockAsm
- SUBL $0x04, R8
- LEAL 4(R12), R12
+ SUBL $0x04, DI
+ LEAL 4(R11), R11
matchlen_match2_match_nolit_encodeBetterBlockAsm:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_match_nolit_encodeBetterBlockAsm
- MOVW (R9)(R12*1), R11
- CMPW (R10)(R12*1), R11
+ MOVW (R8)(R11*1), R10
+ CMPW (R9)(R11*1), R10
JNE matchlen_match1_match_nolit_encodeBetterBlockAsm
- SUBL $0x02, R8
- LEAL 2(R12), R12
+ SUBL $0x02, DI
+ LEAL 2(R11), R11
matchlen_match1_match_nolit_encodeBetterBlockAsm:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL match_nolit_end_encodeBetterBlockAsm
- MOVB (R9)(R12*1), R11
- CMPB (R10)(R12*1), R11
+ MOVB (R8)(R11*1), R10
+ CMPB (R9)(R11*1), R10
JNE match_nolit_end_encodeBetterBlockAsm
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
match_nolit_end_encodeBetterBlockAsm:
- MOVL CX, R8
- SUBL SI, R8
+ MOVL CX, DI
+ SUBL BX, DI
// Check if repeat
- CMPL 16(SP), R8
+ CMPL 16(SP), DI
JEQ match_is_repeat_encodeBetterBlockAsm
- CMPL R12, $0x01
+ CMPL R11, $0x01
JG match_length_ok_encodeBetterBlockAsm
- CMPL R8, $0x0000ffff
+ CMPL DI, $0x0000ffff
JLE match_length_ok_encodeBetterBlockAsm
MOVL 20(SP), CX
INCL CX
JMP search_loop_encodeBetterBlockAsm
match_length_ok_encodeBetterBlockAsm:
- MOVL R8, 16(SP)
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL DI, 16(SP)
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_encodeBetterBlockAsm
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_encodeBetterBlockAsm
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_encodeBetterBlockAsm
- CMPL SI, $0x00010000
+ CMPL BX, $0x00010000
JLT three_bytes_match_emit_encodeBetterBlockAsm
- CMPL SI, $0x01000000
+ CMPL BX, $0x01000000
JLT four_bytes_match_emit_encodeBetterBlockAsm
MOVB $0xfc, (AX)
- MOVL SI, 1(AX)
+ MOVL BX, 1(AX)
ADDQ $0x05, AX
JMP memmove_long_match_emit_encodeBetterBlockAsm
four_bytes_match_emit_encodeBetterBlockAsm:
- MOVL SI, R11
- SHRL $0x10, R11
+ MOVL BX, R10
+ SHRL $0x10, R10
MOVB $0xf8, (AX)
- MOVW SI, 1(AX)
- MOVB R11, 3(AX)
+ MOVW BX, 1(AX)
+ MOVB R10, 3(AX)
ADDQ $0x04, AX
JMP memmove_long_match_emit_encodeBetterBlockAsm
three_bytes_match_emit_encodeBetterBlockAsm:
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeBetterBlockAsm
two_bytes_match_emit_encodeBetterBlockAsm:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_encodeBetterBlockAsm
JMP memmove_long_match_emit_encodeBetterBlockAsm
one_byte_match_emit_encodeBetterBlockAsm:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeBetterBlockAsm:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x04
+ CMPQ R8, $0x04
JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64
emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4:
- MOVL (R10), R11
- MOVL R11, (AX)
+ MOVL (R9), R10
+ MOVL R10, (AX)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7:
- MOVL (R10), R11
- MOVL -4(R10)(R9*1), R10
- MOVL R11, (AX)
- MOVL R10, -4(AX)(R9*1)
+ MOVL (R9), R10
+ MOVL -4(R9)(R8*1), R9
+ MOVL R10, (AX)
+ MOVL R9, -4(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeBetterBlockAsm:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_encodeBetterBlockAsm
memmove_long_match_emit_encodeBetterBlockAsm:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_encodeBetterBlockAsm:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitCopy
- CMPL R8, $0x00010000
+ CMPL DI, $0x00010000
JL two_byte_offset_match_nolit_encodeBetterBlockAsm
-
-four_bytes_loop_back_match_nolit_encodeBetterBlockAsm:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm
MOVB $0xff, (AX)
- MOVL R8, 1(AX)
- LEAL -64(R12), R12
+ MOVL DI, 1(AX)
+ LEAL -64(R11), R11
ADDQ $0x05, AX
- CMPL R12, $0x04
+ CMPL R11, $0x04
JL four_bytes_remain_match_nolit_encodeBetterBlockAsm
// emitRepeat
emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy:
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy
- CMPL R12, $0x00010100
+ CMPL R11, $0x00010100
JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy
- CMPL R12, $0x0100ffff
+ CMPL R11, $0x0100ffff
JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy
- LEAL -16842747(R12), R12
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ LEAL -16842747(R11), R11
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy
repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy:
- LEAL -65536(R12), R12
- MOVL R12, R8
+ LEAL -65536(R11), R11
+ MOVL R11, DI
MOVW $0x001d, (AX)
- MOVW R12, 2(AX)
- SARL $0x10, R8
- MOVB R8, 4(AX)
+ MOVW R11, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy:
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
- JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm
four_bytes_remain_match_nolit_encodeBetterBlockAsm:
- TESTL R12, R12
+ TESTL R11, R11
JZ match_nolit_emitcopy_end_encodeBetterBlockAsm
- MOVB $0x03, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVL R8, 1(AX)
+ XORL BX, BX
+ LEAL -1(BX)(R11*4), R11
+ MOVB R11, (AX)
+ MOVL DI, 1(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
two_byte_offset_match_nolit_encodeBetterBlockAsm:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JAE long_offset_short_match_nolit_encodeBetterBlockAsm
- MOVL $0x00000001, SI
- LEAL 16(SI), SI
- MOVB R8, 1(AX)
- MOVL R8, R9
- SHRL $0x08, R9
- SHLL $0x05, R9
- ORL R9, SI
- MOVB SI, (AX)
+ MOVL $0x00000001, BX
+ LEAL 16(BX), BX
+ MOVB DI, 1(AX)
+ MOVL DI, R8
+ SHRL $0x08, R8
+ SHLL $0x05, R8
+ ORL R8, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
- SUBL $0x08, R12
+ SUBL $0x08, R11
// emitRepeat
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
- CMPL R12, $0x00010100
+ CMPL R11, $0x00010100
JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
- CMPL R12, $0x0100ffff
+ CMPL R11, $0x0100ffff
JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
- LEAL -16842747(R12), R12
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ LEAL -16842747(R11), R11
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
- LEAL -65536(R12), R12
- MOVL R12, R8
+ LEAL -65536(R11), R11
+ MOVL R11, DI
MOVW $0x001d, (AX)
- MOVW R12, 2(AX)
- SARL $0x10, R8
- MOVB R8, 4(AX)
+ MOVW R11, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
long_offset_short_match_nolit_encodeBetterBlockAsm:
MOVB $0xee, (AX)
- MOVW R8, 1(AX)
- LEAL -60(R12), R12
+ MOVW DI, 1(AX)
+ LEAL -60(R11), R11
ADDQ $0x03, AX
// emitRepeat
emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short:
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short
- CMPL R12, $0x00010100
+ CMPL R11, $0x00010100
JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short
- CMPL R12, $0x0100ffff
+ CMPL R11, $0x0100ffff
JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short
- LEAL -16842747(R12), R12
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ LEAL -16842747(R11), R11
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short
repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short:
- LEAL -65536(R12), R12
- MOVL R12, R8
+ LEAL -65536(R11), R11
+ MOVL R11, DI
MOVW $0x001d, (AX)
- MOVW R12, 2(AX)
- SARL $0x10, R8
- MOVB R8, 4(AX)
+ MOVW R11, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short:
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
- JMP two_byte_offset_match_nolit_encodeBetterBlockAsm
two_byte_offset_short_match_nolit_encodeBetterBlockAsm:
- CMPL R12, $0x0c
+ MOVL R11, BX
+ SHLL $0x02, BX
+ CMPL R11, $0x0c
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm
- MOVB $0x01, BL
- LEAL -16(BX)(R12*4), R12
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ LEAL -15(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
emit_copy_three_match_nolit_encodeBetterBlockAsm:
- MOVB $0x02, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVW R8, 1(AX)
+ LEAL -2(BX), BX
+ MOVB BL, (AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
match_is_repeat_encodeBetterBlockAsm:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_repeat_encodeBetterBlockAsm
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm
- CMPL SI, $0x00010000
+ CMPL BX, $0x00010000
JLT three_bytes_match_emit_repeat_encodeBetterBlockAsm
- CMPL SI, $0x01000000
+ CMPL BX, $0x01000000
JLT four_bytes_match_emit_repeat_encodeBetterBlockAsm
MOVB $0xfc, (AX)
- MOVL SI, 1(AX)
+ MOVL BX, 1(AX)
ADDQ $0x05, AX
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
four_bytes_match_emit_repeat_encodeBetterBlockAsm:
- MOVL SI, R11
- SHRL $0x10, R11
+ MOVL BX, R10
+ SHRL $0x10, R10
MOVB $0xf8, (AX)
- MOVW SI, 1(AX)
- MOVB R11, 3(AX)
+ MOVW BX, 1(AX)
+ MOVB R10, 3(AX)
ADDQ $0x04, AX
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
three_bytes_match_emit_repeat_encodeBetterBlockAsm:
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
two_bytes_match_emit_repeat_encodeBetterBlockAsm:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_repeat_encodeBetterBlockAsm
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
one_byte_match_emit_repeat_encodeBetterBlockAsm:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_repeat_encodeBetterBlockAsm:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x04
+ CMPQ R8, $0x04
JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32
JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4:
- MOVL (R10), R11
- MOVL R11, (AX)
+ MOVL (R9), R10
+ MOVL R10, (AX)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7:
- MOVL (R10), R11
- MOVL -4(R10)(R9*1), R10
- MOVL R11, (AX)
- MOVL R10, -4(AX)(R9*1)
+ MOVL (R9), R10
+ MOVL -4(R9)(R8*1), R9
+ MOVL R10, (AX)
+ MOVL R9, -4(AX)(R8*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
memmove_long_match_emit_repeat_encodeBetterBlockAsm:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_repeat_encodeBetterBlockAsm:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitRepeat
emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm:
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm
- CMPL R12, $0x00010100
+ CMPL R11, $0x00010100
JLT repeat_four_match_nolit_repeat_encodeBetterBlockAsm
- CMPL R12, $0x0100ffff
+ CMPL R11, $0x0100ffff
JLT repeat_five_match_nolit_repeat_encodeBetterBlockAsm
- LEAL -16842747(R12), R12
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ LEAL -16842747(R11), R11
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
JMP emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm
repeat_five_match_nolit_repeat_encodeBetterBlockAsm:
- LEAL -65536(R12), R12
- MOVL R12, R8
+ LEAL -65536(R11), R11
+ MOVL R11, DI
MOVW $0x001d, (AX)
- MOVW R12, 2(AX)
- SARL $0x10, R8
- MOVB R8, 4(AX)
+ MOVW R11, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_four_match_nolit_repeat_encodeBetterBlockAsm:
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_three_match_nolit_repeat_encodeBetterBlockAsm:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_two_match_nolit_repeat_encodeBetterBlockAsm:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
match_nolit_emitcopy_end_encodeBetterBlockAsm:
@@ -6597,50 +6554,50 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm:
RET
match_nolit_dst_ok_encodeBetterBlockAsm:
- MOVQ $0x00cf1bbcdcbfa563, SI
- MOVQ $0x9e3779b1, R8
- LEAQ 1(DI), DI
- LEAQ -2(CX), R9
- MOVQ (DX)(DI*1), R10
- MOVQ 1(DX)(DI*1), R11
- MOVQ (DX)(R9*1), R12
- MOVQ 1(DX)(R9*1), R13
- SHLQ $0x08, R10
- IMULQ SI, R10
- SHRQ $0x2f, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x32, R11
- SHLQ $0x08, R12
- IMULQ SI, R12
- SHRQ $0x2f, R12
- SHLQ $0x20, R13
- IMULQ R8, R13
- SHRQ $0x32, R13
- LEAQ 1(DI), R8
- LEAQ 1(R9), R14
- MOVL DI, 24(SP)(R10*4)
- MOVL R9, 24(SP)(R12*4)
- MOVL R8, 524312(SP)(R11*4)
- MOVL R14, 524312(SP)(R13*4)
- ADDQ $0x01, DI
- SUBQ $0x01, R9
+ MOVQ $0x00cf1bbcdcbfa563, BX
+ MOVQ $0x9e3779b1, DI
+ LEAQ 1(SI), SI
+ LEAQ -2(CX), R8
+ MOVQ (DX)(SI*1), R9
+ MOVQ 1(DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ MOVQ 1(DX)(R8*1), R12
+ SHLQ $0x08, R9
+ IMULQ BX, R9
+ SHRQ $0x2f, R9
+ SHLQ $0x20, R10
+ IMULQ DI, R10
+ SHRQ $0x32, R10
+ SHLQ $0x08, R11
+ IMULQ BX, R11
+ SHRQ $0x2f, R11
+ SHLQ $0x20, R12
+ IMULQ DI, R12
+ SHRQ $0x32, R12
+ LEAQ 1(SI), DI
+ LEAQ 1(R8), R13
+ MOVL SI, 24(SP)(R9*4)
+ MOVL R8, 24(SP)(R11*4)
+ MOVL DI, 524312(SP)(R10*4)
+ MOVL R13, 524312(SP)(R12*4)
+ ADDQ $0x01, SI
+ SUBQ $0x01, R8
index_loop_encodeBetterBlockAsm:
- CMPQ DI, R9
+ CMPQ SI, R8
JAE search_loop_encodeBetterBlockAsm
- MOVQ (DX)(DI*1), R8
- MOVQ (DX)(R9*1), R10
- SHLQ $0x08, R8
- IMULQ SI, R8
- SHRQ $0x2f, R8
- SHLQ $0x08, R10
- IMULQ SI, R10
- SHRQ $0x2f, R10
- MOVL DI, 24(SP)(R8*4)
- MOVL R9, 24(SP)(R10*4)
- ADDQ $0x02, DI
- SUBQ $0x02, R9
+ MOVQ (DX)(SI*1), DI
+ MOVQ (DX)(R8*1), R9
+ SHLQ $0x08, DI
+ IMULQ BX, DI
+ SHRQ $0x2f, DI
+ SHLQ $0x08, R9
+ IMULQ BX, R9
+ SHRQ $0x2f, R9
+ MOVL SI, 24(SP)(DI*4)
+ MOVL R8, 24(SP)(R9*4)
+ ADDQ $0x02, SI
+ SUBQ $0x02, R8
JMP index_loop_encodeBetterBlockAsm
emit_remainder_encodeBetterBlockAsm:
@@ -6842,8 +6799,8 @@ zero_loop_encodeBetterBlockAsm4MB:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -6(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -6853,756 +6810,752 @@ zero_loop_encodeBetterBlockAsm4MB:
MOVQ src_base+24(FP), DX
search_loop_encodeBetterBlockAsm4MB:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x07, SI
- CMPL SI, $0x63
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x07, BX
+ CMPL BX, $0x63
JLE check_maxskip_ok_encodeBetterBlockAsm4MB
- LEAL 100(CX), SI
+ LEAL 100(CX), BX
JMP check_maxskip_cont_encodeBetterBlockAsm4MB
check_maxskip_ok_encodeBetterBlockAsm4MB:
- LEAL 1(CX)(SI*1), SI
+ LEAL 1(CX)(BX*1), BX
check_maxskip_cont_encodeBetterBlockAsm4MB:
- CMPL SI, 8(SP)
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeBetterBlockAsm4MB
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x00cf1bbcdcbfa563, R9
- MOVQ $0x9e3779b1, SI
- MOVQ DI, R10
- MOVQ DI, R11
- SHLQ $0x08, R10
- IMULQ R9, R10
- SHRQ $0x2f, R10
- SHLQ $0x20, R11
- IMULQ SI, R11
- SHRQ $0x32, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 524312(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- MOVL CX, 524312(SP)(R11*4)
- MOVQ (DX)(SI*1), R10
- MOVQ (DX)(R8*1), R11
- CMPQ R10, DI
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x00cf1bbcdcbfa563, R8
+ MOVQ $0x9e3779b1, BX
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHLQ $0x08, R9
+ IMULQ R8, R9
+ SHRQ $0x2f, R9
+ SHLQ $0x20, R10
+ IMULQ BX, R10
+ SHRQ $0x32, R10
+ MOVL 24(SP)(R9*4), BX
+ MOVL 524312(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ MOVL CX, 524312(SP)(R10*4)
+ MOVQ (DX)(BX*1), R9
+ MOVQ (DX)(DI*1), R10
+ CMPQ R9, SI
JEQ candidate_match_encodeBetterBlockAsm4MB
- CMPQ R11, DI
+ CMPQ R10, SI
JNE no_short_found_encodeBetterBlockAsm4MB
- MOVL R8, SI
+ MOVL DI, BX
JMP candidate_match_encodeBetterBlockAsm4MB
no_short_found_encodeBetterBlockAsm4MB:
- CMPL R10, DI
+ CMPL R9, SI
JEQ candidate_match_encodeBetterBlockAsm4MB
- CMPL R11, DI
+ CMPL R10, SI
JEQ candidateS_match_encodeBetterBlockAsm4MB
MOVL 20(SP), CX
JMP search_loop_encodeBetterBlockAsm4MB
candidateS_match_encodeBetterBlockAsm4MB:
- SHRQ $0x08, DI
- MOVQ DI, R10
- SHLQ $0x08, R10
- IMULQ R9, R10
- SHRQ $0x2f, R10
- MOVL 24(SP)(R10*4), SI
+ SHRQ $0x08, SI
+ MOVQ SI, R9
+ SHLQ $0x08, R9
+ IMULQ R8, R9
+ SHRQ $0x2f, R9
+ MOVL 24(SP)(R9*4), BX
INCL CX
- MOVL CX, 24(SP)(R10*4)
- CMPL (DX)(SI*1), DI
+ MOVL CX, 24(SP)(R9*4)
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeBetterBlockAsm4MB
DECL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeBetterBlockAsm4MB:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeBetterBlockAsm4MB
match_extend_back_loop_encodeBetterBlockAsm4MB:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeBetterBlockAsm4MB
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeBetterBlockAsm4MB
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeBetterBlockAsm4MB
JMP match_extend_back_loop_encodeBetterBlockAsm4MB
match_extend_back_end_encodeBetterBlockAsm4MB:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 4(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 4(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeBetterBlockAsm4MB
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeBetterBlockAsm4MB:
- MOVL CX, DI
+ MOVL CX, SI
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), R10
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), R9
// matchLen
- XORL R12, R12
- CMPL R8, $0x08
+ XORL R11, R11
+ CMPL DI, $0x08
JL matchlen_match4_match_nolit_encodeBetterBlockAsm4MB
matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB:
- MOVQ (R9)(R12*1), R11
- XORQ (R10)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R8)(R11*1), R10
+ XORQ (R9)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_match_nolit_encodeBetterBlockAsm4MB
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP match_nolit_end_encodeBetterBlockAsm4MB
matchlen_loop_match_nolit_encodeBetterBlockAsm4MB:
- LEAL -8(R8), R8
- LEAL 8(R12), R12
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R11), R11
+ CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB
JZ match_nolit_end_encodeBetterBlockAsm4MB
matchlen_match4_match_nolit_encodeBetterBlockAsm4MB:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_match_nolit_encodeBetterBlockAsm4MB
- MOVL (R9)(R12*1), R11
- CMPL (R10)(R12*1), R11
+ MOVL (R8)(R11*1), R10
+ CMPL (R9)(R11*1), R10
JNE matchlen_match2_match_nolit_encodeBetterBlockAsm4MB
- SUBL $0x04, R8
- LEAL 4(R12), R12
+ SUBL $0x04, DI
+ LEAL 4(R11), R11
matchlen_match2_match_nolit_encodeBetterBlockAsm4MB:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_match_nolit_encodeBetterBlockAsm4MB
- MOVW (R9)(R12*1), R11
- CMPW (R10)(R12*1), R11
+ MOVW (R8)(R11*1), R10
+ CMPW (R9)(R11*1), R10
JNE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB
- SUBL $0x02, R8
- LEAL 2(R12), R12
+ SUBL $0x02, DI
+ LEAL 2(R11), R11
matchlen_match1_match_nolit_encodeBetterBlockAsm4MB:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL match_nolit_end_encodeBetterBlockAsm4MB
- MOVB (R9)(R12*1), R11
- CMPB (R10)(R12*1), R11
+ MOVB (R8)(R11*1), R10
+ CMPB (R9)(R11*1), R10
JNE match_nolit_end_encodeBetterBlockAsm4MB
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
match_nolit_end_encodeBetterBlockAsm4MB:
- MOVL CX, R8
- SUBL SI, R8
+ MOVL CX, DI
+ SUBL BX, DI
// Check if repeat
- CMPL 16(SP), R8
+ CMPL 16(SP), DI
JEQ match_is_repeat_encodeBetterBlockAsm4MB
- CMPL R12, $0x01
+ CMPL R11, $0x01
JG match_length_ok_encodeBetterBlockAsm4MB
- CMPL R8, $0x0000ffff
+ CMPL DI, $0x0000ffff
JLE match_length_ok_encodeBetterBlockAsm4MB
MOVL 20(SP), CX
INCL CX
JMP search_loop_encodeBetterBlockAsm4MB
match_length_ok_encodeBetterBlockAsm4MB:
- MOVL R8, 16(SP)
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL DI, 16(SP)
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_encodeBetterBlockAsm4MB
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_encodeBetterBlockAsm4MB
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_encodeBetterBlockAsm4MB
- CMPL SI, $0x00010000
+ CMPL BX, $0x00010000
JLT three_bytes_match_emit_encodeBetterBlockAsm4MB
- MOVL SI, R11
- SHRL $0x10, R11
+ MOVL BX, R10
+ SHRL $0x10, R10
MOVB $0xf8, (AX)
- MOVW SI, 1(AX)
- MOVB R11, 3(AX)
+ MOVW BX, 1(AX)
+ MOVB R10, 3(AX)
ADDQ $0x04, AX
JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
three_bytes_match_emit_encodeBetterBlockAsm4MB:
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
two_bytes_match_emit_encodeBetterBlockAsm4MB:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_encodeBetterBlockAsm4MB
JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
one_byte_match_emit_encodeBetterBlockAsm4MB:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeBetterBlockAsm4MB:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x04
+ CMPQ R8, $0x04
JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64
emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4:
- MOVL (R10), R11
- MOVL R11, (AX)
+ MOVL (R9), R10
+ MOVL R10, (AX)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7:
- MOVL (R10), R11
- MOVL -4(R10)(R9*1), R10
- MOVL R11, (AX)
- MOVL R10, -4(AX)(R9*1)
+ MOVL (R9), R10
+ MOVL -4(R9)(R8*1), R9
+ MOVL R10, (AX)
+ MOVL R9, -4(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeBetterBlockAsm4MB:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB
memmove_long_match_emit_encodeBetterBlockAsm4MB:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_encodeBetterBlockAsm4MB:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitCopy
- CMPL R8, $0x00010000
+ CMPL DI, $0x00010000
JL two_byte_offset_match_nolit_encodeBetterBlockAsm4MB
-
-four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
MOVB $0xff, (AX)
- MOVL R8, 1(AX)
- LEAL -64(R12), R12
+ MOVL DI, 1(AX)
+ LEAL -64(R11), R11
ADDQ $0x05, AX
- CMPL R12, $0x04
+ CMPL R11, $0x04
JL four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
// emitRepeat
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy
- CMPL R12, $0x00010100
+ CMPL R11, $0x00010100
JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy
- LEAL -65536(R12), R12
- MOVL R12, R8
+ LEAL -65536(R11), R11
+ MOVL R11, DI
MOVW $0x001d, (AX)
- MOVW R12, 2(AX)
- SARL $0x10, R8
- MOVB R8, 4(AX)
+ MOVW R11, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
- JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB
four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB:
- TESTL R12, R12
+ TESTL R11, R11
JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
- MOVB $0x03, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVL R8, 1(AX)
+ XORL BX, BX
+ LEAL -1(BX)(R11*4), R11
+ MOVB R11, (AX)
+ MOVL DI, 1(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
two_byte_offset_match_nolit_encodeBetterBlockAsm4MB:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JAE long_offset_short_match_nolit_encodeBetterBlockAsm4MB
- MOVL $0x00000001, SI
- LEAL 16(SI), SI
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, SI
- MOVB SI, (AX)
+ MOVL $0x00000001, BX
+ LEAL 16(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
- SUBL $0x08, R12
+ SUBL $0x08, R11
// emitRepeat
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
- CMPL R12, $0x00010100
+ CMPL R11, $0x00010100
JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
- LEAL -65536(R12), R12
- MOVL R12, R8
+ LEAL -65536(R11), R11
+ MOVL R11, DI
MOVW $0x001d, (AX)
- MOVW R12, 2(AX)
- SARL $0x10, R8
- MOVB R8, 4(AX)
+ MOVW R11, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
long_offset_short_match_nolit_encodeBetterBlockAsm4MB:
MOVB $0xee, (AX)
- MOVW R8, 1(AX)
- LEAL -60(R12), R12
+ MOVW DI, 1(AX)
+ LEAL -60(R11), R11
ADDQ $0x03, AX
// emitRepeat
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
- CMPL R12, $0x00010100
+ CMPL R11, $0x00010100
JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
- LEAL -65536(R12), R12
- MOVL R12, R8
+ LEAL -65536(R11), R11
+ MOVL R11, DI
MOVW $0x001d, (AX)
- MOVW R12, 2(AX)
- SARL $0x10, R8
- MOVB R8, 4(AX)
+ MOVW R11, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
- JMP two_byte_offset_match_nolit_encodeBetterBlockAsm4MB
two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB:
- CMPL R12, $0x0c
+ MOVL R11, BX
+ SHLL $0x02, BX
+ CMPL R11, $0x0c
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
- MOVB $0x01, BL
- LEAL -16(BX)(R12*4), R12
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ LEAL -15(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
emit_copy_three_match_nolit_encodeBetterBlockAsm4MB:
- MOVB $0x02, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVW R8, 1(AX)
+ LEAL -2(BX), BX
+ MOVB BL, (AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
match_is_repeat_encodeBetterBlockAsm4MB:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_repeat_encodeBetterBlockAsm4MB
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
- CMPL SI, $0x00010000
+ CMPL BX, $0x00010000
JLT three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
- MOVL SI, R11
- SHRL $0x10, R11
+ MOVL BX, R10
+ SHRL $0x10, R10
MOVB $0xf8, (AX)
- MOVW SI, 1(AX)
- MOVB R11, 3(AX)
+ MOVW BX, 1(AX)
+ MOVB R10, 3(AX)
ADDQ $0x04, AX
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_repeat_encodeBetterBlockAsm4MB
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
one_byte_match_emit_repeat_encodeBetterBlockAsm4MB:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_repeat_encodeBetterBlockAsm4MB:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x04
+ CMPQ R8, $0x04
JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32
JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4:
- MOVL (R10), R11
- MOVL R11, (AX)
+ MOVL (R9), R10
+ MOVL R10, (AX)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7:
- MOVL (R10), R11
- MOVL -4(R10)(R9*1), R10
- MOVL R11, (AX)
- MOVL R10, -4(AX)(R9*1)
+ MOVL (R9), R10
+ MOVL -4(R9)(R8*1), R9
+ MOVL R10, (AX)
+ MOVL R9, -4(AX)(R8*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitRepeat
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB
- CMPL R12, $0x00010100
+ CMPL R11, $0x00010100
JLT repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB
- LEAL -65536(R12), R12
- MOVL R12, R8
+ LEAL -65536(R11), R11
+ MOVL R11, DI
MOVW $0x001d, (AX)
- MOVW R12, 2(AX)
- SARL $0x10, R8
- MOVB R8, 4(AX)
+ MOVW R11, 2(AX)
+ SARL $0x10, DI
+ MOVB DI, 4(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB:
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
match_nolit_emitcopy_end_encodeBetterBlockAsm4MB:
@@ -7614,50 +7567,50 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB:
RET
match_nolit_dst_ok_encodeBetterBlockAsm4MB:
- MOVQ $0x00cf1bbcdcbfa563, SI
- MOVQ $0x9e3779b1, R8
- LEAQ 1(DI), DI
- LEAQ -2(CX), R9
- MOVQ (DX)(DI*1), R10
- MOVQ 1(DX)(DI*1), R11
- MOVQ (DX)(R9*1), R12
- MOVQ 1(DX)(R9*1), R13
- SHLQ $0x08, R10
- IMULQ SI, R10
- SHRQ $0x2f, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x32, R11
- SHLQ $0x08, R12
- IMULQ SI, R12
- SHRQ $0x2f, R12
- SHLQ $0x20, R13
- IMULQ R8, R13
- SHRQ $0x32, R13
- LEAQ 1(DI), R8
- LEAQ 1(R9), R14
- MOVL DI, 24(SP)(R10*4)
- MOVL R9, 24(SP)(R12*4)
- MOVL R8, 524312(SP)(R11*4)
- MOVL R14, 524312(SP)(R13*4)
- ADDQ $0x01, DI
- SUBQ $0x01, R9
+ MOVQ $0x00cf1bbcdcbfa563, BX
+ MOVQ $0x9e3779b1, DI
+ LEAQ 1(SI), SI
+ LEAQ -2(CX), R8
+ MOVQ (DX)(SI*1), R9
+ MOVQ 1(DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ MOVQ 1(DX)(R8*1), R12
+ SHLQ $0x08, R9
+ IMULQ BX, R9
+ SHRQ $0x2f, R9
+ SHLQ $0x20, R10
+ IMULQ DI, R10
+ SHRQ $0x32, R10
+ SHLQ $0x08, R11
+ IMULQ BX, R11
+ SHRQ $0x2f, R11
+ SHLQ $0x20, R12
+ IMULQ DI, R12
+ SHRQ $0x32, R12
+ LEAQ 1(SI), DI
+ LEAQ 1(R8), R13
+ MOVL SI, 24(SP)(R9*4)
+ MOVL R8, 24(SP)(R11*4)
+ MOVL DI, 524312(SP)(R10*4)
+ MOVL R13, 524312(SP)(R12*4)
+ ADDQ $0x01, SI
+ SUBQ $0x01, R8
index_loop_encodeBetterBlockAsm4MB:
- CMPQ DI, R9
+ CMPQ SI, R8
JAE search_loop_encodeBetterBlockAsm4MB
- MOVQ (DX)(DI*1), R8
- MOVQ (DX)(R9*1), R10
- SHLQ $0x08, R8
- IMULQ SI, R8
- SHRQ $0x2f, R8
- SHLQ $0x08, R10
- IMULQ SI, R10
- SHRQ $0x2f, R10
- MOVL DI, 24(SP)(R8*4)
- MOVL R9, 24(SP)(R10*4)
- ADDQ $0x02, DI
- SUBQ $0x02, R9
+ MOVQ (DX)(SI*1), DI
+ MOVQ (DX)(R8*1), R9
+ SHLQ $0x08, DI
+ IMULQ BX, DI
+ SHRQ $0x2f, DI
+ SHLQ $0x08, R9
+ IMULQ BX, R9
+ SHRQ $0x2f, R9
+ MOVL SI, 24(SP)(DI*4)
+ MOVL R8, 24(SP)(R9*4)
+ ADDQ $0x02, SI
+ SUBQ $0x02, R8
JMP index_loop_encodeBetterBlockAsm4MB
emit_remainder_encodeBetterBlockAsm4MB:
@@ -7851,8 +7804,8 @@ zero_loop_encodeBetterBlockAsm12B:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -6(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -7862,601 +7815,599 @@ zero_loop_encodeBetterBlockAsm12B:
MOVQ src_base+24(FP), DX
search_loop_encodeBetterBlockAsm12B:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x06, SI
- LEAL 1(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x06, BX
+ LEAL 1(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeBetterBlockAsm12B
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ $0x9e3779b1, SI
- MOVQ DI, R10
- MOVQ DI, R11
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x32, R10
- SHLQ $0x20, R11
- IMULQ SI, R11
- SHRQ $0x34, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 65560(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- MOVL CX, 65560(SP)(R11*4)
- MOVQ (DX)(SI*1), R10
- MOVQ (DX)(R8*1), R11
- CMPQ R10, DI
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ $0x9e3779b1, BX
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x32, R9
+ SHLQ $0x20, R10
+ IMULQ BX, R10
+ SHRQ $0x34, R10
+ MOVL 24(SP)(R9*4), BX
+ MOVL 65560(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ MOVL CX, 65560(SP)(R10*4)
+ MOVQ (DX)(BX*1), R9
+ MOVQ (DX)(DI*1), R10
+ CMPQ R9, SI
JEQ candidate_match_encodeBetterBlockAsm12B
- CMPQ R11, DI
+ CMPQ R10, SI
JNE no_short_found_encodeBetterBlockAsm12B
- MOVL R8, SI
+ MOVL DI, BX
JMP candidate_match_encodeBetterBlockAsm12B
no_short_found_encodeBetterBlockAsm12B:
- CMPL R10, DI
+ CMPL R9, SI
JEQ candidate_match_encodeBetterBlockAsm12B
- CMPL R11, DI
+ CMPL R10, SI
JEQ candidateS_match_encodeBetterBlockAsm12B
MOVL 20(SP), CX
JMP search_loop_encodeBetterBlockAsm12B
candidateS_match_encodeBetterBlockAsm12B:
- SHRQ $0x08, DI
- MOVQ DI, R10
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x32, R10
- MOVL 24(SP)(R10*4), SI
+ SHRQ $0x08, SI
+ MOVQ SI, R9
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x32, R9
+ MOVL 24(SP)(R9*4), BX
INCL CX
- MOVL CX, 24(SP)(R10*4)
- CMPL (DX)(SI*1), DI
+ MOVL CX, 24(SP)(R9*4)
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeBetterBlockAsm12B
DECL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeBetterBlockAsm12B:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeBetterBlockAsm12B
match_extend_back_loop_encodeBetterBlockAsm12B:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeBetterBlockAsm12B
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeBetterBlockAsm12B
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeBetterBlockAsm12B
JMP match_extend_back_loop_encodeBetterBlockAsm12B
match_extend_back_end_encodeBetterBlockAsm12B:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeBetterBlockAsm12B
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeBetterBlockAsm12B:
- MOVL CX, DI
+ MOVL CX, SI
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), R10
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), R9
// matchLen
- XORL R12, R12
- CMPL R8, $0x08
+ XORL R11, R11
+ CMPL DI, $0x08
JL matchlen_match4_match_nolit_encodeBetterBlockAsm12B
matchlen_loopback_match_nolit_encodeBetterBlockAsm12B:
- MOVQ (R9)(R12*1), R11
- XORQ (R10)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R8)(R11*1), R10
+ XORQ (R9)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_match_nolit_encodeBetterBlockAsm12B
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP match_nolit_end_encodeBetterBlockAsm12B
matchlen_loop_match_nolit_encodeBetterBlockAsm12B:
- LEAL -8(R8), R8
- LEAL 8(R12), R12
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R11), R11
+ CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm12B
JZ match_nolit_end_encodeBetterBlockAsm12B
matchlen_match4_match_nolit_encodeBetterBlockAsm12B:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_match_nolit_encodeBetterBlockAsm12B
- MOVL (R9)(R12*1), R11
- CMPL (R10)(R12*1), R11
+ MOVL (R8)(R11*1), R10
+ CMPL (R9)(R11*1), R10
JNE matchlen_match2_match_nolit_encodeBetterBlockAsm12B
- SUBL $0x04, R8
- LEAL 4(R12), R12
+ SUBL $0x04, DI
+ LEAL 4(R11), R11
matchlen_match2_match_nolit_encodeBetterBlockAsm12B:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_match_nolit_encodeBetterBlockAsm12B
- MOVW (R9)(R12*1), R11
- CMPW (R10)(R12*1), R11
+ MOVW (R8)(R11*1), R10
+ CMPW (R9)(R11*1), R10
JNE matchlen_match1_match_nolit_encodeBetterBlockAsm12B
- SUBL $0x02, R8
- LEAL 2(R12), R12
+ SUBL $0x02, DI
+ LEAL 2(R11), R11
matchlen_match1_match_nolit_encodeBetterBlockAsm12B:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL match_nolit_end_encodeBetterBlockAsm12B
- MOVB (R9)(R12*1), R11
- CMPB (R10)(R12*1), R11
+ MOVB (R8)(R11*1), R10
+ CMPB (R9)(R11*1), R10
JNE match_nolit_end_encodeBetterBlockAsm12B
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
match_nolit_end_encodeBetterBlockAsm12B:
- MOVL CX, R8
- SUBL SI, R8
+ MOVL CX, DI
+ SUBL BX, DI
// Check if repeat
- CMPL 16(SP), R8
+ CMPL 16(SP), DI
JEQ match_is_repeat_encodeBetterBlockAsm12B
- MOVL R8, 16(SP)
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL DI, 16(SP)
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_encodeBetterBlockAsm12B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_encodeBetterBlockAsm12B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeBetterBlockAsm12B
two_bytes_match_emit_encodeBetterBlockAsm12B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_encodeBetterBlockAsm12B
JMP memmove_long_match_emit_encodeBetterBlockAsm12B
one_byte_match_emit_encodeBetterBlockAsm12B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeBetterBlockAsm12B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x04
+ CMPQ R8, $0x04
JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64
emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4:
- MOVL (R10), R11
- MOVL R11, (AX)
+ MOVL (R9), R10
+ MOVL R10, (AX)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7:
- MOVL (R10), R11
- MOVL -4(R10)(R9*1), R10
- MOVL R11, (AX)
- MOVL R10, -4(AX)(R9*1)
+ MOVL (R9), R10
+ MOVL -4(R9)(R8*1), R9
+ MOVL R10, (AX)
+ MOVL R9, -4(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeBetterBlockAsm12B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B
memmove_long_match_emit_encodeBetterBlockAsm12B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_encodeBetterBlockAsm12B:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitCopy
-two_byte_offset_match_nolit_encodeBetterBlockAsm12B:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JAE long_offset_short_match_nolit_encodeBetterBlockAsm12B
- MOVL $0x00000001, SI
- LEAL 16(SI), SI
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, SI
- MOVB SI, (AX)
+ MOVL $0x00000001, BX
+ LEAL 16(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
- SUBL $0x08, R12
+ SUBL $0x08, R11
// emitRepeat
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
long_offset_short_match_nolit_encodeBetterBlockAsm12B:
MOVB $0xee, (AX)
- MOVW R8, 1(AX)
- LEAL -60(R12), R12
+ MOVW DI, 1(AX)
+ LEAL -60(R11), R11
ADDQ $0x03, AX
// emitRepeat
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
- JMP two_byte_offset_match_nolit_encodeBetterBlockAsm12B
two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B:
- CMPL R12, $0x0c
+ MOVL R11, BX
+ SHLL $0x02, BX
+ CMPL R11, $0x0c
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B
- MOVB $0x01, BL
- LEAL -16(BX)(R12*4), R12
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ LEAL -15(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
emit_copy_three_match_nolit_encodeBetterBlockAsm12B:
- MOVB $0x02, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVW R8, 1(AX)
+ LEAL -2(BX), BX
+ MOVB BL, (AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
match_is_repeat_encodeBetterBlockAsm12B:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_repeat_encodeBetterBlockAsm12B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm12B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B
two_bytes_match_emit_repeat_encodeBetterBlockAsm12B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_repeat_encodeBetterBlockAsm12B
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B
one_byte_match_emit_repeat_encodeBetterBlockAsm12B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_repeat_encodeBetterBlockAsm12B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x04
+ CMPQ R8, $0x04
JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4:
- MOVL (R10), R11
- MOVL R11, (AX)
+ MOVL (R9), R10
+ MOVL R10, (AX)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7:
- MOVL (R10), R11
- MOVL -4(R10)(R9*1), R10
- MOVL R11, (AX)
- MOVL R10, -4(AX)(R9*1)
+ MOVL (R9), R10
+ MOVL -4(R9)(R8*1), R9
+ MOVL R10, (AX)
+ MOVL R9, -4(AX)(R8*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
memmove_long_match_emit_repeat_encodeBetterBlockAsm12B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitRepeat
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
match_nolit_emitcopy_end_encodeBetterBlockAsm12B:
@@ -8468,50 +8419,50 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm12B:
RET
match_nolit_dst_ok_encodeBetterBlockAsm12B:
- MOVQ $0x0000cf1bbcdcbf9b, SI
- MOVQ $0x9e3779b1, R8
- LEAQ 1(DI), DI
- LEAQ -2(CX), R9
- MOVQ (DX)(DI*1), R10
- MOVQ 1(DX)(DI*1), R11
- MOVQ (DX)(R9*1), R12
- MOVQ 1(DX)(R9*1), R13
- SHLQ $0x10, R10
- IMULQ SI, R10
- SHRQ $0x32, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x34, R11
- SHLQ $0x10, R12
- IMULQ SI, R12
- SHRQ $0x32, R12
- SHLQ $0x20, R13
- IMULQ R8, R13
- SHRQ $0x34, R13
- LEAQ 1(DI), R8
- LEAQ 1(R9), R14
- MOVL DI, 24(SP)(R10*4)
- MOVL R9, 24(SP)(R12*4)
- MOVL R8, 65560(SP)(R11*4)
- MOVL R14, 65560(SP)(R13*4)
- ADDQ $0x01, DI
- SUBQ $0x01, R9
+ MOVQ $0x0000cf1bbcdcbf9b, BX
+ MOVQ $0x9e3779b1, DI
+ LEAQ 1(SI), SI
+ LEAQ -2(CX), R8
+ MOVQ (DX)(SI*1), R9
+ MOVQ 1(DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ MOVQ 1(DX)(R8*1), R12
+ SHLQ $0x10, R9
+ IMULQ BX, R9
+ SHRQ $0x32, R9
+ SHLQ $0x20, R10
+ IMULQ DI, R10
+ SHRQ $0x34, R10
+ SHLQ $0x10, R11
+ IMULQ BX, R11
+ SHRQ $0x32, R11
+ SHLQ $0x20, R12
+ IMULQ DI, R12
+ SHRQ $0x34, R12
+ LEAQ 1(SI), DI
+ LEAQ 1(R8), R13
+ MOVL SI, 24(SP)(R9*4)
+ MOVL R8, 24(SP)(R11*4)
+ MOVL DI, 65560(SP)(R10*4)
+ MOVL R13, 65560(SP)(R12*4)
+ ADDQ $0x01, SI
+ SUBQ $0x01, R8
index_loop_encodeBetterBlockAsm12B:
- CMPQ DI, R9
+ CMPQ SI, R8
JAE search_loop_encodeBetterBlockAsm12B
- MOVQ (DX)(DI*1), R8
- MOVQ (DX)(R9*1), R10
- SHLQ $0x10, R8
- IMULQ SI, R8
- SHRQ $0x32, R8
- SHLQ $0x10, R10
- IMULQ SI, R10
- SHRQ $0x32, R10
- MOVL DI, 24(SP)(R8*4)
- MOVL R9, 24(SP)(R10*4)
- ADDQ $0x02, DI
- SUBQ $0x02, R9
+ MOVQ (DX)(SI*1), DI
+ MOVQ (DX)(R8*1), R9
+ SHLQ $0x10, DI
+ IMULQ BX, DI
+ SHRQ $0x32, DI
+ SHLQ $0x10, R9
+ IMULQ BX, R9
+ SHRQ $0x32, R9
+ MOVL SI, 24(SP)(DI*4)
+ MOVL R8, 24(SP)(R9*4)
+ ADDQ $0x02, SI
+ SUBQ $0x02, R8
JMP index_loop_encodeBetterBlockAsm12B
emit_remainder_encodeBetterBlockAsm12B:
@@ -8694,8 +8645,8 @@ zero_loop_encodeBetterBlockAsm10B:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -6(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -8705,601 +8656,599 @@ zero_loop_encodeBetterBlockAsm10B:
MOVQ src_base+24(FP), DX
search_loop_encodeBetterBlockAsm10B:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x05, SI
- LEAL 1(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x05, BX
+ LEAL 1(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeBetterBlockAsm10B
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ $0x9e3779b1, SI
- MOVQ DI, R10
- MOVQ DI, R11
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x34, R10
- SHLQ $0x20, R11
- IMULQ SI, R11
- SHRQ $0x36, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 16408(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- MOVL CX, 16408(SP)(R11*4)
- MOVQ (DX)(SI*1), R10
- MOVQ (DX)(R8*1), R11
- CMPQ R10, DI
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ $0x9e3779b1, BX
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x34, R9
+ SHLQ $0x20, R10
+ IMULQ BX, R10
+ SHRQ $0x36, R10
+ MOVL 24(SP)(R9*4), BX
+ MOVL 16408(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ MOVL CX, 16408(SP)(R10*4)
+ MOVQ (DX)(BX*1), R9
+ MOVQ (DX)(DI*1), R10
+ CMPQ R9, SI
JEQ candidate_match_encodeBetterBlockAsm10B
- CMPQ R11, DI
+ CMPQ R10, SI
JNE no_short_found_encodeBetterBlockAsm10B
- MOVL R8, SI
+ MOVL DI, BX
JMP candidate_match_encodeBetterBlockAsm10B
no_short_found_encodeBetterBlockAsm10B:
- CMPL R10, DI
+ CMPL R9, SI
JEQ candidate_match_encodeBetterBlockAsm10B
- CMPL R11, DI
+ CMPL R10, SI
JEQ candidateS_match_encodeBetterBlockAsm10B
MOVL 20(SP), CX
JMP search_loop_encodeBetterBlockAsm10B
candidateS_match_encodeBetterBlockAsm10B:
- SHRQ $0x08, DI
- MOVQ DI, R10
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x34, R10
- MOVL 24(SP)(R10*4), SI
+ SHRQ $0x08, SI
+ MOVQ SI, R9
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x34, R9
+ MOVL 24(SP)(R9*4), BX
INCL CX
- MOVL CX, 24(SP)(R10*4)
- CMPL (DX)(SI*1), DI
+ MOVL CX, 24(SP)(R9*4)
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeBetterBlockAsm10B
DECL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeBetterBlockAsm10B:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeBetterBlockAsm10B
match_extend_back_loop_encodeBetterBlockAsm10B:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeBetterBlockAsm10B
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeBetterBlockAsm10B
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeBetterBlockAsm10B
JMP match_extend_back_loop_encodeBetterBlockAsm10B
match_extend_back_end_encodeBetterBlockAsm10B:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeBetterBlockAsm10B
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeBetterBlockAsm10B:
- MOVL CX, DI
+ MOVL CX, SI
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), R10
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), R9
// matchLen
- XORL R12, R12
- CMPL R8, $0x08
+ XORL R11, R11
+ CMPL DI, $0x08
JL matchlen_match4_match_nolit_encodeBetterBlockAsm10B
matchlen_loopback_match_nolit_encodeBetterBlockAsm10B:
- MOVQ (R9)(R12*1), R11
- XORQ (R10)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R8)(R11*1), R10
+ XORQ (R9)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_match_nolit_encodeBetterBlockAsm10B
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP match_nolit_end_encodeBetterBlockAsm10B
matchlen_loop_match_nolit_encodeBetterBlockAsm10B:
- LEAL -8(R8), R8
- LEAL 8(R12), R12
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R11), R11
+ CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm10B
JZ match_nolit_end_encodeBetterBlockAsm10B
matchlen_match4_match_nolit_encodeBetterBlockAsm10B:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_match_nolit_encodeBetterBlockAsm10B
- MOVL (R9)(R12*1), R11
- CMPL (R10)(R12*1), R11
+ MOVL (R8)(R11*1), R10
+ CMPL (R9)(R11*1), R10
JNE matchlen_match2_match_nolit_encodeBetterBlockAsm10B
- SUBL $0x04, R8
- LEAL 4(R12), R12
+ SUBL $0x04, DI
+ LEAL 4(R11), R11
matchlen_match2_match_nolit_encodeBetterBlockAsm10B:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_match_nolit_encodeBetterBlockAsm10B
- MOVW (R9)(R12*1), R11
- CMPW (R10)(R12*1), R11
+ MOVW (R8)(R11*1), R10
+ CMPW (R9)(R11*1), R10
JNE matchlen_match1_match_nolit_encodeBetterBlockAsm10B
- SUBL $0x02, R8
- LEAL 2(R12), R12
+ SUBL $0x02, DI
+ LEAL 2(R11), R11
matchlen_match1_match_nolit_encodeBetterBlockAsm10B:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL match_nolit_end_encodeBetterBlockAsm10B
- MOVB (R9)(R12*1), R11
- CMPB (R10)(R12*1), R11
+ MOVB (R8)(R11*1), R10
+ CMPB (R9)(R11*1), R10
JNE match_nolit_end_encodeBetterBlockAsm10B
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
match_nolit_end_encodeBetterBlockAsm10B:
- MOVL CX, R8
- SUBL SI, R8
+ MOVL CX, DI
+ SUBL BX, DI
// Check if repeat
- CMPL 16(SP), R8
+ CMPL 16(SP), DI
JEQ match_is_repeat_encodeBetterBlockAsm10B
- MOVL R8, 16(SP)
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL DI, 16(SP)
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_encodeBetterBlockAsm10B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_encodeBetterBlockAsm10B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeBetterBlockAsm10B
two_bytes_match_emit_encodeBetterBlockAsm10B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_encodeBetterBlockAsm10B
JMP memmove_long_match_emit_encodeBetterBlockAsm10B
one_byte_match_emit_encodeBetterBlockAsm10B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeBetterBlockAsm10B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x04
+ CMPQ R8, $0x04
JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64
emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4:
- MOVL (R10), R11
- MOVL R11, (AX)
+ MOVL (R9), R10
+ MOVL R10, (AX)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7:
- MOVL (R10), R11
- MOVL -4(R10)(R9*1), R10
- MOVL R11, (AX)
- MOVL R10, -4(AX)(R9*1)
+ MOVL (R9), R10
+ MOVL -4(R9)(R8*1), R9
+ MOVL R10, (AX)
+ MOVL R9, -4(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeBetterBlockAsm10B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B
memmove_long_match_emit_encodeBetterBlockAsm10B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_encodeBetterBlockAsm10B:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitCopy
-two_byte_offset_match_nolit_encodeBetterBlockAsm10B:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JAE long_offset_short_match_nolit_encodeBetterBlockAsm10B
- MOVL $0x00000001, SI
- LEAL 16(SI), SI
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, SI
- MOVB SI, (AX)
+ MOVL $0x00000001, BX
+ LEAL 16(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
- SUBL $0x08, R12
+ SUBL $0x08, R11
// emitRepeat
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
long_offset_short_match_nolit_encodeBetterBlockAsm10B:
MOVB $0xee, (AX)
- MOVW R8, 1(AX)
- LEAL -60(R12), R12
+ MOVW DI, 1(AX)
+ LEAL -60(R11), R11
ADDQ $0x03, AX
// emitRepeat
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
- JMP two_byte_offset_match_nolit_encodeBetterBlockAsm10B
two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B:
- CMPL R12, $0x0c
+ MOVL R11, BX
+ SHLL $0x02, BX
+ CMPL R11, $0x0c
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B
- MOVB $0x01, BL
- LEAL -16(BX)(R12*4), R12
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ LEAL -15(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
emit_copy_three_match_nolit_encodeBetterBlockAsm10B:
- MOVB $0x02, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVW R8, 1(AX)
+ LEAL -2(BX), BX
+ MOVB BL, (AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
match_is_repeat_encodeBetterBlockAsm10B:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_repeat_encodeBetterBlockAsm10B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm10B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B
two_bytes_match_emit_repeat_encodeBetterBlockAsm10B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_repeat_encodeBetterBlockAsm10B
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B
one_byte_match_emit_repeat_encodeBetterBlockAsm10B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_repeat_encodeBetterBlockAsm10B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x04
+ CMPQ R8, $0x04
JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4:
- MOVL (R10), R11
- MOVL R11, (AX)
+ MOVL (R9), R10
+ MOVL R10, (AX)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7:
- MOVL (R10), R11
- MOVL -4(R10)(R9*1), R10
- MOVL R11, (AX)
- MOVL R10, -4(AX)(R9*1)
+ MOVL (R9), R10
+ MOVL -4(R9)(R8*1), R9
+ MOVL R10, (AX)
+ MOVL R9, -4(AX)(R8*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
memmove_long_match_emit_repeat_encodeBetterBlockAsm10B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitRepeat
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
match_nolit_emitcopy_end_encodeBetterBlockAsm10B:
@@ -9311,50 +9260,50 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm10B:
RET
match_nolit_dst_ok_encodeBetterBlockAsm10B:
- MOVQ $0x0000cf1bbcdcbf9b, SI
- MOVQ $0x9e3779b1, R8
- LEAQ 1(DI), DI
- LEAQ -2(CX), R9
- MOVQ (DX)(DI*1), R10
- MOVQ 1(DX)(DI*1), R11
- MOVQ (DX)(R9*1), R12
- MOVQ 1(DX)(R9*1), R13
- SHLQ $0x10, R10
- IMULQ SI, R10
- SHRQ $0x34, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x36, R11
- SHLQ $0x10, R12
- IMULQ SI, R12
- SHRQ $0x34, R12
- SHLQ $0x20, R13
- IMULQ R8, R13
- SHRQ $0x36, R13
- LEAQ 1(DI), R8
- LEAQ 1(R9), R14
- MOVL DI, 24(SP)(R10*4)
- MOVL R9, 24(SP)(R12*4)
- MOVL R8, 16408(SP)(R11*4)
- MOVL R14, 16408(SP)(R13*4)
- ADDQ $0x01, DI
- SUBQ $0x01, R9
+ MOVQ $0x0000cf1bbcdcbf9b, BX
+ MOVQ $0x9e3779b1, DI
+ LEAQ 1(SI), SI
+ LEAQ -2(CX), R8
+ MOVQ (DX)(SI*1), R9
+ MOVQ 1(DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ MOVQ 1(DX)(R8*1), R12
+ SHLQ $0x10, R9
+ IMULQ BX, R9
+ SHRQ $0x34, R9
+ SHLQ $0x20, R10
+ IMULQ DI, R10
+ SHRQ $0x36, R10
+ SHLQ $0x10, R11
+ IMULQ BX, R11
+ SHRQ $0x34, R11
+ SHLQ $0x20, R12
+ IMULQ DI, R12
+ SHRQ $0x36, R12
+ LEAQ 1(SI), DI
+ LEAQ 1(R8), R13
+ MOVL SI, 24(SP)(R9*4)
+ MOVL R8, 24(SP)(R11*4)
+ MOVL DI, 16408(SP)(R10*4)
+ MOVL R13, 16408(SP)(R12*4)
+ ADDQ $0x01, SI
+ SUBQ $0x01, R8
index_loop_encodeBetterBlockAsm10B:
- CMPQ DI, R9
+ CMPQ SI, R8
JAE search_loop_encodeBetterBlockAsm10B
- MOVQ (DX)(DI*1), R8
- MOVQ (DX)(R9*1), R10
- SHLQ $0x10, R8
- IMULQ SI, R8
- SHRQ $0x34, R8
- SHLQ $0x10, R10
- IMULQ SI, R10
- SHRQ $0x34, R10
- MOVL DI, 24(SP)(R8*4)
- MOVL R9, 24(SP)(R10*4)
- ADDQ $0x02, DI
- SUBQ $0x02, R9
+ MOVQ (DX)(SI*1), DI
+ MOVQ (DX)(R8*1), R9
+ SHLQ $0x10, DI
+ IMULQ BX, DI
+ SHRQ $0x34, DI
+ SHLQ $0x10, R9
+ IMULQ BX, R9
+ SHRQ $0x34, R9
+ MOVL SI, 24(SP)(DI*4)
+ MOVL R8, 24(SP)(R9*4)
+ ADDQ $0x02, SI
+ SUBQ $0x02, R8
JMP index_loop_encodeBetterBlockAsm10B
emit_remainder_encodeBetterBlockAsm10B:
@@ -9537,8 +9486,8 @@ zero_loop_encodeBetterBlockAsm8B:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -6(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -9548,587 +9497,585 @@ zero_loop_encodeBetterBlockAsm8B:
MOVQ src_base+24(FP), DX
search_loop_encodeBetterBlockAsm8B:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x04, SI
- LEAL 1(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x04, BX
+ LEAL 1(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeBetterBlockAsm8B
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ $0x9e3779b1, SI
- MOVQ DI, R10
- MOVQ DI, R11
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x36, R10
- SHLQ $0x20, R11
- IMULQ SI, R11
- SHRQ $0x38, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 4120(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- MOVL CX, 4120(SP)(R11*4)
- MOVQ (DX)(SI*1), R10
- MOVQ (DX)(R8*1), R11
- CMPQ R10, DI
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ $0x9e3779b1, BX
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x36, R9
+ SHLQ $0x20, R10
+ IMULQ BX, R10
+ SHRQ $0x38, R10
+ MOVL 24(SP)(R9*4), BX
+ MOVL 4120(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ MOVL CX, 4120(SP)(R10*4)
+ MOVQ (DX)(BX*1), R9
+ MOVQ (DX)(DI*1), R10
+ CMPQ R9, SI
JEQ candidate_match_encodeBetterBlockAsm8B
- CMPQ R11, DI
+ CMPQ R10, SI
JNE no_short_found_encodeBetterBlockAsm8B
- MOVL R8, SI
+ MOVL DI, BX
JMP candidate_match_encodeBetterBlockAsm8B
no_short_found_encodeBetterBlockAsm8B:
- CMPL R10, DI
+ CMPL R9, SI
JEQ candidate_match_encodeBetterBlockAsm8B
- CMPL R11, DI
+ CMPL R10, SI
JEQ candidateS_match_encodeBetterBlockAsm8B
MOVL 20(SP), CX
JMP search_loop_encodeBetterBlockAsm8B
candidateS_match_encodeBetterBlockAsm8B:
- SHRQ $0x08, DI
- MOVQ DI, R10
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x36, R10
- MOVL 24(SP)(R10*4), SI
+ SHRQ $0x08, SI
+ MOVQ SI, R9
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x36, R9
+ MOVL 24(SP)(R9*4), BX
INCL CX
- MOVL CX, 24(SP)(R10*4)
- CMPL (DX)(SI*1), DI
+ MOVL CX, 24(SP)(R9*4)
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeBetterBlockAsm8B
DECL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeBetterBlockAsm8B:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeBetterBlockAsm8B
match_extend_back_loop_encodeBetterBlockAsm8B:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeBetterBlockAsm8B
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeBetterBlockAsm8B
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeBetterBlockAsm8B
JMP match_extend_back_loop_encodeBetterBlockAsm8B
match_extend_back_end_encodeBetterBlockAsm8B:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeBetterBlockAsm8B
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeBetterBlockAsm8B:
- MOVL CX, DI
+ MOVL CX, SI
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), R10
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), R9
// matchLen
- XORL R12, R12
- CMPL R8, $0x08
+ XORL R11, R11
+ CMPL DI, $0x08
JL matchlen_match4_match_nolit_encodeBetterBlockAsm8B
matchlen_loopback_match_nolit_encodeBetterBlockAsm8B:
- MOVQ (R9)(R12*1), R11
- XORQ (R10)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R8)(R11*1), R10
+ XORQ (R9)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_match_nolit_encodeBetterBlockAsm8B
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP match_nolit_end_encodeBetterBlockAsm8B
matchlen_loop_match_nolit_encodeBetterBlockAsm8B:
- LEAL -8(R8), R8
- LEAL 8(R12), R12
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R11), R11
+ CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm8B
JZ match_nolit_end_encodeBetterBlockAsm8B
matchlen_match4_match_nolit_encodeBetterBlockAsm8B:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_match_nolit_encodeBetterBlockAsm8B
- MOVL (R9)(R12*1), R11
- CMPL (R10)(R12*1), R11
+ MOVL (R8)(R11*1), R10
+ CMPL (R9)(R11*1), R10
JNE matchlen_match2_match_nolit_encodeBetterBlockAsm8B
- SUBL $0x04, R8
- LEAL 4(R12), R12
+ SUBL $0x04, DI
+ LEAL 4(R11), R11
matchlen_match2_match_nolit_encodeBetterBlockAsm8B:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_match_nolit_encodeBetterBlockAsm8B
- MOVW (R9)(R12*1), R11
- CMPW (R10)(R12*1), R11
+ MOVW (R8)(R11*1), R10
+ CMPW (R9)(R11*1), R10
JNE matchlen_match1_match_nolit_encodeBetterBlockAsm8B
- SUBL $0x02, R8
- LEAL 2(R12), R12
+ SUBL $0x02, DI
+ LEAL 2(R11), R11
matchlen_match1_match_nolit_encodeBetterBlockAsm8B:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL match_nolit_end_encodeBetterBlockAsm8B
- MOVB (R9)(R12*1), R11
- CMPB (R10)(R12*1), R11
+ MOVB (R8)(R11*1), R10
+ CMPB (R9)(R11*1), R10
JNE match_nolit_end_encodeBetterBlockAsm8B
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
match_nolit_end_encodeBetterBlockAsm8B:
- MOVL CX, R8
- SUBL SI, R8
+ MOVL CX, DI
+ SUBL BX, DI
// Check if repeat
- CMPL 16(SP), R8
+ CMPL 16(SP), DI
JEQ match_is_repeat_encodeBetterBlockAsm8B
- MOVL R8, 16(SP)
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL DI, 16(SP)
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_encodeBetterBlockAsm8B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_encodeBetterBlockAsm8B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeBetterBlockAsm8B
two_bytes_match_emit_encodeBetterBlockAsm8B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_encodeBetterBlockAsm8B
JMP memmove_long_match_emit_encodeBetterBlockAsm8B
one_byte_match_emit_encodeBetterBlockAsm8B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeBetterBlockAsm8B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x04
+ CMPQ R8, $0x04
JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64
emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4:
- MOVL (R10), R11
- MOVL R11, (AX)
+ MOVL (R9), R10
+ MOVL R10, (AX)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7:
- MOVL (R10), R11
- MOVL -4(R10)(R9*1), R10
- MOVL R11, (AX)
- MOVL R10, -4(AX)(R9*1)
+ MOVL (R9), R10
+ MOVL -4(R9)(R8*1), R9
+ MOVL R10, (AX)
+ MOVL R9, -4(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeBetterBlockAsm8B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B
memmove_long_match_emit_encodeBetterBlockAsm8B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_encodeBetterBlockAsm8B:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitCopy
-two_byte_offset_match_nolit_encodeBetterBlockAsm8B:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JAE long_offset_short_match_nolit_encodeBetterBlockAsm8B
- MOVL $0x00000001, SI
- LEAL 16(SI), SI
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, SI
- MOVB SI, (AX)
+ MOVL $0x00000001, BX
+ LEAL 16(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
- SUBL $0x08, R12
+ SUBL $0x08, R11
// emitRepeat
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
long_offset_short_match_nolit_encodeBetterBlockAsm8B:
MOVB $0xee, (AX)
- MOVW R8, 1(AX)
- LEAL -60(R12), R12
+ MOVW DI, 1(AX)
+ LEAL -60(R11), R11
ADDQ $0x03, AX
// emitRepeat
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
- JMP two_byte_offset_match_nolit_encodeBetterBlockAsm8B
two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B:
- CMPL R12, $0x0c
+ MOVL R11, BX
+ SHLL $0x02, BX
+ CMPL R11, $0x0c
JGE emit_copy_three_match_nolit_encodeBetterBlockAsm8B
- MOVB $0x01, BL
- LEAL -16(BX)(R12*4), R12
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ LEAL -15(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
emit_copy_three_match_nolit_encodeBetterBlockAsm8B:
- MOVB $0x02, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVW R8, 1(AX)
+ LEAL -2(BX), BX
+ MOVB BL, (AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
match_is_repeat_encodeBetterBlockAsm8B:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
- MOVL DI, R8
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R9
- SUBL SI, R8
- LEAL -1(R8), SI
- CMPL SI, $0x3c
+ MOVL SI, DI
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R8
+ SUBL BX, DI
+ LEAL -1(DI), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_repeat_encodeBetterBlockAsm8B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm8B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B
two_bytes_match_emit_repeat_encodeBetterBlockAsm8B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_repeat_encodeBetterBlockAsm8B
JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B
one_byte_match_emit_repeat_encodeBetterBlockAsm8B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_repeat_encodeBetterBlockAsm8B:
- LEAQ (AX)(R8*1), SI
+ LEAQ (AX)(DI*1), BX
// genMemMoveShort
- CMPQ R8, $0x04
+ CMPQ DI, $0x04
JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4
- CMPQ R8, $0x08
+ CMPQ DI, $0x08
JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7
- CMPQ R8, $0x10
+ CMPQ DI, $0x10
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16
- CMPQ R8, $0x20
+ CMPQ DI, $0x20
JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4:
- MOVL (R9), R10
- MOVL R10, (AX)
+ MOVL (R8), R9
+ MOVL R9, (AX)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7:
- MOVL (R9), R10
- MOVL -4(R9)(R8*1), R9
- MOVL R10, (AX)
- MOVL R9, -4(AX)(R8*1)
+ MOVL (R8), R9
+ MOVL -4(R8)(DI*1), R8
+ MOVL R9, (AX)
+ MOVL R8, -4(AX)(DI*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16:
- MOVQ (R9), R10
- MOVQ -8(R9)(R8*1), R9
- MOVQ R10, (AX)
- MOVQ R9, -8(AX)(R8*1)
+ MOVQ (R8), R9
+ MOVQ -8(R8)(DI*1), R8
+ MOVQ R9, (AX)
+ MOVQ R8, -8(AX)(DI*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32:
- MOVOU (R9), X0
- MOVOU -16(R9)(R8*1), X1
+ MOVOU (R8), X0
+ MOVOU -16(R8)(DI*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R8*1)
+ MOVOU X1, -16(AX)(DI*1)
JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64:
- MOVOU (R9), X0
- MOVOU 16(R9), X1
- MOVOU -32(R9)(R8*1), X2
- MOVOU -16(R9)(R8*1), X3
+ MOVOU (R8), X0
+ MOVOU 16(R8), X1
+ MOVOU -32(R8)(DI*1), X2
+ MOVOU -16(R8)(DI*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R8*1)
- MOVOU X3, -16(AX)(R8*1)
+ MOVOU X2, -32(AX)(DI*1)
+ MOVOU X3, -16(AX)(DI*1)
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
memmove_long_match_emit_repeat_encodeBetterBlockAsm8B:
- LEAQ (AX)(R8*1), SI
+ LEAQ (AX)(DI*1), BX
// genMemMoveLong
- MOVOU (R9), X0
- MOVOU 16(R9), X1
- MOVOU -32(R9)(R8*1), X2
- MOVOU -16(R9)(R8*1), X3
- MOVQ R8, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R13
- SUBQ R10, R13
- DECQ R11
+ MOVOU (R8), X0
+ MOVOU 16(R8), X1
+ MOVOU -32(R8)(DI*1), X2
+ MOVOU -16(R8)(DI*1), X3
+ MOVQ DI, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R12
+ SUBQ R9, R12
+ DECQ R10
JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
- LEAQ -32(R9)(R13*1), R10
- LEAQ -32(AX)(R13*1), R14
+ LEAQ -32(R8)(R12*1), R9
+ LEAQ -32(AX)(R12*1), R13
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R14)
- MOVOA X5, 16(R14)
- ADDQ $0x20, R14
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R13)
+ MOVOA X5, 16(R13)
ADDQ $0x20, R13
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R12
+ DECQ R10
JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
- MOVOU -32(R9)(R13*1), X4
- MOVOU -16(R9)(R13*1), X5
- MOVOA X4, -32(AX)(R13*1)
- MOVOA X5, -16(AX)(R13*1)
- ADDQ $0x20, R13
- CMPQ R8, R13
+ MOVOU -32(R8)(R12*1), X4
+ MOVOU -16(R8)(R12*1), X5
+ MOVOA X4, -32(AX)(R12*1)
+ MOVOA X5, -16(AX)(R12*1)
+ ADDQ $0x20, R12
+ CMPQ DI, R12
JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R8*1)
- MOVOU X3, -16(AX)(R8*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(DI*1)
+ MOVOU X3, -16(AX)(DI*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitRepeat
- MOVL R12, SI
- LEAL -4(R12), R12
- CMPL SI, $0x08
+ MOVL R11, BX
+ LEAL -4(R11), R11
+ CMPL BX, $0x08
JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B
- CMPL SI, $0x0c
+ CMPL BX, $0x0c
JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B
cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B:
- CMPL R12, $0x00000104
+ CMPL R11, $0x00000104
JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B
- LEAL -256(R12), R12
+ LEAL -256(R11), R11
MOVW $0x0019, (AX)
- MOVW R12, 2(AX)
+ MOVW R11, 2(AX)
ADDQ $0x04, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B:
- LEAL -4(R12), R12
+ LEAL -4(R11), R11
MOVW $0x0015, (AX)
- MOVB R12, 2(AX)
+ MOVB R11, 2(AX)
ADDQ $0x03, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B:
- SHLL $0x02, R12
- ORL $0x01, R12
- MOVW R12, (AX)
+ SHLL $0x02, R11
+ ORL $0x01, R11
+ MOVW R11, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
- XORQ SI, SI
- LEAL 1(SI)(R12*4), R12
- MOVB R8, 1(AX)
- SARL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ XORQ BX, BX
+ LEAL 1(BX)(R11*4), R11
+ MOVB DI, 1(AX)
+ SARL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, R11
+ MOVB R11, (AX)
ADDQ $0x02, AX
match_nolit_emitcopy_end_encodeBetterBlockAsm8B:
@@ -10140,50 +10087,50 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm8B:
RET
match_nolit_dst_ok_encodeBetterBlockAsm8B:
- MOVQ $0x0000cf1bbcdcbf9b, SI
- MOVQ $0x9e3779b1, R8
- LEAQ 1(DI), DI
- LEAQ -2(CX), R9
- MOVQ (DX)(DI*1), R10
- MOVQ 1(DX)(DI*1), R11
- MOVQ (DX)(R9*1), R12
- MOVQ 1(DX)(R9*1), R13
- SHLQ $0x10, R10
- IMULQ SI, R10
- SHRQ $0x36, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x38, R11
- SHLQ $0x10, R12
- IMULQ SI, R12
- SHRQ $0x36, R12
- SHLQ $0x20, R13
- IMULQ R8, R13
- SHRQ $0x38, R13
- LEAQ 1(DI), R8
- LEAQ 1(R9), R14
- MOVL DI, 24(SP)(R10*4)
- MOVL R9, 24(SP)(R12*4)
- MOVL R8, 4120(SP)(R11*4)
- MOVL R14, 4120(SP)(R13*4)
- ADDQ $0x01, DI
- SUBQ $0x01, R9
+ MOVQ $0x0000cf1bbcdcbf9b, BX
+ MOVQ $0x9e3779b1, DI
+ LEAQ 1(SI), SI
+ LEAQ -2(CX), R8
+ MOVQ (DX)(SI*1), R9
+ MOVQ 1(DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ MOVQ 1(DX)(R8*1), R12
+ SHLQ $0x10, R9
+ IMULQ BX, R9
+ SHRQ $0x36, R9
+ SHLQ $0x20, R10
+ IMULQ DI, R10
+ SHRQ $0x38, R10
+ SHLQ $0x10, R11
+ IMULQ BX, R11
+ SHRQ $0x36, R11
+ SHLQ $0x20, R12
+ IMULQ DI, R12
+ SHRQ $0x38, R12
+ LEAQ 1(SI), DI
+ LEAQ 1(R8), R13
+ MOVL SI, 24(SP)(R9*4)
+ MOVL R8, 24(SP)(R11*4)
+ MOVL DI, 4120(SP)(R10*4)
+ MOVL R13, 4120(SP)(R12*4)
+ ADDQ $0x01, SI
+ SUBQ $0x01, R8
index_loop_encodeBetterBlockAsm8B:
- CMPQ DI, R9
+ CMPQ SI, R8
JAE search_loop_encodeBetterBlockAsm8B
- MOVQ (DX)(DI*1), R8
- MOVQ (DX)(R9*1), R10
- SHLQ $0x10, R8
- IMULQ SI, R8
- SHRQ $0x36, R8
- SHLQ $0x10, R10
- IMULQ SI, R10
- SHRQ $0x36, R10
- MOVL DI, 24(SP)(R8*4)
- MOVL R9, 24(SP)(R10*4)
- ADDQ $0x02, DI
- SUBQ $0x02, R9
+ MOVQ (DX)(SI*1), DI
+ MOVQ (DX)(R8*1), R9
+ SHLQ $0x10, DI
+ IMULQ BX, DI
+ SHRQ $0x36, DI
+ SHLQ $0x10, R9
+ IMULQ BX, R9
+ SHRQ $0x36, R9
+ MOVL SI, 24(SP)(DI*4)
+ MOVL R8, 24(SP)(R9*4)
+ ADDQ $0x02, SI
+ SUBQ $0x02, R8
JMP index_loop_encodeBetterBlockAsm8B
emit_remainder_encodeBetterBlockAsm8B:
@@ -10366,8 +10313,8 @@ zero_loop_encodeSnappyBlockAsm:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -10377,321 +10324,321 @@ zero_loop_encodeSnappyBlockAsm:
MOVQ src_base+24(FP), DX
search_loop_encodeSnappyBlockAsm:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x06, SI
- LEAL 4(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x06, BX
+ LEAL 4(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeSnappyBlockAsm
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ DI, R10
- MOVQ DI, R11
- SHRQ $0x08, R11
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x32, R10
- SHLQ $0x10, R11
- IMULQ R9, R11
- SHRQ $0x32, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 24(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- LEAL 1(CX), R10
- MOVL R10, 24(SP)(R11*4)
- MOVQ DI, R10
- SHRQ $0x10, R10
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHRQ $0x08, R10
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x32, R9
SHLQ $0x10, R10
- IMULQ R9, R10
+ IMULQ R8, R10
SHRQ $0x32, R10
- MOVL CX, R9
- SUBL 16(SP), R9
- MOVL 1(DX)(R9*1), R11
- MOVQ DI, R9
- SHRQ $0x08, R9
- CMPL R9, R11
- JNE no_repeat_found_encodeSnappyBlockAsm
- LEAL 1(CX), DI
- MOVL 12(SP), SI
- MOVL DI, R8
+ MOVL 24(SP)(R9*4), BX
+ MOVL 24(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ LEAL 1(CX), R9
+ MOVL R9, 24(SP)(R10*4)
+ MOVQ SI, R9
+ SHRQ $0x10, R9
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x32, R9
+ MOVL CX, R8
SUBL 16(SP), R8
+ MOVL 1(DX)(R8*1), R10
+ MOVQ SI, R8
+ SHRQ $0x08, R8
+ CMPL R8, R10
+ JNE no_repeat_found_encodeSnappyBlockAsm
+ LEAL 1(CX), SI
+ MOVL 12(SP), BX
+ MOVL SI, DI
+ SUBL 16(SP), DI
JZ repeat_extend_back_end_encodeSnappyBlockAsm
repeat_extend_back_loop_encodeSnappyBlockAsm:
- CMPL DI, SI
+ CMPL SI, BX
JLE repeat_extend_back_end_encodeSnappyBlockAsm
- MOVB -1(DX)(R8*1), BL
- MOVB -1(DX)(DI*1), R9
- CMPB BL, R9
+ MOVB -1(DX)(DI*1), R8
+ MOVB -1(DX)(SI*1), R9
+ CMPB R8, R9
JNE repeat_extend_back_end_encodeSnappyBlockAsm
- LEAL -1(DI), DI
- DECL R8
+ LEAL -1(SI), SI
+ DECL DI
JNZ repeat_extend_back_loop_encodeSnappyBlockAsm
repeat_extend_back_end_encodeSnappyBlockAsm:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm
- MOVL DI, R8
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R9
- SUBL SI, R8
- LEAL -1(R8), SI
- CMPL SI, $0x3c
+ MOVL SI, DI
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R8
+ SUBL BX, DI
+ LEAL -1(DI), BX
+ CMPL BX, $0x3c
JLT one_byte_repeat_emit_encodeSnappyBlockAsm
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_repeat_emit_encodeSnappyBlockAsm
- CMPL SI, $0x00010000
+ CMPL BX, $0x00010000
JLT three_bytes_repeat_emit_encodeSnappyBlockAsm
- CMPL SI, $0x01000000
+ CMPL BX, $0x01000000
JLT four_bytes_repeat_emit_encodeSnappyBlockAsm
MOVB $0xfc, (AX)
- MOVL SI, 1(AX)
+ MOVL BX, 1(AX)
ADDQ $0x05, AX
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
four_bytes_repeat_emit_encodeSnappyBlockAsm:
- MOVL SI, R10
- SHRL $0x10, R10
+ MOVL BX, R9
+ SHRL $0x10, R9
MOVB $0xf8, (AX)
- MOVW SI, 1(AX)
- MOVB R10, 3(AX)
+ MOVW BX, 1(AX)
+ MOVB R9, 3(AX)
ADDQ $0x04, AX
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
three_bytes_repeat_emit_encodeSnappyBlockAsm:
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
two_bytes_repeat_emit_encodeSnappyBlockAsm:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_repeat_emit_encodeSnappyBlockAsm
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
one_byte_repeat_emit_encodeSnappyBlockAsm:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_repeat_emit_encodeSnappyBlockAsm:
- LEAQ (AX)(R8*1), SI
+ LEAQ (AX)(DI*1), BX
// genMemMoveShort
- CMPQ R8, $0x08
+ CMPQ DI, $0x08
JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8
- CMPQ R8, $0x10
+ CMPQ DI, $0x10
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16
- CMPQ R8, $0x20
+ CMPQ DI, $0x20
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32
JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8:
- MOVQ (R9), R10
- MOVQ R10, (AX)
+ MOVQ (R8), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16:
- MOVQ (R9), R10
- MOVQ -8(R9)(R8*1), R9
- MOVQ R10, (AX)
- MOVQ R9, -8(AX)(R8*1)
+ MOVQ (R8), R9
+ MOVQ -8(R8)(DI*1), R8
+ MOVQ R9, (AX)
+ MOVQ R8, -8(AX)(DI*1)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32:
- MOVOU (R9), X0
- MOVOU -16(R9)(R8*1), X1
+ MOVOU (R8), X0
+ MOVOU -16(R8)(DI*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R8*1)
+ MOVOU X1, -16(AX)(DI*1)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64:
- MOVOU (R9), X0
- MOVOU 16(R9), X1
- MOVOU -32(R9)(R8*1), X2
- MOVOU -16(R9)(R8*1), X3
+ MOVOU (R8), X0
+ MOVOU 16(R8), X1
+ MOVOU -32(R8)(DI*1), X2
+ MOVOU -16(R8)(DI*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R8*1)
- MOVOU X3, -16(AX)(R8*1)
+ MOVOU X2, -32(AX)(DI*1)
+ MOVOU X3, -16(AX)(DI*1)
memmove_end_copy_repeat_emit_encodeSnappyBlockAsm:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm
memmove_long_repeat_emit_encodeSnappyBlockAsm:
- LEAQ (AX)(R8*1), SI
+ LEAQ (AX)(DI*1), BX
// genMemMoveLong
- MOVOU (R9), X0
- MOVOU 16(R9), X1
- MOVOU -32(R9)(R8*1), X2
- MOVOU -16(R9)(R8*1), X3
- MOVQ R8, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (R8), X0
+ MOVOU 16(R8), X1
+ MOVOU -32(R8)(DI*1), X2
+ MOVOU -16(R8)(DI*1), X3
+ MOVQ DI, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
- LEAQ -32(R9)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(R8)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
- MOVOU -32(R9)(R12*1), X4
- MOVOU -16(R9)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R8, R12
+ MOVOU -32(R8)(R11*1), X4
+ MOVOU -16(R8)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ DI, R11
JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R8*1)
- MOVOU X3, -16(AX)(R8*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(DI*1)
+ MOVOU X3, -16(AX)(DI*1)
+ MOVQ BX, AX
emit_literal_done_repeat_emit_encodeSnappyBlockAsm:
ADDL $0x05, CX
- MOVL CX, SI
- SUBL 16(SP), SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), SI
+ MOVL CX, BX
+ SUBL 16(SP), BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R11, R11
- CMPL R8, $0x08
+ XORL R10, R10
+ CMPL DI, $0x08
JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm
matchlen_loopback_repeat_extend_encodeSnappyBlockAsm:
- MOVQ (R9)(R11*1), R10
- XORQ (SI)(R11*1), R10
- TESTQ R10, R10
+ MOVQ (R8)(R10*1), R9
+ XORQ (BX)(R10*1), R9
+ TESTQ R9, R9
JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm
#ifdef GOAMD64_v3
- TZCNTQ R10, R10
+ TZCNTQ R9, R9
#else
- BSFQ R10, R10
+ BSFQ R9, R9
#endif
- SARQ $0x03, R10
- LEAL (R11)(R10*1), R11
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
JMP repeat_extend_forward_end_encodeSnappyBlockAsm
matchlen_loop_repeat_extend_encodeSnappyBlockAsm:
- LEAL -8(R8), R8
- LEAL 8(R11), R11
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm
JZ repeat_extend_forward_end_encodeSnappyBlockAsm
matchlen_match4_repeat_extend_encodeSnappyBlockAsm:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm
- MOVL (R9)(R11*1), R10
- CMPL (SI)(R11*1), R10
+ MOVL (R8)(R10*1), R9
+ CMPL (BX)(R10*1), R9
JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm
- SUBL $0x04, R8
- LEAL 4(R11), R11
+ SUBL $0x04, DI
+ LEAL 4(R10), R10
matchlen_match2_repeat_extend_encodeSnappyBlockAsm:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm
- MOVW (R9)(R11*1), R10
- CMPW (SI)(R11*1), R10
+ MOVW (R8)(R10*1), R9
+ CMPW (BX)(R10*1), R9
JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm
- SUBL $0x02, R8
- LEAL 2(R11), R11
+ SUBL $0x02, DI
+ LEAL 2(R10), R10
matchlen_match1_repeat_extend_encodeSnappyBlockAsm:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL repeat_extend_forward_end_encodeSnappyBlockAsm
- MOVB (R9)(R11*1), R10
- CMPB (SI)(R11*1), R10
+ MOVB (R8)(R10*1), R9
+ CMPB (BX)(R10*1), R9
JNE repeat_extend_forward_end_encodeSnappyBlockAsm
- LEAL 1(R11), R11
+ LEAL 1(R10), R10
repeat_extend_forward_end_encodeSnappyBlockAsm:
- ADDL R11, CX
- MOVL CX, SI
- SUBL DI, SI
- MOVL 16(SP), DI
+ ADDL R10, CX
+ MOVL CX, BX
+ SUBL SI, BX
+ MOVL 16(SP), SI
// emitCopy
- CMPL DI, $0x00010000
+ CMPL SI, $0x00010000
JL two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm
four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
MOVB $0xff, (AX)
- MOVL DI, 1(AX)
- LEAL -64(SI), SI
+ MOVL SI, 1(AX)
+ LEAL -64(BX), BX
ADDQ $0x05, AX
- CMPL SI, $0x04
+ CMPL BX, $0x04
JL four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm
four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm:
- TESTL SI, SI
+ TESTL BX, BX
JZ repeat_end_emit_encodeSnappyBlockAsm
- MOVB $0x03, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVL DI, 1(AX)
+ XORL DI, DI
+ LEAL -1(DI)(BX*4), BX
+ MOVB BL, (AX)
+ MOVL SI, 1(AX)
ADDQ $0x05, AX
JMP repeat_end_emit_encodeSnappyBlockAsm
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm
MOVB $0xee, (AX)
- MOVW DI, 1(AX)
- LEAL -60(SI), SI
+ MOVW SI, 1(AX)
+ LEAL -60(BX), BX
ADDQ $0x03, AX
JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm
two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm:
- CMPL SI, $0x0c
+ MOVL BX, DI
+ SHLL $0x02, DI
+ CMPL BX, $0x0c
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
- MOVB $0x01, BL
- LEAL -16(BX)(SI*4), SI
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ LEAL -15(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeSnappyBlockAsm
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm:
- MOVB $0x02, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVW DI, 1(AX)
+ LEAL -2(DI), DI
+ MOVB DI, (AX)
+ MOVW SI, 1(AX)
ADDQ $0x03, AX
repeat_end_emit_encodeSnappyBlockAsm:
@@ -10699,16 +10646,16 @@ repeat_end_emit_encodeSnappyBlockAsm:
JMP search_loop_encodeSnappyBlockAsm
no_repeat_found_encodeSnappyBlockAsm:
- CMPL (DX)(SI*1), DI
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeSnappyBlockAsm
- SHRQ $0x08, DI
- MOVL 24(SP)(R10*4), SI
- LEAL 2(CX), R9
- CMPL (DX)(R8*1), DI
+ SHRQ $0x08, SI
+ MOVL 24(SP)(R9*4), BX
+ LEAL 2(CX), R8
+ CMPL (DX)(DI*1), SI
JEQ candidate2_match_encodeSnappyBlockAsm
- MOVL R9, 24(SP)(R10*4)
- SHRQ $0x08, DI
- CMPL (DX)(SI*1), DI
+ MOVL R8, 24(SP)(R9*4)
+ SHRQ $0x08, SI
+ CMPL (DX)(BX*1), SI
JEQ candidate3_match_encodeSnappyBlockAsm
MOVL 20(SP), CX
JMP search_loop_encodeSnappyBlockAsm
@@ -10718,331 +10665,331 @@ candidate3_match_encodeSnappyBlockAsm:
JMP candidate_match_encodeSnappyBlockAsm
candidate2_match_encodeSnappyBlockAsm:
- MOVL R9, 24(SP)(R10*4)
+ MOVL R8, 24(SP)(R9*4)
INCL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeSnappyBlockAsm:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeSnappyBlockAsm
match_extend_back_loop_encodeSnappyBlockAsm:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeSnappyBlockAsm
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeSnappyBlockAsm
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeSnappyBlockAsm
JMP match_extend_back_loop_encodeSnappyBlockAsm
match_extend_back_end_encodeSnappyBlockAsm:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 5(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 5(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeSnappyBlockAsm
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeSnappyBlockAsm:
- MOVL CX, DI
- MOVL 12(SP), R8
- CMPL R8, DI
+ MOVL CX, SI
+ MOVL 12(SP), DI
+ CMPL DI, SI
JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(R8*1), DI
- SUBL R8, R9
- LEAL -1(R9), R8
- CMPL R8, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(DI*1), SI
+ SUBL DI, R8
+ LEAL -1(R8), DI
+ CMPL DI, $0x3c
JLT one_byte_match_emit_encodeSnappyBlockAsm
- CMPL R8, $0x00000100
+ CMPL DI, $0x00000100
JLT two_bytes_match_emit_encodeSnappyBlockAsm
- CMPL R8, $0x00010000
+ CMPL DI, $0x00010000
JLT three_bytes_match_emit_encodeSnappyBlockAsm
- CMPL R8, $0x01000000
+ CMPL DI, $0x01000000
JLT four_bytes_match_emit_encodeSnappyBlockAsm
MOVB $0xfc, (AX)
- MOVL R8, 1(AX)
+ MOVL DI, 1(AX)
ADDQ $0x05, AX
JMP memmove_long_match_emit_encodeSnappyBlockAsm
four_bytes_match_emit_encodeSnappyBlockAsm:
- MOVL R8, R10
- SHRL $0x10, R10
+ MOVL DI, R9
+ SHRL $0x10, R9
MOVB $0xf8, (AX)
- MOVW R8, 1(AX)
- MOVB R10, 3(AX)
+ MOVW DI, 1(AX)
+ MOVB R9, 3(AX)
ADDQ $0x04, AX
JMP memmove_long_match_emit_encodeSnappyBlockAsm
three_bytes_match_emit_encodeSnappyBlockAsm:
MOVB $0xf4, (AX)
- MOVW R8, 1(AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeSnappyBlockAsm
two_bytes_match_emit_encodeSnappyBlockAsm:
MOVB $0xf0, (AX)
- MOVB R8, 1(AX)
+ MOVB DI, 1(AX)
ADDQ $0x02, AX
- CMPL R8, $0x40
+ CMPL DI, $0x40
JL memmove_match_emit_encodeSnappyBlockAsm
JMP memmove_long_match_emit_encodeSnappyBlockAsm
one_byte_match_emit_encodeSnappyBlockAsm:
- SHLB $0x02, R8
- MOVB R8, (AX)
+ SHLB $0x02, DI
+ MOVB DI, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeSnappyBlockAsm:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64
emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8:
- MOVQ (DI), R10
- MOVQ R10, (AX)
+ MOVQ (SI), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16:
- MOVQ (DI), R10
- MOVQ -8(DI)(R9*1), DI
- MOVQ R10, (AX)
- MOVQ DI, -8(AX)(R9*1)
+ MOVQ (SI), R9
+ MOVQ -8(SI)(R8*1), SI
+ MOVQ R9, (AX)
+ MOVQ SI, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32:
- MOVOU (DI), X0
- MOVOU -16(DI)(R9*1), X1
+ MOVOU (SI), X0
+ MOVOU -16(SI)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64:
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeSnappyBlockAsm:
- MOVQ R8, AX
+ MOVQ DI, AX
JMP emit_literal_done_match_emit_encodeSnappyBlockAsm
memmove_long_match_emit_encodeSnappyBlockAsm:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveLong
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
- MOVQ R9, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
+ MOVQ R8, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
- LEAQ -32(DI)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(SI)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
- MOVOU -32(DI)(R12*1), X4
- MOVOU -16(DI)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R9, R12
+ MOVOU -32(SI)(R11*1), X4
+ MOVOU -16(SI)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ R8, R11
JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ R8, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ DI, AX
emit_literal_done_match_emit_encodeSnappyBlockAsm:
match_nolit_loop_encodeSnappyBlockAsm:
- MOVL CX, DI
- SUBL SI, DI
- MOVL DI, 16(SP)
+ MOVL CX, SI
+ SUBL BX, SI
+ MOVL SI, 16(SP)
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), DI
- SUBL CX, DI
- LEAQ (DX)(CX*1), R8
- LEAQ (DX)(SI*1), SI
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), SI
+ SUBL CX, SI
+ LEAQ (DX)(CX*1), DI
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R10, R10
- CMPL DI, $0x08
+ XORL R9, R9
+ CMPL SI, $0x08
JL matchlen_match4_match_nolit_encodeSnappyBlockAsm
matchlen_loopback_match_nolit_encodeSnappyBlockAsm:
- MOVQ (R8)(R10*1), R9
- XORQ (SI)(R10*1), R9
- TESTQ R9, R9
+ MOVQ (DI)(R9*1), R8
+ XORQ (BX)(R9*1), R8
+ TESTQ R8, R8
JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm
#ifdef GOAMD64_v3
- TZCNTQ R9, R9
+ TZCNTQ R8, R8
#else
- BSFQ R9, R9
+ BSFQ R8, R8
#endif
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
+ SARQ $0x03, R8
+ LEAL (R9)(R8*1), R9
JMP match_nolit_end_encodeSnappyBlockAsm
matchlen_loop_match_nolit_encodeSnappyBlockAsm:
- LEAL -8(DI), DI
- LEAL 8(R10), R10
- CMPL DI, $0x08
+ LEAL -8(SI), SI
+ LEAL 8(R9), R9
+ CMPL SI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm
JZ match_nolit_end_encodeSnappyBlockAsm
matchlen_match4_match_nolit_encodeSnappyBlockAsm:
- CMPL DI, $0x04
+ CMPL SI, $0x04
JL matchlen_match2_match_nolit_encodeSnappyBlockAsm
- MOVL (R8)(R10*1), R9
- CMPL (SI)(R10*1), R9
+ MOVL (DI)(R9*1), R8
+ CMPL (BX)(R9*1), R8
JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm
- SUBL $0x04, DI
- LEAL 4(R10), R10
+ SUBL $0x04, SI
+ LEAL 4(R9), R9
matchlen_match2_match_nolit_encodeSnappyBlockAsm:
- CMPL DI, $0x02
+ CMPL SI, $0x02
JL matchlen_match1_match_nolit_encodeSnappyBlockAsm
- MOVW (R8)(R10*1), R9
- CMPW (SI)(R10*1), R9
+ MOVW (DI)(R9*1), R8
+ CMPW (BX)(R9*1), R8
JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm
- SUBL $0x02, DI
- LEAL 2(R10), R10
+ SUBL $0x02, SI
+ LEAL 2(R9), R9
matchlen_match1_match_nolit_encodeSnappyBlockAsm:
- CMPL DI, $0x01
+ CMPL SI, $0x01
JL match_nolit_end_encodeSnappyBlockAsm
- MOVB (R8)(R10*1), R9
- CMPB (SI)(R10*1), R9
+ MOVB (DI)(R9*1), R8
+ CMPB (BX)(R9*1), R8
JNE match_nolit_end_encodeSnappyBlockAsm
- LEAL 1(R10), R10
+ LEAL 1(R9), R9
match_nolit_end_encodeSnappyBlockAsm:
- ADDL R10, CX
- MOVL 16(SP), SI
- ADDL $0x04, R10
+ ADDL R9, CX
+ MOVL 16(SP), BX
+ ADDL $0x04, R9
MOVL CX, 12(SP)
// emitCopy
- CMPL SI, $0x00010000
+ CMPL BX, $0x00010000
JL two_byte_offset_match_nolit_encodeSnappyBlockAsm
four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE four_bytes_remain_match_nolit_encodeSnappyBlockAsm
MOVB $0xff, (AX)
- MOVL SI, 1(AX)
- LEAL -64(R10), R10
+ MOVL BX, 1(AX)
+ LEAL -64(R9), R9
ADDQ $0x05, AX
- CMPL R10, $0x04
+ CMPL R9, $0x04
JL four_bytes_remain_match_nolit_encodeSnappyBlockAsm
JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm
four_bytes_remain_match_nolit_encodeSnappyBlockAsm:
- TESTL R10, R10
+ TESTL R9, R9
JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm
- MOVB $0x03, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVL SI, 1(AX)
+ XORL SI, SI
+ LEAL -1(SI)(R9*4), R9
+ MOVB R9, (AX)
+ MOVL BX, 1(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm
two_byte_offset_match_nolit_encodeSnappyBlockAsm:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm
MOVB $0xee, (AX)
- MOVW SI, 1(AX)
- LEAL -60(R10), R10
+ MOVW BX, 1(AX)
+ LEAL -60(R9), R9
ADDQ $0x03, AX
JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm
two_byte_offset_short_match_nolit_encodeSnappyBlockAsm:
- CMPL R10, $0x0c
+ MOVL R9, SI
+ SHLL $0x02, SI
+ CMPL R9, $0x0c
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm
- MOVB $0x01, BL
- LEAL -16(BX)(R10*4), R10
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ LEAL -15(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm
emit_copy_three_match_nolit_encodeSnappyBlockAsm:
- MOVB $0x02, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVW SI, 1(AX)
+ LEAL -2(SI), SI
+ MOVB SI, (AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeSnappyBlockAsm:
CMPL CX, 8(SP)
JGE emit_remainder_encodeSnappyBlockAsm
- MOVQ -2(DX)(CX*1), DI
+ MOVQ -2(DX)(CX*1), SI
CMPQ AX, (SP)
JL match_nolit_dst_ok_encodeSnappyBlockAsm
MOVQ $0x00000000, ret+48(FP)
RET
match_nolit_dst_ok_encodeSnappyBlockAsm:
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ DI, R8
- SHRQ $0x10, DI
- MOVQ DI, SI
- SHLQ $0x10, R8
- IMULQ R9, R8
- SHRQ $0x32, R8
- SHLQ $0x10, SI
- IMULQ R9, SI
- SHRQ $0x32, SI
- LEAL -2(CX), R9
- LEAQ 24(SP)(SI*4), R10
- MOVL (R10), SI
- MOVL R9, 24(SP)(R8*4)
- MOVL CX, (R10)
- CMPL (DX)(SI*1), DI
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ SI, DI
+ SHRQ $0x10, SI
+ MOVQ SI, BX
+ SHLQ $0x10, DI
+ IMULQ R8, DI
+ SHRQ $0x32, DI
+ SHLQ $0x10, BX
+ IMULQ R8, BX
+ SHRQ $0x32, BX
+ LEAL -2(CX), R8
+ LEAQ 24(SP)(BX*4), R9
+ MOVL (R9), BX
+ MOVL R8, 24(SP)(DI*4)
+ MOVL CX, (R9)
+ CMPL (DX)(BX*1), SI
JEQ match_nolit_loop_encodeSnappyBlockAsm
INCL CX
JMP search_loop_encodeSnappyBlockAsm
@@ -11246,8 +11193,8 @@ zero_loop_encodeSnappyBlockAsm64K:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -11257,278 +11204,278 @@ zero_loop_encodeSnappyBlockAsm64K:
MOVQ src_base+24(FP), DX
search_loop_encodeSnappyBlockAsm64K:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x06, SI
- LEAL 4(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x06, BX
+ LEAL 4(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeSnappyBlockAsm64K
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ DI, R10
- MOVQ DI, R11
- SHRQ $0x08, R11
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x32, R10
- SHLQ $0x10, R11
- IMULQ R9, R11
- SHRQ $0x32, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 24(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- LEAL 1(CX), R10
- MOVL R10, 24(SP)(R11*4)
- MOVQ DI, R10
- SHRQ $0x10, R10
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHRQ $0x08, R10
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x32, R9
SHLQ $0x10, R10
- IMULQ R9, R10
+ IMULQ R8, R10
SHRQ $0x32, R10
- MOVL CX, R9
- SUBL 16(SP), R9
- MOVL 1(DX)(R9*1), R11
- MOVQ DI, R9
- SHRQ $0x08, R9
- CMPL R9, R11
- JNE no_repeat_found_encodeSnappyBlockAsm64K
- LEAL 1(CX), DI
- MOVL 12(SP), SI
- MOVL DI, R8
+ MOVL 24(SP)(R9*4), BX
+ MOVL 24(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ LEAL 1(CX), R9
+ MOVL R9, 24(SP)(R10*4)
+ MOVQ SI, R9
+ SHRQ $0x10, R9
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x32, R9
+ MOVL CX, R8
SUBL 16(SP), R8
+ MOVL 1(DX)(R8*1), R10
+ MOVQ SI, R8
+ SHRQ $0x08, R8
+ CMPL R8, R10
+ JNE no_repeat_found_encodeSnappyBlockAsm64K
+ LEAL 1(CX), SI
+ MOVL 12(SP), BX
+ MOVL SI, DI
+ SUBL 16(SP), DI
JZ repeat_extend_back_end_encodeSnappyBlockAsm64K
repeat_extend_back_loop_encodeSnappyBlockAsm64K:
- CMPL DI, SI
+ CMPL SI, BX
JLE repeat_extend_back_end_encodeSnappyBlockAsm64K
- MOVB -1(DX)(R8*1), BL
- MOVB -1(DX)(DI*1), R9
- CMPB BL, R9
+ MOVB -1(DX)(DI*1), R8
+ MOVB -1(DX)(SI*1), R9
+ CMPB R8, R9
JNE repeat_extend_back_end_encodeSnappyBlockAsm64K
- LEAL -1(DI), DI
- DECL R8
+ LEAL -1(SI), SI
+ DECL DI
JNZ repeat_extend_back_loop_encodeSnappyBlockAsm64K
repeat_extend_back_end_encodeSnappyBlockAsm64K:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
- MOVL DI, R8
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R9
- SUBL SI, R8
- LEAL -1(R8), SI
- CMPL SI, $0x3c
+ MOVL SI, DI
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R8
+ SUBL BX, DI
+ LEAL -1(DI), BX
+ CMPL BX, $0x3c
JLT one_byte_repeat_emit_encodeSnappyBlockAsm64K
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_repeat_emit_encodeSnappyBlockAsm64K
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K
two_bytes_repeat_emit_encodeSnappyBlockAsm64K:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_repeat_emit_encodeSnappyBlockAsm64K
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K
one_byte_repeat_emit_encodeSnappyBlockAsm64K:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_repeat_emit_encodeSnappyBlockAsm64K:
- LEAQ (AX)(R8*1), SI
+ LEAQ (AX)(DI*1), BX
// genMemMoveShort
- CMPQ R8, $0x08
+ CMPQ DI, $0x08
JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8
- CMPQ R8, $0x10
+ CMPQ DI, $0x10
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
- CMPQ R8, $0x20
+ CMPQ DI, $0x20
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8:
- MOVQ (R9), R10
- MOVQ R10, (AX)
+ MOVQ (R8), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
- MOVQ (R9), R10
- MOVQ -8(R9)(R8*1), R9
- MOVQ R10, (AX)
- MOVQ R9, -8(AX)(R8*1)
+ MOVQ (R8), R9
+ MOVQ -8(R8)(DI*1), R8
+ MOVQ R9, (AX)
+ MOVQ R8, -8(AX)(DI*1)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
- MOVOU (R9), X0
- MOVOU -16(R9)(R8*1), X1
+ MOVOU (R8), X0
+ MOVOU -16(R8)(DI*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R8*1)
+ MOVOU X1, -16(AX)(DI*1)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
- MOVOU (R9), X0
- MOVOU 16(R9), X1
- MOVOU -32(R9)(R8*1), X2
- MOVOU -16(R9)(R8*1), X3
+ MOVOU (R8), X0
+ MOVOU 16(R8), X1
+ MOVOU -32(R8)(DI*1), X2
+ MOVOU -16(R8)(DI*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R8*1)
- MOVOU X3, -16(AX)(R8*1)
+ MOVOU X2, -32(AX)(DI*1)
+ MOVOU X3, -16(AX)(DI*1)
memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
memmove_long_repeat_emit_encodeSnappyBlockAsm64K:
- LEAQ (AX)(R8*1), SI
+ LEAQ (AX)(DI*1), BX
// genMemMoveLong
- MOVOU (R9), X0
- MOVOU 16(R9), X1
- MOVOU -32(R9)(R8*1), X2
- MOVOU -16(R9)(R8*1), X3
- MOVQ R8, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (R8), X0
+ MOVOU 16(R8), X1
+ MOVOU -32(R8)(DI*1), X2
+ MOVOU -16(R8)(DI*1), X3
+ MOVQ DI, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
- LEAQ -32(R9)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(R8)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
- MOVOU -32(R9)(R12*1), X4
- MOVOU -16(R9)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R8, R12
+ MOVOU -32(R8)(R11*1), X4
+ MOVOU -16(R8)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ DI, R11
JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R8*1)
- MOVOU X3, -16(AX)(R8*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(DI*1)
+ MOVOU X3, -16(AX)(DI*1)
+ MOVQ BX, AX
emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K:
ADDL $0x05, CX
- MOVL CX, SI
- SUBL 16(SP), SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), SI
+ MOVL CX, BX
+ SUBL 16(SP), BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R11, R11
- CMPL R8, $0x08
+ XORL R10, R10
+ CMPL DI, $0x08
JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K
matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K:
- MOVQ (R9)(R11*1), R10
- XORQ (SI)(R11*1), R10
- TESTQ R10, R10
+ MOVQ (R8)(R10*1), R9
+ XORQ (BX)(R10*1), R9
+ TESTQ R9, R9
JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K
#ifdef GOAMD64_v3
- TZCNTQ R10, R10
+ TZCNTQ R9, R9
#else
- BSFQ R10, R10
+ BSFQ R9, R9
#endif
- SARQ $0x03, R10
- LEAL (R11)(R10*1), R11
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K
matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K:
- LEAL -8(R8), R8
- LEAL 8(R11), R11
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K
JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K
matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K
- MOVL (R9)(R11*1), R10
- CMPL (SI)(R11*1), R10
+ MOVL (R8)(R10*1), R9
+ CMPL (BX)(R10*1), R9
JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K
- SUBL $0x04, R8
- LEAL 4(R11), R11
+ SUBL $0x04, DI
+ LEAL 4(R10), R10
matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K
- MOVW (R9)(R11*1), R10
- CMPW (SI)(R11*1), R10
+ MOVW (R8)(R10*1), R9
+ CMPW (BX)(R10*1), R9
JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K
- SUBL $0x02, R8
- LEAL 2(R11), R11
+ SUBL $0x02, DI
+ LEAL 2(R10), R10
matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL repeat_extend_forward_end_encodeSnappyBlockAsm64K
- MOVB (R9)(R11*1), R10
- CMPB (SI)(R11*1), R10
+ MOVB (R8)(R10*1), R9
+ CMPB (BX)(R10*1), R9
JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K
- LEAL 1(R11), R11
+ LEAL 1(R10), R10
repeat_extend_forward_end_encodeSnappyBlockAsm64K:
- ADDL R11, CX
- MOVL CX, SI
- SUBL DI, SI
- MOVL 16(SP), DI
+ ADDL R10, CX
+ MOVL CX, BX
+ SUBL SI, BX
+ MOVL 16(SP), SI
// emitCopy
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K
MOVB $0xee, (AX)
- MOVW DI, 1(AX)
- LEAL -60(SI), SI
+ MOVW SI, 1(AX)
+ LEAL -60(BX), BX
ADDQ $0x03, AX
JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K
two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K:
- CMPL SI, $0x0c
+ MOVL BX, DI
+ SHLL $0x02, DI
+ CMPL BX, $0x0c
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
- MOVB $0x01, BL
- LEAL -16(BX)(SI*4), SI
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ LEAL -15(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeSnappyBlockAsm64K
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K:
- MOVB $0x02, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVW DI, 1(AX)
+ LEAL -2(DI), DI
+ MOVB DI, (AX)
+ MOVW SI, 1(AX)
ADDQ $0x03, AX
repeat_end_emit_encodeSnappyBlockAsm64K:
@@ -11536,16 +11483,16 @@ repeat_end_emit_encodeSnappyBlockAsm64K:
JMP search_loop_encodeSnappyBlockAsm64K
no_repeat_found_encodeSnappyBlockAsm64K:
- CMPL (DX)(SI*1), DI
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeSnappyBlockAsm64K
- SHRQ $0x08, DI
- MOVL 24(SP)(R10*4), SI
- LEAL 2(CX), R9
- CMPL (DX)(R8*1), DI
+ SHRQ $0x08, SI
+ MOVL 24(SP)(R9*4), BX
+ LEAL 2(CX), R8
+ CMPL (DX)(DI*1), SI
JEQ candidate2_match_encodeSnappyBlockAsm64K
- MOVL R9, 24(SP)(R10*4)
- SHRQ $0x08, DI
- CMPL (DX)(SI*1), DI
+ MOVL R8, 24(SP)(R9*4)
+ SHRQ $0x08, SI
+ CMPL (DX)(BX*1), SI
JEQ candidate3_match_encodeSnappyBlockAsm64K
MOVL 20(SP), CX
JMP search_loop_encodeSnappyBlockAsm64K
@@ -11555,288 +11502,288 @@ candidate3_match_encodeSnappyBlockAsm64K:
JMP candidate_match_encodeSnappyBlockAsm64K
candidate2_match_encodeSnappyBlockAsm64K:
- MOVL R9, 24(SP)(R10*4)
+ MOVL R8, 24(SP)(R9*4)
INCL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeSnappyBlockAsm64K:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeSnappyBlockAsm64K
match_extend_back_loop_encodeSnappyBlockAsm64K:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeSnappyBlockAsm64K
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeSnappyBlockAsm64K
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeSnappyBlockAsm64K
JMP match_extend_back_loop_encodeSnappyBlockAsm64K
match_extend_back_end_encodeSnappyBlockAsm64K:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeSnappyBlockAsm64K
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeSnappyBlockAsm64K:
- MOVL CX, DI
- MOVL 12(SP), R8
- CMPL R8, DI
+ MOVL CX, SI
+ MOVL 12(SP), DI
+ CMPL DI, SI
JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm64K
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(R8*1), DI
- SUBL R8, R9
- LEAL -1(R9), R8
- CMPL R8, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(DI*1), SI
+ SUBL DI, R8
+ LEAL -1(R8), DI
+ CMPL DI, $0x3c
JLT one_byte_match_emit_encodeSnappyBlockAsm64K
- CMPL R8, $0x00000100
+ CMPL DI, $0x00000100
JLT two_bytes_match_emit_encodeSnappyBlockAsm64K
MOVB $0xf4, (AX)
- MOVW R8, 1(AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeSnappyBlockAsm64K
two_bytes_match_emit_encodeSnappyBlockAsm64K:
MOVB $0xf0, (AX)
- MOVB R8, 1(AX)
+ MOVB DI, 1(AX)
ADDQ $0x02, AX
- CMPL R8, $0x40
+ CMPL DI, $0x40
JL memmove_match_emit_encodeSnappyBlockAsm64K
JMP memmove_long_match_emit_encodeSnappyBlockAsm64K
one_byte_match_emit_encodeSnappyBlockAsm64K:
- SHLB $0x02, R8
- MOVB R8, (AX)
+ SHLB $0x02, DI
+ MOVB DI, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeSnappyBlockAsm64K:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8:
- MOVQ (DI), R10
- MOVQ R10, (AX)
+ MOVQ (SI), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
- MOVQ (DI), R10
- MOVQ -8(DI)(R9*1), DI
- MOVQ R10, (AX)
- MOVQ DI, -8(AX)(R9*1)
+ MOVQ (SI), R9
+ MOVQ -8(SI)(R8*1), SI
+ MOVQ R9, (AX)
+ MOVQ SI, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
- MOVOU (DI), X0
- MOVOU -16(DI)(R9*1), X1
+ MOVOU (SI), X0
+ MOVOU -16(SI)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeSnappyBlockAsm64K:
- MOVQ R8, AX
+ MOVQ DI, AX
JMP emit_literal_done_match_emit_encodeSnappyBlockAsm64K
memmove_long_match_emit_encodeSnappyBlockAsm64K:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveLong
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
- MOVQ R9, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
+ MOVQ R8, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
- LEAQ -32(DI)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(SI)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
- MOVOU -32(DI)(R12*1), X4
- MOVOU -16(DI)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R9, R12
+ MOVOU -32(SI)(R11*1), X4
+ MOVOU -16(SI)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ R8, R11
JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ R8, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ DI, AX
emit_literal_done_match_emit_encodeSnappyBlockAsm64K:
match_nolit_loop_encodeSnappyBlockAsm64K:
- MOVL CX, DI
- SUBL SI, DI
- MOVL DI, 16(SP)
+ MOVL CX, SI
+ SUBL BX, SI
+ MOVL SI, 16(SP)
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), DI
- SUBL CX, DI
- LEAQ (DX)(CX*1), R8
- LEAQ (DX)(SI*1), SI
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), SI
+ SUBL CX, SI
+ LEAQ (DX)(CX*1), DI
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R10, R10
- CMPL DI, $0x08
+ XORL R9, R9
+ CMPL SI, $0x08
JL matchlen_match4_match_nolit_encodeSnappyBlockAsm64K
matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K:
- MOVQ (R8)(R10*1), R9
- XORQ (SI)(R10*1), R9
- TESTQ R9, R9
+ MOVQ (DI)(R9*1), R8
+ XORQ (BX)(R9*1), R8
+ TESTQ R8, R8
JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm64K
#ifdef GOAMD64_v3
- TZCNTQ R9, R9
+ TZCNTQ R8, R8
#else
- BSFQ R9, R9
+ BSFQ R8, R8
#endif
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
+ SARQ $0x03, R8
+ LEAL (R9)(R8*1), R9
JMP match_nolit_end_encodeSnappyBlockAsm64K
matchlen_loop_match_nolit_encodeSnappyBlockAsm64K:
- LEAL -8(DI), DI
- LEAL 8(R10), R10
- CMPL DI, $0x08
+ LEAL -8(SI), SI
+ LEAL 8(R9), R9
+ CMPL SI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K
JZ match_nolit_end_encodeSnappyBlockAsm64K
matchlen_match4_match_nolit_encodeSnappyBlockAsm64K:
- CMPL DI, $0x04
+ CMPL SI, $0x04
JL matchlen_match2_match_nolit_encodeSnappyBlockAsm64K
- MOVL (R8)(R10*1), R9
- CMPL (SI)(R10*1), R9
+ MOVL (DI)(R9*1), R8
+ CMPL (BX)(R9*1), R8
JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm64K
- SUBL $0x04, DI
- LEAL 4(R10), R10
+ SUBL $0x04, SI
+ LEAL 4(R9), R9
matchlen_match2_match_nolit_encodeSnappyBlockAsm64K:
- CMPL DI, $0x02
+ CMPL SI, $0x02
JL matchlen_match1_match_nolit_encodeSnappyBlockAsm64K
- MOVW (R8)(R10*1), R9
- CMPW (SI)(R10*1), R9
+ MOVW (DI)(R9*1), R8
+ CMPW (BX)(R9*1), R8
JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K
- SUBL $0x02, DI
- LEAL 2(R10), R10
+ SUBL $0x02, SI
+ LEAL 2(R9), R9
matchlen_match1_match_nolit_encodeSnappyBlockAsm64K:
- CMPL DI, $0x01
+ CMPL SI, $0x01
JL match_nolit_end_encodeSnappyBlockAsm64K
- MOVB (R8)(R10*1), R9
- CMPB (SI)(R10*1), R9
+ MOVB (DI)(R9*1), R8
+ CMPB (BX)(R9*1), R8
JNE match_nolit_end_encodeSnappyBlockAsm64K
- LEAL 1(R10), R10
+ LEAL 1(R9), R9
match_nolit_end_encodeSnappyBlockAsm64K:
- ADDL R10, CX
- MOVL 16(SP), SI
- ADDL $0x04, R10
+ ADDL R9, CX
+ MOVL 16(SP), BX
+ ADDL $0x04, R9
MOVL CX, 12(SP)
// emitCopy
two_byte_offset_match_nolit_encodeSnappyBlockAsm64K:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K
MOVB $0xee, (AX)
- MOVW SI, 1(AX)
- LEAL -60(R10), R10
+ MOVW BX, 1(AX)
+ LEAL -60(R9), R9
ADDQ $0x03, AX
JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm64K
two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K:
- CMPL R10, $0x0c
+ MOVL R9, SI
+ SHLL $0x02, SI
+ CMPL R9, $0x0c
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
- MOVB $0x01, BL
- LEAL -16(BX)(R10*4), R10
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ LEAL -15(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm64K
emit_copy_three_match_nolit_encodeSnappyBlockAsm64K:
- MOVB $0x02, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVW SI, 1(AX)
+ LEAL -2(SI), SI
+ MOVB SI, (AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeSnappyBlockAsm64K:
CMPL CX, 8(SP)
JGE emit_remainder_encodeSnappyBlockAsm64K
- MOVQ -2(DX)(CX*1), DI
+ MOVQ -2(DX)(CX*1), SI
CMPQ AX, (SP)
JL match_nolit_dst_ok_encodeSnappyBlockAsm64K
MOVQ $0x00000000, ret+48(FP)
RET
match_nolit_dst_ok_encodeSnappyBlockAsm64K:
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ DI, R8
- SHRQ $0x10, DI
- MOVQ DI, SI
- SHLQ $0x10, R8
- IMULQ R9, R8
- SHRQ $0x32, R8
- SHLQ $0x10, SI
- IMULQ R9, SI
- SHRQ $0x32, SI
- LEAL -2(CX), R9
- LEAQ 24(SP)(SI*4), R10
- MOVL (R10), SI
- MOVL R9, 24(SP)(R8*4)
- MOVL CX, (R10)
- CMPL (DX)(SI*1), DI
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ SI, DI
+ SHRQ $0x10, SI
+ MOVQ SI, BX
+ SHLQ $0x10, DI
+ IMULQ R8, DI
+ SHRQ $0x32, DI
+ SHLQ $0x10, BX
+ IMULQ R8, BX
+ SHRQ $0x32, BX
+ LEAL -2(CX), R8
+ LEAQ 24(SP)(BX*4), R9
+ MOVL (R9), BX
+ MOVL R8, 24(SP)(DI*4)
+ MOVL CX, (R9)
+ CMPL (DX)(BX*1), SI
JEQ match_nolit_loop_encodeSnappyBlockAsm64K
INCL CX
JMP search_loop_encodeSnappyBlockAsm64K
@@ -12021,8 +11968,8 @@ zero_loop_encodeSnappyBlockAsm12B:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -12032,278 +11979,278 @@ zero_loop_encodeSnappyBlockAsm12B:
MOVQ src_base+24(FP), DX
search_loop_encodeSnappyBlockAsm12B:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x05, SI
- LEAL 4(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x05, BX
+ LEAL 4(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeSnappyBlockAsm12B
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x000000cf1bbcdcbb, R9
- MOVQ DI, R10
- MOVQ DI, R11
- SHRQ $0x08, R11
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x000000cf1bbcdcbb, R8
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHRQ $0x08, R10
+ SHLQ $0x18, R9
+ IMULQ R8, R9
+ SHRQ $0x34, R9
SHLQ $0x18, R10
- IMULQ R9, R10
+ IMULQ R8, R10
SHRQ $0x34, R10
- SHLQ $0x18, R11
- IMULQ R9, R11
- SHRQ $0x34, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 24(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- LEAL 1(CX), R10
- MOVL R10, 24(SP)(R11*4)
- MOVQ DI, R10
- SHRQ $0x10, R10
- SHLQ $0x18, R10
- IMULQ R9, R10
- SHRQ $0x34, R10
- MOVL CX, R9
- SUBL 16(SP), R9
- MOVL 1(DX)(R9*1), R11
- MOVQ DI, R9
- SHRQ $0x08, R9
- CMPL R9, R11
- JNE no_repeat_found_encodeSnappyBlockAsm12B
- LEAL 1(CX), DI
- MOVL 12(SP), SI
- MOVL DI, R8
+ MOVL 24(SP)(R9*4), BX
+ MOVL 24(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ LEAL 1(CX), R9
+ MOVL R9, 24(SP)(R10*4)
+ MOVQ SI, R9
+ SHRQ $0x10, R9
+ SHLQ $0x18, R9
+ IMULQ R8, R9
+ SHRQ $0x34, R9
+ MOVL CX, R8
SUBL 16(SP), R8
+ MOVL 1(DX)(R8*1), R10
+ MOVQ SI, R8
+ SHRQ $0x08, R8
+ CMPL R8, R10
+ JNE no_repeat_found_encodeSnappyBlockAsm12B
+ LEAL 1(CX), SI
+ MOVL 12(SP), BX
+ MOVL SI, DI
+ SUBL 16(SP), DI
JZ repeat_extend_back_end_encodeSnappyBlockAsm12B
repeat_extend_back_loop_encodeSnappyBlockAsm12B:
- CMPL DI, SI
+ CMPL SI, BX
JLE repeat_extend_back_end_encodeSnappyBlockAsm12B
- MOVB -1(DX)(R8*1), BL
- MOVB -1(DX)(DI*1), R9
- CMPB BL, R9
+ MOVB -1(DX)(DI*1), R8
+ MOVB -1(DX)(SI*1), R9
+ CMPB R8, R9
JNE repeat_extend_back_end_encodeSnappyBlockAsm12B
- LEAL -1(DI), DI
- DECL R8
+ LEAL -1(SI), SI
+ DECL DI
JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B
repeat_extend_back_end_encodeSnappyBlockAsm12B:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
- MOVL DI, R8
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R9
- SUBL SI, R8
- LEAL -1(R8), SI
- CMPL SI, $0x3c
+ MOVL SI, DI
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R8
+ SUBL BX, DI
+ LEAL -1(DI), BX
+ CMPL BX, $0x3c
JLT one_byte_repeat_emit_encodeSnappyBlockAsm12B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_repeat_emit_encodeSnappyBlockAsm12B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B
two_bytes_repeat_emit_encodeSnappyBlockAsm12B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_repeat_emit_encodeSnappyBlockAsm12B
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B
one_byte_repeat_emit_encodeSnappyBlockAsm12B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_repeat_emit_encodeSnappyBlockAsm12B:
- LEAQ (AX)(R8*1), SI
+ LEAQ (AX)(DI*1), BX
// genMemMoveShort
- CMPQ R8, $0x08
+ CMPQ DI, $0x08
JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8
- CMPQ R8, $0x10
+ CMPQ DI, $0x10
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
- CMPQ R8, $0x20
+ CMPQ DI, $0x20
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8:
- MOVQ (R9), R10
- MOVQ R10, (AX)
+ MOVQ (R8), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
- MOVQ (R9), R10
- MOVQ -8(R9)(R8*1), R9
- MOVQ R10, (AX)
- MOVQ R9, -8(AX)(R8*1)
+ MOVQ (R8), R9
+ MOVQ -8(R8)(DI*1), R8
+ MOVQ R9, (AX)
+ MOVQ R8, -8(AX)(DI*1)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
- MOVOU (R9), X0
- MOVOU -16(R9)(R8*1), X1
+ MOVOU (R8), X0
+ MOVOU -16(R8)(DI*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R8*1)
+ MOVOU X1, -16(AX)(DI*1)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
- MOVOU (R9), X0
- MOVOU 16(R9), X1
- MOVOU -32(R9)(R8*1), X2
- MOVOU -16(R9)(R8*1), X3
+ MOVOU (R8), X0
+ MOVOU 16(R8), X1
+ MOVOU -32(R8)(DI*1), X2
+ MOVOU -16(R8)(DI*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R8*1)
- MOVOU X3, -16(AX)(R8*1)
+ MOVOU X2, -32(AX)(DI*1)
+ MOVOU X3, -16(AX)(DI*1)
memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
memmove_long_repeat_emit_encodeSnappyBlockAsm12B:
- LEAQ (AX)(R8*1), SI
+ LEAQ (AX)(DI*1), BX
// genMemMoveLong
- MOVOU (R9), X0
- MOVOU 16(R9), X1
- MOVOU -32(R9)(R8*1), X2
- MOVOU -16(R9)(R8*1), X3
- MOVQ R8, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (R8), X0
+ MOVOU 16(R8), X1
+ MOVOU -32(R8)(DI*1), X2
+ MOVOU -16(R8)(DI*1), X3
+ MOVQ DI, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
- LEAQ -32(R9)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(R8)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
- MOVOU -32(R9)(R12*1), X4
- MOVOU -16(R9)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R8, R12
+ MOVOU -32(R8)(R11*1), X4
+ MOVOU -16(R8)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ DI, R11
JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R8*1)
- MOVOU X3, -16(AX)(R8*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(DI*1)
+ MOVOU X3, -16(AX)(DI*1)
+ MOVQ BX, AX
emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B:
ADDL $0x05, CX
- MOVL CX, SI
- SUBL 16(SP), SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), SI
+ MOVL CX, BX
+ SUBL 16(SP), BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R11, R11
- CMPL R8, $0x08
+ XORL R10, R10
+ CMPL DI, $0x08
JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B
matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B:
- MOVQ (R9)(R11*1), R10
- XORQ (SI)(R11*1), R10
- TESTQ R10, R10
+ MOVQ (R8)(R10*1), R9
+ XORQ (BX)(R10*1), R9
+ TESTQ R9, R9
JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B
#ifdef GOAMD64_v3
- TZCNTQ R10, R10
+ TZCNTQ R9, R9
#else
- BSFQ R10, R10
+ BSFQ R9, R9
#endif
- SARQ $0x03, R10
- LEAL (R11)(R10*1), R11
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B
matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B:
- LEAL -8(R8), R8
- LEAL 8(R11), R11
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B
JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B
matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B
- MOVL (R9)(R11*1), R10
- CMPL (SI)(R11*1), R10
+ MOVL (R8)(R10*1), R9
+ CMPL (BX)(R10*1), R9
JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B
- SUBL $0x04, R8
- LEAL 4(R11), R11
+ SUBL $0x04, DI
+ LEAL 4(R10), R10
matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B
- MOVW (R9)(R11*1), R10
- CMPW (SI)(R11*1), R10
+ MOVW (R8)(R10*1), R9
+ CMPW (BX)(R10*1), R9
JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B
- SUBL $0x02, R8
- LEAL 2(R11), R11
+ SUBL $0x02, DI
+ LEAL 2(R10), R10
matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL repeat_extend_forward_end_encodeSnappyBlockAsm12B
- MOVB (R9)(R11*1), R10
- CMPB (SI)(R11*1), R10
+ MOVB (R8)(R10*1), R9
+ CMPB (BX)(R10*1), R9
JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B
- LEAL 1(R11), R11
+ LEAL 1(R10), R10
repeat_extend_forward_end_encodeSnappyBlockAsm12B:
- ADDL R11, CX
- MOVL CX, SI
- SUBL DI, SI
- MOVL 16(SP), DI
+ ADDL R10, CX
+ MOVL CX, BX
+ SUBL SI, BX
+ MOVL 16(SP), SI
// emitCopy
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B
MOVB $0xee, (AX)
- MOVW DI, 1(AX)
- LEAL -60(SI), SI
+ MOVW SI, 1(AX)
+ LEAL -60(BX), BX
ADDQ $0x03, AX
JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B
two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B:
- CMPL SI, $0x0c
+ MOVL BX, DI
+ SHLL $0x02, DI
+ CMPL BX, $0x0c
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
- MOVB $0x01, BL
- LEAL -16(BX)(SI*4), SI
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ LEAL -15(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeSnappyBlockAsm12B
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B:
- MOVB $0x02, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVW DI, 1(AX)
+ LEAL -2(DI), DI
+ MOVB DI, (AX)
+ MOVW SI, 1(AX)
ADDQ $0x03, AX
repeat_end_emit_encodeSnappyBlockAsm12B:
@@ -12311,16 +12258,16 @@ repeat_end_emit_encodeSnappyBlockAsm12B:
JMP search_loop_encodeSnappyBlockAsm12B
no_repeat_found_encodeSnappyBlockAsm12B:
- CMPL (DX)(SI*1), DI
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeSnappyBlockAsm12B
- SHRQ $0x08, DI
- MOVL 24(SP)(R10*4), SI
- LEAL 2(CX), R9
- CMPL (DX)(R8*1), DI
+ SHRQ $0x08, SI
+ MOVL 24(SP)(R9*4), BX
+ LEAL 2(CX), R8
+ CMPL (DX)(DI*1), SI
JEQ candidate2_match_encodeSnappyBlockAsm12B
- MOVL R9, 24(SP)(R10*4)
- SHRQ $0x08, DI
- CMPL (DX)(SI*1), DI
+ MOVL R8, 24(SP)(R9*4)
+ SHRQ $0x08, SI
+ CMPL (DX)(BX*1), SI
JEQ candidate3_match_encodeSnappyBlockAsm12B
MOVL 20(SP), CX
JMP search_loop_encodeSnappyBlockAsm12B
@@ -12330,288 +12277,288 @@ candidate3_match_encodeSnappyBlockAsm12B:
JMP candidate_match_encodeSnappyBlockAsm12B
candidate2_match_encodeSnappyBlockAsm12B:
- MOVL R9, 24(SP)(R10*4)
+ MOVL R8, 24(SP)(R9*4)
INCL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeSnappyBlockAsm12B:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeSnappyBlockAsm12B
match_extend_back_loop_encodeSnappyBlockAsm12B:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeSnappyBlockAsm12B
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeSnappyBlockAsm12B
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeSnappyBlockAsm12B
JMP match_extend_back_loop_encodeSnappyBlockAsm12B
match_extend_back_end_encodeSnappyBlockAsm12B:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeSnappyBlockAsm12B
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeSnappyBlockAsm12B:
- MOVL CX, DI
- MOVL 12(SP), R8
- CMPL R8, DI
+ MOVL CX, SI
+ MOVL 12(SP), DI
+ CMPL DI, SI
JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(R8*1), DI
- SUBL R8, R9
- LEAL -1(R9), R8
- CMPL R8, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(DI*1), SI
+ SUBL DI, R8
+ LEAL -1(R8), DI
+ CMPL DI, $0x3c
JLT one_byte_match_emit_encodeSnappyBlockAsm12B
- CMPL R8, $0x00000100
+ CMPL DI, $0x00000100
JLT two_bytes_match_emit_encodeSnappyBlockAsm12B
MOVB $0xf4, (AX)
- MOVW R8, 1(AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeSnappyBlockAsm12B
two_bytes_match_emit_encodeSnappyBlockAsm12B:
MOVB $0xf0, (AX)
- MOVB R8, 1(AX)
+ MOVB DI, 1(AX)
ADDQ $0x02, AX
- CMPL R8, $0x40
+ CMPL DI, $0x40
JL memmove_match_emit_encodeSnappyBlockAsm12B
JMP memmove_long_match_emit_encodeSnappyBlockAsm12B
one_byte_match_emit_encodeSnappyBlockAsm12B:
- SHLB $0x02, R8
- MOVB R8, (AX)
+ SHLB $0x02, DI
+ MOVB DI, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeSnappyBlockAsm12B:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8:
- MOVQ (DI), R10
- MOVQ R10, (AX)
+ MOVQ (SI), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
- MOVQ (DI), R10
- MOVQ -8(DI)(R9*1), DI
- MOVQ R10, (AX)
- MOVQ DI, -8(AX)(R9*1)
+ MOVQ (SI), R9
+ MOVQ -8(SI)(R8*1), SI
+ MOVQ R9, (AX)
+ MOVQ SI, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
- MOVOU (DI), X0
- MOVOU -16(DI)(R9*1), X1
+ MOVOU (SI), X0
+ MOVOU -16(SI)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeSnappyBlockAsm12B:
- MOVQ R8, AX
+ MOVQ DI, AX
JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B
memmove_long_match_emit_encodeSnappyBlockAsm12B:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveLong
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
- MOVQ R9, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
+ MOVQ R8, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
- LEAQ -32(DI)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(SI)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
- MOVOU -32(DI)(R12*1), X4
- MOVOU -16(DI)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R9, R12
+ MOVOU -32(SI)(R11*1), X4
+ MOVOU -16(SI)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ R8, R11
JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ R8, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ DI, AX
emit_literal_done_match_emit_encodeSnappyBlockAsm12B:
match_nolit_loop_encodeSnappyBlockAsm12B:
- MOVL CX, DI
- SUBL SI, DI
- MOVL DI, 16(SP)
+ MOVL CX, SI
+ SUBL BX, SI
+ MOVL SI, 16(SP)
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), DI
- SUBL CX, DI
- LEAQ (DX)(CX*1), R8
- LEAQ (DX)(SI*1), SI
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), SI
+ SUBL CX, SI
+ LEAQ (DX)(CX*1), DI
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R10, R10
- CMPL DI, $0x08
+ XORL R9, R9
+ CMPL SI, $0x08
JL matchlen_match4_match_nolit_encodeSnappyBlockAsm12B
matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B:
- MOVQ (R8)(R10*1), R9
- XORQ (SI)(R10*1), R9
- TESTQ R9, R9
+ MOVQ (DI)(R9*1), R8
+ XORQ (BX)(R9*1), R8
+ TESTQ R8, R8
JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm12B
#ifdef GOAMD64_v3
- TZCNTQ R9, R9
+ TZCNTQ R8, R8
#else
- BSFQ R9, R9
+ BSFQ R8, R8
#endif
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
+ SARQ $0x03, R8
+ LEAL (R9)(R8*1), R9
JMP match_nolit_end_encodeSnappyBlockAsm12B
matchlen_loop_match_nolit_encodeSnappyBlockAsm12B:
- LEAL -8(DI), DI
- LEAL 8(R10), R10
- CMPL DI, $0x08
+ LEAL -8(SI), SI
+ LEAL 8(R9), R9
+ CMPL SI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B
JZ match_nolit_end_encodeSnappyBlockAsm12B
matchlen_match4_match_nolit_encodeSnappyBlockAsm12B:
- CMPL DI, $0x04
+ CMPL SI, $0x04
JL matchlen_match2_match_nolit_encodeSnappyBlockAsm12B
- MOVL (R8)(R10*1), R9
- CMPL (SI)(R10*1), R9
+ MOVL (DI)(R9*1), R8
+ CMPL (BX)(R9*1), R8
JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm12B
- SUBL $0x04, DI
- LEAL 4(R10), R10
+ SUBL $0x04, SI
+ LEAL 4(R9), R9
matchlen_match2_match_nolit_encodeSnappyBlockAsm12B:
- CMPL DI, $0x02
+ CMPL SI, $0x02
JL matchlen_match1_match_nolit_encodeSnappyBlockAsm12B
- MOVW (R8)(R10*1), R9
- CMPW (SI)(R10*1), R9
+ MOVW (DI)(R9*1), R8
+ CMPW (BX)(R9*1), R8
JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B
- SUBL $0x02, DI
- LEAL 2(R10), R10
+ SUBL $0x02, SI
+ LEAL 2(R9), R9
matchlen_match1_match_nolit_encodeSnappyBlockAsm12B:
- CMPL DI, $0x01
+ CMPL SI, $0x01
JL match_nolit_end_encodeSnappyBlockAsm12B
- MOVB (R8)(R10*1), R9
- CMPB (SI)(R10*1), R9
+ MOVB (DI)(R9*1), R8
+ CMPB (BX)(R9*1), R8
JNE match_nolit_end_encodeSnappyBlockAsm12B
- LEAL 1(R10), R10
+ LEAL 1(R9), R9
match_nolit_end_encodeSnappyBlockAsm12B:
- ADDL R10, CX
- MOVL 16(SP), SI
- ADDL $0x04, R10
+ ADDL R9, CX
+ MOVL 16(SP), BX
+ ADDL $0x04, R9
MOVL CX, 12(SP)
// emitCopy
two_byte_offset_match_nolit_encodeSnappyBlockAsm12B:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B
MOVB $0xee, (AX)
- MOVW SI, 1(AX)
- LEAL -60(R10), R10
+ MOVW BX, 1(AX)
+ LEAL -60(R9), R9
ADDQ $0x03, AX
JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B
two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B:
- CMPL R10, $0x0c
+ MOVL R9, SI
+ SHLL $0x02, SI
+ CMPL R9, $0x0c
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
- MOVB $0x01, BL
- LEAL -16(BX)(R10*4), R10
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ LEAL -15(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B
emit_copy_three_match_nolit_encodeSnappyBlockAsm12B:
- MOVB $0x02, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVW SI, 1(AX)
+ LEAL -2(SI), SI
+ MOVB SI, (AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeSnappyBlockAsm12B:
CMPL CX, 8(SP)
JGE emit_remainder_encodeSnappyBlockAsm12B
- MOVQ -2(DX)(CX*1), DI
+ MOVQ -2(DX)(CX*1), SI
CMPQ AX, (SP)
JL match_nolit_dst_ok_encodeSnappyBlockAsm12B
MOVQ $0x00000000, ret+48(FP)
RET
match_nolit_dst_ok_encodeSnappyBlockAsm12B:
- MOVQ $0x000000cf1bbcdcbb, R9
- MOVQ DI, R8
- SHRQ $0x10, DI
- MOVQ DI, SI
- SHLQ $0x18, R8
- IMULQ R9, R8
- SHRQ $0x34, R8
- SHLQ $0x18, SI
- IMULQ R9, SI
- SHRQ $0x34, SI
- LEAL -2(CX), R9
- LEAQ 24(SP)(SI*4), R10
- MOVL (R10), SI
- MOVL R9, 24(SP)(R8*4)
- MOVL CX, (R10)
- CMPL (DX)(SI*1), DI
+ MOVQ $0x000000cf1bbcdcbb, R8
+ MOVQ SI, DI
+ SHRQ $0x10, SI
+ MOVQ SI, BX
+ SHLQ $0x18, DI
+ IMULQ R8, DI
+ SHRQ $0x34, DI
+ SHLQ $0x18, BX
+ IMULQ R8, BX
+ SHRQ $0x34, BX
+ LEAL -2(CX), R8
+ LEAQ 24(SP)(BX*4), R9
+ MOVL (R9), BX
+ MOVL R8, 24(SP)(DI*4)
+ MOVL CX, (R9)
+ CMPL (DX)(BX*1), SI
JEQ match_nolit_loop_encodeSnappyBlockAsm12B
INCL CX
JMP search_loop_encodeSnappyBlockAsm12B
@@ -12796,8 +12743,8 @@ zero_loop_encodeSnappyBlockAsm10B:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -12807,278 +12754,278 @@ zero_loop_encodeSnappyBlockAsm10B:
MOVQ src_base+24(FP), DX
search_loop_encodeSnappyBlockAsm10B:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x05, SI
- LEAL 4(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x05, BX
+ LEAL 4(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeSnappyBlockAsm10B
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x9e3779b1, R9
- MOVQ DI, R10
- MOVQ DI, R11
- SHRQ $0x08, R11
- SHLQ $0x20, R10
- IMULQ R9, R10
- SHRQ $0x36, R10
- SHLQ $0x20, R11
- IMULQ R9, R11
- SHRQ $0x36, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 24(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- LEAL 1(CX), R10
- MOVL R10, 24(SP)(R11*4)
- MOVQ DI, R10
- SHRQ $0x10, R10
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x9e3779b1, R8
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHRQ $0x08, R10
+ SHLQ $0x20, R9
+ IMULQ R8, R9
+ SHRQ $0x36, R9
SHLQ $0x20, R10
- IMULQ R9, R10
+ IMULQ R8, R10
SHRQ $0x36, R10
- MOVL CX, R9
- SUBL 16(SP), R9
- MOVL 1(DX)(R9*1), R11
- MOVQ DI, R9
- SHRQ $0x08, R9
- CMPL R9, R11
- JNE no_repeat_found_encodeSnappyBlockAsm10B
- LEAL 1(CX), DI
- MOVL 12(SP), SI
- MOVL DI, R8
+ MOVL 24(SP)(R9*4), BX
+ MOVL 24(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ LEAL 1(CX), R9
+ MOVL R9, 24(SP)(R10*4)
+ MOVQ SI, R9
+ SHRQ $0x10, R9
+ SHLQ $0x20, R9
+ IMULQ R8, R9
+ SHRQ $0x36, R9
+ MOVL CX, R8
SUBL 16(SP), R8
+ MOVL 1(DX)(R8*1), R10
+ MOVQ SI, R8
+ SHRQ $0x08, R8
+ CMPL R8, R10
+ JNE no_repeat_found_encodeSnappyBlockAsm10B
+ LEAL 1(CX), SI
+ MOVL 12(SP), BX
+ MOVL SI, DI
+ SUBL 16(SP), DI
JZ repeat_extend_back_end_encodeSnappyBlockAsm10B
repeat_extend_back_loop_encodeSnappyBlockAsm10B:
- CMPL DI, SI
+ CMPL SI, BX
JLE repeat_extend_back_end_encodeSnappyBlockAsm10B
- MOVB -1(DX)(R8*1), BL
- MOVB -1(DX)(DI*1), R9
- CMPB BL, R9
+ MOVB -1(DX)(DI*1), R8
+ MOVB -1(DX)(SI*1), R9
+ CMPB R8, R9
JNE repeat_extend_back_end_encodeSnappyBlockAsm10B
- LEAL -1(DI), DI
- DECL R8
+ LEAL -1(SI), SI
+ DECL DI
JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B
repeat_extend_back_end_encodeSnappyBlockAsm10B:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
- MOVL DI, R8
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R9
- SUBL SI, R8
- LEAL -1(R8), SI
- CMPL SI, $0x3c
+ MOVL SI, DI
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R8
+ SUBL BX, DI
+ LEAL -1(DI), BX
+ CMPL BX, $0x3c
JLT one_byte_repeat_emit_encodeSnappyBlockAsm10B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_repeat_emit_encodeSnappyBlockAsm10B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B
two_bytes_repeat_emit_encodeSnappyBlockAsm10B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_repeat_emit_encodeSnappyBlockAsm10B
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B
one_byte_repeat_emit_encodeSnappyBlockAsm10B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_repeat_emit_encodeSnappyBlockAsm10B:
- LEAQ (AX)(R8*1), SI
+ LEAQ (AX)(DI*1), BX
// genMemMoveShort
- CMPQ R8, $0x08
+ CMPQ DI, $0x08
JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8
- CMPQ R8, $0x10
+ CMPQ DI, $0x10
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
- CMPQ R8, $0x20
+ CMPQ DI, $0x20
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8:
- MOVQ (R9), R10
- MOVQ R10, (AX)
+ MOVQ (R8), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
- MOVQ (R9), R10
- MOVQ -8(R9)(R8*1), R9
- MOVQ R10, (AX)
- MOVQ R9, -8(AX)(R8*1)
+ MOVQ (R8), R9
+ MOVQ -8(R8)(DI*1), R8
+ MOVQ R9, (AX)
+ MOVQ R8, -8(AX)(DI*1)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
- MOVOU (R9), X0
- MOVOU -16(R9)(R8*1), X1
+ MOVOU (R8), X0
+ MOVOU -16(R8)(DI*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R8*1)
+ MOVOU X1, -16(AX)(DI*1)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
- MOVOU (R9), X0
- MOVOU 16(R9), X1
- MOVOU -32(R9)(R8*1), X2
- MOVOU -16(R9)(R8*1), X3
+ MOVOU (R8), X0
+ MOVOU 16(R8), X1
+ MOVOU -32(R8)(DI*1), X2
+ MOVOU -16(R8)(DI*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R8*1)
- MOVOU X3, -16(AX)(R8*1)
+ MOVOU X2, -32(AX)(DI*1)
+ MOVOU X3, -16(AX)(DI*1)
memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
memmove_long_repeat_emit_encodeSnappyBlockAsm10B:
- LEAQ (AX)(R8*1), SI
+ LEAQ (AX)(DI*1), BX
// genMemMoveLong
- MOVOU (R9), X0
- MOVOU 16(R9), X1
- MOVOU -32(R9)(R8*1), X2
- MOVOU -16(R9)(R8*1), X3
- MOVQ R8, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (R8), X0
+ MOVOU 16(R8), X1
+ MOVOU -32(R8)(DI*1), X2
+ MOVOU -16(R8)(DI*1), X3
+ MOVQ DI, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
- LEAQ -32(R9)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(R8)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
- MOVOU -32(R9)(R12*1), X4
- MOVOU -16(R9)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R8, R12
+ MOVOU -32(R8)(R11*1), X4
+ MOVOU -16(R8)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ DI, R11
JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R8*1)
- MOVOU X3, -16(AX)(R8*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(DI*1)
+ MOVOU X3, -16(AX)(DI*1)
+ MOVQ BX, AX
emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B:
ADDL $0x05, CX
- MOVL CX, SI
- SUBL 16(SP), SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), SI
+ MOVL CX, BX
+ SUBL 16(SP), BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R11, R11
- CMPL R8, $0x08
+ XORL R10, R10
+ CMPL DI, $0x08
JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B
matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B:
- MOVQ (R9)(R11*1), R10
- XORQ (SI)(R11*1), R10
- TESTQ R10, R10
+ MOVQ (R8)(R10*1), R9
+ XORQ (BX)(R10*1), R9
+ TESTQ R9, R9
JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B
#ifdef GOAMD64_v3
- TZCNTQ R10, R10
+ TZCNTQ R9, R9
#else
- BSFQ R10, R10
+ BSFQ R9, R9
#endif
- SARQ $0x03, R10
- LEAL (R11)(R10*1), R11
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B
matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B:
- LEAL -8(R8), R8
- LEAL 8(R11), R11
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B
JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B
matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B
- MOVL (R9)(R11*1), R10
- CMPL (SI)(R11*1), R10
+ MOVL (R8)(R10*1), R9
+ CMPL (BX)(R10*1), R9
JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B
- SUBL $0x04, R8
- LEAL 4(R11), R11
+ SUBL $0x04, DI
+ LEAL 4(R10), R10
matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B
- MOVW (R9)(R11*1), R10
- CMPW (SI)(R11*1), R10
+ MOVW (R8)(R10*1), R9
+ CMPW (BX)(R10*1), R9
JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B
- SUBL $0x02, R8
- LEAL 2(R11), R11
+ SUBL $0x02, DI
+ LEAL 2(R10), R10
matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL repeat_extend_forward_end_encodeSnappyBlockAsm10B
- MOVB (R9)(R11*1), R10
- CMPB (SI)(R11*1), R10
+ MOVB (R8)(R10*1), R9
+ CMPB (BX)(R10*1), R9
JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B
- LEAL 1(R11), R11
+ LEAL 1(R10), R10
repeat_extend_forward_end_encodeSnappyBlockAsm10B:
- ADDL R11, CX
- MOVL CX, SI
- SUBL DI, SI
- MOVL 16(SP), DI
+ ADDL R10, CX
+ MOVL CX, BX
+ SUBL SI, BX
+ MOVL 16(SP), SI
// emitCopy
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B
MOVB $0xee, (AX)
- MOVW DI, 1(AX)
- LEAL -60(SI), SI
+ MOVW SI, 1(AX)
+ LEAL -60(BX), BX
ADDQ $0x03, AX
JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B
two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B:
- CMPL SI, $0x0c
+ MOVL BX, DI
+ SHLL $0x02, DI
+ CMPL BX, $0x0c
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
- CMPL DI, $0x00000800
+ CMPL SI, $0x00000800
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
- MOVB $0x01, BL
- LEAL -16(BX)(SI*4), SI
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ LEAL -15(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeSnappyBlockAsm10B
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B:
- MOVB $0x02, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVW DI, 1(AX)
+ LEAL -2(DI), DI
+ MOVB DI, (AX)
+ MOVW SI, 1(AX)
ADDQ $0x03, AX
repeat_end_emit_encodeSnappyBlockAsm10B:
@@ -13086,16 +13033,16 @@ repeat_end_emit_encodeSnappyBlockAsm10B:
JMP search_loop_encodeSnappyBlockAsm10B
no_repeat_found_encodeSnappyBlockAsm10B:
- CMPL (DX)(SI*1), DI
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeSnappyBlockAsm10B
- SHRQ $0x08, DI
- MOVL 24(SP)(R10*4), SI
- LEAL 2(CX), R9
- CMPL (DX)(R8*1), DI
+ SHRQ $0x08, SI
+ MOVL 24(SP)(R9*4), BX
+ LEAL 2(CX), R8
+ CMPL (DX)(DI*1), SI
JEQ candidate2_match_encodeSnappyBlockAsm10B
- MOVL R9, 24(SP)(R10*4)
- SHRQ $0x08, DI
- CMPL (DX)(SI*1), DI
+ MOVL R8, 24(SP)(R9*4)
+ SHRQ $0x08, SI
+ CMPL (DX)(BX*1), SI
JEQ candidate3_match_encodeSnappyBlockAsm10B
MOVL 20(SP), CX
JMP search_loop_encodeSnappyBlockAsm10B
@@ -13105,288 +13052,288 @@ candidate3_match_encodeSnappyBlockAsm10B:
JMP candidate_match_encodeSnappyBlockAsm10B
candidate2_match_encodeSnappyBlockAsm10B:
- MOVL R9, 24(SP)(R10*4)
+ MOVL R8, 24(SP)(R9*4)
INCL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeSnappyBlockAsm10B:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeSnappyBlockAsm10B
match_extend_back_loop_encodeSnappyBlockAsm10B:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeSnappyBlockAsm10B
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeSnappyBlockAsm10B
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeSnappyBlockAsm10B
JMP match_extend_back_loop_encodeSnappyBlockAsm10B
match_extend_back_end_encodeSnappyBlockAsm10B:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeSnappyBlockAsm10B
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeSnappyBlockAsm10B:
- MOVL CX, DI
- MOVL 12(SP), R8
- CMPL R8, DI
+ MOVL CX, SI
+ MOVL 12(SP), DI
+ CMPL DI, SI
JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(R8*1), DI
- SUBL R8, R9
- LEAL -1(R9), R8
- CMPL R8, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(DI*1), SI
+ SUBL DI, R8
+ LEAL -1(R8), DI
+ CMPL DI, $0x3c
JLT one_byte_match_emit_encodeSnappyBlockAsm10B
- CMPL R8, $0x00000100
+ CMPL DI, $0x00000100
JLT two_bytes_match_emit_encodeSnappyBlockAsm10B
MOVB $0xf4, (AX)
- MOVW R8, 1(AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeSnappyBlockAsm10B
two_bytes_match_emit_encodeSnappyBlockAsm10B:
MOVB $0xf0, (AX)
- MOVB R8, 1(AX)
+ MOVB DI, 1(AX)
ADDQ $0x02, AX
- CMPL R8, $0x40
+ CMPL DI, $0x40
JL memmove_match_emit_encodeSnappyBlockAsm10B
JMP memmove_long_match_emit_encodeSnappyBlockAsm10B
one_byte_match_emit_encodeSnappyBlockAsm10B:
- SHLB $0x02, R8
- MOVB R8, (AX)
+ SHLB $0x02, DI
+ MOVB DI, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeSnappyBlockAsm10B:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64
emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8:
- MOVQ (DI), R10
- MOVQ R10, (AX)
+ MOVQ (SI), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
- MOVQ (DI), R10
- MOVQ -8(DI)(R9*1), DI
- MOVQ R10, (AX)
- MOVQ DI, -8(AX)(R9*1)
+ MOVQ (SI), R9
+ MOVQ -8(SI)(R8*1), SI
+ MOVQ R9, (AX)
+ MOVQ SI, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
- MOVOU (DI), X0
- MOVOU -16(DI)(R9*1), X1
+ MOVOU (SI), X0
+ MOVOU -16(SI)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeSnappyBlockAsm10B:
- MOVQ R8, AX
+ MOVQ DI, AX
JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B
memmove_long_match_emit_encodeSnappyBlockAsm10B:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveLong
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
- MOVQ R9, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
+ MOVQ R8, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
- LEAQ -32(DI)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(SI)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
- MOVOU -32(DI)(R12*1), X4
- MOVOU -16(DI)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R9, R12
+ MOVOU -32(SI)(R11*1), X4
+ MOVOU -16(SI)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ R8, R11
JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ R8, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ DI, AX
emit_literal_done_match_emit_encodeSnappyBlockAsm10B:
match_nolit_loop_encodeSnappyBlockAsm10B:
- MOVL CX, DI
- SUBL SI, DI
- MOVL DI, 16(SP)
+ MOVL CX, SI
+ SUBL BX, SI
+ MOVL SI, 16(SP)
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), DI
- SUBL CX, DI
- LEAQ (DX)(CX*1), R8
- LEAQ (DX)(SI*1), SI
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), SI
+ SUBL CX, SI
+ LEAQ (DX)(CX*1), DI
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R10, R10
- CMPL DI, $0x08
+ XORL R9, R9
+ CMPL SI, $0x08
JL matchlen_match4_match_nolit_encodeSnappyBlockAsm10B
matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B:
- MOVQ (R8)(R10*1), R9
- XORQ (SI)(R10*1), R9
- TESTQ R9, R9
+ MOVQ (DI)(R9*1), R8
+ XORQ (BX)(R9*1), R8
+ TESTQ R8, R8
JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm10B
#ifdef GOAMD64_v3
- TZCNTQ R9, R9
+ TZCNTQ R8, R8
#else
- BSFQ R9, R9
+ BSFQ R8, R8
#endif
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
+ SARQ $0x03, R8
+ LEAL (R9)(R8*1), R9
JMP match_nolit_end_encodeSnappyBlockAsm10B
matchlen_loop_match_nolit_encodeSnappyBlockAsm10B:
- LEAL -8(DI), DI
- LEAL 8(R10), R10
- CMPL DI, $0x08
+ LEAL -8(SI), SI
+ LEAL 8(R9), R9
+ CMPL SI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B
JZ match_nolit_end_encodeSnappyBlockAsm10B
matchlen_match4_match_nolit_encodeSnappyBlockAsm10B:
- CMPL DI, $0x04
+ CMPL SI, $0x04
JL matchlen_match2_match_nolit_encodeSnappyBlockAsm10B
- MOVL (R8)(R10*1), R9
- CMPL (SI)(R10*1), R9
+ MOVL (DI)(R9*1), R8
+ CMPL (BX)(R9*1), R8
JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm10B
- SUBL $0x04, DI
- LEAL 4(R10), R10
+ SUBL $0x04, SI
+ LEAL 4(R9), R9
matchlen_match2_match_nolit_encodeSnappyBlockAsm10B:
- CMPL DI, $0x02
+ CMPL SI, $0x02
JL matchlen_match1_match_nolit_encodeSnappyBlockAsm10B
- MOVW (R8)(R10*1), R9
- CMPW (SI)(R10*1), R9
+ MOVW (DI)(R9*1), R8
+ CMPW (BX)(R9*1), R8
JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B
- SUBL $0x02, DI
- LEAL 2(R10), R10
+ SUBL $0x02, SI
+ LEAL 2(R9), R9
matchlen_match1_match_nolit_encodeSnappyBlockAsm10B:
- CMPL DI, $0x01
+ CMPL SI, $0x01
JL match_nolit_end_encodeSnappyBlockAsm10B
- MOVB (R8)(R10*1), R9
- CMPB (SI)(R10*1), R9
+ MOVB (DI)(R9*1), R8
+ CMPB (BX)(R9*1), R8
JNE match_nolit_end_encodeSnappyBlockAsm10B
- LEAL 1(R10), R10
+ LEAL 1(R9), R9
match_nolit_end_encodeSnappyBlockAsm10B:
- ADDL R10, CX
- MOVL 16(SP), SI
- ADDL $0x04, R10
+ ADDL R9, CX
+ MOVL 16(SP), BX
+ ADDL $0x04, R9
MOVL CX, 12(SP)
// emitCopy
two_byte_offset_match_nolit_encodeSnappyBlockAsm10B:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B
MOVB $0xee, (AX)
- MOVW SI, 1(AX)
- LEAL -60(R10), R10
+ MOVW BX, 1(AX)
+ LEAL -60(R9), R9
ADDQ $0x03, AX
JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B
two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B:
- CMPL R10, $0x0c
+ MOVL R9, SI
+ SHLL $0x02, SI
+ CMPL R9, $0x0c
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
- CMPL SI, $0x00000800
+ CMPL BX, $0x00000800
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
- MOVB $0x01, BL
- LEAL -16(BX)(R10*4), R10
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ LEAL -15(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B
emit_copy_three_match_nolit_encodeSnappyBlockAsm10B:
- MOVB $0x02, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVW SI, 1(AX)
+ LEAL -2(SI), SI
+ MOVB SI, (AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeSnappyBlockAsm10B:
CMPL CX, 8(SP)
JGE emit_remainder_encodeSnappyBlockAsm10B
- MOVQ -2(DX)(CX*1), DI
+ MOVQ -2(DX)(CX*1), SI
CMPQ AX, (SP)
JL match_nolit_dst_ok_encodeSnappyBlockAsm10B
MOVQ $0x00000000, ret+48(FP)
RET
match_nolit_dst_ok_encodeSnappyBlockAsm10B:
- MOVQ $0x9e3779b1, R9
- MOVQ DI, R8
- SHRQ $0x10, DI
- MOVQ DI, SI
- SHLQ $0x20, R8
- IMULQ R9, R8
- SHRQ $0x36, R8
- SHLQ $0x20, SI
- IMULQ R9, SI
- SHRQ $0x36, SI
- LEAL -2(CX), R9
- LEAQ 24(SP)(SI*4), R10
- MOVL (R10), SI
- MOVL R9, 24(SP)(R8*4)
- MOVL CX, (R10)
- CMPL (DX)(SI*1), DI
+ MOVQ $0x9e3779b1, R8
+ MOVQ SI, DI
+ SHRQ $0x10, SI
+ MOVQ SI, BX
+ SHLQ $0x20, DI
+ IMULQ R8, DI
+ SHRQ $0x36, DI
+ SHLQ $0x20, BX
+ IMULQ R8, BX
+ SHRQ $0x36, BX
+ LEAL -2(CX), R8
+ LEAQ 24(SP)(BX*4), R9
+ MOVL (R9), BX
+ MOVL R8, 24(SP)(DI*4)
+ MOVL CX, (R9)
+ CMPL (DX)(BX*1), SI
JEQ match_nolit_loop_encodeSnappyBlockAsm10B
INCL CX
JMP search_loop_encodeSnappyBlockAsm10B
@@ -13571,8 +13518,8 @@ zero_loop_encodeSnappyBlockAsm8B:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -13582,276 +13529,276 @@ zero_loop_encodeSnappyBlockAsm8B:
MOVQ src_base+24(FP), DX
search_loop_encodeSnappyBlockAsm8B:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x04, SI
- LEAL 4(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x04, BX
+ LEAL 4(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeSnappyBlockAsm8B
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x9e3779b1, R9
- MOVQ DI, R10
- MOVQ DI, R11
- SHRQ $0x08, R11
- SHLQ $0x20, R10
- IMULQ R9, R10
- SHRQ $0x38, R10
- SHLQ $0x20, R11
- IMULQ R9, R11
- SHRQ $0x38, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 24(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- LEAL 1(CX), R10
- MOVL R10, 24(SP)(R11*4)
- MOVQ DI, R10
- SHRQ $0x10, R10
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x9e3779b1, R8
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHRQ $0x08, R10
+ SHLQ $0x20, R9
+ IMULQ R8, R9
+ SHRQ $0x38, R9
SHLQ $0x20, R10
- IMULQ R9, R10
+ IMULQ R8, R10
SHRQ $0x38, R10
- MOVL CX, R9
- SUBL 16(SP), R9
- MOVL 1(DX)(R9*1), R11
- MOVQ DI, R9
- SHRQ $0x08, R9
- CMPL R9, R11
- JNE no_repeat_found_encodeSnappyBlockAsm8B
- LEAL 1(CX), DI
- MOVL 12(SP), SI
- MOVL DI, R8
+ MOVL 24(SP)(R9*4), BX
+ MOVL 24(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ LEAL 1(CX), R9
+ MOVL R9, 24(SP)(R10*4)
+ MOVQ SI, R9
+ SHRQ $0x10, R9
+ SHLQ $0x20, R9
+ IMULQ R8, R9
+ SHRQ $0x38, R9
+ MOVL CX, R8
SUBL 16(SP), R8
+ MOVL 1(DX)(R8*1), R10
+ MOVQ SI, R8
+ SHRQ $0x08, R8
+ CMPL R8, R10
+ JNE no_repeat_found_encodeSnappyBlockAsm8B
+ LEAL 1(CX), SI
+ MOVL 12(SP), BX
+ MOVL SI, DI
+ SUBL 16(SP), DI
JZ repeat_extend_back_end_encodeSnappyBlockAsm8B
repeat_extend_back_loop_encodeSnappyBlockAsm8B:
- CMPL DI, SI
+ CMPL SI, BX
JLE repeat_extend_back_end_encodeSnappyBlockAsm8B
- MOVB -1(DX)(R8*1), BL
- MOVB -1(DX)(DI*1), R9
- CMPB BL, R9
+ MOVB -1(DX)(DI*1), R8
+ MOVB -1(DX)(SI*1), R9
+ CMPB R8, R9
JNE repeat_extend_back_end_encodeSnappyBlockAsm8B
- LEAL -1(DI), DI
- DECL R8
+ LEAL -1(SI), SI
+ DECL DI
JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B
repeat_extend_back_end_encodeSnappyBlockAsm8B:
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
- MOVL DI, R8
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R9
- SUBL SI, R8
- LEAL -1(R8), SI
- CMPL SI, $0x3c
+ MOVL SI, DI
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R8
+ SUBL BX, DI
+ LEAL -1(DI), BX
+ CMPL BX, $0x3c
JLT one_byte_repeat_emit_encodeSnappyBlockAsm8B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_repeat_emit_encodeSnappyBlockAsm8B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B
two_bytes_repeat_emit_encodeSnappyBlockAsm8B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_repeat_emit_encodeSnappyBlockAsm8B
JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B
one_byte_repeat_emit_encodeSnappyBlockAsm8B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_repeat_emit_encodeSnappyBlockAsm8B:
- LEAQ (AX)(R8*1), SI
+ LEAQ (AX)(DI*1), BX
// genMemMoveShort
- CMPQ R8, $0x08
+ CMPQ DI, $0x08
JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8
- CMPQ R8, $0x10
+ CMPQ DI, $0x10
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
- CMPQ R8, $0x20
+ CMPQ DI, $0x20
JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8:
- MOVQ (R9), R10
- MOVQ R10, (AX)
+ MOVQ (R8), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
- MOVQ (R9), R10
- MOVQ -8(R9)(R8*1), R9
- MOVQ R10, (AX)
- MOVQ R9, -8(AX)(R8*1)
+ MOVQ (R8), R9
+ MOVQ -8(R8)(DI*1), R8
+ MOVQ R9, (AX)
+ MOVQ R8, -8(AX)(DI*1)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
- MOVOU (R9), X0
- MOVOU -16(R9)(R8*1), X1
+ MOVOU (R8), X0
+ MOVOU -16(R8)(DI*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R8*1)
+ MOVOU X1, -16(AX)(DI*1)
JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
- MOVOU (R9), X0
- MOVOU 16(R9), X1
- MOVOU -32(R9)(R8*1), X2
- MOVOU -16(R9)(R8*1), X3
+ MOVOU (R8), X0
+ MOVOU 16(R8), X1
+ MOVOU -32(R8)(DI*1), X2
+ MOVOU -16(R8)(DI*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R8*1)
- MOVOU X3, -16(AX)(R8*1)
+ MOVOU X2, -32(AX)(DI*1)
+ MOVOU X3, -16(AX)(DI*1)
memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
memmove_long_repeat_emit_encodeSnappyBlockAsm8B:
- LEAQ (AX)(R8*1), SI
+ LEAQ (AX)(DI*1), BX
// genMemMoveLong
- MOVOU (R9), X0
- MOVOU 16(R9), X1
- MOVOU -32(R9)(R8*1), X2
- MOVOU -16(R9)(R8*1), X3
- MOVQ R8, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (R8), X0
+ MOVOU 16(R8), X1
+ MOVOU -32(R8)(DI*1), X2
+ MOVOU -16(R8)(DI*1), X3
+ MOVQ DI, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
- LEAQ -32(R9)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(R8)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
- MOVOU -32(R9)(R12*1), X4
- MOVOU -16(R9)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R8, R12
+ MOVOU -32(R8)(R11*1), X4
+ MOVOU -16(R8)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ DI, R11
JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R8*1)
- MOVOU X3, -16(AX)(R8*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(DI*1)
+ MOVOU X3, -16(AX)(DI*1)
+ MOVQ BX, AX
emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B:
ADDL $0x05, CX
- MOVL CX, SI
- SUBL 16(SP), SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), SI
+ MOVL CX, BX
+ SUBL 16(SP), BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R11, R11
- CMPL R8, $0x08
+ XORL R10, R10
+ CMPL DI, $0x08
JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B
matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B:
- MOVQ (R9)(R11*1), R10
- XORQ (SI)(R11*1), R10
- TESTQ R10, R10
+ MOVQ (R8)(R10*1), R9
+ XORQ (BX)(R10*1), R9
+ TESTQ R9, R9
JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B
#ifdef GOAMD64_v3
- TZCNTQ R10, R10
+ TZCNTQ R9, R9
#else
- BSFQ R10, R10
+ BSFQ R9, R9
#endif
- SARQ $0x03, R10
- LEAL (R11)(R10*1), R11
+ SARQ $0x03, R9
+ LEAL (R10)(R9*1), R10
JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B
matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B:
- LEAL -8(R8), R8
- LEAL 8(R11), R11
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B
JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B
matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B
- MOVL (R9)(R11*1), R10
- CMPL (SI)(R11*1), R10
+ MOVL (R8)(R10*1), R9
+ CMPL (BX)(R10*1), R9
JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B
- SUBL $0x04, R8
- LEAL 4(R11), R11
+ SUBL $0x04, DI
+ LEAL 4(R10), R10
matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B
- MOVW (R9)(R11*1), R10
- CMPW (SI)(R11*1), R10
+ MOVW (R8)(R10*1), R9
+ CMPW (BX)(R10*1), R9
JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B
- SUBL $0x02, R8
- LEAL 2(R11), R11
+ SUBL $0x02, DI
+ LEAL 2(R10), R10
matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL repeat_extend_forward_end_encodeSnappyBlockAsm8B
- MOVB (R9)(R11*1), R10
- CMPB (SI)(R11*1), R10
+ MOVB (R8)(R10*1), R9
+ CMPB (BX)(R10*1), R9
JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B
- LEAL 1(R11), R11
+ LEAL 1(R10), R10
repeat_extend_forward_end_encodeSnappyBlockAsm8B:
- ADDL R11, CX
- MOVL CX, SI
- SUBL DI, SI
- MOVL 16(SP), DI
+ ADDL R10, CX
+ MOVL CX, BX
+ SUBL SI, BX
+ MOVL 16(SP), SI
// emitCopy
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B:
- CMPL SI, $0x40
+ CMPL BX, $0x40
JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B
MOVB $0xee, (AX)
- MOVW DI, 1(AX)
- LEAL -60(SI), SI
+ MOVW SI, 1(AX)
+ LEAL -60(BX), BX
ADDQ $0x03, AX
JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B
two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B:
- CMPL SI, $0x0c
+ MOVL BX, DI
+ SHLL $0x02, DI
+ CMPL BX, $0x0c
JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B
- MOVB $0x01, BL
- LEAL -16(BX)(SI*4), SI
- MOVB DI, 1(AX)
- SHRL $0x08, DI
- SHLL $0x05, DI
- ORL DI, SI
- MOVB SI, (AX)
+ LEAL -15(DI), DI
+ MOVB SI, 1(AX)
+ SHRL $0x08, SI
+ SHLL $0x05, SI
+ ORL SI, DI
+ MOVB DI, (AX)
ADDQ $0x02, AX
JMP repeat_end_emit_encodeSnappyBlockAsm8B
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B:
- MOVB $0x02, BL
- LEAL -4(BX)(SI*4), SI
- MOVB SI, (AX)
- MOVW DI, 1(AX)
+ LEAL -2(DI), DI
+ MOVB DI, (AX)
+ MOVW SI, 1(AX)
ADDQ $0x03, AX
repeat_end_emit_encodeSnappyBlockAsm8B:
@@ -13859,16 +13806,16 @@ repeat_end_emit_encodeSnappyBlockAsm8B:
JMP search_loop_encodeSnappyBlockAsm8B
no_repeat_found_encodeSnappyBlockAsm8B:
- CMPL (DX)(SI*1), DI
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeSnappyBlockAsm8B
- SHRQ $0x08, DI
- MOVL 24(SP)(R10*4), SI
- LEAL 2(CX), R9
- CMPL (DX)(R8*1), DI
+ SHRQ $0x08, SI
+ MOVL 24(SP)(R9*4), BX
+ LEAL 2(CX), R8
+ CMPL (DX)(DI*1), SI
JEQ candidate2_match_encodeSnappyBlockAsm8B
- MOVL R9, 24(SP)(R10*4)
- SHRQ $0x08, DI
- CMPL (DX)(SI*1), DI
+ MOVL R8, 24(SP)(R9*4)
+ SHRQ $0x08, SI
+ CMPL (DX)(BX*1), SI
JEQ candidate3_match_encodeSnappyBlockAsm8B
MOVL 20(SP), CX
JMP search_loop_encodeSnappyBlockAsm8B
@@ -13878,286 +13825,286 @@ candidate3_match_encodeSnappyBlockAsm8B:
JMP candidate_match_encodeSnappyBlockAsm8B
candidate2_match_encodeSnappyBlockAsm8B:
- MOVL R9, 24(SP)(R10*4)
+ MOVL R8, 24(SP)(R9*4)
INCL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeSnappyBlockAsm8B:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeSnappyBlockAsm8B
match_extend_back_loop_encodeSnappyBlockAsm8B:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeSnappyBlockAsm8B
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeSnappyBlockAsm8B
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeSnappyBlockAsm8B
JMP match_extend_back_loop_encodeSnappyBlockAsm8B
match_extend_back_end_encodeSnappyBlockAsm8B:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeSnappyBlockAsm8B
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeSnappyBlockAsm8B:
- MOVL CX, DI
- MOVL 12(SP), R8
- CMPL R8, DI
+ MOVL CX, SI
+ MOVL 12(SP), DI
+ CMPL DI, SI
JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(R8*1), DI
- SUBL R8, R9
- LEAL -1(R9), R8
- CMPL R8, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(DI*1), SI
+ SUBL DI, R8
+ LEAL -1(R8), DI
+ CMPL DI, $0x3c
JLT one_byte_match_emit_encodeSnappyBlockAsm8B
- CMPL R8, $0x00000100
+ CMPL DI, $0x00000100
JLT two_bytes_match_emit_encodeSnappyBlockAsm8B
MOVB $0xf4, (AX)
- MOVW R8, 1(AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeSnappyBlockAsm8B
two_bytes_match_emit_encodeSnappyBlockAsm8B:
MOVB $0xf0, (AX)
- MOVB R8, 1(AX)
+ MOVB DI, 1(AX)
ADDQ $0x02, AX
- CMPL R8, $0x40
+ CMPL DI, $0x40
JL memmove_match_emit_encodeSnappyBlockAsm8B
JMP memmove_long_match_emit_encodeSnappyBlockAsm8B
one_byte_match_emit_encodeSnappyBlockAsm8B:
- SHLB $0x02, R8
- MOVB R8, (AX)
+ SHLB $0x02, DI
+ MOVB DI, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeSnappyBlockAsm8B:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64
emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8:
- MOVQ (DI), R10
- MOVQ R10, (AX)
+ MOVQ (SI), R9
+ MOVQ R9, (AX)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
- MOVQ (DI), R10
- MOVQ -8(DI)(R9*1), DI
- MOVQ R10, (AX)
- MOVQ DI, -8(AX)(R9*1)
+ MOVQ (SI), R9
+ MOVQ -8(SI)(R8*1), SI
+ MOVQ R9, (AX)
+ MOVQ SI, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
- MOVOU (DI), X0
- MOVOU -16(DI)(R9*1), X1
+ MOVOU (SI), X0
+ MOVOU -16(SI)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeSnappyBlockAsm8B:
- MOVQ R8, AX
+ MOVQ DI, AX
JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B
memmove_long_match_emit_encodeSnappyBlockAsm8B:
- LEAQ (AX)(R9*1), R8
+ LEAQ (AX)(R8*1), DI
// genMemMoveLong
- MOVOU (DI), X0
- MOVOU 16(DI), X1
- MOVOU -32(DI)(R9*1), X2
- MOVOU -16(DI)(R9*1), X3
- MOVQ R9, R11
- SHRQ $0x05, R11
- MOVQ AX, R10
- ANDL $0x0000001f, R10
- MOVQ $0x00000040, R12
- SUBQ R10, R12
- DECQ R11
+ MOVOU (SI), X0
+ MOVOU 16(SI), X1
+ MOVOU -32(SI)(R8*1), X2
+ MOVOU -16(SI)(R8*1), X3
+ MOVQ R8, R10
+ SHRQ $0x05, R10
+ MOVQ AX, R9
+ ANDL $0x0000001f, R9
+ MOVQ $0x00000040, R11
+ SUBQ R9, R11
+ DECQ R10
JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
- LEAQ -32(DI)(R12*1), R10
- LEAQ -32(AX)(R12*1), R13
+ LEAQ -32(SI)(R11*1), R9
+ LEAQ -32(AX)(R11*1), R12
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
- MOVOU (R10), X4
- MOVOU 16(R10), X5
- MOVOA X4, (R13)
- MOVOA X5, 16(R13)
- ADDQ $0x20, R13
- ADDQ $0x20, R10
+ MOVOU (R9), X4
+ MOVOU 16(R9), X5
+ MOVOA X4, (R12)
+ MOVOA X5, 16(R12)
ADDQ $0x20, R12
- DECQ R11
+ ADDQ $0x20, R9
+ ADDQ $0x20, R11
+ DECQ R10
JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
- MOVOU -32(DI)(R12*1), X4
- MOVOU -16(DI)(R12*1), X5
- MOVOA X4, -32(AX)(R12*1)
- MOVOA X5, -16(AX)(R12*1)
- ADDQ $0x20, R12
- CMPQ R9, R12
+ MOVOU -32(SI)(R11*1), X4
+ MOVOU -16(SI)(R11*1), X5
+ MOVOA X4, -32(AX)(R11*1)
+ MOVOA X5, -16(AX)(R11*1)
+ ADDQ $0x20, R11
+ CMPQ R8, R11
JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ R8, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ DI, AX
emit_literal_done_match_emit_encodeSnappyBlockAsm8B:
match_nolit_loop_encodeSnappyBlockAsm8B:
- MOVL CX, DI
- SUBL SI, DI
- MOVL DI, 16(SP)
+ MOVL CX, SI
+ SUBL BX, SI
+ MOVL SI, 16(SP)
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), DI
- SUBL CX, DI
- LEAQ (DX)(CX*1), R8
- LEAQ (DX)(SI*1), SI
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), SI
+ SUBL CX, SI
+ LEAQ (DX)(CX*1), DI
+ LEAQ (DX)(BX*1), BX
// matchLen
- XORL R10, R10
- CMPL DI, $0x08
+ XORL R9, R9
+ CMPL SI, $0x08
JL matchlen_match4_match_nolit_encodeSnappyBlockAsm8B
matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B:
- MOVQ (R8)(R10*1), R9
- XORQ (SI)(R10*1), R9
- TESTQ R9, R9
+ MOVQ (DI)(R9*1), R8
+ XORQ (BX)(R9*1), R8
+ TESTQ R8, R8
JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm8B
#ifdef GOAMD64_v3
- TZCNTQ R9, R9
+ TZCNTQ R8, R8
#else
- BSFQ R9, R9
+ BSFQ R8, R8
#endif
- SARQ $0x03, R9
- LEAL (R10)(R9*1), R10
+ SARQ $0x03, R8
+ LEAL (R9)(R8*1), R9
JMP match_nolit_end_encodeSnappyBlockAsm8B
matchlen_loop_match_nolit_encodeSnappyBlockAsm8B:
- LEAL -8(DI), DI
- LEAL 8(R10), R10
- CMPL DI, $0x08
+ LEAL -8(SI), SI
+ LEAL 8(R9), R9
+ CMPL SI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B
JZ match_nolit_end_encodeSnappyBlockAsm8B
matchlen_match4_match_nolit_encodeSnappyBlockAsm8B:
- CMPL DI, $0x04
+ CMPL SI, $0x04
JL matchlen_match2_match_nolit_encodeSnappyBlockAsm8B
- MOVL (R8)(R10*1), R9
- CMPL (SI)(R10*1), R9
+ MOVL (DI)(R9*1), R8
+ CMPL (BX)(R9*1), R8
JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm8B
- SUBL $0x04, DI
- LEAL 4(R10), R10
+ SUBL $0x04, SI
+ LEAL 4(R9), R9
matchlen_match2_match_nolit_encodeSnappyBlockAsm8B:
- CMPL DI, $0x02
+ CMPL SI, $0x02
JL matchlen_match1_match_nolit_encodeSnappyBlockAsm8B
- MOVW (R8)(R10*1), R9
- CMPW (SI)(R10*1), R9
+ MOVW (DI)(R9*1), R8
+ CMPW (BX)(R9*1), R8
JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B
- SUBL $0x02, DI
- LEAL 2(R10), R10
+ SUBL $0x02, SI
+ LEAL 2(R9), R9
matchlen_match1_match_nolit_encodeSnappyBlockAsm8B:
- CMPL DI, $0x01
+ CMPL SI, $0x01
JL match_nolit_end_encodeSnappyBlockAsm8B
- MOVB (R8)(R10*1), R9
- CMPB (SI)(R10*1), R9
+ MOVB (DI)(R9*1), R8
+ CMPB (BX)(R9*1), R8
JNE match_nolit_end_encodeSnappyBlockAsm8B
- LEAL 1(R10), R10
+ LEAL 1(R9), R9
match_nolit_end_encodeSnappyBlockAsm8B:
- ADDL R10, CX
- MOVL 16(SP), SI
- ADDL $0x04, R10
+ ADDL R9, CX
+ MOVL 16(SP), BX
+ ADDL $0x04, R9
MOVL CX, 12(SP)
// emitCopy
two_byte_offset_match_nolit_encodeSnappyBlockAsm8B:
- CMPL R10, $0x40
+ CMPL R9, $0x40
JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B
MOVB $0xee, (AX)
- MOVW SI, 1(AX)
- LEAL -60(R10), R10
+ MOVW BX, 1(AX)
+ LEAL -60(R9), R9
ADDQ $0x03, AX
JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B
two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B:
- CMPL R10, $0x0c
+ MOVL R9, SI
+ SHLL $0x02, SI
+ CMPL R9, $0x0c
JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B
- MOVB $0x01, BL
- LEAL -16(BX)(R10*4), R10
- MOVB SI, 1(AX)
- SHRL $0x08, SI
- SHLL $0x05, SI
- ORL SI, R10
- MOVB R10, (AX)
+ LEAL -15(SI), SI
+ MOVB BL, 1(AX)
+ SHRL $0x08, BX
+ SHLL $0x05, BX
+ ORL BX, SI
+ MOVB SI, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B
emit_copy_three_match_nolit_encodeSnappyBlockAsm8B:
- MOVB $0x02, BL
- LEAL -4(BX)(R10*4), R10
- MOVB R10, (AX)
- MOVW SI, 1(AX)
+ LEAL -2(SI), SI
+ MOVB SI, (AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeSnappyBlockAsm8B:
CMPL CX, 8(SP)
JGE emit_remainder_encodeSnappyBlockAsm8B
- MOVQ -2(DX)(CX*1), DI
+ MOVQ -2(DX)(CX*1), SI
CMPQ AX, (SP)
JL match_nolit_dst_ok_encodeSnappyBlockAsm8B
MOVQ $0x00000000, ret+48(FP)
RET
match_nolit_dst_ok_encodeSnappyBlockAsm8B:
- MOVQ $0x9e3779b1, R9
- MOVQ DI, R8
- SHRQ $0x10, DI
- MOVQ DI, SI
- SHLQ $0x20, R8
- IMULQ R9, R8
- SHRQ $0x38, R8
- SHLQ $0x20, SI
- IMULQ R9, SI
- SHRQ $0x38, SI
- LEAL -2(CX), R9
- LEAQ 24(SP)(SI*4), R10
- MOVL (R10), SI
- MOVL R9, 24(SP)(R8*4)
- MOVL CX, (R10)
- CMPL (DX)(SI*1), DI
+ MOVQ $0x9e3779b1, R8
+ MOVQ SI, DI
+ SHRQ $0x10, SI
+ MOVQ SI, BX
+ SHLQ $0x20, DI
+ IMULQ R8, DI
+ SHRQ $0x38, DI
+ SHLQ $0x20, BX
+ IMULQ R8, BX
+ SHRQ $0x38, BX
+ LEAL -2(CX), R8
+ LEAQ 24(SP)(BX*4), R9
+ MOVL (R9), BX
+ MOVL R8, 24(SP)(DI*4)
+ MOVL CX, (R9)
+ CMPL (DX)(BX*1), SI
JEQ match_nolit_loop_encodeSnappyBlockAsm8B
INCL CX
JMP search_loop_encodeSnappyBlockAsm8B
@@ -14342,8 +14289,8 @@ zero_loop_encodeSnappyBetterBlockAsm:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -14353,369 +14300,369 @@ zero_loop_encodeSnappyBetterBlockAsm:
MOVQ src_base+24(FP), DX
search_loop_encodeSnappyBetterBlockAsm:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x07, SI
- CMPL SI, $0x63
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x07, BX
+ CMPL BX, $0x63
JLE check_maxskip_ok_encodeSnappyBetterBlockAsm
- LEAL 100(CX), SI
+ LEAL 100(CX), BX
JMP check_maxskip_cont_encodeSnappyBetterBlockAsm
check_maxskip_ok_encodeSnappyBetterBlockAsm:
- LEAL 1(CX)(SI*1), SI
+ LEAL 1(CX)(BX*1), BX
check_maxskip_cont_encodeSnappyBetterBlockAsm:
- CMPL SI, 8(SP)
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeSnappyBetterBlockAsm
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x00cf1bbcdcbfa563, R9
- MOVQ $0x9e3779b1, SI
- MOVQ DI, R10
- MOVQ DI, R11
- SHLQ $0x08, R10
- IMULQ R9, R10
- SHRQ $0x2f, R10
- SHLQ $0x20, R11
- IMULQ SI, R11
- SHRQ $0x32, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 524312(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- MOVL CX, 524312(SP)(R11*4)
- MOVQ (DX)(SI*1), R10
- MOVQ (DX)(R8*1), R11
- CMPQ R10, DI
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x00cf1bbcdcbfa563, R8
+ MOVQ $0x9e3779b1, BX
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHLQ $0x08, R9
+ IMULQ R8, R9
+ SHRQ $0x2f, R9
+ SHLQ $0x20, R10
+ IMULQ BX, R10
+ SHRQ $0x32, R10
+ MOVL 24(SP)(R9*4), BX
+ MOVL 524312(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ MOVL CX, 524312(SP)(R10*4)
+ MOVQ (DX)(BX*1), R9
+ MOVQ (DX)(DI*1), R10
+ CMPQ R9, SI
JEQ candidate_match_encodeSnappyBetterBlockAsm
- CMPQ R11, DI
+ CMPQ R10, SI
JNE no_short_found_encodeSnappyBetterBlockAsm
- MOVL R8, SI
+ MOVL DI, BX
JMP candidate_match_encodeSnappyBetterBlockAsm
no_short_found_encodeSnappyBetterBlockAsm:
- CMPL R10, DI
+ CMPL R9, SI
JEQ candidate_match_encodeSnappyBetterBlockAsm
- CMPL R11, DI
+ CMPL R10, SI
JEQ candidateS_match_encodeSnappyBetterBlockAsm
MOVL 20(SP), CX
JMP search_loop_encodeSnappyBetterBlockAsm
candidateS_match_encodeSnappyBetterBlockAsm:
- SHRQ $0x08, DI
- MOVQ DI, R10
- SHLQ $0x08, R10
- IMULQ R9, R10
- SHRQ $0x2f, R10
- MOVL 24(SP)(R10*4), SI
+ SHRQ $0x08, SI
+ MOVQ SI, R9
+ SHLQ $0x08, R9
+ IMULQ R8, R9
+ SHRQ $0x2f, R9
+ MOVL 24(SP)(R9*4), BX
INCL CX
- MOVL CX, 24(SP)(R10*4)
- CMPL (DX)(SI*1), DI
+ MOVL CX, 24(SP)(R9*4)
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeSnappyBetterBlockAsm
DECL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeSnappyBetterBlockAsm:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeSnappyBetterBlockAsm
match_extend_back_loop_encodeSnappyBetterBlockAsm:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeSnappyBetterBlockAsm
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeSnappyBetterBlockAsm
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeSnappyBetterBlockAsm
JMP match_extend_back_loop_encodeSnappyBetterBlockAsm
match_extend_back_end_encodeSnappyBetterBlockAsm:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 5(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 5(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeSnappyBetterBlockAsm
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeSnappyBetterBlockAsm:
- MOVL CX, DI
+ MOVL CX, SI
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), R10
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), R9
// matchLen
- XORL R12, R12
- CMPL R8, $0x08
+ XORL R11, R11
+ CMPL DI, $0x08
JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm
matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm:
- MOVQ (R9)(R12*1), R11
- XORQ (R10)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R8)(R11*1), R10
+ XORQ (R9)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP match_nolit_end_encodeSnappyBetterBlockAsm
matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm:
- LEAL -8(R8), R8
- LEAL 8(R12), R12
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R11), R11
+ CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm
JZ match_nolit_end_encodeSnappyBetterBlockAsm
matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm
- MOVL (R9)(R12*1), R11
- CMPL (R10)(R12*1), R11
+ MOVL (R8)(R11*1), R10
+ CMPL (R9)(R11*1), R10
JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm
- SUBL $0x04, R8
- LEAL 4(R12), R12
+ SUBL $0x04, DI
+ LEAL 4(R11), R11
matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm
- MOVW (R9)(R12*1), R11
- CMPW (R10)(R12*1), R11
+ MOVW (R8)(R11*1), R10
+ CMPW (R9)(R11*1), R10
JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm
- SUBL $0x02, R8
- LEAL 2(R12), R12
+ SUBL $0x02, DI
+ LEAL 2(R11), R11
matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL match_nolit_end_encodeSnappyBetterBlockAsm
- MOVB (R9)(R12*1), R11
- CMPB (R10)(R12*1), R11
+ MOVB (R8)(R11*1), R10
+ CMPB (R9)(R11*1), R10
JNE match_nolit_end_encodeSnappyBetterBlockAsm
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
match_nolit_end_encodeSnappyBetterBlockAsm:
- MOVL CX, R8
- SUBL SI, R8
+ MOVL CX, DI
+ SUBL BX, DI
// Check if repeat
- CMPL R12, $0x01
+ CMPL R11, $0x01
JG match_length_ok_encodeSnappyBetterBlockAsm
- CMPL R8, $0x0000ffff
+ CMPL DI, $0x0000ffff
JLE match_length_ok_encodeSnappyBetterBlockAsm
MOVL 20(SP), CX
INCL CX
JMP search_loop_encodeSnappyBetterBlockAsm
match_length_ok_encodeSnappyBetterBlockAsm:
- MOVL R8, 16(SP)
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL DI, 16(SP)
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_encodeSnappyBetterBlockAsm
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm
- CMPL SI, $0x00010000
+ CMPL BX, $0x00010000
JLT three_bytes_match_emit_encodeSnappyBetterBlockAsm
- CMPL SI, $0x01000000
+ CMPL BX, $0x01000000
JLT four_bytes_match_emit_encodeSnappyBetterBlockAsm
MOVB $0xfc, (AX)
- MOVL SI, 1(AX)
+ MOVL BX, 1(AX)
ADDQ $0x05, AX
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
four_bytes_match_emit_encodeSnappyBetterBlockAsm:
- MOVL SI, R11
- SHRL $0x10, R11
+ MOVL BX, R10
+ SHRL $0x10, R10
MOVB $0xf8, (AX)
- MOVW SI, 1(AX)
- MOVB R11, 3(AX)
+ MOVW BX, 1(AX)
+ MOVB R10, 3(AX)
ADDQ $0x04, AX
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
three_bytes_match_emit_encodeSnappyBetterBlockAsm:
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
two_bytes_match_emit_encodeSnappyBetterBlockAsm:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_encodeSnappyBetterBlockAsm
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
one_byte_match_emit_encodeSnappyBetterBlockAsm:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeSnappyBetterBlockAsm:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8:
- MOVQ (R10), R11
- MOVQ R11, (AX)
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
memmove_long_match_emit_encodeSnappyBetterBlockAsm:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_encodeSnappyBetterBlockAsm:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitCopy
- CMPL R8, $0x00010000
+ CMPL DI, $0x00010000
JL two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
MOVB $0xff, (AX)
- MOVL R8, 1(AX)
- LEAL -64(R12), R12
+ MOVL DI, 1(AX)
+ LEAL -64(R11), R11
ADDQ $0x05, AX
- CMPL R12, $0x04
+ CMPL R11, $0x04
JL four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
JMP four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm
four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm:
- TESTL R12, R12
+ TESTL R11, R11
JZ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
- MOVB $0x03, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVL R8, 1(AX)
+ XORL BX, BX
+ LEAL -1(BX)(R11*4), R11
+ MOVB R11, (AX)
+ MOVL DI, 1(AX)
ADDQ $0x05, AX
JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm
MOVB $0xee, (AX)
- MOVW R8, 1(AX)
- LEAL -60(R12), R12
+ MOVW DI, 1(AX)
+ LEAL -60(R11), R11
ADDQ $0x03, AX
JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm:
- CMPL R12, $0x0c
+ MOVL R11, BX
+ SHLL $0x02, BX
+ CMPL R11, $0x0c
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
- MOVB $0x01, BL
- LEAL -16(BX)(R12*4), R12
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ LEAL -15(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm:
- MOVB $0x02, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVW R8, 1(AX)
+ LEAL -2(BX), BX
+ MOVB BL, (AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm:
@@ -14727,50 +14674,50 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm:
RET
match_nolit_dst_ok_encodeSnappyBetterBlockAsm:
- MOVQ $0x00cf1bbcdcbfa563, SI
- MOVQ $0x9e3779b1, R8
- LEAQ 1(DI), DI
- LEAQ -2(CX), R9
- MOVQ (DX)(DI*1), R10
- MOVQ 1(DX)(DI*1), R11
- MOVQ (DX)(R9*1), R12
- MOVQ 1(DX)(R9*1), R13
- SHLQ $0x08, R10
- IMULQ SI, R10
- SHRQ $0x2f, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x32, R11
- SHLQ $0x08, R12
- IMULQ SI, R12
- SHRQ $0x2f, R12
- SHLQ $0x20, R13
- IMULQ R8, R13
- SHRQ $0x32, R13
- LEAQ 1(DI), R8
- LEAQ 1(R9), R14
- MOVL DI, 24(SP)(R10*4)
- MOVL R9, 24(SP)(R12*4)
- MOVL R8, 524312(SP)(R11*4)
- MOVL R14, 524312(SP)(R13*4)
- ADDQ $0x01, DI
- SUBQ $0x01, R9
+ MOVQ $0x00cf1bbcdcbfa563, BX
+ MOVQ $0x9e3779b1, DI
+ LEAQ 1(SI), SI
+ LEAQ -2(CX), R8
+ MOVQ (DX)(SI*1), R9
+ MOVQ 1(DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ MOVQ 1(DX)(R8*1), R12
+ SHLQ $0x08, R9
+ IMULQ BX, R9
+ SHRQ $0x2f, R9
+ SHLQ $0x20, R10
+ IMULQ DI, R10
+ SHRQ $0x32, R10
+ SHLQ $0x08, R11
+ IMULQ BX, R11
+ SHRQ $0x2f, R11
+ SHLQ $0x20, R12
+ IMULQ DI, R12
+ SHRQ $0x32, R12
+ LEAQ 1(SI), DI
+ LEAQ 1(R8), R13
+ MOVL SI, 24(SP)(R9*4)
+ MOVL R8, 24(SP)(R11*4)
+ MOVL DI, 524312(SP)(R10*4)
+ MOVL R13, 524312(SP)(R12*4)
+ ADDQ $0x01, SI
+ SUBQ $0x01, R8
index_loop_encodeSnappyBetterBlockAsm:
- CMPQ DI, R9
+ CMPQ SI, R8
JAE search_loop_encodeSnappyBetterBlockAsm
- MOVQ (DX)(DI*1), R8
- MOVQ (DX)(R9*1), R10
- SHLQ $0x08, R8
- IMULQ SI, R8
- SHRQ $0x2f, R8
- SHLQ $0x08, R10
- IMULQ SI, R10
- SHRQ $0x2f, R10
- MOVL DI, 24(SP)(R8*4)
- MOVL R9, 24(SP)(R10*4)
- ADDQ $0x02, DI
- SUBQ $0x02, R9
+ MOVQ (DX)(SI*1), DI
+ MOVQ (DX)(R8*1), R9
+ SHLQ $0x08, DI
+ IMULQ BX, DI
+ SHRQ $0x2f, DI
+ SHLQ $0x08, R9
+ IMULQ BX, R9
+ SHRQ $0x2f, R9
+ MOVL SI, 24(SP)(DI*4)
+ MOVL R8, 24(SP)(R9*4)
+ ADDQ $0x02, SI
+ SUBQ $0x02, R8
JMP index_loop_encodeSnappyBetterBlockAsm
emit_remainder_encodeSnappyBetterBlockAsm:
@@ -14972,8 +14919,8 @@ zero_loop_encodeSnappyBetterBlockAsm64K:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -14983,309 +14930,309 @@ zero_loop_encodeSnappyBetterBlockAsm64K:
MOVQ src_base+24(FP), DX
search_loop_encodeSnappyBetterBlockAsm64K:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x07, SI
- LEAL 1(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x07, BX
+ LEAL 1(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeSnappyBetterBlockAsm64K
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x00cf1bbcdcbfa563, R9
- MOVQ $0x9e3779b1, SI
- MOVQ DI, R10
- MOVQ DI, R11
- SHLQ $0x08, R10
- IMULQ R9, R10
- SHRQ $0x30, R10
- SHLQ $0x20, R11
- IMULQ SI, R11
- SHRQ $0x32, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 262168(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- MOVL CX, 262168(SP)(R11*4)
- MOVQ (DX)(SI*1), R10
- MOVQ (DX)(R8*1), R11
- CMPQ R10, DI
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x00cf1bbcdcbfa563, R8
+ MOVQ $0x9e3779b1, BX
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHLQ $0x08, R9
+ IMULQ R8, R9
+ SHRQ $0x30, R9
+ SHLQ $0x20, R10
+ IMULQ BX, R10
+ SHRQ $0x32, R10
+ MOVL 24(SP)(R9*4), BX
+ MOVL 262168(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ MOVL CX, 262168(SP)(R10*4)
+ MOVQ (DX)(BX*1), R9
+ MOVQ (DX)(DI*1), R10
+ CMPQ R9, SI
JEQ candidate_match_encodeSnappyBetterBlockAsm64K
- CMPQ R11, DI
+ CMPQ R10, SI
JNE no_short_found_encodeSnappyBetterBlockAsm64K
- MOVL R8, SI
+ MOVL DI, BX
JMP candidate_match_encodeSnappyBetterBlockAsm64K
no_short_found_encodeSnappyBetterBlockAsm64K:
- CMPL R10, DI
+ CMPL R9, SI
JEQ candidate_match_encodeSnappyBetterBlockAsm64K
- CMPL R11, DI
+ CMPL R10, SI
JEQ candidateS_match_encodeSnappyBetterBlockAsm64K
MOVL 20(SP), CX
JMP search_loop_encodeSnappyBetterBlockAsm64K
candidateS_match_encodeSnappyBetterBlockAsm64K:
- SHRQ $0x08, DI
- MOVQ DI, R10
- SHLQ $0x08, R10
- IMULQ R9, R10
- SHRQ $0x30, R10
- MOVL 24(SP)(R10*4), SI
+ SHRQ $0x08, SI
+ MOVQ SI, R9
+ SHLQ $0x08, R9
+ IMULQ R8, R9
+ SHRQ $0x30, R9
+ MOVL 24(SP)(R9*4), BX
INCL CX
- MOVL CX, 24(SP)(R10*4)
- CMPL (DX)(SI*1), DI
+ MOVL CX, 24(SP)(R9*4)
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeSnappyBetterBlockAsm64K
DECL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeSnappyBetterBlockAsm64K:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K
match_extend_back_loop_encodeSnappyBetterBlockAsm64K:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeSnappyBetterBlockAsm64K
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeSnappyBetterBlockAsm64K
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K
JMP match_extend_back_loop_encodeSnappyBetterBlockAsm64K
match_extend_back_end_encodeSnappyBetterBlockAsm64K:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeSnappyBetterBlockAsm64K
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeSnappyBetterBlockAsm64K:
- MOVL CX, DI
+ MOVL CX, SI
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), R10
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), R9
// matchLen
- XORL R12, R12
- CMPL R8, $0x08
+ XORL R11, R11
+ CMPL DI, $0x08
JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K
matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K:
- MOVQ (R9)(R12*1), R11
- XORQ (R10)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R8)(R11*1), R10
+ XORQ (R9)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP match_nolit_end_encodeSnappyBetterBlockAsm64K
matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K:
- LEAL -8(R8), R8
- LEAL 8(R12), R12
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R11), R11
+ CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K
JZ match_nolit_end_encodeSnappyBetterBlockAsm64K
matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
- MOVL (R9)(R12*1), R11
- CMPL (R10)(R12*1), R11
+ MOVL (R8)(R11*1), R10
+ CMPL (R9)(R11*1), R10
JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
- SUBL $0x04, R8
- LEAL 4(R12), R12
+ SUBL $0x04, DI
+ LEAL 4(R11), R11
matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
- MOVW (R9)(R12*1), R11
- CMPW (R10)(R12*1), R11
+ MOVW (R8)(R11*1), R10
+ CMPW (R9)(R11*1), R10
JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
- SUBL $0x02, R8
- LEAL 2(R12), R12
+ SUBL $0x02, DI
+ LEAL 2(R11), R11
matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL match_nolit_end_encodeSnappyBetterBlockAsm64K
- MOVB (R9)(R12*1), R11
- CMPB (R10)(R12*1), R11
+ MOVB (R8)(R11*1), R10
+ CMPB (R9)(R11*1), R10
JNE match_nolit_end_encodeSnappyBetterBlockAsm64K
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
match_nolit_end_encodeSnappyBetterBlockAsm64K:
- MOVL CX, R8
- SUBL SI, R8
+ MOVL CX, DI
+ SUBL BX, DI
// Check if repeat
- MOVL R8, 16(SP)
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL DI, 16(SP)
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_encodeSnappyBetterBlockAsm64K
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm64K
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K
two_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_encodeSnappyBetterBlockAsm64K
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K
one_byte_match_emit_encodeSnappyBetterBlockAsm64K:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeSnappyBetterBlockAsm64K:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8:
- MOVQ (R10), R11
- MOVQ R11, (AX)
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
memmove_long_match_emit_encodeSnappyBetterBlockAsm64K:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitCopy
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K
MOVB $0xee, (AX)
- MOVW R8, 1(AX)
- LEAL -60(R12), R12
+ MOVW DI, 1(AX)
+ LEAL -60(R11), R11
ADDQ $0x03, AX
JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K
two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K:
- CMPL R12, $0x0c
+ MOVL R11, BX
+ SHLL $0x02, BX
+ CMPL R11, $0x0c
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
- MOVB $0x01, BL
- LEAL -16(BX)(R12*4), R12
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ LEAL -15(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K
emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K:
- MOVB $0x02, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVW R8, 1(AX)
+ LEAL -2(BX), BX
+ MOVB BL, (AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K:
@@ -15297,50 +15244,50 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K:
RET
match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K:
- MOVQ $0x00cf1bbcdcbfa563, SI
- MOVQ $0x9e3779b1, R8
- LEAQ 1(DI), DI
- LEAQ -2(CX), R9
- MOVQ (DX)(DI*1), R10
- MOVQ 1(DX)(DI*1), R11
- MOVQ (DX)(R9*1), R12
- MOVQ 1(DX)(R9*1), R13
- SHLQ $0x08, R10
- IMULQ SI, R10
- SHRQ $0x30, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x32, R11
- SHLQ $0x08, R12
- IMULQ SI, R12
- SHRQ $0x30, R12
- SHLQ $0x20, R13
- IMULQ R8, R13
- SHRQ $0x32, R13
- LEAQ 1(DI), R8
- LEAQ 1(R9), R14
- MOVL DI, 24(SP)(R10*4)
- MOVL R9, 24(SP)(R12*4)
- MOVL R8, 262168(SP)(R11*4)
- MOVL R14, 262168(SP)(R13*4)
- ADDQ $0x01, DI
- SUBQ $0x01, R9
+ MOVQ $0x00cf1bbcdcbfa563, BX
+ MOVQ $0x9e3779b1, DI
+ LEAQ 1(SI), SI
+ LEAQ -2(CX), R8
+ MOVQ (DX)(SI*1), R9
+ MOVQ 1(DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ MOVQ 1(DX)(R8*1), R12
+ SHLQ $0x08, R9
+ IMULQ BX, R9
+ SHRQ $0x30, R9
+ SHLQ $0x20, R10
+ IMULQ DI, R10
+ SHRQ $0x32, R10
+ SHLQ $0x08, R11
+ IMULQ BX, R11
+ SHRQ $0x30, R11
+ SHLQ $0x20, R12
+ IMULQ DI, R12
+ SHRQ $0x32, R12
+ LEAQ 1(SI), DI
+ LEAQ 1(R8), R13
+ MOVL SI, 24(SP)(R9*4)
+ MOVL R8, 24(SP)(R11*4)
+ MOVL DI, 262168(SP)(R10*4)
+ MOVL R13, 262168(SP)(R12*4)
+ ADDQ $0x01, SI
+ SUBQ $0x01, R8
index_loop_encodeSnappyBetterBlockAsm64K:
- CMPQ DI, R9
+ CMPQ SI, R8
JAE search_loop_encodeSnappyBetterBlockAsm64K
- MOVQ (DX)(DI*1), R8
- MOVQ (DX)(R9*1), R10
- SHLQ $0x08, R8
- IMULQ SI, R8
- SHRQ $0x30, R8
- SHLQ $0x08, R10
- IMULQ SI, R10
- SHRQ $0x30, R10
- MOVL DI, 24(SP)(R8*4)
- MOVL R9, 24(SP)(R10*4)
- ADDQ $0x02, DI
- SUBQ $0x02, R9
+ MOVQ (DX)(SI*1), DI
+ MOVQ (DX)(R8*1), R9
+ SHLQ $0x08, DI
+ IMULQ BX, DI
+ SHRQ $0x30, DI
+ SHLQ $0x08, R9
+ IMULQ BX, R9
+ SHRQ $0x30, R9
+ MOVL SI, 24(SP)(DI*4)
+ MOVL R8, 24(SP)(R9*4)
+ ADDQ $0x02, SI
+ SUBQ $0x02, R8
JMP index_loop_encodeSnappyBetterBlockAsm64K
emit_remainder_encodeSnappyBetterBlockAsm64K:
@@ -15523,8 +15470,8 @@ zero_loop_encodeSnappyBetterBlockAsm12B:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -15534,309 +15481,309 @@ zero_loop_encodeSnappyBetterBlockAsm12B:
MOVQ src_base+24(FP), DX
search_loop_encodeSnappyBetterBlockAsm12B:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x06, SI
- LEAL 1(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x06, BX
+ LEAL 1(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeSnappyBetterBlockAsm12B
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ $0x9e3779b1, SI
- MOVQ DI, R10
- MOVQ DI, R11
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x32, R10
- SHLQ $0x20, R11
- IMULQ SI, R11
- SHRQ $0x34, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 65560(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- MOVL CX, 65560(SP)(R11*4)
- MOVQ (DX)(SI*1), R10
- MOVQ (DX)(R8*1), R11
- CMPQ R10, DI
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ $0x9e3779b1, BX
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x32, R9
+ SHLQ $0x20, R10
+ IMULQ BX, R10
+ SHRQ $0x34, R10
+ MOVL 24(SP)(R9*4), BX
+ MOVL 65560(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ MOVL CX, 65560(SP)(R10*4)
+ MOVQ (DX)(BX*1), R9
+ MOVQ (DX)(DI*1), R10
+ CMPQ R9, SI
JEQ candidate_match_encodeSnappyBetterBlockAsm12B
- CMPQ R11, DI
+ CMPQ R10, SI
JNE no_short_found_encodeSnappyBetterBlockAsm12B
- MOVL R8, SI
+ MOVL DI, BX
JMP candidate_match_encodeSnappyBetterBlockAsm12B
no_short_found_encodeSnappyBetterBlockAsm12B:
- CMPL R10, DI
+ CMPL R9, SI
JEQ candidate_match_encodeSnappyBetterBlockAsm12B
- CMPL R11, DI
+ CMPL R10, SI
JEQ candidateS_match_encodeSnappyBetterBlockAsm12B
MOVL 20(SP), CX
JMP search_loop_encodeSnappyBetterBlockAsm12B
candidateS_match_encodeSnappyBetterBlockAsm12B:
- SHRQ $0x08, DI
- MOVQ DI, R10
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x32, R10
- MOVL 24(SP)(R10*4), SI
+ SHRQ $0x08, SI
+ MOVQ SI, R9
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x32, R9
+ MOVL 24(SP)(R9*4), BX
INCL CX
- MOVL CX, 24(SP)(R10*4)
- CMPL (DX)(SI*1), DI
+ MOVL CX, 24(SP)(R9*4)
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeSnappyBetterBlockAsm12B
DECL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeSnappyBetterBlockAsm12B:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B
match_extend_back_loop_encodeSnappyBetterBlockAsm12B:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeSnappyBetterBlockAsm12B
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeSnappyBetterBlockAsm12B
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B
JMP match_extend_back_loop_encodeSnappyBetterBlockAsm12B
match_extend_back_end_encodeSnappyBetterBlockAsm12B:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeSnappyBetterBlockAsm12B
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeSnappyBetterBlockAsm12B:
- MOVL CX, DI
+ MOVL CX, SI
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), R10
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), R9
// matchLen
- XORL R12, R12
- CMPL R8, $0x08
+ XORL R11, R11
+ CMPL DI, $0x08
JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B
matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B:
- MOVQ (R9)(R12*1), R11
- XORQ (R10)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R8)(R11*1), R10
+ XORQ (R9)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP match_nolit_end_encodeSnappyBetterBlockAsm12B
matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B:
- LEAL -8(R8), R8
- LEAL 8(R12), R12
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R11), R11
+ CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B
JZ match_nolit_end_encodeSnappyBetterBlockAsm12B
matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B
- MOVL (R9)(R12*1), R11
- CMPL (R10)(R12*1), R11
+ MOVL (R8)(R11*1), R10
+ CMPL (R9)(R11*1), R10
JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B
- SUBL $0x04, R8
- LEAL 4(R12), R12
+ SUBL $0x04, DI
+ LEAL 4(R11), R11
matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B
- MOVW (R9)(R12*1), R11
- CMPW (R10)(R12*1), R11
+ MOVW (R8)(R11*1), R10
+ CMPW (R9)(R11*1), R10
JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B
- SUBL $0x02, R8
- LEAL 2(R12), R12
+ SUBL $0x02, DI
+ LEAL 2(R11), R11
matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL match_nolit_end_encodeSnappyBetterBlockAsm12B
- MOVB (R9)(R12*1), R11
- CMPB (R10)(R12*1), R11
+ MOVB (R8)(R11*1), R10
+ CMPB (R9)(R11*1), R10
JNE match_nolit_end_encodeSnappyBetterBlockAsm12B
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
match_nolit_end_encodeSnappyBetterBlockAsm12B:
- MOVL CX, R8
- SUBL SI, R8
+ MOVL CX, DI
+ SUBL BX, DI
// Check if repeat
- MOVL R8, 16(SP)
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL DI, 16(SP)
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_encodeSnappyBetterBlockAsm12B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm12B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B
two_bytes_match_emit_encodeSnappyBetterBlockAsm12B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_encodeSnappyBetterBlockAsm12B
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B
one_byte_match_emit_encodeSnappyBetterBlockAsm12B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeSnappyBetterBlockAsm12B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8:
- MOVQ (R10), R11
- MOVQ R11, (AX)
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B
memmove_long_match_emit_encodeSnappyBetterBlockAsm12B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitCopy
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B
MOVB $0xee, (AX)
- MOVW R8, 1(AX)
- LEAL -60(R12), R12
+ MOVW DI, 1(AX)
+ LEAL -60(R11), R11
ADDQ $0x03, AX
JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B
two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B:
- CMPL R12, $0x0c
+ MOVL R11, BX
+ SHLL $0x02, BX
+ CMPL R11, $0x0c
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B
- MOVB $0x01, BL
- LEAL -16(BX)(R12*4), R12
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ LEAL -15(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B
emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B:
- MOVB $0x02, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVW R8, 1(AX)
+ LEAL -2(BX), BX
+ MOVB BL, (AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B:
@@ -15848,50 +15795,50 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B:
RET
match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B:
- MOVQ $0x0000cf1bbcdcbf9b, SI
- MOVQ $0x9e3779b1, R8
- LEAQ 1(DI), DI
- LEAQ -2(CX), R9
- MOVQ (DX)(DI*1), R10
- MOVQ 1(DX)(DI*1), R11
- MOVQ (DX)(R9*1), R12
- MOVQ 1(DX)(R9*1), R13
- SHLQ $0x10, R10
- IMULQ SI, R10
- SHRQ $0x32, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x34, R11
- SHLQ $0x10, R12
- IMULQ SI, R12
- SHRQ $0x32, R12
- SHLQ $0x20, R13
- IMULQ R8, R13
- SHRQ $0x34, R13
- LEAQ 1(DI), R8
- LEAQ 1(R9), R14
- MOVL DI, 24(SP)(R10*4)
- MOVL R9, 24(SP)(R12*4)
- MOVL R8, 65560(SP)(R11*4)
- MOVL R14, 65560(SP)(R13*4)
- ADDQ $0x01, DI
- SUBQ $0x01, R9
+ MOVQ $0x0000cf1bbcdcbf9b, BX
+ MOVQ $0x9e3779b1, DI
+ LEAQ 1(SI), SI
+ LEAQ -2(CX), R8
+ MOVQ (DX)(SI*1), R9
+ MOVQ 1(DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ MOVQ 1(DX)(R8*1), R12
+ SHLQ $0x10, R9
+ IMULQ BX, R9
+ SHRQ $0x32, R9
+ SHLQ $0x20, R10
+ IMULQ DI, R10
+ SHRQ $0x34, R10
+ SHLQ $0x10, R11
+ IMULQ BX, R11
+ SHRQ $0x32, R11
+ SHLQ $0x20, R12
+ IMULQ DI, R12
+ SHRQ $0x34, R12
+ LEAQ 1(SI), DI
+ LEAQ 1(R8), R13
+ MOVL SI, 24(SP)(R9*4)
+ MOVL R8, 24(SP)(R11*4)
+ MOVL DI, 65560(SP)(R10*4)
+ MOVL R13, 65560(SP)(R12*4)
+ ADDQ $0x01, SI
+ SUBQ $0x01, R8
index_loop_encodeSnappyBetterBlockAsm12B:
- CMPQ DI, R9
+ CMPQ SI, R8
JAE search_loop_encodeSnappyBetterBlockAsm12B
- MOVQ (DX)(DI*1), R8
- MOVQ (DX)(R9*1), R10
- SHLQ $0x10, R8
- IMULQ SI, R8
- SHRQ $0x32, R8
- SHLQ $0x10, R10
- IMULQ SI, R10
- SHRQ $0x32, R10
- MOVL DI, 24(SP)(R8*4)
- MOVL R9, 24(SP)(R10*4)
- ADDQ $0x02, DI
- SUBQ $0x02, R9
+ MOVQ (DX)(SI*1), DI
+ MOVQ (DX)(R8*1), R9
+ SHLQ $0x10, DI
+ IMULQ BX, DI
+ SHRQ $0x32, DI
+ SHLQ $0x10, R9
+ IMULQ BX, R9
+ SHRQ $0x32, R9
+ MOVL SI, 24(SP)(DI*4)
+ MOVL R8, 24(SP)(R9*4)
+ ADDQ $0x02, SI
+ SUBQ $0x02, R8
JMP index_loop_encodeSnappyBetterBlockAsm12B
emit_remainder_encodeSnappyBetterBlockAsm12B:
@@ -16074,8 +16021,8 @@ zero_loop_encodeSnappyBetterBlockAsm10B:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -16085,309 +16032,309 @@ zero_loop_encodeSnappyBetterBlockAsm10B:
MOVQ src_base+24(FP), DX
search_loop_encodeSnappyBetterBlockAsm10B:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x05, SI
- LEAL 1(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x05, BX
+ LEAL 1(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeSnappyBetterBlockAsm10B
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ $0x9e3779b1, SI
- MOVQ DI, R10
- MOVQ DI, R11
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x34, R10
- SHLQ $0x20, R11
- IMULQ SI, R11
- SHRQ $0x36, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 16408(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- MOVL CX, 16408(SP)(R11*4)
- MOVQ (DX)(SI*1), R10
- MOVQ (DX)(R8*1), R11
- CMPQ R10, DI
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ $0x9e3779b1, BX
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x34, R9
+ SHLQ $0x20, R10
+ IMULQ BX, R10
+ SHRQ $0x36, R10
+ MOVL 24(SP)(R9*4), BX
+ MOVL 16408(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ MOVL CX, 16408(SP)(R10*4)
+ MOVQ (DX)(BX*1), R9
+ MOVQ (DX)(DI*1), R10
+ CMPQ R9, SI
JEQ candidate_match_encodeSnappyBetterBlockAsm10B
- CMPQ R11, DI
+ CMPQ R10, SI
JNE no_short_found_encodeSnappyBetterBlockAsm10B
- MOVL R8, SI
+ MOVL DI, BX
JMP candidate_match_encodeSnappyBetterBlockAsm10B
no_short_found_encodeSnappyBetterBlockAsm10B:
- CMPL R10, DI
+ CMPL R9, SI
JEQ candidate_match_encodeSnappyBetterBlockAsm10B
- CMPL R11, DI
+ CMPL R10, SI
JEQ candidateS_match_encodeSnappyBetterBlockAsm10B
MOVL 20(SP), CX
JMP search_loop_encodeSnappyBetterBlockAsm10B
candidateS_match_encodeSnappyBetterBlockAsm10B:
- SHRQ $0x08, DI
- MOVQ DI, R10
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x34, R10
- MOVL 24(SP)(R10*4), SI
+ SHRQ $0x08, SI
+ MOVQ SI, R9
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x34, R9
+ MOVL 24(SP)(R9*4), BX
INCL CX
- MOVL CX, 24(SP)(R10*4)
- CMPL (DX)(SI*1), DI
+ MOVL CX, 24(SP)(R9*4)
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeSnappyBetterBlockAsm10B
DECL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeSnappyBetterBlockAsm10B:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B
match_extend_back_loop_encodeSnappyBetterBlockAsm10B:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeSnappyBetterBlockAsm10B
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeSnappyBetterBlockAsm10B
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B
JMP match_extend_back_loop_encodeSnappyBetterBlockAsm10B
match_extend_back_end_encodeSnappyBetterBlockAsm10B:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeSnappyBetterBlockAsm10B
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeSnappyBetterBlockAsm10B:
- MOVL CX, DI
+ MOVL CX, SI
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), R10
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), R9
// matchLen
- XORL R12, R12
- CMPL R8, $0x08
+ XORL R11, R11
+ CMPL DI, $0x08
JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B
matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B:
- MOVQ (R9)(R12*1), R11
- XORQ (R10)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R8)(R11*1), R10
+ XORQ (R9)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP match_nolit_end_encodeSnappyBetterBlockAsm10B
matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B:
- LEAL -8(R8), R8
- LEAL 8(R12), R12
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R11), R11
+ CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B
JZ match_nolit_end_encodeSnappyBetterBlockAsm10B
matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B
- MOVL (R9)(R12*1), R11
- CMPL (R10)(R12*1), R11
+ MOVL (R8)(R11*1), R10
+ CMPL (R9)(R11*1), R10
JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B
- SUBL $0x04, R8
- LEAL 4(R12), R12
+ SUBL $0x04, DI
+ LEAL 4(R11), R11
matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B
- MOVW (R9)(R12*1), R11
- CMPW (R10)(R12*1), R11
+ MOVW (R8)(R11*1), R10
+ CMPW (R9)(R11*1), R10
JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B
- SUBL $0x02, R8
- LEAL 2(R12), R12
+ SUBL $0x02, DI
+ LEAL 2(R11), R11
matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL match_nolit_end_encodeSnappyBetterBlockAsm10B
- MOVB (R9)(R12*1), R11
- CMPB (R10)(R12*1), R11
+ MOVB (R8)(R11*1), R10
+ CMPB (R9)(R11*1), R10
JNE match_nolit_end_encodeSnappyBetterBlockAsm10B
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
match_nolit_end_encodeSnappyBetterBlockAsm10B:
- MOVL CX, R8
- SUBL SI, R8
+ MOVL CX, DI
+ SUBL BX, DI
// Check if repeat
- MOVL R8, 16(SP)
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL DI, 16(SP)
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_encodeSnappyBetterBlockAsm10B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm10B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B
two_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_encodeSnappyBetterBlockAsm10B
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B
one_byte_match_emit_encodeSnappyBetterBlockAsm10B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeSnappyBetterBlockAsm10B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8:
- MOVQ (R10), R11
- MOVQ R11, (AX)
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
memmove_long_match_emit_encodeSnappyBetterBlockAsm10B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitCopy
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B
MOVB $0xee, (AX)
- MOVW R8, 1(AX)
- LEAL -60(R12), R12
+ MOVW DI, 1(AX)
+ LEAL -60(R11), R11
ADDQ $0x03, AX
JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B
two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B:
- CMPL R12, $0x0c
+ MOVL R11, BX
+ SHLL $0x02, BX
+ CMPL R11, $0x0c
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
- CMPL R8, $0x00000800
+ CMPL DI, $0x00000800
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
- MOVB $0x01, BL
- LEAL -16(BX)(R12*4), R12
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ LEAL -15(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B
emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B:
- MOVB $0x02, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVW R8, 1(AX)
+ LEAL -2(BX), BX
+ MOVB BL, (AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B:
@@ -16399,50 +16346,50 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B:
RET
match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B:
- MOVQ $0x0000cf1bbcdcbf9b, SI
- MOVQ $0x9e3779b1, R8
- LEAQ 1(DI), DI
- LEAQ -2(CX), R9
- MOVQ (DX)(DI*1), R10
- MOVQ 1(DX)(DI*1), R11
- MOVQ (DX)(R9*1), R12
- MOVQ 1(DX)(R9*1), R13
- SHLQ $0x10, R10
- IMULQ SI, R10
- SHRQ $0x34, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x36, R11
- SHLQ $0x10, R12
- IMULQ SI, R12
- SHRQ $0x34, R12
- SHLQ $0x20, R13
- IMULQ R8, R13
- SHRQ $0x36, R13
- LEAQ 1(DI), R8
- LEAQ 1(R9), R14
- MOVL DI, 24(SP)(R10*4)
- MOVL R9, 24(SP)(R12*4)
- MOVL R8, 16408(SP)(R11*4)
- MOVL R14, 16408(SP)(R13*4)
- ADDQ $0x01, DI
- SUBQ $0x01, R9
+ MOVQ $0x0000cf1bbcdcbf9b, BX
+ MOVQ $0x9e3779b1, DI
+ LEAQ 1(SI), SI
+ LEAQ -2(CX), R8
+ MOVQ (DX)(SI*1), R9
+ MOVQ 1(DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ MOVQ 1(DX)(R8*1), R12
+ SHLQ $0x10, R9
+ IMULQ BX, R9
+ SHRQ $0x34, R9
+ SHLQ $0x20, R10
+ IMULQ DI, R10
+ SHRQ $0x36, R10
+ SHLQ $0x10, R11
+ IMULQ BX, R11
+ SHRQ $0x34, R11
+ SHLQ $0x20, R12
+ IMULQ DI, R12
+ SHRQ $0x36, R12
+ LEAQ 1(SI), DI
+ LEAQ 1(R8), R13
+ MOVL SI, 24(SP)(R9*4)
+ MOVL R8, 24(SP)(R11*4)
+ MOVL DI, 16408(SP)(R10*4)
+ MOVL R13, 16408(SP)(R12*4)
+ ADDQ $0x01, SI
+ SUBQ $0x01, R8
index_loop_encodeSnappyBetterBlockAsm10B:
- CMPQ DI, R9
+ CMPQ SI, R8
JAE search_loop_encodeSnappyBetterBlockAsm10B
- MOVQ (DX)(DI*1), R8
- MOVQ (DX)(R9*1), R10
- SHLQ $0x10, R8
- IMULQ SI, R8
- SHRQ $0x34, R8
- SHLQ $0x10, R10
- IMULQ SI, R10
- SHRQ $0x34, R10
- MOVL DI, 24(SP)(R8*4)
- MOVL R9, 24(SP)(R10*4)
- ADDQ $0x02, DI
- SUBQ $0x02, R9
+ MOVQ (DX)(SI*1), DI
+ MOVQ (DX)(R8*1), R9
+ SHLQ $0x10, DI
+ IMULQ BX, DI
+ SHRQ $0x34, DI
+ SHLQ $0x10, R9
+ IMULQ BX, R9
+ SHRQ $0x34, R9
+ MOVL SI, 24(SP)(DI*4)
+ MOVL R8, 24(SP)(R9*4)
+ ADDQ $0x02, SI
+ SUBQ $0x02, R8
JMP index_loop_encodeSnappyBetterBlockAsm10B
emit_remainder_encodeSnappyBetterBlockAsm10B:
@@ -16625,8 +16572,8 @@ zero_loop_encodeSnappyBetterBlockAsm8B:
MOVL $0x00000000, 12(SP)
MOVQ src_len+32(FP), CX
LEAQ -9(CX), DX
- LEAQ -8(CX), SI
- MOVL SI, 8(SP)
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP)
SHRQ $0x05, CX
SUBL CX, DX
LEAQ (AX)(DX*1), DX
@@ -16636,307 +16583,307 @@ zero_loop_encodeSnappyBetterBlockAsm8B:
MOVQ src_base+24(FP), DX
search_loop_encodeSnappyBetterBlockAsm8B:
- MOVL CX, SI
- SUBL 12(SP), SI
- SHRL $0x04, SI
- LEAL 1(CX)(SI*1), SI
- CMPL SI, 8(SP)
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x04, BX
+ LEAL 1(CX)(BX*1), BX
+ CMPL BX, 8(SP)
JGE emit_remainder_encodeSnappyBetterBlockAsm8B
- MOVQ (DX)(CX*1), DI
- MOVL SI, 20(SP)
- MOVQ $0x0000cf1bbcdcbf9b, R9
- MOVQ $0x9e3779b1, SI
- MOVQ DI, R10
- MOVQ DI, R11
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x36, R10
- SHLQ $0x20, R11
- IMULQ SI, R11
- SHRQ $0x38, R11
- MOVL 24(SP)(R10*4), SI
- MOVL 4120(SP)(R11*4), R8
- MOVL CX, 24(SP)(R10*4)
- MOVL CX, 4120(SP)(R11*4)
- MOVQ (DX)(SI*1), R10
- MOVQ (DX)(R8*1), R11
- CMPQ R10, DI
+ MOVQ (DX)(CX*1), SI
+ MOVL BX, 20(SP)
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ $0x9e3779b1, BX
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x36, R9
+ SHLQ $0x20, R10
+ IMULQ BX, R10
+ SHRQ $0x38, R10
+ MOVL 24(SP)(R9*4), BX
+ MOVL 4120(SP)(R10*4), DI
+ MOVL CX, 24(SP)(R9*4)
+ MOVL CX, 4120(SP)(R10*4)
+ MOVQ (DX)(BX*1), R9
+ MOVQ (DX)(DI*1), R10
+ CMPQ R9, SI
JEQ candidate_match_encodeSnappyBetterBlockAsm8B
- CMPQ R11, DI
+ CMPQ R10, SI
JNE no_short_found_encodeSnappyBetterBlockAsm8B
- MOVL R8, SI
+ MOVL DI, BX
JMP candidate_match_encodeSnappyBetterBlockAsm8B
no_short_found_encodeSnappyBetterBlockAsm8B:
- CMPL R10, DI
+ CMPL R9, SI
JEQ candidate_match_encodeSnappyBetterBlockAsm8B
- CMPL R11, DI
+ CMPL R10, SI
JEQ candidateS_match_encodeSnappyBetterBlockAsm8B
MOVL 20(SP), CX
JMP search_loop_encodeSnappyBetterBlockAsm8B
candidateS_match_encodeSnappyBetterBlockAsm8B:
- SHRQ $0x08, DI
- MOVQ DI, R10
- SHLQ $0x10, R10
- IMULQ R9, R10
- SHRQ $0x36, R10
- MOVL 24(SP)(R10*4), SI
+ SHRQ $0x08, SI
+ MOVQ SI, R9
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x36, R9
+ MOVL 24(SP)(R9*4), BX
INCL CX
- MOVL CX, 24(SP)(R10*4)
- CMPL (DX)(SI*1), DI
+ MOVL CX, 24(SP)(R9*4)
+ CMPL (DX)(BX*1), SI
JEQ candidate_match_encodeSnappyBetterBlockAsm8B
DECL CX
- MOVL R8, SI
+ MOVL DI, BX
candidate_match_encodeSnappyBetterBlockAsm8B:
- MOVL 12(SP), DI
- TESTL SI, SI
+ MOVL 12(SP), SI
+ TESTL BX, BX
JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B
match_extend_back_loop_encodeSnappyBetterBlockAsm8B:
- CMPL CX, DI
+ CMPL CX, SI
JLE match_extend_back_end_encodeSnappyBetterBlockAsm8B
- MOVB -1(DX)(SI*1), BL
+ MOVB -1(DX)(BX*1), DI
MOVB -1(DX)(CX*1), R8
- CMPB BL, R8
+ CMPB DI, R8
JNE match_extend_back_end_encodeSnappyBetterBlockAsm8B
LEAL -1(CX), CX
- DECL SI
+ DECL BX
JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B
JMP match_extend_back_loop_encodeSnappyBetterBlockAsm8B
match_extend_back_end_encodeSnappyBetterBlockAsm8B:
- MOVL CX, DI
- SUBL 12(SP), DI
- LEAQ 3(AX)(DI*1), DI
- CMPQ DI, (SP)
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI
+ CMPQ SI, (SP)
JL match_dst_size_check_encodeSnappyBetterBlockAsm8B
MOVQ $0x00000000, ret+48(FP)
RET
match_dst_size_check_encodeSnappyBetterBlockAsm8B:
- MOVL CX, DI
+ MOVL CX, SI
ADDL $0x04, CX
- ADDL $0x04, SI
- MOVQ src_len+32(FP), R8
- SUBL CX, R8
- LEAQ (DX)(CX*1), R9
- LEAQ (DX)(SI*1), R10
+ ADDL $0x04, BX
+ MOVQ src_len+32(FP), DI
+ SUBL CX, DI
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), R9
// matchLen
- XORL R12, R12
- CMPL R8, $0x08
+ XORL R11, R11
+ CMPL DI, $0x08
JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B
matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B:
- MOVQ (R9)(R12*1), R11
- XORQ (R10)(R12*1), R11
- TESTQ R11, R11
+ MOVQ (R8)(R11*1), R10
+ XORQ (R9)(R11*1), R10
+ TESTQ R10, R10
JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B
#ifdef GOAMD64_v3
- TZCNTQ R11, R11
+ TZCNTQ R10, R10
#else
- BSFQ R11, R11
+ BSFQ R10, R10
#endif
- SARQ $0x03, R11
- LEAL (R12)(R11*1), R12
+ SARQ $0x03, R10
+ LEAL (R11)(R10*1), R11
JMP match_nolit_end_encodeSnappyBetterBlockAsm8B
matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B:
- LEAL -8(R8), R8
- LEAL 8(R12), R12
- CMPL R8, $0x08
+ LEAL -8(DI), DI
+ LEAL 8(R11), R11
+ CMPL DI, $0x08
JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B
JZ match_nolit_end_encodeSnappyBetterBlockAsm8B
matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B:
- CMPL R8, $0x04
+ CMPL DI, $0x04
JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B
- MOVL (R9)(R12*1), R11
- CMPL (R10)(R12*1), R11
+ MOVL (R8)(R11*1), R10
+ CMPL (R9)(R11*1), R10
JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B
- SUBL $0x04, R8
- LEAL 4(R12), R12
+ SUBL $0x04, DI
+ LEAL 4(R11), R11
matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B:
- CMPL R8, $0x02
+ CMPL DI, $0x02
JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B
- MOVW (R9)(R12*1), R11
- CMPW (R10)(R12*1), R11
+ MOVW (R8)(R11*1), R10
+ CMPW (R9)(R11*1), R10
JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B
- SUBL $0x02, R8
- LEAL 2(R12), R12
+ SUBL $0x02, DI
+ LEAL 2(R11), R11
matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B:
- CMPL R8, $0x01
+ CMPL DI, $0x01
JL match_nolit_end_encodeSnappyBetterBlockAsm8B
- MOVB (R9)(R12*1), R11
- CMPB (R10)(R12*1), R11
+ MOVB (R8)(R11*1), R10
+ CMPB (R9)(R11*1), R10
JNE match_nolit_end_encodeSnappyBetterBlockAsm8B
- LEAL 1(R12), R12
+ LEAL 1(R11), R11
match_nolit_end_encodeSnappyBetterBlockAsm8B:
- MOVL CX, R8
- SUBL SI, R8
+ MOVL CX, DI
+ SUBL BX, DI
// Check if repeat
- MOVL R8, 16(SP)
- MOVL 12(SP), SI
- CMPL SI, DI
+ MOVL DI, 16(SP)
+ MOVL 12(SP), BX
+ CMPL BX, SI
JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B
- MOVL DI, R9
- MOVL DI, 12(SP)
- LEAQ (DX)(SI*1), R10
- SUBL SI, R9
- LEAL -1(R9), SI
- CMPL SI, $0x3c
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R9
+ SUBL BX, R8
+ LEAL -1(R8), BX
+ CMPL BX, $0x3c
JLT one_byte_match_emit_encodeSnappyBetterBlockAsm8B
- CMPL SI, $0x00000100
+ CMPL BX, $0x00000100
JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm8B
MOVB $0xf4, (AX)
- MOVW SI, 1(AX)
+ MOVW BX, 1(AX)
ADDQ $0x03, AX
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B
two_bytes_match_emit_encodeSnappyBetterBlockAsm8B:
MOVB $0xf0, (AX)
- MOVB SI, 1(AX)
+ MOVB BL, 1(AX)
ADDQ $0x02, AX
- CMPL SI, $0x40
+ CMPL BX, $0x40
JL memmove_match_emit_encodeSnappyBetterBlockAsm8B
JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B
one_byte_match_emit_encodeSnappyBetterBlockAsm8B:
- SHLB $0x02, SI
- MOVB SI, (AX)
+ SHLB $0x02, BL
+ MOVB BL, (AX)
ADDQ $0x01, AX
memmove_match_emit_encodeSnappyBetterBlockAsm8B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveShort
- CMPQ R9, $0x08
+ CMPQ R8, $0x08
JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8
- CMPQ R9, $0x10
+ CMPQ R8, $0x10
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
- CMPQ R9, $0x20
+ CMPQ R8, $0x20
JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8:
- MOVQ (R10), R11
- MOVQ R11, (AX)
+ MOVQ (R9), R10
+ MOVQ R10, (AX)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
- MOVQ (R10), R11
- MOVQ -8(R10)(R9*1), R10
- MOVQ R11, (AX)
- MOVQ R10, -8(AX)(R9*1)
+ MOVQ (R9), R10
+ MOVQ -8(R9)(R8*1), R9
+ MOVQ R10, (AX)
+ MOVQ R9, -8(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
- MOVOU (R10), X0
- MOVOU -16(R10)(R9*1), X1
+ MOVOU (R9), X0
+ MOVOU -16(R9)(R8*1), X1
MOVOU X0, (AX)
- MOVOU X1, -16(AX)(R9*1)
+ MOVOU X1, -16(AX)(R8*1)
JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B:
- MOVQ SI, AX
+ MOVQ BX, AX
JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B
memmove_long_match_emit_encodeSnappyBetterBlockAsm8B:
- LEAQ (AX)(R9*1), SI
+ LEAQ (AX)(R8*1), BX
// genMemMoveLong
- MOVOU (R10), X0
- MOVOU 16(R10), X1
- MOVOU -32(R10)(R9*1), X2
- MOVOU -16(R10)(R9*1), X3
- MOVQ R9, R13
- SHRQ $0x05, R13
- MOVQ AX, R11
- ANDL $0x0000001f, R11
- MOVQ $0x00000040, R14
- SUBQ R11, R14
- DECQ R13
+ MOVOU (R9), X0
+ MOVOU 16(R9), X1
+ MOVOU -32(R9)(R8*1), X2
+ MOVOU -16(R9)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12
+ MOVQ AX, R10
+ ANDL $0x0000001f, R10
+ MOVQ $0x00000040, R13
+ SUBQ R10, R13
+ DECQ R12
JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
- LEAQ -32(R10)(R14*1), R11
- LEAQ -32(AX)(R14*1), R15
+ LEAQ -32(R9)(R13*1), R10
+ LEAQ -32(AX)(R13*1), R14
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
- MOVOU (R11), X4
- MOVOU 16(R11), X5
- MOVOA X4, (R15)
- MOVOA X5, 16(R15)
- ADDQ $0x20, R15
- ADDQ $0x20, R11
+ MOVOU (R10), X4
+ MOVOU 16(R10), X5
+ MOVOA X4, (R14)
+ MOVOA X5, 16(R14)
ADDQ $0x20, R14
- DECQ R13
+ ADDQ $0x20, R10
+ ADDQ $0x20, R13
+ DECQ R12
JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
- MOVOU -32(R10)(R14*1), X4
- MOVOU -16(R10)(R14*1), X5
- MOVOA X4, -32(AX)(R14*1)
- MOVOA X5, -16(AX)(R14*1)
- ADDQ $0x20, R14
- CMPQ R9, R14
+ MOVOU -32(R9)(R13*1), X4
+ MOVOU -16(R9)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1)
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
MOVOU X0, (AX)
MOVOU X1, 16(AX)
- MOVOU X2, -32(AX)(R9*1)
- MOVOU X3, -16(AX)(R9*1)
- MOVQ SI, AX
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ BX, AX
emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B:
- ADDL R12, CX
- ADDL $0x04, R12
+ ADDL R11, CX
+ ADDL $0x04, R11
MOVL CX, 12(SP)
// emitCopy
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B:
- CMPL R12, $0x40
+ CMPL R11, $0x40
JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B
MOVB $0xee, (AX)
- MOVW R8, 1(AX)
- LEAL -60(R12), R12
+ MOVW DI, 1(AX)
+ LEAL -60(R11), R11
ADDQ $0x03, AX
JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B
two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B:
- CMPL R12, $0x0c
+ MOVL R11, BX
+ SHLL $0x02, BX
+ CMPL R11, $0x0c
JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B
- MOVB $0x01, BL
- LEAL -16(BX)(R12*4), R12
- MOVB R8, 1(AX)
- SHRL $0x08, R8
- SHLL $0x05, R8
- ORL R8, R12
- MOVB R12, (AX)
+ LEAL -15(BX), BX
+ MOVB DI, 1(AX)
+ SHRL $0x08, DI
+ SHLL $0x05, DI
+ ORL DI, BX
+ MOVB BL, (AX)
ADDQ $0x02, AX
JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B
emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B:
- MOVB $0x02, BL
- LEAL -4(BX)(R12*4), R12
- MOVB R12, (AX)
- MOVW R8, 1(AX)
+ LEAL -2(BX), BX
+ MOVB BL, (AX)
+ MOVW DI, 1(AX)
ADDQ $0x03, AX
match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B:
@@ -16948,50 +16895,50 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B:
RET
match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B:
- MOVQ $0x0000cf1bbcdcbf9b, SI
- MOVQ $0x9e3779b1, R8
- LEAQ 1(DI), DI
- LEAQ -2(CX), R9
- MOVQ (DX)(DI*1), R10
- MOVQ 1(DX)(DI*1), R11
- MOVQ (DX)(R9*1), R12
- MOVQ 1(DX)(R9*1), R13
- SHLQ $0x10, R10
- IMULQ SI, R10
- SHRQ $0x36, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x38, R11
- SHLQ $0x10, R12
- IMULQ SI, R12
- SHRQ $0x36, R12
- SHLQ $0x20, R13
- IMULQ R8, R13
- SHRQ $0x38, R13
- LEAQ 1(DI), R8
- LEAQ 1(R9), R14
- MOVL DI, 24(SP)(R10*4)
- MOVL R9, 24(SP)(R12*4)
- MOVL R8, 4120(SP)(R11*4)
- MOVL R14, 4120(SP)(R13*4)
- ADDQ $0x01, DI
- SUBQ $0x01, R9
+ MOVQ $0x0000cf1bbcdcbf9b, BX
+ MOVQ $0x9e3779b1, DI
+ LEAQ 1(SI), SI
+ LEAQ -2(CX), R8
+ MOVQ (DX)(SI*1), R9
+ MOVQ 1(DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ MOVQ 1(DX)(R8*1), R12
+ SHLQ $0x10, R9
+ IMULQ BX, R9
+ SHRQ $0x36, R9
+ SHLQ $0x20, R10
+ IMULQ DI, R10
+ SHRQ $0x38, R10
+ SHLQ $0x10, R11
+ IMULQ BX, R11
+ SHRQ $0x36, R11
+ SHLQ $0x20, R12
+ IMULQ DI, R12
+ SHRQ $0x38, R12
+ LEAQ 1(SI), DI
+ LEAQ 1(R8), R13
+ MOVL SI, 24(SP)(R9*4)
+ MOVL R8, 24(SP)(R11*4)
+ MOVL DI, 4120(SP)(R10*4)
+ MOVL R13, 4120(SP)(R12*4)
+ ADDQ $0x01, SI
+ SUBQ $0x01, R8
index_loop_encodeSnappyBetterBlockAsm8B:
- CMPQ DI, R9
+ CMPQ SI, R8
JAE search_loop_encodeSnappyBetterBlockAsm8B
- MOVQ (DX)(DI*1), R8
- MOVQ (DX)(R9*1), R10
- SHLQ $0x10, R8
- IMULQ SI, R8
- SHRQ $0x36, R8
- SHLQ $0x10, R10
- IMULQ SI, R10
- SHRQ $0x36, R10
- MOVL DI, 24(SP)(R8*4)
- MOVL R9, 24(SP)(R10*4)
- ADDQ $0x02, DI
- SUBQ $0x02, R9
+ MOVQ (DX)(SI*1), DI
+ MOVQ (DX)(R8*1), R9
+ SHLQ $0x10, DI
+ IMULQ BX, DI
+ SHRQ $0x36, DI
+ SHLQ $0x10, R9
+ IMULQ BX, R9
+ SHRQ $0x36, R9
+ MOVL SI, 24(SP)(DI*4)
+ MOVL R8, 24(SP)(R9*4)
+ ADDQ $0x02, SI
+ SUBQ $0x02, R8
JMP index_loop_encodeSnappyBetterBlockAsm8B
emit_remainder_encodeSnappyBetterBlockAsm8B:
@@ -17151,6 +17098,1008 @@ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B:
MOVQ AX, ret+48(FP)
RET
+// func calcBlockSize(src []byte) int
+// Requires: BMI, SSE2
+TEXT ·calcBlockSize(SB), $32792-32 // Scans src using a hash table kept on the stack, totals a byte count in AX and stores it (or 0 when the limit at (SP) is reached) to ret+24(FP).
+ XORQ AX, AX // AX = running size total
+ MOVQ $0x00000100, CX // 256 passes of the zeroing loop below
+ LEAQ 24(SP), DX // DX = first byte of the table at 24(SP)
+ PXOR X0, X0
+
+zero_loop_calcBlockSize: // clears 128 bytes per pass: 256 * 128 = 32768 bytes starting at 24(SP)
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_calcBlockSize
+ MOVL $0x00000000, 12(SP) // 12(SP) = start of the pending literal run
+ MOVQ src_len+8(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP) // 8(SP) = len(src)-8: the search loop stops at this position
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX // AX is still 0, so DX = len(src)-9-len(src)/32
+ MOVQ DX, (SP) // (SP) = size limit; reaching it makes the function return 0
+ MOVL $0x00000001, CX // CX = current position in src
+ MOVL CX, 16(SP) // 16(SP) = offset (distance back) of the last match, initially 1
+ MOVQ src_base+0(FP), DX // DX = &src[0]
+
+search_loop_calcBlockSize:
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x05, BX // skip grows by 1 for every 32 bytes since the literal run started
+ LEAL 4(CX)(BX*1), BX // BX = position to try next if nothing matches here
+ CMPL BX, 8(SP)
+ JGE emit_remainder_calcBlockSize
+ MOVQ (DX)(CX*1), SI // SI = 8 bytes at src[CX]
+ MOVL BX, 20(SP) // 20(SP) = saved next position
+ MOVQ $0x0000cf1bbcdcbf9b, R8 // multiplier for the table index
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHRQ $0x08, R10
+ SHLQ $0x10, R9 // drop the top 2 bytes: index uses the low 6 bytes
+ IMULQ R8, R9
+ SHRQ $0x33, R9 // keep the top 13 bits: R9 = table index for position CX
+ SHLQ $0x10, R10
+ IMULQ R8, R10
+ SHRQ $0x33, R10 // R10 = table index for position CX+1
+ MOVL 24(SP)(R9*4), BX // BX = candidate position for CX
+ MOVL 24(SP)(R10*4), DI // DI = candidate position for CX+1
+ MOVL CX, 24(SP)(R9*4)
+ LEAL 1(CX), R9
+ MOVL R9, 24(SP)(R10*4)
+ MOVQ SI, R9
+ SHRQ $0x10, R9
+ SHLQ $0x10, R9
+ IMULQ R8, R9
+ SHRQ $0x33, R9 // R9 = table index for position CX+2
+ MOVL CX, R8
+ SUBL 16(SP), R8
+ MOVL 1(DX)(R8*1), R10 // 4 bytes at CX+1 minus the last match offset
+ MOVQ SI, R8
+ SHRQ $0x08, R8 // low 32 bits of R8 = 4 bytes at CX+1
+ CMPL R8, R10
+ JNE no_repeat_found_calcBlockSize
+ LEAL 1(CX), SI // SI = start of the repeat match
+ MOVL 12(SP), BX
+ MOVL SI, DI
+ SUBL 16(SP), DI // DI = position the repeat copies from
+ JZ repeat_extend_back_end_calcBlockSize
+
+repeat_extend_back_loop_calcBlockSize: // move the match start backwards while the preceding bytes are equal
+ CMPL SI, BX
+ JLE repeat_extend_back_end_calcBlockSize
+ MOVB -1(DX)(DI*1), R8
+ MOVB -1(DX)(SI*1), R9
+ CMPB R8, R9
+ JNE repeat_extend_back_end_calcBlockSize
+ LEAL -1(SI), SI
+ DECL DI
+ JNZ repeat_extend_back_loop_calcBlockSize
+
+repeat_extend_back_end_calcBlockSize:
+ MOVL 12(SP), BX
+ CMPL BX, SI
+ JEQ emit_literal_done_repeat_emit_calcBlockSize
+ MOVL SI, DI
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R8 // NOTE(review): R8 is overwritten before it is read again; this address appears unused here
+ SUBL BX, DI // DI = number of literal bytes before the match
+ LEAL -1(DI), BX
+ CMPL BX, $0x3c // length-1 < 60: 1 extra byte
+ JLT one_byte_repeat_emit_calcBlockSize
+ CMPL BX, $0x00000100 // < 256: 2 extra bytes
+ JLT two_bytes_repeat_emit_calcBlockSize
+ CMPL BX, $0x00010000 // < 65536: 3 extra bytes
+ JLT three_bytes_repeat_emit_calcBlockSize
+ CMPL BX, $0x01000000 // < 1<<24: 4 extra bytes, otherwise 5
+ JLT four_bytes_repeat_emit_calcBlockSize
+ ADDQ $0x05, AX
+ JMP memmove_long_repeat_emit_calcBlockSize
+
+four_bytes_repeat_emit_calcBlockSize:
+ ADDQ $0x04, AX
+ JMP memmove_long_repeat_emit_calcBlockSize
+
+three_bytes_repeat_emit_calcBlockSize:
+ ADDQ $0x03, AX
+ JMP memmove_long_repeat_emit_calcBlockSize
+
+two_bytes_repeat_emit_calcBlockSize:
+ ADDQ $0x02, AX
+ CMPL BX, $0x40
+ JL memmove_repeat_emit_calcBlockSize
+ JMP memmove_long_repeat_emit_calcBlockSize
+
+one_byte_repeat_emit_calcBlockSize:
+ ADDQ $0x01, AX
+
+memmove_repeat_emit_calcBlockSize: // no bytes are copied in this function; both branches only add the literal length
+ LEAQ (AX)(DI*1), AX
+ JMP emit_literal_done_repeat_emit_calcBlockSize
+
+memmove_long_repeat_emit_calcBlockSize:
+ LEAQ (AX)(DI*1), AX
+
+emit_literal_done_repeat_emit_calcBlockSize:
+ ADDL $0x05, CX // skip CX+1..CX+4, the 4 bytes already compared equal
+ MOVL CX, BX
+ SUBL 16(SP), BX
+ MOVQ src_len+8(FP), DI
+ SUBL CX, DI // DI = bytes left in src from CX
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), BX
+
+ // matchLen
+ XORL R10, R10 // R10 = count of equal bytes found so far
+ CMPL DI, $0x08
+ JL matchlen_match4_repeat_extend_calcBlockSize
+
+matchlen_loopback_repeat_extend_calcBlockSize: // compare 8 bytes at a time
+ MOVQ (R8)(R10*1), R9
+ XORQ (BX)(R10*1), R9
+ TESTQ R9, R9
+ JZ matchlen_loop_repeat_extend_calcBlockSize
+
+#ifdef GOAMD64_v3
+ TZCNTQ R9, R9
+
+#else
+ BSFQ R9, R9
+
+#endif
+ SARQ $0x03, R9 // lowest differing bit index / 8 = number of equal leading bytes
+ LEAL (R10)(R9*1), R10
+ JMP repeat_extend_forward_end_calcBlockSize
+
+matchlen_loop_repeat_extend_calcBlockSize:
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
+ JGE matchlen_loopback_repeat_extend_calcBlockSize
+ JZ repeat_extend_forward_end_calcBlockSize
+
+matchlen_match4_repeat_extend_calcBlockSize: // fewer than 8 bytes left: try 4, then 2, then 1
+ CMPL DI, $0x04
+ JL matchlen_match2_repeat_extend_calcBlockSize
+ MOVL (R8)(R10*1), R9
+ CMPL (BX)(R10*1), R9
+ JNE matchlen_match2_repeat_extend_calcBlockSize
+ SUBL $0x04, DI
+ LEAL 4(R10), R10
+
+matchlen_match2_repeat_extend_calcBlockSize:
+ CMPL DI, $0x02
+ JL matchlen_match1_repeat_extend_calcBlockSize
+ MOVW (R8)(R10*1), R9
+ CMPW (BX)(R10*1), R9
+ JNE matchlen_match1_repeat_extend_calcBlockSize
+ SUBL $0x02, DI
+ LEAL 2(R10), R10
+
+matchlen_match1_repeat_extend_calcBlockSize:
+ CMPL DI, $0x01
+ JL repeat_extend_forward_end_calcBlockSize
+ MOVB (R8)(R10*1), R9
+ CMPB (BX)(R10*1), R9
+ JNE repeat_extend_forward_end_calcBlockSize
+ LEAL 1(R10), R10
+
+repeat_extend_forward_end_calcBlockSize:
+ ADDL R10, CX // CX = end of the match
+ MOVL CX, BX
+ SUBL SI, BX // BX = match length (SI = match start after extending back)
+ MOVL 16(SP), SI // SI = match offset
+
+ // emitCopy
+ CMPL SI, $0x00010000 // offsets below 65536 take the two-byte-offset path
+ JL two_byte_offset_repeat_as_copy_calcBlockSize
+
+four_bytes_loop_back_repeat_as_copy_calcBlockSize: // add 5 per 64 bytes of match length
+ CMPL BX, $0x40
+ JLE four_bytes_remain_repeat_as_copy_calcBlockSize
+ LEAL -64(BX), BX
+ ADDQ $0x05, AX
+ CMPL BX, $0x04
+ JL four_bytes_remain_repeat_as_copy_calcBlockSize
+ JMP four_bytes_loop_back_repeat_as_copy_calcBlockSize
+
+four_bytes_remain_repeat_as_copy_calcBlockSize:
+ TESTL BX, BX
+ JZ repeat_end_emit_calcBlockSize
+ XORL BX, BX
+ ADDQ $0x05, AX
+ JMP repeat_end_emit_calcBlockSize
+
+two_byte_offset_repeat_as_copy_calcBlockSize: // while length > 64: take 60 bytes off and add 3
+ CMPL BX, $0x40
+ JLE two_byte_offset_short_repeat_as_copy_calcBlockSize
+ LEAL -60(BX), BX
+ ADDQ $0x03, AX
+ JMP two_byte_offset_repeat_as_copy_calcBlockSize
+
+two_byte_offset_short_repeat_as_copy_calcBlockSize:
+ MOVL BX, DI
+ SHLL $0x02, DI // NOTE(review): DI is not read again before being overwritten
+ CMPL BX, $0x0c // length >= 12 or offset >= 2048: add 3, otherwise add 2
+ JGE emit_copy_three_repeat_as_copy_calcBlockSize
+ CMPL SI, $0x00000800
+ JGE emit_copy_three_repeat_as_copy_calcBlockSize
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_calcBlockSize
+
+emit_copy_three_repeat_as_copy_calcBlockSize:
+ ADDQ $0x03, AX
+
+repeat_end_emit_calcBlockSize:
+ MOVL CX, 12(SP) // the next literal run starts after the match
+ JMP search_loop_calcBlockSize
+
+no_repeat_found_calcBlockSize:
+ CMPL (DX)(BX*1), SI // 4 bytes at the candidate vs 4 bytes at CX
+ JEQ candidate_match_calcBlockSize
+ SHRQ $0x08, SI // SI now starts at the bytes of CX+1
+ MOVL 24(SP)(R9*4), BX // BX = candidate position for CX+2
+ LEAL 2(CX), R8
+ CMPL (DX)(DI*1), SI // candidate for CX+1
+ JEQ candidate2_match_calcBlockSize
+ MOVL R8, 24(SP)(R9*4)
+ SHRQ $0x08, SI // SI now starts at the bytes of CX+2
+ CMPL (DX)(BX*1), SI
+ JEQ candidate3_match_calcBlockSize
+ MOVL 20(SP), CX // no match: continue at the saved next position
+ JMP search_loop_calcBlockSize
+
+candidate3_match_calcBlockSize:
+ ADDL $0x02, CX
+ JMP candidate_match_calcBlockSize
+
+candidate2_match_calcBlockSize:
+ MOVL R8, 24(SP)(R9*4)
+ INCL CX
+ MOVL DI, BX
+
+candidate_match_calcBlockSize: // here CX = match position, BX = candidate position
+ MOVL 12(SP), SI
+ TESTL BX, BX
+ JZ match_extend_back_end_calcBlockSize
+
+match_extend_back_loop_calcBlockSize: // extend the match backwards, not past the literal start or position 0
+ CMPL CX, SI
+ JLE match_extend_back_end_calcBlockSize
+ MOVB -1(DX)(BX*1), DI
+ MOVB -1(DX)(CX*1), R8
+ CMPB DI, R8
+ JNE match_extend_back_end_calcBlockSize
+ LEAL -1(CX), CX
+ DECL BX
+ JZ match_extend_back_end_calcBlockSize
+ JMP match_extend_back_loop_calcBlockSize
+
+match_extend_back_end_calcBlockSize:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 5(AX)(SI*1), SI // total so far + pending literal bytes + 5
+ CMPQ SI, (SP)
+ JL match_dst_size_check_calcBlockSize
+ MOVQ $0x00000000, ret+24(FP) // limit reached: return 0
+ RET
+
+match_dst_size_check_calcBlockSize:
+ MOVL CX, SI
+ MOVL 12(SP), DI
+ CMPL DI, SI
+ JEQ emit_literal_done_match_emit_calcBlockSize
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(DI*1), SI // NOTE(review): SI is overwritten two lines below without being read
+ SUBL DI, R8 // R8 = number of literal bytes before the match
+ LEAL -1(R8), SI
+ CMPL SI, $0x3c // same 1..5 byte size tiers as the repeat path above
+ JLT one_byte_match_emit_calcBlockSize
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_calcBlockSize
+ CMPL SI, $0x00010000
+ JLT three_bytes_match_emit_calcBlockSize
+ CMPL SI, $0x01000000
+ JLT four_bytes_match_emit_calcBlockSize
+ ADDQ $0x05, AX
+ JMP memmove_long_match_emit_calcBlockSize
+
+four_bytes_match_emit_calcBlockSize:
+ ADDQ $0x04, AX
+ JMP memmove_long_match_emit_calcBlockSize
+
+three_bytes_match_emit_calcBlockSize:
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_calcBlockSize
+
+two_bytes_match_emit_calcBlockSize:
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_calcBlockSize
+ JMP memmove_long_match_emit_calcBlockSize
+
+one_byte_match_emit_calcBlockSize:
+ ADDQ $0x01, AX
+
+memmove_match_emit_calcBlockSize:
+ LEAQ (AX)(R8*1), AX
+ JMP emit_literal_done_match_emit_calcBlockSize
+
+memmove_long_match_emit_calcBlockSize:
+ LEAQ (AX)(R8*1), AX
+
+emit_literal_done_match_emit_calcBlockSize:
+match_nolit_loop_calcBlockSize:
+ MOVL CX, SI
+ SUBL BX, SI
+ MOVL SI, 16(SP) // remember offset = match position - candidate position
+ ADDL $0x04, CX // the first 4 bytes are already known to match
+ ADDL $0x04, BX
+ MOVQ src_len+8(FP), SI
+ SUBL CX, SI // SI = bytes left in src from CX
+ LEAQ (DX)(CX*1), DI
+ LEAQ (DX)(BX*1), BX
+
+ // matchLen
+ XORL R9, R9 // R9 = count of further equal bytes
+ CMPL SI, $0x08
+ JL matchlen_match4_match_nolit_calcBlockSize
+
+matchlen_loopback_match_nolit_calcBlockSize:
+ MOVQ (DI)(R9*1), R8
+ XORQ (BX)(R9*1), R8
+ TESTQ R8, R8
+ JZ matchlen_loop_match_nolit_calcBlockSize
+
+#ifdef GOAMD64_v3
+ TZCNTQ R8, R8
+
+#else
+ BSFQ R8, R8
+
+#endif
+ SARQ $0x03, R8 // lowest differing bit index / 8 = number of equal leading bytes
+ LEAL (R9)(R8*1), R9
+ JMP match_nolit_end_calcBlockSize
+
+matchlen_loop_match_nolit_calcBlockSize:
+ LEAL -8(SI), SI
+ LEAL 8(R9), R9
+ CMPL SI, $0x08
+ JGE matchlen_loopback_match_nolit_calcBlockSize
+ JZ match_nolit_end_calcBlockSize
+
+matchlen_match4_match_nolit_calcBlockSize:
+ CMPL SI, $0x04
+ JL matchlen_match2_match_nolit_calcBlockSize
+ MOVL (DI)(R9*1), R8
+ CMPL (BX)(R9*1), R8
+ JNE matchlen_match2_match_nolit_calcBlockSize
+ SUBL $0x04, SI
+ LEAL 4(R9), R9
+
+matchlen_match2_match_nolit_calcBlockSize:
+ CMPL SI, $0x02
+ JL matchlen_match1_match_nolit_calcBlockSize
+ MOVW (DI)(R9*1), R8
+ CMPW (BX)(R9*1), R8
+ JNE matchlen_match1_match_nolit_calcBlockSize
+ SUBL $0x02, SI
+ LEAL 2(R9), R9
+
+matchlen_match1_match_nolit_calcBlockSize:
+ CMPL SI, $0x01
+ JL match_nolit_end_calcBlockSize
+ MOVB (DI)(R9*1), R8
+ CMPB (BX)(R9*1), R8
+ JNE match_nolit_end_calcBlockSize
+ LEAL 1(R9), R9
+
+match_nolit_end_calcBlockSize:
+ ADDL R9, CX // CX = end of the match
+ MOVL 16(SP), BX // BX = match offset
+ ADDL $0x04, R9 // R9 = full match length including the first 4 bytes
+ MOVL CX, 12(SP)
+
+ // emitCopy
+ CMPL BX, $0x00010000
+ JL two_byte_offset_match_nolit_calcBlockSize
+
+four_bytes_loop_back_match_nolit_calcBlockSize: // add 5 per 64 bytes of match length
+ CMPL R9, $0x40
+ JLE four_bytes_remain_match_nolit_calcBlockSize
+ LEAL -64(R9), R9
+ ADDQ $0x05, AX
+ CMPL R9, $0x04
+ JL four_bytes_remain_match_nolit_calcBlockSize
+ JMP four_bytes_loop_back_match_nolit_calcBlockSize
+
+four_bytes_remain_match_nolit_calcBlockSize:
+ TESTL R9, R9
+ JZ match_nolit_emitcopy_end_calcBlockSize
+ XORL BX, BX
+ ADDQ $0x05, AX
+ JMP match_nolit_emitcopy_end_calcBlockSize
+
+two_byte_offset_match_nolit_calcBlockSize: // while length > 64: take 60 bytes off and add 3
+ CMPL R9, $0x40
+ JLE two_byte_offset_short_match_nolit_calcBlockSize
+ LEAL -60(R9), R9
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_calcBlockSize
+
+two_byte_offset_short_match_nolit_calcBlockSize:
+ MOVL R9, SI
+ SHLL $0x02, SI // NOTE(review): SI is not read again before being overwritten
+ CMPL R9, $0x0c // length >= 12 or offset >= 2048: add 3, otherwise add 2
+ JGE emit_copy_three_match_nolit_calcBlockSize
+ CMPL BX, $0x00000800
+ JGE emit_copy_three_match_nolit_calcBlockSize
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_calcBlockSize
+
+emit_copy_three_match_nolit_calcBlockSize:
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_calcBlockSize:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_calcBlockSize
+ MOVQ -2(DX)(CX*1), SI // SI = 8 bytes at src[CX-2]
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_calcBlockSize
+ MOVQ $0x00000000, ret+24(FP) // limit reached: return 0
+ RET
+
+match_nolit_dst_ok_calcBlockSize: // record positions CX-2 and CX in the table, then test for an immediate match at CX
+ MOVQ $0x0000cf1bbcdcbf9b, R8
+ MOVQ SI, DI
+ SHRQ $0x10, SI // SI now starts at the bytes of position CX
+ MOVQ SI, BX
+ SHLQ $0x10, DI
+ IMULQ R8, DI
+ SHRQ $0x33, DI // DI = table index for position CX-2
+ SHLQ $0x10, BX
+ IMULQ R8, BX
+ SHRQ $0x33, BX // BX = table index for position CX
+ LEAL -2(CX), R8
+ LEAQ 24(SP)(BX*4), R9
+ MOVL (R9), BX // BX = previous table entry for CX's index (the candidate)
+ MOVL R8, 24(SP)(DI*4)
+ MOVL CX, (R9)
+ CMPL (DX)(BX*1), SI
+ JEQ match_nolit_loop_calcBlockSize
+ INCL CX
+ JMP search_loop_calcBlockSize
+
+emit_remainder_calcBlockSize:
+ MOVQ src_len+8(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 5(AX)(CX*1), CX // total so far + remaining literal bytes + 5
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_calcBlockSize
+ MOVQ $0x00000000, ret+24(FP) // limit reached: return 0
+ RET
+
+emit_remainder_ok_calcBlockSize:
+ MOVQ src_len+8(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_calcBlockSize
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX // NOTE(review): CX is overwritten two lines below without being read
+ SUBL BX, SI // SI = number of trailing literal bytes
+ LEAL -1(SI), CX
+ CMPL CX, $0x3c // same 1..5 byte size tiers as above
+ JLT one_byte_emit_remainder_calcBlockSize
+ CMPL CX, $0x00000100
+ JLT two_bytes_emit_remainder_calcBlockSize
+ CMPL CX, $0x00010000
+ JLT three_bytes_emit_remainder_calcBlockSize
+ CMPL CX, $0x01000000
+ JLT four_bytes_emit_remainder_calcBlockSize
+ ADDQ $0x05, AX
+ JMP memmove_long_emit_remainder_calcBlockSize
+
+four_bytes_emit_remainder_calcBlockSize:
+ ADDQ $0x04, AX
+ JMP memmove_long_emit_remainder_calcBlockSize
+
+three_bytes_emit_remainder_calcBlockSize:
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_calcBlockSize
+
+two_bytes_emit_remainder_calcBlockSize:
+ ADDQ $0x02, AX
+ CMPL CX, $0x40
+ JL memmove_emit_remainder_calcBlockSize
+ JMP memmove_long_emit_remainder_calcBlockSize
+
+one_byte_emit_remainder_calcBlockSize:
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_calcBlockSize:
+ LEAQ (AX)(SI*1), AX
+ JMP emit_literal_done_emit_remainder_calcBlockSize
+
+memmove_long_emit_remainder_calcBlockSize:
+ LEAQ (AX)(SI*1), AX
+
+emit_literal_done_emit_remainder_calcBlockSize:
+ MOVQ AX, ret+24(FP) // return the accumulated total
+ RET
+
+// func calcBlockSizeSmall(src []byte) int
+// Requires: BMI, SSE2
+TEXT ·calcBlockSizeSmall(SB), $2072-32 // Scans src using a 2048-byte hash table on the stack, totals a byte count in AX and stores it (or 0 when the limit at (SP) is reached) to ret+24(FP).
+ XORQ AX, AX // AX = running size total
+ MOVQ $0x00000010, CX // 16 passes of the zeroing loop below
+ LEAQ 24(SP), DX // DX = first byte of the table at 24(SP)
+ PXOR X0, X0
+
+zero_loop_calcBlockSizeSmall: // clears 128 bytes per pass: 16 * 128 = 2048 bytes starting at 24(SP)
+ MOVOU X0, (DX)
+ MOVOU X0, 16(DX)
+ MOVOU X0, 32(DX)
+ MOVOU X0, 48(DX)
+ MOVOU X0, 64(DX)
+ MOVOU X0, 80(DX)
+ MOVOU X0, 96(DX)
+ MOVOU X0, 112(DX)
+ ADDQ $0x80, DX
+ DECQ CX
+ JNZ zero_loop_calcBlockSizeSmall
+ MOVL $0x00000000, 12(SP) // 12(SP) = start of the pending literal run
+ MOVQ src_len+8(FP), CX
+ LEAQ -9(CX), DX
+ LEAQ -8(CX), BX
+ MOVL BX, 8(SP) // 8(SP) = len(src)-8: the search loop stops at this position
+ SHRQ $0x05, CX
+ SUBL CX, DX
+ LEAQ (AX)(DX*1), DX // AX is still 0, so DX = len(src)-9-len(src)/32
+ MOVQ DX, (SP) // (SP) = size limit; reaching it makes the function return 0
+ MOVL $0x00000001, CX // CX = current position in src
+ MOVL CX, 16(SP) // 16(SP) = offset (distance back) of the last match, initially 1
+ MOVQ src_base+0(FP), DX // DX = &src[0]
+
+search_loop_calcBlockSizeSmall:
+ MOVL CX, BX
+ SUBL 12(SP), BX
+ SHRL $0x04, BX // skip grows by 1 for every 16 bytes since the literal run started
+ LEAL 4(CX)(BX*1), BX // BX = position to try next if nothing matches here
+ CMPL BX, 8(SP)
+ JGE emit_remainder_calcBlockSizeSmall
+ MOVQ (DX)(CX*1), SI // SI = 8 bytes at src[CX]
+ MOVL BX, 20(SP) // 20(SP) = saved next position
+ MOVQ $0x9e3779b1, R8 // multiplier for the table index
+ MOVQ SI, R9
+ MOVQ SI, R10
+ SHRQ $0x08, R10
+ SHLQ $0x20, R9 // drop the top 4 bytes: index uses the low 4 bytes
+ IMULQ R8, R9
+ SHRQ $0x37, R9 // keep the top 9 bits: R9 = table index for position CX
+ SHLQ $0x20, R10
+ IMULQ R8, R10
+ SHRQ $0x37, R10 // R10 = table index for position CX+1
+ MOVL 24(SP)(R9*4), BX // BX = candidate position for CX
+ MOVL 24(SP)(R10*4), DI // DI = candidate position for CX+1
+ MOVL CX, 24(SP)(R9*4)
+ LEAL 1(CX), R9
+ MOVL R9, 24(SP)(R10*4)
+ MOVQ SI, R9
+ SHRQ $0x10, R9
+ SHLQ $0x20, R9
+ IMULQ R8, R9
+ SHRQ $0x37, R9 // R9 = table index for position CX+2
+ MOVL CX, R8
+ SUBL 16(SP), R8
+ MOVL 1(DX)(R8*1), R10 // 4 bytes at CX+1 minus the last match offset
+ MOVQ SI, R8
+ SHRQ $0x08, R8 // low 32 bits of R8 = 4 bytes at CX+1
+ CMPL R8, R10
+ JNE no_repeat_found_calcBlockSizeSmall
+ LEAL 1(CX), SI // SI = start of the repeat match
+ MOVL 12(SP), BX
+ MOVL SI, DI
+ SUBL 16(SP), DI // DI = position the repeat copies from
+ JZ repeat_extend_back_end_calcBlockSizeSmall
+
+repeat_extend_back_loop_calcBlockSizeSmall: // move the match start backwards while the preceding bytes are equal
+ CMPL SI, BX
+ JLE repeat_extend_back_end_calcBlockSizeSmall
+ MOVB -1(DX)(DI*1), R8
+ MOVB -1(DX)(SI*1), R9
+ CMPB R8, R9
+ JNE repeat_extend_back_end_calcBlockSizeSmall
+ LEAL -1(SI), SI
+ DECL DI
+ JNZ repeat_extend_back_loop_calcBlockSizeSmall
+
+repeat_extend_back_end_calcBlockSizeSmall:
+ MOVL 12(SP), BX
+ CMPL BX, SI
+ JEQ emit_literal_done_repeat_emit_calcBlockSizeSmall
+ MOVL SI, DI
+ MOVL SI, 12(SP)
+ LEAQ (DX)(BX*1), R8 // NOTE(review): R8 is overwritten before it is read again; this address appears unused here
+ SUBL BX, DI // DI = number of literal bytes before the match
+ LEAL -1(DI), BX
+ CMPL BX, $0x3c // length-1 < 60: 1 extra byte
+ JLT one_byte_repeat_emit_calcBlockSizeSmall
+ CMPL BX, $0x00000100 // < 256: 2 extra bytes, otherwise 3
+ JLT two_bytes_repeat_emit_calcBlockSizeSmall
+ ADDQ $0x03, AX
+ JMP memmove_long_repeat_emit_calcBlockSizeSmall
+
+two_bytes_repeat_emit_calcBlockSizeSmall:
+ ADDQ $0x02, AX
+ CMPL BX, $0x40
+ JL memmove_repeat_emit_calcBlockSizeSmall
+ JMP memmove_long_repeat_emit_calcBlockSizeSmall
+
+one_byte_repeat_emit_calcBlockSizeSmall:
+ ADDQ $0x01, AX
+
+memmove_repeat_emit_calcBlockSizeSmall: // no bytes are copied in this function; both branches only add the literal length
+ LEAQ (AX)(DI*1), AX
+ JMP emit_literal_done_repeat_emit_calcBlockSizeSmall
+
+memmove_long_repeat_emit_calcBlockSizeSmall:
+ LEAQ (AX)(DI*1), AX
+
+emit_literal_done_repeat_emit_calcBlockSizeSmall:
+ ADDL $0x05, CX // skip CX+1..CX+4, the 4 bytes already compared equal
+ MOVL CX, BX
+ SUBL 16(SP), BX
+ MOVQ src_len+8(FP), DI
+ SUBL CX, DI // DI = bytes left in src from CX
+ LEAQ (DX)(CX*1), R8
+ LEAQ (DX)(BX*1), BX
+
+ // matchLen
+ XORL R10, R10 // R10 = count of equal bytes found so far
+ CMPL DI, $0x08
+ JL matchlen_match4_repeat_extend_calcBlockSizeSmall
+
+matchlen_loopback_repeat_extend_calcBlockSizeSmall: // compare 8 bytes at a time
+ MOVQ (R8)(R10*1), R9
+ XORQ (BX)(R10*1), R9
+ TESTQ R9, R9
+ JZ matchlen_loop_repeat_extend_calcBlockSizeSmall
+
+#ifdef GOAMD64_v3
+ TZCNTQ R9, R9
+
+#else
+ BSFQ R9, R9
+
+#endif
+ SARQ $0x03, R9 // lowest differing bit index / 8 = number of equal leading bytes
+ LEAL (R10)(R9*1), R10
+ JMP repeat_extend_forward_end_calcBlockSizeSmall
+
+matchlen_loop_repeat_extend_calcBlockSizeSmall:
+ LEAL -8(DI), DI
+ LEAL 8(R10), R10
+ CMPL DI, $0x08
+ JGE matchlen_loopback_repeat_extend_calcBlockSizeSmall
+ JZ repeat_extend_forward_end_calcBlockSizeSmall
+
+matchlen_match4_repeat_extend_calcBlockSizeSmall: // fewer than 8 bytes left: try 4, then 2, then 1
+ CMPL DI, $0x04
+ JL matchlen_match2_repeat_extend_calcBlockSizeSmall
+ MOVL (R8)(R10*1), R9
+ CMPL (BX)(R10*1), R9
+ JNE matchlen_match2_repeat_extend_calcBlockSizeSmall
+ SUBL $0x04, DI
+ LEAL 4(R10), R10
+
+matchlen_match2_repeat_extend_calcBlockSizeSmall:
+ CMPL DI, $0x02
+ JL matchlen_match1_repeat_extend_calcBlockSizeSmall
+ MOVW (R8)(R10*1), R9
+ CMPW (BX)(R10*1), R9
+ JNE matchlen_match1_repeat_extend_calcBlockSizeSmall
+ SUBL $0x02, DI
+ LEAL 2(R10), R10
+
+matchlen_match1_repeat_extend_calcBlockSizeSmall:
+ CMPL DI, $0x01
+ JL repeat_extend_forward_end_calcBlockSizeSmall
+ MOVB (R8)(R10*1), R9
+ CMPB (BX)(R10*1), R9
+ JNE repeat_extend_forward_end_calcBlockSizeSmall
+ LEAL 1(R10), R10
+
+repeat_extend_forward_end_calcBlockSizeSmall:
+ ADDL R10, CX // CX = end of the match
+ MOVL CX, BX
+ SUBL SI, BX // BX = match length (SI = match start after extending back)
+ MOVL 16(SP), SI // NOTE(review): SI is overwritten below without this value being read
+
+ // emitCopy
+two_byte_offset_repeat_as_copy_calcBlockSizeSmall: // while length > 64: take 60 bytes off and add 3
+ CMPL BX, $0x40
+ JLE two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall
+ LEAL -60(BX), BX
+ ADDQ $0x03, AX
+ JMP two_byte_offset_repeat_as_copy_calcBlockSizeSmall
+
+two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall:
+ MOVL BX, SI
+ SHLL $0x02, SI // NOTE(review): SI is not read again before being overwritten
+ CMPL BX, $0x0c // length >= 12: add 3, otherwise add 2 (no offset test in this variant)
+ JGE emit_copy_three_repeat_as_copy_calcBlockSizeSmall
+ ADDQ $0x02, AX
+ JMP repeat_end_emit_calcBlockSizeSmall
+
+emit_copy_three_repeat_as_copy_calcBlockSizeSmall:
+ ADDQ $0x03, AX
+
+repeat_end_emit_calcBlockSizeSmall:
+ MOVL CX, 12(SP) // the next literal run starts after the match
+ JMP search_loop_calcBlockSizeSmall
+
+no_repeat_found_calcBlockSizeSmall:
+ CMPL (DX)(BX*1), SI // 4 bytes at the candidate vs 4 bytes at CX
+ JEQ candidate_match_calcBlockSizeSmall
+ SHRQ $0x08, SI // SI now starts at the bytes of CX+1
+ MOVL 24(SP)(R9*4), BX // BX = candidate position for CX+2
+ LEAL 2(CX), R8
+ CMPL (DX)(DI*1), SI // candidate for CX+1
+ JEQ candidate2_match_calcBlockSizeSmall
+ MOVL R8, 24(SP)(R9*4)
+ SHRQ $0x08, SI // SI now starts at the bytes of CX+2
+ CMPL (DX)(BX*1), SI
+ JEQ candidate3_match_calcBlockSizeSmall
+ MOVL 20(SP), CX // no match: continue at the saved next position
+ JMP search_loop_calcBlockSizeSmall
+
+candidate3_match_calcBlockSizeSmall:
+ ADDL $0x02, CX
+ JMP candidate_match_calcBlockSizeSmall
+
+candidate2_match_calcBlockSizeSmall:
+ MOVL R8, 24(SP)(R9*4)
+ INCL CX
+ MOVL DI, BX
+
+candidate_match_calcBlockSizeSmall: // here CX = match position, BX = candidate position
+ MOVL 12(SP), SI
+ TESTL BX, BX
+ JZ match_extend_back_end_calcBlockSizeSmall
+
+match_extend_back_loop_calcBlockSizeSmall: // extend the match backwards, not past the literal start or position 0
+ CMPL CX, SI
+ JLE match_extend_back_end_calcBlockSizeSmall
+ MOVB -1(DX)(BX*1), DI
+ MOVB -1(DX)(CX*1), R8
+ CMPB DI, R8
+ JNE match_extend_back_end_calcBlockSizeSmall
+ LEAL -1(CX), CX
+ DECL BX
+ JZ match_extend_back_end_calcBlockSizeSmall
+ JMP match_extend_back_loop_calcBlockSizeSmall
+
+match_extend_back_end_calcBlockSizeSmall:
+ MOVL CX, SI
+ SUBL 12(SP), SI
+ LEAQ 3(AX)(SI*1), SI // total so far + pending literal bytes + 3
+ CMPQ SI, (SP)
+ JL match_dst_size_check_calcBlockSizeSmall
+ MOVQ $0x00000000, ret+24(FP) // limit reached: return 0
+ RET
+
+match_dst_size_check_calcBlockSizeSmall:
+ MOVL CX, SI
+ MOVL 12(SP), DI
+ CMPL DI, SI
+ JEQ emit_literal_done_match_emit_calcBlockSizeSmall
+ MOVL SI, R8
+ MOVL SI, 12(SP)
+ LEAQ (DX)(DI*1), SI // NOTE(review): SI is overwritten two lines below without being read
+ SUBL DI, R8 // R8 = number of literal bytes before the match
+ LEAL -1(R8), SI
+ CMPL SI, $0x3c // same 1..3 byte size tiers as the repeat path above
+ JLT one_byte_match_emit_calcBlockSizeSmall
+ CMPL SI, $0x00000100
+ JLT two_bytes_match_emit_calcBlockSizeSmall
+ ADDQ $0x03, AX
+ JMP memmove_long_match_emit_calcBlockSizeSmall
+
+two_bytes_match_emit_calcBlockSizeSmall:
+ ADDQ $0x02, AX
+ CMPL SI, $0x40
+ JL memmove_match_emit_calcBlockSizeSmall
+ JMP memmove_long_match_emit_calcBlockSizeSmall
+
+one_byte_match_emit_calcBlockSizeSmall:
+ ADDQ $0x01, AX
+
+memmove_match_emit_calcBlockSizeSmall:
+ LEAQ (AX)(R8*1), AX
+ JMP emit_literal_done_match_emit_calcBlockSizeSmall
+
+memmove_long_match_emit_calcBlockSizeSmall:
+ LEAQ (AX)(R8*1), AX
+
+emit_literal_done_match_emit_calcBlockSizeSmall:
+match_nolit_loop_calcBlockSizeSmall:
+ MOVL CX, SI
+ SUBL BX, SI
+ MOVL SI, 16(SP) // remember offset = match position - candidate position
+ ADDL $0x04, CX // the first 4 bytes are already known to match
+ ADDL $0x04, BX
+ MOVQ src_len+8(FP), SI
+ SUBL CX, SI // SI = bytes left in src from CX
+ LEAQ (DX)(CX*1), DI
+ LEAQ (DX)(BX*1), BX
+
+ // matchLen
+ XORL R9, R9 // R9 = count of further equal bytes
+ CMPL SI, $0x08
+ JL matchlen_match4_match_nolit_calcBlockSizeSmall
+
+matchlen_loopback_match_nolit_calcBlockSizeSmall:
+ MOVQ (DI)(R9*1), R8
+ XORQ (BX)(R9*1), R8
+ TESTQ R8, R8
+ JZ matchlen_loop_match_nolit_calcBlockSizeSmall
+
+#ifdef GOAMD64_v3
+ TZCNTQ R8, R8
+
+#else
+ BSFQ R8, R8
+
+#endif
+ SARQ $0x03, R8 // lowest differing bit index / 8 = number of equal leading bytes
+ LEAL (R9)(R8*1), R9
+ JMP match_nolit_end_calcBlockSizeSmall
+
+matchlen_loop_match_nolit_calcBlockSizeSmall:
+ LEAL -8(SI), SI
+ LEAL 8(R9), R9
+ CMPL SI, $0x08
+ JGE matchlen_loopback_match_nolit_calcBlockSizeSmall
+ JZ match_nolit_end_calcBlockSizeSmall
+
+matchlen_match4_match_nolit_calcBlockSizeSmall:
+ CMPL SI, $0x04
+ JL matchlen_match2_match_nolit_calcBlockSizeSmall
+ MOVL (DI)(R9*1), R8
+ CMPL (BX)(R9*1), R8
+ JNE matchlen_match2_match_nolit_calcBlockSizeSmall
+ SUBL $0x04, SI
+ LEAL 4(R9), R9
+
+matchlen_match2_match_nolit_calcBlockSizeSmall:
+ CMPL SI, $0x02
+ JL matchlen_match1_match_nolit_calcBlockSizeSmall
+ MOVW (DI)(R9*1), R8
+ CMPW (BX)(R9*1), R8
+ JNE matchlen_match1_match_nolit_calcBlockSizeSmall
+ SUBL $0x02, SI
+ LEAL 2(R9), R9
+
+matchlen_match1_match_nolit_calcBlockSizeSmall:
+ CMPL SI, $0x01
+ JL match_nolit_end_calcBlockSizeSmall
+ MOVB (DI)(R9*1), R8
+ CMPB (BX)(R9*1), R8
+ JNE match_nolit_end_calcBlockSizeSmall
+ LEAL 1(R9), R9
+
+match_nolit_end_calcBlockSizeSmall:
+ ADDL R9, CX // CX = end of the match
+ MOVL 16(SP), BX // NOTE(review): BX is overwritten below without this value being read
+ ADDL $0x04, R9 // R9 = full match length including the first 4 bytes
+ MOVL CX, 12(SP)
+
+ // emitCopy
+two_byte_offset_match_nolit_calcBlockSizeSmall: // while length > 64: take 60 bytes off and add 3
+ CMPL R9, $0x40
+ JLE two_byte_offset_short_match_nolit_calcBlockSizeSmall
+ LEAL -60(R9), R9
+ ADDQ $0x03, AX
+ JMP two_byte_offset_match_nolit_calcBlockSizeSmall
+
+two_byte_offset_short_match_nolit_calcBlockSizeSmall:
+ MOVL R9, BX
+ SHLL $0x02, BX // NOTE(review): BX is not read again before being overwritten
+ CMPL R9, $0x0c // length >= 12: add 3, otherwise add 2 (no offset test in this variant)
+ JGE emit_copy_three_match_nolit_calcBlockSizeSmall
+ ADDQ $0x02, AX
+ JMP match_nolit_emitcopy_end_calcBlockSizeSmall
+
+emit_copy_three_match_nolit_calcBlockSizeSmall:
+ ADDQ $0x03, AX
+
+match_nolit_emitcopy_end_calcBlockSizeSmall:
+ CMPL CX, 8(SP)
+ JGE emit_remainder_calcBlockSizeSmall
+ MOVQ -2(DX)(CX*1), SI // SI = 8 bytes at src[CX-2]
+ CMPQ AX, (SP)
+ JL match_nolit_dst_ok_calcBlockSizeSmall
+ MOVQ $0x00000000, ret+24(FP) // limit reached: return 0
+ RET
+
+match_nolit_dst_ok_calcBlockSizeSmall: // record positions CX-2 and CX in the table, then test for an immediate match at CX
+ MOVQ $0x9e3779b1, R8
+ MOVQ SI, DI
+ SHRQ $0x10, SI // SI now starts at the bytes of position CX
+ MOVQ SI, BX
+ SHLQ $0x20, DI
+ IMULQ R8, DI
+ SHRQ $0x37, DI // DI = table index for position CX-2
+ SHLQ $0x20, BX
+ IMULQ R8, BX
+ SHRQ $0x37, BX // BX = table index for position CX
+ LEAL -2(CX), R8
+ LEAQ 24(SP)(BX*4), R9
+ MOVL (R9), BX // BX = previous table entry for CX's index (the candidate)
+ MOVL R8, 24(SP)(DI*4)
+ MOVL CX, (R9)
+ CMPL (DX)(BX*1), SI
+ JEQ match_nolit_loop_calcBlockSizeSmall
+ INCL CX
+ JMP search_loop_calcBlockSizeSmall
+
+emit_remainder_calcBlockSizeSmall:
+ MOVQ src_len+8(FP), CX
+ SUBL 12(SP), CX
+ LEAQ 3(AX)(CX*1), CX // total so far + remaining literal bytes + 3
+ CMPQ CX, (SP)
+ JL emit_remainder_ok_calcBlockSizeSmall
+ MOVQ $0x00000000, ret+24(FP) // limit reached: return 0
+ RET
+
+emit_remainder_ok_calcBlockSizeSmall:
+ MOVQ src_len+8(FP), CX
+ MOVL 12(SP), BX
+ CMPL BX, CX
+ JEQ emit_literal_done_emit_remainder_calcBlockSizeSmall
+ MOVL CX, SI
+ MOVL CX, 12(SP)
+ LEAQ (DX)(BX*1), CX // NOTE(review): CX is overwritten two lines below without being read
+ SUBL BX, SI // SI = number of trailing literal bytes
+ LEAL -1(SI), CX
+ CMPL CX, $0x3c // same 1..3 byte size tiers as above
+ JLT one_byte_emit_remainder_calcBlockSizeSmall
+ CMPL CX, $0x00000100
+ JLT two_bytes_emit_remainder_calcBlockSizeSmall
+ ADDQ $0x03, AX
+ JMP memmove_long_emit_remainder_calcBlockSizeSmall
+
+two_bytes_emit_remainder_calcBlockSizeSmall:
+ ADDQ $0x02, AX
+ CMPL CX, $0x40
+ JL memmove_emit_remainder_calcBlockSizeSmall
+ JMP memmove_long_emit_remainder_calcBlockSizeSmall
+
+one_byte_emit_remainder_calcBlockSizeSmall:
+ ADDQ $0x01, AX
+
+memmove_emit_remainder_calcBlockSizeSmall:
+ LEAQ (AX)(SI*1), AX
+ JMP emit_literal_done_emit_remainder_calcBlockSizeSmall
+
+memmove_long_emit_remainder_calcBlockSizeSmall:
+ LEAQ (AX)(SI*1), AX
+
+emit_literal_done_emit_remainder_calcBlockSizeSmall:
+ MOVQ AX, ret+24(FP) // return the accumulated total
+ RET
+
// func emitLiteral(dst []byte, lit []byte) int
// Requires: SSE2
TEXT ·emitLiteral(SB), NOSPLIT, $0-56
@@ -17343,8 +18292,7 @@ cant_repeat_two_offset_standalone:
CMPL DX, $0x0100ffff
JLT repeat_five_standalone
LEAL -16842747(DX), DX
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
ADDQ $0x05, BX
@@ -17410,8 +18358,6 @@ TEXT ·emitCopy(SB), NOSPLIT, $0-48
// emitCopy
CMPL CX, $0x00010000
JL two_byte_offset_standalone
-
-four_bytes_loop_back_standalone:
CMPL DX, $0x40
JLE four_bytes_remain_standalone
MOVB $0xff, (AX)
@@ -17441,8 +18387,7 @@ cant_repeat_two_offset_standalone_emit_copy:
CMPL DX, $0x0100ffff
JLT repeat_five_standalone_emit_copy
LEAL -16842747(DX), DX
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
ADDQ $0x05, BX
@@ -17494,13 +18439,12 @@ repeat_two_offset_standalone_emit_copy:
ADDQ $0x02, BX
ADDQ $0x02, AX
JMP gen_emit_copy_end
- JMP four_bytes_loop_back_standalone
four_bytes_remain_standalone:
TESTL DX, DX
JZ gen_emit_copy_end
- MOVB $0x03, SI
- LEAL -4(SI)(DX*4), DX
+ XORL SI, SI
+ LEAL -1(SI)(DX*4), DX
MOVB DL, (AX)
MOVL CX, 1(AX)
ADDQ $0x05, BX
@@ -17546,8 +18490,7 @@ cant_repeat_two_offset_standalone_emit_copy_short_2b:
CMPL DX, $0x0100ffff
JLT repeat_five_standalone_emit_copy_short_2b
LEAL -16842747(DX), DX
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
ADDQ $0x05, BX
@@ -17626,8 +18569,7 @@ cant_repeat_two_offset_standalone_emit_copy_short:
CMPL DX, $0x0100ffff
JLT repeat_five_standalone_emit_copy_short
LEAL -16842747(DX), DX
- MOVW $0x001d, (AX)
- MOVW $0xfffb, 2(AX)
+ MOVL $0xfffb001d, (AX)
MOVB $0xff, 4(AX)
ADDQ $0x05, AX
ADDQ $0x05, BX
@@ -17679,28 +18621,27 @@ repeat_two_offset_standalone_emit_copy_short:
ADDQ $0x02, BX
ADDQ $0x02, AX
JMP gen_emit_copy_end
- JMP two_byte_offset_standalone
two_byte_offset_short_standalone:
+ MOVL DX, SI
+ SHLL $0x02, SI
CMPL DX, $0x0c
JGE emit_copy_three_standalone
CMPL CX, $0x00000800
JGE emit_copy_three_standalone
- MOVB $0x01, SI
- LEAL -16(SI)(DX*4), DX
+ LEAL -15(SI), SI
MOVB CL, 1(AX)
SHRL $0x08, CX
SHLL $0x05, CX
- ORL CX, DX
- MOVB DL, (AX)
+ ORL CX, SI
+ MOVB SI, (AX)
ADDQ $0x02, BX
ADDQ $0x02, AX
JMP gen_emit_copy_end
emit_copy_three_standalone:
- MOVB $0x02, SI
- LEAL -4(SI)(DX*4), DX
- MOVB DL, (AX)
+ LEAL -2(SI), SI
+ MOVB SI, (AX)
MOVW CX, 1(AX)
ADDQ $0x03, BX
ADDQ $0x03, AX
@@ -17735,8 +18676,8 @@ four_bytes_loop_back_standalone_snappy:
four_bytes_remain_standalone_snappy:
TESTL DX, DX
JZ gen_emit_copy_end_snappy
- MOVB $0x03, SI
- LEAL -4(SI)(DX*4), DX
+ XORL SI, SI
+ LEAL -1(SI)(DX*4), DX
MOVB DL, (AX)
MOVL CX, 1(AX)
ADDQ $0x05, BX
@@ -17754,25 +18695,25 @@ two_byte_offset_standalone_snappy:
JMP two_byte_offset_standalone_snappy
two_byte_offset_short_standalone_snappy:
+ MOVL DX, SI
+ SHLL $0x02, SI
CMPL DX, $0x0c
JGE emit_copy_three_standalone_snappy
CMPL CX, $0x00000800
JGE emit_copy_three_standalone_snappy
- MOVB $0x01, SI
- LEAL -16(SI)(DX*4), DX
+ LEAL -15(SI), SI
MOVB CL, 1(AX)
SHRL $0x08, CX
SHLL $0x05, CX
- ORL CX, DX
- MOVB DL, (AX)
+ ORL CX, SI
+ MOVB SI, (AX)
ADDQ $0x02, BX
ADDQ $0x02, AX
JMP gen_emit_copy_end_snappy
emit_copy_three_standalone_snappy:
- MOVB $0x02, SI
- LEAL -4(SI)(DX*4), DX
- MOVB DL, (AX)
+ LEAL -2(SI), SI
+ MOVB SI, (AX)
MOVW CX, 1(AX)
ADDQ $0x03, BX
ADDQ $0x03, AX
@@ -17846,3 +18787,1505 @@ matchlen_match1_standalone:
gen_match_len_end:
MOVQ SI, ret+48(FP)
RET
+
+// func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
+// Requires: SSE2
+TEXT ·cvtLZ4BlockAsm(SB), NOSPLIT, $0-64 // walks the token/literals/offset/match sequences in src and re-emits them into dst as literal, copy and repeat tags
+ XORQ SI, SI // SI = running count of uncompressed bytes (returned as uncompressed)
+ MOVQ dst_base+0(FP), AX // AX = dst write pointer
+ MOVQ dst_len+8(FP), CX
+ MOVQ src_base+24(FP), DX // DX = src read pointer
+ MOVQ src_len+32(FP), BX
+ LEAQ (DX)(BX*1), BX // BX = src end (one past the last input byte)
+ LEAQ -10(AX)(CX*1), CX // CX = dst limit: end of dst minus a 10 byte margin
+ XORQ DI, DI // DI = offset of the previous match, 0 until the first match
+
+lz4_s2_loop:
+ CMPQ DX, BX
+ JAE lz4_s2_corrupt // input ended where a token was expected
+ CMPQ AX, CX
+ JAE lz4_s2_dstfull // write pointer reached the dst limit
+ MOVBQZX (DX), R8 // R8 = token byte
+ MOVQ R8, R9
+ MOVQ R8, R10
+ SHRQ $0x04, R9 // R9 = high nibble: literal length
+ ANDQ $0x0f, R10 // R10 = low nibble: match length before the minimum is added
+ CMPQ R8, $0xf0
+ JB lz4_s2_ll_end // high nibble below 15: no length extension bytes
+
+lz4_s2_ll_loop:
+ INCQ DX
+ CMPQ DX, BX
+ JAE lz4_s2_corrupt
+ MOVBQZX (DX), R8
+ ADDQ R8, R9 // accumulate extension byte into the literal length
+ CMPQ R8, $0xff
+ JEQ lz4_s2_ll_loop // 0xff means one more extension byte follows
+
+lz4_s2_ll_end:
+ LEAQ (DX)(R9*1), R8 // R8 = DX + literal length (DX still points at the last length byte)
+ ADDQ $0x04, R10 // match length = nibble + 4
+ CMPQ R8, BX
+ JAE lz4_s2_corrupt // literal run would extend past the end of src
+ INCQ DX // DX = first literal byte
+ INCQ R8 // R8 = first src byte after the literals
+ TESTQ R9, R9
+ JZ lz4_s2_lits_done // sequence has no literals
+ LEAQ (AX)(R9*1), R11
+ CMPQ R11, CX
+ JAE lz4_s2_dstfull // literals would cross the dst limit
+ ADDQ R9, SI
+ LEAL -1(R9), R11 // R11 = literal length - 1, the value written into the literal header
+ CMPL R11, $0x3c
+ JLT one_byte_lz4_s2
+ CMPL R11, $0x00000100
+ JLT two_bytes_lz4_s2
+ CMPL R11, $0x00010000
+ JLT three_bytes_lz4_s2
+ CMPL R11, $0x01000000
+ JLT four_bytes_lz4_s2
+ MOVB $0xfc, (AX) // 5 byte header: 0xfc then the 32 bit length-1
+ MOVL R11, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_lz4_s2
+
+four_bytes_lz4_s2:
+ MOVL R11, R12
+ SHRL $0x10, R12 // R12 = bits 16..23 of length-1
+ MOVB $0xf8, (AX) // 4 byte header: 0xf8 then 24 bits of length-1
+ MOVW R11, 1(AX)
+ MOVB R12, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_lz4_s2
+
+three_bytes_lz4_s2:
+ MOVB $0xf4, (AX) // 3 byte header: 0xf4 then 16 bits of length-1
+ MOVW R11, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_lz4_s2
+
+two_bytes_lz4_s2:
+ MOVB $0xf0, (AX) // 2 byte header: 0xf0 then 8 bits of length-1
+ MOVB R11, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R11, $0x40
+ JL memmove_lz4_s2 // at most 64 literal bytes: use the short copy
+ JMP memmove_long_lz4_s2
+
+one_byte_lz4_s2:
+ SHLB $0x02, R11 // 1 byte header: (length-1) shifted left by 2
+ MOVB R11, (AX)
+ ADDQ $0x01, AX
+
+memmove_lz4_s2:
+ LEAQ (AX)(R9*1), R11 // R11 = dst pointer after the literals
+
+ // genMemMoveShort: copy R9 (1..64) literal bytes using overlapping fixed-size moves
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_lz4_s2_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_lz4_s2_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_lz4_s2_memmove_move_17through32
+ JMP emit_lit_memmove_lz4_s2_memmove_move_33through64
+
+emit_lit_memmove_lz4_s2_memmove_move_8:
+ MOVQ (DX), R12 // always loads 8 bytes; NOTE(review): the bounds check above only covers R9 bytes, confirm src over-read is acceptable
+ MOVQ R12, (AX) // always stores 8 bytes; AX is later set to the exact end (R11)
+ JMP memmove_end_copy_lz4_s2
+
+emit_lit_memmove_lz4_s2_memmove_move_8through16:
+ MOVQ (DX), R12
+ MOVQ -8(DX)(R9*1), DX // DX reused as scratch for the last 8 bytes; reloaded from R8 at lz4_s2_lits_emit_done
+ MOVQ R12, (AX)
+ MOVQ DX, -8(AX)(R9*1)
+ JMP memmove_end_copy_lz4_s2
+
+emit_lit_memmove_lz4_s2_memmove_move_17through32:
+ MOVOU (DX), X0 // first 16 bytes
+ MOVOU -16(DX)(R9*1), X1 // last 16 bytes (overlaps the first when length is below 32)
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_lz4_s2
+
+emit_lit_memmove_lz4_s2_memmove_move_33through64:
+ MOVOU (DX), X0 // first 32 bytes in X0,X1
+ MOVOU 16(DX), X1
+ MOVOU -32(DX)(R9*1), X2 // last 32 bytes in X2,X3
+ MOVOU -16(DX)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_lz4_s2:
+ MOVQ R11, AX // advance dst by the exact literal length
+ JMP lz4_s2_lits_emit_done
+
+memmove_long_lz4_s2:
+ LEAQ (AX)(R9*1), R11 // R11 = dst pointer after the literals
+
+ // genMemMoveLong: reached only when R9 is at least 65; head and tail are preloaded, the middle is copied in 32 byte steps
+ MOVOU (DX), X0 // X0,X1 = first 32 literal bytes
+ MOVOU 16(DX), X1
+ MOVOU -32(DX)(R9*1), X2 // X2,X3 = last 32 literal bytes
+ MOVOU -16(DX)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13 // R13 = length / 32
+ MOVQ AX, R12
+ ANDL $0x0000001f, R12 // R12 = low 5 bits of the dst address
+ MOVQ $0x00000040, R14
+ SUBQ R12, R14 // R14 = 64 - (AX mod 32), so AX+R14-32 is 32 byte aligned
+ DECQ R13 // DECQ leaves CF untouched: the JA below sees CF from the SUBQ and ZF from this decrement
+ JA emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32
+ LEAQ -32(DX)(R14*1), R12
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_lz4_s2large_big_loop_back:
+ MOVOU (R12), X4
+ MOVOU 16(R12), X5
+ MOVOA X4, (R15) // aligned stores: R15 starts at the 32 byte aligned address AX+R14-32
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R12
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_lz4_s2large_big_loop_back
+
+emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32:
+ MOVOU -32(DX)(R14*1), X4 // unaligned loads from src
+ MOVOU -16(DX)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1) // aligned stores to dst
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32 // continue while R14 does not exceed the length
+ MOVOU X0, (AX) // finally write the preloaded head and tail, covering the unaligned edges
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ R11, AX
+
+lz4_s2_lits_emit_done:
+ MOVQ R8, DX // DX = first src byte after the literals
+
+lz4_s2_lits_done:
+ CMPQ DX, BX
+ JNE lz4_s2_match
+ CMPQ R10, $0x04
+ JEQ lz4_s2_done // input ends right after the literals and the match nibble was 0: clean end
+ JMP lz4_s2_corrupt // input ends but the token announced a match
+
+lz4_s2_match:
+ LEAQ 2(DX), R8
+ CMPQ R8, BX
+ JAE lz4_s2_corrupt // need the 2 offset bytes plus at least one more input byte
+ MOVWQZX (DX), R9 // R9 = 16 bit match offset
+ MOVQ R8, DX
+ TESTQ R9, R9
+ JZ lz4_s2_corrupt // offset 0 is rejected
+ CMPQ R9, SI
+ JA lz4_s2_corrupt // offset points before the start of the output produced so far
+ CMPQ R10, $0x13
+ JNE lz4_s2_ml_done // match nibble below 15 (15+4 = 0x13): no extension bytes
+
+lz4_s2_ml_loop:
+ MOVBQZX (DX), R8
+ INCQ DX
+ ADDQ R8, R10 // accumulate extension byte into the match length
+ CMPQ DX, BX
+ JAE lz4_s2_corrupt
+ CMPQ R8, $0xff
+ JEQ lz4_s2_ml_loop // 0xff means one more extension byte follows
+
+lz4_s2_ml_done:
+ ADDQ R10, SI
+ CMPQ R9, DI
+ JNE lz4_s2_docopy // offset differs from the previous match: emit a copy
+
+ // emitRepeat: same offset as the previous match; R10 = match length, R9 = offset
+emit_repeat_again_lz4_s2:
+ MOVL R10, R8 // R8 = length, R10 = length - 4
+ LEAL -4(R10), R10
+ CMPL R8, $0x08
+ JLE repeat_two_lz4_s2
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_lz4_s2
+ CMPL R9, $0x00000800
+ JLT repeat_two_offset_lz4_s2 // length 9..11 and offset below 2048
+
+cant_repeat_two_offset_lz4_s2:
+ CMPL R10, $0x00000104
+ JLT repeat_three_lz4_s2
+ CMPL R10, $0x00010100
+ JLT repeat_four_lz4_s2
+ CMPL R10, $0x0100ffff
+ JLT repeat_five_lz4_s2
+ LEAL -16842747(R10), R10 // too long for one tag: take 16842747 (0x0100fffb) off and emit bytes 1d 00 fb ff ff
+ MOVL $0xfffb001d, (AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_lz4_s2 // encode the remainder
+
+repeat_five_lz4_s2:
+ LEAL -65536(R10), R10
+ MOVL R10, R9
+ MOVW $0x001d, (AX) // 5 byte repeat: 1d 00 then 24 bits of (length - 4 - 65536)
+ MOVW R10, 2(AX)
+ SARL $0x10, R9
+ MOVB R9, 4(AX)
+ ADDQ $0x05, AX
+ JMP lz4_s2_loop
+
+repeat_four_lz4_s2:
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX) // 4 byte repeat: 19 00 then 16 bits of (length - 4 - 256)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP lz4_s2_loop
+
+repeat_three_lz4_s2:
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX) // 3 byte repeat: 15 00 then 8 bits of (length - 8)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP lz4_s2_loop
+
+repeat_two_lz4_s2:
+ SHLL $0x02, R10 // 2 byte repeat: ((length-4) shifted left by 2) OR 1, second byte 0 for lengths up to 8
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP lz4_s2_loop
+
+repeat_two_offset_lz4_s2:
+ XORQ R8, R8
+ LEAL 1(R8)(R10*4), R10 // R10 = 1 + 4*(length-4)
+ MOVB R9, 1(AX) // second byte = low 8 bits of the offset
+ SARL $0x08, R9
+ SHLL $0x05, R9 // offset bits 8..10 go into tag bits 5..7
+ ORL R9, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP lz4_s2_loop
+
+lz4_s2_docopy:
+ MOVQ R9, DI // remember this offset for repeat detection on the next match
+
+ // emitCopy: R10 = match length, R9 = offset
+ CMPL R10, $0x40
+ JLE two_byte_offset_short_lz4_s2 // length up to 64 fits one copy tag
+ CMPL R9, $0x00000800
+ JAE long_offset_short_lz4_s2
+ MOVL $0x00000001, R8
+ LEAL 16(R8), R8 // R8 = 17: tag of a 2 byte copy with length 8
+ MOVB R9, 1(AX)
+ MOVL R9, R11
+ SHRL $0x08, R11
+ SHLL $0x05, R11 // offset bits 8..10 go into tag bits 5..7
+ ORL R11, R8
+ MOVB R8, (AX)
+ ADDQ $0x02, AX
+ SUBL $0x08, R10 // 8 bytes of the match are covered by the copy just written
+
+ // emitRepeat for the remainder; R10 is pre-decremented by 4 and the short-repeat checks are skipped
+ LEAL -4(R10), R10
+ JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
+
+emit_repeat_again_lz4_s2_emit_copy_short_2b:
+ MOVL R10, R8
+ LEAL -4(R10), R10
+ CMPL R8, $0x08
+ JLE repeat_two_lz4_s2_emit_copy_short_2b
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
+ CMPL R9, $0x00000800
+ JLT repeat_two_offset_lz4_s2_emit_copy_short_2b
+
+cant_repeat_two_offset_lz4_s2_emit_copy_short_2b:
+ CMPL R10, $0x00000104
+ JLT repeat_three_lz4_s2_emit_copy_short_2b
+ CMPL R10, $0x00010100
+ JLT repeat_four_lz4_s2_emit_copy_short_2b
+ CMPL R10, $0x0100ffff
+ JLT repeat_five_lz4_s2_emit_copy_short_2b
+ LEAL -16842747(R10), R10 // maximal 5 byte repeat, then loop for the rest
+ MOVL $0xfffb001d, (AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_lz4_s2_emit_copy_short_2b
+
+repeat_five_lz4_s2_emit_copy_short_2b:
+ LEAL -65536(R10), R10
+ MOVL R10, R9
+ MOVW $0x001d, (AX)
+ MOVW R10, 2(AX)
+ SARL $0x10, R9
+ MOVB R9, 4(AX)
+ ADDQ $0x05, AX
+ JMP lz4_s2_loop
+
+repeat_four_lz4_s2_emit_copy_short_2b:
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP lz4_s2_loop
+
+repeat_three_lz4_s2_emit_copy_short_2b:
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP lz4_s2_loop
+
+repeat_two_lz4_s2_emit_copy_short_2b:
+ SHLL $0x02, R10
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP lz4_s2_loop
+
+repeat_two_offset_lz4_s2_emit_copy_short_2b:
+ XORQ R8, R8
+ LEAL 1(R8)(R10*4), R10
+ MOVB R9, 1(AX)
+ SARL $0x08, R9
+ SHLL $0x05, R9
+ ORL R9, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP lz4_s2_loop
+
+long_offset_short_lz4_s2:
+ MOVB $0xee, (AX) // 3 byte copy tag 0xee followed by the 16 bit offset
+ MOVW R9, 1(AX)
+ LEAL -60(R10), R10 // 60 bytes of the match are accounted to this copy
+ ADDQ $0x03, AX
+
+ // emitRepeat for the remaining match length
+emit_repeat_again_lz4_s2_emit_copy_short:
+ MOVL R10, R8
+ LEAL -4(R10), R10
+ CMPL R8, $0x08
+ JLE repeat_two_lz4_s2_emit_copy_short
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_lz4_s2_emit_copy_short
+ CMPL R9, $0x00000800
+ JLT repeat_two_offset_lz4_s2_emit_copy_short
+
+cant_repeat_two_offset_lz4_s2_emit_copy_short:
+ CMPL R10, $0x00000104
+ JLT repeat_three_lz4_s2_emit_copy_short
+ CMPL R10, $0x00010100
+ JLT repeat_four_lz4_s2_emit_copy_short
+ CMPL R10, $0x0100ffff
+ JLT repeat_five_lz4_s2_emit_copy_short
+ LEAL -16842747(R10), R10 // maximal 5 byte repeat, then loop for the rest
+ MOVL $0xfffb001d, (AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_lz4_s2_emit_copy_short
+
+repeat_five_lz4_s2_emit_copy_short:
+ LEAL -65536(R10), R10
+ MOVL R10, R9
+ MOVW $0x001d, (AX)
+ MOVW R10, 2(AX)
+ SARL $0x10, R9
+ MOVB R9, 4(AX)
+ ADDQ $0x05, AX
+ JMP lz4_s2_loop
+
+repeat_four_lz4_s2_emit_copy_short:
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP lz4_s2_loop
+
+repeat_three_lz4_s2_emit_copy_short:
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP lz4_s2_loop
+
+repeat_two_lz4_s2_emit_copy_short:
+ SHLL $0x02, R10
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP lz4_s2_loop
+
+repeat_two_offset_lz4_s2_emit_copy_short:
+ XORQ R8, R8
+ LEAL 1(R8)(R10*4), R10
+ MOVB R9, 1(AX)
+ SARL $0x08, R9
+ SHLL $0x05, R9
+ ORL R9, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP lz4_s2_loop
+
+two_byte_offset_short_lz4_s2:
+ MOVL R10, R8
+ SHLL $0x02, R8 // R8 = 4*length, base for both tag forms below
+ CMPL R10, $0x0c
+ JGE emit_copy_three_lz4_s2 // length 12 or more needs the 3 byte form
+ CMPL R9, $0x00000800
+ JGE emit_copy_three_lz4_s2 // offset 2048 or more needs the 3 byte form
+ LEAL -15(R8), R8 // 4*length - 15 = 4*(length-4) + 1
+ MOVB R9, 1(AX) // second byte = low 8 bits of the offset
+ SHRL $0x08, R9
+ SHLL $0x05, R9 // offset bits 8..10 go into tag bits 5..7
+ ORL R9, R8
+ MOVB R8, (AX)
+ ADDQ $0x02, AX
+ JMP lz4_s2_loop
+
+emit_copy_three_lz4_s2:
+ LEAL -2(R8), R8 // 4*length - 2 = 4*(length-1) + 2
+ MOVB R8, (AX)
+ MOVW R9, 1(AX) // 16 bit offset
+ ADDQ $0x03, AX
+ JMP lz4_s2_loop
+
+lz4_s2_done:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX // AX = number of bytes written to dst
+ MOVQ SI, uncompressed+48(FP)
+ MOVQ AX, dstUsed+56(FP)
+ RET
+
+lz4_s2_corrupt:
+ XORQ AX, AX
+ LEAQ -1(AX), SI // uncompressed = -1 signals malformed input; dstUsed is not written on this path
+ MOVQ SI, uncompressed+48(FP)
+ RET
+
+lz4_s2_dstfull:
+ XORQ AX, AX
+ LEAQ -2(AX), SI // uncompressed = -2 signals that dst is too small; dstUsed is not written on this path
+ MOVQ SI, uncompressed+48(FP)
+ RET
+
+// func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
+// Requires: SSE2
+TEXT ·cvtLZ4sBlockAsm(SB), NOSPLIT, $0-64 // same structure as cvtLZ4BlockAsm but with a minimum match of 3 and support for sequences that carry no match
+ XORQ SI, SI // SI = running count of uncompressed bytes (returned as uncompressed)
+ MOVQ dst_base+0(FP), AX // AX = dst write pointer
+ MOVQ dst_len+8(FP), CX
+ MOVQ src_base+24(FP), DX // DX = src read pointer
+ MOVQ src_len+32(FP), BX
+ LEAQ (DX)(BX*1), BX // BX = src end (one past the last input byte)
+ LEAQ -10(AX)(CX*1), CX // CX = dst limit: end of dst minus a 10 byte margin
+ XORQ DI, DI // DI = offset of the previous match, 0 until the first match
+
+lz4s_s2_loop:
+ CMPQ DX, BX
+ JAE lz4s_s2_corrupt // input ended where a token was expected
+ CMPQ AX, CX
+ JAE lz4s_s2_dstfull // write pointer reached the dst limit
+ MOVBQZX (DX), R8 // R8 = token byte
+ MOVQ R8, R9
+ MOVQ R8, R10
+ SHRQ $0x04, R9 // R9 = high nibble: literal length
+ ANDQ $0x0f, R10 // R10 = low nibble: match length before the minimum is added
+ CMPQ R8, $0xf0
+ JB lz4s_s2_ll_end // high nibble below 15: no length extension bytes
+
+lz4s_s2_ll_loop:
+ INCQ DX
+ CMPQ DX, BX
+ JAE lz4s_s2_corrupt
+ MOVBQZX (DX), R8
+ ADDQ R8, R9 // accumulate extension byte into the literal length
+ CMPQ R8, $0xff
+ JEQ lz4s_s2_ll_loop // 0xff means one more extension byte follows
+
+lz4s_s2_ll_end:
+ LEAQ (DX)(R9*1), R8 // R8 = DX + literal length (DX still points at the last length byte)
+ ADDQ $0x03, R10 // match length = nibble + 3
+ CMPQ R8, BX
+ JAE lz4s_s2_corrupt // literal run would extend past the end of src
+ INCQ DX // DX = first literal byte
+ INCQ R8 // R8 = first src byte after the literals
+ TESTQ R9, R9
+ JZ lz4s_s2_lits_done // sequence has no literals
+ LEAQ (AX)(R9*1), R11
+ CMPQ R11, CX
+ JAE lz4s_s2_dstfull // literals would cross the dst limit
+ ADDQ R9, SI
+ LEAL -1(R9), R11 // R11 = literal length - 1, the value written into the literal header
+ CMPL R11, $0x3c
+ JLT one_byte_lz4s_s2
+ CMPL R11, $0x00000100
+ JLT two_bytes_lz4s_s2
+ CMPL R11, $0x00010000
+ JLT three_bytes_lz4s_s2
+ CMPL R11, $0x01000000
+ JLT four_bytes_lz4s_s2
+ MOVB $0xfc, (AX) // 5 byte header: 0xfc then the 32 bit length-1
+ MOVL R11, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_lz4s_s2
+
+four_bytes_lz4s_s2:
+ MOVL R11, R12
+ SHRL $0x10, R12 // R12 = bits 16..23 of length-1
+ MOVB $0xf8, (AX) // 4 byte header: 0xf8 then 24 bits of length-1
+ MOVW R11, 1(AX)
+ MOVB R12, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_lz4s_s2
+
+three_bytes_lz4s_s2:
+ MOVB $0xf4, (AX) // 3 byte header: 0xf4 then 16 bits of length-1
+ MOVW R11, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_lz4s_s2
+
+two_bytes_lz4s_s2:
+ MOVB $0xf0, (AX) // 2 byte header: 0xf0 then 8 bits of length-1
+ MOVB R11, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R11, $0x40
+ JL memmove_lz4s_s2 // at most 64 literal bytes: use the short copy
+ JMP memmove_long_lz4s_s2
+
+one_byte_lz4s_s2:
+ SHLB $0x02, R11 // 1 byte header: (length-1) shifted left by 2
+ MOVB R11, (AX)
+ ADDQ $0x01, AX
+
+memmove_lz4s_s2:
+ LEAQ (AX)(R9*1), R11 // R11 = dst pointer after the literals
+
+ // genMemMoveShort: copy R9 (1..64) literal bytes using overlapping fixed-size moves
+ CMPQ R9, $0x08
+ JLE emit_lit_memmove_lz4s_s2_memmove_move_8
+ CMPQ R9, $0x10
+ JBE emit_lit_memmove_lz4s_s2_memmove_move_8through16
+ CMPQ R9, $0x20
+ JBE emit_lit_memmove_lz4s_s2_memmove_move_17through32
+ JMP emit_lit_memmove_lz4s_s2_memmove_move_33through64
+
+emit_lit_memmove_lz4s_s2_memmove_move_8:
+ MOVQ (DX), R12 // always loads 8 bytes; NOTE(review): the bounds check above only covers R9 bytes, confirm src over-read is acceptable
+ MOVQ R12, (AX) // always stores 8 bytes; AX is later set to the exact end (R11)
+ JMP memmove_end_copy_lz4s_s2
+
+emit_lit_memmove_lz4s_s2_memmove_move_8through16:
+ MOVQ (DX), R12
+ MOVQ -8(DX)(R9*1), DX // DX reused as scratch for the last 8 bytes; reloaded from R8 at lz4s_s2_lits_emit_done
+ MOVQ R12, (AX)
+ MOVQ DX, -8(AX)(R9*1)
+ JMP memmove_end_copy_lz4s_s2
+
+emit_lit_memmove_lz4s_s2_memmove_move_17through32:
+ MOVOU (DX), X0 // first 16 bytes
+ MOVOU -16(DX)(R9*1), X1 // last 16 bytes (overlaps the first when length is below 32)
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R9*1)
+ JMP memmove_end_copy_lz4s_s2
+
+emit_lit_memmove_lz4s_s2_memmove_move_33through64:
+ MOVOU (DX), X0 // first 32 bytes in X0,X1
+ MOVOU 16(DX), X1
+ MOVOU -32(DX)(R9*1), X2 // last 32 bytes in X2,X3
+ MOVOU -16(DX)(R9*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+
+memmove_end_copy_lz4s_s2:
+ MOVQ R11, AX // advance dst by the exact literal length
+ JMP lz4s_s2_lits_emit_done
+
+memmove_long_lz4s_s2:
+ LEAQ (AX)(R9*1), R11 // R11 = dst pointer after the literals
+
+ // genMemMoveLong: reached only when R9 is at least 65; head and tail are preloaded, the middle is copied in 32 byte steps
+ MOVOU (DX), X0 // X0,X1 = first 32 literal bytes
+ MOVOU 16(DX), X1
+ MOVOU -32(DX)(R9*1), X2 // X2,X3 = last 32 literal bytes
+ MOVOU -16(DX)(R9*1), X3
+ MOVQ R9, R13
+ SHRQ $0x05, R13 // R13 = length / 32
+ MOVQ AX, R12
+ ANDL $0x0000001f, R12 // R12 = low 5 bits of the dst address
+ MOVQ $0x00000040, R14
+ SUBQ R12, R14 // R14 = 64 - (AX mod 32), so AX+R14-32 is 32 byte aligned
+ DECQ R13 // DECQ leaves CF untouched: the JA below sees CF from the SUBQ and ZF from this decrement
+ JA emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32
+ LEAQ -32(DX)(R14*1), R12
+ LEAQ -32(AX)(R14*1), R15
+
+emit_lit_memmove_long_lz4s_s2large_big_loop_back:
+ MOVOU (R12), X4
+ MOVOU 16(R12), X5
+ MOVOA X4, (R15) // aligned stores: R15 starts at the 32 byte aligned address AX+R14-32
+ MOVOA X5, 16(R15)
+ ADDQ $0x20, R15
+ ADDQ $0x20, R12
+ ADDQ $0x20, R14
+ DECQ R13
+ JNA emit_lit_memmove_long_lz4s_s2large_big_loop_back
+
+emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32:
+ MOVOU -32(DX)(R14*1), X4 // unaligned loads from src
+ MOVOU -16(DX)(R14*1), X5
+ MOVOA X4, -32(AX)(R14*1) // aligned stores to dst
+ MOVOA X5, -16(AX)(R14*1)
+ ADDQ $0x20, R14
+ CMPQ R9, R14
+ JAE emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32 // continue while R14 does not exceed the length
+ MOVOU X0, (AX) // finally write the preloaded head and tail, covering the unaligned edges
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R9*1)
+ MOVOU X3, -16(AX)(R9*1)
+ MOVQ R11, AX
+
+lz4s_s2_lits_emit_done:
+ MOVQ R8, DX // DX = first src byte after the literals
+
+lz4s_s2_lits_done:
+ CMPQ DX, BX
+ JNE lz4s_s2_match
+ CMPQ R10, $0x03
+ JEQ lz4s_s2_done // input ends right after the literals and the match nibble was 0: clean end
+ JMP lz4s_s2_corrupt // input ends but the token announced a match
+
+lz4s_s2_match:
+ CMPQ R10, $0x03
+ JEQ lz4s_s2_loop // match nibble 0: this sequence has no match, continue with the next token
+ LEAQ 2(DX), R8
+ CMPQ R8, BX
+ JAE lz4s_s2_corrupt // need the 2 offset bytes plus at least one more input byte
+ MOVWQZX (DX), R9 // R9 = 16 bit match offset
+ MOVQ R8, DX
+ TESTQ R9, R9
+ JZ lz4s_s2_corrupt // offset 0 is rejected
+ CMPQ R9, SI
+ JA lz4s_s2_corrupt // offset points before the start of the output produced so far
+ CMPQ R10, $0x12
+ JNE lz4s_s2_ml_done // match nibble below 15 (15+3 = 0x12): no extension bytes
+
+lz4s_s2_ml_loop:
+ MOVBQZX (DX), R8
+ INCQ DX
+ ADDQ R8, R10 // accumulate extension byte into the match length
+ CMPQ DX, BX
+ JAE lz4s_s2_corrupt
+ CMPQ R8, $0xff
+ JEQ lz4s_s2_ml_loop // 0xff means one more extension byte follows
+
+lz4s_s2_ml_done:
+ ADDQ R10, SI
+ CMPQ R9, DI
+ JNE lz4s_s2_docopy // offset differs from the previous match: emit a copy
+
+ // emitRepeat: same offset as the previous match; R10 = match length, R9 = offset
+emit_repeat_again_lz4_s2:
+ MOVL R10, R8 // R8 = length, R10 = length - 4
+ LEAL -4(R10), R10
+ CMPL R8, $0x08
+ JLE repeat_two_lz4_s2
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_lz4_s2
+ CMPL R9, $0x00000800
+ JLT repeat_two_offset_lz4_s2 // length 9..11 and offset below 2048
+
+cant_repeat_two_offset_lz4_s2:
+ CMPL R10, $0x00000104
+ JLT repeat_three_lz4_s2
+ CMPL R10, $0x00010100
+ JLT repeat_four_lz4_s2
+ CMPL R10, $0x0100ffff
+ JLT repeat_five_lz4_s2
+ LEAL -16842747(R10), R10 // too long for one tag: take 16842747 (0x0100fffb) off and emit bytes 1d 00 fb ff ff
+ MOVL $0xfffb001d, (AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_lz4_s2 // encode the remainder
+
+repeat_five_lz4_s2:
+ LEAL -65536(R10), R10
+ MOVL R10, R9
+ MOVW $0x001d, (AX) // 5 byte repeat: 1d 00 then 24 bits of (length - 4 - 65536)
+ MOVW R10, 2(AX)
+ SARL $0x10, R9
+ MOVB R9, 4(AX)
+ ADDQ $0x05, AX
+ JMP lz4s_s2_loop
+
+repeat_four_lz4_s2:
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX) // 4 byte repeat: 19 00 then 16 bits of (length - 4 - 256)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP lz4s_s2_loop
+
+repeat_three_lz4_s2:
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX) // 3 byte repeat: 15 00 then 8 bits of (length - 8)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP lz4s_s2_loop
+
+repeat_two_lz4_s2:
+ SHLL $0x02, R10 // 2 byte repeat: ((length-4) shifted left by 2) OR 1
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP lz4s_s2_loop
+
+repeat_two_offset_lz4_s2:
+ XORQ R8, R8
+ LEAL 1(R8)(R10*4), R10 // R10 = 1 + 4*(length-4)
+ MOVB R9, 1(AX) // second byte = low 8 bits of the offset
+ SARL $0x08, R9
+ SHLL $0x05, R9 // offset bits 8..10 go into tag bits 5..7
+ ORL R9, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP lz4s_s2_loop
+
+lz4s_s2_docopy:
+ MOVQ R9, DI // remember this offset for repeat detection on the next match
+
+ // emitCopy: R10 = match length, R9 = offset
+ CMPL R10, $0x40
+ JLE two_byte_offset_short_lz4_s2 // length up to 64 fits one copy tag
+ CMPL R9, $0x00000800
+ JAE long_offset_short_lz4_s2
+ MOVL $0x00000001, R8
+ LEAL 16(R8), R8 // R8 = 17: tag of a 2 byte copy with length 8
+ MOVB R9, 1(AX)
+ MOVL R9, R11
+ SHRL $0x08, R11
+ SHLL $0x05, R11 // offset bits 8..10 go into tag bits 5..7
+ ORL R11, R8
+ MOVB R8, (AX)
+ ADDQ $0x02, AX
+ SUBL $0x08, R10 // 8 bytes of the match are covered by the copy just written
+
+ // emitRepeat for the remainder; R10 is pre-decremented by 4 and the short-repeat checks are skipped
+ LEAL -4(R10), R10
+ JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
+
+emit_repeat_again_lz4_s2_emit_copy_short_2b:
+ MOVL R10, R8
+ LEAL -4(R10), R10
+ CMPL R8, $0x08
+ JLE repeat_two_lz4_s2_emit_copy_short_2b
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
+ CMPL R9, $0x00000800
+ JLT repeat_two_offset_lz4_s2_emit_copy_short_2b
+
+cant_repeat_two_offset_lz4_s2_emit_copy_short_2b:
+ CMPL R10, $0x00000104
+ JLT repeat_three_lz4_s2_emit_copy_short_2b
+ CMPL R10, $0x00010100
+ JLT repeat_four_lz4_s2_emit_copy_short_2b
+ CMPL R10, $0x0100ffff
+ JLT repeat_five_lz4_s2_emit_copy_short_2b
+ LEAL -16842747(R10), R10 // maximal 5 byte repeat, then loop for the rest
+ MOVL $0xfffb001d, (AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_lz4_s2_emit_copy_short_2b
+
+repeat_five_lz4_s2_emit_copy_short_2b:
+ LEAL -65536(R10), R10
+ MOVL R10, R9
+ MOVW $0x001d, (AX)
+ MOVW R10, 2(AX)
+ SARL $0x10, R9
+ MOVB R9, 4(AX)
+ ADDQ $0x05, AX
+ JMP lz4s_s2_loop
+
+repeat_four_lz4_s2_emit_copy_short_2b:
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP lz4s_s2_loop
+
+repeat_three_lz4_s2_emit_copy_short_2b:
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP lz4s_s2_loop
+
+repeat_two_lz4_s2_emit_copy_short_2b:
+ SHLL $0x02, R10
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP lz4s_s2_loop
+
+repeat_two_offset_lz4_s2_emit_copy_short_2b:
+ XORQ R8, R8
+ LEAL 1(R8)(R10*4), R10
+ MOVB R9, 1(AX)
+ SARL $0x08, R9
+ SHLL $0x05, R9
+ ORL R9, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP lz4s_s2_loop
+
+long_offset_short_lz4_s2:
+ MOVB $0xee, (AX) // 3 byte copy tag 0xee followed by the 16 bit offset
+ MOVW R9, 1(AX)
+ LEAL -60(R10), R10 // 60 bytes of the match are accounted to this copy
+ ADDQ $0x03, AX
+
+ // emitRepeat for the remaining match length
+emit_repeat_again_lz4_s2_emit_copy_short:
+ MOVL R10, R8
+ LEAL -4(R10), R10
+ CMPL R8, $0x08
+ JLE repeat_two_lz4_s2_emit_copy_short
+ CMPL R8, $0x0c
+ JGE cant_repeat_two_offset_lz4_s2_emit_copy_short
+ CMPL R9, $0x00000800
+ JLT repeat_two_offset_lz4_s2_emit_copy_short
+
+cant_repeat_two_offset_lz4_s2_emit_copy_short:
+ CMPL R10, $0x00000104
+ JLT repeat_three_lz4_s2_emit_copy_short
+ CMPL R10, $0x00010100
+ JLT repeat_four_lz4_s2_emit_copy_short
+ CMPL R10, $0x0100ffff
+ JLT repeat_five_lz4_s2_emit_copy_short
+ LEAL -16842747(R10), R10 // maximal 5 byte repeat, then loop for the rest
+ MOVL $0xfffb001d, (AX)
+ MOVB $0xff, 4(AX)
+ ADDQ $0x05, AX
+ JMP emit_repeat_again_lz4_s2_emit_copy_short
+
+repeat_five_lz4_s2_emit_copy_short:
+ LEAL -65536(R10), R10
+ MOVL R10, R9
+ MOVW $0x001d, (AX)
+ MOVW R10, 2(AX)
+ SARL $0x10, R9
+ MOVB R9, 4(AX)
+ ADDQ $0x05, AX
+ JMP lz4s_s2_loop
+
+repeat_four_lz4_s2_emit_copy_short:
+ LEAL -256(R10), R10
+ MOVW $0x0019, (AX)
+ MOVW R10, 2(AX)
+ ADDQ $0x04, AX
+ JMP lz4s_s2_loop
+
+repeat_three_lz4_s2_emit_copy_short:
+ LEAL -4(R10), R10
+ MOVW $0x0015, (AX)
+ MOVB R10, 2(AX)
+ ADDQ $0x03, AX
+ JMP lz4s_s2_loop
+
+repeat_two_lz4_s2_emit_copy_short:
+ SHLL $0x02, R10
+ ORL $0x01, R10
+ MOVW R10, (AX)
+ ADDQ $0x02, AX
+ JMP lz4s_s2_loop
+
+repeat_two_offset_lz4_s2_emit_copy_short:
+ XORQ R8, R8
+ LEAL 1(R8)(R10*4), R10
+ MOVB R9, 1(AX)
+ SARL $0x08, R9
+ SHLL $0x05, R9
+ ORL R9, R10
+ MOVB R10, (AX)
+ ADDQ $0x02, AX
+ JMP lz4s_s2_loop
+
+two_byte_offset_short_lz4_s2:
+ MOVL R10, R8
+ SHLL $0x02, R8 // R8 = 4*length, base for both tag forms below
+ CMPL R10, $0x0c
+ JGE emit_copy_three_lz4_s2 // length 12 or more needs the 3 byte form
+ CMPL R9, $0x00000800
+ JGE emit_copy_three_lz4_s2 // offset 2048 or more needs the 3 byte form
+ LEAL -15(R8), R8 // 4*length - 15 = 4*(length-4) + 1
+ MOVB R9, 1(AX) // second byte = low 8 bits of the offset
+ SHRL $0x08, R9
+ SHLL $0x05, R9 // offset bits 8..10 go into tag bits 5..7
+ ORL R9, R8
+ MOVB R8, (AX)
+ ADDQ $0x02, AX
+ JMP lz4s_s2_loop
+
+emit_copy_three_lz4_s2:
+ LEAL -2(R8), R8 // 4*length - 2 = 4*(length-1) + 2
+ MOVB R8, (AX)
+ MOVW R9, 1(AX) // 16 bit offset
+ ADDQ $0x03, AX
+ JMP lz4s_s2_loop
+
+lz4s_s2_done:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX // AX = number of bytes written to dst
+ MOVQ SI, uncompressed+48(FP)
+ MOVQ AX, dstUsed+56(FP)
+ RET
+
+lz4s_s2_corrupt:
+ XORQ AX, AX
+ LEAQ -1(AX), SI // uncompressed = -1 signals malformed input; dstUsed is not written on this path
+ MOVQ SI, uncompressed+48(FP)
+ RET
+
+lz4s_s2_dstfull:
+ XORQ AX, AX
+ LEAQ -2(AX), SI // uncompressed = -2 signals that dst is too small; dstUsed is not written on this path
+ MOVQ SI, uncompressed+48(FP)
+ RET
+
+// func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
+// Requires: SSE2
+TEXT ·cvtLZ4BlockSnappyAsm(SB), NOSPLIT, $0-64 // walks the token/literals/offset/match sequences in src and re-emits them into dst as literal and copy tags only (no repeat tags)
+ XORQ SI, SI // SI = running count of uncompressed bytes (returned as uncompressed)
+ MOVQ dst_base+0(FP), AX // AX = dst write pointer
+ MOVQ dst_len+8(FP), CX
+ MOVQ src_base+24(FP), DX // DX = src read pointer
+ MOVQ src_len+32(FP), BX
+ LEAQ (DX)(BX*1), BX // BX = src end (one past the last input byte)
+ LEAQ -10(AX)(CX*1), CX // CX = dst limit: end of dst minus a 10 byte margin
+
+lz4_snappy_loop:
+ CMPQ DX, BX
+ JAE lz4_snappy_corrupt // input ended where a token was expected
+ CMPQ AX, CX
+ JAE lz4_snappy_dstfull // write pointer reached the dst limit
+ MOVBQZX (DX), DI // DI = token byte
+ MOVQ DI, R8
+ MOVQ DI, R9
+ SHRQ $0x04, R8 // R8 = high nibble: literal length
+ ANDQ $0x0f, R9 // R9 = low nibble: match length before the minimum is added
+ CMPQ DI, $0xf0
+ JB lz4_snappy_ll_end // high nibble below 15: no length extension bytes
+
+lz4_snappy_ll_loop:
+ INCQ DX
+ CMPQ DX, BX
+ JAE lz4_snappy_corrupt
+ MOVBQZX (DX), DI
+ ADDQ DI, R8 // accumulate extension byte into the literal length
+ CMPQ DI, $0xff
+ JEQ lz4_snappy_ll_loop // 0xff means one more extension byte follows
+
+lz4_snappy_ll_end:
+ LEAQ (DX)(R8*1), DI // DI = DX + literal length (DX still points at the last length byte)
+ ADDQ $0x04, R9 // match length = nibble + 4
+ CMPQ DI, BX
+ JAE lz4_snappy_corrupt // literal run would extend past the end of src
+ INCQ DX // DX = first literal byte
+ INCQ DI // DI = first src byte after the literals
+ TESTQ R8, R8
+ JZ lz4_snappy_lits_done // sequence has no literals
+ LEAQ (AX)(R8*1), R10
+ CMPQ R10, CX
+ JAE lz4_snappy_dstfull // literals would cross the dst limit
+ ADDQ R8, SI
+ LEAL -1(R8), R10 // R10 = literal length - 1, the value written into the literal header
+ CMPL R10, $0x3c
+ JLT one_byte_lz4_snappy
+ CMPL R10, $0x00000100
+ JLT two_bytes_lz4_snappy
+ CMPL R10, $0x00010000
+ JLT three_bytes_lz4_snappy
+ CMPL R10, $0x01000000
+ JLT four_bytes_lz4_snappy
+ MOVB $0xfc, (AX) // 5 byte header: 0xfc then the 32 bit length-1
+ MOVL R10, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_lz4_snappy
+
+four_bytes_lz4_snappy:
+ MOVL R10, R11
+ SHRL $0x10, R11 // R11 = bits 16..23 of length-1
+ MOVB $0xf8, (AX) // 4 byte header: 0xf8 then 24 bits of length-1
+ MOVW R10, 1(AX)
+ MOVB R11, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_lz4_snappy
+
+three_bytes_lz4_snappy:
+ MOVB $0xf4, (AX) // 3 byte header: 0xf4 then 16 bits of length-1
+ MOVW R10, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_lz4_snappy
+
+two_bytes_lz4_snappy:
+ MOVB $0xf0, (AX) // 2 byte header: 0xf0 then 8 bits of length-1
+ MOVB R10, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R10, $0x40
+ JL memmove_lz4_snappy // at most 64 literal bytes: use the short copy
+ JMP memmove_long_lz4_snappy
+
+one_byte_lz4_snappy:
+ SHLB $0x02, R10 // 1 byte header: (length-1) shifted left by 2
+ MOVB R10, (AX)
+ ADDQ $0x01, AX
+
+memmove_lz4_snappy:
+ LEAQ (AX)(R8*1), R10 // R10 = dst pointer after the literals
+
+ // genMemMoveShort: copy R8 (1..64) literal bytes using overlapping fixed-size moves
+ CMPQ R8, $0x08
+ JLE emit_lit_memmove_lz4_snappy_memmove_move_8
+ CMPQ R8, $0x10
+ JBE emit_lit_memmove_lz4_snappy_memmove_move_8through16
+ CMPQ R8, $0x20
+ JBE emit_lit_memmove_lz4_snappy_memmove_move_17through32
+ JMP emit_lit_memmove_lz4_snappy_memmove_move_33through64
+
+emit_lit_memmove_lz4_snappy_memmove_move_8:
+ MOVQ (DX), R11 // always loads 8 bytes; NOTE(review): the bounds check above only covers R8 bytes, confirm src over-read is acceptable
+ MOVQ R11, (AX) // always stores 8 bytes; AX is later set to the exact end (R10)
+ JMP memmove_end_copy_lz4_snappy
+
+emit_lit_memmove_lz4_snappy_memmove_move_8through16:
+ MOVQ (DX), R11
+ MOVQ -8(DX)(R8*1), DX // DX reused as scratch for the last 8 bytes; reloaded from DI at lz4_snappy_lits_emit_done
+ MOVQ R11, (AX)
+ MOVQ DX, -8(AX)(R8*1)
+ JMP memmove_end_copy_lz4_snappy
+
+emit_lit_memmove_lz4_snappy_memmove_move_17through32:
+ MOVOU (DX), X0 // first 16 bytes
+ MOVOU -16(DX)(R8*1), X1 // last 16 bytes (overlaps the first when length is below 32)
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R8*1)
+ JMP memmove_end_copy_lz4_snappy
+
+emit_lit_memmove_lz4_snappy_memmove_move_33through64:
+ MOVOU (DX), X0 // first 32 bytes in X0,X1
+ MOVOU 16(DX), X1
+ MOVOU -32(DX)(R8*1), X2 // last 32 bytes in X2,X3
+ MOVOU -16(DX)(R8*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+
+memmove_end_copy_lz4_snappy:
+ MOVQ R10, AX // advance dst by the exact literal length
+ JMP lz4_snappy_lits_emit_done
+
+memmove_long_lz4_snappy:
+ LEAQ (AX)(R8*1), R10 // R10 = dst pointer after the literals
+
+ // genMemMoveLong: reached only when R8 is at least 65; head and tail are preloaded, the middle is copied in 32 byte steps
+ MOVOU (DX), X0 // X0,X1 = first 32 literal bytes
+ MOVOU 16(DX), X1
+ MOVOU -32(DX)(R8*1), X2 // X2,X3 = last 32 literal bytes
+ MOVOU -16(DX)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12 // R12 = length / 32
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11 // R11 = low 5 bits of the dst address
+ MOVQ $0x00000040, R13
+ SUBQ R11, R13 // R13 = 64 - (AX mod 32), so AX+R13-32 is 32 byte aligned
+ DECQ R12 // DECQ leaves CF untouched: the JA below sees CF from the SUBQ and ZF from this decrement
+ JA emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32
+ LEAQ -32(DX)(R13*1), R11
+ LEAQ -32(AX)(R13*1), R14
+
+emit_lit_memmove_long_lz4_snappylarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R14) // aligned stores: R14 starts at the 32 byte aligned address AX+R13-32
+ MOVOA X5, 16(R14)
+ ADDQ $0x20, R14
+ ADDQ $0x20, R11
+ ADDQ $0x20, R13
+ DECQ R12
+ JNA emit_lit_memmove_long_lz4_snappylarge_big_loop_back
+
+emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32:
+ MOVOU -32(DX)(R13*1), X4 // unaligned loads from src
+ MOVOU -16(DX)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1) // aligned stores to dst
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
+ JAE emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32 // continue while R13 does not exceed the length
+ MOVOU X0, (AX) // finally write the preloaded head and tail, covering the unaligned edges
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ R10, AX
+
+lz4_snappy_lits_emit_done:
+ MOVQ DI, DX // DX = first src byte after the literals
+
+lz4_snappy_lits_done:
+ CMPQ DX, BX
+ JNE lz4_snappy_match
+ CMPQ R9, $0x04
+ JEQ lz4_snappy_done // input ends right after the literals and the match nibble was 0: clean end
+ JMP lz4_snappy_corrupt // input ends but the token announced a match
+
+lz4_snappy_match:
+ LEAQ 2(DX), DI
+ CMPQ DI, BX
+ JAE lz4_snappy_corrupt // need the 2 offset bytes plus at least one more input byte
+ MOVWQZX (DX), R8 // R8 = 16 bit match offset (the literal length is no longer needed)
+ MOVQ DI, DX
+ TESTQ R8, R8
+ JZ lz4_snappy_corrupt // offset 0 is rejected
+ CMPQ R8, SI
+ JA lz4_snappy_corrupt // offset points before the start of the output produced so far
+ CMPQ R9, $0x13
+ JNE lz4_snappy_ml_done // match nibble below 15 (15+4 = 0x13): no extension bytes
+
+lz4_snappy_ml_loop:
+ MOVBQZX (DX), DI
+ INCQ DX
+ ADDQ DI, R9 // accumulate extension byte into the match length
+ CMPQ DX, BX
+ JAE lz4_snappy_corrupt
+ CMPQ DI, $0xff
+ JEQ lz4_snappy_ml_loop // 0xff means one more extension byte follows
+
+lz4_snappy_ml_done:
+ ADDQ R9, SI
+
+ // emitCopy: R9 = match length, R8 = offset; long matches are split into 60 byte copies
+two_byte_offset_lz4_s2:
+ CMPL R9, $0x40
+ JLE two_byte_offset_short_lz4_s2 // 64 or fewer bytes left: finish with a single tag
+ MOVB $0xee, (AX) // 3 byte copy tag 0xee followed by the 16 bit offset
+ MOVW R8, 1(AX)
+ LEAL -60(R9), R9 // 60 bytes of the match are accounted to this copy
+ ADDQ $0x03, AX
+ CMPQ AX, CX
+ JAE lz4_snappy_loop // dst limit reached mid-match: the checks at the loop head pick the exit path
+ JMP two_byte_offset_lz4_s2
+
+two_byte_offset_short_lz4_s2:
+ MOVL R9, DI
+ SHLL $0x02, DI // DI = 4*length, base for both tag forms below
+ CMPL R9, $0x0c
+ JGE emit_copy_three_lz4_s2 // length 12 or more needs the 3 byte form
+ CMPL R8, $0x00000800
+ JGE emit_copy_three_lz4_s2 // offset 2048 or more needs the 3 byte form
+ LEAL -15(DI), DI // 4*length - 15 = 4*(length-4) + 1
+ MOVB R8, 1(AX) // second byte = low 8 bits of the offset
+ SHRL $0x08, R8
+ SHLL $0x05, R8 // offset bits 8..10 go into tag bits 5..7
+ ORL R8, DI
+ MOVB DI, (AX)
+ ADDQ $0x02, AX
+ JMP lz4_snappy_loop
+
+emit_copy_three_lz4_s2:
+ LEAL -2(DI), DI // 4*length - 2 = 4*(length-1) + 2
+ MOVB DI, (AX)
+ MOVW R8, 1(AX) // 16 bit offset
+ ADDQ $0x03, AX
+ JMP lz4_snappy_loop
+
+lz4_snappy_done:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX // AX = number of bytes written to dst
+ MOVQ SI, uncompressed+48(FP)
+ MOVQ AX, dstUsed+56(FP)
+ RET
+
+lz4_snappy_corrupt:
+ XORQ AX, AX
+ LEAQ -1(AX), SI // uncompressed = -1 signals malformed input; dstUsed is not written on this path
+ MOVQ SI, uncompressed+48(FP)
+ RET
+
+lz4_snappy_dstfull:
+ XORQ AX, AX
+ LEAQ -2(AX), SI // uncompressed = -2 signals that dst is too small; dstUsed is not written on this path
+ MOVQ SI, uncompressed+48(FP)
+ RET
+
+// func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
+// Requires: SSE2
+TEXT ·cvtLZ4sBlockSnappyAsm(SB), NOSPLIT, $0-64 // same structure as cvtLZ4BlockSnappyAsm but with a minimum match of 3 and support for sequences that carry no match
+ XORQ SI, SI // SI = running count of uncompressed bytes (returned as uncompressed)
+ MOVQ dst_base+0(FP), AX // AX = dst write pointer
+ MOVQ dst_len+8(FP), CX
+ MOVQ src_base+24(FP), DX // DX = src read pointer
+ MOVQ src_len+32(FP), BX
+ LEAQ (DX)(BX*1), BX // BX = src end (one past the last input byte)
+ LEAQ -10(AX)(CX*1), CX // CX = dst limit: end of dst minus a 10 byte margin
+
+lz4s_snappy_loop:
+ CMPQ DX, BX
+ JAE lz4s_snappy_corrupt // input ended where a token was expected
+ CMPQ AX, CX
+ JAE lz4s_snappy_dstfull // write pointer reached the dst limit
+ MOVBQZX (DX), DI // DI = token byte
+ MOVQ DI, R8
+ MOVQ DI, R9
+ SHRQ $0x04, R8 // R8 = high nibble: literal length
+ ANDQ $0x0f, R9 // R9 = low nibble: match length before the minimum is added
+ CMPQ DI, $0xf0
+ JB lz4s_snappy_ll_end // high nibble below 15: no length extension bytes
+
+lz4s_snappy_ll_loop:
+ INCQ DX
+ CMPQ DX, BX
+ JAE lz4s_snappy_corrupt
+ MOVBQZX (DX), DI
+ ADDQ DI, R8 // accumulate extension byte into the literal length
+ CMPQ DI, $0xff
+ JEQ lz4s_snappy_ll_loop // 0xff means one more extension byte follows
+
+lz4s_snappy_ll_end:
+ LEAQ (DX)(R8*1), DI // DI = DX + literal length (DX still points at the last length byte)
+ ADDQ $0x03, R9 // match length = nibble + 3
+ CMPQ DI, BX
+ JAE lz4s_snappy_corrupt // literal run would extend past the end of src
+ INCQ DX // DX = first literal byte
+ INCQ DI // DI = first src byte after the literals
+ TESTQ R8, R8
+ JZ lz4s_snappy_lits_done // sequence has no literals
+ LEAQ (AX)(R8*1), R10
+ CMPQ R10, CX
+ JAE lz4s_snappy_dstfull // literals would cross the dst limit
+ ADDQ R8, SI
+ LEAL -1(R8), R10 // R10 = literal length - 1, the value written into the literal header
+ CMPL R10, $0x3c
+ JLT one_byte_lz4s_snappy
+ CMPL R10, $0x00000100
+ JLT two_bytes_lz4s_snappy
+ CMPL R10, $0x00010000
+ JLT three_bytes_lz4s_snappy
+ CMPL R10, $0x01000000
+ JLT four_bytes_lz4s_snappy
+ MOVB $0xfc, (AX) // 5 byte header: 0xfc then the 32 bit length-1
+ MOVL R10, 1(AX)
+ ADDQ $0x05, AX
+ JMP memmove_long_lz4s_snappy
+
+four_bytes_lz4s_snappy:
+ MOVL R10, R11
+ SHRL $0x10, R11 // R11 = bits 16..23 of length-1
+ MOVB $0xf8, (AX) // 4 byte header: 0xf8 then 24 bits of length-1
+ MOVW R10, 1(AX)
+ MOVB R11, 3(AX)
+ ADDQ $0x04, AX
+ JMP memmove_long_lz4s_snappy
+
+three_bytes_lz4s_snappy:
+ MOVB $0xf4, (AX) // 3 byte header: 0xf4 then 16 bits of length-1
+ MOVW R10, 1(AX)
+ ADDQ $0x03, AX
+ JMP memmove_long_lz4s_snappy
+
+two_bytes_lz4s_snappy:
+ MOVB $0xf0, (AX) // 2 byte header: 0xf0 then 8 bits of length-1
+ MOVB R10, 1(AX)
+ ADDQ $0x02, AX
+ CMPL R10, $0x40
+ JL memmove_lz4s_snappy // at most 64 literal bytes: use the short copy
+ JMP memmove_long_lz4s_snappy
+
+one_byte_lz4s_snappy:
+ SHLB $0x02, R10 // 1 byte header: (length-1) shifted left by 2
+ MOVB R10, (AX)
+ ADDQ $0x01, AX
+
+memmove_lz4s_snappy:
+ LEAQ (AX)(R8*1), R10 // R10 = dst pointer after the literals
+
+ // genMemMoveShort: copy R8 (1..64) literal bytes using overlapping fixed-size moves
+ CMPQ R8, $0x08
+ JLE emit_lit_memmove_lz4s_snappy_memmove_move_8
+ CMPQ R8, $0x10
+ JBE emit_lit_memmove_lz4s_snappy_memmove_move_8through16
+ CMPQ R8, $0x20
+ JBE emit_lit_memmove_lz4s_snappy_memmove_move_17through32
+ JMP emit_lit_memmove_lz4s_snappy_memmove_move_33through64
+
+emit_lit_memmove_lz4s_snappy_memmove_move_8:
+ MOVQ (DX), R11 // always loads 8 bytes; NOTE(review): the bounds check above only covers R8 bytes, confirm src over-read is acceptable
+ MOVQ R11, (AX) // always stores 8 bytes; AX is later set to the exact end (R10)
+ JMP memmove_end_copy_lz4s_snappy
+
+emit_lit_memmove_lz4s_snappy_memmove_move_8through16:
+ MOVQ (DX), R11
+ MOVQ -8(DX)(R8*1), DX // DX reused as scratch for the last 8 bytes; reloaded from DI at lz4s_snappy_lits_emit_done
+ MOVQ R11, (AX)
+ MOVQ DX, -8(AX)(R8*1)
+ JMP memmove_end_copy_lz4s_snappy
+
+emit_lit_memmove_lz4s_snappy_memmove_move_17through32:
+ MOVOU (DX), X0 // first 16 bytes
+ MOVOU -16(DX)(R8*1), X1 // last 16 bytes (overlaps the first when length is below 32)
+ MOVOU X0, (AX)
+ MOVOU X1, -16(AX)(R8*1)
+ JMP memmove_end_copy_lz4s_snappy
+
+emit_lit_memmove_lz4s_snappy_memmove_move_33through64:
+ MOVOU (DX), X0 // first 32 bytes in X0,X1
+ MOVOU 16(DX), X1
+ MOVOU -32(DX)(R8*1), X2 // last 32 bytes in X2,X3
+ MOVOU -16(DX)(R8*1), X3
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+
+memmove_end_copy_lz4s_snappy:
+ MOVQ R10, AX // advance dst by the exact literal length
+ JMP lz4s_snappy_lits_emit_done
+
+memmove_long_lz4s_snappy:
+ LEAQ (AX)(R8*1), R10 // R10 = dst pointer after the literals
+
+ // genMemMoveLong: reached only when R8 is at least 65; head and tail are preloaded, the middle is copied in 32 byte steps
+ MOVOU (DX), X0 // X0,X1 = first 32 literal bytes
+ MOVOU 16(DX), X1
+ MOVOU -32(DX)(R8*1), X2 // X2,X3 = last 32 literal bytes
+ MOVOU -16(DX)(R8*1), X3
+ MOVQ R8, R12
+ SHRQ $0x05, R12 // R12 = length / 32
+ MOVQ AX, R11
+ ANDL $0x0000001f, R11 // R11 = low 5 bits of the dst address
+ MOVQ $0x00000040, R13
+ SUBQ R11, R13 // R13 = 64 - (AX mod 32), so AX+R13-32 is 32 byte aligned
+ DECQ R12 // DECQ leaves CF untouched: the JA below sees CF from the SUBQ and ZF from this decrement
+ JA emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32
+ LEAQ -32(DX)(R13*1), R11
+ LEAQ -32(AX)(R13*1), R14
+
+emit_lit_memmove_long_lz4s_snappylarge_big_loop_back:
+ MOVOU (R11), X4
+ MOVOU 16(R11), X5
+ MOVOA X4, (R14) // aligned stores: R14 starts at the 32 byte aligned address AX+R13-32
+ MOVOA X5, 16(R14)
+ ADDQ $0x20, R14
+ ADDQ $0x20, R11
+ ADDQ $0x20, R13
+ DECQ R12
+ JNA emit_lit_memmove_long_lz4s_snappylarge_big_loop_back
+
+emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32:
+ MOVOU -32(DX)(R13*1), X4 // unaligned loads from src
+ MOVOU -16(DX)(R13*1), X5
+ MOVOA X4, -32(AX)(R13*1) // aligned stores to dst
+ MOVOA X5, -16(AX)(R13*1)
+ ADDQ $0x20, R13
+ CMPQ R8, R13
+ JAE emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32 // continue while R13 does not exceed the length
+ MOVOU X0, (AX) // finally write the preloaded head and tail, covering the unaligned edges
+ MOVOU X1, 16(AX)
+ MOVOU X2, -32(AX)(R8*1)
+ MOVOU X3, -16(AX)(R8*1)
+ MOVQ R10, AX
+
+lz4s_snappy_lits_emit_done:
+ MOVQ DI, DX // DX = first src byte after the literals
+
+lz4s_snappy_lits_done:
+ CMPQ DX, BX
+ JNE lz4s_snappy_match
+ CMPQ R9, $0x03
+ JEQ lz4s_snappy_done // input ends right after the literals and the match nibble was 0: clean end
+ JMP lz4s_snappy_corrupt // input ends but the token announced a match
+
+lz4s_snappy_match:
+ CMPQ R9, $0x03
+ JEQ lz4s_snappy_loop // match nibble 0: this sequence has no match, continue with the next token
+ LEAQ 2(DX), DI
+ CMPQ DI, BX
+ JAE lz4s_snappy_corrupt // need the 2 offset bytes plus at least one more input byte
+ MOVWQZX (DX), R8 // R8 = 16 bit match offset (the literal length is no longer needed)
+ MOVQ DI, DX
+ TESTQ R8, R8
+ JZ lz4s_snappy_corrupt // offset 0 is rejected
+ CMPQ R8, SI
+ JA lz4s_snappy_corrupt // offset points before the start of the output produced so far
+ CMPQ R9, $0x12
+ JNE lz4s_snappy_ml_done // match nibble below 15 (15+3 = 0x12): no extension bytes
+
+lz4s_snappy_ml_loop:
+ MOVBQZX (DX), DI
+ INCQ DX
+ ADDQ DI, R9 // accumulate extension byte into the match length
+ CMPQ DX, BX
+ JAE lz4s_snappy_corrupt
+ CMPQ DI, $0xff
+ JEQ lz4s_snappy_ml_loop // 0xff means one more extension byte follows
+
+lz4s_snappy_ml_done:
+ ADDQ R9, SI
+
+ // emitCopy: R9 = match length, R8 = offset; long matches are split into 60 byte copies
+two_byte_offset_lz4_s2:
+ CMPL R9, $0x40
+ JLE two_byte_offset_short_lz4_s2 // 64 or fewer bytes left: finish with a single tag
+ MOVB $0xee, (AX) // 3 byte copy tag 0xee followed by the 16 bit offset
+ MOVW R8, 1(AX)
+ LEAL -60(R9), R9 // 60 bytes of the match are accounted to this copy
+ ADDQ $0x03, AX
+ CMPQ AX, CX
+ JAE lz4s_snappy_loop // dst limit reached mid-match: the checks at the loop head pick the exit path
+ JMP two_byte_offset_lz4_s2
+
+two_byte_offset_short_lz4_s2:
+ MOVL R9, DI
+ SHLL $0x02, DI // DI = 4*length, base for both tag forms below
+ CMPL R9, $0x0c
+ JGE emit_copy_three_lz4_s2 // length 12 or more needs the 3 byte form
+ CMPL R8, $0x00000800
+ JGE emit_copy_three_lz4_s2 // offset 2048 or more needs the 3 byte form
+ LEAL -15(DI), DI // 4*length - 15 = 4*(length-4) + 1
+ MOVB R8, 1(AX) // second byte = low 8 bits of the offset
+ SHRL $0x08, R8
+ SHLL $0x05, R8 // offset bits 8..10 go into tag bits 5..7
+ ORL R8, DI
+ MOVB DI, (AX)
+ ADDQ $0x02, AX
+ JMP lz4s_snappy_loop
+
+emit_copy_three_lz4_s2:
+ LEAL -2(DI), DI // 4*length - 2 = 4*(length-1) + 2
+ MOVB DI, (AX)
+ MOVW R8, 1(AX) // 16 bit offset
+ ADDQ $0x03, AX
+ JMP lz4s_snappy_loop
+
+lz4s_snappy_done:
+ MOVQ dst_base+0(FP), CX
+ SUBQ CX, AX // AX = number of bytes written to dst
+ MOVQ SI, uncompressed+48(FP)
+ MOVQ AX, dstUsed+56(FP)
+ RET
+
+lz4s_snappy_corrupt:
+ XORQ AX, AX
+ LEAQ -1(AX), SI // uncompressed = -1 signals malformed input; dstUsed is not written on this path
+ MOVQ SI, uncompressed+48(FP)
+ RET
+
+lz4s_snappy_dstfull:
+ XORQ AX, AX
+ LEAQ -2(AX), SI // uncompressed = -2 signals that dst is too small; dstUsed is not written on this path
+ MOVQ SI, uncompressed+48(FP)
+ RET