summary refs log tree commit diff
path: root/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s')
-rw-r--r-- vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s 4151
1 files changed, 0 insertions, 4151 deletions
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
deleted file mode 100644
index f5591fa1e..000000000
--- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
+++ /dev/null
@@ -1,4151 +0,0 @@
-// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
-
-//go:build !appengine && !noasm && gc && !noasm
-
-// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
-// Requires: CMOV
-TEXT ·sequenceDecs_decode_amd64(SB), $8-32 // 8-byte frame: (SP) caches the bitstream read pointer between iterations; int status is returned in ret+24(FP)
- MOVQ br+8(FP), CX // CX = br
- MOVQ 24(CX), DX // DX = 64-bit bit container (br+24)
- MOVBQZX 32(CX), BX // BX = number of bits already consumed from DX (byte at br+32)
- MOVQ (CX), AX // AX = base pointer of the input bytes (br+0)
- MOVQ 8(CX), SI // SI = count of input bytes not yet loaded (br+8)
- ADDQ SI, AX // AX = base + remaining; the pointer only moves downwards from here
- MOVQ AX, (SP)
- MOVQ ctx+16(FP), AX
- MOVQ 72(AX), DI // DI = literal-length state word (ctx+72)
- MOVQ 80(AX), R8 // R8 = match-length state word (ctx+80)
- MOVQ 88(AX), R9 // R9 = offset state word (ctx+88)
- MOVQ 104(AX), R10 // R10 = output record pointer (ctx+104); each record is 24 bytes: ll at 0, ml at 8, offset at 16
- MOVQ s+0(FP), AX
- MOVQ 144(AX), R11 // R11, R12, R13 = the three remembered offsets (s+144, s+152, s+160), most recent in R11
- MOVQ 152(AX), R12
- MOVQ 160(AX), R13
-
-sequenceDecs_decode_amd64_main_loop:
- MOVQ (SP), R14 // R14 = current read pointer
-
- // Fill bitreader to have enough for the offset and match length.
- CMPQ SI, $0x08
- JL sequenceDecs_decode_amd64_fill_byte_by_byte // fewer than 8 bytes left: refill one byte at a time
- MOVQ BX, AX
- SHRQ $0x03, AX // AX = whole bytes already consumed
- SUBQ AX, R14 // step the pointer back by that many bytes
- MOVQ (R14), DX // reload a full 8-byte container
- SUBQ AX, SI
- ANDQ $0x07, BX // only the sub-byte bit offset remains consumed
- JMP sequenceDecs_decode_amd64_fill_end
-
-sequenceDecs_decode_amd64_fill_byte_by_byte:
- CMPQ SI, $0x00
- JLE sequenceDecs_decode_amd64_fill_check_overread // no input bytes left
- CMPQ BX, $0x07
- JLE sequenceDecs_decode_amd64_fill_end // at most 7 bits consumed: no whole byte to replace
- SHLQ $0x08, DX // make room for one byte at the low end
- SUBQ $0x01, R14
- SUBQ $0x01, SI
- SUBQ $0x08, BX
- MOVBQZX (R14), AX
- ORQ AX, DX
- JMP sequenceDecs_decode_amd64_fill_byte_by_byte
-
-sequenceDecs_decode_amd64_fill_check_overread:
- CMPQ BX, $0x40
- JA error_overread // more than 64 bits consumed and nothing left to load
-
-sequenceDecs_decode_amd64_fill_end:
- // Update offset
- MOVQ R9, AX
- MOVQ BX, CX
- MOVQ DX, R15
- SHLQ CL, R15 // discard the bits that were already consumed
- MOVB AH, CL // CL = number of extra bits to read (byte 1 of the state word)
- SHRQ $0x20, AX // AX = baseline value (bits 32-63 of the state word)
- TESTQ CX, CX
- JZ sequenceDecs_decode_amd64_of_update_zero // skip the read; the value is the baseline alone
- ADDQ CX, BX // account for the bits being read
- CMPQ BX, $0x40
- JA sequenceDecs_decode_amd64_of_update_zero // read would run past the 64-bit container: keep the baseline only
- CMPQ CX, $0x40
- JAE sequenceDecs_decode_amd64_of_update_zero
- NEGQ CX // shift count becomes 64-n (the shift uses CL modulo 64)
- SHRQ CL, R15 // R15 = the n bits just read
- ADDQ R15, AX
-
-sequenceDecs_decode_amd64_of_update_zero:
- MOVQ AX, 16(R10) // record.offset (raw value; adjusted further below)
-
- // Update match length
- MOVQ R8, AX
- MOVQ BX, CX
- MOVQ DX, R15
- SHLQ CL, R15 // discard the bits that were already consumed
- MOVB AH, CL // CL = number of extra bits to read
- SHRQ $0x20, AX // AX = baseline value
- TESTQ CX, CX
- JZ sequenceDecs_decode_amd64_ml_update_zero
- ADDQ CX, BX
- CMPQ BX, $0x40
- JA sequenceDecs_decode_amd64_ml_update_zero
- CMPQ CX, $0x40
- JAE sequenceDecs_decode_amd64_ml_update_zero
- NEGQ CX
- SHRQ CL, R15
- ADDQ R15, AX
-
-sequenceDecs_decode_amd64_ml_update_zero:
- MOVQ AX, 8(R10) // record.ml
-
- // Fill bitreader to have enough for the remaining
- CMPQ SI, $0x08
- JL sequenceDecs_decode_amd64_fill_2_byte_by_byte
- MOVQ BX, AX
- SHRQ $0x03, AX // AX = whole bytes already consumed
- SUBQ AX, R14
- MOVQ (R14), DX
- SUBQ AX, SI
- ANDQ $0x07, BX
- JMP sequenceDecs_decode_amd64_fill_2_end
-
-sequenceDecs_decode_amd64_fill_2_byte_by_byte:
- CMPQ SI, $0x00
- JLE sequenceDecs_decode_amd64_fill_2_check_overread
- CMPQ BX, $0x07
- JLE sequenceDecs_decode_amd64_fill_2_end
- SHLQ $0x08, DX
- SUBQ $0x01, R14
- SUBQ $0x01, SI
- SUBQ $0x08, BX
- MOVBQZX (R14), AX
- ORQ AX, DX
- JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte
-
-sequenceDecs_decode_amd64_fill_2_check_overread:
- CMPQ BX, $0x40
- JA error_overread
-
-sequenceDecs_decode_amd64_fill_2_end:
- // Update literal length
- MOVQ DI, AX
- MOVQ BX, CX
- MOVQ DX, R15
- SHLQ CL, R15
- MOVB AH, CL // CL = number of extra bits to read
- SHRQ $0x20, AX // AX = baseline value
- TESTQ CX, CX
- JZ sequenceDecs_decode_amd64_ll_update_zero
- ADDQ CX, BX
- CMPQ BX, $0x40
- JA sequenceDecs_decode_amd64_ll_update_zero
- CMPQ CX, $0x40
- JAE sequenceDecs_decode_amd64_ll_update_zero
- NEGQ CX
- SHRQ CL, R15
- ADDQ R15, AX
-
-sequenceDecs_decode_amd64_ll_update_zero:
- MOVQ AX, (R10) // record.ll
-
- // Fill bitreader for state updates
- MOVQ R14, (SP) // save the read pointer for the next iteration
- MOVQ R9, AX
- SHRQ $0x08, AX
- MOVBQZX AL, AX // AX = extra-bit count of the offset state, captured before R9 is replaced
- MOVQ ctx+16(FP), CX
- CMPQ 96(CX), $0x00
- JZ sequenceDecs_decode_amd64_skip_update // counter at ctx+96 is zero: this is the final iteration, so no state update
-
- // Update Literal Length State
- MOVBQZX DI, R14 // R14 = number of state bits to read (byte 0 of the state word)
- SHRL $0x10, DI // DI = next-state base (bits 16-31 of the state word)
- LEAQ (BX)(R14*1), CX
- MOVQ DX, R15
- MOVQ CX, BX // BX = bits consumed after this read
- ROLQ CL, R15 // rotate so the bits just read sit at the low end
- MOVL $0x00000001, BP
- MOVB R14, CL
- SHLL CL, BP
- DECL BP // BP = (1 << n) - 1
- ANDQ BP, R15
- ADDQ R15, DI // DI = index of the next state
-
- // Load ctx.llTable
- MOVQ ctx+16(FP), CX
- MOVQ (CX), CX
- MOVQ (CX)(DI*8), DI // new 8-byte literal-length state word
-
- // Update Match Length State
- MOVBQZX R8, R14
- SHRL $0x10, R8
- LEAQ (BX)(R14*1), CX
- MOVQ DX, R15
- MOVQ CX, BX
- ROLQ CL, R15
- MOVL $0x00000001, BP
- MOVB R14, CL
- SHLL CL, BP
- DECL BP
- ANDQ BP, R15
- ADDQ R15, R8
-
- // Load ctx.mlTable
- MOVQ ctx+16(FP), CX
- MOVQ 24(CX), CX
- MOVQ (CX)(R8*8), R8 // new 8-byte match-length state word
-
- // Update Offset State
- MOVBQZX R9, R14
- SHRL $0x10, R9
- LEAQ (BX)(R14*1), CX
- MOVQ DX, R15
- MOVQ CX, BX
- ROLQ CL, R15
- MOVL $0x00000001, BP
- MOVB R14, CL
- SHLL CL, BP
- DECL BP
- ANDQ BP, R15
- ADDQ R15, R9
-
- // Load ctx.ofTable
- MOVQ ctx+16(FP), CX
- MOVQ 48(CX), CX
- MOVQ (CX)(R9*8), R9 // new 8-byte offset state word
-
-sequenceDecs_decode_amd64_skip_update:
- // Adjust offset
- MOVQ 16(R10), CX // CX = raw offset value stored above
- CMPQ AX, $0x01
- JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0 // at most one extra bit: the value selects a remembered offset
- MOVQ R12, R13 // new explicit offset: shift the remembered offsets down by one slot
- MOVQ R11, R12
- MOVQ CX, R11
- JMP sequenceDecs_decode_amd64_after_adjust
-
-sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
- CMPQ (R10), $0x00000000
- JNE sequenceDecs_decode_amd64_adjust_offset_maybezero
- INCQ CX // literal length is zero: the selector is bumped by one
- JMP sequenceDecs_decode_amd64_adjust_offset_nonzero
-
-sequenceDecs_decode_amd64_adjust_offset_maybezero:
- TESTQ CX, CX
- JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero
- MOVQ R11, CX // selector 0: reuse the most recent offset and leave the remembered offsets untouched
- JMP sequenceDecs_decode_amd64_after_adjust
-
-sequenceDecs_decode_amd64_adjust_offset_nonzero:
- CMPQ CX, $0x01
- JB sequenceDecs_decode_amd64_adjust_zero
- JEQ sequenceDecs_decode_amd64_adjust_one
- CMPQ CX, $0x02
- JA sequenceDecs_decode_amd64_adjust_three
- JMP sequenceDecs_decode_amd64_adjust_two
-
-sequenceDecs_decode_amd64_adjust_zero:
- MOVQ R11, AX // selector 0 -> first remembered offset
- JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
-
-sequenceDecs_decode_amd64_adjust_one:
- MOVQ R12, AX // selector 1 -> second remembered offset
- JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
-
-sequenceDecs_decode_amd64_adjust_two:
- MOVQ R13, AX // selector 2 -> third remembered offset
- JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
-
-sequenceDecs_decode_amd64_adjust_three:
- LEAQ -1(R11), AX // selector above 2 -> first remembered offset minus one
-
-sequenceDecs_decode_amd64_adjust_test_temp_valid:
- TESTQ AX, AX
- JNZ sequenceDecs_decode_amd64_adjust_temp_valid
- MOVQ $0x00000001, AX // a zero candidate is replaced by 1
-
-sequenceDecs_decode_amd64_adjust_temp_valid:
- CMPQ CX, $0x01
- CMOVQNE R12, R13 // unless selector 1 was used, the second offset moves into the third slot
- MOVQ R11, R12
- MOVQ AX, R11 // the chosen offset becomes the most recent one
- MOVQ AX, CX
-
-sequenceDecs_decode_amd64_after_adjust:
- MOVQ CX, 16(R10) // store the final offset in the record
-
- // Check values
- MOVQ 8(R10), AX // AX = ml
- MOVQ (R10), R14 // R14 = ll
- LEAQ (AX)(R14*1), R15
- MOVQ s+0(FP), BP
- ADDQ R15, 256(BP) // add ll+ml to the running total at s+256
- MOVQ ctx+16(FP), R15
- SUBQ R14, 128(R15) // take ll out of the literal budget at ctx+128
- JS error_not_enough_literals // budget went negative
- CMPQ AX, $0x00020002
- JA sequenceDecs_decode_amd64_error_match_len_too_big // ml above 0x20002 is rejected
- TESTQ CX, CX
- JNZ sequenceDecs_decode_amd64_match_len_ofs_ok
- TESTQ AX, AX
- JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch // offset is zero but ml is not
-
-sequenceDecs_decode_amd64_match_len_ofs_ok:
- ADDQ $0x18, R10 // advance to the next 24-byte record
- MOVQ ctx+16(FP), AX
- DECQ 96(AX)
- JNS sequenceDecs_decode_amd64_main_loop // keep going until the counter at ctx+96 drops below zero
- MOVQ s+0(FP), AX
- MOVQ R11, 144(AX) // write the remembered offsets back to s
- MOVQ R12, 152(AX)
- MOVQ R13, 160(AX)
- MOVQ br+8(FP), AX
- MOVQ DX, 24(AX) // write the bit reader state back to br
- MOVB BL, 32(AX)
- MOVQ SI, 8(AX)
-
- // Return success
- MOVQ $0x00000000, ret+24(FP)
- RET
-
- // Return with match length error
-sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
- MOVQ $0x00000001, ret+24(FP)
- RET
-
- // Return with match too long error
-sequenceDecs_decode_amd64_error_match_len_too_big:
- MOVQ $0x00000002, ret+24(FP)
- RET
-
- // Return with match offset too long error (no label precedes this block and it follows a RET, so nothing in this function reaches it)
- MOVQ $0x00000003, ret+24(FP)
- RET
-
- // Return with not enough literals error
-error_not_enough_literals:
- MOVQ $0x00000004, ret+24(FP)
- RET
-
- // Return with overread error
-error_overread:
- MOVQ $0x00000006, ret+24(FP)
- RET
-
-// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
-// Requires: CMOV
-TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32 // 8-byte frame: (SP) caches the bitstream read pointer; this variant refills the bit container once per sequence
- MOVQ br+8(FP), CX // CX = br
- MOVQ 24(CX), DX // DX = 64-bit bit container (br+24)
- MOVBQZX 32(CX), BX // BX = number of bits already consumed from DX (byte at br+32)
- MOVQ (CX), AX // AX = base pointer of the input bytes (br+0)
- MOVQ 8(CX), SI // SI = count of input bytes not yet loaded (br+8)
- ADDQ SI, AX // AX = base + remaining; the pointer only moves downwards from here
- MOVQ AX, (SP)
- MOVQ ctx+16(FP), AX
- MOVQ 72(AX), DI // DI = literal-length state word (ctx+72)
- MOVQ 80(AX), R8 // R8 = match-length state word (ctx+80)
- MOVQ 88(AX), R9 // R9 = offset state word (ctx+88)
- MOVQ 104(AX), R10 // R10 = output record pointer (ctx+104); each record is 24 bytes: ll at 0, ml at 8, offset at 16
- MOVQ s+0(FP), AX
- MOVQ 144(AX), R11 // R11, R12, R13 = the three remembered offsets (s+144, s+152, s+160), most recent in R11
- MOVQ 152(AX), R12
- MOVQ 160(AX), R13
-
-sequenceDecs_decode_56_amd64_main_loop:
- MOVQ (SP), R14 // R14 = current read pointer
-
- // Fill bitreader to have enough for the offset and match length.
- CMPQ SI, $0x08
- JL sequenceDecs_decode_56_amd64_fill_byte_by_byte // fewer than 8 bytes left: refill one byte at a time
- MOVQ BX, AX
- SHRQ $0x03, AX // AX = whole bytes already consumed
- SUBQ AX, R14 // step the pointer back by that many bytes
- MOVQ (R14), DX // reload a full 8-byte container
- SUBQ AX, SI
- ANDQ $0x07, BX // only the sub-byte bit offset remains consumed
- JMP sequenceDecs_decode_56_amd64_fill_end
-
-sequenceDecs_decode_56_amd64_fill_byte_by_byte:
- CMPQ SI, $0x00
- JLE sequenceDecs_decode_56_amd64_fill_check_overread // no input bytes left
- CMPQ BX, $0x07
- JLE sequenceDecs_decode_56_amd64_fill_end // at most 7 bits consumed: no whole byte to replace
- SHLQ $0x08, DX // make room for one byte at the low end
- SUBQ $0x01, R14
- SUBQ $0x01, SI
- SUBQ $0x08, BX
- MOVBQZX (R14), AX
- ORQ AX, DX
- JMP sequenceDecs_decode_56_amd64_fill_byte_by_byte
-
-sequenceDecs_decode_56_amd64_fill_check_overread:
- CMPQ BX, $0x40
- JA error_overread // more than 64 bits consumed and nothing left to load
-
-sequenceDecs_decode_56_amd64_fill_end:
- // Update offset
- MOVQ R9, AX
- MOVQ BX, CX
- MOVQ DX, R15
- SHLQ CL, R15 // discard the bits that were already consumed
- MOVB AH, CL // CL = number of extra bits to read (byte 1 of the state word)
- SHRQ $0x20, AX // AX = baseline value (bits 32-63 of the state word)
- TESTQ CX, CX
- JZ sequenceDecs_decode_56_amd64_of_update_zero // skip the read; the value is the baseline alone
- ADDQ CX, BX // account for the bits being read
- CMPQ BX, $0x40
- JA sequenceDecs_decode_56_amd64_of_update_zero // read would run past the 64-bit container: keep the baseline only
- CMPQ CX, $0x40
- JAE sequenceDecs_decode_56_amd64_of_update_zero
- NEGQ CX // shift count becomes 64-n (the shift uses CL modulo 64)
- SHRQ CL, R15 // R15 = the n bits just read
- ADDQ R15, AX
-
-sequenceDecs_decode_56_amd64_of_update_zero:
- MOVQ AX, 16(R10) // record.offset (raw value; adjusted further below)
-
- // Update match length
- MOVQ R8, AX
- MOVQ BX, CX
- MOVQ DX, R15
- SHLQ CL, R15
- MOVB AH, CL // CL = number of extra bits to read
- SHRQ $0x20, AX // AX = baseline value
- TESTQ CX, CX
- JZ sequenceDecs_decode_56_amd64_ml_update_zero
- ADDQ CX, BX
- CMPQ BX, $0x40
- JA sequenceDecs_decode_56_amd64_ml_update_zero
- CMPQ CX, $0x40
- JAE sequenceDecs_decode_56_amd64_ml_update_zero
- NEGQ CX
- SHRQ CL, R15
- ADDQ R15, AX
-
-sequenceDecs_decode_56_amd64_ml_update_zero:
- MOVQ AX, 8(R10) // record.ml
-
- // Update literal length
- MOVQ DI, AX
- MOVQ BX, CX
- MOVQ DX, R15
- SHLQ CL, R15
- MOVB AH, CL // CL = number of extra bits to read
- SHRQ $0x20, AX // AX = baseline value
- TESTQ CX, CX
- JZ sequenceDecs_decode_56_amd64_ll_update_zero
- ADDQ CX, BX
- CMPQ BX, $0x40
- JA sequenceDecs_decode_56_amd64_ll_update_zero
- CMPQ CX, $0x40
- JAE sequenceDecs_decode_56_amd64_ll_update_zero
- NEGQ CX
- SHRQ CL, R15
- ADDQ R15, AX
-
-sequenceDecs_decode_56_amd64_ll_update_zero:
- MOVQ AX, (R10) // record.ll
-
- // Fill bitreader for state updates
- MOVQ R14, (SP) // save the read pointer for the next iteration
- MOVQ R9, AX
- SHRQ $0x08, AX
- MOVBQZX AL, AX // AX = extra-bit count of the offset state, captured before R9 is replaced
- MOVQ ctx+16(FP), CX
- CMPQ 96(CX), $0x00
- JZ sequenceDecs_decode_56_amd64_skip_update // counter at ctx+96 is zero: this is the final iteration, so no state update
-
- // Update Literal Length State
- MOVBQZX DI, R14 // R14 = number of state bits to read (byte 0 of the state word)
- SHRL $0x10, DI // DI = next-state base (bits 16-31 of the state word)
- LEAQ (BX)(R14*1), CX
- MOVQ DX, R15
- MOVQ CX, BX // BX = bits consumed after this read
- ROLQ CL, R15 // rotate so the bits just read sit at the low end
- MOVL $0x00000001, BP
- MOVB R14, CL
- SHLL CL, BP
- DECL BP // BP = (1 << n) - 1
- ANDQ BP, R15
- ADDQ R15, DI // DI = index of the next state
-
- // Load ctx.llTable
- MOVQ ctx+16(FP), CX
- MOVQ (CX), CX
- MOVQ (CX)(DI*8), DI // new 8-byte literal-length state word
-
- // Update Match Length State
- MOVBQZX R8, R14
- SHRL $0x10, R8
- LEAQ (BX)(R14*1), CX
- MOVQ DX, R15
- MOVQ CX, BX
- ROLQ CL, R15
- MOVL $0x00000001, BP
- MOVB R14, CL
- SHLL CL, BP
- DECL BP
- ANDQ BP, R15
- ADDQ R15, R8
-
- // Load ctx.mlTable
- MOVQ ctx+16(FP), CX
- MOVQ 24(CX), CX
- MOVQ (CX)(R8*8), R8 // new 8-byte match-length state word
-
- // Update Offset State
- MOVBQZX R9, R14
- SHRL $0x10, R9
- LEAQ (BX)(R14*1), CX
- MOVQ DX, R15
- MOVQ CX, BX
- ROLQ CL, R15
- MOVL $0x00000001, BP
- MOVB R14, CL
- SHLL CL, BP
- DECL BP
- ANDQ BP, R15
- ADDQ R15, R9
-
- // Load ctx.ofTable
- MOVQ ctx+16(FP), CX
- MOVQ 48(CX), CX
- MOVQ (CX)(R9*8), R9 // new 8-byte offset state word
-
-sequenceDecs_decode_56_amd64_skip_update:
- // Adjust offset
- MOVQ 16(R10), CX // CX = raw offset value stored above
- CMPQ AX, $0x01
- JBE sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0 // at most one extra bit: the value selects a remembered offset
- MOVQ R12, R13 // new explicit offset: shift the remembered offsets down by one slot
- MOVQ R11, R12
- MOVQ CX, R11
- JMP sequenceDecs_decode_56_amd64_after_adjust
-
-sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
- CMPQ (R10), $0x00000000
- JNE sequenceDecs_decode_56_amd64_adjust_offset_maybezero
- INCQ CX // literal length is zero: the selector is bumped by one
- JMP sequenceDecs_decode_56_amd64_adjust_offset_nonzero
-
-sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
- TESTQ CX, CX
- JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero
- MOVQ R11, CX // selector 0: reuse the most recent offset and leave the remembered offsets untouched
- JMP sequenceDecs_decode_56_amd64_after_adjust
-
-sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
- CMPQ CX, $0x01
- JB sequenceDecs_decode_56_amd64_adjust_zero
- JEQ sequenceDecs_decode_56_amd64_adjust_one
- CMPQ CX, $0x02
- JA sequenceDecs_decode_56_amd64_adjust_three
- JMP sequenceDecs_decode_56_amd64_adjust_two
-
-sequenceDecs_decode_56_amd64_adjust_zero:
- MOVQ R11, AX // selector 0 -> first remembered offset
- JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
-
-sequenceDecs_decode_56_amd64_adjust_one:
- MOVQ R12, AX // selector 1 -> second remembered offset
- JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
-
-sequenceDecs_decode_56_amd64_adjust_two:
- MOVQ R13, AX // selector 2 -> third remembered offset
- JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
-
-sequenceDecs_decode_56_amd64_adjust_three:
- LEAQ -1(R11), AX // selector above 2 -> first remembered offset minus one
-
-sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
- TESTQ AX, AX
- JNZ sequenceDecs_decode_56_amd64_adjust_temp_valid
- MOVQ $0x00000001, AX // a zero candidate is replaced by 1
-
-sequenceDecs_decode_56_amd64_adjust_temp_valid:
- CMPQ CX, $0x01
- CMOVQNE R12, R13 // unless selector 1 was used, the second offset moves into the third slot
- MOVQ R11, R12
- MOVQ AX, R11 // the chosen offset becomes the most recent one
- MOVQ AX, CX
-
-sequenceDecs_decode_56_amd64_after_adjust:
- MOVQ CX, 16(R10) // store the final offset in the record
-
- // Check values
- MOVQ 8(R10), AX // AX = ml
- MOVQ (R10), R14 // R14 = ll
- LEAQ (AX)(R14*1), R15
- MOVQ s+0(FP), BP
- ADDQ R15, 256(BP) // add ll+ml to the running total at s+256
- MOVQ ctx+16(FP), R15
- SUBQ R14, 128(R15) // take ll out of the literal budget at ctx+128
- JS error_not_enough_literals // budget went negative
- CMPQ AX, $0x00020002
- JA sequenceDecs_decode_56_amd64_error_match_len_too_big // ml above 0x20002 is rejected
- TESTQ CX, CX
- JNZ sequenceDecs_decode_56_amd64_match_len_ofs_ok
- TESTQ AX, AX
- JNZ sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch // offset is zero but ml is not
-
-sequenceDecs_decode_56_amd64_match_len_ofs_ok:
- ADDQ $0x18, R10 // advance to the next 24-byte record
- MOVQ ctx+16(FP), AX
- DECQ 96(AX)
- JNS sequenceDecs_decode_56_amd64_main_loop // keep going until the counter at ctx+96 drops below zero
- MOVQ s+0(FP), AX
- MOVQ R11, 144(AX) // write the remembered offsets back to s
- MOVQ R12, 152(AX)
- MOVQ R13, 160(AX)
- MOVQ br+8(FP), AX
- MOVQ DX, 24(AX) // write the bit reader state back to br
- MOVB BL, 32(AX)
- MOVQ SI, 8(AX)
-
- // Return success
- MOVQ $0x00000000, ret+24(FP)
- RET
-
- // Return with match length error
-sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
- MOVQ $0x00000001, ret+24(FP)
- RET
-
- // Return with match too long error
-sequenceDecs_decode_56_amd64_error_match_len_too_big:
- MOVQ $0x00000002, ret+24(FP)
- RET
-
- // Return with match offset too long error (no label precedes this block and it follows a RET, so nothing in this function reaches it)
- MOVQ $0x00000003, ret+24(FP)
- RET
-
- // Return with not enough literals error
-error_not_enough_literals:
- MOVQ $0x00000004, ret+24(FP)
- RET
-
- // Return with overread error
-error_overread:
- MOVQ $0x00000006, ret+24(FP)
- RET
-
-// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
-// Requires: BMI, BMI2, CMOV
-TEXT ·sequenceDecs_decode_bmi2(SB), $8-32 // 8-byte frame: (SP) caches the bitstream read pointer between iterations; int status is returned in ret+24(FP)
- MOVQ br+8(FP), BX // BX = br (reused below for the remaining byte count)
- MOVQ 24(BX), AX // AX = 64-bit bit container (br+24)
- MOVBQZX 32(BX), DX // DX = number of bits already consumed from AX (byte at br+32)
- MOVQ (BX), CX // CX = base pointer of the input bytes (br+0)
- MOVQ 8(BX), BX // BX = count of input bytes not yet loaded (br+8)
- ADDQ BX, CX // CX = base + remaining; the pointer only moves downwards from here
- MOVQ CX, (SP)
- MOVQ ctx+16(FP), CX
- MOVQ 72(CX), SI // SI = literal-length state word (ctx+72)
- MOVQ 80(CX), DI // DI = match-length state word (ctx+80)
- MOVQ 88(CX), R8 // R8 = offset state word (ctx+88)
- MOVQ 104(CX), R9 // R9 = output record pointer (ctx+104); each record is 24 bytes: ll at 0, ml at 8, offset at 16
- MOVQ s+0(FP), CX
- MOVQ 144(CX), R10 // R10, R11, R12 = the three remembered offsets (s+144, s+152, s+160), most recent in R10
- MOVQ 152(CX), R11
- MOVQ 160(CX), R12
-
-sequenceDecs_decode_bmi2_main_loop:
- MOVQ (SP), R13 // R13 = current read pointer
-
- // Fill bitreader to have enough for the offset and match length.
- CMPQ BX, $0x08
- JL sequenceDecs_decode_bmi2_fill_byte_by_byte // fewer than 8 bytes left: refill one byte at a time
- MOVQ DX, CX
- SHRQ $0x03, CX // CX = whole bytes already consumed
- SUBQ CX, R13 // step the pointer back by that many bytes
- MOVQ (R13), AX // reload a full 8-byte container
- SUBQ CX, BX
- ANDQ $0x07, DX // only the sub-byte bit offset remains consumed
- JMP sequenceDecs_decode_bmi2_fill_end
-
-sequenceDecs_decode_bmi2_fill_byte_by_byte:
- CMPQ BX, $0x00
- JLE sequenceDecs_decode_bmi2_fill_check_overread // no input bytes left
- CMPQ DX, $0x07
- JLE sequenceDecs_decode_bmi2_fill_end // at most 7 bits consumed: no whole byte to replace
- SHLQ $0x08, AX // make room for one byte at the low end
- SUBQ $0x01, R13
- SUBQ $0x01, BX
- SUBQ $0x08, DX
- MOVBQZX (R13), CX
- ORQ CX, AX
- JMP sequenceDecs_decode_bmi2_fill_byte_by_byte
-
-sequenceDecs_decode_bmi2_fill_check_overread:
- CMPQ DX, $0x40
- JA error_overread // more than 64 bits consumed and nothing left to load
-
-sequenceDecs_decode_bmi2_fill_end:
- // Update offset
- MOVQ $0x00000808, CX // BEXTR control word: start at bit 8, length 8
- BEXTRQ CX, R8, R14 // R14 = number of extra bits to read (byte 1 of the state word)
- MOVQ AX, R15
- LEAQ (DX)(R14*1), CX // CX = bits consumed after this read
- ROLQ CL, R15 // rotate so the bits just read sit at the low end
- BZHIQ R14, R15, R15 // keep only the low R14 bits
- MOVQ CX, DX
- MOVQ R8, CX
- SHRQ $0x20, CX // CX = baseline value (bits 32-63 of the state word)
- ADDQ R15, CX
- MOVQ CX, 16(R9) // record.offset (raw value; adjusted further below)
-
- // Update match length
- MOVQ $0x00000808, CX
- BEXTRQ CX, DI, R14 // R14 = number of extra bits to read
- MOVQ AX, R15
- LEAQ (DX)(R14*1), CX
- ROLQ CL, R15
- BZHIQ R14, R15, R15
- MOVQ CX, DX
- MOVQ DI, CX
- SHRQ $0x20, CX // CX = baseline value
- ADDQ R15, CX
- MOVQ CX, 8(R9) // record.ml
-
- // Fill bitreader to have enough for the remaining
- CMPQ BX, $0x08
- JL sequenceDecs_decode_bmi2_fill_2_byte_by_byte
- MOVQ DX, CX
- SHRQ $0x03, CX // CX = whole bytes already consumed
- SUBQ CX, R13
- MOVQ (R13), AX
- SUBQ CX, BX
- ANDQ $0x07, DX
- JMP sequenceDecs_decode_bmi2_fill_2_end
-
-sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
- CMPQ BX, $0x00
- JLE sequenceDecs_decode_bmi2_fill_2_check_overread
- CMPQ DX, $0x07
- JLE sequenceDecs_decode_bmi2_fill_2_end
- SHLQ $0x08, AX
- SUBQ $0x01, R13
- SUBQ $0x01, BX
- SUBQ $0x08, DX
- MOVBQZX (R13), CX
- ORQ CX, AX
- JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte
-
-sequenceDecs_decode_bmi2_fill_2_check_overread:
- CMPQ DX, $0x40
- JA error_overread
-
-sequenceDecs_decode_bmi2_fill_2_end:
- // Update literal length
- MOVQ $0x00000808, CX
- BEXTRQ CX, SI, R14 // R14 = number of extra bits to read
- MOVQ AX, R15
- LEAQ (DX)(R14*1), CX
- ROLQ CL, R15
- BZHIQ R14, R15, R15
- MOVQ CX, DX
- MOVQ SI, CX
- SHRQ $0x20, CX // CX = baseline value
- ADDQ R15, CX
- MOVQ CX, (R9) // record.ll
-
- // Fill bitreader for state updates
- MOVQ R13, (SP) // save the read pointer for the next iteration; R13 is reused below
- MOVQ $0x00000808, CX
- BEXTRQ CX, R8, R13 // R13 = extra-bit count of the offset state, captured before R8 is replaced
- MOVQ ctx+16(FP), CX
- CMPQ 96(CX), $0x00
- JZ sequenceDecs_decode_bmi2_skip_update // counter at ctx+96 is zero: this is the final iteration, so no state update
- LEAQ (SI)(DI*1), R14
- ADDQ R8, R14
- MOVBQZX R14, R14 // R14 = low byte of the sum of the three state words, i.e. the three state-bit counts added together
- LEAQ (DX)(R14*1), CX
- MOVQ AX, R15
- MOVQ CX, DX // DX = bits consumed after reading all three state updates at once
- ROLQ CL, R15
- BZHIQ R14, R15, R15 // R15 = the combined state bits, split up below
-
- // Update Offset State
- BZHIQ R8, R15, CX // CX = lowest bits; the count is the low byte of R8
- SHRXQ R8, R15, R15 // drop those bits from the combined value
- SHRL $0x10, R8 // R8 = next-state base (bits 16-31 of the state word)
- ADDQ CX, R8 // R8 = index of the next state
-
- // Load ctx.ofTable
- MOVQ ctx+16(FP), CX
- MOVQ 48(CX), CX
- MOVQ (CX)(R8*8), R8 // new 8-byte offset state word
-
- // Update Match Length State
- BZHIQ DI, R15, CX
- SHRXQ DI, R15, R15
- SHRL $0x10, DI
- ADDQ CX, DI
-
- // Load ctx.mlTable
- MOVQ ctx+16(FP), CX
- MOVQ 24(CX), CX
- MOVQ (CX)(DI*8), DI // new 8-byte match-length state word
-
- // Update Literal Length State
- BZHIQ SI, R15, CX
- SHRL $0x10, SI
- ADDQ CX, SI
-
- // Load ctx.llTable
- MOVQ ctx+16(FP), CX
- MOVQ (CX), CX
- MOVQ (CX)(SI*8), SI // new 8-byte literal-length state word
-
-sequenceDecs_decode_bmi2_skip_update:
- // Adjust offset
- MOVQ 16(R9), CX // CX = raw offset value stored above
- CMPQ R13, $0x01
- JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0 // at most one extra bit: the value selects a remembered offset
- MOVQ R11, R12 // new explicit offset: shift the remembered offsets down by one slot
- MOVQ R10, R11
- MOVQ CX, R10
- JMP sequenceDecs_decode_bmi2_after_adjust
-
-sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
- CMPQ (R9), $0x00000000
- JNE sequenceDecs_decode_bmi2_adjust_offset_maybezero
- INCQ CX // literal length is zero: the selector is bumped by one
- JMP sequenceDecs_decode_bmi2_adjust_offset_nonzero
-
-sequenceDecs_decode_bmi2_adjust_offset_maybezero:
- TESTQ CX, CX
- JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero
- MOVQ R10, CX // selector 0: reuse the most recent offset and leave the remembered offsets untouched
- JMP sequenceDecs_decode_bmi2_after_adjust
-
-sequenceDecs_decode_bmi2_adjust_offset_nonzero:
- CMPQ CX, $0x01
- JB sequenceDecs_decode_bmi2_adjust_zero
- JEQ sequenceDecs_decode_bmi2_adjust_one
- CMPQ CX, $0x02
- JA sequenceDecs_decode_bmi2_adjust_three
- JMP sequenceDecs_decode_bmi2_adjust_two
-
-sequenceDecs_decode_bmi2_adjust_zero:
- MOVQ R10, R13 // selector 0 -> first remembered offset
- JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
-
-sequenceDecs_decode_bmi2_adjust_one:
- MOVQ R11, R13 // selector 1 -> second remembered offset
- JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
-
-sequenceDecs_decode_bmi2_adjust_two:
- MOVQ R12, R13 // selector 2 -> third remembered offset
- JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
-
-sequenceDecs_decode_bmi2_adjust_three:
- LEAQ -1(R10), R13 // selector above 2 -> first remembered offset minus one
-
-sequenceDecs_decode_bmi2_adjust_test_temp_valid:
- TESTQ R13, R13
- JNZ sequenceDecs_decode_bmi2_adjust_temp_valid
- MOVQ $0x00000001, R13 // a zero candidate is replaced by 1
-
-sequenceDecs_decode_bmi2_adjust_temp_valid:
- CMPQ CX, $0x01
- CMOVQNE R11, R12 // unless selector 1 was used, the second offset moves into the third slot
- MOVQ R10, R11
- MOVQ R13, R10 // the chosen offset becomes the most recent one
- MOVQ R13, CX
-
-sequenceDecs_decode_bmi2_after_adjust:
- MOVQ CX, 16(R9) // store the final offset in the record
-
- // Check values
- MOVQ 8(R9), R13 // R13 = ml
- MOVQ (R9), R14 // R14 = ll
- LEAQ (R13)(R14*1), R15
- MOVQ s+0(FP), BP
- ADDQ R15, 256(BP) // add ll+ml to the running total at s+256
- MOVQ ctx+16(FP), R15
- SUBQ R14, 128(R15) // take ll out of the literal budget at ctx+128
- JS error_not_enough_literals // budget went negative
- CMPQ R13, $0x00020002
- JA sequenceDecs_decode_bmi2_error_match_len_too_big // ml above 0x20002 is rejected
- TESTQ CX, CX
- JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok
- TESTQ R13, R13
- JNZ sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch // offset is zero but ml is not
-
-sequenceDecs_decode_bmi2_match_len_ofs_ok:
- ADDQ $0x18, R9 // advance to the next 24-byte record
- MOVQ ctx+16(FP), CX
- DECQ 96(CX)
- JNS sequenceDecs_decode_bmi2_main_loop // keep going until the counter at ctx+96 drops below zero
- MOVQ s+0(FP), CX
- MOVQ R10, 144(CX) // write the remembered offsets back to s
- MOVQ R11, 152(CX)
- MOVQ R12, 160(CX)
- MOVQ br+8(FP), CX
- MOVQ AX, 24(CX) // write the bit reader state back to br
- MOVB DL, 32(CX)
- MOVQ BX, 8(CX)
-
- // Return success
- MOVQ $0x00000000, ret+24(FP)
- RET
-
- // Return with match length error
-sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
- MOVQ $0x00000001, ret+24(FP)
- RET
-
- // Return with match too long error
-sequenceDecs_decode_bmi2_error_match_len_too_big:
- MOVQ $0x00000002, ret+24(FP)
- RET
-
- // Return with match offset too long error (no label precedes this block and it follows a RET, so nothing in this function reaches it)
- MOVQ $0x00000003, ret+24(FP)
- RET
-
- // Return with not enough literals error
-error_not_enough_literals:
- MOVQ $0x00000004, ret+24(FP)
- RET
-
- // Return with overread error
-error_overread:
- MOVQ $0x00000006, ret+24(FP)
- RET
-
-// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
-// Requires: BMI, BMI2, CMOV
-TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32 // 8-byte frame: (SP) caches the bitstream read pointer; this variant refills the bit container once per sequence
- MOVQ br+8(FP), BX // BX = br (reused below for the remaining byte count)
- MOVQ 24(BX), AX // AX = 64-bit bit container (br+24)
- MOVBQZX 32(BX), DX // DX = number of bits already consumed from AX (byte at br+32)
- MOVQ (BX), CX // CX = base pointer of the input bytes (br+0)
- MOVQ 8(BX), BX // BX = count of input bytes not yet loaded (br+8)
- ADDQ BX, CX // CX = base + remaining; the pointer only moves downwards from here
- MOVQ CX, (SP)
- MOVQ ctx+16(FP), CX
- MOVQ 72(CX), SI // SI = literal-length state word (ctx+72)
- MOVQ 80(CX), DI // DI = match-length state word (ctx+80)
- MOVQ 88(CX), R8 // R8 = offset state word (ctx+88)
- MOVQ 104(CX), R9 // R9 = output record pointer (ctx+104); each record is 24 bytes: ll at 0, ml at 8, offset at 16
- MOVQ s+0(FP), CX
- MOVQ 144(CX), R10 // R10, R11, R12 = the three remembered offsets (s+144, s+152, s+160), most recent in R10
- MOVQ 152(CX), R11
- MOVQ 160(CX), R12
-
-sequenceDecs_decode_56_bmi2_main_loop:
- MOVQ (SP), R13 // R13 = current read pointer
-
- // Fill bitreader to have enough for the offset and match length.
- CMPQ BX, $0x08
- JL sequenceDecs_decode_56_bmi2_fill_byte_by_byte // fewer than 8 bytes left: refill one byte at a time
- MOVQ DX, CX
- SHRQ $0x03, CX // CX = whole bytes already consumed
- SUBQ CX, R13 // step the pointer back by that many bytes
- MOVQ (R13), AX // reload a full 8-byte container
- SUBQ CX, BX
- ANDQ $0x07, DX // only the sub-byte bit offset remains consumed
- JMP sequenceDecs_decode_56_bmi2_fill_end
-
-sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
- CMPQ BX, $0x00
- JLE sequenceDecs_decode_56_bmi2_fill_check_overread // no input bytes left
- CMPQ DX, $0x07
- JLE sequenceDecs_decode_56_bmi2_fill_end // at most 7 bits consumed: no whole byte to replace
- SHLQ $0x08, AX // make room for one byte at the low end
- SUBQ $0x01, R13
- SUBQ $0x01, BX
- SUBQ $0x08, DX
- MOVBQZX (R13), CX
- ORQ CX, AX
- JMP sequenceDecs_decode_56_bmi2_fill_byte_by_byte
-
-sequenceDecs_decode_56_bmi2_fill_check_overread:
- CMPQ DX, $0x40
- JA error_overread // more than 64 bits consumed and nothing left to load
-
-sequenceDecs_decode_56_bmi2_fill_end:
- // Update offset
- MOVQ $0x00000808, CX // BEXTR control word: start at bit 8, length 8
- BEXTRQ CX, R8, R14 // R14 = number of extra bits to read (byte 1 of the state word)
- MOVQ AX, R15
- LEAQ (DX)(R14*1), CX // CX = bits consumed after this read
- ROLQ CL, R15 // rotate so the bits just read sit at the low end
- BZHIQ R14, R15, R15 // keep only the low R14 bits
- MOVQ CX, DX
- MOVQ R8, CX
- SHRQ $0x20, CX // CX = baseline value (bits 32-63 of the state word)
- ADDQ R15, CX
- MOVQ CX, 16(R9) // record.offset (raw value; adjusted further below)
-
- // Update match length
- MOVQ $0x00000808, CX
- BEXTRQ CX, DI, R14 // R14 = number of extra bits to read
- MOVQ AX, R15
- LEAQ (DX)(R14*1), CX
- ROLQ CL, R15
- BZHIQ R14, R15, R15
- MOVQ CX, DX
- MOVQ DI, CX
- SHRQ $0x20, CX // CX = baseline value
- ADDQ R15, CX
- MOVQ CX, 8(R9) // record.ml
-
- // Update literal length
- MOVQ $0x00000808, CX
- BEXTRQ CX, SI, R14 // R14 = number of extra bits to read
- MOVQ AX, R15
- LEAQ (DX)(R14*1), CX
- ROLQ CL, R15
- BZHIQ R14, R15, R15
- MOVQ CX, DX
- MOVQ SI, CX
- SHRQ $0x20, CX // CX = baseline value
- ADDQ R15, CX
- MOVQ CX, (R9) // record.ll
-
- // Fill bitreader for state updates
- MOVQ R13, (SP) // save the read pointer for the next iteration; R13 is reused below
- MOVQ $0x00000808, CX
- BEXTRQ CX, R8, R13 // R13 = extra-bit count of the offset state, captured before R8 is replaced
- MOVQ ctx+16(FP), CX
- CMPQ 96(CX), $0x00
- JZ sequenceDecs_decode_56_bmi2_skip_update // counter at ctx+96 is zero: this is the final iteration, so no state update
- LEAQ (SI)(DI*1), R14
- ADDQ R8, R14
- MOVBQZX R14, R14 // R14 = low byte of the sum of the three state words, i.e. the three state-bit counts added together
- LEAQ (DX)(R14*1), CX
- MOVQ AX, R15
- MOVQ CX, DX // DX = bits consumed after reading all three state updates at once
- ROLQ CL, R15
- BZHIQ R14, R15, R15 // R15 = the combined state bits, split up below
-
- // Update Offset State
- BZHIQ R8, R15, CX // CX = lowest bits; the count is the low byte of R8
- SHRXQ R8, R15, R15 // drop those bits from the combined value
- SHRL $0x10, R8 // R8 = next-state base (bits 16-31 of the state word)
- ADDQ CX, R8 // R8 = index of the next state
-
- // Load ctx.ofTable
- MOVQ ctx+16(FP), CX
- MOVQ 48(CX), CX
- MOVQ (CX)(R8*8), R8 // new 8-byte offset state word
-
- // Update Match Length State
- BZHIQ DI, R15, CX
- SHRXQ DI, R15, R15
- SHRL $0x10, DI
- ADDQ CX, DI
-
- // Load ctx.mlTable
- MOVQ ctx+16(FP), CX
- MOVQ 24(CX), CX
- MOVQ (CX)(DI*8), DI // new 8-byte match-length state word
-
- // Update Literal Length State
- BZHIQ SI, R15, CX
- SHRL $0x10, SI
- ADDQ CX, SI
-
- // Load ctx.llTable
- MOVQ ctx+16(FP), CX
- MOVQ (CX), CX
- MOVQ (CX)(SI*8), SI // new 8-byte literal-length state word
-
-sequenceDecs_decode_56_bmi2_skip_update:
- // Adjust offset
- MOVQ 16(R9), CX // CX = raw offset value stored above
- CMPQ R13, $0x01
- JBE sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0 // at most one extra bit: the value selects a remembered offset
- MOVQ R11, R12 // new explicit offset: shift the remembered offsets down by one slot
- MOVQ R10, R11
- MOVQ CX, R10
- JMP sequenceDecs_decode_56_bmi2_after_adjust
-
-sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
- CMPQ (R9), $0x00000000
- JNE sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
- INCQ CX // literal length is zero: the selector is bumped by one
- JMP sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
-
-sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
- TESTQ CX, CX
- JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
- MOVQ R10, CX // selector 0: reuse the most recent offset and leave the remembered offsets untouched
- JMP sequenceDecs_decode_56_bmi2_after_adjust
-
-sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
- CMPQ CX, $0x01
- JB sequenceDecs_decode_56_bmi2_adjust_zero
- JEQ sequenceDecs_decode_56_bmi2_adjust_one
- CMPQ CX, $0x02
- JA sequenceDecs_decode_56_bmi2_adjust_three
- JMP sequenceDecs_decode_56_bmi2_adjust_two
-
-sequenceDecs_decode_56_bmi2_adjust_zero:
- MOVQ R10, R13 // selector 0 -> first remembered offset
- JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
-
-sequenceDecs_decode_56_bmi2_adjust_one:
- MOVQ R11, R13 // selector 1 -> second remembered offset
- JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
-
-sequenceDecs_decode_56_bmi2_adjust_two:
- MOVQ R12, R13 // selector 2 -> third remembered offset
- JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
-
-sequenceDecs_decode_56_bmi2_adjust_three:
- LEAQ -1(R10), R13 // selector above 2 -> first remembered offset minus one
-
-sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
- TESTQ R13, R13
- JNZ sequenceDecs_decode_56_bmi2_adjust_temp_valid
- MOVQ $0x00000001, R13 // a zero candidate is replaced by 1
-
-sequenceDecs_decode_56_bmi2_adjust_temp_valid:
- CMPQ CX, $0x01
- CMOVQNE R11, R12 // unless selector 1 was used, the second offset moves into the third slot
- MOVQ R10, R11
- MOVQ R13, R10 // the chosen offset becomes the most recent one
- MOVQ R13, CX
-
-sequenceDecs_decode_56_bmi2_after_adjust:
- MOVQ CX, 16(R9) // store the final offset in the record
-
- // Check values
- MOVQ 8(R9), R13 // R13 = ml
- MOVQ (R9), R14 // R14 = ll
- LEAQ (R13)(R14*1), R15
- MOVQ s+0(FP), BP
- ADDQ R15, 256(BP) // add ll+ml to the running total at s+256
- MOVQ ctx+16(FP), R15
- SUBQ R14, 128(R15) // take ll out of the literal budget at ctx+128
- JS error_not_enough_literals // budget went negative
- CMPQ R13, $0x00020002
- JA sequenceDecs_decode_56_bmi2_error_match_len_too_big // ml above 0x20002 is rejected
- TESTQ CX, CX
- JNZ sequenceDecs_decode_56_bmi2_match_len_ofs_ok
- TESTQ R13, R13
- JNZ sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch // offset is zero but ml is not
-
-sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
- ADDQ $0x18, R9 // advance to the next 24-byte record
- MOVQ ctx+16(FP), CX
- DECQ 96(CX)
- JNS sequenceDecs_decode_56_bmi2_main_loop // keep going until the counter at ctx+96 drops below zero
- MOVQ s+0(FP), CX
- MOVQ R10, 144(CX) // write the remembered offsets back to s
- MOVQ R11, 152(CX)
- MOVQ R12, 160(CX)
- MOVQ br+8(FP), CX
- MOVQ AX, 24(CX) // write the bit reader state back to br
- MOVB DL, 32(CX)
- MOVQ BX, 8(CX)
-
- // Return success
- MOVQ $0x00000000, ret+24(FP)
- RET
-
- // Return with match length error
-sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
- MOVQ $0x00000001, ret+24(FP)
- RET
-
- // Return with match too long error
-sequenceDecs_decode_56_bmi2_error_match_len_too_big:
- MOVQ $0x00000002, ret+24(FP)
- RET
-
- // Return with match offset too long error (no label precedes this block and it follows a RET, so nothing in this function reaches it)
- MOVQ $0x00000003, ret+24(FP)
- RET
-
- // Return with not enough literals error
-error_not_enough_literals:
- MOVQ $0x00000004, ret+24(FP)
- RET
-
- // Return with overread error
-error_overread:
- MOVQ $0x00000006, ret+24(FP)
- RET
-
-// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
-// Requires: SSE
-TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9 // replays 24-byte (ll, ml, mo) records into the output buffer; the bool result is stored in ret+8(FP)
- MOVQ ctx+0(FP), R10
- MOVQ 8(R10), CX // CX = number of sequence records (ctx+8)
- TESTQ CX, CX
- JZ empty_seqs // nothing to do: report success without touching ctx
- MOVQ (R10), AX // AX = base of the sequence records (ctx+0)
- MOVQ 24(R10), DX // DX = index of the next record to run (ctx+24)
- MOVQ 32(R10), BX // BX = output base pointer (ctx+32)
- MOVQ 80(R10), SI // SI = literals read pointer (ctx+80)
- MOVQ 104(R10), DI // DI = output position (ctx+104)
- MOVQ 120(R10), R8 // R8 = window size limit checked against the match offset (ctx+120)
- MOVQ 56(R10), R9 // R9 = history base pointer (ctx+56)
- MOVQ 64(R10), R10 // R10 = history length (ctx+64); the ctx pointer is no longer in a register after this
- ADDQ R10, R9 // R9 = end of the history buffer
-
- // seqsBase += 24 * seqIndex
- LEAQ (DX)(DX*2), R11
- SHLQ $0x03, R11
- ADDQ R11, AX
-
- // outBase += outPosition
- ADDQ DI, BX
-
-main_loop:
- MOVQ (AX), R11 // R11 = ll (literal length)
- MOVQ 16(AX), R12 // R12 = mo (match offset)
- MOVQ 8(AX), R13 // R13 = ml (match length)
-
- // Copy literals
- TESTQ R11, R11
- JZ check_offset
- XORQ R14, R14
-
-copy_1:
- MOVUPS (SI)(R14*1), X0 // copies whole 16-byte chunks, so up to 15 bytes past ll are read and written
- MOVUPS X0, (BX)(R14*1)
- ADDQ $0x10, R14
- CMPQ R14, R11
- JB copy_1
- ADDQ R11, SI // the pointers advance by exactly ll
- ADDQ R11, BX
- ADDQ R11, DI
-
- // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
-check_offset:
- LEAQ (DI)(R10*1), R11 // R11 = output position + history length
- CMPQ R12, R11
- JG error_match_off_too_big
- CMPQ R12, R8
- JG error_match_off_too_big
-
- // Copy match from history
- MOVQ R12, R11
- SUBQ DI, R11 // R11 = mo - output position = bytes of the match that start in history
- JLS copy_match // mo <= output position: the match lies entirely in the current buffer
- MOVQ R9, R14
- SUBQ R11, R14 // R14 = source pointer inside the history buffer
- CMPQ R13, R11
- JG copy_all_from_history // match runs past the end of history into the current buffer
- MOVQ R13, R11
- SUBQ $0x10, R11
- JB copy_4_small // fewer than 16 bytes to copy
-
-copy_4_loop:
- MOVUPS (R14), X0
- MOVUPS X0, (BX)
- ADDQ $0x10, R14
- ADDQ $0x10, BX
- SUBQ $0x10, R11
- JAE copy_4_loop
- LEAQ 16(R14)(R11*1), R14 // rewind both pointers to the exact end of the ml bytes
- LEAQ 16(BX)(R11*1), BX
- MOVUPS -16(R14), X0 // final 16-byte copy overlaps the previous chunk to cover the tail
- MOVUPS X0, -16(BX)
- JMP copy_4_end
-
-copy_4_small: // NOTE(review): lengths below 3 are not special-cased here and take the 4..7 path — presumably ml >= 3 is guaranteed upstream; confirm
- CMPQ R13, $0x03
- JE copy_4_move_3
- CMPQ R13, $0x08
- JB copy_4_move_4through7
- JMP copy_4_move_8through16
-
-copy_4_move_3:
- MOVW (R14), R11
- MOVB 2(R14), R12
- MOVW R11, (BX)
- MOVB R12, 2(BX)
- ADDQ R13, R14
- ADDQ R13, BX
- JMP copy_4_end
-
-copy_4_move_4through7:
- MOVL (R14), R11 // two possibly overlapping 4-byte moves: first four and last four bytes
- MOVL -4(R14)(R13*1), R12
- MOVL R11, (BX)
- MOVL R12, -4(BX)(R13*1)
- ADDQ R13, R14
- ADDQ R13, BX
- JMP copy_4_end
-
-copy_4_move_8through16:
- MOVQ (R14), R11 // two possibly overlapping 8-byte moves: first eight and last eight bytes
- MOVQ -8(R14)(R13*1), R12
- MOVQ R11, (BX)
- MOVQ R12, -8(BX)(R13*1)
- ADDQ R13, R14
- ADDQ R13, BX
-
-copy_4_end:
- ADDQ R13, DI // output position += ml
- ADDQ $0x18, AX // next 24-byte record
- INCQ DX
- CMPQ DX, CX
- JB main_loop
- JMP loop_finished
-
-copy_all_from_history:
- MOVQ R11, R15 // R11 = number of bytes to take from history
- SUBQ $0x10, R15
- JB copy_5_small
-
-copy_5_loop:
- MOVUPS (R14), X0
- MOVUPS X0, (BX)
- ADDQ $0x10, R14
- ADDQ $0x10, BX
- SUBQ $0x10, R15
- JAE copy_5_loop
- LEAQ 16(R14)(R15*1), R14 // rewind both pointers to the exact end of the copied bytes
- LEAQ 16(BX)(R15*1), BX
- MOVUPS -16(R14), X0 // final 16-byte copy overlaps the previous chunk to cover the tail
- MOVUPS X0, -16(BX)
- JMP copy_5_end
-
-copy_5_small:
- CMPQ R11, $0x03
- JE copy_5_move_3
- JB copy_5_move_1or2
- CMPQ R11, $0x08
- JB copy_5_move_4through7
- JMP copy_5_move_8through16
-
-copy_5_move_1or2:
- MOVB (R14), R15 // first and last byte; they coincide when the length is 1
- MOVB -1(R14)(R11*1), BP
- MOVB R15, (BX)
- MOVB BP, -1(BX)(R11*1)
- ADDQ R11, R14
- ADDQ R11, BX
- JMP copy_5_end
-
-copy_5_move_3:
- MOVW (R14), R15
- MOVB 2(R14), BP
- MOVW R15, (BX)
- MOVB BP, 2(BX)
- ADDQ R11, R14
- ADDQ R11, BX
- JMP copy_5_end
-
-copy_5_move_4through7:
- MOVL (R14), R15
- MOVL -4(R14)(R11*1), BP
- MOVL R15, (BX)
- MOVL BP, -4(BX)(R11*1)
- ADDQ R11, R14
- ADDQ R11, BX
- JMP copy_5_end
-
-copy_5_move_8through16:
- MOVQ (R14), R15
- MOVQ -8(R14)(R11*1), BP
- MOVQ R15, (BX)
- MOVQ BP, -8(BX)(R11*1)
- ADDQ R11, R14
- ADDQ R11, BX
-
-copy_5_end:
- ADDQ R11, DI // output position += bytes taken from history
- SUBQ R11, R13 // ml -= bytes taken from history; the rest comes from the current buffer
-
- // Copy match from the current buffer
-copy_match:
- MOVQ BX, R11
- SUBQ R12, R11 // R11 = source pointer = write pointer - mo
-
- // ml <= mo
- CMPQ R13, R12
- JA copy_overlapping_match
-
- // Copy non-overlapping match
- ADDQ R13, DI
- MOVQ BX, R12 // R12 = destination cursor; BX jumps straight to the end of the match
- ADDQ R13, BX
-
-copy_2:
- MOVUPS (R11), X0 // whole 16-byte chunks: up to 15 bytes past ml are read and written
- MOVUPS X0, (R12)
- ADDQ $0x10, R11
- ADDQ $0x10, R12
- SUBQ $0x10, R13
- JHI copy_2
- JMP handle_loop
-
- // Copy overlapping match
-copy_overlapping_match:
- ADDQ R13, DI
-
-copy_slow_3:
- MOVB (R11), R12 // byte-at-a-time so bytes written earlier in this loop can be read back as source
- MOVB R12, (BX)
- INCQ R11
- INCQ BX
- DECQ R13
- JNZ copy_slow_3
-
-handle_loop:
- ADDQ $0x18, AX // next 24-byte record
- INCQ DX
- CMPQ DX, CX
- JB main_loop
-
-loop_finished:
- // Return value
- MOVB $0x01, ret+8(FP)
-
- // Update the context
- MOVQ ctx+0(FP), AX
- MOVQ DX, 24(AX) // ctx+24 = index of the next record
- MOVQ DI, 104(AX) // ctx+104 = output position
- SUBQ 80(AX), SI // SI = literal bytes consumed since entry
- MOVQ SI, 112(AX)
- RET
-
-error_match_off_too_big:
- // Return value
- MOVB $0x00, ret+8(FP)
-
- // Update the context
- MOVQ ctx+0(FP), AX
- MOVQ DX, 24(AX) // ctx+24 = index of the record that failed
- MOVQ DI, 104(AX)
- SUBQ 80(AX), SI
- MOVQ SI, 112(AX)
- RET
-
-empty_seqs:
- // Return value
- MOVB $0x01, ret+8(FP)
- RET
-
-// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
-// Requires: SSE
-TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
- MOVQ ctx+0(FP), R10
- MOVQ 8(R10), CX
- TESTQ CX, CX
- JZ empty_seqs
- MOVQ (R10), AX
- MOVQ 24(R10), DX
- MOVQ 32(R10), BX
- MOVQ 80(R10), SI
- MOVQ 104(R10), DI
- MOVQ 120(R10), R8
- MOVQ 56(R10), R9
- MOVQ 64(R10), R10
- ADDQ R10, R9
-
- // seqsBase += 24 * seqIndex
- LEAQ (DX)(DX*2), R11
- SHLQ $0x03, R11
- ADDQ R11, AX
-
- // outBase += outPosition
- ADDQ DI, BX
-
-main_loop:
- MOVQ (AX), R11
- MOVQ 16(AX), R12
- MOVQ 8(AX), R13
-
- // Copy literals
- TESTQ R11, R11
- JZ check_offset
- MOVQ R11, R14
- SUBQ $0x10, R14
- JB copy_1_small
-
-copy_1_loop:
- MOVUPS (SI), X0
- MOVUPS X0, (BX)
- ADDQ $0x10, SI
- ADDQ $0x10, BX
- SUBQ $0x10, R14
- JAE copy_1_loop
- LEAQ 16(SI)(R14*1), SI
- LEAQ 16(BX)(R14*1), BX
- MOVUPS -16(SI), X0
- MOVUPS X0, -16(BX)
- JMP copy_1_end
-
-copy_1_small:
- CMPQ R11, $0x03
- JE copy_1_move_3
- JB copy_1_move_1or2
- CMPQ R11, $0x08
- JB copy_1_move_4through7
- JMP copy_1_move_8through16
-
-copy_1_move_1or2:
- MOVB (SI), R14
- MOVB -1(SI)(R11*1), R15
- MOVB R14, (BX)
- MOVB R15, -1(BX)(R11*1)
- ADDQ R11, SI
- ADDQ R11, BX
- JMP copy_1_end
-
-copy_1_move_3:
- MOVW (SI), R14
- MOVB 2(SI), R15
- MOVW R14, (BX)
- MOVB R15, 2(BX)
- ADDQ R11, SI
- ADDQ R11, BX
- JMP copy_1_end
-
-copy_1_move_4through7:
- MOVL (SI), R14
- MOVL -4(SI)(R11*1), R15
- MOVL R14, (BX)
- MOVL R15, -4(BX)(R11*1)
- ADDQ R11, SI
- ADDQ R11, BX
- JMP copy_1_end
-
-copy_1_move_8through16:
- MOVQ (SI), R14
- MOVQ -8(SI)(R11*1), R15
- MOVQ R14, (BX)
- MOVQ R15, -8(BX)(R11*1)
- ADDQ R11, SI
- ADDQ R11, BX
-
-copy_1_end:
- ADDQ R11, DI
-
- // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
-check_offset:
- LEAQ (DI)(R10*1), R11
- CMPQ R12, R11
- JG error_match_off_too_big
- CMPQ R12, R8
- JG error_match_off_too_big
-
- // Copy match from history
- MOVQ R12, R11
- SUBQ DI, R11
- JLS copy_match
- MOVQ R9, R14
- SUBQ R11, R14
- CMPQ R13, R11
- JG copy_all_from_history
- MOVQ R13, R11
- SUBQ $0x10, R11
- JB copy_4_small
-
-copy_4_loop:
- MOVUPS (R14), X0
- MOVUPS X0, (BX)
- ADDQ $0x10, R14
- ADDQ $0x10, BX
- SUBQ $0x10, R11
- JAE copy_4_loop
- LEAQ 16(R14)(R11*1), R14
- LEAQ 16(BX)(R11*1), BX
- MOVUPS -16(R14), X0
- MOVUPS X0, -16(BX)
- JMP copy_4_end
-
-copy_4_small:
- CMPQ R13, $0x03
- JE copy_4_move_3
- CMPQ R13, $0x08
- JB copy_4_move_4through7
- JMP copy_4_move_8through16
-
-copy_4_move_3:
- MOVW (R14), R11
- MOVB 2(R14), R12
- MOVW R11, (BX)
- MOVB R12, 2(BX)
- ADDQ R13, R14
- ADDQ R13, BX
- JMP copy_4_end
-
-copy_4_move_4through7:
- MOVL (R14), R11
- MOVL -4(R14)(R13*1), R12
- MOVL R11, (BX)
- MOVL R12, -4(BX)(R13*1)
- ADDQ R13, R14
- ADDQ R13, BX
- JMP copy_4_end
-
-copy_4_move_8through16:
- MOVQ (R14), R11
- MOVQ -8(R14)(R13*1), R12
- MOVQ R11, (BX)
- MOVQ R12, -8(BX)(R13*1)
- ADDQ R13, R14
- ADDQ R13, BX
-
-copy_4_end:
- ADDQ R13, DI
- ADDQ $0x18, AX
- INCQ DX
- CMPQ DX, CX
- JB main_loop
- JMP loop_finished
-
-copy_all_from_history:
- MOVQ R11, R15
- SUBQ $0x10, R15
- JB copy_5_small
-
-copy_5_loop:
- MOVUPS (R14), X0
- MOVUPS X0, (BX)
- ADDQ $0x10, R14
- ADDQ $0x10, BX
- SUBQ $0x10, R15
- JAE copy_5_loop
- LEAQ 16(R14)(R15*1), R14
- LEAQ 16(BX)(R15*1), BX
- MOVUPS -16(R14), X0
- MOVUPS X0, -16(BX)
- JMP copy_5_end
-
-copy_5_small:
- CMPQ R11, $0x03
- JE copy_5_move_3
- JB copy_5_move_1or2
- CMPQ R11, $0x08
- JB copy_5_move_4through7
- JMP copy_5_move_8through16
-
-copy_5_move_1or2:
- MOVB (R14), R15
- MOVB -1(R14)(R11*1), BP
- MOVB R15, (BX)
- MOVB BP, -1(BX)(R11*1)
- ADDQ R11, R14
- ADDQ R11, BX
- JMP copy_5_end
-
-copy_5_move_3:
- MOVW (R14), R15
- MOVB 2(R14), BP
- MOVW R15, (BX)
- MOVB BP, 2(BX)
- ADDQ R11, R14
- ADDQ R11, BX
- JMP copy_5_end
-
-copy_5_move_4through7:
- MOVL (R14), R15
- MOVL -4(R14)(R11*1), BP
- MOVL R15, (BX)
- MOVL BP, -4(BX)(R11*1)
- ADDQ R11, R14
- ADDQ R11, BX
- JMP copy_5_end
-
-copy_5_move_8through16:
- MOVQ (R14), R15
- MOVQ -8(R14)(R11*1), BP
- MOVQ R15, (BX)
- MOVQ BP, -8(BX)(R11*1)
- ADDQ R11, R14
- ADDQ R11, BX
-
-copy_5_end:
- ADDQ R11, DI
- SUBQ R11, R13
-
- // Copy match from the current buffer
-copy_match:
- MOVQ BX, R11
- SUBQ R12, R11
-
- // ml <= mo
- CMPQ R13, R12
- JA copy_overlapping_match
-
- // Copy non-overlapping match
- ADDQ R13, DI
- MOVQ R13, R12
- SUBQ $0x10, R12
- JB copy_2_small
-
-copy_2_loop:
- MOVUPS (R11), X0
- MOVUPS X0, (BX)
- ADDQ $0x10, R11
- ADDQ $0x10, BX
- SUBQ $0x10, R12
- JAE copy_2_loop
- LEAQ 16(R11)(R12*1), R11
- LEAQ 16(BX)(R12*1), BX
- MOVUPS -16(R11), X0
- MOVUPS X0, -16(BX)
- JMP copy_2_end
-
-copy_2_small:
- CMPQ R13, $0x03
- JE copy_2_move_3
- JB copy_2_move_1or2
- CMPQ R13, $0x08
- JB copy_2_move_4through7
- JMP copy_2_move_8through16
-
-copy_2_move_1or2:
- MOVB (R11), R12
- MOVB -1(R11)(R13*1), R14
- MOVB R12, (BX)
- MOVB R14, -1(BX)(R13*1)
- ADDQ R13, R11
- ADDQ R13, BX
- JMP copy_2_end
-
-copy_2_move_3:
- MOVW (R11), R12
- MOVB 2(R11), R14
- MOVW R12, (BX)
- MOVB R14, 2(BX)
- ADDQ R13, R11
- ADDQ R13, BX
- JMP copy_2_end
-
-copy_2_move_4through7:
- MOVL (R11), R12
- MOVL -4(R11)(R13*1), R14
- MOVL R12, (BX)
- MOVL R14, -4(BX)(R13*1)
- ADDQ R13, R11
- ADDQ R13, BX
- JMP copy_2_end
-
-copy_2_move_8through16:
- MOVQ (R11), R12
- MOVQ -8(R11)(R13*1), R14
- MOVQ R12, (BX)
- MOVQ R14, -8(BX)(R13*1)
- ADDQ R13, R11
- ADDQ R13, BX
-
-copy_2_end:
- JMP handle_loop
-
- // Copy overlapping match
-copy_overlapping_match:
- ADDQ R13, DI
-
-copy_slow_3:
- MOVB (R11), R12
- MOVB R12, (BX)
- INCQ R11
- INCQ BX
- DECQ R13
- JNZ copy_slow_3
-
-handle_loop:
- ADDQ $0x18, AX
- INCQ DX
- CMPQ DX, CX
- JB main_loop
-
-loop_finished:
- // Return value
- MOVB $0x01, ret+8(FP)
-
- // Update the context
- MOVQ ctx+0(FP), AX
- MOVQ DX, 24(AX)
- MOVQ DI, 104(AX)
- SUBQ 80(AX), SI
- MOVQ SI, 112(AX)
- RET
-
-error_match_off_too_big:
- // Return value
- MOVB $0x00, ret+8(FP)
-
- // Update the context
- MOVQ ctx+0(FP), AX
- MOVQ DX, 24(AX)
- MOVQ DI, 104(AX)
- SUBQ 80(AX), SI
- MOVQ SI, 112(AX)
- RET
-
-empty_seqs:
- // Return value
- MOVB $0x01, ret+8(FP)
- RET
-
-// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
-// Requires: CMOV, SSE
-TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
- MOVQ br+8(FP), CX
- MOVQ 24(CX), DX
- MOVBQZX 32(CX), BX
- MOVQ (CX), AX
- MOVQ 8(CX), SI
- ADDQ SI, AX
- MOVQ AX, (SP)
- MOVQ ctx+16(FP), AX
- MOVQ 72(AX), DI
- MOVQ 80(AX), R8
- MOVQ 88(AX), R9
- XORQ CX, CX
- MOVQ CX, 8(SP)
- MOVQ CX, 16(SP)
- MOVQ CX, 24(SP)
- MOVQ 112(AX), R10
- MOVQ 128(AX), CX
- MOVQ CX, 32(SP)
- MOVQ 144(AX), R11
- MOVQ 136(AX), R12
- MOVQ 200(AX), CX
- MOVQ CX, 56(SP)
- MOVQ 176(AX), CX
- MOVQ CX, 48(SP)
- MOVQ 184(AX), AX
- MOVQ AX, 40(SP)
- MOVQ 40(SP), AX
- ADDQ AX, 48(SP)
-
- // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
- ADDQ R10, 32(SP)
-
- // outBase += outPosition
- ADDQ R12, R10
-
-sequenceDecs_decodeSync_amd64_main_loop:
- MOVQ (SP), R13
-
- // Fill bitreader to have enough for the offset and match length.
- CMPQ SI, $0x08
- JL sequenceDecs_decodeSync_amd64_fill_byte_by_byte
- MOVQ BX, AX
- SHRQ $0x03, AX
- SUBQ AX, R13
- MOVQ (R13), DX
- SUBQ AX, SI
- ANDQ $0x07, BX
- JMP sequenceDecs_decodeSync_amd64_fill_end
-
-sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
- CMPQ SI, $0x00
- JLE sequenceDecs_decodeSync_amd64_fill_check_overread
- CMPQ BX, $0x07
- JLE sequenceDecs_decodeSync_amd64_fill_end
- SHLQ $0x08, DX
- SUBQ $0x01, R13
- SUBQ $0x01, SI
- SUBQ $0x08, BX
- MOVBQZX (R13), AX
- ORQ AX, DX
- JMP sequenceDecs_decodeSync_amd64_fill_byte_by_byte
-
-sequenceDecs_decodeSync_amd64_fill_check_overread:
- CMPQ BX, $0x40
- JA error_overread
-
-sequenceDecs_decodeSync_amd64_fill_end:
- // Update offset
- MOVQ R9, AX
- MOVQ BX, CX
- MOVQ DX, R14
- SHLQ CL, R14
- MOVB AH, CL
- SHRQ $0x20, AX
- TESTQ CX, CX
- JZ sequenceDecs_decodeSync_amd64_of_update_zero
- ADDQ CX, BX
- CMPQ BX, $0x40
- JA sequenceDecs_decodeSync_amd64_of_update_zero
- CMPQ CX, $0x40
- JAE sequenceDecs_decodeSync_amd64_of_update_zero
- NEGQ CX
- SHRQ CL, R14
- ADDQ R14, AX
-
-sequenceDecs_decodeSync_amd64_of_update_zero:
- MOVQ AX, 8(SP)
-
- // Update match length
- MOVQ R8, AX
- MOVQ BX, CX
- MOVQ DX, R14
- SHLQ CL, R14
- MOVB AH, CL
- SHRQ $0x20, AX
- TESTQ CX, CX
- JZ sequenceDecs_decodeSync_amd64_ml_update_zero
- ADDQ CX, BX
- CMPQ BX, $0x40
- JA sequenceDecs_decodeSync_amd64_ml_update_zero
- CMPQ CX, $0x40
- JAE sequenceDecs_decodeSync_amd64_ml_update_zero
- NEGQ CX
- SHRQ CL, R14
- ADDQ R14, AX
-
-sequenceDecs_decodeSync_amd64_ml_update_zero:
- MOVQ AX, 16(SP)
-
- // Fill bitreader to have enough for the remaining
- CMPQ SI, $0x08
- JL sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
- MOVQ BX, AX
- SHRQ $0x03, AX
- SUBQ AX, R13
- MOVQ (R13), DX
- SUBQ AX, SI
- ANDQ $0x07, BX
- JMP sequenceDecs_decodeSync_amd64_fill_2_end
-
-sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
- CMPQ SI, $0x00
- JLE sequenceDecs_decodeSync_amd64_fill_2_check_overread
- CMPQ BX, $0x07
- JLE sequenceDecs_decodeSync_amd64_fill_2_end
- SHLQ $0x08, DX
- SUBQ $0x01, R13
- SUBQ $0x01, SI
- SUBQ $0x08, BX
- MOVBQZX (R13), AX
- ORQ AX, DX
- JMP sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
-
-sequenceDecs_decodeSync_amd64_fill_2_check_overread:
- CMPQ BX, $0x40
- JA error_overread
-
-sequenceDecs_decodeSync_amd64_fill_2_end:
- // Update literal length
- MOVQ DI, AX
- MOVQ BX, CX
- MOVQ DX, R14
- SHLQ CL, R14
- MOVB AH, CL
- SHRQ $0x20, AX
- TESTQ CX, CX
- JZ sequenceDecs_decodeSync_amd64_ll_update_zero
- ADDQ CX, BX
- CMPQ BX, $0x40
- JA sequenceDecs_decodeSync_amd64_ll_update_zero
- CMPQ CX, $0x40
- JAE sequenceDecs_decodeSync_amd64_ll_update_zero
- NEGQ CX
- SHRQ CL, R14
- ADDQ R14, AX
-
-sequenceDecs_decodeSync_amd64_ll_update_zero:
- MOVQ AX, 24(SP)
-
- // Fill bitreader for state updates
- MOVQ R13, (SP)
- MOVQ R9, AX
- SHRQ $0x08, AX
- MOVBQZX AL, AX
- MOVQ ctx+16(FP), CX
- CMPQ 96(CX), $0x00
- JZ sequenceDecs_decodeSync_amd64_skip_update
-
- // Update Literal Length State
- MOVBQZX DI, R13
- SHRL $0x10, DI
- LEAQ (BX)(R13*1), CX
- MOVQ DX, R14
- MOVQ CX, BX
- ROLQ CL, R14
- MOVL $0x00000001, R15
- MOVB R13, CL
- SHLL CL, R15
- DECL R15
- ANDQ R15, R14
- ADDQ R14, DI
-
- // Load ctx.llTable
- MOVQ ctx+16(FP), CX
- MOVQ (CX), CX
- MOVQ (CX)(DI*8), DI
-
- // Update Match Length State
- MOVBQZX R8, R13
- SHRL $0x10, R8
- LEAQ (BX)(R13*1), CX
- MOVQ DX, R14
- MOVQ CX, BX
- ROLQ CL, R14
- MOVL $0x00000001, R15
- MOVB R13, CL
- SHLL CL, R15
- DECL R15
- ANDQ R15, R14
- ADDQ R14, R8
-
- // Load ctx.mlTable
- MOVQ ctx+16(FP), CX
- MOVQ 24(CX), CX
- MOVQ (CX)(R8*8), R8
-
- // Update Offset State
- MOVBQZX R9, R13
- SHRL $0x10, R9
- LEAQ (BX)(R13*1), CX
- MOVQ DX, R14
- MOVQ CX, BX
- ROLQ CL, R14
- MOVL $0x00000001, R15
- MOVB R13, CL
- SHLL CL, R15
- DECL R15
- ANDQ R15, R14
- ADDQ R14, R9
-
- // Load ctx.ofTable
- MOVQ ctx+16(FP), CX
- MOVQ 48(CX), CX
- MOVQ (CX)(R9*8), R9
-
-sequenceDecs_decodeSync_amd64_skip_update:
- // Adjust offset
- MOVQ s+0(FP), CX
- MOVQ 8(SP), R13
- CMPQ AX, $0x01
- JBE sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
- MOVUPS 144(CX), X0
- MOVQ R13, 144(CX)
- MOVUPS X0, 152(CX)
- JMP sequenceDecs_decodeSync_amd64_after_adjust
-
-sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
- CMPQ 24(SP), $0x00000000
- JNE sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
- INCQ R13
- JMP sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
-
-sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
- TESTQ R13, R13
- JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
- MOVQ 144(CX), R13
- JMP sequenceDecs_decodeSync_amd64_after_adjust
-
-sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
- MOVQ R13, AX
- XORQ R14, R14
- MOVQ $-1, R15
- CMPQ R13, $0x03
- CMOVQEQ R14, AX
- CMOVQEQ R15, R14
- ADDQ 144(CX)(AX*8), R14
- JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid
- MOVQ $0x00000001, R14
-
-sequenceDecs_decodeSync_amd64_adjust_temp_valid:
- CMPQ R13, $0x01
- JZ sequenceDecs_decodeSync_amd64_adjust_skip
- MOVQ 152(CX), AX
- MOVQ AX, 160(CX)
-
-sequenceDecs_decodeSync_amd64_adjust_skip:
- MOVQ 144(CX), AX
- MOVQ AX, 152(CX)
- MOVQ R14, 144(CX)
- MOVQ R14, R13
-
-sequenceDecs_decodeSync_amd64_after_adjust:
- MOVQ R13, 8(SP)
-
- // Check values
- MOVQ 16(SP), AX
- MOVQ 24(SP), CX
- LEAQ (AX)(CX*1), R14
- MOVQ s+0(FP), R15
- ADDQ R14, 256(R15)
- MOVQ ctx+16(FP), R14
- SUBQ CX, 104(R14)
- JS error_not_enough_literals
- CMPQ AX, $0x00020002
- JA sequenceDecs_decodeSync_amd64_error_match_len_too_big
- TESTQ R13, R13
- JNZ sequenceDecs_decodeSync_amd64_match_len_ofs_ok
- TESTQ AX, AX
- JNZ sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch
-
-sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
- MOVQ 24(SP), AX
- MOVQ 8(SP), CX
- MOVQ 16(SP), R13
-
- // Check if we have enough space in s.out
- LEAQ (AX)(R13*1), R14
- ADDQ R10, R14
- CMPQ R14, 32(SP)
- JA error_not_enough_space
-
- // Copy literals
- TESTQ AX, AX
- JZ check_offset
- XORQ R14, R14
-
-copy_1:
- MOVUPS (R11)(R14*1), X0
- MOVUPS X0, (R10)(R14*1)
- ADDQ $0x10, R14
- CMPQ R14, AX
- JB copy_1
- ADDQ AX, R11
- ADDQ AX, R10
- ADDQ AX, R12
-
- // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
-check_offset:
- MOVQ R12, AX
- ADDQ 40(SP), AX
- CMPQ CX, AX
- JG error_match_off_too_big
- CMPQ CX, 56(SP)
- JG error_match_off_too_big
-
- // Copy match from history
- MOVQ CX, AX
- SUBQ R12, AX
- JLS copy_match
- MOVQ 48(SP), R14
- SUBQ AX, R14
- CMPQ R13, AX
- JG copy_all_from_history
- MOVQ R13, AX
- SUBQ $0x10, AX
- JB copy_4_small
-
-copy_4_loop:
- MOVUPS (R14), X0
- MOVUPS X0, (R10)
- ADDQ $0x10, R14
- ADDQ $0x10, R10
- SUBQ $0x10, AX
- JAE copy_4_loop
- LEAQ 16(R14)(AX*1), R14
- LEAQ 16(R10)(AX*1), R10
- MOVUPS -16(R14), X0
- MOVUPS X0, -16(R10)
- JMP copy_4_end
-
-copy_4_small:
- CMPQ R13, $0x03
- JE copy_4_move_3
- CMPQ R13, $0x08
- JB copy_4_move_4through7
- JMP copy_4_move_8through16
-
-copy_4_move_3:
- MOVW (R14), AX
- MOVB 2(R14), CL
- MOVW AX, (R10)
- MOVB CL, 2(R10)
- ADDQ R13, R14
- ADDQ R13, R10
- JMP copy_4_end
-
-copy_4_move_4through7:
- MOVL (R14), AX
- MOVL -4(R14)(R13*1), CX
- MOVL AX, (R10)
- MOVL CX, -4(R10)(R13*1)
- ADDQ R13, R14
- ADDQ R13, R10
- JMP copy_4_end
-
-copy_4_move_8through16:
- MOVQ (R14), AX
- MOVQ -8(R14)(R13*1), CX
- MOVQ AX, (R10)
- MOVQ CX, -8(R10)(R13*1)
- ADDQ R13, R14
- ADDQ R13, R10
-
-copy_4_end:
- ADDQ R13, R12
- JMP handle_loop
- JMP loop_finished
-
-copy_all_from_history:
- MOVQ AX, R15
- SUBQ $0x10, R15
- JB copy_5_small
-
-copy_5_loop:
- MOVUPS (R14), X0
- MOVUPS X0, (R10)
- ADDQ $0x10, R14
- ADDQ $0x10, R10
- SUBQ $0x10, R15
- JAE copy_5_loop
- LEAQ 16(R14)(R15*1), R14
- LEAQ 16(R10)(R15*1), R10
- MOVUPS -16(R14), X0
- MOVUPS X0, -16(R10)
- JMP copy_5_end
-
-copy_5_small:
- CMPQ AX, $0x03
- JE copy_5_move_3
- JB copy_5_move_1or2
- CMPQ AX, $0x08
- JB copy_5_move_4through7
- JMP copy_5_move_8through16
-
-copy_5_move_1or2:
- MOVB (R14), R15
- MOVB -1(R14)(AX*1), BP
- MOVB R15, (R10)
- MOVB BP, -1(R10)(AX*1)
- ADDQ AX, R14
- ADDQ AX, R10
- JMP copy_5_end
-
-copy_5_move_3:
- MOVW (R14), R15
- MOVB 2(R14), BP
- MOVW R15, (R10)
- MOVB BP, 2(R10)
- ADDQ AX, R14
- ADDQ AX, R10
- JMP copy_5_end
-
-copy_5_move_4through7:
- MOVL (R14), R15
- MOVL -4(R14)(AX*1), BP
- MOVL R15, (R10)
- MOVL BP, -4(R10)(AX*1)
- ADDQ AX, R14
- ADDQ AX, R10
- JMP copy_5_end
-
-copy_5_move_8through16:
- MOVQ (R14), R15
- MOVQ -8(R14)(AX*1), BP
- MOVQ R15, (R10)
- MOVQ BP, -8(R10)(AX*1)
- ADDQ AX, R14
- ADDQ AX, R10
-
-copy_5_end:
- ADDQ AX, R12
- SUBQ AX, R13
-
- // Copy match from the current buffer
-copy_match:
- MOVQ R10, AX
- SUBQ CX, AX
-
- // ml <= mo
- CMPQ R13, CX
- JA copy_overlapping_match
-
- // Copy non-overlapping match
- ADDQ R13, R12
- MOVQ R10, CX
- ADDQ R13, R10
-
-copy_2:
- MOVUPS (AX), X0
- MOVUPS X0, (CX)
- ADDQ $0x10, AX
- ADDQ $0x10, CX
- SUBQ $0x10, R13
- JHI copy_2
- JMP handle_loop
-
- // Copy overlapping match
-copy_overlapping_match:
- ADDQ R13, R12
-
-copy_slow_3:
- MOVB (AX), CL
- MOVB CL, (R10)
- INCQ AX
- INCQ R10
- DECQ R13
- JNZ copy_slow_3
-
-handle_loop:
- MOVQ ctx+16(FP), AX
- DECQ 96(AX)
- JNS sequenceDecs_decodeSync_amd64_main_loop
-
-loop_finished:
- MOVQ br+8(FP), AX
- MOVQ DX, 24(AX)
- MOVB BL, 32(AX)
- MOVQ SI, 8(AX)
-
- // Update the context
- MOVQ ctx+16(FP), AX
- MOVQ R12, 136(AX)
- MOVQ 144(AX), CX
- SUBQ CX, R11
- MOVQ R11, 168(AX)
-
- // Return success
- MOVQ $0x00000000, ret+24(FP)
- RET
-
- // Return with match length error
-sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
- MOVQ 16(SP), AX
- MOVQ ctx+16(FP), CX
- MOVQ AX, 216(CX)
- MOVQ $0x00000001, ret+24(FP)
- RET
-
- // Return with match too long error
-sequenceDecs_decodeSync_amd64_error_match_len_too_big:
- MOVQ ctx+16(FP), AX
- MOVQ 16(SP), CX
- MOVQ CX, 216(AX)
- MOVQ $0x00000002, ret+24(FP)
- RET
-
- // Return with match offset too long error
-error_match_off_too_big:
- MOVQ ctx+16(FP), AX
- MOVQ 8(SP), CX
- MOVQ CX, 224(AX)
- MOVQ R12, 136(AX)
- MOVQ $0x00000003, ret+24(FP)
- RET
-
- // Return with not enough literals error
-error_not_enough_literals:
- MOVQ ctx+16(FP), AX
- MOVQ 24(SP), CX
- MOVQ CX, 208(AX)
- MOVQ $0x00000004, ret+24(FP)
- RET
-
- // Return with overread error
-error_overread:
- MOVQ $0x00000006, ret+24(FP)
- RET
-
- // Return with not enough output space error
-error_not_enough_space:
- MOVQ ctx+16(FP), AX
- MOVQ 24(SP), CX
- MOVQ CX, 208(AX)
- MOVQ 16(SP), CX
- MOVQ CX, 216(AX)
- MOVQ R12, 136(AX)
- MOVQ $0x00000005, ret+24(FP)
- RET
-
-// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
-// Requires: BMI, BMI2, CMOV, SSE
-TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
- MOVQ br+8(FP), BX
- MOVQ 24(BX), AX
- MOVBQZX 32(BX), DX
- MOVQ (BX), CX
- MOVQ 8(BX), BX
- ADDQ BX, CX
- MOVQ CX, (SP)
- MOVQ ctx+16(FP), CX
- MOVQ 72(CX), SI
- MOVQ 80(CX), DI
- MOVQ 88(CX), R8
- XORQ R9, R9
- MOVQ R9, 8(SP)
- MOVQ R9, 16(SP)
- MOVQ R9, 24(SP)
- MOVQ 112(CX), R9
- MOVQ 128(CX), R10
- MOVQ R10, 32(SP)
- MOVQ 144(CX), R10
- MOVQ 136(CX), R11
- MOVQ 200(CX), R12
- MOVQ R12, 56(SP)
- MOVQ 176(CX), R12
- MOVQ R12, 48(SP)
- MOVQ 184(CX), CX
- MOVQ CX, 40(SP)
- MOVQ 40(SP), CX
- ADDQ CX, 48(SP)
-
- // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
- ADDQ R9, 32(SP)
-
- // outBase += outPosition
- ADDQ R11, R9
-
-sequenceDecs_decodeSync_bmi2_main_loop:
- MOVQ (SP), R12
-
- // Fill bitreader to have enough for the offset and match length.
- CMPQ BX, $0x08
- JL sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
- MOVQ DX, CX
- SHRQ $0x03, CX
- SUBQ CX, R12
- MOVQ (R12), AX
- SUBQ CX, BX
- ANDQ $0x07, DX
- JMP sequenceDecs_decodeSync_bmi2_fill_end
-
-sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
- CMPQ BX, $0x00
- JLE sequenceDecs_decodeSync_bmi2_fill_check_overread
- CMPQ DX, $0x07
- JLE sequenceDecs_decodeSync_bmi2_fill_end
- SHLQ $0x08, AX
- SUBQ $0x01, R12
- SUBQ $0x01, BX
- SUBQ $0x08, DX
- MOVBQZX (R12), CX
- ORQ CX, AX
- JMP sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
-
-sequenceDecs_decodeSync_bmi2_fill_check_overread:
- CMPQ DX, $0x40
- JA error_overread
-
-sequenceDecs_decodeSync_bmi2_fill_end:
- // Update offset
- MOVQ $0x00000808, CX
- BEXTRQ CX, R8, R13
- MOVQ AX, R14
- LEAQ (DX)(R13*1), CX
- ROLQ CL, R14
- BZHIQ R13, R14, R14
- MOVQ CX, DX
- MOVQ R8, CX
- SHRQ $0x20, CX
- ADDQ R14, CX
- MOVQ CX, 8(SP)
-
- // Update match length
- MOVQ $0x00000808, CX
- BEXTRQ CX, DI, R13
- MOVQ AX, R14
- LEAQ (DX)(R13*1), CX
- ROLQ CL, R14
- BZHIQ R13, R14, R14
- MOVQ CX, DX
- MOVQ DI, CX
- SHRQ $0x20, CX
- ADDQ R14, CX
- MOVQ CX, 16(SP)
-
- // Fill bitreader to have enough for the remaining
- CMPQ BX, $0x08
- JL sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
- MOVQ DX, CX
- SHRQ $0x03, CX
- SUBQ CX, R12
- MOVQ (R12), AX
- SUBQ CX, BX
- ANDQ $0x07, DX
- JMP sequenceDecs_decodeSync_bmi2_fill_2_end
-
-sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
- CMPQ BX, $0x00
- JLE sequenceDecs_decodeSync_bmi2_fill_2_check_overread
- CMPQ DX, $0x07
- JLE sequenceDecs_decodeSync_bmi2_fill_2_end
- SHLQ $0x08, AX
- SUBQ $0x01, R12
- SUBQ $0x01, BX
- SUBQ $0x08, DX
- MOVBQZX (R12), CX
- ORQ CX, AX
- JMP sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
-
-sequenceDecs_decodeSync_bmi2_fill_2_check_overread:
- CMPQ DX, $0x40
- JA error_overread
-
-sequenceDecs_decodeSync_bmi2_fill_2_end:
- // Update literal length
- MOVQ $0x00000808, CX
- BEXTRQ CX, SI, R13
- MOVQ AX, R14
- LEAQ (DX)(R13*1), CX
- ROLQ CL, R14
- BZHIQ R13, R14, R14
- MOVQ CX, DX
- MOVQ SI, CX
- SHRQ $0x20, CX
- ADDQ R14, CX
- MOVQ CX, 24(SP)
-
- // Fill bitreader for state updates
- MOVQ R12, (SP)
- MOVQ $0x00000808, CX
- BEXTRQ CX, R8, R12
- MOVQ ctx+16(FP), CX
- CMPQ 96(CX), $0x00
- JZ sequenceDecs_decodeSync_bmi2_skip_update
- LEAQ (SI)(DI*1), R13
- ADDQ R8, R13
- MOVBQZX R13, R13
- LEAQ (DX)(R13*1), CX
- MOVQ AX, R14
- MOVQ CX, DX
- ROLQ CL, R14
- BZHIQ R13, R14, R14
-
- // Update Offset State
- BZHIQ R8, R14, CX
- SHRXQ R8, R14, R14
- SHRL $0x10, R8
- ADDQ CX, R8
-
- // Load ctx.ofTable
- MOVQ ctx+16(FP), CX
- MOVQ 48(CX), CX
- MOVQ (CX)(R8*8), R8
-
- // Update Match Length State
- BZHIQ DI, R14, CX
- SHRXQ DI, R14, R14
- SHRL $0x10, DI
- ADDQ CX, DI
-
- // Load ctx.mlTable
- MOVQ ctx+16(FP), CX
- MOVQ 24(CX), CX
- MOVQ (CX)(DI*8), DI
-
- // Update Literal Length State
- BZHIQ SI, R14, CX
- SHRL $0x10, SI
- ADDQ CX, SI
-
- // Load ctx.llTable
- MOVQ ctx+16(FP), CX
- MOVQ (CX), CX
- MOVQ (CX)(SI*8), SI
-
-sequenceDecs_decodeSync_bmi2_skip_update:
- // Adjust offset
- MOVQ s+0(FP), CX
- MOVQ 8(SP), R13
- CMPQ R12, $0x01
- JBE sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
- MOVUPS 144(CX), X0
- MOVQ R13, 144(CX)
- MOVUPS X0, 152(CX)
- JMP sequenceDecs_decodeSync_bmi2_after_adjust
-
-sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
- CMPQ 24(SP), $0x00000000
- JNE sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
- INCQ R13
- JMP sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
-
-sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
- TESTQ R13, R13
- JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
- MOVQ 144(CX), R13
- JMP sequenceDecs_decodeSync_bmi2_after_adjust
-
-sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
- MOVQ R13, R12
- XORQ R14, R14
- MOVQ $-1, R15
- CMPQ R13, $0x03
- CMOVQEQ R14, R12
- CMOVQEQ R15, R14
- ADDQ 144(CX)(R12*8), R14
- JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid
- MOVQ $0x00000001, R14
-
-sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
- CMPQ R13, $0x01
- JZ sequenceDecs_decodeSync_bmi2_adjust_skip
- MOVQ 152(CX), R12
- MOVQ R12, 160(CX)
-
-sequenceDecs_decodeSync_bmi2_adjust_skip:
- MOVQ 144(CX), R12
- MOVQ R12, 152(CX)
- MOVQ R14, 144(CX)
- MOVQ R14, R13
-
-sequenceDecs_decodeSync_bmi2_after_adjust:
- MOVQ R13, 8(SP)
-
- // Check values
- MOVQ 16(SP), CX
- MOVQ 24(SP), R12
- LEAQ (CX)(R12*1), R14
- MOVQ s+0(FP), R15
- ADDQ R14, 256(R15)
- MOVQ ctx+16(FP), R14
- SUBQ R12, 104(R14)
- JS error_not_enough_literals
- CMPQ CX, $0x00020002
- JA sequenceDecs_decodeSync_bmi2_error_match_len_too_big
- TESTQ R13, R13
- JNZ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
- TESTQ CX, CX
- JNZ sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch
-
-sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
- MOVQ 24(SP), CX
- MOVQ 8(SP), R12
- MOVQ 16(SP), R13
-
- // Check if we have enough space in s.out
- LEAQ (CX)(R13*1), R14
- ADDQ R9, R14
- CMPQ R14, 32(SP)
- JA error_not_enough_space
-
- // Copy literals
- TESTQ CX, CX
- JZ check_offset
- XORQ R14, R14
-
-copy_1:
- MOVUPS (R10)(R14*1), X0
- MOVUPS X0, (R9)(R14*1)
- ADDQ $0x10, R14
- CMPQ R14, CX
- JB copy_1
- ADDQ CX, R10
- ADDQ CX, R9
- ADDQ CX, R11
-
- // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
-check_offset:
- MOVQ R11, CX
- ADDQ 40(SP), CX
- CMPQ R12, CX
- JG error_match_off_too_big
- CMPQ R12, 56(SP)
- JG error_match_off_too_big
-
- // Copy match from history
- MOVQ R12, CX
- SUBQ R11, CX
- JLS copy_match
- MOVQ 48(SP), R14
- SUBQ CX, R14
- CMPQ R13, CX
- JG copy_all_from_history
- MOVQ R13, CX
- SUBQ $0x10, CX
- JB copy_4_small
-
-copy_4_loop:
- MOVUPS (R14), X0
- MOVUPS X0, (R9)
- ADDQ $0x10, R14
- ADDQ $0x10, R9
- SUBQ $0x10, CX
- JAE copy_4_loop
- LEAQ 16(R14)(CX*1), R14
- LEAQ 16(R9)(CX*1), R9
- MOVUPS -16(R14), X0
- MOVUPS X0, -16(R9)
- JMP copy_4_end
-
-copy_4_small:
- CMPQ R13, $0x03
- JE copy_4_move_3
- CMPQ R13, $0x08
- JB copy_4_move_4through7
- JMP copy_4_move_8through16
-
-copy_4_move_3:
- MOVW (R14), CX
- MOVB 2(R14), R12
- MOVW CX, (R9)
- MOVB R12, 2(R9)
- ADDQ R13, R14
- ADDQ R13, R9
- JMP copy_4_end
-
-copy_4_move_4through7:
- MOVL (R14), CX
- MOVL -4(R14)(R13*1), R12
- MOVL CX, (R9)
- MOVL R12, -4(R9)(R13*1)
- ADDQ R13, R14
- ADDQ R13, R9
- JMP copy_4_end
-
-copy_4_move_8through16:
- MOVQ (R14), CX
- MOVQ -8(R14)(R13*1), R12
- MOVQ CX, (R9)
- MOVQ R12, -8(R9)(R13*1)
- ADDQ R13, R14
- ADDQ R13, R9
-
-copy_4_end:
- ADDQ R13, R11
- JMP handle_loop
- JMP loop_finished
-
-copy_all_from_history:
- MOVQ CX, R15
- SUBQ $0x10, R15
- JB copy_5_small
-
-copy_5_loop:
- MOVUPS (R14), X0
- MOVUPS X0, (R9)
- ADDQ $0x10, R14
- ADDQ $0x10, R9
- SUBQ $0x10, R15
- JAE copy_5_loop
- LEAQ 16(R14)(R15*1), R14
- LEAQ 16(R9)(R15*1), R9
- MOVUPS -16(R14), X0
- MOVUPS X0, -16(R9)
- JMP copy_5_end
-
-copy_5_small:
- CMPQ CX, $0x03
- JE copy_5_move_3
- JB copy_5_move_1or2
- CMPQ CX, $0x08
- JB copy_5_move_4through7
- JMP copy_5_move_8through16
-
-copy_5_move_1or2:
- MOVB (R14), R15
- MOVB -1(R14)(CX*1), BP
- MOVB R15, (R9)
- MOVB BP, -1(R9)(CX*1)
- ADDQ CX, R14
- ADDQ CX, R9
- JMP copy_5_end
-
-copy_5_move_3:
- MOVW (R14), R15
- MOVB 2(R14), BP
- MOVW R15, (R9)
- MOVB BP, 2(R9)
- ADDQ CX, R14
- ADDQ CX, R9
- JMP copy_5_end
-
-copy_5_move_4through7:
- MOVL (R14), R15
- MOVL -4(R14)(CX*1), BP
- MOVL R15, (R9)
- MOVL BP, -4(R9)(CX*1)
- ADDQ CX, R14
- ADDQ CX, R9
- JMP copy_5_end
-
-copy_5_move_8through16:
- MOVQ (R14), R15
- MOVQ -8(R14)(CX*1), BP
- MOVQ R15, (R9)
- MOVQ BP, -8(R9)(CX*1)
- ADDQ CX, R14
- ADDQ CX, R9
-
-copy_5_end:
- ADDQ CX, R11
- SUBQ CX, R13
-
- // Copy match from the current buffer
-copy_match:
- MOVQ R9, CX
- SUBQ R12, CX
-
- // ml <= mo
- CMPQ R13, R12
- JA copy_overlapping_match
-
- // Copy non-overlapping match
- ADDQ R13, R11
- MOVQ R9, R12
- ADDQ R13, R9
-
-copy_2:
- MOVUPS (CX), X0
- MOVUPS X0, (R12)
- ADDQ $0x10, CX
- ADDQ $0x10, R12
- SUBQ $0x10, R13
- JHI copy_2
- JMP handle_loop
-
- // Copy overlapping match
-copy_overlapping_match:
- ADDQ R13, R11
-
-copy_slow_3:
- MOVB (CX), R12
- MOVB R12, (R9)
- INCQ CX
- INCQ R9
- DECQ R13
- JNZ copy_slow_3
-
-handle_loop:
- MOVQ ctx+16(FP), CX
- DECQ 96(CX)
- JNS sequenceDecs_decodeSync_bmi2_main_loop
-
-loop_finished:
- MOVQ br+8(FP), CX
- MOVQ AX, 24(CX)
- MOVB DL, 32(CX)
- MOVQ BX, 8(CX)
-
- // Update the context
- MOVQ ctx+16(FP), AX
- MOVQ R11, 136(AX)
- MOVQ 144(AX), CX
- SUBQ CX, R10
- MOVQ R10, 168(AX)
-
- // Return success
- MOVQ $0x00000000, ret+24(FP)
- RET
-
- // Return with match length error
-sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
- MOVQ 16(SP), AX
- MOVQ ctx+16(FP), CX
- MOVQ AX, 216(CX)
- MOVQ $0x00000001, ret+24(FP)
- RET
-
- // Return with match too long error
-sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
- MOVQ ctx+16(FP), AX
- MOVQ 16(SP), CX
- MOVQ CX, 216(AX)
- MOVQ $0x00000002, ret+24(FP)
- RET
-
- // Return with match offset too long error
-error_match_off_too_big:
- MOVQ ctx+16(FP), AX
- MOVQ 8(SP), CX
- MOVQ CX, 224(AX)
- MOVQ R11, 136(AX)
- MOVQ $0x00000003, ret+24(FP)
- RET
-
- // Return with not enough literals error
-error_not_enough_literals:
- MOVQ ctx+16(FP), AX
- MOVQ 24(SP), CX
- MOVQ CX, 208(AX)
- MOVQ $0x00000004, ret+24(FP)
- RET
-
- // Return with overread error
-error_overread:
- MOVQ $0x00000006, ret+24(FP)
- RET
-
- // Return with not enough output space error
-error_not_enough_space:
- MOVQ ctx+16(FP), AX
- MOVQ 24(SP), CX
- MOVQ CX, 208(AX)
- MOVQ 16(SP), CX
- MOVQ CX, 216(AX)
- MOVQ R11, 136(AX)
- MOVQ $0x00000005, ret+24(FP)
- RET
-
-// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
-// Requires: CMOV, SSE
-TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
- MOVQ br+8(FP), CX
- MOVQ 24(CX), DX
- MOVBQZX 32(CX), BX
- MOVQ (CX), AX
- MOVQ 8(CX), SI
- ADDQ SI, AX
- MOVQ AX, (SP)
- MOVQ ctx+16(FP), AX
- MOVQ 72(AX), DI
- MOVQ 80(AX), R8
- MOVQ 88(AX), R9
- XORQ CX, CX
- MOVQ CX, 8(SP)
- MOVQ CX, 16(SP)
- MOVQ CX, 24(SP)
- MOVQ 112(AX), R10
- MOVQ 128(AX), CX
- MOVQ CX, 32(SP)
- MOVQ 144(AX), R11
- MOVQ 136(AX), R12
- MOVQ 200(AX), CX
- MOVQ CX, 56(SP)
- MOVQ 176(AX), CX
- MOVQ CX, 48(SP)
- MOVQ 184(AX), AX
- MOVQ AX, 40(SP)
- MOVQ 40(SP), AX
- ADDQ AX, 48(SP)
-
- // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
- ADDQ R10, 32(SP)
-
- // outBase += outPosition
- ADDQ R12, R10
-
-sequenceDecs_decodeSync_safe_amd64_main_loop:
- MOVQ (SP), R13
-
- // Fill bitreader to have enough for the offset and match length.
- CMPQ SI, $0x08
- JL sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
- MOVQ BX, AX
- SHRQ $0x03, AX
- SUBQ AX, R13
- MOVQ (R13), DX
- SUBQ AX, SI
- ANDQ $0x07, BX
- JMP sequenceDecs_decodeSync_safe_amd64_fill_end
-
-sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
- CMPQ SI, $0x00
- JLE sequenceDecs_decodeSync_safe_amd64_fill_check_overread
- CMPQ BX, $0x07
- JLE sequenceDecs_decodeSync_safe_amd64_fill_end
- SHLQ $0x08, DX
- SUBQ $0x01, R13
- SUBQ $0x01, SI
- SUBQ $0x08, BX
- MOVBQZX (R13), AX
- ORQ AX, DX
- JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
-
-sequenceDecs_decodeSync_safe_amd64_fill_check_overread:
- CMPQ BX, $0x40
- JA error_overread
-
-sequenceDecs_decodeSync_safe_amd64_fill_end:
- // Update offset
- MOVQ R9, AX
- MOVQ BX, CX
- MOVQ DX, R14
- SHLQ CL, R14
- MOVB AH, CL
- SHRQ $0x20, AX
- TESTQ CX, CX
- JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero
- ADDQ CX, BX
- CMPQ BX, $0x40
- JA sequenceDecs_decodeSync_safe_amd64_of_update_zero
- CMPQ CX, $0x40
- JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero
- NEGQ CX
- SHRQ CL, R14
- ADDQ R14, AX
-
-sequenceDecs_decodeSync_safe_amd64_of_update_zero:
- MOVQ AX, 8(SP)
-
- // Update match length
- MOVQ R8, AX
- MOVQ BX, CX
- MOVQ DX, R14
- SHLQ CL, R14
- MOVB AH, CL
- SHRQ $0x20, AX
- TESTQ CX, CX
- JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero
- ADDQ CX, BX
- CMPQ BX, $0x40
- JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero
- CMPQ CX, $0x40
- JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero
- NEGQ CX
- SHRQ CL, R14
- ADDQ R14, AX
-
-sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
- MOVQ AX, 16(SP)
-
- // Fill bitreader to have enough for the remaining
- CMPQ SI, $0x08
- JL sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
- MOVQ BX, AX
- SHRQ $0x03, AX
- SUBQ AX, R13
- MOVQ (R13), DX
- SUBQ AX, SI
- ANDQ $0x07, BX
- JMP sequenceDecs_decodeSync_safe_amd64_fill_2_end
-
-sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
- CMPQ SI, $0x00
- JLE sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread
- CMPQ BX, $0x07
- JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
- SHLQ $0x08, DX
- SUBQ $0x01, R13
- SUBQ $0x01, SI
- SUBQ $0x08, BX
- MOVBQZX (R13), AX
- ORQ AX, DX
- JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
-
-sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread:
- CMPQ BX, $0x40
- JA error_overread
-
-sequenceDecs_decodeSync_safe_amd64_fill_2_end:
- // Update literal length
- MOVQ DI, AX
- MOVQ BX, CX
- MOVQ DX, R14
- SHLQ CL, R14
- MOVB AH, CL
- SHRQ $0x20, AX
- TESTQ CX, CX
- JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero
- ADDQ CX, BX
- CMPQ BX, $0x40
- JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero
- CMPQ CX, $0x40
- JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero
- NEGQ CX
- SHRQ CL, R14
- ADDQ R14, AX
-
-sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
- MOVQ AX, 24(SP)
-
- // Fill bitreader for state updates
- MOVQ R13, (SP)
- MOVQ R9, AX
- SHRQ $0x08, AX
- MOVBQZX AL, AX
- MOVQ ctx+16(FP), CX
- CMPQ 96(CX), $0x00
- JZ sequenceDecs_decodeSync_safe_amd64_skip_update
-
- // Update Literal Length State
- MOVBQZX DI, R13
- SHRL $0x10, DI
- LEAQ (BX)(R13*1), CX
- MOVQ DX, R14
- MOVQ CX, BX
- ROLQ CL, R14
- MOVL $0x00000001, R15
- MOVB R13, CL
- SHLL CL, R15
- DECL R15
- ANDQ R15, R14
- ADDQ R14, DI
-
- // Load ctx.llTable
- MOVQ ctx+16(FP), CX
- MOVQ (CX), CX
- MOVQ (CX)(DI*8), DI
-
- // Update Match Length State
- MOVBQZX R8, R13
- SHRL $0x10, R8
- LEAQ (BX)(R13*1), CX
- MOVQ DX, R14
- MOVQ CX, BX
- ROLQ CL, R14
- MOVL $0x00000001, R15
- MOVB R13, CL
- SHLL CL, R15
- DECL R15
- ANDQ R15, R14
- ADDQ R14, R8
-
- // Load ctx.mlTable
- MOVQ ctx+16(FP), CX
- MOVQ 24(CX), CX
- MOVQ (CX)(R8*8), R8
-
- // Update Offset State
- MOVBQZX R9, R13
- SHRL $0x10, R9
- LEAQ (BX)(R13*1), CX
- MOVQ DX, R14
- MOVQ CX, BX
- ROLQ CL, R14
- MOVL $0x00000001, R15
- MOVB R13, CL
- SHLL CL, R15
- DECL R15
- ANDQ R15, R14
- ADDQ R14, R9
-
- // Load ctx.ofTable
- MOVQ ctx+16(FP), CX
- MOVQ 48(CX), CX
- MOVQ (CX)(R9*8), R9
-
-sequenceDecs_decodeSync_safe_amd64_skip_update:
- // Adjust offset
- MOVQ s+0(FP), CX
- MOVQ 8(SP), R13
- CMPQ AX, $0x01
- JBE sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
- MOVUPS 144(CX), X0
- MOVQ R13, 144(CX)
- MOVUPS X0, 152(CX)
- JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
-
-sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
- CMPQ 24(SP), $0x00000000
- JNE sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
- INCQ R13
- JMP sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
-
-sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
- TESTQ R13, R13
- JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
- MOVQ 144(CX), R13
- JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
-
-sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
- MOVQ R13, AX
- XORQ R14, R14
- MOVQ $-1, R15
- CMPQ R13, $0x03
- CMOVQEQ R14, AX
- CMOVQEQ R15, R14
- ADDQ 144(CX)(AX*8), R14
- JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
- MOVQ $0x00000001, R14
-
-sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
- CMPQ R13, $0x01
- JZ sequenceDecs_decodeSync_safe_amd64_adjust_skip
- MOVQ 152(CX), AX
- MOVQ AX, 160(CX)
-
-sequenceDecs_decodeSync_safe_amd64_adjust_skip:
- MOVQ 144(CX), AX
- MOVQ AX, 152(CX)
- MOVQ R14, 144(CX)
- MOVQ R14, R13
-
-sequenceDecs_decodeSync_safe_amd64_after_adjust:
- MOVQ R13, 8(SP)
-
- // Check values
- MOVQ 16(SP), AX
- MOVQ 24(SP), CX
- LEAQ (AX)(CX*1), R14
- MOVQ s+0(FP), R15
- ADDQ R14, 256(R15)
- MOVQ ctx+16(FP), R14
- SUBQ CX, 104(R14)
- JS error_not_enough_literals
- CMPQ AX, $0x00020002
- JA sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
- TESTQ R13, R13
- JNZ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
- TESTQ AX, AX
- JNZ sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch
-
-sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
- MOVQ 24(SP), AX
- MOVQ 8(SP), CX
- MOVQ 16(SP), R13
-
- // Check if we have enough space in s.out
- LEAQ (AX)(R13*1), R14
- ADDQ R10, R14
- CMPQ R14, 32(SP)
- JA error_not_enough_space
-
- // Copy literals
- TESTQ AX, AX
- JZ check_offset
- MOVQ AX, R14
- SUBQ $0x10, R14
- JB copy_1_small
-
-copy_1_loop:
- MOVUPS (R11), X0
- MOVUPS X0, (R10)
- ADDQ $0x10, R11
- ADDQ $0x10, R10
- SUBQ $0x10, R14
- JAE copy_1_loop
- LEAQ 16(R11)(R14*1), R11
- LEAQ 16(R10)(R14*1), R10
- MOVUPS -16(R11), X0
- MOVUPS X0, -16(R10)
- JMP copy_1_end
-
-copy_1_small:
- CMPQ AX, $0x03
- JE copy_1_move_3
- JB copy_1_move_1or2
- CMPQ AX, $0x08
- JB copy_1_move_4through7
- JMP copy_1_move_8through16
-
-copy_1_move_1or2:
- MOVB (R11), R14
- MOVB -1(R11)(AX*1), R15
- MOVB R14, (R10)
- MOVB R15, -1(R10)(AX*1)
- ADDQ AX, R11
- ADDQ AX, R10
- JMP copy_1_end
-
-copy_1_move_3:
- MOVW (R11), R14
- MOVB 2(R11), R15
- MOVW R14, (R10)
- MOVB R15, 2(R10)
- ADDQ AX, R11
- ADDQ AX, R10
- JMP copy_1_end
-
-copy_1_move_4through7:
- MOVL (R11), R14
- MOVL -4(R11)(AX*1), R15
- MOVL R14, (R10)
- MOVL R15, -4(R10)(AX*1)
- ADDQ AX, R11
- ADDQ AX, R10
- JMP copy_1_end
-
-copy_1_move_8through16:
- MOVQ (R11), R14
- MOVQ -8(R11)(AX*1), R15
- MOVQ R14, (R10)
- MOVQ R15, -8(R10)(AX*1)
- ADDQ AX, R11
- ADDQ AX, R10
-
-copy_1_end:
- ADDQ AX, R12
-
- // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
-check_offset:
- MOVQ R12, AX
- ADDQ 40(SP), AX
- CMPQ CX, AX
- JG error_match_off_too_big
- CMPQ CX, 56(SP)
- JG error_match_off_too_big
-
- // Copy match from history
- MOVQ CX, AX
- SUBQ R12, AX
- JLS copy_match
- MOVQ 48(SP), R14
- SUBQ AX, R14
- CMPQ R13, AX
- JG copy_all_from_history
- MOVQ R13, AX
- SUBQ $0x10, AX
- JB copy_4_small
-
-copy_4_loop:
- MOVUPS (R14), X0
- MOVUPS X0, (R10)
- ADDQ $0x10, R14
- ADDQ $0x10, R10
- SUBQ $0x10, AX
- JAE copy_4_loop
- LEAQ 16(R14)(AX*1), R14
- LEAQ 16(R10)(AX*1), R10
- MOVUPS -16(R14), X0
- MOVUPS X0, -16(R10)
- JMP copy_4_end
-
-copy_4_small:
- CMPQ R13, $0x03
- JE copy_4_move_3
- CMPQ R13, $0x08
- JB copy_4_move_4through7
- JMP copy_4_move_8through16
-
-copy_4_move_3:
- MOVW (R14), AX
- MOVB 2(R14), CL
- MOVW AX, (R10)
- MOVB CL, 2(R10)
- ADDQ R13, R14
- ADDQ R13, R10
- JMP copy_4_end
-
-copy_4_move_4through7:
- MOVL (R14), AX
- MOVL -4(R14)(R13*1), CX
- MOVL AX, (R10)
- MOVL CX, -4(R10)(R13*1)
- ADDQ R13, R14
- ADDQ R13, R10
- JMP copy_4_end
-
-copy_4_move_8through16:
- MOVQ (R14), AX
- MOVQ -8(R14)(R13*1), CX
- MOVQ AX, (R10)
- MOVQ CX, -8(R10)(R13*1)
- ADDQ R13, R14
- ADDQ R13, R10
-
-copy_4_end:
- ADDQ R13, R12
- JMP handle_loop
- JMP loop_finished
-
-copy_all_from_history:
- MOVQ AX, R15
- SUBQ $0x10, R15
- JB copy_5_small
-
-copy_5_loop:
- MOVUPS (R14), X0
- MOVUPS X0, (R10)
- ADDQ $0x10, R14
- ADDQ $0x10, R10
- SUBQ $0x10, R15
- JAE copy_5_loop
- LEAQ 16(R14)(R15*1), R14
- LEAQ 16(R10)(R15*1), R10
- MOVUPS -16(R14), X0
- MOVUPS X0, -16(R10)
- JMP copy_5_end
-
-copy_5_small:
- CMPQ AX, $0x03
- JE copy_5_move_3
- JB copy_5_move_1or2
- CMPQ AX, $0x08
- JB copy_5_move_4through7
- JMP copy_5_move_8through16
-
-copy_5_move_1or2:
- MOVB (R14), R15
- MOVB -1(R14)(AX*1), BP
- MOVB R15, (R10)
- MOVB BP, -1(R10)(AX*1)
- ADDQ AX, R14
- ADDQ AX, R10
- JMP copy_5_end
-
-copy_5_move_3:
- MOVW (R14), R15
- MOVB 2(R14), BP
- MOVW R15, (R10)
- MOVB BP, 2(R10)
- ADDQ AX, R14
- ADDQ AX, R10
- JMP copy_5_end
-
-copy_5_move_4through7:
- MOVL (R14), R15
- MOVL -4(R14)(AX*1), BP
- MOVL R15, (R10)
- MOVL BP, -4(R10)(AX*1)
- ADDQ AX, R14
- ADDQ AX, R10
- JMP copy_5_end
-
-copy_5_move_8through16:
- MOVQ (R14), R15
- MOVQ -8(R14)(AX*1), BP
- MOVQ R15, (R10)
- MOVQ BP, -8(R10)(AX*1)
- ADDQ AX, R14
- ADDQ AX, R10
-
-copy_5_end:
- ADDQ AX, R12
- SUBQ AX, R13
-
- // Copy match from the current buffer
-copy_match:
- MOVQ R10, AX
- SUBQ CX, AX
-
- // ml <= mo
- CMPQ R13, CX
- JA copy_overlapping_match
-
- // Copy non-overlapping match
- ADDQ R13, R12
- MOVQ R13, CX
- SUBQ $0x10, CX
- JB copy_2_small
-
-copy_2_loop:
- MOVUPS (AX), X0
- MOVUPS X0, (R10)
- ADDQ $0x10, AX
- ADDQ $0x10, R10
- SUBQ $0x10, CX
- JAE copy_2_loop
- LEAQ 16(AX)(CX*1), AX
- LEAQ 16(R10)(CX*1), R10
- MOVUPS -16(AX), X0
- MOVUPS X0, -16(R10)
- JMP copy_2_end
-
-copy_2_small:
- CMPQ R13, $0x03
- JE copy_2_move_3
- JB copy_2_move_1or2
- CMPQ R13, $0x08
- JB copy_2_move_4through7
- JMP copy_2_move_8through16
-
-copy_2_move_1or2:
- MOVB (AX), CL
- MOVB -1(AX)(R13*1), R14
- MOVB CL, (R10)
- MOVB R14, -1(R10)(R13*1)
- ADDQ R13, AX
- ADDQ R13, R10
- JMP copy_2_end
-
-copy_2_move_3:
- MOVW (AX), CX
- MOVB 2(AX), R14
- MOVW CX, (R10)
- MOVB R14, 2(R10)
- ADDQ R13, AX
- ADDQ R13, R10
- JMP copy_2_end
-
-copy_2_move_4through7:
- MOVL (AX), CX
- MOVL -4(AX)(R13*1), R14
- MOVL CX, (R10)
- MOVL R14, -4(R10)(R13*1)
- ADDQ R13, AX
- ADDQ R13, R10
- JMP copy_2_end
-
-copy_2_move_8through16:
- MOVQ (AX), CX
- MOVQ -8(AX)(R13*1), R14
- MOVQ CX, (R10)
- MOVQ R14, -8(R10)(R13*1)
- ADDQ R13, AX
- ADDQ R13, R10
-
-copy_2_end:
- JMP handle_loop
-
- // Copy overlapping match
-copy_overlapping_match:
- ADDQ R13, R12
-
-copy_slow_3:
- MOVB (AX), CL
- MOVB CL, (R10)
- INCQ AX
- INCQ R10
- DECQ R13
- JNZ copy_slow_3
-
-handle_loop:
- MOVQ ctx+16(FP), AX
- DECQ 96(AX)
- JNS sequenceDecs_decodeSync_safe_amd64_main_loop
-
-loop_finished:
- MOVQ br+8(FP), AX
- MOVQ DX, 24(AX)
- MOVB BL, 32(AX)
- MOVQ SI, 8(AX)
-
- // Update the context
- MOVQ ctx+16(FP), AX
- MOVQ R12, 136(AX)
- MOVQ 144(AX), CX
- SUBQ CX, R11
- MOVQ R11, 168(AX)
-
- // Return success
- MOVQ $0x00000000, ret+24(FP)
- RET
-
- // Return with match length error
-sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
- MOVQ 16(SP), AX
- MOVQ ctx+16(FP), CX
- MOVQ AX, 216(CX)
- MOVQ $0x00000001, ret+24(FP)
- RET
-
- // Return with match too long error
-sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
- MOVQ ctx+16(FP), AX
- MOVQ 16(SP), CX
- MOVQ CX, 216(AX)
- MOVQ $0x00000002, ret+24(FP)
- RET
-
- // Return with match offset too long error
-error_match_off_too_big:
- MOVQ ctx+16(FP), AX
- MOVQ 8(SP), CX
- MOVQ CX, 224(AX)
- MOVQ R12, 136(AX)
- MOVQ $0x00000003, ret+24(FP)
- RET
-
- // Return with not enough literals error
-error_not_enough_literals:
- MOVQ ctx+16(FP), AX
- MOVQ 24(SP), CX
- MOVQ CX, 208(AX)
- MOVQ $0x00000004, ret+24(FP)
- RET
-
- // Return with overread error
-error_overread:
- MOVQ $0x00000006, ret+24(FP)
- RET
-
- // Return with not enough output space error
-error_not_enough_space:
- MOVQ ctx+16(FP), AX
- MOVQ 24(SP), CX
- MOVQ CX, 208(AX)
- MOVQ 16(SP), CX
- MOVQ CX, 216(AX)
- MOVQ R12, 136(AX)
- MOVQ $0x00000005, ret+24(FP)
- RET
-
-// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
-// Requires: BMI, BMI2, CMOV, SSE. Purpose: decodes sequences from br and executes each one (literal copy, then match copy) into the output buffer; returns 0 on success or an error code 1-6 (see the error labels at the end).
-TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32 // 64-byte frame: eight 8-byte slots at 0(SP)..56(SP)
- MOVQ br+8(FP), BX
- MOVQ 24(BX), AX // AX = bit container (stored back to 24(br) on success)
- MOVBQZX 32(BX), DX // DX = number of bits already consumed from AX
- MOVQ (BX), CX // CX = input base pointer
- MOVQ 8(BX), BX // BX = input bytes not yet loaded (input is consumed backwards)
- ADDQ BX, CX
- MOVQ CX, (SP) // 0(SP) = current input read pointer (base + remaining)
- MOVQ ctx+16(FP), CX
- MOVQ 72(CX), SI // SI = literal-length state entry
- MOVQ 80(CX), DI // DI = match-length state entry
- MOVQ 88(CX), R8 // R8 = offset state entry
- XORQ R9, R9
- MOVQ R9, 8(SP) // 8(SP) = match offset of the current sequence
- MOVQ R9, 16(SP) // 16(SP) = match length of the current sequence
- MOVQ R9, 24(SP) // 24(SP) = literal length of the current sequence
- MOVQ 112(CX), R9 // R9 = output base pointer; becomes the output write cursor below
- MOVQ 128(CX), R10
- MOVQ R10, 32(SP) // 32(SP) = output capacity; converted to an end pointer below
- MOVQ 144(CX), R10 // R10 = literals read cursor
- MOVQ 136(CX), R11 // R11 = output position (the "t" of the check_offset comment)
- MOVQ 200(CX), R12
- MOVQ R12, 56(SP) // 56(SP) = window size (limit checked in check_offset)
- MOVQ 176(CX), R12
- MOVQ R12, 48(SP) // 48(SP) = history base pointer (presumably; confirm against decodeSyncAsmContext)
- MOVQ 184(CX), CX
- MOVQ CX, 40(SP) // 40(SP) = len(hist)
- MOVQ 40(SP), CX
- ADDQ CX, 48(SP) // 48(SP) = base + len(hist), i.e. one past the end of the history
-
- // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
- ADDQ R9, 32(SP)
-
- // outBase += outPosition
- ADDQ R11, R9
-
-sequenceDecs_decodeSync_safe_bmi2_main_loop:
- MOVQ (SP), R12 // R12 = input read pointer for this iteration
-
- // Fill bitreader to have enough for the offset and match length.
- CMPQ BX, $0x08
- JL sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
- MOVQ DX, CX
- SHRQ $0x03, CX // CX = whole bytes already consumed
- SUBQ CX, R12 // step the read pointer back by that many bytes
- MOVQ (R12), AX // reload 8 bytes into the bit container
- SUBQ CX, BX
- ANDQ $0x07, DX // keep only the sub-byte part of the consumed-bit count
- JMP sequenceDecs_decodeSync_safe_bmi2_fill_end
-
-sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
- CMPQ BX, $0x00
- JLE sequenceDecs_decodeSync_safe_bmi2_fill_check_overread // no input bytes left
- CMPQ DX, $0x07
- JLE sequenceDecs_decodeSync_safe_bmi2_fill_end // fewer than 8 bits consumed: nothing to refill
- SHLQ $0x08, AX // make room for one byte at the low end
- SUBQ $0x01, R12
- SUBQ $0x01, BX
- SUBQ $0x08, DX
- MOVBQZX (R12), CX
- ORQ CX, AX
- JMP sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
-
-sequenceDecs_decodeSync_safe_bmi2_fill_check_overread:
- CMPQ DX, $0x40 // more than 64 bits consumed with no input left
- JA error_overread
-
-sequenceDecs_decodeSync_safe_bmi2_fill_end:
- // Update offset
- MOVQ $0x00000808, CX // BEXTR control word: start bit 8, length 8
- BEXTRQ CX, R8, R13 // R13 = bits 8..15 of the offset entry = number of extra bits to read
- MOVQ AX, R14
- LEAQ (DX)(R13*1), CX // CX = consumed-bit count after this read
- ROLQ CL, R14 // rotate so the bits just read end up in the low bits
- BZHIQ R13, R14, R14 // keep only the low R13 bits
- MOVQ CX, DX
- MOVQ R8, CX
- SHRQ $0x20, CX // upper 32 bits of the entry = base value
- ADDQ R14, CX
- MOVQ CX, 8(SP) // match offset = base value + extra bits
-
- // Update match length
- MOVQ $0x00000808, CX
- BEXTRQ CX, DI, R13 // R13 = bits 8..15 of the match-length entry
- MOVQ AX, R14
- LEAQ (DX)(R13*1), CX
- ROLQ CL, R14
- BZHIQ R13, R14, R14
- MOVQ CX, DX
- MOVQ DI, CX
- SHRQ $0x20, CX
- ADDQ R14, CX
- MOVQ CX, 16(SP) // match length = base value + extra bits
-
- // Fill bitreader to have enough for the remaining
- CMPQ BX, $0x08
- JL sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
- MOVQ DX, CX
- SHRQ $0x03, CX
- SUBQ CX, R12
- MOVQ (R12), AX
- SUBQ CX, BX
- ANDQ $0x07, DX
- JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_end
-
-sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
- CMPQ BX, $0x00
- JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread
- CMPQ DX, $0x07
- JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
- SHLQ $0x08, AX
- SUBQ $0x01, R12
- SUBQ $0x01, BX
- SUBQ $0x08, DX
- MOVBQZX (R12), CX
- ORQ CX, AX
- JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
-
-sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread:
- CMPQ DX, $0x40
- JA error_overread
-
-sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
- // Update literal length
- MOVQ $0x00000808, CX
- BEXTRQ CX, SI, R13 // R13 = bits 8..15 of the literal-length entry
- MOVQ AX, R14
- LEAQ (DX)(R13*1), CX
- ROLQ CL, R14
- BZHIQ R13, R14, R14
- MOVQ CX, DX
- MOVQ SI, CX
- SHRQ $0x20, CX
- ADDQ R14, CX
- MOVQ CX, 24(SP) // literal length = base value + extra bits
-
- // Fill bitreader for state updates
- MOVQ R12, (SP) // save the input read pointer for the next iteration
- MOVQ $0x00000808, CX
- BEXTRQ CX, R8, R12 // R12 = extra-bit count of the offset entry; consumed by "Adjust offset" below
- MOVQ ctx+16(FP), CX
- CMPQ 96(CX), $0x00 // 96(ctx) is the loop counter decremented in handle_loop
- JZ sequenceDecs_decodeSync_safe_bmi2_skip_update // counter is 0: this is the last iteration, skip the state update
- LEAQ (SI)(DI*1), R13
- ADDQ R8, R13
- MOVBQZX R13, R13 // R13 = low byte of (SI+DI+R8) = total number of state bits to read
- LEAQ (DX)(R13*1), CX
- MOVQ AX, R14
- MOVQ CX, DX
- ROLQ CL, R14
- BZHIQ R13, R14, R14 // R14 = the state bits for all three tables, read in one go
-
- // Update Offset State
- BZHIQ R8, R14, CX // CX = low bits of R14; the bit count comes from the low byte of R8
- SHRXQ R8, R14, R14 // drop the bits just taken
- SHRL $0x10, R8 // R8 = bits 16..31 of the entry
- ADDQ CX, R8 // R8 = index of the next entry in ofTable
-
- // Load ctx.ofTable
- MOVQ ctx+16(FP), CX
- MOVQ 48(CX), CX
- MOVQ (CX)(R8*8), R8
-
- // Update Match Length State
- BZHIQ DI, R14, CX
- SHRXQ DI, R14, R14
- SHRL $0x10, DI
- ADDQ CX, DI // DI = index of the next entry in mlTable
-
- // Load ctx.mlTable
- MOVQ ctx+16(FP), CX
- MOVQ 24(CX), CX
- MOVQ (CX)(DI*8), DI
-
- // Update Literal Length State
- BZHIQ SI, R14, CX
- SHRL $0x10, SI
- ADDQ CX, SI // SI = index of the next entry in llTable
-
- // Load ctx.llTable
- MOVQ ctx+16(FP), CX
- MOVQ (CX), CX
- MOVQ (CX)(SI*8), SI
-
-sequenceDecs_decodeSync_safe_bmi2_skip_update:
- // Adjust offset
- MOVQ s+0(FP), CX // 144/152/160(s) hold three previous offsets (presumably s.prevOffset[0..2]; confirm)
- MOVQ 8(SP), R13 // R13 = decoded match offset
- CMPQ R12, $0x01 // R12 = extra-bit count of the offset entry
- JBE sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
- MOVUPS 144(CX), X0 // more than one extra bit: push the decoded offset to the front, shifting the two newest down
- MOVQ R13, 144(CX)
- MOVUPS X0, 152(CX)
- JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
-
-sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
- CMPQ 24(SP), $0x00000000
- JNE sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
- INCQ R13 // literal length is 0: the decoded value is shifted up by one
- JMP sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
-
-sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
- TESTQ R13, R13
- JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
- MOVQ 144(CX), R13 // value 0: reuse the most recent offset, history order unchanged
- JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
-
-sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
- MOVQ R13, R12
- XORQ R14, R14
- MOVQ $-1, R15
- CMPQ R13, $0x03
- CMOVQEQ R14, R12 // value 3: use slot 0 ...
- CMOVQEQ R15, R14 // ... and subtract 1 from it
- ADDQ 144(CX)(R12*8), R14 // R14 = candidate offset (slot R12, minus 1 when the value was 3)
- JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
- MOVQ $0x00000001, R14 // a candidate of 0 is replaced by 1
-
-sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
- CMPQ R13, $0x01
- JZ sequenceDecs_decodeSync_safe_bmi2_adjust_skip // value 1: slot 2 is left untouched
- MOVQ 152(CX), R12
- MOVQ R12, 160(CX) // slot 2 = slot 1
-
-sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
- MOVQ 144(CX), R12
- MOVQ R12, 152(CX) // slot 1 = slot 0
- MOVQ R14, 144(CX) // slot 0 = candidate offset
- MOVQ R14, R13
-
-sequenceDecs_decodeSync_safe_bmi2_after_adjust:
- MOVQ R13, 8(SP) // store the final match offset
-
- // Check values
- MOVQ 16(SP), CX // CX = match length
- MOVQ 24(SP), R12 // R12 = literal length
- LEAQ (CX)(R12*1), R14
- MOVQ s+0(FP), R15
- ADDQ R14, 256(R15) // accumulate ml+ll into 256(s) (presumably s.seqSize; confirm)
- MOVQ ctx+16(FP), R14
- SUBQ R12, 104(R14) // 104(ctx) = literals still available; going negative is an error
- JS error_not_enough_literals
- CMPQ CX, $0x00020002 // upper bound for a match length (131074)
- JA sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
- TESTQ R13, R13
- JNZ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
- TESTQ CX, CX // offset is 0: only valid when the match length is 0 too
- JNZ sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch
-
-sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
- MOVQ 24(SP), CX // CX = literal length
- MOVQ 8(SP), R12 // R12 = match offset
- MOVQ 16(SP), R13 // R13 = match length
-
- // Check if we have enough space in s.out
- LEAQ (CX)(R13*1), R14
- ADDQ R9, R14 // R14 = write cursor after this sequence
- CMPQ R14, 32(SP)
- JA error_not_enough_space
-
- // Copy literals
- TESTQ CX, CX
- JZ check_offset
- MOVQ CX, R14
- SUBQ $0x10, R14
- JB copy_1_small // fewer than 16 bytes
-
-copy_1_loop:
- MOVUPS (R10), X0 // 16 bytes per iteration
- MOVUPS X0, (R9)
- ADDQ $0x10, R10
- ADDQ $0x10, R9
- SUBQ $0x10, R14
- JAE copy_1_loop
- LEAQ 16(R10)(R14*1), R10 // R14 is now negative: move both cursors to the exact end of the copy
- LEAQ 16(R9)(R14*1), R9
- MOVUPS -16(R10), X0 // copy the final 16 bytes; may overlap bytes already copied
- MOVUPS X0, -16(R9)
- JMP copy_1_end
-
-copy_1_small:
- CMPQ CX, $0x03 // dispatch on length: 1-2, 3, 4-7, otherwise 8 and up
- JE copy_1_move_3
- JB copy_1_move_1or2
- CMPQ CX, $0x08
- JB copy_1_move_4through7
- JMP copy_1_move_8through16
-
-copy_1_move_1or2:
- MOVB (R10), R14 // first byte
- MOVB -1(R10)(CX*1), R15 // last byte (the same byte when the length is 1)
- MOVB R14, (R9)
- MOVB R15, -1(R9)(CX*1)
- ADDQ CX, R10
- ADDQ CX, R9
- JMP copy_1_end
-
-copy_1_move_3:
- MOVW (R10), R14
- MOVB 2(R10), R15
- MOVW R14, (R9)
- MOVB R15, 2(R9)
- ADDQ CX, R10
- ADDQ CX, R9
- JMP copy_1_end
-
-copy_1_move_4through7:
- MOVL (R10), R14 // first 4 bytes
- MOVL -4(R10)(CX*1), R15 // last 4 bytes; the two ranges may overlap
- MOVL R14, (R9)
- MOVL R15, -4(R9)(CX*1)
- ADDQ CX, R10
- ADDQ CX, R9
- JMP copy_1_end
-
-copy_1_move_8through16:
- MOVQ (R10), R14 // first 8 bytes
- MOVQ -8(R10)(CX*1), R15 // last 8 bytes; the two ranges may overlap
- MOVQ R14, (R9)
- MOVQ R15, -8(R9)(CX*1)
- ADDQ CX, R10
- ADDQ CX, R9
-
-copy_1_end:
- ADDQ CX, R11 // output position += literal length
-
- // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
-check_offset:
- MOVQ R11, CX
- ADDQ 40(SP), CX // CX = t + len(hist)
- CMPQ R12, CX
- JG error_match_off_too_big
- CMPQ R12, 56(SP) // compare the offset against the window size
- JG error_match_off_too_big
-
- // Copy match from history
- MOVQ R12, CX
- SUBQ R11, CX // CX = offset - output position
- JLS copy_match // offset <= output position: the match lies entirely in the current output
- MOVQ 48(SP), R14
- SUBQ CX, R14 // R14 = history end - CX = start of the match inside the history
- CMPQ R13, CX
- JG copy_all_from_history // match is longer than the CX bytes available in the history
- MOVQ R13, CX
- SUBQ $0x10, CX
- JB copy_4_small
-
-copy_4_loop:
- MOVUPS (R14), X0
- MOVUPS X0, (R9)
- ADDQ $0x10, R14
- ADDQ $0x10, R9
- SUBQ $0x10, CX
- JAE copy_4_loop
- LEAQ 16(R14)(CX*1), R14
- LEAQ 16(R9)(CX*1), R9
- MOVUPS -16(R14), X0
- MOVUPS X0, -16(R9)
- JMP copy_4_end
-
-copy_4_small:
- CMPQ R13, $0x03 // NOTE(review): no 1-2 byte case here; presumably match lengths below 3 never reach this path - confirm
- JE copy_4_move_3
- CMPQ R13, $0x08
- JB copy_4_move_4through7
- JMP copy_4_move_8through16
-
-copy_4_move_3:
- MOVW (R14), CX
- MOVB 2(R14), R12
- MOVW CX, (R9)
- MOVB R12, 2(R9)
- ADDQ R13, R14
- ADDQ R13, R9
- JMP copy_4_end
-
-copy_4_move_4through7:
- MOVL (R14), CX
- MOVL -4(R14)(R13*1), R12
- MOVL CX, (R9)
- MOVL R12, -4(R9)(R13*1)
- ADDQ R13, R14
- ADDQ R13, R9
- JMP copy_4_end
-
-copy_4_move_8through16:
- MOVQ (R14), CX
- MOVQ -8(R14)(R13*1), R12
- MOVQ CX, (R9)
- MOVQ R12, -8(R9)(R13*1)
- ADDQ R13, R14
- ADDQ R13, R9
-
-copy_4_end:
- ADDQ R13, R11 // output position += match length
- JMP handle_loop
- JMP loop_finished // unreachable: directly follows an unconditional JMP
-
-copy_all_from_history:
- MOVQ CX, R15 // CX = bytes to take from the history; the remainder is copied at copy_match
- SUBQ $0x10, R15
- JB copy_5_small
-
-copy_5_loop:
- MOVUPS (R14), X0
- MOVUPS X0, (R9)
- ADDQ $0x10, R14
- ADDQ $0x10, R9
- SUBQ $0x10, R15
- JAE copy_5_loop
- LEAQ 16(R14)(R15*1), R14
- LEAQ 16(R9)(R15*1), R9
- MOVUPS -16(R14), X0
- MOVUPS X0, -16(R9)
- JMP copy_5_end
-
-copy_5_small:
- CMPQ CX, $0x03
- JE copy_5_move_3
- JB copy_5_move_1or2
- CMPQ CX, $0x08
- JB copy_5_move_4through7
- JMP copy_5_move_8through16
-
-copy_5_move_1or2:
- MOVB (R14), R15
- MOVB -1(R14)(CX*1), BP
- MOVB R15, (R9)
- MOVB BP, -1(R9)(CX*1)
- ADDQ CX, R14
- ADDQ CX, R9
- JMP copy_5_end
-
-copy_5_move_3:
- MOVW (R14), R15
- MOVB 2(R14), BP
- MOVW R15, (R9)
- MOVB BP, 2(R9)
- ADDQ CX, R14
- ADDQ CX, R9
- JMP copy_5_end
-
-copy_5_move_4through7:
- MOVL (R14), R15
- MOVL -4(R14)(CX*1), BP
- MOVL R15, (R9)
- MOVL BP, -4(R9)(CX*1)
- ADDQ CX, R14
- ADDQ CX, R9
- JMP copy_5_end
-
-copy_5_move_8through16:
- MOVQ (R14), R15
- MOVQ -8(R14)(CX*1), BP
- MOVQ R15, (R9)
- MOVQ BP, -8(R9)(CX*1)
- ADDQ CX, R14
- ADDQ CX, R9
-
-copy_5_end:
- ADDQ CX, R11 // output position += bytes taken from the history
- SUBQ CX, R13 // R13 = match bytes still to copy
-
- // Copy match from the current buffer
-copy_match:
- MOVQ R9, CX
- SUBQ R12, CX // CX = source pointer = write cursor - match offset
-
- // ml <= mo
- CMPQ R13, R12
- JA copy_overlapping_match // ml > mo: source and destination overlap
-
- // Copy non-overlapping match
- ADDQ R13, R11 // output position += match length
- MOVQ R13, R12
- SUBQ $0x10, R12
- JB copy_2_small
-
-copy_2_loop:
- MOVUPS (CX), X0
- MOVUPS X0, (R9)
- ADDQ $0x10, CX
- ADDQ $0x10, R9
- SUBQ $0x10, R12
- JAE copy_2_loop
- LEAQ 16(CX)(R12*1), CX
- LEAQ 16(R9)(R12*1), R9
- MOVUPS -16(CX), X0
- MOVUPS X0, -16(R9)
- JMP copy_2_end
-
-copy_2_small:
- CMPQ R13, $0x03
- JE copy_2_move_3
- JB copy_2_move_1or2
- CMPQ R13, $0x08
- JB copy_2_move_4through7
- JMP copy_2_move_8through16
-
-copy_2_move_1or2:
- MOVB (CX), R12
- MOVB -1(CX)(R13*1), R14
- MOVB R12, (R9)
- MOVB R14, -1(R9)(R13*1)
- ADDQ R13, CX
- ADDQ R13, R9
- JMP copy_2_end
-
-copy_2_move_3:
- MOVW (CX), R12
- MOVB 2(CX), R14
- MOVW R12, (R9)
- MOVB R14, 2(R9)
- ADDQ R13, CX
- ADDQ R13, R9
- JMP copy_2_end
-
-copy_2_move_4through7:
- MOVL (CX), R12
- MOVL -4(CX)(R13*1), R14
- MOVL R12, (R9)
- MOVL R14, -4(R9)(R13*1)
- ADDQ R13, CX
- ADDQ R13, R9
- JMP copy_2_end
-
-copy_2_move_8through16:
- MOVQ (CX), R12
- MOVQ -8(CX)(R13*1), R14
- MOVQ R12, (R9)
- MOVQ R14, -8(R9)(R13*1)
- ADDQ R13, CX
- ADDQ R13, R9
-
-copy_2_end:
- JMP handle_loop
-
- // Copy overlapping match
-copy_overlapping_match:
- ADDQ R13, R11 // output position += match length
-
-copy_slow_3:
- MOVB (CX), R12 // one byte at a time, so bytes written earlier in this loop can be read back as source
- MOVB R12, (R9)
- INCQ CX
- INCQ R9
- DECQ R13
- JNZ copy_slow_3
-
-handle_loop:
- MOVQ ctx+16(FP), CX
- DECQ 96(CX) // decrement the loop counter
- JNS sequenceDecs_decodeSync_safe_bmi2_main_loop // keep looping while it is still >= 0
-
-loop_finished:
- MOVQ br+8(FP), CX // write the bit reader state back to br
- MOVQ AX, 24(CX)
- MOVB DL, 32(CX)
- MOVQ BX, 8(CX)
-
- // Update the context
- MOVQ ctx+16(FP), AX
- MOVQ R11, 136(AX) // store the new output position
- MOVQ 144(AX), CX
- SUBQ CX, R10 // R10 = literals cursor - literals base = literal bytes consumed
- MOVQ R10, 168(AX)
-
- // Return success
- MOVQ $0x00000000, ret+24(FP)
- RET
-
- // Return with match length error
-sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
- MOVQ 16(SP), AX
- MOVQ ctx+16(FP), CX
- MOVQ AX, 216(CX) // report the offending match length in 216(ctx)
- MOVQ $0x00000001, ret+24(FP)
- RET
-
- // Return with match too long error
-sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
- MOVQ ctx+16(FP), AX
- MOVQ 16(SP), CX
- MOVQ CX, 216(AX) // report the offending match length in 216(ctx)
- MOVQ $0x00000002, ret+24(FP)
- RET
-
- // Return with match offset too long error
-error_match_off_too_big:
- MOVQ ctx+16(FP), AX
- MOVQ 8(SP), CX
- MOVQ CX, 224(AX) // report the offending match offset in 224(ctx)
- MOVQ R11, 136(AX) // also store the output position reached so far
- MOVQ $0x00000003, ret+24(FP)
- RET
-
- // Return with not enough literals error
-error_not_enough_literals:
- MOVQ ctx+16(FP), AX
- MOVQ 24(SP), CX
- MOVQ CX, 208(AX) // report the offending literal length in 208(ctx)
- MOVQ $0x00000004, ret+24(FP)
- RET
-
- // Return with overread error
-error_overread:
- MOVQ $0x00000006, ret+24(FP)
- RET
-
- // Return with not enough output space error
-error_not_enough_space:
- MOVQ ctx+16(FP), AX
- MOVQ 24(SP), CX
- MOVQ CX, 208(AX) // report the literal length in 208(ctx)
- MOVQ 16(SP), CX
- MOVQ CX, 216(AX) // report the match length in 216(ctx)
- MOVQ R11, 136(AX) // also store the output position reached so far
- MOVQ $0x00000005, ret+24(FP)
- RET