diff options
Diffstat (limited to 'vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s')
-rw-r--r-- | vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s | 4151 |
1 files changed, 0 insertions, 4151 deletions
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s deleted file mode 100644 index f5591fa1e..000000000 --- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s +++ /dev/null @@ -1,4151 +0,0 @@ -// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT. - -//go:build !appengine && !noasm && gc && !noasm - -// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int -// Requires: CMOV -TEXT ·sequenceDecs_decode_amd64(SB), $8-32 - MOVQ br+8(FP), CX - MOVQ 24(CX), DX - MOVBQZX 32(CX), BX - MOVQ (CX), AX - MOVQ 8(CX), SI - ADDQ SI, AX - MOVQ AX, (SP) - MOVQ ctx+16(FP), AX - MOVQ 72(AX), DI - MOVQ 80(AX), R8 - MOVQ 88(AX), R9 - MOVQ 104(AX), R10 - MOVQ s+0(FP), AX - MOVQ 144(AX), R11 - MOVQ 152(AX), R12 - MOVQ 160(AX), R13 - -sequenceDecs_decode_amd64_main_loop: - MOVQ (SP), R14 - - // Fill bitreader to have enough for the offset and match length. - CMPQ SI, $0x08 - JL sequenceDecs_decode_amd64_fill_byte_by_byte - MOVQ BX, AX - SHRQ $0x03, AX - SUBQ AX, R14 - MOVQ (R14), DX - SUBQ AX, SI - ANDQ $0x07, BX - JMP sequenceDecs_decode_amd64_fill_end - -sequenceDecs_decode_amd64_fill_byte_by_byte: - CMPQ SI, $0x00 - JLE sequenceDecs_decode_amd64_fill_check_overread - CMPQ BX, $0x07 - JLE sequenceDecs_decode_amd64_fill_end - SHLQ $0x08, DX - SUBQ $0x01, R14 - SUBQ $0x01, SI - SUBQ $0x08, BX - MOVBQZX (R14), AX - ORQ AX, DX - JMP sequenceDecs_decode_amd64_fill_byte_by_byte - -sequenceDecs_decode_amd64_fill_check_overread: - CMPQ BX, $0x40 - JA error_overread - -sequenceDecs_decode_amd64_fill_end: - // Update offset - MOVQ R9, AX - MOVQ BX, CX - MOVQ DX, R15 - SHLQ CL, R15 - MOVB AH, CL - SHRQ $0x20, AX - TESTQ CX, CX - JZ sequenceDecs_decode_amd64_of_update_zero - ADDQ CX, BX - CMPQ BX, $0x40 - JA sequenceDecs_decode_amd64_of_update_zero - CMPQ CX, $0x40 - JAE sequenceDecs_decode_amd64_of_update_zero - NEGQ CX - SHRQ CL, R15 - ADDQ R15, AX - -sequenceDecs_decode_amd64_of_update_zero: - MOVQ AX, 16(R10) - - // Update match length - MOVQ R8, AX - MOVQ BX, CX - MOVQ DX, R15 - SHLQ CL, R15 - MOVB AH, CL - SHRQ $0x20, AX - TESTQ CX, CX - JZ sequenceDecs_decode_amd64_ml_update_zero - ADDQ CX, BX - CMPQ BX, $0x40 - JA sequenceDecs_decode_amd64_ml_update_zero - CMPQ CX, $0x40 - JAE sequenceDecs_decode_amd64_ml_update_zero - NEGQ CX - SHRQ CL, R15 - ADDQ R15, AX - -sequenceDecs_decode_amd64_ml_update_zero: - MOVQ AX, 8(R10) - - // Fill bitreader to have enough for the remaining - CMPQ SI, $0x08 - JL sequenceDecs_decode_amd64_fill_2_byte_by_byte - MOVQ BX, AX - SHRQ $0x03, AX - SUBQ AX, R14 - MOVQ (R14), DX - SUBQ AX, SI - ANDQ $0x07, BX - JMP sequenceDecs_decode_amd64_fill_2_end - -sequenceDecs_decode_amd64_fill_2_byte_by_byte: - CMPQ SI, $0x00 - JLE sequenceDecs_decode_amd64_fill_2_check_overread - CMPQ BX, $0x07 - JLE sequenceDecs_decode_amd64_fill_2_end - SHLQ $0x08, DX - SUBQ $0x01, R14 - SUBQ $0x01, SI - SUBQ $0x08, BX - MOVBQZX (R14), AX - ORQ AX, DX - JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte - -sequenceDecs_decode_amd64_fill_2_check_overread: - CMPQ BX, $0x40 - JA error_overread - -sequenceDecs_decode_amd64_fill_2_end: - // Update literal length - MOVQ DI, AX - MOVQ BX, CX - MOVQ DX, R15 - SHLQ CL, R15 - MOVB AH, CL - SHRQ $0x20, AX - TESTQ CX, CX - JZ sequenceDecs_decode_amd64_ll_update_zero - ADDQ CX, BX - CMPQ BX, $0x40 - JA sequenceDecs_decode_amd64_ll_update_zero - CMPQ CX, $0x40 - JAE sequenceDecs_decode_amd64_ll_update_zero - NEGQ CX - SHRQ CL, R15 - ADDQ R15, AX - -sequenceDecs_decode_amd64_ll_update_zero: - MOVQ AX, (R10) - - // Fill bitreader for state updates - MOVQ R14, (SP) - MOVQ R9, AX - SHRQ $0x08, AX - MOVBQZX AL, AX - MOVQ ctx+16(FP), CX - CMPQ 96(CX), $0x00 - JZ sequenceDecs_decode_amd64_skip_update - - // Update Literal Length State - MOVBQZX DI, R14 - SHRL $0x10, DI - LEAQ (BX)(R14*1), CX - MOVQ DX, R15 - MOVQ CX, BX - ROLQ CL, R15 - MOVL $0x00000001, BP - MOVB R14, CL - SHLL CL, BP - DECL BP - ANDQ BP, R15 - ADDQ R15, DI - - // Load ctx.llTable - MOVQ ctx+16(FP), CX - MOVQ (CX), CX - MOVQ (CX)(DI*8), DI - - // Update Match Length State - MOVBQZX R8, R14 - SHRL $0x10, R8 - LEAQ (BX)(R14*1), CX - MOVQ DX, R15 - MOVQ CX, BX - ROLQ CL, R15 - MOVL $0x00000001, BP - MOVB R14, CL - SHLL CL, BP - DECL BP - ANDQ BP, R15 - ADDQ R15, R8 - - // Load ctx.mlTable - MOVQ ctx+16(FP), CX - MOVQ 24(CX), CX - MOVQ (CX)(R8*8), R8 - - // Update Offset State - MOVBQZX R9, R14 - SHRL $0x10, R9 - LEAQ (BX)(R14*1), CX - MOVQ DX, R15 - MOVQ CX, BX - ROLQ CL, R15 - MOVL $0x00000001, BP - MOVB R14, CL - SHLL CL, BP - DECL BP - ANDQ BP, R15 - ADDQ R15, R9 - - // Load ctx.ofTable - MOVQ ctx+16(FP), CX - MOVQ 48(CX), CX - MOVQ (CX)(R9*8), R9 - -sequenceDecs_decode_amd64_skip_update: - // Adjust offset - MOVQ 16(R10), CX - CMPQ AX, $0x01 - JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0 - MOVQ R12, R13 - MOVQ R11, R12 - MOVQ CX, R11 - JMP sequenceDecs_decode_amd64_after_adjust - -sequenceDecs_decode_amd64_adjust_offsetB_1_or_0: - CMPQ (R10), $0x00000000 - JNE sequenceDecs_decode_amd64_adjust_offset_maybezero - INCQ CX - JMP sequenceDecs_decode_amd64_adjust_offset_nonzero - -sequenceDecs_decode_amd64_adjust_offset_maybezero: - TESTQ CX, CX - JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero - MOVQ R11, CX - JMP sequenceDecs_decode_amd64_after_adjust - -sequenceDecs_decode_amd64_adjust_offset_nonzero: - CMPQ CX, $0x01 - JB sequenceDecs_decode_amd64_adjust_zero - JEQ sequenceDecs_decode_amd64_adjust_one - CMPQ CX, $0x02 - JA sequenceDecs_decode_amd64_adjust_three - JMP sequenceDecs_decode_amd64_adjust_two - -sequenceDecs_decode_amd64_adjust_zero: - MOVQ R11, AX - JMP sequenceDecs_decode_amd64_adjust_test_temp_valid - -sequenceDecs_decode_amd64_adjust_one: - MOVQ R12, AX - JMP sequenceDecs_decode_amd64_adjust_test_temp_valid - -sequenceDecs_decode_amd64_adjust_two: - MOVQ R13, AX - JMP sequenceDecs_decode_amd64_adjust_test_temp_valid - -sequenceDecs_decode_amd64_adjust_three: - LEAQ -1(R11), AX - -sequenceDecs_decode_amd64_adjust_test_temp_valid: - TESTQ AX, AX - JNZ sequenceDecs_decode_amd64_adjust_temp_valid - MOVQ $0x00000001, AX - -sequenceDecs_decode_amd64_adjust_temp_valid: - CMPQ CX, $0x01 - CMOVQNE R12, R13 - MOVQ R11, R12 - MOVQ AX, R11 - MOVQ AX, CX - -sequenceDecs_decode_amd64_after_adjust: - MOVQ CX, 16(R10) - - // Check values - MOVQ 8(R10), AX - MOVQ (R10), R14 - LEAQ (AX)(R14*1), R15 - MOVQ s+0(FP), BP - ADDQ R15, 256(BP) - MOVQ ctx+16(FP), R15 - SUBQ R14, 128(R15) - JS error_not_enough_literals - CMPQ AX, $0x00020002 - JA sequenceDecs_decode_amd64_error_match_len_too_big - TESTQ CX, CX - JNZ sequenceDecs_decode_amd64_match_len_ofs_ok - TESTQ AX, AX - JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch - -sequenceDecs_decode_amd64_match_len_ofs_ok: - ADDQ $0x18, R10 - MOVQ ctx+16(FP), AX - DECQ 96(AX) - JNS sequenceDecs_decode_amd64_main_loop - MOVQ s+0(FP), AX - MOVQ R11, 144(AX) - MOVQ R12, 152(AX) - MOVQ R13, 160(AX) - MOVQ br+8(FP), AX - MOVQ DX, 24(AX) - MOVB BL, 32(AX) - MOVQ SI, 8(AX) - - // Return success - MOVQ $0x00000000, ret+24(FP) - RET - - // Return with match length error -sequenceDecs_decode_amd64_error_match_len_ofs_mismatch: - MOVQ $0x00000001, ret+24(FP) - RET - - // Return with match too long error -sequenceDecs_decode_amd64_error_match_len_too_big: - MOVQ $0x00000002, ret+24(FP) - RET - - // Return with match offset too long error - MOVQ $0x00000003, ret+24(FP) - RET - - // Return with not enough literals error -error_not_enough_literals: - MOVQ $0x00000004, ret+24(FP) - RET - - // Return with overread error -error_overread: - MOVQ $0x00000006, ret+24(FP) - RET - -// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int -// Requires: CMOV -TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32 - MOVQ br+8(FP), CX - MOVQ 24(CX), DX - MOVBQZX 32(CX), BX - MOVQ (CX), AX - MOVQ 8(CX), SI - ADDQ SI, AX - MOVQ AX, (SP) - MOVQ ctx+16(FP), AX - MOVQ 72(AX), DI - MOVQ 80(AX), R8 - MOVQ 88(AX), R9 - MOVQ 104(AX), R10 - MOVQ s+0(FP), AX - MOVQ 144(AX), R11 - MOVQ 152(AX), R12 - MOVQ 160(AX), R13 - -sequenceDecs_decode_56_amd64_main_loop: - MOVQ (SP), R14 - - // Fill bitreader to have enough for the offset and match length. - CMPQ SI, $0x08 - JL sequenceDecs_decode_56_amd64_fill_byte_by_byte - MOVQ BX, AX - SHRQ $0x03, AX - SUBQ AX, R14 - MOVQ (R14), DX - SUBQ AX, SI - ANDQ $0x07, BX - JMP sequenceDecs_decode_56_amd64_fill_end - -sequenceDecs_decode_56_amd64_fill_byte_by_byte: - CMPQ SI, $0x00 - JLE sequenceDecs_decode_56_amd64_fill_check_overread - CMPQ BX, $0x07 - JLE sequenceDecs_decode_56_amd64_fill_end - SHLQ $0x08, DX - SUBQ $0x01, R14 - SUBQ $0x01, SI - SUBQ $0x08, BX - MOVBQZX (R14), AX - ORQ AX, DX - JMP sequenceDecs_decode_56_amd64_fill_byte_by_byte - -sequenceDecs_decode_56_amd64_fill_check_overread: - CMPQ BX, $0x40 - JA error_overread - -sequenceDecs_decode_56_amd64_fill_end: - // Update offset - MOVQ R9, AX - MOVQ BX, CX - MOVQ DX, R15 - SHLQ CL, R15 - MOVB AH, CL - SHRQ $0x20, AX - TESTQ CX, CX - JZ sequenceDecs_decode_56_amd64_of_update_zero - ADDQ CX, BX - CMPQ BX, $0x40 - JA sequenceDecs_decode_56_amd64_of_update_zero - CMPQ CX, $0x40 - JAE sequenceDecs_decode_56_amd64_of_update_zero - NEGQ CX - SHRQ CL, R15 - ADDQ R15, AX - -sequenceDecs_decode_56_amd64_of_update_zero: - MOVQ AX, 16(R10) - - // Update match length - MOVQ R8, AX - MOVQ BX, CX - MOVQ DX, R15 - SHLQ CL, R15 - MOVB AH, CL - SHRQ $0x20, AX - TESTQ CX, CX - JZ sequenceDecs_decode_56_amd64_ml_update_zero - ADDQ CX, BX - CMPQ BX, $0x40 - JA sequenceDecs_decode_56_amd64_ml_update_zero - CMPQ CX, $0x40 - JAE sequenceDecs_decode_56_amd64_ml_update_zero - NEGQ CX - SHRQ CL, R15 - ADDQ R15, AX - -sequenceDecs_decode_56_amd64_ml_update_zero: - MOVQ AX, 8(R10) - - // Update literal length - MOVQ DI, AX - MOVQ BX, CX - MOVQ DX, R15 - SHLQ CL, R15 - MOVB AH, CL - SHRQ $0x20, AX - TESTQ CX, CX - JZ sequenceDecs_decode_56_amd64_ll_update_zero - ADDQ CX, BX - CMPQ BX, $0x40 - JA sequenceDecs_decode_56_amd64_ll_update_zero - CMPQ CX, $0x40 - JAE sequenceDecs_decode_56_amd64_ll_update_zero - NEGQ CX - SHRQ CL, R15 - ADDQ R15, AX - -sequenceDecs_decode_56_amd64_ll_update_zero: - MOVQ AX, (R10) - - // Fill bitreader for state updates - MOVQ R14, (SP) - MOVQ R9, AX - SHRQ $0x08, AX - MOVBQZX AL, AX - MOVQ ctx+16(FP), CX - CMPQ 96(CX), $0x00 - JZ sequenceDecs_decode_56_amd64_skip_update - - // Update Literal Length State - MOVBQZX DI, R14 - SHRL $0x10, DI - LEAQ (BX)(R14*1), CX - MOVQ DX, R15 - MOVQ CX, BX - ROLQ CL, R15 - MOVL $0x00000001, BP - MOVB R14, CL - SHLL CL, BP - DECL BP - ANDQ BP, R15 - ADDQ R15, DI - - // Load ctx.llTable - MOVQ ctx+16(FP), CX - MOVQ (CX), CX - MOVQ (CX)(DI*8), DI - - // Update Match Length State - MOVBQZX R8, R14 - SHRL $0x10, R8 - LEAQ (BX)(R14*1), CX - MOVQ DX, R15 - MOVQ CX, BX - ROLQ CL, R15 - MOVL $0x00000001, BP - MOVB R14, CL - SHLL CL, BP - DECL BP - ANDQ BP, R15 - ADDQ R15, R8 - - // Load ctx.mlTable - MOVQ ctx+16(FP), CX - MOVQ 24(CX), CX - MOVQ (CX)(R8*8), R8 - - // Update Offset State - MOVBQZX R9, R14 - SHRL $0x10, R9 - LEAQ (BX)(R14*1), CX - MOVQ DX, R15 - MOVQ CX, BX - ROLQ CL, R15 - MOVL $0x00000001, BP - MOVB R14, CL - SHLL CL, BP - DECL BP - ANDQ BP, R15 - ADDQ R15, R9 - - // Load ctx.ofTable - MOVQ ctx+16(FP), CX - MOVQ 48(CX), CX - MOVQ (CX)(R9*8), R9 - -sequenceDecs_decode_56_amd64_skip_update: - // Adjust offset - MOVQ 16(R10), CX - CMPQ AX, $0x01 - JBE sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0 - MOVQ R12, R13 - MOVQ R11, R12 - MOVQ CX, R11 - JMP sequenceDecs_decode_56_amd64_after_adjust - -sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0: - CMPQ (R10), $0x00000000 - JNE sequenceDecs_decode_56_amd64_adjust_offset_maybezero - INCQ CX - JMP sequenceDecs_decode_56_amd64_adjust_offset_nonzero - -sequenceDecs_decode_56_amd64_adjust_offset_maybezero: - TESTQ CX, CX - JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero - MOVQ R11, CX - JMP sequenceDecs_decode_56_amd64_after_adjust - -sequenceDecs_decode_56_amd64_adjust_offset_nonzero: - CMPQ CX, $0x01 - JB sequenceDecs_decode_56_amd64_adjust_zero - JEQ sequenceDecs_decode_56_amd64_adjust_one - CMPQ CX, $0x02 - JA sequenceDecs_decode_56_amd64_adjust_three - JMP sequenceDecs_decode_56_amd64_adjust_two - -sequenceDecs_decode_56_amd64_adjust_zero: - MOVQ R11, AX - JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid - -sequenceDecs_decode_56_amd64_adjust_one: - MOVQ R12, AX - JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid - -sequenceDecs_decode_56_amd64_adjust_two: - MOVQ R13, AX - JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid - -sequenceDecs_decode_56_amd64_adjust_three: - LEAQ -1(R11), AX - -sequenceDecs_decode_56_amd64_adjust_test_temp_valid: - TESTQ AX, AX - JNZ sequenceDecs_decode_56_amd64_adjust_temp_valid - MOVQ $0x00000001, AX - -sequenceDecs_decode_56_amd64_adjust_temp_valid: - CMPQ CX, $0x01 - CMOVQNE R12, R13 - MOVQ R11, R12 - MOVQ AX, R11 - MOVQ AX, CX - -sequenceDecs_decode_56_amd64_after_adjust: - MOVQ CX, 16(R10) - - // Check values - MOVQ 8(R10), AX - MOVQ (R10), R14 - LEAQ (AX)(R14*1), R15 - MOVQ s+0(FP), BP - ADDQ R15, 256(BP) - MOVQ ctx+16(FP), R15 - SUBQ R14, 128(R15) - JS error_not_enough_literals - CMPQ AX, $0x00020002 - JA sequenceDecs_decode_56_amd64_error_match_len_too_big - TESTQ CX, CX - JNZ sequenceDecs_decode_56_amd64_match_len_ofs_ok - TESTQ AX, AX - JNZ sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch - -sequenceDecs_decode_56_amd64_match_len_ofs_ok: - ADDQ $0x18, R10 - MOVQ ctx+16(FP), AX - DECQ 96(AX) - JNS sequenceDecs_decode_56_amd64_main_loop - MOVQ s+0(FP), AX - MOVQ R11, 144(AX) - MOVQ R12, 152(AX) - MOVQ R13, 160(AX) - MOVQ br+8(FP), AX - MOVQ DX, 24(AX) - MOVB BL, 32(AX) - MOVQ SI, 8(AX) - - // Return success - MOVQ $0x00000000, ret+24(FP) - RET - - // Return with match length error -sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch: - MOVQ $0x00000001, ret+24(FP) - RET - - // Return with match too long error -sequenceDecs_decode_56_amd64_error_match_len_too_big: - MOVQ $0x00000002, ret+24(FP) - RET - - // Return with match offset too long error - MOVQ $0x00000003, ret+24(FP) - RET - - // Return with not enough literals error -error_not_enough_literals: - MOVQ $0x00000004, ret+24(FP) - RET - - // Return with overread error -error_overread: - MOVQ $0x00000006, ret+24(FP) - RET - -// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int -// Requires: BMI, BMI2, CMOV -TEXT ·sequenceDecs_decode_bmi2(SB), $8-32 - MOVQ br+8(FP), BX - MOVQ 24(BX), AX - MOVBQZX 32(BX), DX - MOVQ (BX), CX - MOVQ 8(BX), BX - ADDQ BX, CX - MOVQ CX, (SP) - MOVQ ctx+16(FP), CX - MOVQ 72(CX), SI - MOVQ 80(CX), DI - MOVQ 88(CX), R8 - MOVQ 104(CX), R9 - MOVQ s+0(FP), CX - MOVQ 144(CX), R10 - MOVQ 152(CX), R11 - MOVQ 160(CX), R12 - -sequenceDecs_decode_bmi2_main_loop: - MOVQ (SP), R13 - - // Fill bitreader to have enough for the offset and match length. - CMPQ BX, $0x08 - JL sequenceDecs_decode_bmi2_fill_byte_by_byte - MOVQ DX, CX - SHRQ $0x03, CX - SUBQ CX, R13 - MOVQ (R13), AX - SUBQ CX, BX - ANDQ $0x07, DX - JMP sequenceDecs_decode_bmi2_fill_end - -sequenceDecs_decode_bmi2_fill_byte_by_byte: - CMPQ BX, $0x00 - JLE sequenceDecs_decode_bmi2_fill_check_overread - CMPQ DX, $0x07 - JLE sequenceDecs_decode_bmi2_fill_end - SHLQ $0x08, AX - SUBQ $0x01, R13 - SUBQ $0x01, BX - SUBQ $0x08, DX - MOVBQZX (R13), CX - ORQ CX, AX - JMP sequenceDecs_decode_bmi2_fill_byte_by_byte - -sequenceDecs_decode_bmi2_fill_check_overread: - CMPQ DX, $0x40 - JA error_overread - -sequenceDecs_decode_bmi2_fill_end: - // Update offset - MOVQ $0x00000808, CX - BEXTRQ CX, R8, R14 - MOVQ AX, R15 - LEAQ (DX)(R14*1), CX - ROLQ CL, R15 - BZHIQ R14, R15, R15 - MOVQ CX, DX - MOVQ R8, CX - SHRQ $0x20, CX - ADDQ R15, CX - MOVQ CX, 16(R9) - - // Update match length - MOVQ $0x00000808, CX - BEXTRQ CX, DI, R14 - MOVQ AX, R15 - LEAQ (DX)(R14*1), CX - ROLQ CL, R15 - BZHIQ R14, R15, R15 - MOVQ CX, DX - MOVQ DI, CX - SHRQ $0x20, CX - ADDQ R15, CX - MOVQ CX, 8(R9) - - // Fill bitreader to have enough for the remaining - CMPQ BX, $0x08 - JL sequenceDecs_decode_bmi2_fill_2_byte_by_byte - MOVQ DX, CX - SHRQ $0x03, CX - SUBQ CX, R13 - MOVQ (R13), AX - SUBQ CX, BX - ANDQ $0x07, DX - JMP sequenceDecs_decode_bmi2_fill_2_end - -sequenceDecs_decode_bmi2_fill_2_byte_by_byte: - CMPQ BX, $0x00 - JLE sequenceDecs_decode_bmi2_fill_2_check_overread - CMPQ DX, $0x07 - JLE sequenceDecs_decode_bmi2_fill_2_end - SHLQ $0x08, AX - SUBQ $0x01, R13 - SUBQ $0x01, BX - SUBQ $0x08, DX - MOVBQZX (R13), CX - ORQ CX, AX - JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte - -sequenceDecs_decode_bmi2_fill_2_check_overread: - CMPQ DX, $0x40 - JA error_overread - -sequenceDecs_decode_bmi2_fill_2_end: - // Update literal length - MOVQ $0x00000808, CX - BEXTRQ CX, SI, R14 - MOVQ AX, R15 - LEAQ (DX)(R14*1), CX - ROLQ CL, R15 - BZHIQ R14, R15, R15 - MOVQ CX, DX - MOVQ SI, CX - SHRQ $0x20, CX - ADDQ R15, CX - MOVQ CX, (R9) - - // Fill bitreader for state updates - MOVQ R13, (SP) - MOVQ $0x00000808, CX - BEXTRQ CX, R8, R13 - MOVQ ctx+16(FP), CX - CMPQ 96(CX), $0x00 - JZ sequenceDecs_decode_bmi2_skip_update - LEAQ (SI)(DI*1), R14 - ADDQ R8, R14 - MOVBQZX R14, R14 - LEAQ (DX)(R14*1), CX - MOVQ AX, R15 - MOVQ CX, DX - ROLQ CL, R15 - BZHIQ R14, R15, R15 - - // Update Offset State - BZHIQ R8, R15, CX - SHRXQ R8, R15, R15 - SHRL $0x10, R8 - ADDQ CX, R8 - - // Load ctx.ofTable - MOVQ ctx+16(FP), CX - MOVQ 48(CX), CX - MOVQ (CX)(R8*8), R8 - - // Update Match Length State - BZHIQ DI, R15, CX - SHRXQ DI, R15, R15 - SHRL $0x10, DI - ADDQ CX, DI - - // Load ctx.mlTable - MOVQ ctx+16(FP), CX - MOVQ 24(CX), CX - MOVQ (CX)(DI*8), DI - - // Update Literal Length State - BZHIQ SI, R15, CX - SHRL $0x10, SI - ADDQ CX, SI - - // Load ctx.llTable - MOVQ ctx+16(FP), CX - MOVQ (CX), CX - MOVQ (CX)(SI*8), SI - -sequenceDecs_decode_bmi2_skip_update: - // Adjust offset - MOVQ 16(R9), CX - CMPQ R13, $0x01 - JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0 - MOVQ R11, R12 - MOVQ R10, R11 - MOVQ CX, R10 - JMP sequenceDecs_decode_bmi2_after_adjust - -sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0: - CMPQ (R9), $0x00000000 - JNE sequenceDecs_decode_bmi2_adjust_offset_maybezero - INCQ CX - JMP sequenceDecs_decode_bmi2_adjust_offset_nonzero - -sequenceDecs_decode_bmi2_adjust_offset_maybezero: - TESTQ CX, CX - JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero - MOVQ R10, CX - JMP sequenceDecs_decode_bmi2_after_adjust - -sequenceDecs_decode_bmi2_adjust_offset_nonzero: - CMPQ CX, $0x01 - JB sequenceDecs_decode_bmi2_adjust_zero - JEQ sequenceDecs_decode_bmi2_adjust_one - CMPQ CX, $0x02 - JA sequenceDecs_decode_bmi2_adjust_three - JMP sequenceDecs_decode_bmi2_adjust_two - -sequenceDecs_decode_bmi2_adjust_zero: - MOVQ R10, R13 - JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid - -sequenceDecs_decode_bmi2_adjust_one: - MOVQ R11, R13 - JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid - -sequenceDecs_decode_bmi2_adjust_two: - MOVQ R12, R13 - JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid - -sequenceDecs_decode_bmi2_adjust_three: - LEAQ -1(R10), R13 - -sequenceDecs_decode_bmi2_adjust_test_temp_valid: - TESTQ R13, R13 - JNZ sequenceDecs_decode_bmi2_adjust_temp_valid - MOVQ $0x00000001, R13 - -sequenceDecs_decode_bmi2_adjust_temp_valid: - CMPQ CX, $0x01 - CMOVQNE R11, R12 - MOVQ R10, R11 - MOVQ R13, R10 - MOVQ R13, CX - -sequenceDecs_decode_bmi2_after_adjust: - MOVQ CX, 16(R9) - - // Check values - MOVQ 8(R9), R13 - MOVQ (R9), R14 - LEAQ (R13)(R14*1), R15 - MOVQ s+0(FP), BP - ADDQ R15, 256(BP) - MOVQ ctx+16(FP), R15 - SUBQ R14, 128(R15) - JS error_not_enough_literals - CMPQ R13, $0x00020002 - JA sequenceDecs_decode_bmi2_error_match_len_too_big - TESTQ CX, CX - JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok - TESTQ R13, R13 - JNZ sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch - -sequenceDecs_decode_bmi2_match_len_ofs_ok: - ADDQ $0x18, R9 - MOVQ ctx+16(FP), CX - DECQ 96(CX) - JNS sequenceDecs_decode_bmi2_main_loop - MOVQ s+0(FP), CX - MOVQ R10, 144(CX) - MOVQ R11, 152(CX) - MOVQ R12, 160(CX) - MOVQ br+8(FP), CX - MOVQ AX, 24(CX) - MOVB DL, 32(CX) - MOVQ BX, 8(CX) - - // Return success - MOVQ $0x00000000, ret+24(FP) - RET - - // Return with match length error -sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch: - MOVQ $0x00000001, ret+24(FP) - RET - - // Return with match too long error -sequenceDecs_decode_bmi2_error_match_len_too_big: - MOVQ $0x00000002, ret+24(FP) - RET - - // Return with match offset too long error - MOVQ $0x00000003, ret+24(FP) - RET - - // Return with not enough literals error -error_not_enough_literals: - MOVQ $0x00000004, ret+24(FP) - RET - - // Return with overread error -error_overread: - MOVQ $0x00000006, ret+24(FP) - RET - -// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int -// Requires: BMI, BMI2, CMOV -TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32 - MOVQ br+8(FP), BX - MOVQ 24(BX), AX - MOVBQZX 32(BX), DX - MOVQ (BX), CX - MOVQ 8(BX), BX - ADDQ BX, CX - MOVQ CX, (SP) - MOVQ ctx+16(FP), CX - MOVQ 72(CX), SI - MOVQ 80(CX), DI - MOVQ 88(CX), R8 - MOVQ 104(CX), R9 - MOVQ s+0(FP), CX - MOVQ 144(CX), R10 - MOVQ 152(CX), R11 - MOVQ 160(CX), R12 - -sequenceDecs_decode_56_bmi2_main_loop: - MOVQ (SP), R13 - - // Fill bitreader to have enough for the offset and match length. - CMPQ BX, $0x08 - JL sequenceDecs_decode_56_bmi2_fill_byte_by_byte - MOVQ DX, CX - SHRQ $0x03, CX - SUBQ CX, R13 - MOVQ (R13), AX - SUBQ CX, BX - ANDQ $0x07, DX - JMP sequenceDecs_decode_56_bmi2_fill_end - -sequenceDecs_decode_56_bmi2_fill_byte_by_byte: - CMPQ BX, $0x00 - JLE sequenceDecs_decode_56_bmi2_fill_check_overread - CMPQ DX, $0x07 - JLE sequenceDecs_decode_56_bmi2_fill_end - SHLQ $0x08, AX - SUBQ $0x01, R13 - SUBQ $0x01, BX - SUBQ $0x08, DX - MOVBQZX (R13), CX - ORQ CX, AX - JMP sequenceDecs_decode_56_bmi2_fill_byte_by_byte - -sequenceDecs_decode_56_bmi2_fill_check_overread: - CMPQ DX, $0x40 - JA error_overread - -sequenceDecs_decode_56_bmi2_fill_end: - // Update offset - MOVQ $0x00000808, CX - BEXTRQ CX, R8, R14 - MOVQ AX, R15 - LEAQ (DX)(R14*1), CX - ROLQ CL, R15 - BZHIQ R14, R15, R15 - MOVQ CX, DX - MOVQ R8, CX - SHRQ $0x20, CX - ADDQ R15, CX - MOVQ CX, 16(R9) - - // Update match length - MOVQ $0x00000808, CX - BEXTRQ CX, DI, R14 - MOVQ AX, R15 - LEAQ (DX)(R14*1), CX - ROLQ CL, R15 - BZHIQ R14, R15, R15 - MOVQ CX, DX - MOVQ DI, CX - SHRQ $0x20, CX - ADDQ R15, CX - MOVQ CX, 8(R9) - - // Update literal length - MOVQ $0x00000808, CX - BEXTRQ CX, SI, R14 - MOVQ AX, R15 - LEAQ (DX)(R14*1), CX - ROLQ CL, R15 - BZHIQ R14, R15, R15 - MOVQ CX, DX - MOVQ SI, CX - SHRQ $0x20, CX - ADDQ R15, CX - MOVQ CX, (R9) - - // Fill bitreader for state updates - MOVQ R13, (SP) - MOVQ $0x00000808, CX - BEXTRQ CX, R8, R13 - MOVQ ctx+16(FP), CX - CMPQ 96(CX), $0x00 - JZ sequenceDecs_decode_56_bmi2_skip_update - LEAQ (SI)(DI*1), R14 - ADDQ R8, R14 - MOVBQZX R14, R14 - LEAQ (DX)(R14*1), CX - MOVQ AX, R15 - MOVQ CX, DX - ROLQ CL, R15 - BZHIQ R14, R15, R15 - - // Update Offset State - BZHIQ R8, R15, CX - SHRXQ R8, R15, R15 - SHRL $0x10, R8 - ADDQ CX, R8 - - // Load ctx.ofTable - MOVQ ctx+16(FP), CX - MOVQ 48(CX), CX - MOVQ (CX)(R8*8), R8 - - // Update Match Length State - BZHIQ DI, R15, CX - SHRXQ DI, R15, R15 - SHRL $0x10, DI - ADDQ CX, DI - - // Load ctx.mlTable - MOVQ ctx+16(FP), CX - MOVQ 24(CX), CX - MOVQ (CX)(DI*8), DI - - // Update Literal Length State - BZHIQ SI, R15, CX - SHRL $0x10, SI - ADDQ CX, SI - - // Load ctx.llTable - MOVQ ctx+16(FP), CX - MOVQ (CX), CX - MOVQ (CX)(SI*8), SI - -sequenceDecs_decode_56_bmi2_skip_update: - // Adjust offset - MOVQ 16(R9), CX - CMPQ R13, $0x01 - JBE sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0 - MOVQ R11, R12 - MOVQ R10, R11 - MOVQ CX, R10 - JMP sequenceDecs_decode_56_bmi2_after_adjust - -sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0: - CMPQ (R9), $0x00000000 - JNE sequenceDecs_decode_56_bmi2_adjust_offset_maybezero - INCQ CX - JMP sequenceDecs_decode_56_bmi2_adjust_offset_nonzero - -sequenceDecs_decode_56_bmi2_adjust_offset_maybezero: - TESTQ CX, CX - JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero - MOVQ R10, CX - JMP sequenceDecs_decode_56_bmi2_after_adjust - -sequenceDecs_decode_56_bmi2_adjust_offset_nonzero: - CMPQ CX, $0x01 - JB sequenceDecs_decode_56_bmi2_adjust_zero - JEQ sequenceDecs_decode_56_bmi2_adjust_one - CMPQ CX, $0x02 - JA sequenceDecs_decode_56_bmi2_adjust_three - JMP sequenceDecs_decode_56_bmi2_adjust_two - -sequenceDecs_decode_56_bmi2_adjust_zero: - MOVQ R10, R13 - JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid - -sequenceDecs_decode_56_bmi2_adjust_one: - MOVQ R11, R13 - JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid - -sequenceDecs_decode_56_bmi2_adjust_two: - MOVQ R12, R13 - JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid - -sequenceDecs_decode_56_bmi2_adjust_three: - LEAQ -1(R10), R13 - -sequenceDecs_decode_56_bmi2_adjust_test_temp_valid: - TESTQ R13, R13 - JNZ sequenceDecs_decode_56_bmi2_adjust_temp_valid - MOVQ $0x00000001, R13 - -sequenceDecs_decode_56_bmi2_adjust_temp_valid: - CMPQ CX, $0x01 - CMOVQNE R11, R12 - MOVQ R10, R11 - MOVQ R13, R10 - MOVQ R13, CX - -sequenceDecs_decode_56_bmi2_after_adjust: - MOVQ CX, 16(R9) - - // Check values - MOVQ 8(R9), R13 - MOVQ (R9), R14 - LEAQ (R13)(R14*1), R15 - MOVQ s+0(FP), BP - ADDQ R15, 256(BP) - MOVQ ctx+16(FP), R15 - SUBQ R14, 128(R15) - JS error_not_enough_literals - CMPQ R13, $0x00020002 - JA sequenceDecs_decode_56_bmi2_error_match_len_too_big - TESTQ CX, CX - JNZ sequenceDecs_decode_56_bmi2_match_len_ofs_ok - TESTQ R13, R13 - JNZ sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch - -sequenceDecs_decode_56_bmi2_match_len_ofs_ok: - ADDQ $0x18, R9 - MOVQ ctx+16(FP), CX - DECQ 96(CX) - JNS sequenceDecs_decode_56_bmi2_main_loop - MOVQ s+0(FP), CX - MOVQ R10, 144(CX) - MOVQ R11, 152(CX) - MOVQ R12, 160(CX) - MOVQ br+8(FP), CX - MOVQ AX, 24(CX) - MOVB DL, 32(CX) - MOVQ BX, 8(CX) - - // Return success - MOVQ $0x00000000, ret+24(FP) - RET - - // Return with match length error -sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch: - MOVQ $0x00000001, ret+24(FP) - RET - - // Return with match too long error -sequenceDecs_decode_56_bmi2_error_match_len_too_big: - MOVQ $0x00000002, ret+24(FP) - RET - - // Return with match offset too long error - MOVQ $0x00000003, ret+24(FP) - RET - - // Return with not enough literals error -error_not_enough_literals: - MOVQ $0x00000004, ret+24(FP) - RET - - // Return with overread error -error_overread: - MOVQ $0x00000006, ret+24(FP) - RET - -// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool -// Requires: SSE -TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9 - MOVQ ctx+0(FP), R10 - MOVQ 8(R10), CX - TESTQ CX, CX - JZ empty_seqs - MOVQ (R10), AX - MOVQ 24(R10), DX - MOVQ 32(R10), BX - MOVQ 80(R10), SI - MOVQ 104(R10), DI - MOVQ 120(R10), R8 - MOVQ 56(R10), R9 - MOVQ 64(R10), R10 - ADDQ R10, R9 - - // seqsBase += 24 * seqIndex - LEAQ (DX)(DX*2), R11 - SHLQ $0x03, R11 - ADDQ R11, AX - - // outBase += outPosition - ADDQ DI, BX - -main_loop: - MOVQ (AX), R11 - MOVQ 16(AX), R12 - MOVQ 8(AX), R13 - - // Copy literals - TESTQ R11, R11 - JZ check_offset - XORQ R14, R14 - -copy_1: - MOVUPS (SI)(R14*1), X0 - MOVUPS X0, (BX)(R14*1) - ADDQ $0x10, R14 - CMPQ R14, R11 - JB copy_1 - ADDQ R11, SI - ADDQ R11, BX - ADDQ R11, DI - - // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) -check_offset: - LEAQ (DI)(R10*1), R11 - CMPQ R12, R11 - JG error_match_off_too_big - CMPQ R12, R8 - JG error_match_off_too_big - - // Copy match from history - MOVQ R12, R11 - SUBQ DI, R11 - JLS copy_match - MOVQ R9, R14 - SUBQ R11, R14 - CMPQ R13, R11 - JG copy_all_from_history - MOVQ R13, R11 - SUBQ $0x10, R11 - JB copy_4_small - -copy_4_loop: - MOVUPS (R14), X0 - MOVUPS X0, (BX) - ADDQ $0x10, R14 - ADDQ $0x10, BX - SUBQ $0x10, R11 - JAE copy_4_loop - LEAQ 16(R14)(R11*1), R14 - LEAQ 16(BX)(R11*1), BX - MOVUPS -16(R14), X0 - MOVUPS X0, -16(BX) - JMP copy_4_end - -copy_4_small: - CMPQ R13, $0x03 - JE copy_4_move_3 - CMPQ R13, $0x08 - JB copy_4_move_4through7 - JMP copy_4_move_8through16 - -copy_4_move_3: - MOVW (R14), R11 - MOVB 2(R14), R12 - MOVW R11, (BX) - MOVB R12, 2(BX) - ADDQ R13, R14 - ADDQ R13, BX - JMP copy_4_end - -copy_4_move_4through7: - MOVL (R14), R11 - MOVL -4(R14)(R13*1), R12 - MOVL R11, (BX) - MOVL R12, -4(BX)(R13*1) - ADDQ R13, R14 - ADDQ R13, BX - JMP copy_4_end - -copy_4_move_8through16: - MOVQ (R14), R11 - MOVQ -8(R14)(R13*1), R12 - MOVQ R11, (BX) - MOVQ R12, -8(BX)(R13*1) - ADDQ R13, R14 - ADDQ R13, BX - -copy_4_end: - ADDQ R13, DI - ADDQ $0x18, AX - INCQ DX - CMPQ DX, CX - JB main_loop - JMP loop_finished - -copy_all_from_history: - MOVQ R11, R15 - SUBQ $0x10, R15 - JB copy_5_small - -copy_5_loop: - MOVUPS (R14), X0 - MOVUPS X0, (BX) - ADDQ $0x10, R14 - ADDQ $0x10, BX - SUBQ $0x10, R15 - JAE copy_5_loop - LEAQ 16(R14)(R15*1), R14 - LEAQ 16(BX)(R15*1), BX - MOVUPS -16(R14), X0 - MOVUPS X0, -16(BX) - JMP copy_5_end - -copy_5_small: - CMPQ R11, $0x03 - JE copy_5_move_3 - JB copy_5_move_1or2 - CMPQ R11, $0x08 - JB copy_5_move_4through7 - JMP copy_5_move_8through16 - -copy_5_move_1or2: - MOVB (R14), R15 - MOVB -1(R14)(R11*1), BP - MOVB R15, (BX) - MOVB BP, -1(BX)(R11*1) - ADDQ R11, R14 - ADDQ R11, BX - JMP copy_5_end - -copy_5_move_3: - MOVW (R14), R15 - MOVB 2(R14), BP - MOVW R15, (BX) - MOVB BP, 2(BX) - ADDQ R11, R14 - ADDQ R11, BX - JMP copy_5_end - -copy_5_move_4through7: - MOVL (R14), R15 - MOVL -4(R14)(R11*1), BP - MOVL R15, (BX) - MOVL BP, -4(BX)(R11*1) - ADDQ R11, R14 - ADDQ R11, BX - JMP copy_5_end - -copy_5_move_8through16: - MOVQ (R14), R15 - MOVQ -8(R14)(R11*1), BP - MOVQ R15, (BX) - MOVQ BP, -8(BX)(R11*1) - ADDQ R11, R14 - ADDQ R11, BX - -copy_5_end: - ADDQ R11, DI - SUBQ R11, R13 - - // Copy match from the current buffer -copy_match: - MOVQ BX, R11 - SUBQ R12, R11 - - // ml <= mo - CMPQ R13, R12 - JA copy_overlapping_match - - // Copy non-overlapping match - ADDQ R13, DI - MOVQ BX, R12 - ADDQ R13, BX - -copy_2: - MOVUPS (R11), X0 - MOVUPS X0, (R12) - ADDQ $0x10, R11 - ADDQ $0x10, R12 - SUBQ $0x10, R13 - JHI copy_2 - JMP handle_loop - - // Copy overlapping match -copy_overlapping_match: - ADDQ R13, DI - -copy_slow_3: - MOVB (R11), R12 - MOVB R12, (BX) - INCQ R11 - INCQ BX - DECQ R13 - JNZ copy_slow_3 - -handle_loop: - ADDQ $0x18, AX - INCQ DX - CMPQ DX, CX - JB main_loop - -loop_finished: - // Return value - MOVB $0x01, ret+8(FP) - - // Update the context - MOVQ ctx+0(FP), AX - MOVQ DX, 24(AX) - MOVQ DI, 104(AX) - SUBQ 80(AX), SI - MOVQ SI, 112(AX) - RET - -error_match_off_too_big: - // Return value - MOVB $0x00, ret+8(FP) - - // Update the context - MOVQ ctx+0(FP), AX - MOVQ DX, 24(AX) - MOVQ DI, 104(AX) - SUBQ 80(AX), SI - MOVQ SI, 112(AX) - RET - -empty_seqs: - // Return value - MOVB $0x01, ret+8(FP) - RET - -// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool -// Requires: SSE -TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9 - MOVQ ctx+0(FP), R10 - MOVQ 8(R10), CX - TESTQ CX, CX - JZ empty_seqs - MOVQ (R10), AX - MOVQ 24(R10), DX - MOVQ 32(R10), BX - MOVQ 80(R10), SI - MOVQ 104(R10), DI - MOVQ 120(R10), R8 - MOVQ 56(R10), R9 - MOVQ 64(R10), R10 - ADDQ R10, R9 - - // seqsBase += 24 * seqIndex - LEAQ (DX)(DX*2), R11 - SHLQ $0x03, R11 - ADDQ R11, AX - - // outBase += outPosition - ADDQ DI, BX - -main_loop: - MOVQ (AX), R11 - MOVQ 16(AX), R12 - MOVQ 8(AX), R13 - - // Copy literals - TESTQ R11, R11 - JZ check_offset - MOVQ R11, R14 - SUBQ $0x10, R14 - JB copy_1_small - -copy_1_loop: - MOVUPS (SI), X0 - MOVUPS X0, (BX) - ADDQ $0x10, SI - ADDQ $0x10, BX - SUBQ $0x10, R14 - JAE copy_1_loop - LEAQ 16(SI)(R14*1), SI - LEAQ 16(BX)(R14*1), BX - MOVUPS -16(SI), X0 - MOVUPS X0, -16(BX) - JMP copy_1_end - -copy_1_small: - CMPQ R11, $0x03 - JE copy_1_move_3 - JB copy_1_move_1or2 - CMPQ R11, $0x08 - JB copy_1_move_4through7 - JMP copy_1_move_8through16 - -copy_1_move_1or2: - MOVB (SI), R14 - MOVB -1(SI)(R11*1), R15 - MOVB R14, (BX) - MOVB R15, -1(BX)(R11*1) - ADDQ R11, SI - ADDQ R11, BX - JMP copy_1_end - -copy_1_move_3: - MOVW (SI), R14 - MOVB 2(SI), R15 - MOVW R14, (BX) - MOVB R15, 2(BX) - ADDQ R11, SI - ADDQ R11, BX - JMP copy_1_end - -copy_1_move_4through7: - MOVL (SI), R14 - MOVL -4(SI)(R11*1), R15 - MOVL R14, (BX) - MOVL R15, -4(BX)(R11*1) - ADDQ R11, SI - ADDQ R11, BX - JMP copy_1_end - -copy_1_move_8through16: - MOVQ (SI), R14 - MOVQ -8(SI)(R11*1), R15 - MOVQ R14, (BX) - MOVQ R15, -8(BX)(R11*1) - ADDQ R11, SI - ADDQ R11, BX - -copy_1_end: - ADDQ R11, DI - - // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) -check_offset: - LEAQ (DI)(R10*1), R11 - CMPQ R12, R11 - JG error_match_off_too_big - CMPQ R12, R8 - JG error_match_off_too_big - - // Copy match from history - MOVQ R12, R11 - SUBQ DI, R11 - JLS copy_match - MOVQ R9, R14 - SUBQ R11, R14 - CMPQ R13, R11 - JG copy_all_from_history - MOVQ R13, R11 - SUBQ $0x10, R11 - JB copy_4_small - -copy_4_loop: - MOVUPS (R14), X0 - MOVUPS X0, (BX) - ADDQ $0x10, R14 - ADDQ $0x10, BX - SUBQ $0x10, R11 - JAE copy_4_loop - LEAQ 16(R14)(R11*1), R14 - LEAQ 16(BX)(R11*1), BX - MOVUPS -16(R14), X0 - MOVUPS X0, -16(BX) - JMP copy_4_end - -copy_4_small: - CMPQ R13, $0x03 - JE copy_4_move_3 - CMPQ R13, $0x08 - JB copy_4_move_4through7 - JMP copy_4_move_8through16 - -copy_4_move_3: - MOVW (R14), R11 - MOVB 2(R14), R12 - MOVW R11, (BX) - MOVB R12, 2(BX) - ADDQ R13, R14 - ADDQ R13, BX - JMP copy_4_end - -copy_4_move_4through7: - MOVL (R14), R11 - MOVL -4(R14)(R13*1), R12 - MOVL R11, (BX) - MOVL R12, -4(BX)(R13*1) - ADDQ R13, R14 - ADDQ R13, BX - JMP copy_4_end - -copy_4_move_8through16: - MOVQ (R14), R11 - MOVQ -8(R14)(R13*1), R12 - MOVQ R11, (BX) - MOVQ R12, -8(BX)(R13*1) - ADDQ R13, R14 - ADDQ R13, BX - -copy_4_end: - ADDQ R13, DI - ADDQ $0x18, AX - INCQ DX - CMPQ DX, CX - JB main_loop - JMP loop_finished - -copy_all_from_history: - MOVQ R11, R15 - SUBQ $0x10, R15 - JB copy_5_small - -copy_5_loop: - MOVUPS (R14), X0 - MOVUPS X0, (BX) - ADDQ $0x10, R14 - ADDQ $0x10, BX - SUBQ $0x10, R15 - JAE copy_5_loop - LEAQ 16(R14)(R15*1), R14 - LEAQ 16(BX)(R15*1), BX - MOVUPS -16(R14), X0 - MOVUPS X0, -16(BX) - JMP copy_5_end - -copy_5_small: - CMPQ R11, $0x03 - JE copy_5_move_3 - JB copy_5_move_1or2 - CMPQ R11, $0x08 - JB copy_5_move_4through7 - JMP copy_5_move_8through16 - -copy_5_move_1or2: - MOVB (R14), R15 - MOVB -1(R14)(R11*1), BP - MOVB R15, (BX) - MOVB BP, -1(BX)(R11*1) - ADDQ R11, R14 - ADDQ R11, BX - JMP copy_5_end - -copy_5_move_3: - MOVW (R14), R15 - MOVB 2(R14), BP - MOVW R15, (BX) - MOVB BP, 2(BX) - ADDQ R11, R14 - ADDQ R11, BX - JMP copy_5_end - -copy_5_move_4through7: - MOVL (R14), R15 - MOVL -4(R14)(R11*1), BP - MOVL R15, (BX) - MOVL BP, -4(BX)(R11*1) - ADDQ R11, R14 - ADDQ R11, BX - JMP copy_5_end - -copy_5_move_8through16: - MOVQ (R14), R15 - MOVQ -8(R14)(R11*1), BP - MOVQ R15, (BX) - MOVQ BP, -8(BX)(R11*1) - ADDQ R11, R14 - ADDQ R11, BX - -copy_5_end: - ADDQ R11, DI - SUBQ R11, R13 - - // Copy match from the current buffer -copy_match: - MOVQ BX, R11 - SUBQ R12, R11 - - // ml <= mo - CMPQ R13, R12 - JA copy_overlapping_match - - // Copy non-overlapping match - ADDQ R13, DI - MOVQ R13, R12 - SUBQ $0x10, R12 - JB copy_2_small - -copy_2_loop: - MOVUPS (R11), X0 - MOVUPS X0, (BX) - ADDQ $0x10, R11 - ADDQ $0x10, BX - SUBQ $0x10, R12 - JAE copy_2_loop - LEAQ 16(R11)(R12*1), R11 - LEAQ 16(BX)(R12*1), BX - MOVUPS -16(R11), X0 - MOVUPS X0, -16(BX) - JMP copy_2_end - -copy_2_small: - CMPQ R13, $0x03 - JE copy_2_move_3 - JB copy_2_move_1or2 - CMPQ R13, $0x08 - JB copy_2_move_4through7 - JMP copy_2_move_8through16 - -copy_2_move_1or2: - MOVB (R11), R12 - MOVB -1(R11)(R13*1), R14 - MOVB R12, (BX) - MOVB R14, -1(BX)(R13*1) - ADDQ R13, R11 - ADDQ R13, BX - JMP copy_2_end - -copy_2_move_3: - MOVW (R11), R12 - MOVB 2(R11), R14 - MOVW R12, (BX) - MOVB R14, 2(BX) - ADDQ R13, R11 - ADDQ R13, BX - JMP copy_2_end - -copy_2_move_4through7: - MOVL (R11), R12 - MOVL -4(R11)(R13*1), R14 - MOVL R12, (BX) - MOVL R14, -4(BX)(R13*1) - ADDQ R13, R11 - ADDQ R13, BX - JMP copy_2_end - -copy_2_move_8through16: - MOVQ (R11), R12 - MOVQ -8(R11)(R13*1), R14 - MOVQ R12, (BX) - MOVQ R14, -8(BX)(R13*1) - ADDQ R13, R11 - ADDQ R13, BX - -copy_2_end: - JMP handle_loop - - // Copy overlapping match -copy_overlapping_match: - ADDQ R13, DI - -copy_slow_3: - MOVB (R11), R12 - MOVB R12, (BX) - INCQ R11 - INCQ BX - DECQ R13 - JNZ copy_slow_3 - -handle_loop: - ADDQ $0x18, AX - INCQ DX - CMPQ DX, CX - JB main_loop - -loop_finished: - // Return value - MOVB $0x01, ret+8(FP) - - // Update the context - MOVQ ctx+0(FP), AX - MOVQ DX, 24(AX) - MOVQ DI, 104(AX) - SUBQ 80(AX), SI - MOVQ SI, 112(AX) - RET - -error_match_off_too_big: - // Return value - MOVB $0x00, ret+8(FP) - - // Update the context - MOVQ ctx+0(FP), AX - MOVQ DX, 24(AX) - MOVQ DI, 104(AX) - SUBQ 80(AX), SI - MOVQ SI, 112(AX) - RET - -empty_seqs: - // Return value - MOVB $0x01, ret+8(FP) - RET - -// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int -// Requires: CMOV, SSE -TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32 - MOVQ br+8(FP), CX - MOVQ 24(CX), DX - MOVBQZX 32(CX), BX - MOVQ (CX), AX - MOVQ 8(CX), SI - ADDQ SI, AX - MOVQ AX, (SP) - MOVQ ctx+16(FP), AX - MOVQ 72(AX), DI - MOVQ 80(AX), R8 - MOVQ 88(AX), R9 - XORQ CX, CX - MOVQ CX, 8(SP) - MOVQ CX, 16(SP) - MOVQ CX, 24(SP) - MOVQ 112(AX), R10 - MOVQ 128(AX), CX - MOVQ CX, 32(SP) - MOVQ 144(AX), R11 - MOVQ 136(AX), R12 - MOVQ 200(AX), CX - MOVQ CX, 56(SP) - MOVQ 176(AX), CX - MOVQ CX, 48(SP) - MOVQ 184(AX), AX - MOVQ AX, 40(SP) - MOVQ 40(SP), AX - ADDQ AX, 48(SP) - - // Calculate pointer to s.out[cap(s.out)] (a past-end pointer) - ADDQ R10, 32(SP) - - // outBase += outPosition - ADDQ R12, R10 - -sequenceDecs_decodeSync_amd64_main_loop: - MOVQ (SP), R13 - - // Fill bitreader to have enough for the offset and match length. - CMPQ SI, $0x08 - JL sequenceDecs_decodeSync_amd64_fill_byte_by_byte - MOVQ BX, AX - SHRQ $0x03, AX - SUBQ AX, R13 - MOVQ (R13), DX - SUBQ AX, SI - ANDQ $0x07, BX - JMP sequenceDecs_decodeSync_amd64_fill_end - -sequenceDecs_decodeSync_amd64_fill_byte_by_byte: - CMPQ SI, $0x00 - JLE sequenceDecs_decodeSync_amd64_fill_check_overread - CMPQ BX, $0x07 - JLE sequenceDecs_decodeSync_amd64_fill_end - SHLQ $0x08, DX - SUBQ $0x01, R13 - SUBQ $0x01, SI - SUBQ $0x08, BX - MOVBQZX (R13), AX - ORQ AX, DX - JMP sequenceDecs_decodeSync_amd64_fill_byte_by_byte - -sequenceDecs_decodeSync_amd64_fill_check_overread: - CMPQ BX, $0x40 - JA error_overread - -sequenceDecs_decodeSync_amd64_fill_end: - // Update offset - MOVQ R9, AX - MOVQ BX, CX - MOVQ DX, R14 - SHLQ CL, R14 - MOVB AH, CL - SHRQ $0x20, AX - TESTQ CX, CX - JZ sequenceDecs_decodeSync_amd64_of_update_zero - ADDQ CX, BX - CMPQ BX, $0x40 - JA sequenceDecs_decodeSync_amd64_of_update_zero - CMPQ CX, $0x40 - JAE sequenceDecs_decodeSync_amd64_of_update_zero - NEGQ CX - SHRQ CL, R14 - ADDQ R14, AX - -sequenceDecs_decodeSync_amd64_of_update_zero: - MOVQ AX, 8(SP) - - // Update match length - MOVQ R8, AX - MOVQ BX, CX - MOVQ DX, R14 - SHLQ CL, R14 - MOVB AH, CL - SHRQ $0x20, AX - TESTQ CX, CX - JZ sequenceDecs_decodeSync_amd64_ml_update_zero - ADDQ CX, BX - CMPQ BX, $0x40 - JA sequenceDecs_decodeSync_amd64_ml_update_zero - CMPQ CX, $0x40 - JAE sequenceDecs_decodeSync_amd64_ml_update_zero - NEGQ CX - SHRQ CL, R14 - ADDQ R14, AX - -sequenceDecs_decodeSync_amd64_ml_update_zero: - MOVQ AX, 16(SP) - - // Fill bitreader to have enough for the remaining - CMPQ SI, $0x08 - JL sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte - MOVQ BX, AX - SHRQ $0x03, AX - SUBQ AX, R13 - MOVQ (R13), DX - SUBQ AX, SI - ANDQ $0x07, BX - JMP sequenceDecs_decodeSync_amd64_fill_2_end - -sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte: - CMPQ SI, $0x00 - JLE sequenceDecs_decodeSync_amd64_fill_2_check_overread - CMPQ BX, $0x07 - JLE sequenceDecs_decodeSync_amd64_fill_2_end - SHLQ $0x08, DX - SUBQ $0x01, R13 - SUBQ $0x01, SI - SUBQ $0x08, BX - MOVBQZX (R13), AX - ORQ AX, DX - JMP sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte - -sequenceDecs_decodeSync_amd64_fill_2_check_overread: - CMPQ BX, $0x40 - JA error_overread - -sequenceDecs_decodeSync_amd64_fill_2_end: - // Update literal length - MOVQ DI, AX - MOVQ BX, CX - MOVQ DX, R14 - SHLQ CL, R14 - MOVB AH, CL - SHRQ $0x20, AX - TESTQ CX, CX - JZ sequenceDecs_decodeSync_amd64_ll_update_zero - ADDQ CX, BX - CMPQ BX, $0x40 - JA sequenceDecs_decodeSync_amd64_ll_update_zero - CMPQ CX, $0x40 - JAE sequenceDecs_decodeSync_amd64_ll_update_zero - NEGQ CX - SHRQ CL, R14 - ADDQ R14, AX - -sequenceDecs_decodeSync_amd64_ll_update_zero: - MOVQ AX, 24(SP) - - // Fill bitreader for state updates - MOVQ R13, (SP) - MOVQ R9, AX - SHRQ $0x08, AX - MOVBQZX AL, AX - MOVQ ctx+16(FP), CX - CMPQ 96(CX), $0x00 - JZ sequenceDecs_decodeSync_amd64_skip_update - - // Update Literal Length State - MOVBQZX DI, R13 - SHRL $0x10, DI - LEAQ (BX)(R13*1), CX - MOVQ DX, R14 - MOVQ CX, BX - ROLQ CL, R14 - MOVL $0x00000001, R15 - MOVB R13, CL - SHLL CL, R15 - DECL R15 - ANDQ R15, R14 - ADDQ R14, DI - - // Load ctx.llTable - MOVQ ctx+16(FP), CX - MOVQ (CX), CX - MOVQ (CX)(DI*8), DI - - // Update Match Length State - MOVBQZX R8, R13 - SHRL $0x10, R8 - LEAQ (BX)(R13*1), CX - MOVQ DX, R14 - MOVQ CX, BX - ROLQ CL, R14 - MOVL $0x00000001, R15 - MOVB R13, CL - SHLL CL, R15 - DECL R15 - ANDQ R15, R14 - ADDQ R14, R8 - - // Load ctx.mlTable - MOVQ ctx+16(FP), CX - MOVQ 24(CX), CX - MOVQ (CX)(R8*8), R8 - - // Update Offset State - MOVBQZX R9, R13 - SHRL $0x10, R9 - LEAQ (BX)(R13*1), CX - MOVQ DX, R14 - MOVQ CX, BX - ROLQ CL, R14 - MOVL $0x00000001, R15 - MOVB R13, CL - SHLL CL, R15 - DECL R15 - ANDQ R15, R14 - ADDQ R14, R9 - - // Load ctx.ofTable - MOVQ ctx+16(FP), CX - MOVQ 48(CX), CX - MOVQ (CX)(R9*8), R9 - -sequenceDecs_decodeSync_amd64_skip_update: - // Adjust offset - MOVQ s+0(FP), CX - MOVQ 8(SP), R13 - CMPQ AX, $0x01 - JBE sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0 - MOVUPS 144(CX), X0 - MOVQ R13, 144(CX) - MOVUPS X0, 152(CX) - JMP sequenceDecs_decodeSync_amd64_after_adjust - -sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0: - CMPQ 24(SP), $0x00000000 - JNE sequenceDecs_decodeSync_amd64_adjust_offset_maybezero - INCQ R13 - JMP sequenceDecs_decodeSync_amd64_adjust_offset_nonzero - -sequenceDecs_decodeSync_amd64_adjust_offset_maybezero: - TESTQ R13, R13 - JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero - MOVQ 144(CX), R13 - JMP sequenceDecs_decodeSync_amd64_after_adjust - -sequenceDecs_decodeSync_amd64_adjust_offset_nonzero: - MOVQ R13, AX - XORQ R14, R14 - MOVQ $-1, R15 - CMPQ R13, $0x03 - CMOVQEQ R14, AX - CMOVQEQ R15, R14 - ADDQ 144(CX)(AX*8), R14 - JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid - MOVQ $0x00000001, R14 - -sequenceDecs_decodeSync_amd64_adjust_temp_valid: - CMPQ R13, $0x01 - JZ sequenceDecs_decodeSync_amd64_adjust_skip - MOVQ 152(CX), AX - MOVQ AX, 160(CX) - -sequenceDecs_decodeSync_amd64_adjust_skip: - MOVQ 144(CX), AX - MOVQ AX, 152(CX) - MOVQ R14, 144(CX) - MOVQ R14, R13 - -sequenceDecs_decodeSync_amd64_after_adjust: - MOVQ R13, 8(SP) - - // Check values - MOVQ 16(SP), AX - MOVQ 24(SP), CX - LEAQ (AX)(CX*1), R14 - MOVQ s+0(FP), R15 - ADDQ R14, 256(R15) - MOVQ ctx+16(FP), R14 - SUBQ CX, 104(R14) - JS error_not_enough_literals - CMPQ AX, $0x00020002 - JA sequenceDecs_decodeSync_amd64_error_match_len_too_big - TESTQ R13, R13 - JNZ sequenceDecs_decodeSync_amd64_match_len_ofs_ok - TESTQ AX, AX - JNZ sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch - -sequenceDecs_decodeSync_amd64_match_len_ofs_ok: - MOVQ 24(SP), AX - MOVQ 8(SP), CX - MOVQ 16(SP), R13 - - // Check if we have enough space in s.out - LEAQ (AX)(R13*1), R14 - ADDQ R10, R14 - CMPQ R14, 32(SP) - JA error_not_enough_space - - // Copy literals - TESTQ AX, AX - JZ check_offset - XORQ R14, R14 - -copy_1: - MOVUPS (R11)(R14*1), X0 - MOVUPS X0, (R10)(R14*1) - ADDQ $0x10, R14 - CMPQ R14, AX - JB copy_1 - ADDQ AX, R11 - ADDQ AX, R10 - ADDQ AX, R12 - - // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) -check_offset: - MOVQ R12, AX - ADDQ 40(SP), AX - CMPQ CX, AX - JG error_match_off_too_big - CMPQ CX, 56(SP) - JG error_match_off_too_big - - // Copy match from history - MOVQ CX, AX - SUBQ R12, AX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ AX, R14 - CMPQ R13, AX - JG copy_all_from_history - MOVQ R13, AX - SUBQ $0x10, AX - JB copy_4_small - -copy_4_loop: - MOVUPS (R14), X0 - MOVUPS X0, (R10) - ADDQ $0x10, R14 - ADDQ $0x10, R10 - SUBQ $0x10, AX - JAE copy_4_loop - LEAQ 16(R14)(AX*1), R14 - LEAQ 16(R10)(AX*1), R10 - MOVUPS -16(R14), X0 - MOVUPS X0, -16(R10) - JMP copy_4_end - -copy_4_small: - CMPQ R13, $0x03 - JE copy_4_move_3 - CMPQ R13, $0x08 - JB copy_4_move_4through7 - JMP copy_4_move_8through16 - -copy_4_move_3: - MOVW (R14), AX - MOVB 2(R14), CL - MOVW AX, (R10) - MOVB CL, 2(R10) - ADDQ R13, R14 - ADDQ R13, R10 - JMP copy_4_end - -copy_4_move_4through7: - MOVL (R14), AX - MOVL -4(R14)(R13*1), CX - MOVL AX, (R10) - MOVL CX, -4(R10)(R13*1) - ADDQ R13, R14 - ADDQ R13, R10 - JMP copy_4_end - -copy_4_move_8through16: - MOVQ (R14), AX - MOVQ -8(R14)(R13*1), CX - MOVQ AX, (R10) - MOVQ CX, -8(R10)(R13*1) - ADDQ R13, R14 - ADDQ R13, R10 - -copy_4_end: - ADDQ R13, R12 - JMP handle_loop - JMP loop_finished - -copy_all_from_history: - MOVQ AX, R15 - SUBQ $0x10, R15 - JB copy_5_small - -copy_5_loop: - MOVUPS (R14), X0 - MOVUPS X0, (R10) - ADDQ $0x10, R14 - ADDQ $0x10, R10 - SUBQ $0x10, R15 - JAE copy_5_loop - LEAQ 16(R14)(R15*1), R14 - LEAQ 16(R10)(R15*1), R10 - MOVUPS -16(R14), X0 - MOVUPS X0, -16(R10) - JMP copy_5_end - -copy_5_small: - CMPQ AX, $0x03 - JE copy_5_move_3 - JB copy_5_move_1or2 - CMPQ AX, $0x08 - JB copy_5_move_4through7 - JMP copy_5_move_8through16 - -copy_5_move_1or2: - MOVB (R14), R15 - MOVB -1(R14)(AX*1), BP - MOVB R15, (R10) - MOVB BP, -1(R10)(AX*1) - ADDQ AX, R14 - ADDQ AX, R10 - JMP copy_5_end - -copy_5_move_3: - MOVW (R14), R15 - MOVB 2(R14), BP - MOVW R15, (R10) - MOVB BP, 2(R10) - ADDQ AX, R14 - ADDQ AX, R10 - JMP copy_5_end - -copy_5_move_4through7: - MOVL (R14), R15 - MOVL -4(R14)(AX*1), BP - MOVL R15, (R10) - MOVL BP, -4(R10)(AX*1) - ADDQ AX, R14 - ADDQ AX, R10 - JMP copy_5_end - -copy_5_move_8through16: - MOVQ (R14), R15 - MOVQ -8(R14)(AX*1), BP - MOVQ R15, (R10) - MOVQ BP, -8(R10)(AX*1) - ADDQ AX, R14 - ADDQ AX, R10 - -copy_5_end: - ADDQ AX, R12 - SUBQ AX, R13 - - // Copy match from the current buffer -copy_match: - MOVQ R10, AX - SUBQ CX, AX - - // ml <= mo - CMPQ R13, CX - JA copy_overlapping_match - - // Copy non-overlapping match - ADDQ R13, R12 - MOVQ R10, CX - ADDQ R13, R10 - -copy_2: - MOVUPS (AX), X0 - MOVUPS X0, (CX) - ADDQ $0x10, AX - ADDQ $0x10, CX - SUBQ $0x10, R13 - JHI copy_2 - JMP handle_loop - - // Copy overlapping match -copy_overlapping_match: - ADDQ R13, R12 - -copy_slow_3: - MOVB (AX), CL - MOVB CL, (R10) - INCQ AX - INCQ R10 - DECQ R13 - JNZ copy_slow_3 - -handle_loop: - MOVQ ctx+16(FP), AX - DECQ 96(AX) - JNS sequenceDecs_decodeSync_amd64_main_loop - -loop_finished: - MOVQ br+8(FP), AX - MOVQ DX, 24(AX) - MOVB BL, 32(AX) - MOVQ SI, 8(AX) - - // Update the context - MOVQ ctx+16(FP), AX - MOVQ R12, 136(AX) - MOVQ 144(AX), CX - SUBQ CX, R11 - MOVQ R11, 168(AX) - - // Return success - MOVQ $0x00000000, ret+24(FP) - RET - - // Return with match length error -sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch: - MOVQ 16(SP), AX - MOVQ ctx+16(FP), CX - MOVQ AX, 216(CX) - MOVQ $0x00000001, ret+24(FP) - RET - - // Return with match too long error -sequenceDecs_decodeSync_amd64_error_match_len_too_big: - MOVQ ctx+16(FP), AX - MOVQ 16(SP), CX - MOVQ CX, 216(AX) - MOVQ $0x00000002, ret+24(FP) - RET - - // Return with match offset too long error -error_match_off_too_big: - MOVQ ctx+16(FP), AX - MOVQ 8(SP), CX - MOVQ CX, 224(AX) - MOVQ R12, 136(AX) - MOVQ $0x00000003, ret+24(FP) - RET - - // Return with not enough literals error -error_not_enough_literals: - MOVQ ctx+16(FP), AX - MOVQ 24(SP), CX - MOVQ CX, 208(AX) - MOVQ $0x00000004, ret+24(FP) - RET - - // Return with overread error -error_overread: - MOVQ $0x00000006, ret+24(FP) - RET - - // Return with not enough output space error -error_not_enough_space: - MOVQ ctx+16(FP), AX - MOVQ 24(SP), CX - MOVQ CX, 208(AX) - MOVQ 16(SP), CX - MOVQ CX, 216(AX) - MOVQ R12, 136(AX) - MOVQ $0x00000005, ret+24(FP) - RET - -// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int -// Requires: BMI, BMI2, CMOV, SSE -TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32 - MOVQ br+8(FP), BX - MOVQ 24(BX), AX - MOVBQZX 32(BX), DX - MOVQ (BX), CX - MOVQ 8(BX), BX - ADDQ BX, CX - MOVQ CX, (SP) - MOVQ ctx+16(FP), CX - MOVQ 72(CX), SI - MOVQ 80(CX), DI - MOVQ 88(CX), R8 - XORQ R9, R9 - MOVQ R9, 8(SP) - MOVQ R9, 16(SP) - MOVQ R9, 24(SP) - MOVQ 112(CX), R9 - MOVQ 128(CX), R10 - MOVQ R10, 32(SP) - MOVQ 144(CX), R10 - MOVQ 136(CX), R11 - MOVQ 200(CX), R12 - MOVQ R12, 56(SP) - MOVQ 176(CX), R12 - MOVQ R12, 48(SP) - MOVQ 184(CX), CX - MOVQ CX, 40(SP) - MOVQ 40(SP), CX - ADDQ CX, 48(SP) - - // Calculate pointer to s.out[cap(s.out)] (a past-end pointer) - ADDQ R9, 32(SP) - - // outBase += outPosition - ADDQ R11, R9 - -sequenceDecs_decodeSync_bmi2_main_loop: - MOVQ (SP), R12 - - // Fill bitreader to have enough for the offset and match length. - CMPQ BX, $0x08 - JL sequenceDecs_decodeSync_bmi2_fill_byte_by_byte - MOVQ DX, CX - SHRQ $0x03, CX - SUBQ CX, R12 - MOVQ (R12), AX - SUBQ CX, BX - ANDQ $0x07, DX - JMP sequenceDecs_decodeSync_bmi2_fill_end - -sequenceDecs_decodeSync_bmi2_fill_byte_by_byte: - CMPQ BX, $0x00 - JLE sequenceDecs_decodeSync_bmi2_fill_check_overread - CMPQ DX, $0x07 - JLE sequenceDecs_decodeSync_bmi2_fill_end - SHLQ $0x08, AX - SUBQ $0x01, R12 - SUBQ $0x01, BX - SUBQ $0x08, DX - MOVBQZX (R12), CX - ORQ CX, AX - JMP sequenceDecs_decodeSync_bmi2_fill_byte_by_byte - -sequenceDecs_decodeSync_bmi2_fill_check_overread: - CMPQ DX, $0x40 - JA error_overread - -sequenceDecs_decodeSync_bmi2_fill_end: - // Update offset - MOVQ $0x00000808, CX - BEXTRQ CX, R8, R13 - MOVQ AX, R14 - LEAQ (DX)(R13*1), CX - ROLQ CL, R14 - BZHIQ R13, R14, R14 - MOVQ CX, DX - MOVQ R8, CX - SHRQ $0x20, CX - ADDQ R14, CX - MOVQ CX, 8(SP) - - // Update match length - MOVQ $0x00000808, CX - BEXTRQ CX, DI, R13 - MOVQ AX, R14 - LEAQ (DX)(R13*1), CX - ROLQ CL, R14 - BZHIQ R13, R14, R14 - MOVQ CX, DX - MOVQ DI, CX - SHRQ $0x20, CX - ADDQ R14, CX - MOVQ CX, 16(SP) - - // Fill bitreader to have enough for the remaining - CMPQ BX, $0x08 - JL sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte - MOVQ DX, CX - SHRQ $0x03, CX - SUBQ CX, R12 - MOVQ (R12), AX - SUBQ CX, BX - ANDQ $0x07, DX - JMP sequenceDecs_decodeSync_bmi2_fill_2_end - -sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte: - CMPQ BX, $0x00 - JLE sequenceDecs_decodeSync_bmi2_fill_2_check_overread - CMPQ DX, $0x07 - JLE sequenceDecs_decodeSync_bmi2_fill_2_end - SHLQ $0x08, AX - SUBQ $0x01, R12 - SUBQ $0x01, BX - SUBQ $0x08, DX - MOVBQZX (R12), CX - ORQ CX, AX - JMP sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte - -sequenceDecs_decodeSync_bmi2_fill_2_check_overread: - CMPQ DX, $0x40 - JA error_overread - -sequenceDecs_decodeSync_bmi2_fill_2_end: - // Update literal length - MOVQ $0x00000808, CX - BEXTRQ CX, SI, R13 - MOVQ AX, R14 - LEAQ (DX)(R13*1), CX - ROLQ CL, R14 - BZHIQ R13, R14, R14 - MOVQ CX, DX - MOVQ SI, CX - SHRQ $0x20, CX - ADDQ R14, CX - MOVQ CX, 24(SP) - - // Fill bitreader for state updates - MOVQ R12, (SP) - MOVQ $0x00000808, CX - BEXTRQ CX, R8, R12 - MOVQ ctx+16(FP), CX - CMPQ 96(CX), $0x00 - JZ sequenceDecs_decodeSync_bmi2_skip_update - LEAQ (SI)(DI*1), R13 - ADDQ R8, R13 - MOVBQZX R13, R13 - LEAQ (DX)(R13*1), CX - MOVQ AX, R14 - MOVQ CX, DX - ROLQ CL, R14 - BZHIQ R13, R14, R14 - - // Update Offset State - BZHIQ R8, R14, CX - SHRXQ R8, R14, R14 - SHRL $0x10, R8 - ADDQ CX, R8 - - // Load ctx.ofTable - MOVQ ctx+16(FP), CX - MOVQ 48(CX), CX - MOVQ (CX)(R8*8), R8 - - // Update Match Length State - BZHIQ DI, R14, CX - SHRXQ DI, R14, R14 - SHRL $0x10, DI - ADDQ CX, DI - - // Load ctx.mlTable - MOVQ ctx+16(FP), CX - MOVQ 24(CX), CX - MOVQ (CX)(DI*8), DI - - // Update Literal Length State - BZHIQ SI, R14, CX - SHRL $0x10, SI - ADDQ CX, SI - - // Load ctx.llTable - MOVQ ctx+16(FP), CX - MOVQ (CX), CX - MOVQ (CX)(SI*8), SI - -sequenceDecs_decodeSync_bmi2_skip_update: - // Adjust offset - MOVQ s+0(FP), CX - MOVQ 8(SP), R13 - CMPQ R12, $0x01 - JBE sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0 - MOVUPS 144(CX), X0 - MOVQ R13, 144(CX) - MOVUPS X0, 152(CX) - JMP sequenceDecs_decodeSync_bmi2_after_adjust - -sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0: - CMPQ 24(SP), $0x00000000 - JNE sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero - INCQ R13 - JMP sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero - -sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero: - TESTQ R13, R13 - JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero - MOVQ 144(CX), R13 - JMP sequenceDecs_decodeSync_bmi2_after_adjust - -sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero: - MOVQ R13, R12 - XORQ R14, R14 - MOVQ $-1, R15 - CMPQ R13, $0x03 - CMOVQEQ R14, R12 - CMOVQEQ R15, R14 - ADDQ 144(CX)(R12*8), R14 - JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid - MOVQ $0x00000001, R14 - -sequenceDecs_decodeSync_bmi2_adjust_temp_valid: - CMPQ R13, $0x01 - JZ sequenceDecs_decodeSync_bmi2_adjust_skip - MOVQ 152(CX), R12 - MOVQ R12, 160(CX) - -sequenceDecs_decodeSync_bmi2_adjust_skip: - MOVQ 144(CX), R12 - MOVQ R12, 152(CX) - MOVQ R14, 144(CX) - MOVQ R14, R13 - -sequenceDecs_decodeSync_bmi2_after_adjust: - MOVQ R13, 8(SP) - - // Check values - MOVQ 16(SP), CX - MOVQ 24(SP), R12 - LEAQ (CX)(R12*1), R14 - MOVQ s+0(FP), R15 - ADDQ R14, 256(R15) - MOVQ ctx+16(FP), R14 - SUBQ R12, 104(R14) - JS error_not_enough_literals - CMPQ CX, $0x00020002 - JA sequenceDecs_decodeSync_bmi2_error_match_len_too_big - TESTQ R13, R13 - JNZ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok - TESTQ CX, CX - JNZ sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch - -sequenceDecs_decodeSync_bmi2_match_len_ofs_ok: - MOVQ 24(SP), CX - MOVQ 8(SP), R12 - MOVQ 16(SP), R13 - - // Check if we have enough space in s.out - LEAQ (CX)(R13*1), R14 - ADDQ R9, R14 - CMPQ R14, 32(SP) - JA error_not_enough_space - - // Copy literals - TESTQ CX, CX - JZ check_offset - XORQ R14, R14 - -copy_1: - MOVUPS (R10)(R14*1), X0 - MOVUPS X0, (R9)(R14*1) - ADDQ $0x10, R14 - CMPQ R14, CX - JB copy_1 - ADDQ CX, R10 - ADDQ CX, R9 - ADDQ CX, R11 - - // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) -check_offset: - MOVQ R11, CX - ADDQ 40(SP), CX - CMPQ R12, CX - JG error_match_off_too_big - CMPQ R12, 56(SP) - JG error_match_off_too_big - - // Copy match from history - MOVQ R12, CX - SUBQ R11, CX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ CX, R14 - CMPQ R13, CX - JG copy_all_from_history - MOVQ R13, CX - SUBQ $0x10, CX - JB copy_4_small - -copy_4_loop: - MOVUPS (R14), X0 - MOVUPS X0, (R9) - ADDQ $0x10, R14 - ADDQ $0x10, R9 - SUBQ $0x10, CX - JAE copy_4_loop - LEAQ 16(R14)(CX*1), R14 - LEAQ 16(R9)(CX*1), R9 - MOVUPS -16(R14), X0 - MOVUPS X0, -16(R9) - JMP copy_4_end - -copy_4_small: - CMPQ R13, $0x03 - JE copy_4_move_3 - CMPQ R13, $0x08 - JB copy_4_move_4through7 - JMP copy_4_move_8through16 - -copy_4_move_3: - MOVW (R14), CX - MOVB 2(R14), R12 - MOVW CX, (R9) - MOVB R12, 2(R9) - ADDQ R13, R14 - ADDQ R13, R9 - JMP copy_4_end - -copy_4_move_4through7: - MOVL (R14), CX - MOVL -4(R14)(R13*1), R12 - MOVL CX, (R9) - MOVL R12, -4(R9)(R13*1) - ADDQ R13, R14 - ADDQ R13, R9 - JMP copy_4_end - -copy_4_move_8through16: - MOVQ (R14), CX - MOVQ -8(R14)(R13*1), R12 - MOVQ CX, (R9) - MOVQ R12, -8(R9)(R13*1) - ADDQ R13, R14 - ADDQ R13, R9 - -copy_4_end: - ADDQ R13, R11 - JMP handle_loop - JMP loop_finished - -copy_all_from_history: - MOVQ CX, R15 - SUBQ $0x10, R15 - JB copy_5_small - -copy_5_loop: - MOVUPS (R14), X0 - MOVUPS X0, (R9) - ADDQ $0x10, R14 - ADDQ $0x10, R9 - SUBQ $0x10, R15 - JAE copy_5_loop - LEAQ 16(R14)(R15*1), R14 - LEAQ 16(R9)(R15*1), R9 - MOVUPS -16(R14), X0 - MOVUPS X0, -16(R9) - JMP copy_5_end - -copy_5_small: - CMPQ CX, $0x03 - JE copy_5_move_3 - JB copy_5_move_1or2 - CMPQ CX, $0x08 - JB copy_5_move_4through7 - JMP copy_5_move_8through16 - -copy_5_move_1or2: - MOVB (R14), R15 - MOVB -1(R14)(CX*1), BP - MOVB R15, (R9) - MOVB BP, -1(R9)(CX*1) - ADDQ CX, R14 - ADDQ CX, R9 - JMP copy_5_end - -copy_5_move_3: - MOVW (R14), R15 - MOVB 2(R14), BP - MOVW R15, (R9) - MOVB BP, 2(R9) - ADDQ CX, R14 - ADDQ CX, R9 - JMP copy_5_end - -copy_5_move_4through7: - MOVL (R14), R15 - MOVL -4(R14)(CX*1), BP - MOVL R15, (R9) - MOVL BP, -4(R9)(CX*1) - ADDQ CX, R14 - ADDQ CX, R9 - JMP copy_5_end - -copy_5_move_8through16: - MOVQ (R14), R15 - MOVQ -8(R14)(CX*1), BP - MOVQ R15, (R9) - MOVQ BP, -8(R9)(CX*1) - ADDQ CX, R14 - ADDQ CX, R9 - -copy_5_end: - ADDQ CX, R11 - SUBQ CX, R13 - - // Copy match from the current buffer -copy_match: - MOVQ R9, CX - SUBQ R12, CX - - // ml <= mo - CMPQ R13, R12 - JA copy_overlapping_match - - // Copy non-overlapping match - ADDQ R13, R11 - MOVQ R9, R12 - ADDQ R13, R9 - -copy_2: - MOVUPS (CX), X0 - MOVUPS X0, (R12) - ADDQ $0x10, CX - ADDQ $0x10, R12 - SUBQ $0x10, R13 - JHI copy_2 - JMP handle_loop - - // Copy overlapping match -copy_overlapping_match: - ADDQ R13, R11 - -copy_slow_3: - MOVB (CX), R12 - MOVB R12, (R9) - INCQ CX - INCQ R9 - DECQ R13 - JNZ copy_slow_3 - -handle_loop: - MOVQ ctx+16(FP), CX - DECQ 96(CX) - JNS sequenceDecs_decodeSync_bmi2_main_loop - -loop_finished: - MOVQ br+8(FP), CX - MOVQ AX, 24(CX) - MOVB DL, 32(CX) - MOVQ BX, 8(CX) - - // Update the context - MOVQ ctx+16(FP), AX - MOVQ R11, 136(AX) - MOVQ 144(AX), CX - SUBQ CX, R10 - MOVQ R10, 168(AX) - - // Return success - MOVQ $0x00000000, ret+24(FP) - RET - - // Return with match length error -sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch: - MOVQ 16(SP), AX - MOVQ ctx+16(FP), CX - MOVQ AX, 216(CX) - MOVQ $0x00000001, ret+24(FP) - RET - - // Return with match too long error -sequenceDecs_decodeSync_bmi2_error_match_len_too_big: - MOVQ ctx+16(FP), AX - MOVQ 16(SP), CX - MOVQ CX, 216(AX) - MOVQ $0x00000002, ret+24(FP) - RET - - // Return with match offset too long error -error_match_off_too_big: - MOVQ ctx+16(FP), AX - MOVQ 8(SP), CX - MOVQ CX, 224(AX) - MOVQ R11, 136(AX) - MOVQ $0x00000003, ret+24(FP) - RET - - // Return with not enough literals error -error_not_enough_literals: - MOVQ ctx+16(FP), AX - MOVQ 24(SP), CX - MOVQ CX, 208(AX) - MOVQ $0x00000004, ret+24(FP) - RET - - // Return with overread error -error_overread: - MOVQ $0x00000006, ret+24(FP) - RET - - // Return with not enough output space error -error_not_enough_space: - MOVQ ctx+16(FP), AX - MOVQ 24(SP), CX - MOVQ CX, 208(AX) - MOVQ 16(SP), CX - MOVQ CX, 216(AX) - MOVQ R11, 136(AX) - MOVQ $0x00000005, ret+24(FP) - RET - -// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int -// Requires: CMOV, SSE -TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32 - MOVQ br+8(FP), CX - MOVQ 24(CX), DX - MOVBQZX 32(CX), BX - MOVQ (CX), AX - MOVQ 8(CX), SI - ADDQ SI, AX - MOVQ AX, (SP) - MOVQ ctx+16(FP), AX - MOVQ 72(AX), DI - MOVQ 80(AX), R8 - MOVQ 88(AX), R9 - XORQ CX, CX - MOVQ CX, 8(SP) - MOVQ CX, 16(SP) - MOVQ CX, 24(SP) - MOVQ 112(AX), R10 - MOVQ 128(AX), CX - MOVQ CX, 32(SP) - MOVQ 144(AX), R11 - MOVQ 136(AX), R12 - MOVQ 200(AX), CX - MOVQ CX, 56(SP) - MOVQ 176(AX), CX - MOVQ CX, 48(SP) - MOVQ 184(AX), AX - MOVQ AX, 40(SP) - MOVQ 40(SP), AX - ADDQ AX, 48(SP) - - // Calculate pointer to s.out[cap(s.out)] (a past-end pointer) - ADDQ R10, 32(SP) - - // outBase += outPosition - ADDQ R12, R10 - -sequenceDecs_decodeSync_safe_amd64_main_loop: - MOVQ (SP), R13 - - // Fill bitreader to have enough for the offset and match length. - CMPQ SI, $0x08 - JL sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte - MOVQ BX, AX - SHRQ $0x03, AX - SUBQ AX, R13 - MOVQ (R13), DX - SUBQ AX, SI - ANDQ $0x07, BX - JMP sequenceDecs_decodeSync_safe_amd64_fill_end - -sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte: - CMPQ SI, $0x00 - JLE sequenceDecs_decodeSync_safe_amd64_fill_check_overread - CMPQ BX, $0x07 - JLE sequenceDecs_decodeSync_safe_amd64_fill_end - SHLQ $0x08, DX - SUBQ $0x01, R13 - SUBQ $0x01, SI - SUBQ $0x08, BX - MOVBQZX (R13), AX - ORQ AX, DX - JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte - -sequenceDecs_decodeSync_safe_amd64_fill_check_overread: - CMPQ BX, $0x40 - JA error_overread - -sequenceDecs_decodeSync_safe_amd64_fill_end: - // Update offset - MOVQ R9, AX - MOVQ BX, CX - MOVQ DX, R14 - SHLQ CL, R14 - MOVB AH, CL - SHRQ $0x20, AX - TESTQ CX, CX - JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero - ADDQ CX, BX - CMPQ BX, $0x40 - JA sequenceDecs_decodeSync_safe_amd64_of_update_zero - CMPQ CX, $0x40 - JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero - NEGQ CX - SHRQ CL, R14 - ADDQ R14, AX - -sequenceDecs_decodeSync_safe_amd64_of_update_zero: - MOVQ AX, 8(SP) - - // Update match length - MOVQ R8, AX - MOVQ BX, CX - MOVQ DX, R14 - SHLQ CL, R14 - MOVB AH, CL - SHRQ $0x20, AX - TESTQ CX, CX - JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero - ADDQ CX, BX - CMPQ BX, $0x40 - JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero - CMPQ CX, $0x40 - JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero - NEGQ CX - SHRQ CL, R14 - ADDQ R14, AX - -sequenceDecs_decodeSync_safe_amd64_ml_update_zero: - MOVQ AX, 16(SP) - - // Fill bitreader to have enough for the remaining - CMPQ SI, $0x08 - JL sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte - MOVQ BX, AX - SHRQ $0x03, AX - SUBQ AX, R13 - MOVQ (R13), DX - SUBQ AX, SI - ANDQ $0x07, BX - JMP sequenceDecs_decodeSync_safe_amd64_fill_2_end - -sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte: - CMPQ SI, $0x00 - JLE sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread - CMPQ BX, $0x07 - JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end - SHLQ $0x08, DX - SUBQ $0x01, R13 - SUBQ $0x01, SI - SUBQ $0x08, BX - MOVBQZX (R13), AX - ORQ AX, DX - JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte - -sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread: - CMPQ BX, $0x40 - JA error_overread - -sequenceDecs_decodeSync_safe_amd64_fill_2_end: - // Update literal length - MOVQ DI, AX - MOVQ BX, CX - MOVQ DX, R14 - SHLQ CL, R14 - MOVB AH, CL - SHRQ $0x20, AX - TESTQ CX, CX - JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero - ADDQ CX, BX - CMPQ BX, $0x40 - JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero - CMPQ CX, $0x40 - JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero - NEGQ CX - SHRQ CL, R14 - ADDQ R14, AX - -sequenceDecs_decodeSync_safe_amd64_ll_update_zero: - MOVQ AX, 24(SP) - - // Fill bitreader for state updates - MOVQ R13, (SP) - MOVQ R9, AX - SHRQ $0x08, AX - MOVBQZX AL, AX - MOVQ ctx+16(FP), CX - CMPQ 96(CX), $0x00 - JZ sequenceDecs_decodeSync_safe_amd64_skip_update - - // Update Literal Length State - MOVBQZX DI, R13 - SHRL $0x10, DI - LEAQ (BX)(R13*1), CX - MOVQ DX, R14 - MOVQ CX, BX - ROLQ CL, R14 - MOVL $0x00000001, R15 - MOVB R13, CL - SHLL CL, R15 - DECL R15 - ANDQ R15, R14 - ADDQ R14, DI - - // Load ctx.llTable - MOVQ ctx+16(FP), CX - MOVQ (CX), CX - MOVQ (CX)(DI*8), DI - - // Update Match Length State - MOVBQZX R8, R13 - SHRL $0x10, R8 - LEAQ (BX)(R13*1), CX - MOVQ DX, R14 - MOVQ CX, BX - ROLQ CL, R14 - MOVL $0x00000001, R15 - MOVB R13, CL - SHLL CL, R15 - DECL R15 - ANDQ R15, R14 - ADDQ R14, R8 - - // Load ctx.mlTable - MOVQ ctx+16(FP), CX - MOVQ 24(CX), CX - MOVQ (CX)(R8*8), R8 - - // Update Offset State - MOVBQZX R9, R13 - SHRL $0x10, R9 - LEAQ (BX)(R13*1), CX - MOVQ DX, R14 - MOVQ CX, BX - ROLQ CL, R14 - MOVL $0x00000001, R15 - MOVB R13, CL - SHLL CL, R15 - DECL R15 - ANDQ R15, R14 - ADDQ R14, R9 - - // Load ctx.ofTable - MOVQ ctx+16(FP), CX - MOVQ 48(CX), CX - MOVQ (CX)(R9*8), R9 - -sequenceDecs_decodeSync_safe_amd64_skip_update: - // Adjust offset - MOVQ s+0(FP), CX - MOVQ 8(SP), R13 - CMPQ AX, $0x01 - JBE sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0 - MOVUPS 144(CX), X0 - MOVQ R13, 144(CX) - MOVUPS X0, 152(CX) - JMP sequenceDecs_decodeSync_safe_amd64_after_adjust - -sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0: - CMPQ 24(SP), $0x00000000 - JNE sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero - INCQ R13 - JMP sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero - -sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero: - TESTQ R13, R13 - JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero - MOVQ 144(CX), R13 - JMP sequenceDecs_decodeSync_safe_amd64_after_adjust - -sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero: - MOVQ R13, AX - XORQ R14, R14 - MOVQ $-1, R15 - CMPQ R13, $0x03 - CMOVQEQ R14, AX - CMOVQEQ R15, R14 - ADDQ 144(CX)(AX*8), R14 - JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid - MOVQ $0x00000001, R14 - -sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid: - CMPQ R13, $0x01 - JZ sequenceDecs_decodeSync_safe_amd64_adjust_skip - MOVQ 152(CX), AX - MOVQ AX, 160(CX) - -sequenceDecs_decodeSync_safe_amd64_adjust_skip: - MOVQ 144(CX), AX - MOVQ AX, 152(CX) - MOVQ R14, 144(CX) - MOVQ R14, R13 - -sequenceDecs_decodeSync_safe_amd64_after_adjust: - MOVQ R13, 8(SP) - - // Check values - MOVQ 16(SP), AX - MOVQ 24(SP), CX - LEAQ (AX)(CX*1), R14 - MOVQ s+0(FP), R15 - ADDQ R14, 256(R15) - MOVQ ctx+16(FP), R14 - SUBQ CX, 104(R14) - JS error_not_enough_literals - CMPQ AX, $0x00020002 - JA sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big - TESTQ R13, R13 - JNZ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok - TESTQ AX, AX - JNZ sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch - -sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok: - MOVQ 24(SP), AX - MOVQ 8(SP), CX - MOVQ 16(SP), R13 - - // Check if we have enough space in s.out - LEAQ (AX)(R13*1), R14 - ADDQ R10, R14 - CMPQ R14, 32(SP) - JA error_not_enough_space - - // Copy literals - TESTQ AX, AX - JZ check_offset - MOVQ AX, R14 - SUBQ $0x10, R14 - JB copy_1_small - -copy_1_loop: - MOVUPS (R11), X0 - MOVUPS X0, (R10) - ADDQ $0x10, R11 - ADDQ $0x10, R10 - SUBQ $0x10, R14 - JAE copy_1_loop - LEAQ 16(R11)(R14*1), R11 - LEAQ 16(R10)(R14*1), R10 - MOVUPS -16(R11), X0 - MOVUPS X0, -16(R10) - JMP copy_1_end - -copy_1_small: - CMPQ AX, $0x03 - JE copy_1_move_3 - JB copy_1_move_1or2 - CMPQ AX, $0x08 - JB copy_1_move_4through7 - JMP copy_1_move_8through16 - -copy_1_move_1or2: - MOVB (R11), R14 - MOVB -1(R11)(AX*1), R15 - MOVB R14, (R10) - MOVB R15, -1(R10)(AX*1) - ADDQ AX, R11 - ADDQ AX, R10 - JMP copy_1_end - -copy_1_move_3: - MOVW (R11), R14 - MOVB 2(R11), R15 - MOVW R14, (R10) - MOVB R15, 2(R10) - ADDQ AX, R11 - ADDQ AX, R10 - JMP copy_1_end - -copy_1_move_4through7: - MOVL (R11), R14 - MOVL -4(R11)(AX*1), R15 - MOVL R14, (R10) - MOVL R15, -4(R10)(AX*1) - ADDQ AX, R11 - ADDQ AX, R10 - JMP copy_1_end - -copy_1_move_8through16: - MOVQ (R11), R14 - MOVQ -8(R11)(AX*1), R15 - MOVQ R14, (R10) - MOVQ R15, -8(R10)(AX*1) - ADDQ AX, R11 - ADDQ AX, R10 - -copy_1_end: - ADDQ AX, R12 - - // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) -check_offset: - MOVQ R12, AX - ADDQ 40(SP), AX - CMPQ CX, AX - JG error_match_off_too_big - CMPQ CX, 56(SP) - JG error_match_off_too_big - - // Copy match from history - MOVQ CX, AX - SUBQ R12, AX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ AX, R14 - CMPQ R13, AX - JG copy_all_from_history - MOVQ R13, AX - SUBQ $0x10, AX - JB copy_4_small - -copy_4_loop: - MOVUPS (R14), X0 - MOVUPS X0, (R10) - ADDQ $0x10, R14 - ADDQ $0x10, R10 - SUBQ $0x10, AX - JAE copy_4_loop - LEAQ 16(R14)(AX*1), R14 - LEAQ 16(R10)(AX*1), R10 - MOVUPS -16(R14), X0 - MOVUPS X0, -16(R10) - JMP copy_4_end - -copy_4_small: - CMPQ R13, $0x03 - JE copy_4_move_3 - CMPQ R13, $0x08 - JB copy_4_move_4through7 - JMP copy_4_move_8through16 - -copy_4_move_3: - MOVW (R14), AX - MOVB 2(R14), CL - MOVW AX, (R10) - MOVB CL, 2(R10) - ADDQ R13, R14 - ADDQ R13, R10 - JMP copy_4_end - -copy_4_move_4through7: - MOVL (R14), AX - MOVL -4(R14)(R13*1), CX - MOVL AX, (R10) - MOVL CX, -4(R10)(R13*1) - ADDQ R13, R14 - ADDQ R13, R10 - JMP copy_4_end - -copy_4_move_8through16: - MOVQ (R14), AX - MOVQ -8(R14)(R13*1), CX - MOVQ AX, (R10) - MOVQ CX, -8(R10)(R13*1) - ADDQ R13, R14 - ADDQ R13, R10 - -copy_4_end: - ADDQ R13, R12 - JMP handle_loop - JMP loop_finished - -copy_all_from_history: - MOVQ AX, R15 - SUBQ $0x10, R15 - JB copy_5_small - -copy_5_loop: - MOVUPS (R14), X0 - MOVUPS X0, (R10) - ADDQ $0x10, R14 - ADDQ $0x10, R10 - SUBQ $0x10, R15 - JAE copy_5_loop - LEAQ 16(R14)(R15*1), R14 - LEAQ 16(R10)(R15*1), R10 - MOVUPS -16(R14), X0 - MOVUPS X0, -16(R10) - JMP copy_5_end - -copy_5_small: - CMPQ AX, $0x03 - JE copy_5_move_3 - JB copy_5_move_1or2 - CMPQ AX, $0x08 - JB copy_5_move_4through7 - JMP copy_5_move_8through16 - -copy_5_move_1or2: - MOVB (R14), R15 - MOVB -1(R14)(AX*1), BP - MOVB R15, (R10) - MOVB BP, -1(R10)(AX*1) - ADDQ AX, R14 - ADDQ AX, R10 - JMP copy_5_end - -copy_5_move_3: - MOVW (R14), R15 - MOVB 2(R14), BP - MOVW R15, (R10) - MOVB BP, 2(R10) - ADDQ AX, R14 - ADDQ AX, R10 - JMP copy_5_end - -copy_5_move_4through7: - MOVL (R14), R15 - MOVL -4(R14)(AX*1), BP - MOVL R15, (R10) - MOVL BP, -4(R10)(AX*1) - ADDQ AX, R14 - ADDQ AX, R10 - JMP copy_5_end - -copy_5_move_8through16: - MOVQ (R14), R15 - MOVQ -8(R14)(AX*1), BP - MOVQ R15, (R10) - MOVQ BP, -8(R10)(AX*1) - ADDQ AX, R14 - ADDQ AX, R10 - -copy_5_end: - ADDQ AX, R12 - SUBQ AX, R13 - - // Copy match from the current buffer -copy_match: - MOVQ R10, AX - SUBQ CX, AX - - // ml <= mo - CMPQ R13, CX - JA copy_overlapping_match - - // Copy non-overlapping match - ADDQ R13, R12 - MOVQ R13, CX - SUBQ $0x10, CX - JB copy_2_small - -copy_2_loop: - MOVUPS (AX), X0 - MOVUPS X0, (R10) - ADDQ $0x10, AX - ADDQ $0x10, R10 - SUBQ $0x10, CX - JAE copy_2_loop - LEAQ 16(AX)(CX*1), AX - LEAQ 16(R10)(CX*1), R10 - MOVUPS -16(AX), X0 - MOVUPS X0, -16(R10) - JMP copy_2_end - -copy_2_small: - CMPQ R13, $0x03 - JE copy_2_move_3 - JB copy_2_move_1or2 - CMPQ R13, $0x08 - JB copy_2_move_4through7 - JMP copy_2_move_8through16 - -copy_2_move_1or2: - MOVB (AX), CL - MOVB -1(AX)(R13*1), R14 - MOVB CL, (R10) - MOVB R14, -1(R10)(R13*1) - ADDQ R13, AX - ADDQ R13, R10 - JMP copy_2_end - -copy_2_move_3: - MOVW (AX), CX - MOVB 2(AX), R14 - MOVW CX, (R10) - MOVB R14, 2(R10) - ADDQ R13, AX - ADDQ R13, R10 - JMP copy_2_end - -copy_2_move_4through7: - MOVL (AX), CX - MOVL -4(AX)(R13*1), R14 - MOVL CX, (R10) - MOVL R14, -4(R10)(R13*1) - ADDQ R13, AX - ADDQ R13, R10 - JMP copy_2_end - -copy_2_move_8through16: - MOVQ (AX), CX - MOVQ -8(AX)(R13*1), R14 - MOVQ CX, (R10) - MOVQ R14, -8(R10)(R13*1) - ADDQ R13, AX - ADDQ R13, R10 - -copy_2_end: - JMP handle_loop - - // Copy overlapping match -copy_overlapping_match: - ADDQ R13, R12 - -copy_slow_3: - MOVB (AX), CL - MOVB CL, (R10) - INCQ AX - INCQ R10 - DECQ R13 - JNZ copy_slow_3 - -handle_loop: - MOVQ ctx+16(FP), AX - DECQ 96(AX) - JNS sequenceDecs_decodeSync_safe_amd64_main_loop - -loop_finished: - MOVQ br+8(FP), AX - MOVQ DX, 24(AX) - MOVB BL, 32(AX) - MOVQ SI, 8(AX) - - // Update the context - MOVQ ctx+16(FP), AX - MOVQ R12, 136(AX) - MOVQ 144(AX), CX - SUBQ CX, R11 - MOVQ R11, 168(AX) - - // Return success - MOVQ $0x00000000, ret+24(FP) - RET - - // Return with match length error -sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch: - MOVQ 16(SP), AX - MOVQ ctx+16(FP), CX - MOVQ AX, 216(CX) - MOVQ $0x00000001, ret+24(FP) - RET - - // Return with match too long error -sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big: - MOVQ ctx+16(FP), AX - MOVQ 16(SP), CX - MOVQ CX, 216(AX) - MOVQ $0x00000002, ret+24(FP) - RET - - // Return with match offset too long error -error_match_off_too_big: - MOVQ ctx+16(FP), AX - MOVQ 8(SP), CX - MOVQ CX, 224(AX) - MOVQ R12, 136(AX) - MOVQ $0x00000003, ret+24(FP) - RET - - // Return with not enough literals error -error_not_enough_literals: - MOVQ ctx+16(FP), AX - MOVQ 24(SP), CX - MOVQ CX, 208(AX) - MOVQ $0x00000004, ret+24(FP) - RET - - // Return with overread error -error_overread: - MOVQ $0x00000006, ret+24(FP) - RET - - // Return with not enough output space error -error_not_enough_space: - MOVQ ctx+16(FP), AX - MOVQ 24(SP), CX - MOVQ CX, 208(AX) - MOVQ 16(SP), CX - MOVQ CX, 216(AX) - MOVQ R12, 136(AX) - MOVQ $0x00000005, ret+24(FP) - RET - -// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int -// Requires: BMI, BMI2, CMOV, SSE -TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32 - MOVQ br+8(FP), BX - MOVQ 24(BX), AX - MOVBQZX 32(BX), DX - MOVQ (BX), CX - MOVQ 8(BX), BX - ADDQ BX, CX - MOVQ CX, (SP) - MOVQ ctx+16(FP), CX - MOVQ 72(CX), SI - MOVQ 80(CX), DI - MOVQ 88(CX), R8 - XORQ R9, R9 - MOVQ R9, 8(SP) - MOVQ R9, 16(SP) - MOVQ R9, 24(SP) - MOVQ 112(CX), R9 - MOVQ 128(CX), R10 - MOVQ R10, 32(SP) - MOVQ 144(CX), R10 - MOVQ 136(CX), R11 - MOVQ 200(CX), R12 - MOVQ R12, 56(SP) - MOVQ 176(CX), R12 - MOVQ R12, 48(SP) - MOVQ 184(CX), CX - MOVQ CX, 40(SP) - MOVQ 40(SP), CX - ADDQ CX, 48(SP) - - // Calculate pointer to s.out[cap(s.out)] (a past-end pointer) - ADDQ R9, 32(SP) - - // outBase += outPosition - ADDQ R11, R9 - -sequenceDecs_decodeSync_safe_bmi2_main_loop: - MOVQ (SP), R12 - - // Fill bitreader to have enough for the offset and match length. - CMPQ BX, $0x08 - JL sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte - MOVQ DX, CX - SHRQ $0x03, CX - SUBQ CX, R12 - MOVQ (R12), AX - SUBQ CX, BX - ANDQ $0x07, DX - JMP sequenceDecs_decodeSync_safe_bmi2_fill_end - -sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte: - CMPQ BX, $0x00 - JLE sequenceDecs_decodeSync_safe_bmi2_fill_check_overread - CMPQ DX, $0x07 - JLE sequenceDecs_decodeSync_safe_bmi2_fill_end - SHLQ $0x08, AX - SUBQ $0x01, R12 - SUBQ $0x01, BX - SUBQ $0x08, DX - MOVBQZX (R12), CX - ORQ CX, AX - JMP sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte - -sequenceDecs_decodeSync_safe_bmi2_fill_check_overread: - CMPQ DX, $0x40 - JA error_overread - -sequenceDecs_decodeSync_safe_bmi2_fill_end: - // Update offset - MOVQ $0x00000808, CX - BEXTRQ CX, R8, R13 - MOVQ AX, R14 - LEAQ (DX)(R13*1), CX - ROLQ CL, R14 - BZHIQ R13, R14, R14 - MOVQ CX, DX - MOVQ R8, CX - SHRQ $0x20, CX - ADDQ R14, CX - MOVQ CX, 8(SP) - - // Update match length - MOVQ $0x00000808, CX - BEXTRQ CX, DI, R13 - MOVQ AX, R14 - LEAQ (DX)(R13*1), CX - ROLQ CL, R14 - BZHIQ R13, R14, R14 - MOVQ CX, DX - MOVQ DI, CX - SHRQ $0x20, CX - ADDQ R14, CX - MOVQ CX, 16(SP) - - // Fill bitreader to have enough for the remaining - CMPQ BX, $0x08 - JL sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte - MOVQ DX, CX - SHRQ $0x03, CX - SUBQ CX, R12 - MOVQ (R12), AX - SUBQ CX, BX - ANDQ $0x07, DX - JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_end - -sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte: - CMPQ BX, $0x00 - JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread - CMPQ DX, $0x07 - JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end - SHLQ $0x08, AX - SUBQ $0x01, R12 - SUBQ $0x01, BX - SUBQ $0x08, DX - MOVBQZX (R12), CX - ORQ CX, AX - JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte - -sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread: - CMPQ DX, $0x40 - JA error_overread - -sequenceDecs_decodeSync_safe_bmi2_fill_2_end: - // Update literal length - MOVQ $0x00000808, CX - BEXTRQ CX, SI, R13 - MOVQ AX, R14 - LEAQ (DX)(R13*1), CX - ROLQ CL, R14 - BZHIQ R13, R14, R14 - MOVQ CX, DX - MOVQ SI, CX - SHRQ $0x20, CX - ADDQ R14, CX - MOVQ CX, 24(SP) - - // Fill bitreader for state updates - MOVQ R12, (SP) - MOVQ $0x00000808, CX - BEXTRQ CX, R8, R12 - MOVQ ctx+16(FP), CX - CMPQ 96(CX), $0x00 - JZ sequenceDecs_decodeSync_safe_bmi2_skip_update - LEAQ (SI)(DI*1), R13 - ADDQ R8, R13 - MOVBQZX R13, R13 - LEAQ (DX)(R13*1), CX - MOVQ AX, R14 - MOVQ CX, DX - ROLQ CL, R14 - BZHIQ R13, R14, R14 - - // Update Offset State - BZHIQ R8, R14, CX - SHRXQ R8, R14, R14 - SHRL $0x10, R8 - ADDQ CX, R8 - - // Load ctx.ofTable - MOVQ ctx+16(FP), CX - MOVQ 48(CX), CX - MOVQ (CX)(R8*8), R8 - - // Update Match Length State - BZHIQ DI, R14, CX - SHRXQ DI, R14, R14 - SHRL $0x10, DI - ADDQ CX, DI - - // Load ctx.mlTable - MOVQ ctx+16(FP), CX - MOVQ 24(CX), CX - MOVQ (CX)(DI*8), DI - - // Update Literal Length State - BZHIQ SI, R14, CX - SHRL $0x10, SI - ADDQ CX, SI - - // Load ctx.llTable - MOVQ ctx+16(FP), CX - MOVQ (CX), CX - MOVQ (CX)(SI*8), SI - -sequenceDecs_decodeSync_safe_bmi2_skip_update: - // Adjust offset - MOVQ s+0(FP), CX - MOVQ 8(SP), R13 - CMPQ R12, $0x01 - JBE sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0 - MOVUPS 144(CX), X0 - MOVQ R13, 144(CX) - MOVUPS X0, 152(CX) - JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust - -sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0: - CMPQ 24(SP), $0x00000000 - JNE sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero - INCQ R13 - JMP sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero - -sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero: - TESTQ R13, R13 - JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero - MOVQ 144(CX), R13 - JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust - -sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero: - MOVQ R13, R12 - XORQ R14, R14 - MOVQ $-1, R15 - CMPQ R13, $0x03 - CMOVQEQ R14, R12 - CMOVQEQ R15, R14 - ADDQ 144(CX)(R12*8), R14 - JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid - MOVQ $0x00000001, R14 - -sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid: - CMPQ R13, $0x01 - JZ sequenceDecs_decodeSync_safe_bmi2_adjust_skip - MOVQ 152(CX), R12 - MOVQ R12, 160(CX) - -sequenceDecs_decodeSync_safe_bmi2_adjust_skip: - MOVQ 144(CX), R12 - MOVQ R12, 152(CX) - MOVQ R14, 144(CX) - MOVQ R14, R13 - -sequenceDecs_decodeSync_safe_bmi2_after_adjust: - MOVQ R13, 8(SP) - - // Check values - MOVQ 16(SP), CX - MOVQ 24(SP), R12 - LEAQ (CX)(R12*1), R14 - MOVQ s+0(FP), R15 - ADDQ R14, 256(R15) - MOVQ ctx+16(FP), R14 - SUBQ R12, 104(R14) - JS error_not_enough_literals - CMPQ CX, $0x00020002 - JA sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big - TESTQ R13, R13 - JNZ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok - TESTQ CX, CX - JNZ sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch - -sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok: - MOVQ 24(SP), CX - MOVQ 8(SP), R12 - MOVQ 16(SP), R13 - - // Check if we have enough space in s.out - LEAQ (CX)(R13*1), R14 - ADDQ R9, R14 - CMPQ R14, 32(SP) - JA error_not_enough_space - - // Copy literals - TESTQ CX, CX - JZ check_offset - MOVQ CX, R14 - SUBQ $0x10, R14 - JB copy_1_small - -copy_1_loop: - MOVUPS (R10), X0 - MOVUPS X0, (R9) - ADDQ $0x10, R10 - ADDQ $0x10, R9 - SUBQ $0x10, R14 - JAE copy_1_loop - LEAQ 16(R10)(R14*1), R10 - LEAQ 16(R9)(R14*1), R9 - MOVUPS -16(R10), X0 - MOVUPS X0, -16(R9) - JMP copy_1_end - -copy_1_small: - CMPQ CX, $0x03 - JE copy_1_move_3 - JB copy_1_move_1or2 - CMPQ CX, $0x08 - JB copy_1_move_4through7 - JMP copy_1_move_8through16 - -copy_1_move_1or2: - MOVB (R10), R14 - MOVB -1(R10)(CX*1), R15 - MOVB R14, (R9) - MOVB R15, -1(R9)(CX*1) - ADDQ CX, R10 - ADDQ CX, R9 - JMP copy_1_end - -copy_1_move_3: - MOVW (R10), R14 - MOVB 2(R10), R15 - MOVW R14, (R9) - MOVB R15, 2(R9) - ADDQ CX, R10 - ADDQ CX, R9 - JMP copy_1_end - -copy_1_move_4through7: - MOVL (R10), R14 - MOVL -4(R10)(CX*1), R15 - MOVL R14, (R9) - MOVL R15, -4(R9)(CX*1) - ADDQ CX, R10 - ADDQ CX, R9 - JMP copy_1_end - -copy_1_move_8through16: - MOVQ (R10), R14 - MOVQ -8(R10)(CX*1), R15 - MOVQ R14, (R9) - MOVQ R15, -8(R9)(CX*1) - ADDQ CX, R10 - ADDQ CX, R9 - -copy_1_end: - ADDQ CX, R11 - - // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) -check_offset: - MOVQ R11, CX - ADDQ 40(SP), CX - CMPQ R12, CX - JG error_match_off_too_big - CMPQ R12, 56(SP) - JG error_match_off_too_big - - // Copy match from history - MOVQ R12, CX - SUBQ R11, CX - JLS copy_match - MOVQ 48(SP), R14 - SUBQ CX, R14 - CMPQ R13, CX - JG copy_all_from_history - MOVQ R13, CX - SUBQ $0x10, CX - JB copy_4_small - -copy_4_loop: - MOVUPS (R14), X0 - MOVUPS X0, (R9) - ADDQ $0x10, R14 - ADDQ $0x10, R9 - SUBQ $0x10, CX - JAE copy_4_loop - LEAQ 16(R14)(CX*1), R14 - LEAQ 16(R9)(CX*1), R9 - MOVUPS -16(R14), X0 - MOVUPS X0, -16(R9) - JMP copy_4_end - -copy_4_small: - CMPQ R13, $0x03 - JE copy_4_move_3 - CMPQ R13, $0x08 - JB copy_4_move_4through7 - JMP copy_4_move_8through16 - -copy_4_move_3: - MOVW (R14), CX - MOVB 2(R14), R12 - MOVW CX, (R9) - MOVB R12, 2(R9) - ADDQ R13, R14 - ADDQ R13, R9 - JMP copy_4_end - -copy_4_move_4through7: - MOVL (R14), CX - MOVL -4(R14)(R13*1), R12 - MOVL CX, (R9) - MOVL R12, -4(R9)(R13*1) - ADDQ R13, R14 - ADDQ R13, R9 - JMP copy_4_end - -copy_4_move_8through16: - MOVQ (R14), CX - MOVQ -8(R14)(R13*1), R12 - MOVQ CX, (R9) - MOVQ R12, -8(R9)(R13*1) - ADDQ R13, R14 - ADDQ R13, R9 - -copy_4_end: - ADDQ R13, R11 - JMP handle_loop - JMP loop_finished - -copy_all_from_history: - MOVQ CX, R15 - SUBQ $0x10, R15 - JB copy_5_small - -copy_5_loop: - MOVUPS (R14), X0 - MOVUPS X0, (R9) - ADDQ $0x10, R14 - ADDQ $0x10, R9 - SUBQ $0x10, R15 - JAE copy_5_loop - LEAQ 16(R14)(R15*1), R14 - LEAQ 16(R9)(R15*1), R9 - MOVUPS -16(R14), X0 - MOVUPS X0, -16(R9) - JMP copy_5_end - -copy_5_small: - CMPQ CX, $0x03 - JE copy_5_move_3 - JB copy_5_move_1or2 - CMPQ CX, $0x08 - JB copy_5_move_4through7 - JMP copy_5_move_8through16 - -copy_5_move_1or2: - MOVB (R14), R15 - MOVB -1(R14)(CX*1), BP - MOVB R15, (R9) - MOVB BP, -1(R9)(CX*1) - ADDQ CX, R14 - ADDQ CX, R9 - JMP copy_5_end - -copy_5_move_3: - MOVW (R14), R15 - MOVB 2(R14), BP - MOVW R15, (R9) - MOVB BP, 2(R9) - ADDQ CX, R14 - ADDQ CX, R9 - JMP copy_5_end - -copy_5_move_4through7: - MOVL (R14), R15 - MOVL -4(R14)(CX*1), BP - MOVL R15, (R9) - MOVL BP, -4(R9)(CX*1) - ADDQ CX, R14 - ADDQ CX, R9 - JMP copy_5_end - -copy_5_move_8through16: - MOVQ (R14), R15 - MOVQ -8(R14)(CX*1), BP - MOVQ R15, (R9) - MOVQ BP, -8(R9)(CX*1) - ADDQ CX, R14 - ADDQ CX, R9 - -copy_5_end: - ADDQ CX, R11 - SUBQ CX, R13 - - // Copy match from the current buffer -copy_match: - MOVQ R9, CX - SUBQ R12, CX - - // ml <= mo - CMPQ R13, R12 - JA copy_overlapping_match - - // Copy non-overlapping match - ADDQ R13, R11 - MOVQ R13, R12 - SUBQ $0x10, R12 - JB copy_2_small - -copy_2_loop: - MOVUPS (CX), X0 - MOVUPS X0, (R9) - ADDQ $0x10, CX - ADDQ $0x10, R9 - SUBQ $0x10, R12 - JAE copy_2_loop - LEAQ 16(CX)(R12*1), CX - LEAQ 16(R9)(R12*1), R9 - MOVUPS -16(CX), X0 - MOVUPS X0, -16(R9) - JMP copy_2_end - -copy_2_small: - CMPQ R13, $0x03 - JE copy_2_move_3 - JB copy_2_move_1or2 - CMPQ R13, $0x08 - JB copy_2_move_4through7 - JMP copy_2_move_8through16 - -copy_2_move_1or2: - MOVB (CX), R12 - MOVB -1(CX)(R13*1), R14 - MOVB R12, (R9) - MOVB R14, -1(R9)(R13*1) - ADDQ R13, CX - ADDQ R13, R9 - JMP copy_2_end - -copy_2_move_3: - MOVW (CX), R12 - MOVB 2(CX), R14 - MOVW R12, (R9) - MOVB R14, 2(R9) - ADDQ R13, CX - ADDQ R13, R9 - JMP copy_2_end - -copy_2_move_4through7: - MOVL (CX), R12 - MOVL -4(CX)(R13*1), R14 - MOVL R12, (R9) - MOVL R14, -4(R9)(R13*1) - ADDQ R13, CX - ADDQ R13, R9 - JMP copy_2_end - -copy_2_move_8through16: - MOVQ (CX), R12 - MOVQ -8(CX)(R13*1), R14 - MOVQ R12, (R9) - MOVQ R14, -8(R9)(R13*1) - ADDQ R13, CX - ADDQ R13, R9 - -copy_2_end: - JMP handle_loop - - // Copy overlapping match -copy_overlapping_match: - ADDQ R13, R11 - -copy_slow_3: - MOVB (CX), R12 - MOVB R12, (R9) - INCQ CX - INCQ R9 - DECQ R13 - JNZ copy_slow_3 - -handle_loop: - MOVQ ctx+16(FP), CX - DECQ 96(CX) - JNS sequenceDecs_decodeSync_safe_bmi2_main_loop - -loop_finished: - MOVQ br+8(FP), CX - MOVQ AX, 24(CX) - MOVB DL, 32(CX) - MOVQ BX, 8(CX) - - // Update the context - MOVQ ctx+16(FP), AX - MOVQ R11, 136(AX) - MOVQ 144(AX), CX - SUBQ CX, R10 - MOVQ R10, 168(AX) - - // Return success - MOVQ $0x00000000, ret+24(FP) - RET - - // Return with match length error -sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch: - MOVQ 16(SP), AX - MOVQ ctx+16(FP), CX - MOVQ AX, 216(CX) - MOVQ $0x00000001, ret+24(FP) - RET - - // Return with match too long error -sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big: - MOVQ ctx+16(FP), AX - MOVQ 16(SP), CX - MOVQ CX, 216(AX) - MOVQ $0x00000002, ret+24(FP) - RET - - // Return with match offset too long error -error_match_off_too_big: - MOVQ ctx+16(FP), AX - MOVQ 8(SP), CX - MOVQ CX, 224(AX) - MOVQ R11, 136(AX) - MOVQ $0x00000003, ret+24(FP) - RET - - // Return with not enough literals error -error_not_enough_literals: - MOVQ ctx+16(FP), AX - MOVQ 24(SP), CX - MOVQ CX, 208(AX) - MOVQ $0x00000004, ret+24(FP) - RET - - // Return with overread error -error_overread: - MOVQ $0x00000006, ret+24(FP) - RET - - // Return with not enough output space error -error_not_enough_space: - MOVQ ctx+16(FP), AX - MOVQ 24(SP), CX - MOVQ CX, 208(AX) - MOVQ 16(SP), CX - MOVQ CX, 216(AX) - MOVQ R11, 136(AX) - MOVQ $0x00000005, ret+24(FP) - RET |