Diffstat (limited to 'vendor/github.com/minio/crc64nvme/crc64_amd64.s')
-rw-r--r--	vendor/github.com/minio/crc64nvme/crc64_amd64.s	150
1 file changed, 150 insertions, 0 deletions
diff --git a/vendor/github.com/minio/crc64nvme/crc64_amd64.s b/vendor/github.com/minio/crc64nvme/crc64_amd64.s
index 9782321fd..acfea6a15 100644
--- a/vendor/github.com/minio/crc64nvme/crc64_amd64.s
+++ b/vendor/github.com/minio/crc64nvme/crc64_amd64.s
@@ -155,3 +155,153 @@ skip128:
 	NOTQ AX
 	MOVQ AX, checksum+32(FP)
 	RET
+
+// Constants, pre-splatted.
+DATA ·asmConstantsPoly<>+0x00(SB)/8, $0xa1ca681e733f9c40
+DATA ·asmConstantsPoly<>+0x08(SB)/8, $0
+DATA ·asmConstantsPoly<>+0x10(SB)/8, $0xa1ca681e733f9c40
+DATA ·asmConstantsPoly<>+0x18(SB)/8, $0
+DATA ·asmConstantsPoly<>+0x20(SB)/8, $0xa1ca681e733f9c40
+DATA ·asmConstantsPoly<>+0x28(SB)/8, $0
+DATA ·asmConstantsPoly<>+0x30(SB)/8, $0xa1ca681e733f9c40
+DATA ·asmConstantsPoly<>+0x38(SB)/8, $0
+// Upper
+DATA ·asmConstantsPoly<>+0x40(SB)/8, $0
+DATA ·asmConstantsPoly<>+0x48(SB)/8, $0x5f852fb61e8d92dc
+DATA ·asmConstantsPoly<>+0x50(SB)/8, $0
+DATA ·asmConstantsPoly<>+0x58(SB)/8, $0x5f852fb61e8d92dc
+DATA ·asmConstantsPoly<>+0x60(SB)/8, $0
+DATA ·asmConstantsPoly<>+0x68(SB)/8, $0x5f852fb61e8d92dc
+DATA ·asmConstantsPoly<>+0x70(SB)/8, $0
+DATA ·asmConstantsPoly<>+0x78(SB)/8, $0x5f852fb61e8d92dc
+GLOBL ·asmConstantsPoly<>(SB), (NOPTR+RODATA), $128
+
+TEXT ·updateAsm512(SB), $0-40
+	MOVQ crc+0(FP), AX    // checksum
+	MOVQ p_base+8(FP), SI // start pointer
+	MOVQ p_len+16(FP), CX // length of buffer
+	NOTQ AX
+	SHRQ $7, CX
+	CMPQ CX, $1
+	VPXORQ Z8, Z8, Z8     // Initialize ZMM8 to zero
+	JLT skip128
+
+	VMOVDQU64 0x00(SI), Z0
+	VMOVDQU64 0x40(SI), Z4
+	MOVQ $·asmConstantsPoly<>(SB), BX
+	VMOVQ AX, X8
+
+	// XOR initialization value into lower 64 bits of ZMM0
+	VPXORQ Z8, Z0, Z0
+	CMPQ CX, $1
+	JE tail128
+
+	VMOVDQU64 0(BX), Z8
+	VMOVDQU64 64(BX), Z9
+
+	PCALIGN $16
+
+loop128:
+	VMOVDQU64 0x80(SI), Z1
+	VMOVDQU64 0xc0(SI), Z5
+	ADDQ $128, SI
+
+	SUBQ $1, CX
+	VPCLMULQDQ $0x00, Z8, Z0, Z10
+	VPCLMULQDQ $0x11, Z9, Z0, Z0
+	VPTERNLOGD $0x96, Z1, Z10, Z0 // Combine results with xor into Z0
+
+	VPCLMULQDQ $0x00, Z8, Z4, Z10
+	VPCLMULQDQ $0x11, Z9, Z4, Z4
+	VPTERNLOGD $0x96, Z5, Z10, Z4 // Combine results with xor into Z4
+
+	CMPQ CX, $1
+	JGT loop128
+
+tail128:
+	// Extract X0 to X3 from ZMM0
+	VEXTRACTF32X4 $1, Z0, X1 // X1: Second 128-bit lane
+	VEXTRACTF32X4 $2, Z0, X2 // X2: Third 128-bit lane
+	VEXTRACTF32X4 $3, Z0, X3 // X3: Fourth 128-bit lane
+
+	// Extract X4 to X7 from ZMM4
+	VEXTRACTF32X4 $1, Z4, X5 // X5: Second 128-bit lane
+	VEXTRACTF32X4 $2, Z4, X6 // X6: Third 128-bit lane
+	VEXTRACTF32X4 $3, Z4, X7 // X7: Fourth 128-bit lane
+
+	MOVQ $0xd083dd594d96319d, AX
+	MOVQ AX, X11
+	PCLMULQDQ $0x00, X0, X11
+	MOVQ $0x946588403d4adcbc, AX
+	PINSRQ $0x1, AX, X12
+	PCLMULQDQ $0x11, X12, X0
+	PXOR X11, X7
+	PXOR X0, X7
+	MOVQ $0x3c255f5ebc414423, AX
+	MOVQ AX, X11
+	PCLMULQDQ $0x00, X1, X11
+	MOVQ $0x34f5a24e22d66e90, AX
+	PINSRQ $0x1, AX, X12
+	PCLMULQDQ $0x11, X12, X1
+	PXOR X11, X1
+	PXOR X7, X1
+	MOVQ $0x7b0ab10dd0f809fe, AX
+	MOVQ AX, X11
+	PCLMULQDQ $0x00, X2, X11
+	MOVQ $0x03363823e6e791e5, AX
+	PINSRQ $0x1, AX, X12
+	PCLMULQDQ $0x11, X12, X2
+	PXOR X11, X2
+	PXOR X1, X2
+	MOVQ $0x0c32cdb31e18a84a, AX
+	MOVQ AX, X11
+	PCLMULQDQ $0x00, X3, X11
+	MOVQ $0x62242240ace5045a, AX
+	PINSRQ $0x1, AX, X12
+	PCLMULQDQ $0x11, X12, X3
+	PXOR X11, X3
+	PXOR X2, X3
+	MOVQ $0xbdd7ac0ee1a4a0f0, AX
+	MOVQ AX, X11
+	PCLMULQDQ $0x00, X4, X11
+	MOVQ $0xa3ffdc1fe8e82a8b, AX
+	PINSRQ $0x1, AX, X12
+	PCLMULQDQ $0x11, X12, X4
+	PXOR X11, X4
+	PXOR X3, X4
+	MOVQ $0xb0bc2e589204f500, AX
+	MOVQ AX, X11
+	PCLMULQDQ $0x00, X5, X11
+	MOVQ $0xe1e0bb9d45d7a44c, AX
+	PINSRQ $0x1, AX, X12
+	PCLMULQDQ $0x11, X12, X5
+	PXOR X11, X5
+	PXOR X4, X5
+	MOVQ $0xeadc41fd2ba3d420, AX
+	MOVQ AX, X11
+	PCLMULQDQ $0x00, X6, X11
+	MOVQ $0x21e9761e252621ac, AX
+	PINSRQ $0x1, AX, X12
+	PCLMULQDQ $0x11, X12, X6
+	PXOR X11, X6
+	PXOR X5, X6
+	MOVQ AX, X5
+	PCLMULQDQ $0x00, X6, X5
+	PSHUFD $0xee, X6, X6
+	PXOR X5, X6
+	MOVQ $0x27ecfa329aef9f77, AX
+	MOVQ AX, X4
+	PCLMULQDQ $0x00, X4, X6
+	PEXTRQ $0, X6, BX
+	MOVQ $0x34d926535897936b, AX
+	MOVQ AX, X4
+	PCLMULQDQ $0x00, X4, X6
+	PXOR X5, X6
+	PEXTRQ $1, X6, AX
+	XORQ BX, AX
+
+skip128:
+	NOTQ AX
+	MOVQ AX, checksum+32(FP)
+	VZEROUPPER
+	RET
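Note on the stub's interface: the $0-40 frame implies a Go declaration with crc at +0(FP), the slice header at +8/+16/+24(FP), and the uint64 result at +32(FP). The NOTQ at entry and exit implements CRC-64/NVME's inverted-init/inverted-out convention, and SHRQ $7, CX means the routine folds only whole 128-byte blocks, leaving any remainder to the caller. The sketch below shows the Go-side view under those observations; it compiles only alongside the .s file, and the dispatch wrapper, hasAVX512, and nvmeTable are illustrative names, not the package's actual identifiers.

    //go:build amd64

    package crc64nvme

    import "hash/crc64"

    // Hypothetical CPU-feature flag; real code would detect AVX-512
    // and VPCLMULQDQ support at startup (e.g. via a cpuid package).
    var hasAVX512 = false

    // Table-driven fallback for tails and non-AVX-512 CPUs, built from
    // the reflected CRC-64/NVME polynomial (the bit-reversal of the
    // normal-form polynomial 0xad93d23594c93659).
    var nvmeTable = crc64.MakeTable(0x9a6c9329ac4bc9b5)

    // Declaration implied by the $0-40 frame of the assembly stub.
    //
    //go:noescape
    func updateAsm512(crc uint64, p []byte) uint64

    // update routes whole 128-byte blocks to the vector path; the stub
    // drops any sub-block remainder (SHRQ $7, CX), so the tail is
    // finished with the table-driven path, which uses the same
    // invert-in/invert-out convention as the NOTQ pair in the stub.
    func update(crc uint64, p []byte) uint64 {
    	if hasAVX512 && len(p) >= 128 {
    		n := len(p) &^ 127 // largest multiple of 128 bytes
    		crc = updateAsm512(crc, p[:n])
    		p = p[n:]
    	}
    	return crc64.Update(crc, nvmeTable, p)
    }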

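As a cross-check for what the vectorized path must compute, CRC-64/NVME can be reproduced with the standard library's table-driven hash/crc64. A minimal sketch, assuming only the catalogued CRC-64/NVME parameters (reflected polynomial 0x9a6c9329ac4bc9b5, init and xorout all-ones, check value 0xae8b14860a799888 for the input "123456789"):

    package main

    import (
    	"fmt"
    	"hash/crc64"
    )

    func main() {
    	// Reflected CRC-64/NVME polynomial, in the reversed bit order
    	// that hash/crc64 expects.
    	tab := crc64.MakeTable(0x9a6c9329ac4bc9b5)

    	// Prints 0xae8b14860a799888, the catalogued check value; any
    	// input handled by the assembly path must match this
    	// table-driven result.
    	fmt.Printf("%#x\n", crc64.Checksum([]byte("123456789"), tab))
    }

hash/crc64 inverts the running CRC on entry and exit of each update, matching the stub's NOTQ pair, so the two paths can be chained or compared on the same running value.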