Diffstat (limited to 'vendor/github.com/klauspost/crc32/crc32_amd64.s')
| -rw-r--r-- | vendor/github.com/klauspost/crc32/crc32_amd64.s | 527 |
1 file changed, 527 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64.s b/vendor/github.com/klauspost/crc32/crc32_amd64.s
new file mode 100644
index 000000000..e2de3a5cb
--- /dev/null
+++ b/vendor/github.com/klauspost/crc32/crc32_amd64.s
@@ -0,0 +1,527 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
+//
+// func castagnoliSSE42(crc uint32, p []byte) uint32
+TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
+	MOVL crc+0(FP), AX // CRC value
+	MOVQ p+8(FP), SI // data pointer
+	MOVQ p_len+16(FP), CX // len(p)
+
+	// If there are fewer than 8 bytes to process, skip alignment.
+	CMPQ CX, $8
+	JL less_than_8
+
+	MOVQ SI, BX
+	ANDQ $7, BX
+	JZ aligned
+
+	// Process the first few bytes to 8-byte align the input.
+
+	// BX = 8 - BX. We need to process this many bytes to align.
+	SUBQ $1, BX
+	XORQ $7, BX
+
+	BTQ $0, BX
+	JNC align_2
+
+	CRC32B (SI), AX
+	DECQ CX
+	INCQ SI
+
+align_2:
+	BTQ $1, BX
+	JNC align_4
+
+	CRC32W (SI), AX
+
+	SUBQ $2, CX
+	ADDQ $2, SI
+
+align_4:
+	BTQ $2, BX
+	JNC aligned
+
+	CRC32L (SI), AX
+
+	SUBQ $4, CX
+	ADDQ $4, SI
+
+aligned:
+	// The input is now 8-byte aligned and we can process 8-byte chunks.
+	CMPQ CX, $8
+	JL less_than_8
+
+	CRC32Q (SI), AX
+	ADDQ $8, SI
+	SUBQ $8, CX
+	JMP aligned
+
+less_than_8:
+	// We may have some bytes left over; process 4 bytes, then 2, then 1.
+	BTQ $2, CX
+	JNC less_than_4
+
+	CRC32L (SI), AX
+	ADDQ $4, SI
+
+less_than_4:
+	BTQ $1, CX
+	JNC less_than_2
+
+	CRC32W (SI), AX
+	ADDQ $2, SI
+
+less_than_2:
+	BTQ $0, CX
+	JNC done
+
+	CRC32B (SI), AX
+
+done:
+	MOVL AX, ret+32(FP)
+	RET
+
+// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
+// bytes from each buffer.
+//
+// func castagnoliSSE42Triple(
+//     crc1, crc2, crc3 uint32,
+//     a, b, c []byte,
+//     rounds uint32,
+// ) (retA uint32, retB uint32, retC uint32)
+TEXT ·castagnoliSSE42Triple(SB), NOSPLIT, $0
+	MOVL crcA+0(FP), AX
+	MOVL crcB+4(FP), CX
+	MOVL crcC+8(FP), DX
+
+	MOVQ a+16(FP), R8 // data pointer
+	MOVQ b+40(FP), R9 // data pointer
+	MOVQ c+64(FP), R10 // data pointer
+
+	MOVL rounds+88(FP), R11
+
+loop:
+	CRC32Q (R8), AX
+	CRC32Q (R9), CX
+	CRC32Q (R10), DX
+
+	CRC32Q 8(R8), AX
+	CRC32Q 8(R9), CX
+	CRC32Q 8(R10), DX
+
+	CRC32Q 16(R8), AX
+	CRC32Q 16(R9), CX
+	CRC32Q 16(R10), DX
+
+	ADDQ $24, R8
+	ADDQ $24, R9
+	ADDQ $24, R10
+
+	DECQ R11
+	JNZ loop
+
+	MOVL AX, retA+96(FP)
+	MOVL CX, retB+100(FP)
+	MOVL DX, retC+104(FP)
+	RET
+
+// CRC32 polynomial data
+//
+// These constants are lifted from the
+// Linux kernel, since they avoid the costly
+// PSHUFB 16 byte reversal proposed in the
+// original Intel paper.
+DATA r2r1<>+0(SB)/8, $0x154442bd4
+DATA r2r1<>+8(SB)/8, $0x1c6e41596
+DATA r4r3<>+0(SB)/8, $0x1751997d0
+DATA r4r3<>+8(SB)/8, $0x0ccaa009e
+DATA rupoly<>+0(SB)/8, $0x1db710641
+DATA rupoly<>+8(SB)/8, $0x1f7011641
+DATA r5<>+0(SB)/8, $0x163cd6124
+
+GLOBL r2r1<>(SB), RODATA, $16
+GLOBL r4r3<>(SB), RODATA, $16
+GLOBL rupoly<>(SB), RODATA, $16
+GLOBL r5<>(SB), RODATA, $8
+
+// Based on https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+// len(p) must be at least 64, and must be a multiple of 16.
+
+// func ieeeCLMUL(crc uint32, p []byte) uint32
+TEXT ·ieeeCLMUL(SB), NOSPLIT, $0
+	MOVL crc+0(FP), X0 // Initial CRC value
+	MOVQ p+8(FP), SI // data pointer
+	MOVQ p_len+16(FP), CX // len(p)
+
+	MOVOU (SI), X1
+	MOVOU 16(SI), X2
+	MOVOU 32(SI), X3
+	MOVOU 48(SI), X4
+	PXOR X0, X1
+	ADDQ $64, SI // buf+=64
+	SUBQ $64, CX // len-=64
+	CMPQ CX, $64 // Less than 64 bytes left
+	JB remain64
+
+	MOVOA r2r1<>+0(SB), X0
+
+loopback64:
+	MOVOA X1, X5
+	MOVOA X2, X6
+	MOVOA X3, X7
+	MOVOA X4, X8
+
+	PCLMULQDQ $0, X0, X1
+	PCLMULQDQ $0, X0, X2
+	PCLMULQDQ $0, X0, X3
+	PCLMULQDQ $0, X0, X4
+
+	// Load next early
+	MOVOU (SI), X11
+	MOVOU 16(SI), X12
+	MOVOU 32(SI), X13
+	MOVOU 48(SI), X14
+
+	PCLMULQDQ $0x11, X0, X5
+	PCLMULQDQ $0x11, X0, X6
+	PCLMULQDQ $0x11, X0, X7
+	PCLMULQDQ $0x11, X0, X8
+
+	PXOR X5, X1
+	PXOR X6, X2
+	PXOR X7, X3
+	PXOR X8, X4
+
+	PXOR X11, X1
+	PXOR X12, X2
+	PXOR X13, X3
+	PXOR X14, X4
+
+	ADDQ $0x40, DI
+	ADDQ $64, SI // buf+=64
+	SUBQ $64, CX // len-=64
+	CMPQ CX, $64 // Less than 64 bytes left?
+	JGE loopback64
+
+	// Fold result into a single register (X1)
+remain64:
+	MOVOA r4r3<>+0(SB), X0
+
+	MOVOA X1, X5
+	PCLMULQDQ $0, X0, X1
+	PCLMULQDQ $0x11, X0, X5
+	PXOR X5, X1
+	PXOR X2, X1
+
+	MOVOA X1, X5
+	PCLMULQDQ $0, X0, X1
+	PCLMULQDQ $0x11, X0, X5
+	PXOR X5, X1
+	PXOR X3, X1
+
+	MOVOA X1, X5
+	PCLMULQDQ $0, X0, X1
+	PCLMULQDQ $0x11, X0, X5
+	PXOR X5, X1
+	PXOR X4, X1
+
+	// If there is less than 16 bytes left we are done
+	CMPQ CX, $16
+	JB finish
+
+	// Encode 16 bytes
+remain16:
+	MOVOU (SI), X10
+	MOVOA X1, X5
+	PCLMULQDQ $0, X0, X1
+	PCLMULQDQ $0x11, X0, X5
+	PXOR X5, X1
+	PXOR X10, X1
+	SUBQ $16, CX
+	ADDQ $16, SI
+	CMPQ CX, $16
+	JGE remain16
+
+finish:
+	// Fold final result into 32 bits and return it
+	PCMPEQB X3, X3
+	PCLMULQDQ $1, X1, X0
+	PSRLDQ $8, X1
+	PXOR X0, X1
+
+	MOVOA X1, X2
+	MOVQ r5<>+0(SB), X0
+
+	// Creates 32 bit mask. Note that we don't care about upper half.
+	PSRLQ $32, X3
+
+	PSRLDQ $4, X2
+	PAND X3, X1
+	PCLMULQDQ $0, X0, X1
+	PXOR X2, X1
+
+	MOVOA rupoly<>+0(SB), X0
+
+	MOVOA X1, X2
+	PAND X3, X1
+	PCLMULQDQ $0x10, X0, X1
+	PAND X3, X1
+	PCLMULQDQ $0, X0, X1
+	PXOR X2, X1
+
+	PEXTRD $1, X1, AX
+	MOVL AX, ret+32(FP)
+
+	RET
+
+DATA r2r1X<>+0(SB)/8, $0x154442bd4
+DATA r2r1X<>+8(SB)/8, $0x1c6e41596
+DATA r2r1X<>+16(SB)/8, $0x154442bd4
+DATA r2r1X<>+24(SB)/8, $0x1c6e41596
+DATA r2r1X<>+32(SB)/8, $0x154442bd4
+DATA r2r1X<>+40(SB)/8, $0x1c6e41596
+DATA r2r1X<>+48(SB)/8, $0x154442bd4
+DATA r2r1X<>+56(SB)/8, $0x1c6e41596
+GLOBL r2r1X<>(SB), RODATA, $64
+
+// Based on https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+// len(p) must be at least 128, and must be a multiple of 16.
+
+// func ieeeCLMULAvx512(crc uint32, p []byte) uint32
+TEXT ·ieeeCLMULAvx512(SB), NOSPLIT, $0
+	MOVL crc+0(FP), AX // Initial CRC value
+	MOVQ p+8(FP), SI // data pointer
+	MOVQ p_len+16(FP), CX // len(p)
+
+	VPXORQ Z0, Z0, Z0
+	VMOVDQU64 (SI), Z1
+	VMOVQ AX, X0
+	VPXORQ Z0, Z1, Z1 // Merge initial CRC value into Z1
+	ADDQ $64, SI // buf+=64
+	SUBQ $64, CX // len-=64
+
+	VMOVDQU64 r2r1X<>+0(SB), Z0
+
+loopback64:
+	// Load next early
+	VMOVDQU64 (SI), Z11
+
+	VPCLMULQDQ $0x11, Z0, Z1, Z5
+	VPCLMULQDQ $0, Z0, Z1, Z1
+
+	VPTERNLOGD $0x96, Z11, Z5, Z1 // Combine results with xor into Z1
+
+	ADDQ $0x40, DI
+	ADDQ $64, SI // buf+=64
+	SUBQ $64, CX // len-=64
+	CMPQ CX, $64 // Less than 64 bytes left?
+	JGE loopback64
+
+	// Fold result into a single register (X1)
+remain64:
+	VEXTRACTF32X4 $1, Z1, X2 // X2: Second 128-bit lane
+	VEXTRACTF32X4 $2, Z1, X3 // X3: Third 128-bit lane
+	VEXTRACTF32X4 $3, Z1, X4 // X4: Fourth 128-bit lane
+
+	MOVOA r4r3<>+0(SB), X0
+
+	MOVOA X1, X5
+	PCLMULQDQ $0, X0, X1
+	PCLMULQDQ $0x11, X0, X5
+	PXOR X5, X1
+	PXOR X2, X1
+
+	MOVOA X1, X5
+	PCLMULQDQ $0, X0, X1
+	PCLMULQDQ $0x11, X0, X5
+	PXOR X5, X1
+	PXOR X3, X1
+
+	MOVOA X1, X5
+	PCLMULQDQ $0, X0, X1
+	PCLMULQDQ $0x11, X0, X5
+	PXOR X5, X1
+	PXOR X4, X1
+
+	// If there is less than 16 bytes left we are done
+	CMPQ CX, $16
+	JB finish
+
+	// Encode 16 bytes
+remain16:
+	MOVOU (SI), X10
+	MOVOA X1, X5
+	PCLMULQDQ $0, X0, X1
+	PCLMULQDQ $0x11, X0, X5
+	PXOR X5, X1
+	PXOR X10, X1
+	SUBQ $16, CX
+	ADDQ $16, SI
+	CMPQ CX, $16
+	JGE remain16
+
+finish:
+	// Fold final result into 32 bits and return it
+	PCMPEQB X3, X3
+	PCLMULQDQ $1, X1, X0
+	PSRLDQ $8, X1
+	PXOR X0, X1
+
+	MOVOA X1, X2
+	MOVQ r5<>+0(SB), X0
+
+	// Creates 32 bit mask. Note that we don't care about upper half.
+	PSRLQ $32, X3
+
+	PSRLDQ $4, X2
+	PAND X3, X1
+	PCLMULQDQ $0, X0, X1
+	PXOR X2, X1
+
+	MOVOA rupoly<>+0(SB), X0
+
+	MOVOA X1, X2
+	PAND X3, X1
+	PCLMULQDQ $0x10, X0, X1
+	PAND X3, X1
+	PCLMULQDQ $0, X0, X1
+	PXOR X2, X1
+
+	PEXTRD $1, X1, AX
+	MOVL AX, ret+32(FP)
+	VZEROUPPER
+	RET
+
+// Castagonli Polynomial constants
+DATA r2r1C<>+0(SB)/8, $0x0740eef02
+DATA r2r1C<>+8(SB)/8, $0x09e4addf8
+DATA r2r1C<>+16(SB)/8, $0x0740eef02
+DATA r2r1C<>+24(SB)/8, $0x09e4addf8
+DATA r2r1C<>+32(SB)/8, $0x0740eef02
+DATA r2r1C<>+40(SB)/8, $0x09e4addf8
+DATA r2r1C<>+48(SB)/8, $0x0740eef02
+DATA r2r1C<>+56(SB)/8, $0x09e4addf8
+GLOBL r2r1C<>(SB), RODATA, $64
+
+DATA r4r3C<>+0(SB)/8, $0xf20c0dfe
+DATA r4r3C<>+8(SB)/8, $0x14cd00bd6
+DATA rupolyC<>+0(SB)/8, $0x105ec76f0
+DATA rupolyC<>+8(SB)/8, $0xdea713f1
+DATA r5C<>+0(SB)/8, $0xdd45aab8
+
+GLOBL r4r3C<>(SB), RODATA, $16
+GLOBL rupolyC<>(SB), RODATA, $16
+GLOBL r5C<>(SB), RODATA, $8
+
+// Based on https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+// len(p) must be at least 128, and must be a multiple of 16.
+
+// func castagnoliCLMULAvx512(crc uint32, p []byte) uint32
+TEXT ·castagnoliCLMULAvx512(SB), NOSPLIT, $0
+	MOVL crc+0(FP), AX // Initial CRC value
+	MOVQ p+8(FP), SI // data pointer
+	MOVQ p_len+16(FP), CX // len(p)
+
+	VPXORQ Z0, Z0, Z0
+	VMOVDQU64 (SI), Z1
+	VMOVQ AX, X0
+	VPXORQ Z0, Z1, Z1 // Merge initial CRC value into Z1
+	ADDQ $64, SI // buf+=64
+	SUBQ $64, CX // len-=64
+
+	VMOVDQU64 r2r1C<>+0(SB), Z0
+
+loopback64:
+	// Load next early
+	VMOVDQU64 (SI), Z11
+
+	VPCLMULQDQ $0x11, Z0, Z1, Z5
+	VPCLMULQDQ $0, Z0, Z1, Z1
+
+	VPTERNLOGD $0x96, Z11, Z5, Z1 // Combine results with xor into Z1
+
+	ADDQ $0x40, DI
+	ADDQ $64, SI // buf+=64
+	SUBQ $64, CX // len-=64
+	CMPQ CX, $64 // Less than 64 bytes left?
+	JGE loopback64
+
+	// Fold result into a single register (X1)
+remain64:
+	VEXTRACTF32X4 $1, Z1, X2 // X2: Second 128-bit lane
+	VEXTRACTF32X4 $2, Z1, X3 // X3: Third 128-bit lane
+	VEXTRACTF32X4 $3, Z1, X4 // X4: Fourth 128-bit lane
+
+	MOVOA r4r3C<>+0(SB), X0
+
+	MOVOA X1, X5
+	PCLMULQDQ $0, X0, X1
+	PCLMULQDQ $0x11, X0, X5
+	PXOR X5, X1
+	PXOR X2, X1
+
+	MOVOA X1, X5
+	PCLMULQDQ $0, X0, X1
+	PCLMULQDQ $0x11, X0, X5
+	PXOR X5, X1
+	PXOR X3, X1
+
+	MOVOA X1, X5
+	PCLMULQDQ $0, X0, X1
+	PCLMULQDQ $0x11, X0, X5
+	PXOR X5, X1
+	PXOR X4, X1
+
+	// If there is less than 16 bytes left we are done
+	CMPQ CX, $16
+	JB finish
+
+	// Encode 16 bytes
+remain16:
+	MOVOU (SI), X10
+	MOVOA X1, X5
+	PCLMULQDQ $0, X0, X1
+	PCLMULQDQ $0x11, X0, X5
+	PXOR X5, X1
+	PXOR X10, X1
+	SUBQ $16, CX
+	ADDQ $16, SI
+	CMPQ CX, $16
+	JGE remain16
+
+finish:
+	// Fold final result into 32 bits and return it
+	PCMPEQB X3, X3
+	PCLMULQDQ $1, X1, X0
+	PSRLDQ $8, X1
+	PXOR X0, X1
+
+	MOVOA X1, X2
+	MOVQ r5C<>+0(SB), X0
+
+	// Creates 32 bit mask. Note that we don't care about upper half.
+	PSRLQ $32, X3
+
+	PSRLDQ $4, X2
+	PAND X3, X1
+	PCLMULQDQ $0, X0, X1
+	PXOR X2, X1
+
+	MOVOA rupolyC<>+0(SB), X0
+
+	MOVOA X1, X2
+	PAND X3, X1
+	PCLMULQDQ $0x10, X0, X1
+	PAND X3, X1
+	PCLMULQDQ $0, X0, X1
+	PXOR X2, X1
+
+	PEXTRD $1, X1, AX
+	MOVL AX, ret+32(FP)
+	VZEROUPPER
+	RET
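For context, the routines in this file are plain Go assembly and only link against Go declarations with the signatures given in their comments (castagnoliSSE42, ieeeCLMUL, and so on). The snippet below is a minimal sketch of such a companion file, not the package's actual crc32_amd64.go: the //go:noescape directives, the wrapper names, and the ^crc inversion around each call (the routines work on the non-inverted CRC, per the comments above) are assumptions made here for illustration, and the length handling mirrors the documented requirement that ieeeCLMUL needs at least 64 bytes in multiples of 16.

// Hypothetical Go companion file (sketch only; the real crc32_amd64.go in
// github.com/klauspost/crc32 is authoritative).
package crc32

//go:noescape
func castagnoliSSE42(crc uint32, p []byte) uint32

//go:noescape
func ieeeCLMUL(crc uint32, p []byte) uint32

// updateCastagnoliSSE42 wraps the SSE4.2 routine. The assembly updates the
// non-inverted CRC, so the usual CRC-32 bit inversion is applied around the call.
func updateCastagnoliSSE42(crc uint32, p []byte) uint32 {
	return ^castagnoliSSE42(^crc, p)
}

// updateIEEECLMUL hands ieeeCLMUL only input that satisfies its documented
// requirements (at least 64 bytes, a multiple of 16) and reports how many
// bytes were consumed; the caller finishes any remainder with its scalar path.
func updateIEEECLMUL(crc uint32, p []byte) (uint32, int) {
	n := len(p) &^ 15 // round down to a multiple of 16
	if n < 64 {
		return crc, 0
	}
	return ^ieeeCLMUL(^crc, p[:n]), n
}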
