| author | 2025-03-09 17:47:56 +0100 |
|---|---|
| committer | 2025-12-01 22:08:04 +0100 |
| commit | b1af8fd87760b34e3ff2fd3bda38f211815a0473 (patch) |
| tree | 9317fad1a7ec298d7a8d2678e4e422953bbc6f33 /vendor/github.com/klauspost/crc32/crc32_amd64.s |
| parent | [chore] update URLs to forked source (diff) |
| download | gotosocial-b1af8fd87760b34e3ff2fd3bda38f211815a0473.tar.xz |
[chore] remove vendor
Diffstat (limited to 'vendor/github.com/klauspost/crc32/crc32_amd64.s')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | vendor/github.com/klauspost/crc32/crc32_amd64.s | 527 |
1 file changed, 0 insertions, 527 deletions
diff --git a/vendor/github.com/klauspost/crc32/crc32_amd64.s b/vendor/github.com/klauspost/crc32/crc32_amd64.s
deleted file mode 100644
index e2de3a5cb..000000000
--- a/vendor/github.com/klauspost/crc32/crc32_amd64.s
+++ /dev/null
@@ -1,527 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "textflag.h"
-
-// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
-//
-// func castagnoliSSE42(crc uint32, p []byte) uint32
-TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
-    MOVL crc+0(FP), AX      // CRC value
-    MOVQ p+8(FP), SI        // data pointer
-    MOVQ p_len+16(FP), CX   // len(p)
-
-    // If there are fewer than 8 bytes to process, skip alignment.
-    CMPQ CX, $8
-    JL less_than_8
-
-    MOVQ SI, BX
-    ANDQ $7, BX
-    JZ aligned
-
-    // Process the first few bytes to 8-byte align the input.
-
-    // BX = 8 - BX. We need to process this many bytes to align.
-    SUBQ $1, BX
-    XORQ $7, BX
-
-    BTQ $0, BX
-    JNC align_2
-
-    CRC32B (SI), AX
-    DECQ CX
-    INCQ SI
-
-align_2:
-    BTQ $1, BX
-    JNC align_4
-
-    CRC32W (SI), AX
-
-    SUBQ $2, CX
-    ADDQ $2, SI
-
-align_4:
-    BTQ $2, BX
-    JNC aligned
-
-    CRC32L (SI), AX
-
-    SUBQ $4, CX
-    ADDQ $4, SI
-
-aligned:
-    // The input is now 8-byte aligned and we can process 8-byte chunks.
-    CMPQ CX, $8
-    JL less_than_8
-
-    CRC32Q (SI), AX
-    ADDQ $8, SI
-    SUBQ $8, CX
-    JMP aligned
-
-less_than_8:
-    // We may have some bytes left over; process 4 bytes, then 2, then 1.
-    BTQ $2, CX
-    JNC less_than_4
-
-    CRC32L (SI), AX
-    ADDQ $4, SI
-
-less_than_4:
-    BTQ $1, CX
-    JNC less_than_2
-
-    CRC32W (SI), AX
-    ADDQ $2, SI
-
-less_than_2:
-    BTQ $0, CX
-    JNC done
-
-    CRC32B (SI), AX
-
-done:
-    MOVL AX, ret+32(FP)
-    RET
-
-// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
-// bytes from each buffer.
-//
-// func castagnoliSSE42Triple(
-//     crc1, crc2, crc3 uint32,
-//     a, b, c []byte,
-//     rounds uint32,
-// ) (retA uint32, retB uint32, retC uint32)
-TEXT ·castagnoliSSE42Triple(SB), NOSPLIT, $0
-    MOVL crcA+0(FP), AX
-    MOVL crcB+4(FP), CX
-    MOVL crcC+8(FP), DX
-
-    MOVQ a+16(FP), R8    // data pointer
-    MOVQ b+40(FP), R9    // data pointer
-    MOVQ c+64(FP), R10   // data pointer
-
-    MOVL rounds+88(FP), R11
-
-loop:
-    CRC32Q (R8), AX
-    CRC32Q (R9), CX
-    CRC32Q (R10), DX
-
-    CRC32Q 8(R8), AX
-    CRC32Q 8(R9), CX
-    CRC32Q 8(R10), DX
-
-    CRC32Q 16(R8), AX
-    CRC32Q 16(R9), CX
-    CRC32Q 16(R10), DX
-
-    ADDQ $24, R8
-    ADDQ $24, R9
-    ADDQ $24, R10
-
-    DECQ R11
-    JNZ loop
-
-    MOVL AX, retA+96(FP)
-    MOVL CX, retB+100(FP)
-    MOVL DX, retC+104(FP)
-    RET
-
-// CRC32 polynomial data
-//
-// These constants are lifted from the
-// Linux kernel, since they avoid the costly
-// PSHUFB 16 byte reversal proposed in the
-// original Intel paper.
-DATA r2r1<>+0(SB)/8, $0x154442bd4
-DATA r2r1<>+8(SB)/8, $0x1c6e41596
-DATA r4r3<>+0(SB)/8, $0x1751997d0
-DATA r4r3<>+8(SB)/8, $0x0ccaa009e
-DATA rupoly<>+0(SB)/8, $0x1db710641
-DATA rupoly<>+8(SB)/8, $0x1f7011641
-DATA r5<>+0(SB)/8, $0x163cd6124
-
-GLOBL r2r1<>(SB), RODATA, $16
-GLOBL r4r3<>(SB), RODATA, $16
-GLOBL rupoly<>(SB), RODATA, $16
-GLOBL r5<>(SB), RODATA, $8
-
-// Based on https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
-// len(p) must be at least 64, and must be a multiple of 16.
-
-// func ieeeCLMUL(crc uint32, p []byte) uint32
-TEXT ·ieeeCLMUL(SB), NOSPLIT, $0
-    MOVL crc+0(FP), X0      // Initial CRC value
-    MOVQ p+8(FP), SI        // data pointer
-    MOVQ p_len+16(FP), CX   // len(p)
-
-    MOVOU (SI), X1
-    MOVOU 16(SI), X2
-    MOVOU 32(SI), X3
-    MOVOU 48(SI), X4
-    PXOR X0, X1
-    ADDQ $64, SI    // buf+=64
-    SUBQ $64, CX    // len-=64
-    CMPQ CX, $64    // Less than 64 bytes left
-    JB remain64
-
-    MOVOA r2r1<>+0(SB), X0
-
-loopback64:
-    MOVOA X1, X5
-    MOVOA X2, X6
-    MOVOA X3, X7
-    MOVOA X4, X8
-
-    PCLMULQDQ $0, X0, X1
-    PCLMULQDQ $0, X0, X2
-    PCLMULQDQ $0, X0, X3
-    PCLMULQDQ $0, X0, X4
-
-    // Load next early
-    MOVOU (SI), X11
-    MOVOU 16(SI), X12
-    MOVOU 32(SI), X13
-    MOVOU 48(SI), X14
-
-    PCLMULQDQ $0x11, X0, X5
-    PCLMULQDQ $0x11, X0, X6
-    PCLMULQDQ $0x11, X0, X7
-    PCLMULQDQ $0x11, X0, X8
-
-    PXOR X5, X1
-    PXOR X6, X2
-    PXOR X7, X3
-    PXOR X8, X4
-
-    PXOR X11, X1
-    PXOR X12, X2
-    PXOR X13, X3
-    PXOR X14, X4
-
-    ADDQ $0x40, DI
-    ADDQ $64, SI    // buf+=64
-    SUBQ $64, CX    // len-=64
-    CMPQ CX, $64    // Less than 64 bytes left?
-    JGE loopback64
-
-    // Fold result into a single register (X1)
-remain64:
-    MOVOA r4r3<>+0(SB), X0
-
-    MOVOA X1, X5
-    PCLMULQDQ $0, X0, X1
-    PCLMULQDQ $0x11, X0, X5
-    PXOR X5, X1
-    PXOR X2, X1
-
-    MOVOA X1, X5
-    PCLMULQDQ $0, X0, X1
-    PCLMULQDQ $0x11, X0, X5
-    PXOR X5, X1
-    PXOR X3, X1
-
-    MOVOA X1, X5
-    PCLMULQDQ $0, X0, X1
-    PCLMULQDQ $0x11, X0, X5
-    PXOR X5, X1
-    PXOR X4, X1
-
-    // If there is less than 16 bytes left we are done
-    CMPQ CX, $16
-    JB finish
-
-    // Encode 16 bytes
-remain16:
-    MOVOU (SI), X10
-    MOVOA X1, X5
-    PCLMULQDQ $0, X0, X1
-    PCLMULQDQ $0x11, X0, X5
-    PXOR X5, X1
-    PXOR X10, X1
-    SUBQ $16, CX
-    ADDQ $16, SI
-    CMPQ CX, $16
-    JGE remain16
-
-finish:
-    // Fold final result into 32 bits and return it
-    PCMPEQB X3, X3
-    PCLMULQDQ $1, X1, X0
-    PSRLDQ $8, X1
-    PXOR X0, X1
-
-    MOVOA X1, X2
-    MOVQ r5<>+0(SB), X0
-
-    // Creates 32 bit mask. Note that we don't care about upper half.
-    PSRLQ $32, X3
-
-    PSRLDQ $4, X2
-    PAND X3, X1
-    PCLMULQDQ $0, X0, X1
-    PXOR X2, X1
-
-    MOVOA rupoly<>+0(SB), X0
-
-    MOVOA X1, X2
-    PAND X3, X1
-    PCLMULQDQ $0x10, X0, X1
-    PAND X3, X1
-    PCLMULQDQ $0, X0, X1
-    PXOR X2, X1
-
-    PEXTRD $1, X1, AX
-    MOVL AX, ret+32(FP)
-
-    RET
-
-DATA r2r1X<>+0(SB)/8, $0x154442bd4
-DATA r2r1X<>+8(SB)/8, $0x1c6e41596
-DATA r2r1X<>+16(SB)/8, $0x154442bd4
-DATA r2r1X<>+24(SB)/8, $0x1c6e41596
-DATA r2r1X<>+32(SB)/8, $0x154442bd4
-DATA r2r1X<>+40(SB)/8, $0x1c6e41596
-DATA r2r1X<>+48(SB)/8, $0x154442bd4
-DATA r2r1X<>+56(SB)/8, $0x1c6e41596
-GLOBL r2r1X<>(SB), RODATA, $64
-
-// Based on https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
-// len(p) must be at least 128, and must be a multiple of 16.
-
-// func ieeeCLMULAvx512(crc uint32, p []byte) uint32
-TEXT ·ieeeCLMULAvx512(SB), NOSPLIT, $0
-    MOVL crc+0(FP), AX      // Initial CRC value
-    MOVQ p+8(FP), SI        // data pointer
-    MOVQ p_len+16(FP), CX   // len(p)
-
-    VPXORQ Z0, Z0, Z0
-    VMOVDQU64 (SI), Z1
-    VMOVQ AX, X0
-    VPXORQ Z0, Z1, Z1   // Merge initial CRC value into Z1
-    ADDQ $64, SI        // buf+=64
-    SUBQ $64, CX        // len-=64
-
-    VMOVDQU64 r2r1X<>+0(SB), Z0
-
-loopback64:
-    // Load next early
-    VMOVDQU64 (SI), Z11
-
-    VPCLMULQDQ $0x11, Z0, Z1, Z5
-    VPCLMULQDQ $0, Z0, Z1, Z1
-
-    VPTERNLOGD $0x96, Z11, Z5, Z1   // Combine results with xor into Z1
-
-    ADDQ $0x40, DI
-    ADDQ $64, SI    // buf+=64
-    SUBQ $64, CX    // len-=64
-    CMPQ CX, $64    // Less than 64 bytes left?
-    JGE loopback64
-
-    // Fold result into a single register (X1)
-remain64:
-    VEXTRACTF32X4 $1, Z1, X2   // X2: Second 128-bit lane
-    VEXTRACTF32X4 $2, Z1, X3   // X3: Third 128-bit lane
-    VEXTRACTF32X4 $3, Z1, X4   // X4: Fourth 128-bit lane
-
-    MOVOA r4r3<>+0(SB), X0
-
-    MOVOA X1, X5
-    PCLMULQDQ $0, X0, X1
-    PCLMULQDQ $0x11, X0, X5
-    PXOR X5, X1
-    PXOR X2, X1
-
-    MOVOA X1, X5
-    PCLMULQDQ $0, X0, X1
-    PCLMULQDQ $0x11, X0, X5
-    PXOR X5, X1
-    PXOR X3, X1
-
-    MOVOA X1, X5
-    PCLMULQDQ $0, X0, X1
-    PCLMULQDQ $0x11, X0, X5
-    PXOR X5, X1
-    PXOR X4, X1
-
-    // If there is less than 16 bytes left we are done
-    CMPQ CX, $16
-    JB finish
-
-    // Encode 16 bytes
-remain16:
-    MOVOU (SI), X10
-    MOVOA X1, X5
-    PCLMULQDQ $0, X0, X1
-    PCLMULQDQ $0x11, X0, X5
-    PXOR X5, X1
-    PXOR X10, X1
-    SUBQ $16, CX
-    ADDQ $16, SI
-    CMPQ CX, $16
-    JGE remain16
-
-finish:
-    // Fold final result into 32 bits and return it
-    PCMPEQB X3, X3
-    PCLMULQDQ $1, X1, X0
-    PSRLDQ $8, X1
-    PXOR X0, X1
-
-    MOVOA X1, X2
-    MOVQ r5<>+0(SB), X0
-
-    // Creates 32 bit mask. Note that we don't care about upper half.
-    PSRLQ $32, X3
-
-    PSRLDQ $4, X2
-    PAND X3, X1
-    PCLMULQDQ $0, X0, X1
-    PXOR X2, X1
-
-    MOVOA rupoly<>+0(SB), X0
-
-    MOVOA X1, X2
-    PAND X3, X1
-    PCLMULQDQ $0x10, X0, X1
-    PAND X3, X1
-    PCLMULQDQ $0, X0, X1
-    PXOR X2, X1
-
-    PEXTRD $1, X1, AX
-    MOVL AX, ret+32(FP)
-    VZEROUPPER
-    RET
-
-// Castagonli Polynomial constants
-DATA r2r1C<>+0(SB)/8, $0x0740eef02
-DATA r2r1C<>+8(SB)/8, $0x09e4addf8
-DATA r2r1C<>+16(SB)/8, $0x0740eef02
-DATA r2r1C<>+24(SB)/8, $0x09e4addf8
-DATA r2r1C<>+32(SB)/8, $0x0740eef02
-DATA r2r1C<>+40(SB)/8, $0x09e4addf8
-DATA r2r1C<>+48(SB)/8, $0x0740eef02
-DATA r2r1C<>+56(SB)/8, $0x09e4addf8
-GLOBL r2r1C<>(SB), RODATA, $64
-
-DATA r4r3C<>+0(SB)/8, $0xf20c0dfe
-DATA r4r3C<>+8(SB)/8, $0x14cd00bd6
-DATA rupolyC<>+0(SB)/8, $0x105ec76f0
-DATA rupolyC<>+8(SB)/8, $0xdea713f1
-DATA r5C<>+0(SB)/8, $0xdd45aab8
-
-GLOBL r4r3C<>(SB), RODATA, $16
-GLOBL rupolyC<>(SB), RODATA, $16
-GLOBL r5C<>(SB), RODATA, $8
-
-// Based on https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
-// len(p) must be at least 128, and must be a multiple of 16.
-
-// func castagnoliCLMULAvx512(crc uint32, p []byte) uint32
-TEXT ·castagnoliCLMULAvx512(SB), NOSPLIT, $0
-    MOVL crc+0(FP), AX      // Initial CRC value
-    MOVQ p+8(FP), SI        // data pointer
-    MOVQ p_len+16(FP), CX   // len(p)
-
-    VPXORQ Z0, Z0, Z0
-    VMOVDQU64 (SI), Z1
-    VMOVQ AX, X0
-    VPXORQ Z0, Z1, Z1   // Merge initial CRC value into Z1
-    ADDQ $64, SI        // buf+=64
-    SUBQ $64, CX        // len-=64
-
-    VMOVDQU64 r2r1C<>+0(SB), Z0
-
-loopback64:
-    // Load next early
-    VMOVDQU64 (SI), Z11
-
-    VPCLMULQDQ $0x11, Z0, Z1, Z5
-    VPCLMULQDQ $0, Z0, Z1, Z1
-
-    VPTERNLOGD $0x96, Z11, Z5, Z1   // Combine results with xor into Z1
-
-    ADDQ $0x40, DI
-    ADDQ $64, SI    // buf+=64
-    SUBQ $64, CX    // len-=64
-    CMPQ CX, $64    // Less than 64 bytes left?
-    JGE loopback64
-
-    // Fold result into a single register (X1)
-remain64:
-    VEXTRACTF32X4 $1, Z1, X2   // X2: Second 128-bit lane
-    VEXTRACTF32X4 $2, Z1, X3   // X3: Third 128-bit lane
-    VEXTRACTF32X4 $3, Z1, X4   // X4: Fourth 128-bit lane
-
-    MOVOA r4r3C<>+0(SB), X0
-
-    MOVOA X1, X5
-    PCLMULQDQ $0, X0, X1
-    PCLMULQDQ $0x11, X0, X5
-    PXOR X5, X1
-    PXOR X2, X1
-
-    MOVOA X1, X5
-    PCLMULQDQ $0, X0, X1
-    PCLMULQDQ $0x11, X0, X5
-    PXOR X5, X1
-    PXOR X3, X1
-
-    MOVOA X1, X5
-    PCLMULQDQ $0, X0, X1
-    PCLMULQDQ $0x11, X0, X5
-    PXOR X5, X1
-    PXOR X4, X1
-
-    // If there is less than 16 bytes left we are done
-    CMPQ CX, $16
-    JB finish
-
-    // Encode 16 bytes
-remain16:
-    MOVOU (SI), X10
-    MOVOA X1, X5
-    PCLMULQDQ $0, X0, X1
-    PCLMULQDQ $0x11, X0, X5
-    PXOR X5, X1
-    PXOR X10, X1
-    SUBQ $16, CX
-    ADDQ $16, SI
-    CMPQ CX, $16
-    JGE remain16
-
-finish:
-    // Fold final result into 32 bits and return it
-    PCMPEQB X3, X3
-    PCLMULQDQ $1, X1, X0
-    PSRLDQ $8, X1
-    PXOR X0, X1
-
-    MOVOA X1, X2
-    MOVQ r5C<>+0(SB), X0
-
-    // Creates 32 bit mask. Note that we don't care about upper half.
-    PSRLQ $32, X3
-
-    PSRLDQ $4, X2
-    PAND X3, X1
-    PCLMULQDQ $0, X0, X1
-    PXOR X2, X1
-
-    MOVOA rupolyC<>+0(SB), X0
-
-    MOVOA X1, X2
-    PAND X3, X1
-    PCLMULQDQ $0x10, X0, X1
-    PAND X3, X1
-    PCLMULQDQ $0, X0, X1
-    PXOR X2, X1
-
-    PEXTRD $1, X1, AX
-    MOVL AX, ret+32(FP)
-    VZEROUPPER
-    RET
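For orientation, the removed assembly exposes plain Go stubs such as `func castagnoliSSE42(crc uint32, p []byte) uint32`, which, per the file's own comment, update a non-inverted CRC. Below is a minimal, hypothetical sketch of how such a stub is usually declared and wrapped on the Go side. The package name `crc32c`, the `hasSSE42` flag, and the `hash/crc32` fallback are illustrative assumptions, not the deleted package's actual wrapper code, and the stub only links when paired with the assembly file.

```go
// Hypothetical Go-side wrapper for the removed castagnoliSSE42 routine.
package crc32c

import "hash/crc32"

// Implemented in crc32_amd64.s; signature taken from the deleted file.
// It updates a *non-inverted* CRC, so the caller handles the usual
// pre/post inversion.
//
//go:noescape
func castagnoliSSE42(crc uint32, p []byte) uint32

var (
	hasSSE42        = false // assumed: set via CPU feature detection at init
	castagnoliTable = crc32.MakeTable(crc32.Castagnoli)
)

// Checksum returns the CRC-32C of p, preferring the SSE4.2 path when available.
func Checksum(p []byte) uint32 {
	if hasSSE42 {
		return ^castagnoliSSE42(^uint32(0), p) // pre- and post-invert around the asm core
	}
	return crc32.Update(0, castagnoliTable, p) // portable fallback; handles inversion itself
}
```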
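The three-buffer variant `castagnoliSSE42Triple` advances three independent CRC streams by 24*rounds bytes each, three `CRC32Q` chains per iteration, which hides the instruction's latency. Here is a hedged usage sketch in another file of the same illustrative package; it assumes three equally sized buffers whose length is a multiple of 24, which is a simplification of what a real caller would do.

```go
// Another file of the same illustrative crc32c package.
package crc32c

// Stub implemented in crc32_amd64.s; signature from the deleted file's comment.
//
//go:noescape
func castagnoliSSE42Triple(crcA, crcB, crcC uint32, a, b, c []byte, rounds uint32) (retA, retB, retC uint32)

// checksum3 computes CRC-32C over three equally sized buffers in one pass.
// Simplified sketch: assumes len(a) == len(b) == len(c) and a length that is
// a multiple of 24; a real caller would finish any tail with the
// single-stream routine.
func checksum3(a, b, c []byte) (uint32, uint32, uint32) {
	rounds := uint32(len(a) / 24) // each round consumes 24 bytes per buffer
	ca, cb, cc := castagnoliSSE42Triple(^uint32(0), ^uint32(0), ^uint32(0), a, b, c, rounds)
	return ^ca, ^cb, ^cc
}
```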
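The comment above `ieeeCLMUL` requires at least 64 bytes and a length that is a multiple of 16, so a dispatcher has to peel off the tail and finish it with a table-driven update. The sketch below shows one plausible shape for that logic; the `hasCLMUL` flag, the use of `hash/crc32` for the tail, and the pre/post inversion (inferred by analogy with `castagnoliSSE42`'s documented non-inverted convention) are assumptions.

```go
// Another file of the same illustrative crc32c package.
package crc32c

import "hash/crc32"

// Stub implemented in crc32_amd64.s; signature from the deleted file.
//
//go:noescape
func ieeeCLMUL(crc uint32, p []byte) uint32

var hasCLMUL = false // assumed: SSE4.2 + PCLMULQDQ detected at init

// ieeeUpdate adds p to an IEEE CRC-32. The CLMUL path is only used for a
// prefix that satisfies the documented constraints: at least 64 bytes and a
// multiple of 16.
func ieeeUpdate(crc uint32, p []byte) uint32 {
	if hasCLMUL && len(p) >= 64 {
		body := p[:len(p)&^15] // largest multiple-of-16 prefix (>= 64 here)
		// Pre/post inversion assumed, by analogy with castagnoliSSE42.
		crc = ^ieeeCLMUL(^crc, body)
		p = p[len(body):]
	}
	if len(p) > 0 {
		crc = crc32.Update(crc, crc32.IEEETable, p) // finish the tail in Go
	}
	return crc
}
```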
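Finally, choosing between the SSE4.2, 128-bit PCLMULQDQ, and AVX-512 (`*Avx512`) variants is a CPU-feature decision made once at startup. The deleted vendored package shipped its own detection; as a rough, hedged equivalent one could use golang.org/x/sys/cpu. The AVX-512 check below is deliberately simplified and omits the VPCLMULQDQ/VL sub-feature tests a production dispatcher would need.

```go
// Hypothetical feature detection for the flags used in the sketches above,
// in yet another file of the same illustrative crc32c package.
package crc32c

import "golang.org/x/sys/cpu"

var hasAVX512 bool // gate for the *Avx512 routines

func init() {
	hasSSE42 = cpu.X86.HasSSE42                 // CRC32B/W/L/Q path
	hasCLMUL = hasSSE42 && cpu.X86.HasPCLMULQDQ // 128-bit PCLMULQDQ folding path
	hasAVX512 = hasCLMUL && cpu.X86.HasAVX512F  // ZMM VPCLMULQDQ path (simplified check)
}
```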
