diff options
Diffstat (limited to 'vendor/github.com/zeebo/blake3/internal/alg/compress')
5 files changed, 725 insertions, 0 deletions
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress.go b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress.go new file mode 100644 index 000000000..0b2685408 --- /dev/null +++ b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress.go @@ -0,0 +1,15 @@ +package compress + +import ( + "github.com/zeebo/blake3/internal/alg/compress/compress_pure" + "github.com/zeebo/blake3/internal/alg/compress/compress_sse41" + "github.com/zeebo/blake3/internal/consts" +) + +func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32) { + if consts.HasSSE41 { + compress_sse41.Compress(chain, block, counter, blen, flags, out) + } else { + compress_pure.Compress(chain, block, counter, blen, flags, out) + } +} diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_pure/compress.go b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_pure/compress.go new file mode 100644 index 000000000..66ea1fb75 --- /dev/null +++ b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_pure/compress.go @@ -0,0 +1,135 @@ +package compress_pure + +import ( + "math/bits" + + "github.com/zeebo/blake3/internal/consts" +) + +func Compress( + chain *[8]uint32, + block *[16]uint32, + counter uint64, + blen uint32, + flags uint32, + out *[16]uint32, +) { + + *out = [16]uint32{ + chain[0], chain[1], chain[2], chain[3], + chain[4], chain[5], chain[6], chain[7], + consts.IV0, consts.IV1, consts.IV2, consts.IV3, + uint32(counter), uint32(counter >> 32), blen, flags, + } + + rcompress(out, block) +} + +func g(a, b, c, d, mx, my uint32) (uint32, uint32, uint32, uint32) { + a += b + mx + d = bits.RotateLeft32(d^a, -16) + c += d + b = bits.RotateLeft32(b^c, -12) + a += b + my + d = bits.RotateLeft32(d^a, -8) + c += d + b = bits.RotateLeft32(b^c, -7) + return a, b, c, d +} + +func rcompress(s *[16]uint32, m *[16]uint32) { + const ( + a = 10 + b = 11 + c = 12 + d = 13 + e = 14 + f = 15 + ) + + s0, s1, s2, s3 := s[0+0], s[0+1], s[0+2], s[0+3] + s4, s5, s6, s7 := s[0+4], s[0+5], s[0+6], s[0+7] + s8, s9, sa, sb := s[8+0], s[8+1], s[8+2], s[8+3] + sc, sd, se, sf := s[8+4], s[8+5], s[8+6], s[8+7] + + s0, s4, s8, sc = g(s0, s4, s8, sc, m[0], m[1]) + s1, s5, s9, sd = g(s1, s5, s9, sd, m[2], m[3]) + s2, s6, sa, se = g(s2, s6, sa, se, m[4], m[5]) + s3, s7, sb, sf = g(s3, s7, sb, sf, m[6], m[7]) + s0, s5, sa, sf = g(s0, s5, sa, sf, m[8], m[9]) + s1, s6, sb, sc = g(s1, s6, sb, sc, m[a], m[b]) + s2, s7, s8, sd = g(s2, s7, s8, sd, m[c], m[d]) + s3, s4, s9, se = g(s3, s4, s9, se, m[e], m[f]) + + s0, s4, s8, sc = g(s0, s4, s8, sc, m[2], m[6]) + s1, s5, s9, sd = g(s1, s5, s9, sd, m[3], m[a]) + s2, s6, sa, se = g(s2, s6, sa, se, m[7], m[0]) + s3, s7, sb, sf = g(s3, s7, sb, sf, m[4], m[d]) + s0, s5, sa, sf = g(s0, s5, sa, sf, m[1], m[b]) + s1, s6, sb, sc = g(s1, s6, sb, sc, m[c], m[5]) + s2, s7, s8, sd = g(s2, s7, s8, sd, m[9], m[e]) + s3, s4, s9, se = g(s3, s4, s9, se, m[f], m[8]) + + s0, s4, s8, sc = g(s0, s4, s8, sc, m[3], m[4]) + s1, s5, s9, sd = g(s1, s5, s9, sd, m[a], m[c]) + s2, s6, sa, se = g(s2, s6, sa, se, m[d], m[2]) + s3, s7, sb, sf = g(s3, s7, sb, sf, m[7], m[e]) + s0, s5, sa, sf = g(s0, s5, sa, sf, m[6], m[5]) + s1, s6, sb, sc = g(s1, s6, sb, sc, m[9], m[0]) + s2, s7, s8, sd = g(s2, s7, s8, sd, m[b], m[f]) + s3, s4, s9, se = g(s3, s4, s9, se, m[8], m[1]) + + s0, s4, s8, sc = g(s0, s4, s8, sc, m[a], m[7]) + s1, s5, s9, sd = g(s1, s5, s9, sd, m[c], m[9]) + s2, s6, sa, se = g(s2, s6, sa, se, m[e], m[3]) + s3, s7, sb, sf = g(s3, s7, sb, sf, m[d], m[f]) + s0, s5, sa, sf = g(s0, s5, sa, sf, m[4], m[0]) + s1, s6, sb, sc = g(s1, s6, sb, sc, m[b], m[2]) + s2, s7, s8, sd = g(s2, s7, s8, sd, m[5], m[8]) + s3, s4, s9, se = g(s3, s4, s9, se, m[1], m[6]) + + s0, s4, s8, sc = g(s0, s4, s8, sc, m[c], m[d]) + s1, s5, s9, sd = g(s1, s5, s9, sd, m[9], m[b]) + s2, s6, sa, se = g(s2, s6, sa, se, m[f], m[a]) + s3, s7, sb, sf = g(s3, s7, sb, sf, m[e], m[8]) + s0, s5, sa, sf = g(s0, s5, sa, sf, m[7], m[2]) + s1, s6, sb, sc = g(s1, s6, sb, sc, m[5], m[3]) + s2, s7, s8, sd = g(s2, s7, s8, sd, m[0], m[1]) + s3, s4, s9, se = g(s3, s4, s9, se, m[6], m[4]) + + s0, s4, s8, sc = g(s0, s4, s8, sc, m[9], m[e]) + s1, s5, s9, sd = g(s1, s5, s9, sd, m[b], m[5]) + s2, s6, sa, se = g(s2, s6, sa, se, m[8], m[c]) + s3, s7, sb, sf = g(s3, s7, sb, sf, m[f], m[1]) + s0, s5, sa, sf = g(s0, s5, sa, sf, m[d], m[3]) + s1, s6, sb, sc = g(s1, s6, sb, sc, m[0], m[a]) + s2, s7, s8, sd = g(s2, s7, s8, sd, m[2], m[6]) + s3, s4, s9, se = g(s3, s4, s9, se, m[4], m[7]) + + s0, s4, s8, sc = g(s0, s4, s8, sc, m[b], m[f]) + s1, s5, s9, sd = g(s1, s5, s9, sd, m[5], m[0]) + s2, s6, sa, se = g(s2, s6, sa, se, m[1], m[9]) + s3, s7, sb, sf = g(s3, s7, sb, sf, m[8], m[6]) + s0, s5, sa, sf = g(s0, s5, sa, sf, m[e], m[a]) + s1, s6, sb, sc = g(s1, s6, sb, sc, m[2], m[c]) + s2, s7, s8, sd = g(s2, s7, s8, sd, m[3], m[4]) + s3, s4, s9, se = g(s3, s4, s9, se, m[7], m[d]) + + s[8+0] = s8 ^ s[0] + s[8+1] = s9 ^ s[1] + s[8+2] = sa ^ s[2] + s[8+3] = sb ^ s[3] + s[8+4] = sc ^ s[4] + s[8+5] = sd ^ s[5] + s[8+6] = se ^ s[6] + s[8+7] = sf ^ s[7] + + s[0] = s0 ^ s8 + s[1] = s1 ^ s9 + s[2] = s2 ^ sa + s[3] = s3 ^ sb + s[4] = s4 ^ sc + s[5] = s5 ^ sd + s[6] = s6 ^ se + s[7] = s7 ^ sf +} diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_amd64.s b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_amd64.s new file mode 100644 index 000000000..0fedf0b3a --- /dev/null +++ b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_amd64.s @@ -0,0 +1,560 @@ +// Code generated by command: go run compress.go. DO NOT EDIT. + +#include "textflag.h" + +DATA iv<>+0(SB)/4, $0x6a09e667 +DATA iv<>+4(SB)/4, $0xbb67ae85 +DATA iv<>+8(SB)/4, $0x3c6ef372 +DATA iv<>+12(SB)/4, $0xa54ff53a +DATA iv<>+16(SB)/4, $0x510e527f +DATA iv<>+20(SB)/4, $0x9b05688c +DATA iv<>+24(SB)/4, $0x1f83d9ab +DATA iv<>+28(SB)/4, $0x5be0cd19 +GLOBL iv<>(SB), RODATA|NOPTR, $32 + +DATA rot16_shuf<>+0(SB)/1, $0x02 +DATA rot16_shuf<>+1(SB)/1, $0x03 +DATA rot16_shuf<>+2(SB)/1, $0x00 +DATA rot16_shuf<>+3(SB)/1, $0x01 +DATA rot16_shuf<>+4(SB)/1, $0x06 +DATA rot16_shuf<>+5(SB)/1, $0x07 +DATA rot16_shuf<>+6(SB)/1, $0x04 +DATA rot16_shuf<>+7(SB)/1, $0x05 +DATA rot16_shuf<>+8(SB)/1, $0x0a +DATA rot16_shuf<>+9(SB)/1, $0x0b +DATA rot16_shuf<>+10(SB)/1, $0x08 +DATA rot16_shuf<>+11(SB)/1, $0x09 +DATA rot16_shuf<>+12(SB)/1, $0x0e +DATA rot16_shuf<>+13(SB)/1, $0x0f +DATA rot16_shuf<>+14(SB)/1, $0x0c +DATA rot16_shuf<>+15(SB)/1, $0x0d +DATA rot16_shuf<>+16(SB)/1, $0x12 +DATA rot16_shuf<>+17(SB)/1, $0x13 +DATA rot16_shuf<>+18(SB)/1, $0x10 +DATA rot16_shuf<>+19(SB)/1, $0x11 +DATA rot16_shuf<>+20(SB)/1, $0x16 +DATA rot16_shuf<>+21(SB)/1, $0x17 +DATA rot16_shuf<>+22(SB)/1, $0x14 +DATA rot16_shuf<>+23(SB)/1, $0x15 +DATA rot16_shuf<>+24(SB)/1, $0x1a +DATA rot16_shuf<>+25(SB)/1, $0x1b +DATA rot16_shuf<>+26(SB)/1, $0x18 +DATA rot16_shuf<>+27(SB)/1, $0x19 +DATA rot16_shuf<>+28(SB)/1, $0x1e +DATA rot16_shuf<>+29(SB)/1, $0x1f +DATA rot16_shuf<>+30(SB)/1, $0x1c +DATA rot16_shuf<>+31(SB)/1, $0x1d +GLOBL rot16_shuf<>(SB), RODATA|NOPTR, $32 + +DATA rot8_shuf<>+0(SB)/1, $0x01 +DATA rot8_shuf<>+1(SB)/1, $0x02 +DATA rot8_shuf<>+2(SB)/1, $0x03 +DATA rot8_shuf<>+3(SB)/1, $0x00 +DATA rot8_shuf<>+4(SB)/1, $0x05 +DATA rot8_shuf<>+5(SB)/1, $0x06 +DATA rot8_shuf<>+6(SB)/1, $0x07 +DATA rot8_shuf<>+7(SB)/1, $0x04 +DATA rot8_shuf<>+8(SB)/1, $0x09 +DATA rot8_shuf<>+9(SB)/1, $0x0a +DATA rot8_shuf<>+10(SB)/1, $0x0b +DATA rot8_shuf<>+11(SB)/1, $0x08 +DATA rot8_shuf<>+12(SB)/1, $0x0d +DATA rot8_shuf<>+13(SB)/1, $0x0e +DATA rot8_shuf<>+14(SB)/1, $0x0f +DATA rot8_shuf<>+15(SB)/1, $0x0c +DATA rot8_shuf<>+16(SB)/1, $0x11 +DATA rot8_shuf<>+17(SB)/1, $0x12 +DATA rot8_shuf<>+18(SB)/1, $0x13 +DATA rot8_shuf<>+19(SB)/1, $0x10 +DATA rot8_shuf<>+20(SB)/1, $0x15 +DATA rot8_shuf<>+21(SB)/1, $0x16 +DATA rot8_shuf<>+22(SB)/1, $0x17 +DATA rot8_shuf<>+23(SB)/1, $0x14 +DATA rot8_shuf<>+24(SB)/1, $0x19 +DATA rot8_shuf<>+25(SB)/1, $0x1a +DATA rot8_shuf<>+26(SB)/1, $0x1b +DATA rot8_shuf<>+27(SB)/1, $0x18 +DATA rot8_shuf<>+28(SB)/1, $0x1d +DATA rot8_shuf<>+29(SB)/1, $0x1e +DATA rot8_shuf<>+30(SB)/1, $0x1f +DATA rot8_shuf<>+31(SB)/1, $0x1c +GLOBL rot8_shuf<>(SB), RODATA|NOPTR, $32 + +// func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32) +// Requires: SSE, SSE2, SSE4.1, SSSE3 +TEXT ·Compress(SB), NOSPLIT, $0-40 + MOVQ chain+0(FP), AX + MOVQ block+8(FP), CX + MOVQ counter+16(FP), DX + MOVL blen+24(FP), BX + MOVL flags+28(FP), BP + MOVQ out+32(FP), SI + MOVUPS (AX), X0 + MOVUPS 16(AX), X1 + MOVUPS iv<>+0(SB), X2 + PINSRD $0x00, DX, X3 + SHRQ $0x20, DX + PINSRD $0x01, DX, X3 + PINSRD $0x02, BX, X3 + PINSRD $0x03, BP, X3 + MOVUPS (CX), X4 + MOVUPS 16(CX), X5 + MOVUPS 32(CX), X6 + MOVUPS 48(CX), X7 + MOVUPS rot16_shuf<>+0(SB), X8 + MOVUPS rot8_shuf<>+0(SB), X9 + + // round 1 + MOVAPS X4, X10 + SHUFPS $0x88, X5, X10 + PADDD X10, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X8, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X11 + PSRLL $0x0c, X1 + PSLLL $0x14, X11 + POR X11, X1 + MOVAPS X4, X4 + SHUFPS $0xdd, X5, X4 + PADDD X4, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X9, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X5 + PSRLL $0x07, X1 + PSLLL $0x19, X5 + POR X5, X1 + PSHUFD $0x93, X0, X0 + PSHUFD $0x4e, X3, X3 + PSHUFD $0x39, X2, X2 + MOVAPS X6, X5 + SHUFPS $0x88, X7, X5 + SHUFPS $0x93, X5, X5 + PADDD X5, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X8, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X11 + PSRLL $0x0c, X1 + PSLLL $0x14, X11 + POR X11, X1 + MOVAPS X6, X6 + SHUFPS $0xdd, X7, X6 + SHUFPS $0x93, X6, X6 + PADDD X6, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X9, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X7 + PSRLL $0x07, X1 + PSLLL $0x19, X7 + POR X7, X1 + PSHUFD $0x39, X0, X0 + PSHUFD $0x4e, X3, X3 + PSHUFD $0x93, X2, X2 + + // round 2 + MOVAPS X10, X7 + SHUFPS $0xd6, X4, X7 + SHUFPS $0x39, X7, X7 + PADDD X7, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X8, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X11 + PSRLL $0x0c, X1 + PSLLL $0x14, X11 + POR X11, X1 + MOVAPS X5, X11 + SHUFPS $0xfa, X6, X11 + PSHUFD $0x0f, X10, X10 + PBLENDW $0x33, X10, X11 + PADDD X11, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X9, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X10 + PSRLL $0x07, X1 + PSLLL $0x19, X10 + POR X10, X1 + PSHUFD $0x93, X0, X0 + PSHUFD $0x4e, X3, X3 + PSHUFD $0x39, X2, X2 + MOVAPS X6, X12 + PUNPCKLLQ X4, X12 + PBLENDW $0xc0, X5, X12 + SHUFPS $0xb4, X12, X12 + PADDD X12, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X8, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X10 + PSRLL $0x0c, X1 + PSLLL $0x14, X10 + POR X10, X1 + MOVAPS X4, X10 + PUNPCKHLQ X6, X10 + MOVAPS X5, X4 + PUNPCKLLQ X10, X4 + SHUFPS $0x1e, X4, X4 + PADDD X4, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X9, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X5 + PSRLL $0x07, X1 + PSLLL $0x19, X5 + POR X5, X1 + PSHUFD $0x39, X0, X0 + PSHUFD $0x4e, X3, X3 + PSHUFD $0x93, X2, X2 + + // round 3 + MOVAPS X7, X5 + SHUFPS $0xd6, X11, X5 + SHUFPS $0x39, X5, X5 + PADDD X5, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X8, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X6 + PSRLL $0x0c, X1 + PSLLL $0x14, X6 + POR X6, X1 + MOVAPS X12, X6 + SHUFPS $0xfa, X4, X6 + PSHUFD $0x0f, X7, X7 + PBLENDW $0x33, X7, X6 + PADDD X6, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X9, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X7 + PSRLL $0x07, X1 + PSLLL $0x19, X7 + POR X7, X1 + PSHUFD $0x93, X0, X0 + PSHUFD $0x4e, X3, X3 + PSHUFD $0x39, X2, X2 + MOVAPS X4, X10 + PUNPCKLLQ X11, X10 + PBLENDW $0xc0, X12, X10 + SHUFPS $0xb4, X10, X10 + PADDD X10, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X8, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X7 + PSRLL $0x0c, X1 + PSLLL $0x14, X7 + POR X7, X1 + MOVAPS X11, X7 + PUNPCKHLQ X4, X7 + MOVAPS X12, X4 + PUNPCKLLQ X7, X4 + SHUFPS $0x1e, X4, X4 + PADDD X4, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X9, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X7 + PSRLL $0x07, X1 + PSLLL $0x19, X7 + POR X7, X1 + PSHUFD $0x39, X0, X0 + PSHUFD $0x4e, X3, X3 + PSHUFD $0x93, X2, X2 + + // round 4 + MOVAPS X5, X7 + SHUFPS $0xd6, X6, X7 + SHUFPS $0x39, X7, X7 + PADDD X7, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X8, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X11 + PSRLL $0x0c, X1 + PSLLL $0x14, X11 + POR X11, X1 + MOVAPS X10, X11 + SHUFPS $0xfa, X4, X11 + PSHUFD $0x0f, X5, X5 + PBLENDW $0x33, X5, X11 + PADDD X11, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X9, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X5 + PSRLL $0x07, X1 + PSLLL $0x19, X5 + POR X5, X1 + PSHUFD $0x93, X0, X0 + PSHUFD $0x4e, X3, X3 + PSHUFD $0x39, X2, X2 + MOVAPS X4, X12 + PUNPCKLLQ X6, X12 + PBLENDW $0xc0, X10, X12 + SHUFPS $0xb4, X12, X12 + PADDD X12, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X8, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X5 + PSRLL $0x0c, X1 + PSLLL $0x14, X5 + POR X5, X1 + MOVAPS X6, X5 + PUNPCKHLQ X4, X5 + MOVAPS X10, X4 + PUNPCKLLQ X5, X4 + SHUFPS $0x1e, X4, X4 + PADDD X4, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X9, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X5 + PSRLL $0x07, X1 + PSLLL $0x19, X5 + POR X5, X1 + PSHUFD $0x39, X0, X0 + PSHUFD $0x4e, X3, X3 + PSHUFD $0x93, X2, X2 + + // round 5 + MOVAPS X7, X5 + SHUFPS $0xd6, X11, X5 + SHUFPS $0x39, X5, X5 + PADDD X5, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X8, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X6 + PSRLL $0x0c, X1 + PSLLL $0x14, X6 + POR X6, X1 + MOVAPS X12, X6 + SHUFPS $0xfa, X4, X6 + PSHUFD $0x0f, X7, X7 + PBLENDW $0x33, X7, X6 + PADDD X6, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X9, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X7 + PSRLL $0x07, X1 + PSLLL $0x19, X7 + POR X7, X1 + PSHUFD $0x93, X0, X0 + PSHUFD $0x4e, X3, X3 + PSHUFD $0x39, X2, X2 + MOVAPS X4, X10 + PUNPCKLLQ X11, X10 + PBLENDW $0xc0, X12, X10 + SHUFPS $0xb4, X10, X10 + PADDD X10, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X8, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X7 + PSRLL $0x0c, X1 + PSLLL $0x14, X7 + POR X7, X1 + MOVAPS X11, X7 + PUNPCKHLQ X4, X7 + MOVAPS X12, X4 + PUNPCKLLQ X7, X4 + SHUFPS $0x1e, X4, X4 + PADDD X4, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X9, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X7 + PSRLL $0x07, X1 + PSLLL $0x19, X7 + POR X7, X1 + PSHUFD $0x39, X0, X0 + PSHUFD $0x4e, X3, X3 + PSHUFD $0x93, X2, X2 + + // round 6 + MOVAPS X5, X7 + SHUFPS $0xd6, X6, X7 + SHUFPS $0x39, X7, X7 + PADDD X7, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X8, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X11 + PSRLL $0x0c, X1 + PSLLL $0x14, X11 + POR X11, X1 + MOVAPS X10, X11 + SHUFPS $0xfa, X4, X11 + PSHUFD $0x0f, X5, X5 + PBLENDW $0x33, X5, X11 + PADDD X11, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X9, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X5 + PSRLL $0x07, X1 + PSLLL $0x19, X5 + POR X5, X1 + PSHUFD $0x93, X0, X0 + PSHUFD $0x4e, X3, X3 + PSHUFD $0x39, X2, X2 + MOVAPS X4, X12 + PUNPCKLLQ X6, X12 + PBLENDW $0xc0, X10, X12 + SHUFPS $0xb4, X12, X12 + PADDD X12, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X8, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X5 + PSRLL $0x0c, X1 + PSLLL $0x14, X5 + POR X5, X1 + MOVAPS X6, X5 + PUNPCKHLQ X4, X5 + MOVAPS X10, X4 + PUNPCKLLQ X5, X4 + SHUFPS $0x1e, X4, X4 + PADDD X4, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X9, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X5 + PSRLL $0x07, X1 + PSLLL $0x19, X5 + POR X5, X1 + PSHUFD $0x39, X0, X0 + PSHUFD $0x4e, X3, X3 + PSHUFD $0x93, X2, X2 + + // round 7 + MOVAPS X7, X5 + SHUFPS $0xd6, X11, X5 + SHUFPS $0x39, X5, X5 + PADDD X5, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X8, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X5 + PSRLL $0x0c, X1 + PSLLL $0x14, X5 + POR X5, X1 + MOVAPS X12, X5 + SHUFPS $0xfa, X4, X5 + PSHUFD $0x0f, X7, X6 + PBLENDW $0x33, X6, X5 + PADDD X5, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X9, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X5 + PSRLL $0x07, X1 + PSLLL $0x19, X5 + POR X5, X1 + PSHUFD $0x93, X0, X0 + PSHUFD $0x4e, X3, X3 + PSHUFD $0x39, X2, X2 + MOVAPS X4, X5 + PUNPCKLLQ X11, X5 + PBLENDW $0xc0, X12, X5 + SHUFPS $0xb4, X5, X5 + PADDD X5, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X8, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X5 + PSRLL $0x0c, X1 + PSLLL $0x14, X5 + POR X5, X1 + MOVAPS X11, X6 + PUNPCKHLQ X4, X6 + MOVAPS X12, X4 + PUNPCKLLQ X6, X4 + SHUFPS $0x1e, X4, X4 + PADDD X4, X0 + PADDD X1, X0 + PXOR X0, X3 + PSHUFB X9, X3 + PADDD X3, X2 + PXOR X2, X1 + MOVAPS X1, X4 + PSRLL $0x07, X1 + PSLLL $0x19, X4 + POR X4, X1 + PSHUFD $0x39, X0, X0 + PSHUFD $0x4e, X3, X3 + PSHUFD $0x93, X2, X2 + + // finalize + PXOR X2, X0 + PXOR X3, X1 + MOVUPS (AX), X4 + PXOR X4, X2 + MOVUPS 16(AX), X4 + PXOR X4, X3 + MOVUPS X0, (SI) + MOVUPS X1, 16(SI) + MOVUPS X2, 32(SI) + MOVUPS X3, 48(SI) + RET diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_other.go b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_other.go new file mode 100644 index 000000000..cd63e9740 --- /dev/null +++ b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_other.go @@ -0,0 +1,9 @@ +// +build !amd64 + +package compress_sse41 + +import "github.com/zeebo/blake3/internal/alg/compress/compress_pure" + +func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32) { + compress_pure.Compress(chain, block, counter, blen, flags, out) +} diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/stubs.go b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/stubs.go new file mode 100644 index 000000000..ffd932d3c --- /dev/null +++ b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/stubs.go @@ -0,0 +1,6 @@ +// +build amd64 + +package compress_sse41 + +//go:noescape +func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32) |