summary | refs | log | tree | commit | diff
path: root/vendor/github.com/zeebo/blake3/internal/alg/compress
diff options
context:
space:
mode:
author: tsmethurst <tobi.smethurst@protonmail.com>, 2022-01-16 18:52:30 +0100
committer: tsmethurst <tobi.smethurst@protonmail.com>, 2022-01-16 18:52:30 +0100
commit: 6f5ccf435585e43a00e3cc50f4bcefac36ada818 (patch)
tree: ba368d27464b79b1e5d010c0662fd3e340bf108e /vendor/github.com/zeebo/blake3/internal/alg/compress
parent: add go-runners to readme (diff)
download: gotosocial-6f5ccf435585e43a00e3cc50f4bcefac36ada818.tar.xz
update dependencies
Diffstat (limited to 'vendor/github.com/zeebo/blake3/internal/alg/compress')
-rw-r--r-- vendor/github.com/zeebo/blake3/internal/alg/compress/compress.go | 15
-rw-r--r-- vendor/github.com/zeebo/blake3/internal/alg/compress/compress_pure/compress.go | 135
-rw-r--r-- vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_amd64.s | 560
-rw-r--r-- vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_other.go | 9
-rw-r--r-- vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/stubs.go | 6
5 files changed, 725 insertions, 0 deletions
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress.go b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress.go
new file mode 100644
index 000000000..0b2685408
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress.go
@@ -0,0 +1,15 @@
+package compress
+
+import (
+ "github.com/zeebo/blake3/internal/alg/compress/compress_pure"
+ "github.com/zeebo/blake3/internal/alg/compress/compress_sse41"
+ "github.com/zeebo/blake3/internal/consts"
+)
+
+// Compress forwards its arguments unchanged to one of the two compression
+// implementations: compress_sse41.Compress when consts.HasSSE41 is set, and
+// compress_pure.Compress otherwise. The result is written through out.
+func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32) {
+	if !consts.HasSSE41 {
+		compress_pure.Compress(chain, block, counter, blen, flags, out)
+		return
+	}
+	compress_sse41.Compress(chain, block, counter, blen, flags, out)
+}
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_pure/compress.go b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_pure/compress.go
new file mode 100644
index 000000000..66ea1fb75
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_pure/compress.go
@@ -0,0 +1,135 @@
+package compress_pure
+
+import (
+ "math/bits"
+
+ "github.com/zeebo/blake3/internal/consts"
+)
+
+func Compress(
+ chain *[8]uint32,
+ block *[16]uint32,
+ counter uint64,
+ blen uint32,
+ flags uint32,
+ out *[16]uint32,
+) {
+
+ *out = [16]uint32{
+ chain[0], chain[1], chain[2], chain[3],
+ chain[4], chain[5], chain[6], chain[7],
+ consts.IV0, consts.IV1, consts.IV2, consts.IV3,
+ uint32(counter), uint32(counter >> 32), blen, flags,
+ }
+
+ rcompress(out, block)
+}
+
+// g mixes the two message words mx and my into the four state words a, b, c
+// and d. It uses wrapping 32-bit addition, xor, and right rotations by 16, 12,
+// 8 and 7 bits, and returns the updated words in the order a, b, c, d.
+func g(a, b, c, d, mx, my uint32) (uint32, uint32, uint32, uint32) {
+	// First half: fold in mx, rotating right by 16 and then by 12.
+	a = a + b + mx
+	d ^= a
+	d = bits.RotateLeft32(d, 32-16)
+	c = c + d
+	b ^= c
+	b = bits.RotateLeft32(b, 32-12)
+
+	// Second half: fold in my, rotating right by 8 and then by 7.
+	a = a + b + my
+	d ^= a
+	d = bits.RotateLeft32(d, 32-8)
+	c = c + d
+	b ^= c
+	b = bits.RotateLeft32(b, 32-7)
+
+	return a, b, c, d
+}
+
+// rcompress mixes the sixteen message words m into the sixteen-word state s,
+// in place, using seven fully unrolled rounds of g.
+//
+// Each round makes eight g calls. The first four work on the state words
+// (0,4,8,c), (1,5,9,d), (2,6,a,e) and (3,7,b,f); the last four work on
+// (0,5,a,f), (1,6,b,c), (2,7,8,d) and (3,4,9,e). The message-word order is
+// different in every round and is written out literally in the m[...] indices.
+//
+// After the rounds, s[8..15] receives the upper half of the mixed state xored
+// with the original s[0..7], and s[0..7] receives the lower half of the mixed
+// state xored with its upper half.
+func rcompress(s *[16]uint32, m *[16]uint32) {
+ // a..f name the indices 10..15 so every m[...] index is a single hex digit.
+ const (
+ a = 10
+ b = 11
+ c = 12
+ d = 13
+ e = 14
+ f = 15
+ )
+
+ // Copy the state into locals; s itself is not written until the end.
+ s0, s1, s2, s3 := s[0+0], s[0+1], s[0+2], s[0+3]
+ s4, s5, s6, s7 := s[0+4], s[0+5], s[0+6], s[0+7]
+ s8, s9, sa, sb := s[8+0], s[8+1], s[8+2], s[8+3]
+ sc, sd, se, sf := s[8+4], s[8+5], s[8+6], s[8+7]
+
+ // round 1: message words in natural order
+ s0, s4, s8, sc = g(s0, s4, s8, sc, m[0], m[1])
+ s1, s5, s9, sd = g(s1, s5, s9, sd, m[2], m[3])
+ s2, s6, sa, se = g(s2, s6, sa, se, m[4], m[5])
+ s3, s7, sb, sf = g(s3, s7, sb, sf, m[6], m[7])
+ s0, s5, sa, sf = g(s0, s5, sa, sf, m[8], m[9])
+ s1, s6, sb, sc = g(s1, s6, sb, sc, m[a], m[b])
+ s2, s7, s8, sd = g(s2, s7, s8, sd, m[c], m[d])
+ s3, s4, s9, se = g(s3, s4, s9, se, m[e], m[f])
+
+ // round 2
+ s0, s4, s8, sc = g(s0, s4, s8, sc, m[2], m[6])
+ s1, s5, s9, sd = g(s1, s5, s9, sd, m[3], m[a])
+ s2, s6, sa, se = g(s2, s6, sa, se, m[7], m[0])
+ s3, s7, sb, sf = g(s3, s7, sb, sf, m[4], m[d])
+ s0, s5, sa, sf = g(s0, s5, sa, sf, m[1], m[b])
+ s1, s6, sb, sc = g(s1, s6, sb, sc, m[c], m[5])
+ s2, s7, s8, sd = g(s2, s7, s8, sd, m[9], m[e])
+ s3, s4, s9, se = g(s3, s4, s9, se, m[f], m[8])
+
+ // round 3
+ s0, s4, s8, sc = g(s0, s4, s8, sc, m[3], m[4])
+ s1, s5, s9, sd = g(s1, s5, s9, sd, m[a], m[c])
+ s2, s6, sa, se = g(s2, s6, sa, se, m[d], m[2])
+ s3, s7, sb, sf = g(s3, s7, sb, sf, m[7], m[e])
+ s0, s5, sa, sf = g(s0, s5, sa, sf, m[6], m[5])
+ s1, s6, sb, sc = g(s1, s6, sb, sc, m[9], m[0])
+ s2, s7, s8, sd = g(s2, s7, s8, sd, m[b], m[f])
+ s3, s4, s9, se = g(s3, s4, s9, se, m[8], m[1])
+
+ // round 4
+ s0, s4, s8, sc = g(s0, s4, s8, sc, m[a], m[7])
+ s1, s5, s9, sd = g(s1, s5, s9, sd, m[c], m[9])
+ s2, s6, sa, se = g(s2, s6, sa, se, m[e], m[3])
+ s3, s7, sb, sf = g(s3, s7, sb, sf, m[d], m[f])
+ s0, s5, sa, sf = g(s0, s5, sa, sf, m[4], m[0])
+ s1, s6, sb, sc = g(s1, s6, sb, sc, m[b], m[2])
+ s2, s7, s8, sd = g(s2, s7, s8, sd, m[5], m[8])
+ s3, s4, s9, se = g(s3, s4, s9, se, m[1], m[6])
+
+ // round 5
+ s0, s4, s8, sc = g(s0, s4, s8, sc, m[c], m[d])
+ s1, s5, s9, sd = g(s1, s5, s9, sd, m[9], m[b])
+ s2, s6, sa, se = g(s2, s6, sa, se, m[f], m[a])
+ s3, s7, sb, sf = g(s3, s7, sb, sf, m[e], m[8])
+ s0, s5, sa, sf = g(s0, s5, sa, sf, m[7], m[2])
+ s1, s6, sb, sc = g(s1, s6, sb, sc, m[5], m[3])
+ s2, s7, s8, sd = g(s2, s7, s8, sd, m[0], m[1])
+ s3, s4, s9, se = g(s3, s4, s9, se, m[6], m[4])
+
+ // round 6
+ s0, s4, s8, sc = g(s0, s4, s8, sc, m[9], m[e])
+ s1, s5, s9, sd = g(s1, s5, s9, sd, m[b], m[5])
+ s2, s6, sa, se = g(s2, s6, sa, se, m[8], m[c])
+ s3, s7, sb, sf = g(s3, s7, sb, sf, m[f], m[1])
+ s0, s5, sa, sf = g(s0, s5, sa, sf, m[d], m[3])
+ s1, s6, sb, sc = g(s1, s6, sb, sc, m[0], m[a])
+ s2, s7, s8, sd = g(s2, s7, s8, sd, m[2], m[6])
+ s3, s4, s9, se = g(s3, s4, s9, se, m[4], m[7])
+
+ // round 7
+ s0, s4, s8, sc = g(s0, s4, s8, sc, m[b], m[f])
+ s1, s5, s9, sd = g(s1, s5, s9, sd, m[5], m[0])
+ s2, s6, sa, se = g(s2, s6, sa, se, m[1], m[9])
+ s3, s7, sb, sf = g(s3, s7, sb, sf, m[8], m[6])
+ s0, s5, sa, sf = g(s0, s5, sa, sf, m[e], m[a])
+ s1, s6, sb, sc = g(s1, s6, sb, sc, m[2], m[c])
+ s2, s7, s8, sd = g(s2, s7, s8, sd, m[3], m[4])
+ s3, s4, s9, se = g(s3, s4, s9, se, m[7], m[d])
+
+ // Upper half first: s[0..7] still holds the input values here, so these
+ // lines xor the mixed upper words with the original lower words.
+ s[8+0] = s8 ^ s[0]
+ s[8+1] = s9 ^ s[1]
+ s[8+2] = sa ^ s[2]
+ s[8+3] = sb ^ s[3]
+ s[8+4] = sc ^ s[4]
+ s[8+5] = sd ^ s[5]
+ s[8+6] = se ^ s[6]
+ s[8+7] = sf ^ s[7]
+
+ // Lower half: mixed lower words xored with the mixed upper words (the
+ // locals, not the values just stored in s[8..15]).
+ s[0] = s0 ^ s8
+ s[1] = s1 ^ s9
+ s[2] = s2 ^ sa
+ s[3] = s3 ^ sb
+ s[4] = s4 ^ sc
+ s[5] = s5 ^ sd
+ s[6] = s6 ^ se
+ s[7] = s7 ^ sf
+}
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_amd64.s b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_amd64.s
new file mode 100644
index 000000000..0fedf0b3a
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_amd64.s
@@ -0,0 +1,560 @@
+// Code generated by command: go run compress.go. DO NOT EDIT.
+
+#include "textflag.h"
+
+// iv is a read-only table of eight 32-bit constants. Compress loads only the
+// first four (16 bytes) into X2.
+// NOTE(review): presumably the same values as consts.IV0..IV3 used by
+// compress_pure - confirm against the consts package.
+DATA iv<>+0(SB)/4, $0x6a09e667
+DATA iv<>+4(SB)/4, $0xbb67ae85
+DATA iv<>+8(SB)/4, $0x3c6ef372
+DATA iv<>+12(SB)/4, $0xa54ff53a
+DATA iv<>+16(SB)/4, $0x510e527f
+DATA iv<>+20(SB)/4, $0x9b05688c
+DATA iv<>+24(SB)/4, $0x1f83d9ab
+DATA iv<>+28(SB)/4, $0x5be0cd19
+GLOBL iv<>(SB), RODATA|NOPTR, $32
+
+// rot16_shuf is a byte-shuffle mask: within every 4-byte group, destination
+// bytes 0,1,2,3 select source bytes 2,3,0,1, so a PSHUFB with this mask
+// rotates each 32-bit lane by 16 bits. The table is 32 bytes long; Compress
+// loads only the first 16 (into X8).
+DATA rot16_shuf<>+0(SB)/1, $0x02
+DATA rot16_shuf<>+1(SB)/1, $0x03
+DATA rot16_shuf<>+2(SB)/1, $0x00
+DATA rot16_shuf<>+3(SB)/1, $0x01
+DATA rot16_shuf<>+4(SB)/1, $0x06
+DATA rot16_shuf<>+5(SB)/1, $0x07
+DATA rot16_shuf<>+6(SB)/1, $0x04
+DATA rot16_shuf<>+7(SB)/1, $0x05
+DATA rot16_shuf<>+8(SB)/1, $0x0a
+DATA rot16_shuf<>+9(SB)/1, $0x0b
+DATA rot16_shuf<>+10(SB)/1, $0x08
+DATA rot16_shuf<>+11(SB)/1, $0x09
+DATA rot16_shuf<>+12(SB)/1, $0x0e
+DATA rot16_shuf<>+13(SB)/1, $0x0f
+DATA rot16_shuf<>+14(SB)/1, $0x0c
+DATA rot16_shuf<>+15(SB)/1, $0x0d
+DATA rot16_shuf<>+16(SB)/1, $0x12
+DATA rot16_shuf<>+17(SB)/1, $0x13
+DATA rot16_shuf<>+18(SB)/1, $0x10
+DATA rot16_shuf<>+19(SB)/1, $0x11
+DATA rot16_shuf<>+20(SB)/1, $0x16
+DATA rot16_shuf<>+21(SB)/1, $0x17
+DATA rot16_shuf<>+22(SB)/1, $0x14
+DATA rot16_shuf<>+23(SB)/1, $0x15
+DATA rot16_shuf<>+24(SB)/1, $0x1a
+DATA rot16_shuf<>+25(SB)/1, $0x1b
+DATA rot16_shuf<>+26(SB)/1, $0x18
+DATA rot16_shuf<>+27(SB)/1, $0x19
+DATA rot16_shuf<>+28(SB)/1, $0x1e
+DATA rot16_shuf<>+29(SB)/1, $0x1f
+DATA rot16_shuf<>+30(SB)/1, $0x1c
+DATA rot16_shuf<>+31(SB)/1, $0x1d
+GLOBL rot16_shuf<>(SB), RODATA|NOPTR, $32
+
+// rot8_shuf is a byte-shuffle mask: within every 4-byte group, destination
+// bytes 0,1,2,3 select source bytes 1,2,3,0, so a PSHUFB with this mask
+// rotates each little-endian 32-bit lane right by 8 bits. The table is 32
+// bytes long; Compress loads only the first 16 (into X9).
+DATA rot8_shuf<>+0(SB)/1, $0x01
+DATA rot8_shuf<>+1(SB)/1, $0x02
+DATA rot8_shuf<>+2(SB)/1, $0x03
+DATA rot8_shuf<>+3(SB)/1, $0x00
+DATA rot8_shuf<>+4(SB)/1, $0x05
+DATA rot8_shuf<>+5(SB)/1, $0x06
+DATA rot8_shuf<>+6(SB)/1, $0x07
+DATA rot8_shuf<>+7(SB)/1, $0x04
+DATA rot8_shuf<>+8(SB)/1, $0x09
+DATA rot8_shuf<>+9(SB)/1, $0x0a
+DATA rot8_shuf<>+10(SB)/1, $0x0b
+DATA rot8_shuf<>+11(SB)/1, $0x08
+DATA rot8_shuf<>+12(SB)/1, $0x0d
+DATA rot8_shuf<>+13(SB)/1, $0x0e
+DATA rot8_shuf<>+14(SB)/1, $0x0f
+DATA rot8_shuf<>+15(SB)/1, $0x0c
+DATA rot8_shuf<>+16(SB)/1, $0x11
+DATA rot8_shuf<>+17(SB)/1, $0x12
+DATA rot8_shuf<>+18(SB)/1, $0x13
+DATA rot8_shuf<>+19(SB)/1, $0x10
+DATA rot8_shuf<>+20(SB)/1, $0x15
+DATA rot8_shuf<>+21(SB)/1, $0x16
+DATA rot8_shuf<>+22(SB)/1, $0x17
+DATA rot8_shuf<>+23(SB)/1, $0x14
+DATA rot8_shuf<>+24(SB)/1, $0x19
+DATA rot8_shuf<>+25(SB)/1, $0x1a
+DATA rot8_shuf<>+26(SB)/1, $0x1b
+DATA rot8_shuf<>+27(SB)/1, $0x18
+DATA rot8_shuf<>+28(SB)/1, $0x1d
+DATA rot8_shuf<>+29(SB)/1, $0x1e
+DATA rot8_shuf<>+30(SB)/1, $0x1f
+DATA rot8_shuf<>+31(SB)/1, $0x1c
+GLOBL rot8_shuf<>(SB), RODATA|NOPTR, $32
+
+// func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32)
+// Requires: SSE, SSE2, SSE4.1, SSSE3
+//
+// Compress keeps the sixteen-word state in four XMM registers (X0..X3, four
+// words each), runs seven unrolled rounds over the message words loaded from
+// block, and stores the sixteen result words to out. The function has no
+// stack frame ($0) and takes 40 bytes of arguments.
+TEXT ·Compress(SB), NOSPLIT, $0-40
+ // Load the six arguments.
+ // NOTE(review): BP is overwritten below (it carries flags) in a frameless
+ // function; go vet's framepointer check may flag this - confirm with the
+ // generator before changing anything here.
+ MOVQ chain+0(FP), AX
+ MOVQ block+8(FP), CX
+ MOVQ counter+16(FP), DX
+ MOVL blen+24(FP), BX
+ MOVL flags+28(FP), BP
+ MOVQ out+32(FP), SI
+ // State rows: X0 = chain[0..3], X1 = chain[4..7], X2 = first four iv words.
+ MOVUPS (AX), X0
+ MOVUPS 16(AX), X1
+ MOVUPS iv<>+0(SB), X2
+ // X3 = {low 32 bits of counter, high 32 bits of counter, blen, flags}.
+ PINSRD $0x00, DX, X3
+ SHRQ $0x20, DX
+ PINSRD $0x01, DX, X3
+ PINSRD $0x02, BX, X3
+ PINSRD $0x03, BP, X3
+ // X4..X7 = the sixteen message words of block.
+ MOVUPS (CX), X4
+ MOVUPS 16(CX), X5
+ MOVUPS 32(CX), X6
+ MOVUPS 48(CX), X7
+ // X8 and X9 hold the PSHUFB masks that rotate each dword by 16 and 8 bits.
+ MOVUPS rot16_shuf<>+0(SB), X8
+ MOVUPS rot8_shuf<>+0(SB), X9
+
+ // round 1
+ // In every round: PSHUFB with X8 / X9 rotates the dwords of X3 by 16 / 8
+ // bits, and each PSRLL/PSLLL/POR triple rotates the dwords of X1 right by
+ // 12 (0x0c/0x14) or 7 (0x07/0x19) bits. The PSHUFD $0x93/$0x4e/$0x39 group
+ // in the middle of a round permutes the lanes of X0, X3 and X2, and the
+ // PSHUFD $0x39/$0x4e/$0x93 group at the end of the round undoes it.
+ MOVAPS X4, X10
+ SHUFPS $0x88, X5, X10
+ PADDD X10, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X11
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X11
+ POR X11, X1
+ MOVAPS X4, X4
+ SHUFPS $0xdd, X5, X4
+ PADDD X4, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X5
+ PSRLL $0x07, X1
+ PSLLL $0x19, X5
+ POR X5, X1
+ PSHUFD $0x93, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x39, X2, X2
+ MOVAPS X6, X5
+ SHUFPS $0x88, X7, X5
+ SHUFPS $0x93, X5, X5
+ PADDD X5, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X11
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X11
+ POR X11, X1
+ MOVAPS X6, X6
+ SHUFPS $0xdd, X7, X6
+ SHUFPS $0x93, X6, X6
+ PADDD X6, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X7
+ PSRLL $0x07, X1
+ PSLLL $0x19, X7
+ POR X7, X1
+ PSHUFD $0x39, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x93, X2, X2
+
+ // round 2
+ MOVAPS X10, X7
+ SHUFPS $0xd6, X4, X7
+ SHUFPS $0x39, X7, X7
+ PADDD X7, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X11
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X11
+ POR X11, X1
+ MOVAPS X5, X11
+ SHUFPS $0xfa, X6, X11
+ PSHUFD $0x0f, X10, X10
+ PBLENDW $0x33, X10, X11
+ PADDD X11, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X10
+ PSRLL $0x07, X1
+ PSLLL $0x19, X10
+ POR X10, X1
+ PSHUFD $0x93, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x39, X2, X2
+ MOVAPS X6, X12
+ PUNPCKLLQ X4, X12
+ PBLENDW $0xc0, X5, X12
+ SHUFPS $0xb4, X12, X12
+ PADDD X12, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X10
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X10
+ POR X10, X1
+ MOVAPS X4, X10
+ PUNPCKHLQ X6, X10
+ MOVAPS X5, X4
+ PUNPCKLLQ X10, X4
+ SHUFPS $0x1e, X4, X4
+ PADDD X4, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X5
+ PSRLL $0x07, X1
+ PSLLL $0x19, X5
+ POR X5, X1
+ PSHUFD $0x39, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x93, X2, X2
+
+ // round 3
+ MOVAPS X7, X5
+ SHUFPS $0xd6, X11, X5
+ SHUFPS $0x39, X5, X5
+ PADDD X5, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X6
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X6
+ POR X6, X1
+ MOVAPS X12, X6
+ SHUFPS $0xfa, X4, X6
+ PSHUFD $0x0f, X7, X7
+ PBLENDW $0x33, X7, X6
+ PADDD X6, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X7
+ PSRLL $0x07, X1
+ PSLLL $0x19, X7
+ POR X7, X1
+ PSHUFD $0x93, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x39, X2, X2
+ MOVAPS X4, X10
+ PUNPCKLLQ X11, X10
+ PBLENDW $0xc0, X12, X10
+ SHUFPS $0xb4, X10, X10
+ PADDD X10, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X7
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X7
+ POR X7, X1
+ MOVAPS X11, X7
+ PUNPCKHLQ X4, X7
+ MOVAPS X12, X4
+ PUNPCKLLQ X7, X4
+ SHUFPS $0x1e, X4, X4
+ PADDD X4, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X7
+ PSRLL $0x07, X1
+ PSLLL $0x19, X7
+ POR X7, X1
+ PSHUFD $0x39, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x93, X2, X2
+
+ // round 4
+ MOVAPS X5, X7
+ SHUFPS $0xd6, X6, X7
+ SHUFPS $0x39, X7, X7
+ PADDD X7, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X11
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X11
+ POR X11, X1
+ MOVAPS X10, X11
+ SHUFPS $0xfa, X4, X11
+ PSHUFD $0x0f, X5, X5
+ PBLENDW $0x33, X5, X11
+ PADDD X11, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X5
+ PSRLL $0x07, X1
+ PSLLL $0x19, X5
+ POR X5, X1
+ PSHUFD $0x93, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x39, X2, X2
+ MOVAPS X4, X12
+ PUNPCKLLQ X6, X12
+ PBLENDW $0xc0, X10, X12
+ SHUFPS $0xb4, X12, X12
+ PADDD X12, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X5
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X5
+ POR X5, X1
+ MOVAPS X6, X5
+ PUNPCKHLQ X4, X5
+ MOVAPS X10, X4
+ PUNPCKLLQ X5, X4
+ SHUFPS $0x1e, X4, X4
+ PADDD X4, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X5
+ PSRLL $0x07, X1
+ PSLLL $0x19, X5
+ POR X5, X1
+ PSHUFD $0x39, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x93, X2, X2
+
+ // round 5
+ MOVAPS X7, X5
+ SHUFPS $0xd6, X11, X5
+ SHUFPS $0x39, X5, X5
+ PADDD X5, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X6
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X6
+ POR X6, X1
+ MOVAPS X12, X6
+ SHUFPS $0xfa, X4, X6
+ PSHUFD $0x0f, X7, X7
+ PBLENDW $0x33, X7, X6
+ PADDD X6, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X7
+ PSRLL $0x07, X1
+ PSLLL $0x19, X7
+ POR X7, X1
+ PSHUFD $0x93, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x39, X2, X2
+ MOVAPS X4, X10
+ PUNPCKLLQ X11, X10
+ PBLENDW $0xc0, X12, X10
+ SHUFPS $0xb4, X10, X10
+ PADDD X10, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X7
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X7
+ POR X7, X1
+ MOVAPS X11, X7
+ PUNPCKHLQ X4, X7
+ MOVAPS X12, X4
+ PUNPCKLLQ X7, X4
+ SHUFPS $0x1e, X4, X4
+ PADDD X4, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X7
+ PSRLL $0x07, X1
+ PSLLL $0x19, X7
+ POR X7, X1
+ PSHUFD $0x39, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x93, X2, X2
+
+ // round 6
+ MOVAPS X5, X7
+ SHUFPS $0xd6, X6, X7
+ SHUFPS $0x39, X7, X7
+ PADDD X7, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X11
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X11
+ POR X11, X1
+ MOVAPS X10, X11
+ SHUFPS $0xfa, X4, X11
+ PSHUFD $0x0f, X5, X5
+ PBLENDW $0x33, X5, X11
+ PADDD X11, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X5
+ PSRLL $0x07, X1
+ PSLLL $0x19, X5
+ POR X5, X1
+ PSHUFD $0x93, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x39, X2, X2
+ MOVAPS X4, X12
+ PUNPCKLLQ X6, X12
+ PBLENDW $0xc0, X10, X12
+ SHUFPS $0xb4, X12, X12
+ PADDD X12, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X5
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X5
+ POR X5, X1
+ MOVAPS X6, X5
+ PUNPCKHLQ X4, X5
+ MOVAPS X10, X4
+ PUNPCKLLQ X5, X4
+ SHUFPS $0x1e, X4, X4
+ PADDD X4, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X5
+ PSRLL $0x07, X1
+ PSLLL $0x19, X5
+ POR X5, X1
+ PSHUFD $0x39, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x93, X2, X2
+
+ // round 7
+ MOVAPS X7, X5
+ SHUFPS $0xd6, X11, X5
+ SHUFPS $0x39, X5, X5
+ PADDD X5, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X5
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X5
+ POR X5, X1
+ MOVAPS X12, X5
+ SHUFPS $0xfa, X4, X5
+ PSHUFD $0x0f, X7, X6
+ PBLENDW $0x33, X6, X5
+ PADDD X5, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X5
+ PSRLL $0x07, X1
+ PSLLL $0x19, X5
+ POR X5, X1
+ PSHUFD $0x93, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x39, X2, X2
+ MOVAPS X4, X5
+ PUNPCKLLQ X11, X5
+ PBLENDW $0xc0, X12, X5
+ SHUFPS $0xb4, X5, X5
+ PADDD X5, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X5
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X5
+ POR X5, X1
+ MOVAPS X11, X6
+ PUNPCKHLQ X4, X6
+ MOVAPS X12, X4
+ PUNPCKLLQ X6, X4
+ SHUFPS $0x1e, X4, X4
+ PADDD X4, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X4
+ PSRLL $0x07, X1
+ PSLLL $0x19, X4
+ POR X4, X1
+ PSHUFD $0x39, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x93, X2, X2
+
+ // finalize
+ // Lower half of the result: X0 ^= X2 and X1 ^= X3.
+ PXOR X2, X0
+ PXOR X3, X1
+ // Upper half of the result: X2 ^= chain[0..3] and X3 ^= chain[4..7],
+ // re-reading chain from memory through AX.
+ MOVUPS (AX), X4
+ PXOR X4, X2
+ MOVUPS 16(AX), X4
+ PXOR X4, X3
+ // Store the sixteen result words to out.
+ MOVUPS X0, (SI)
+ MOVUPS X1, 16(SI)
+ MOVUPS X2, 32(SI)
+ MOVUPS X3, 48(SI)
+ RET
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_other.go b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_other.go
new file mode 100644
index 000000000..cd63e9740
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_other.go
@@ -0,0 +1,9 @@
+// +build !amd64
+
+package compress_sse41
+
+import "github.com/zeebo/blake3/internal/alg/compress/compress_pure"
+
+// Compress is the variant of this package's Compress that is built on
+// non-amd64 targets (see the build constraint at the top of the file). It
+// forwards every argument unchanged to compress_pure.Compress.
+func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32) {
+ compress_pure.Compress(chain, block, counter, blen, flags, out)
+}
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/stubs.go b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/stubs.go
new file mode 100644
index 000000000..ffd932d3c
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/stubs.go
@@ -0,0 +1,6 @@
+// +build amd64
+
+package compress_sse41
+
+// Compress is declared here without a body for amd64 builds (see the build
+// constraint at the top of the file); the implementation is expected to come
+// from the assembly file in this package. The go:noescape directive tells
+// the compiler that the pointer arguments do not escape through this call.
+//
+//go:noescape
+func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32)