Diffstat (limited to 'vendor/github.com/zeebo/blake3/internal/alg')
12 files changed, 0 insertions, 3443 deletions
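Editorial note: the removed internal/alg tree is the CPU-dispatch layer of the vendored BLAKE3 implementation. It selects SSE4.1/AVX2 assembly kernels at runtime and falls back to portable Go otherwise, as the deleted compress.go and hash.go wrappers below show. The following is a minimal, hedged sketch of that dispatch pattern only; it uses golang.org/x/sys/cpu and placeholder kernel names instead of the package's internal consts flags and assembly stubs, all of which are assumptions for illustration rather than the library's actual API.

// Sketch of the runtime-dispatch pattern used by the deleted package:
// pick a SIMD kernel when the CPU supports it, otherwise fall back to
// portable Go. cpu.X86.HasSSE41 stands in for the package's internal
// consts.HasSSE41 flag (an assumption for illustration).
package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

// compressGeneric is a placeholder for the portable implementation
// (compress_pure.Compress in the deleted code).
func compressGeneric(chain *[8]uint32, block *[16]uint32, counter uint64, blen, flags uint32, out *[16]uint32) {
	// ... pure-Go compression would go here ...
}

// compressSSE41 is a placeholder for the assembly kernel
// (compress_sse41.Compress in the deleted code).
func compressSSE41(chain *[8]uint32, block *[16]uint32, counter uint64, blen, flags uint32, out *[16]uint32) {
	// ... SSE4.1 kernel would go here ...
}

// Compress branches once per call on a CPU feature flag, mirroring the
// deleted compress.Compress wrapper.
func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen, flags uint32, out *[16]uint32) {
	if cpu.X86.HasSSE41 {
		compressSSE41(chain, block, counter, blen, flags, out)
	} else {
		compressGeneric(chain, block, counter, blen, flags, out)
	}
}

func main() {
	var chain [8]uint32
	var block, out [16]uint32
	Compress(&chain, &block, 0, 64, 0, &out)
	fmt.Println(out[0])
}

The design choice visible in the deleted code is to keep the branch in a tiny Go wrapper and do all the heavy lifting in generated (avo) assembly, so the portable and SIMD paths share one call signature.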
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/alg.go b/vendor/github.com/zeebo/blake3/internal/alg/alg.go
deleted file mode 100644
index 239fdec5b..000000000
--- a/vendor/github.com/zeebo/blake3/internal/alg/alg.go
+++ /dev/null
@@ -1,18 +0,0 @@
-package alg
-
-import (
-	"github.com/zeebo/blake3/internal/alg/compress"
-	"github.com/zeebo/blake3/internal/alg/hash"
-)
-
-func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) {
-	hash.HashF(input, length, counter, flags, key, out, chain)
-}
-
-func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int) {
-	hash.HashP(left, right, flags, key, out, n)
-}
-
-func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32) {
-	compress.Compress(chain, block, counter, blen, flags, out)
-}
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress.go b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress.go
deleted file mode 100644
index 0b2685408..000000000
--- a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress.go
+++ /dev/null
@@ -1,15 +0,0 @@
-package compress
-
-import (
-	"github.com/zeebo/blake3/internal/alg/compress/compress_pure"
-	"github.com/zeebo/blake3/internal/alg/compress/compress_sse41"
-	"github.com/zeebo/blake3/internal/consts"
-)
-
-func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32) {
-	if consts.HasSSE41 {
-		compress_sse41.Compress(chain, block, counter, blen, flags, out)
-	} else {
-		compress_pure.Compress(chain, block, counter, blen, flags, out)
-	}
-}
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_pure/compress.go b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_pure/compress.go
deleted file mode 100644
index 66ea1fb75..000000000
--- a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_pure/compress.go
+++ /dev/null
@@ -1,135 +0,0 @@
-package compress_pure
-
-import (
-	"math/bits"
-
-	"github.com/zeebo/blake3/internal/consts"
-)
-
-func Compress(
-	chain *[8]uint32,
-	block *[16]uint32,
-	counter uint64,
-	blen uint32,
-	flags uint32,
-	out *[16]uint32,
-) {
-
-	*out = [16]uint32{
-		chain[0], chain[1], chain[2], chain[3],
-		chain[4], chain[5], chain[6], chain[7],
-		consts.IV0, consts.IV1, consts.IV2, consts.IV3,
-		uint32(counter), uint32(counter >> 32), blen, flags,
-	}
-
-	rcompress(out, block)
-}
-
-func g(a, b, c, d, mx, my uint32) (uint32, uint32, uint32, uint32) {
-	a += b + mx
-	d = bits.RotateLeft32(d^a, -16)
-	c += d
-	b = bits.RotateLeft32(b^c, -12)
-	a += b + my
-	d = bits.RotateLeft32(d^a, -8)
-	c += d
-	b = bits.RotateLeft32(b^c, -7)
-	return a, b, c, d
-}
-
-func rcompress(s *[16]uint32, m *[16]uint32) {
-	const (
-		a = 10
-		b = 11
-		c = 12
-		d = 13
-		e = 14
-		f = 15
-	)
-
-	s0, s1, s2, s3 := s[0+0], s[0+1], s[0+2], s[0+3]
-	s4, s5, s6, s7 := s[0+4], s[0+5], s[0+6], s[0+7]
-	s8, s9, sa, sb := s[8+0], s[8+1], s[8+2], s[8+3]
-	sc, sd, se, sf := s[8+4], s[8+5], s[8+6], s[8+7]
-
-	s0, s4, s8, sc = g(s0, s4, s8, sc, m[0], m[1])
-	s1, s5, s9, sd = g(s1, s5, s9, sd, m[2], m[3])
-	s2, s6, sa, se = g(s2, s6, sa, se, m[4], m[5])
-	s3, s7, sb, sf = g(s3, s7, sb, sf, m[6], m[7])
-	s0, s5, sa, sf = g(s0, s5, sa, sf, m[8], m[9])
-	s1, s6, sb, sc = g(s1, s6, sb, sc, m[a], m[b])
-	s2, s7, s8, sd = g(s2, s7, s8, sd, m[c], m[d])
-	s3, s4, s9, se = g(s3, s4, s9, se, m[e], m[f])
-
-	s0, s4, s8, sc = g(s0, s4, s8, sc, m[2], m[6])
-	s1, s5, s9, sd = g(s1, s5, s9, sd, m[3], m[a])
-	s2, s6, sa, se = g(s2, s6, sa, se, m[7], m[0])
-	s3, s7, sb, sf = g(s3, s7, sb, sf, m[4], m[d])
-	s0, s5, sa, sf = g(s0, s5, sa, sf, m[1], m[b])
-	s1, s6, sb, sc = g(s1, s6, sb, sc, m[c], m[5])
-	s2, s7, s8, sd = g(s2, s7, s8, sd, m[9], m[e])
-	s3, s4, s9, se = g(s3, s4, s9, se, m[f], m[8])
-
-	s0, s4, s8, sc = g(s0, s4, s8, sc, m[3], m[4])
-	s1, s5, s9, sd = g(s1, s5, s9, sd, m[a], m[c])
-	s2, s6, sa, se = g(s2, s6, sa, se, m[d], m[2])
-	s3, s7, sb, sf = g(s3, s7, sb, sf, m[7], m[e])
-	s0, s5, sa, sf = g(s0, s5, sa, sf, m[6], m[5])
-	s1, s6, sb, sc = g(s1, s6, sb, sc, m[9], m[0])
-	s2, s7, s8, sd = g(s2, s7, s8, sd, m[b], m[f])
-	s3, s4, s9, se = g(s3, s4, s9, se, m[8], m[1])
-
-	s0, s4, s8, sc = g(s0, s4, s8, sc, m[a], m[7])
-	s1, s5, s9, sd = g(s1, s5, s9, sd, m[c], m[9])
-	s2, s6, sa, se = g(s2, s6, sa, se, m[e], m[3])
-	s3, s7, sb, sf = g(s3, s7, sb, sf, m[d], m[f])
-	s0, s5, sa, sf = g(s0, s5, sa, sf, m[4], m[0])
-	s1, s6, sb, sc = g(s1, s6, sb, sc, m[b], m[2])
-	s2, s7, s8, sd = g(s2, s7, s8, sd, m[5], m[8])
-	s3, s4, s9, se = g(s3, s4, s9, se, m[1], m[6])
-
-	s0, s4, s8, sc = g(s0, s4, s8, sc, m[c], m[d])
-	s1, s5, s9, sd = g(s1, s5, s9, sd, m[9], m[b])
-	s2, s6, sa, se = g(s2, s6, sa, se, m[f], m[a])
-	s3, s7, sb, sf = g(s3, s7, sb, sf, m[e], m[8])
-	s0, s5, sa, sf = g(s0, s5, sa, sf, m[7], m[2])
-	s1, s6, sb, sc = g(s1, s6, sb, sc, m[5], m[3])
-	s2, s7, s8, sd = g(s2, s7, s8, sd, m[0], m[1])
-	s3, s4, s9, se = g(s3, s4, s9, se, m[6], m[4])
-
-	s0, s4, s8, sc = g(s0, s4, s8, sc, m[9], m[e])
-	s1, s5, s9, sd = g(s1, s5, s9, sd, m[b], m[5])
-	s2, s6, sa, se = g(s2, s6, sa, se, m[8], m[c])
-	s3, s7, sb, sf = g(s3, s7, sb, sf, m[f], m[1])
-	s0, s5, sa, sf = g(s0, s5, sa, sf, m[d], m[3])
-	s1, s6, sb, sc = g(s1, s6, sb, sc, m[0], m[a])
-	s2, s7, s8, sd = g(s2, s7, s8, sd, m[2], m[6])
-	s3, s4, s9, se = g(s3, s4, s9, se, m[4], m[7])
-
-	s0, s4, s8, sc = g(s0, s4, s8, sc, m[b], m[f])
-	s1, s5, s9, sd = g(s1, s5, s9, sd, m[5], m[0])
-	s2, s6, sa, se = g(s2, s6, sa, se, m[1], m[9])
-	s3, s7, sb, sf = g(s3, s7, sb, sf, m[8], m[6])
-	s0, s5, sa, sf = g(s0, s5, sa, sf, m[e], m[a])
-	s1, s6, sb, sc = g(s1, s6, sb, sc, m[2], m[c])
-	s2, s7, s8, sd = g(s2, s7, s8, sd, m[3], m[4])
-	s3, s4, s9, se = g(s3, s4, s9, se, m[7], m[d])
-
-	s[8+0] = s8 ^ s[0]
-	s[8+1] = s9 ^ s[1]
-	s[8+2] = sa ^ s[2]
-	s[8+3] = sb ^ s[3]
-	s[8+4] = sc ^ s[4]
-	s[8+5] = sd ^ s[5]
-	s[8+6] = se ^ s[6]
-	s[8+7] = sf ^ s[7]
-
-	s[0] = s0 ^ s8
-	s[1] = s1 ^ s9
-	s[2] = s2 ^ sa
-	s[3] = s3 ^ sb
-	s[4] = s4 ^ sc
-	s[5] = s5 ^ sd
-	s[6] = s6 ^ se
-	s[7] = s7 ^ sf
-}
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_amd64.s b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_amd64.s
deleted file mode 100644
index 0fedf0b3a..000000000
--- a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_amd64.s
+++ /dev/null
@@ -1,560 +0,0 @@
-// Code generated by command: go run compress.go. DO NOT EDIT.
- -#include "textflag.h" - -DATA iv<>+0(SB)/4, $0x6a09e667 -DATA iv<>+4(SB)/4, $0xbb67ae85 -DATA iv<>+8(SB)/4, $0x3c6ef372 -DATA iv<>+12(SB)/4, $0xa54ff53a -DATA iv<>+16(SB)/4, $0x510e527f -DATA iv<>+20(SB)/4, $0x9b05688c -DATA iv<>+24(SB)/4, $0x1f83d9ab -DATA iv<>+28(SB)/4, $0x5be0cd19 -GLOBL iv<>(SB), RODATA|NOPTR, $32 - -DATA rot16_shuf<>+0(SB)/1, $0x02 -DATA rot16_shuf<>+1(SB)/1, $0x03 -DATA rot16_shuf<>+2(SB)/1, $0x00 -DATA rot16_shuf<>+3(SB)/1, $0x01 -DATA rot16_shuf<>+4(SB)/1, $0x06 -DATA rot16_shuf<>+5(SB)/1, $0x07 -DATA rot16_shuf<>+6(SB)/1, $0x04 -DATA rot16_shuf<>+7(SB)/1, $0x05 -DATA rot16_shuf<>+8(SB)/1, $0x0a -DATA rot16_shuf<>+9(SB)/1, $0x0b -DATA rot16_shuf<>+10(SB)/1, $0x08 -DATA rot16_shuf<>+11(SB)/1, $0x09 -DATA rot16_shuf<>+12(SB)/1, $0x0e -DATA rot16_shuf<>+13(SB)/1, $0x0f -DATA rot16_shuf<>+14(SB)/1, $0x0c -DATA rot16_shuf<>+15(SB)/1, $0x0d -DATA rot16_shuf<>+16(SB)/1, $0x12 -DATA rot16_shuf<>+17(SB)/1, $0x13 -DATA rot16_shuf<>+18(SB)/1, $0x10 -DATA rot16_shuf<>+19(SB)/1, $0x11 -DATA rot16_shuf<>+20(SB)/1, $0x16 -DATA rot16_shuf<>+21(SB)/1, $0x17 -DATA rot16_shuf<>+22(SB)/1, $0x14 -DATA rot16_shuf<>+23(SB)/1, $0x15 -DATA rot16_shuf<>+24(SB)/1, $0x1a -DATA rot16_shuf<>+25(SB)/1, $0x1b -DATA rot16_shuf<>+26(SB)/1, $0x18 -DATA rot16_shuf<>+27(SB)/1, $0x19 -DATA rot16_shuf<>+28(SB)/1, $0x1e -DATA rot16_shuf<>+29(SB)/1, $0x1f -DATA rot16_shuf<>+30(SB)/1, $0x1c -DATA rot16_shuf<>+31(SB)/1, $0x1d -GLOBL rot16_shuf<>(SB), RODATA|NOPTR, $32 - -DATA rot8_shuf<>+0(SB)/1, $0x01 -DATA rot8_shuf<>+1(SB)/1, $0x02 -DATA rot8_shuf<>+2(SB)/1, $0x03 -DATA rot8_shuf<>+3(SB)/1, $0x00 -DATA rot8_shuf<>+4(SB)/1, $0x05 -DATA rot8_shuf<>+5(SB)/1, $0x06 -DATA rot8_shuf<>+6(SB)/1, $0x07 -DATA rot8_shuf<>+7(SB)/1, $0x04 -DATA rot8_shuf<>+8(SB)/1, $0x09 -DATA rot8_shuf<>+9(SB)/1, $0x0a -DATA rot8_shuf<>+10(SB)/1, $0x0b -DATA rot8_shuf<>+11(SB)/1, $0x08 -DATA rot8_shuf<>+12(SB)/1, $0x0d -DATA rot8_shuf<>+13(SB)/1, $0x0e -DATA rot8_shuf<>+14(SB)/1, $0x0f -DATA rot8_shuf<>+15(SB)/1, $0x0c -DATA rot8_shuf<>+16(SB)/1, $0x11 -DATA rot8_shuf<>+17(SB)/1, $0x12 -DATA rot8_shuf<>+18(SB)/1, $0x13 -DATA rot8_shuf<>+19(SB)/1, $0x10 -DATA rot8_shuf<>+20(SB)/1, $0x15 -DATA rot8_shuf<>+21(SB)/1, $0x16 -DATA rot8_shuf<>+22(SB)/1, $0x17 -DATA rot8_shuf<>+23(SB)/1, $0x14 -DATA rot8_shuf<>+24(SB)/1, $0x19 -DATA rot8_shuf<>+25(SB)/1, $0x1a -DATA rot8_shuf<>+26(SB)/1, $0x1b -DATA rot8_shuf<>+27(SB)/1, $0x18 -DATA rot8_shuf<>+28(SB)/1, $0x1d -DATA rot8_shuf<>+29(SB)/1, $0x1e -DATA rot8_shuf<>+30(SB)/1, $0x1f -DATA rot8_shuf<>+31(SB)/1, $0x1c -GLOBL rot8_shuf<>(SB), RODATA|NOPTR, $32 - -// func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32) -// Requires: SSE, SSE2, SSE4.1, SSSE3 -TEXT ·Compress(SB), NOSPLIT, $0-40 - MOVQ chain+0(FP), AX - MOVQ block+8(FP), CX - MOVQ counter+16(FP), DX - MOVL blen+24(FP), BX - MOVL flags+28(FP), BP - MOVQ out+32(FP), SI - MOVUPS (AX), X0 - MOVUPS 16(AX), X1 - MOVUPS iv<>+0(SB), X2 - PINSRD $0x00, DX, X3 - SHRQ $0x20, DX - PINSRD $0x01, DX, X3 - PINSRD $0x02, BX, X3 - PINSRD $0x03, BP, X3 - MOVUPS (CX), X4 - MOVUPS 16(CX), X5 - MOVUPS 32(CX), X6 - MOVUPS 48(CX), X7 - MOVUPS rot16_shuf<>+0(SB), X8 - MOVUPS rot8_shuf<>+0(SB), X9 - - // round 1 - MOVAPS X4, X10 - SHUFPS $0x88, X5, X10 - PADDD X10, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X11 - PSRLL $0x0c, X1 - PSLLL $0x14, X11 - POR X11, X1 - MOVAPS X4, X4 - SHUFPS $0xdd, X5, X4 - PADDD X4, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, 
X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X5 - PSRLL $0x07, X1 - PSLLL $0x19, X5 - POR X5, X1 - PSHUFD $0x93, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x39, X2, X2 - MOVAPS X6, X5 - SHUFPS $0x88, X7, X5 - SHUFPS $0x93, X5, X5 - PADDD X5, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X11 - PSRLL $0x0c, X1 - PSLLL $0x14, X11 - POR X11, X1 - MOVAPS X6, X6 - SHUFPS $0xdd, X7, X6 - SHUFPS $0x93, X6, X6 - PADDD X6, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X7 - PSRLL $0x07, X1 - PSLLL $0x19, X7 - POR X7, X1 - PSHUFD $0x39, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x93, X2, X2 - - // round 2 - MOVAPS X10, X7 - SHUFPS $0xd6, X4, X7 - SHUFPS $0x39, X7, X7 - PADDD X7, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X11 - PSRLL $0x0c, X1 - PSLLL $0x14, X11 - POR X11, X1 - MOVAPS X5, X11 - SHUFPS $0xfa, X6, X11 - PSHUFD $0x0f, X10, X10 - PBLENDW $0x33, X10, X11 - PADDD X11, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X10 - PSRLL $0x07, X1 - PSLLL $0x19, X10 - POR X10, X1 - PSHUFD $0x93, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x39, X2, X2 - MOVAPS X6, X12 - PUNPCKLLQ X4, X12 - PBLENDW $0xc0, X5, X12 - SHUFPS $0xb4, X12, X12 - PADDD X12, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X10 - PSRLL $0x0c, X1 - PSLLL $0x14, X10 - POR X10, X1 - MOVAPS X4, X10 - PUNPCKHLQ X6, X10 - MOVAPS X5, X4 - PUNPCKLLQ X10, X4 - SHUFPS $0x1e, X4, X4 - PADDD X4, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X5 - PSRLL $0x07, X1 - PSLLL $0x19, X5 - POR X5, X1 - PSHUFD $0x39, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x93, X2, X2 - - // round 3 - MOVAPS X7, X5 - SHUFPS $0xd6, X11, X5 - SHUFPS $0x39, X5, X5 - PADDD X5, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X6 - PSRLL $0x0c, X1 - PSLLL $0x14, X6 - POR X6, X1 - MOVAPS X12, X6 - SHUFPS $0xfa, X4, X6 - PSHUFD $0x0f, X7, X7 - PBLENDW $0x33, X7, X6 - PADDD X6, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X7 - PSRLL $0x07, X1 - PSLLL $0x19, X7 - POR X7, X1 - PSHUFD $0x93, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x39, X2, X2 - MOVAPS X4, X10 - PUNPCKLLQ X11, X10 - PBLENDW $0xc0, X12, X10 - SHUFPS $0xb4, X10, X10 - PADDD X10, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X7 - PSRLL $0x0c, X1 - PSLLL $0x14, X7 - POR X7, X1 - MOVAPS X11, X7 - PUNPCKHLQ X4, X7 - MOVAPS X12, X4 - PUNPCKLLQ X7, X4 - SHUFPS $0x1e, X4, X4 - PADDD X4, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X7 - PSRLL $0x07, X1 - PSLLL $0x19, X7 - POR X7, X1 - PSHUFD $0x39, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x93, X2, X2 - - // round 4 - MOVAPS X5, X7 - SHUFPS $0xd6, X6, X7 - SHUFPS $0x39, X7, X7 - PADDD X7, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X11 - PSRLL $0x0c, X1 - PSLLL $0x14, X11 - POR X11, X1 - MOVAPS X10, X11 - SHUFPS $0xfa, X4, X11 - PSHUFD $0x0f, X5, X5 - PBLENDW $0x33, X5, X11 - PADDD X11, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X5 - PSRLL $0x07, X1 - PSLLL $0x19, X5 - POR X5, X1 - PSHUFD $0x93, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x39, X2, X2 - MOVAPS X4, X12 - PUNPCKLLQ X6, X12 - PBLENDW $0xc0, X10, X12 - SHUFPS $0xb4, X12, X12 - PADDD X12, X0 - PADDD 
X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X5 - PSRLL $0x0c, X1 - PSLLL $0x14, X5 - POR X5, X1 - MOVAPS X6, X5 - PUNPCKHLQ X4, X5 - MOVAPS X10, X4 - PUNPCKLLQ X5, X4 - SHUFPS $0x1e, X4, X4 - PADDD X4, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X5 - PSRLL $0x07, X1 - PSLLL $0x19, X5 - POR X5, X1 - PSHUFD $0x39, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x93, X2, X2 - - // round 5 - MOVAPS X7, X5 - SHUFPS $0xd6, X11, X5 - SHUFPS $0x39, X5, X5 - PADDD X5, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X6 - PSRLL $0x0c, X1 - PSLLL $0x14, X6 - POR X6, X1 - MOVAPS X12, X6 - SHUFPS $0xfa, X4, X6 - PSHUFD $0x0f, X7, X7 - PBLENDW $0x33, X7, X6 - PADDD X6, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X7 - PSRLL $0x07, X1 - PSLLL $0x19, X7 - POR X7, X1 - PSHUFD $0x93, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x39, X2, X2 - MOVAPS X4, X10 - PUNPCKLLQ X11, X10 - PBLENDW $0xc0, X12, X10 - SHUFPS $0xb4, X10, X10 - PADDD X10, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X7 - PSRLL $0x0c, X1 - PSLLL $0x14, X7 - POR X7, X1 - MOVAPS X11, X7 - PUNPCKHLQ X4, X7 - MOVAPS X12, X4 - PUNPCKLLQ X7, X4 - SHUFPS $0x1e, X4, X4 - PADDD X4, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X7 - PSRLL $0x07, X1 - PSLLL $0x19, X7 - POR X7, X1 - PSHUFD $0x39, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x93, X2, X2 - - // round 6 - MOVAPS X5, X7 - SHUFPS $0xd6, X6, X7 - SHUFPS $0x39, X7, X7 - PADDD X7, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X11 - PSRLL $0x0c, X1 - PSLLL $0x14, X11 - POR X11, X1 - MOVAPS X10, X11 - SHUFPS $0xfa, X4, X11 - PSHUFD $0x0f, X5, X5 - PBLENDW $0x33, X5, X11 - PADDD X11, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X5 - PSRLL $0x07, X1 - PSLLL $0x19, X5 - POR X5, X1 - PSHUFD $0x93, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x39, X2, X2 - MOVAPS X4, X12 - PUNPCKLLQ X6, X12 - PBLENDW $0xc0, X10, X12 - SHUFPS $0xb4, X12, X12 - PADDD X12, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X5 - PSRLL $0x0c, X1 - PSLLL $0x14, X5 - POR X5, X1 - MOVAPS X6, X5 - PUNPCKHLQ X4, X5 - MOVAPS X10, X4 - PUNPCKLLQ X5, X4 - SHUFPS $0x1e, X4, X4 - PADDD X4, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X5 - PSRLL $0x07, X1 - PSLLL $0x19, X5 - POR X5, X1 - PSHUFD $0x39, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x93, X2, X2 - - // round 7 - MOVAPS X7, X5 - SHUFPS $0xd6, X11, X5 - SHUFPS $0x39, X5, X5 - PADDD X5, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X5 - PSRLL $0x0c, X1 - PSLLL $0x14, X5 - POR X5, X1 - MOVAPS X12, X5 - SHUFPS $0xfa, X4, X5 - PSHUFD $0x0f, X7, X6 - PBLENDW $0x33, X6, X5 - PADDD X5, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X9, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X5 - PSRLL $0x07, X1 - PSLLL $0x19, X5 - POR X5, X1 - PSHUFD $0x93, X0, X0 - PSHUFD $0x4e, X3, X3 - PSHUFD $0x39, X2, X2 - MOVAPS X4, X5 - PUNPCKLLQ X11, X5 - PBLENDW $0xc0, X12, X5 - SHUFPS $0xb4, X5, X5 - PADDD X5, X0 - PADDD X1, X0 - PXOR X0, X3 - PSHUFB X8, X3 - PADDD X3, X2 - PXOR X2, X1 - MOVAPS X1, X5 - PSRLL $0x0c, X1 - PSLLL $0x14, X5 - POR X5, X1 - MOVAPS X11, X6 - PUNPCKHLQ X4, X6 - MOVAPS X12, X4 - PUNPCKLLQ X6, X4 - SHUFPS $0x1e, X4, X4 - PADDD X4, X0 - 
PADDD X1, X0
-	PXOR X0, X3
-	PSHUFB X9, X3
-	PADDD X3, X2
-	PXOR X2, X1
-	MOVAPS X1, X4
-	PSRLL $0x07, X1
-	PSLLL $0x19, X4
-	POR X4, X1
-	PSHUFD $0x39, X0, X0
-	PSHUFD $0x4e, X3, X3
-	PSHUFD $0x93, X2, X2
-
-	// finalize
-	PXOR X2, X0
-	PXOR X3, X1
-	MOVUPS (AX), X4
-	PXOR X4, X2
-	MOVUPS 16(AX), X4
-	PXOR X4, X3
-	MOVUPS X0, (SI)
-	MOVUPS X1, 16(SI)
-	MOVUPS X2, 32(SI)
-	MOVUPS X3, 48(SI)
-	RET
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_other.go b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_other.go
deleted file mode 100644
index cd63e9740..000000000
--- a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_other.go
+++ /dev/null
@@ -1,9 +0,0 @@
-// +build !amd64
-
-package compress_sse41
-
-import "github.com/zeebo/blake3/internal/alg/compress/compress_pure"
-
-func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32) {
-	compress_pure.Compress(chain, block, counter, blen, flags, out)
-}
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/stubs.go b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/stubs.go
deleted file mode 100644
index ffd932d3c..000000000
--- a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/stubs.go
+++ /dev/null
@@ -1,6 +0,0 @@
-// +build amd64
-
-package compress_sse41
-
-//go:noescape
-func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32)
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash.go
deleted file mode 100644
index ac43abb69..000000000
--- a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash.go
+++ /dev/null
@@ -1,23 +0,0 @@
-package hash
-
-import (
-	"github.com/zeebo/blake3/internal/alg/hash/hash_avx2"
-	"github.com/zeebo/blake3/internal/alg/hash/hash_pure"
-	"github.com/zeebo/blake3/internal/consts"
-)
-
-func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) {
-	if consts.HasAVX2 && length > 2*consts.ChunkLen {
-		hash_avx2.HashF(input, length, counter, flags, key, out, chain)
-	} else {
-		hash_pure.HashF(input, length, counter, flags, key, out, chain)
-	}
-}
-
-func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int) {
-	if consts.HasAVX2 && n >= 2 {
-		hash_avx2.HashP(left, right, flags, key, out, n)
-	} else {
-		hash_pure.HashP(left, right, flags, key, out, n)
-	}
-}
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_amd64.s b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_amd64.s
deleted file mode 100644
index d7531664b..000000000
--- a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_amd64.s
+++ /dev/null
@@ -1,2561 +0,0 @@
-// Code generated by command: go run main.go. DO NOT EDIT.
- -#include "textflag.h" - -DATA iv<>+0(SB)/4, $0x6a09e667 -DATA iv<>+4(SB)/4, $0xbb67ae85 -DATA iv<>+8(SB)/4, $0x3c6ef372 -DATA iv<>+12(SB)/4, $0xa54ff53a -DATA iv<>+16(SB)/4, $0x510e527f -DATA iv<>+20(SB)/4, $0x9b05688c -DATA iv<>+24(SB)/4, $0x1f83d9ab -DATA iv<>+28(SB)/4, $0x5be0cd19 -GLOBL iv<>(SB), RODATA|NOPTR, $32 - -DATA rot16_shuf<>+0(SB)/1, $0x02 -DATA rot16_shuf<>+1(SB)/1, $0x03 -DATA rot16_shuf<>+2(SB)/1, $0x00 -DATA rot16_shuf<>+3(SB)/1, $0x01 -DATA rot16_shuf<>+4(SB)/1, $0x06 -DATA rot16_shuf<>+5(SB)/1, $0x07 -DATA rot16_shuf<>+6(SB)/1, $0x04 -DATA rot16_shuf<>+7(SB)/1, $0x05 -DATA rot16_shuf<>+8(SB)/1, $0x0a -DATA rot16_shuf<>+9(SB)/1, $0x0b -DATA rot16_shuf<>+10(SB)/1, $0x08 -DATA rot16_shuf<>+11(SB)/1, $0x09 -DATA rot16_shuf<>+12(SB)/1, $0x0e -DATA rot16_shuf<>+13(SB)/1, $0x0f -DATA rot16_shuf<>+14(SB)/1, $0x0c -DATA rot16_shuf<>+15(SB)/1, $0x0d -DATA rot16_shuf<>+16(SB)/1, $0x12 -DATA rot16_shuf<>+17(SB)/1, $0x13 -DATA rot16_shuf<>+18(SB)/1, $0x10 -DATA rot16_shuf<>+19(SB)/1, $0x11 -DATA rot16_shuf<>+20(SB)/1, $0x16 -DATA rot16_shuf<>+21(SB)/1, $0x17 -DATA rot16_shuf<>+22(SB)/1, $0x14 -DATA rot16_shuf<>+23(SB)/1, $0x15 -DATA rot16_shuf<>+24(SB)/1, $0x1a -DATA rot16_shuf<>+25(SB)/1, $0x1b -DATA rot16_shuf<>+26(SB)/1, $0x18 -DATA rot16_shuf<>+27(SB)/1, $0x19 -DATA rot16_shuf<>+28(SB)/1, $0x1e -DATA rot16_shuf<>+29(SB)/1, $0x1f -DATA rot16_shuf<>+30(SB)/1, $0x1c -DATA rot16_shuf<>+31(SB)/1, $0x1d -GLOBL rot16_shuf<>(SB), RODATA|NOPTR, $32 - -DATA rot8_shuf<>+0(SB)/1, $0x01 -DATA rot8_shuf<>+1(SB)/1, $0x02 -DATA rot8_shuf<>+2(SB)/1, $0x03 -DATA rot8_shuf<>+3(SB)/1, $0x00 -DATA rot8_shuf<>+4(SB)/1, $0x05 -DATA rot8_shuf<>+5(SB)/1, $0x06 -DATA rot8_shuf<>+6(SB)/1, $0x07 -DATA rot8_shuf<>+7(SB)/1, $0x04 -DATA rot8_shuf<>+8(SB)/1, $0x09 -DATA rot8_shuf<>+9(SB)/1, $0x0a -DATA rot8_shuf<>+10(SB)/1, $0x0b -DATA rot8_shuf<>+11(SB)/1, $0x08 -DATA rot8_shuf<>+12(SB)/1, $0x0d -DATA rot8_shuf<>+13(SB)/1, $0x0e -DATA rot8_shuf<>+14(SB)/1, $0x0f -DATA rot8_shuf<>+15(SB)/1, $0x0c -DATA rot8_shuf<>+16(SB)/1, $0x11 -DATA rot8_shuf<>+17(SB)/1, $0x12 -DATA rot8_shuf<>+18(SB)/1, $0x13 -DATA rot8_shuf<>+19(SB)/1, $0x10 -DATA rot8_shuf<>+20(SB)/1, $0x15 -DATA rot8_shuf<>+21(SB)/1, $0x16 -DATA rot8_shuf<>+22(SB)/1, $0x17 -DATA rot8_shuf<>+23(SB)/1, $0x14 -DATA rot8_shuf<>+24(SB)/1, $0x19 -DATA rot8_shuf<>+25(SB)/1, $0x1a -DATA rot8_shuf<>+26(SB)/1, $0x1b -DATA rot8_shuf<>+27(SB)/1, $0x18 -DATA rot8_shuf<>+28(SB)/1, $0x1d -DATA rot8_shuf<>+29(SB)/1, $0x1e -DATA rot8_shuf<>+30(SB)/1, $0x1f -DATA rot8_shuf<>+31(SB)/1, $0x1c -GLOBL rot8_shuf<>(SB), RODATA|NOPTR, $32 - -DATA block_len<>+0(SB)/4, $0x00000040 -DATA block_len<>+4(SB)/4, $0x00000040 -DATA block_len<>+8(SB)/4, $0x00000040 -DATA block_len<>+12(SB)/4, $0x00000040 -DATA block_len<>+16(SB)/4, $0x00000040 -DATA block_len<>+20(SB)/4, $0x00000040 -DATA block_len<>+24(SB)/4, $0x00000040 -DATA block_len<>+28(SB)/4, $0x00000040 -GLOBL block_len<>(SB), RODATA|NOPTR, $32 - -DATA zero<>+0(SB)/4, $0x00000000 -DATA zero<>+4(SB)/4, $0x00000000 -DATA zero<>+8(SB)/4, $0x00000000 -DATA zero<>+12(SB)/4, $0x00000000 -DATA zero<>+16(SB)/4, $0x00000000 -DATA zero<>+20(SB)/4, $0x00000000 -DATA zero<>+24(SB)/4, $0x00000000 -DATA zero<>+28(SB)/4, $0x00000000 -GLOBL zero<>(SB), RODATA|NOPTR, $32 - -DATA counter<>+0(SB)/8, $0x0000000000000000 -DATA counter<>+8(SB)/8, $0x0000000000000001 -DATA counter<>+16(SB)/8, $0x0000000000000002 -DATA counter<>+24(SB)/8, $0x0000000000000003 -DATA counter<>+32(SB)/8, $0x0000000000000004 -DATA counter<>+40(SB)/8, $0x0000000000000005 
-DATA counter<>+48(SB)/8, $0x0000000000000006 -DATA counter<>+56(SB)/8, $0x0000000000000007 -GLOBL counter<>(SB), RODATA|NOPTR, $64 - -// func HashF(input *[8192]byte, length uint64, counter uint64, flags uint32, key *[8]uint32, out *[32]uint32, chain *[8]uint32) -// Requires: AVX, AVX2 -TEXT ·HashF(SB), $688-56 - MOVQ input+0(FP), AX - MOVQ length+8(FP), CX - MOVQ counter+16(FP), DX - MOVL flags+24(FP), BX - MOVQ key+32(FP), BP - MOVQ out+40(FP), SI - MOVQ chain+48(FP), DI - - // Allocate local space and align it - LEAQ 31(SP), R10 - MOVQ $0x000000000000001f, R8 - NOTQ R8 - ANDQ R8, R10 - - // Skip if the length is zero - XORQ R8, R8 - XORQ R9, R9 - TESTQ CX, CX - JZ skip_compute - - // Compute complete chunks and blocks - SUBQ $0x01, CX - MOVQ CX, R8 - SHRQ $0x0a, R8 - MOVQ CX, R9 - ANDQ $0x000003c0, R9 - -skip_compute: - // Load some params into the stack (avo improvment?) - MOVL BX, 64(SP) - MOVQ DX, 72(SP) - - // Load IV into vectors - VPBROADCASTD (BP), Y0 - VPBROADCASTD 4(BP), Y1 - VPBROADCASTD 8(BP), Y2 - VPBROADCASTD 12(BP), Y3 - VPBROADCASTD 16(BP), Y4 - VPBROADCASTD 20(BP), Y5 - VPBROADCASTD 24(BP), Y6 - VPBROADCASTD 28(BP), Y7 - - // Build and store counter data on the stack - VPBROADCASTQ 72(SP), Y8 - VPADDQ counter<>+0(SB), Y8, Y8 - VPBROADCASTQ 72(SP), Y9 - VPADDQ counter<>+32(SB), Y9, Y9 - VPUNPCKLDQ Y9, Y8, Y10 - VPUNPCKHDQ Y9, Y8, Y8 - VPUNPCKLDQ Y8, Y10, Y9 - VPUNPCKHDQ Y8, Y10, Y8 - VPERMQ $0xd8, Y9, Y9 - VPERMQ $0xd8, Y8, Y8 - VMOVDQU Y9, 112(SP) - VMOVDQU Y8, 144(SP) - - // Set up block flags and variables for iteration - XORQ CX, CX - ORL $0x01, 64(SP) - -loop: - // Include end flags if last block - CMPQ CX, $0x000003c0 - JNE round_setup - ORL $0x02, 64(SP) - -round_setup: - // Load and transpose message vectors - VMOVDQU (AX)(CX*1), Y8 - VMOVDQU 1024(AX)(CX*1), Y9 - VMOVDQU 2048(AX)(CX*1), Y10 - VMOVDQU 3072(AX)(CX*1), Y11 - VMOVDQU 4096(AX)(CX*1), Y12 - VMOVDQU 5120(AX)(CX*1), Y13 - VMOVDQU 6144(AX)(CX*1), Y14 - VMOVDQU 7168(AX)(CX*1), Y15 - VMOVDQA Y0, (R10) - VPUNPCKLDQ Y9, Y8, Y0 - VPUNPCKHDQ Y9, Y8, Y8 - VPUNPCKLDQ Y11, Y10, Y9 - VPUNPCKHDQ Y11, Y10, Y10 - VPUNPCKLDQ Y13, Y12, Y11 - VPUNPCKHDQ Y13, Y12, Y12 - VPUNPCKLDQ Y15, Y14, Y13 - VPUNPCKHDQ Y15, Y14, Y14 - VPUNPCKLQDQ Y9, Y0, Y15 - VPUNPCKHQDQ Y9, Y0, Y0 - VPUNPCKLQDQ Y10, Y8, Y9 - VPUNPCKHQDQ Y10, Y8, Y8 - VPUNPCKLQDQ Y13, Y11, Y10 - VPUNPCKHQDQ Y13, Y11, Y11 - VPUNPCKLQDQ Y14, Y12, Y13 - VPUNPCKHQDQ Y14, Y12, Y12 - VINSERTI128 $0x01, X10, Y15, Y14 - VPERM2I128 $0x31, Y10, Y15, Y10 - VINSERTI128 $0x01, X11, Y0, Y15 - VPERM2I128 $0x31, Y11, Y0, Y0 - VINSERTI128 $0x01, X13, Y9, Y11 - VPERM2I128 $0x31, Y13, Y9, Y9 - VINSERTI128 $0x01, X12, Y8, Y13 - VPERM2I128 $0x31, Y12, Y8, Y8 - VMOVDQU Y14, 176(SP) - VMOVDQU Y15, 208(SP) - VMOVDQU Y11, 240(SP) - VMOVDQU Y13, 272(SP) - VMOVDQU Y10, 304(SP) - VMOVDQU Y0, 336(SP) - VMOVDQU Y9, 368(SP) - VMOVDQU Y8, 400(SP) - VMOVDQU 32(AX)(CX*1), Y0 - VMOVDQU 1056(AX)(CX*1), Y8 - VMOVDQU 2080(AX)(CX*1), Y9 - VMOVDQU 3104(AX)(CX*1), Y10 - VMOVDQU 4128(AX)(CX*1), Y11 - VMOVDQU 5152(AX)(CX*1), Y12 - VMOVDQU 6176(AX)(CX*1), Y13 - VMOVDQU 7200(AX)(CX*1), Y14 - VPUNPCKLDQ Y8, Y0, Y15 - VPUNPCKHDQ Y8, Y0, Y0 - VPUNPCKLDQ Y10, Y9, Y8 - VPUNPCKHDQ Y10, Y9, Y9 - VPUNPCKLDQ Y12, Y11, Y10 - VPUNPCKHDQ Y12, Y11, Y11 - VPUNPCKLDQ Y14, Y13, Y12 - VPUNPCKHDQ Y14, Y13, Y13 - VPUNPCKLQDQ Y8, Y15, Y14 - VPUNPCKHQDQ Y8, Y15, Y8 - VPUNPCKLQDQ Y9, Y0, Y15 - VPUNPCKHQDQ Y9, Y0, Y0 - VPUNPCKLQDQ Y12, Y10, Y9 - VPUNPCKHQDQ Y12, Y10, Y10 - VPUNPCKLQDQ Y13, Y11, Y12 - VPUNPCKHQDQ Y13, Y11, Y11 - 
VINSERTI128 $0x01, X9, Y14, Y13 - VPERM2I128 $0x31, Y9, Y14, Y9 - VINSERTI128 $0x01, X10, Y8, Y14 - VPERM2I128 $0x31, Y10, Y8, Y8 - VINSERTI128 $0x01, X12, Y15, Y10 - VPERM2I128 $0x31, Y12, Y15, Y12 - VINSERTI128 $0x01, X11, Y0, Y15 - VPERM2I128 $0x31, Y11, Y0, Y0 - VMOVDQU Y13, 432(SP) - VMOVDQU Y14, 464(SP) - VMOVDQU Y10, 496(SP) - VMOVDQU Y15, 528(SP) - VMOVDQU Y9, 560(SP) - VMOVDQU Y8, 592(SP) - VMOVDQU Y12, 624(SP) - VMOVDQU Y0, 656(SP) - - // Load constants for the round - VMOVDQA (R10), Y0 - VMOVDQU block_len<>+0(SB), Y8 - VPBROADCASTD 64(SP), Y9 - VPBROADCASTD iv<>+0(SB), Y10 - VPBROADCASTD iv<>+4(SB), Y11 - VPBROADCASTD iv<>+8(SB), Y12 - VPBROADCASTD iv<>+12(SB), Y13 - VMOVDQU 112(SP), Y14 - VMOVDQU 144(SP), Y15 - - // Save state for partial chunk if necessary - CMPQ CX, R9 - JNE begin_rounds - VMOVDQU Y0, 80(SP) - MOVL 80(SP)(R8*4), DX - MOVL DX, (DI) - VMOVDQU Y1, 80(SP) - MOVL 80(SP)(R8*4), DX - MOVL DX, 4(DI) - VMOVDQU Y2, 80(SP) - MOVL 80(SP)(R8*4), DX - MOVL DX, 8(DI) - VMOVDQU Y3, 80(SP) - MOVL 80(SP)(R8*4), DX - MOVL DX, 12(DI) - VMOVDQU Y4, 80(SP) - MOVL 80(SP)(R8*4), DX - MOVL DX, 16(DI) - VMOVDQU Y5, 80(SP) - MOVL 80(SP)(R8*4), DX - MOVL DX, 20(DI) - VMOVDQU Y6, 80(SP) - MOVL 80(SP)(R8*4), DX - MOVL DX, 24(DI) - VMOVDQU Y7, 80(SP) - MOVL 80(SP)(R8*4), DX - MOVL DX, 28(DI) - -begin_rounds: - // Perform the rounds - // Round 1 - VPADDD 176(SP), Y0, Y0 - VPADDD 240(SP), Y1, Y1 - VPADDD 304(SP), Y2, Y2 - VPADDD 368(SP), Y3, Y3 - VPADDD Y4, Y0, Y0 - VPXOR Y0, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y7, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y4, Y4 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y5, Y5 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y6, Y6 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y7, Y7 - VMOVDQA Y0, (R10) - VPSRLD $0x0c, Y4, Y0 - VPSLLD $0x14, Y4, Y4 - VPOR Y0, Y4, Y0 - VPSRLD $0x0c, Y5, Y4 - VPSLLD $0x14, Y5, Y5 - VPOR Y4, Y5, Y4 - VPSRLD $0x0c, Y6, Y5 - VPSLLD $0x14, Y6, Y6 - VPOR Y5, Y6, Y5 - VPSRLD $0x0c, Y7, Y6 - VPSLLD $0x14, Y7, Y7 - VPOR Y6, Y7, Y6 - VMOVDQA (R10), Y7 - VPADDD 208(SP), Y7, Y7 - VPADDD 272(SP), Y1, Y1 - VPADDD 336(SP), Y2, Y2 - VPADDD 400(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 432(SP), Y7, Y7 - VPADDD 496(SP), Y1, Y1 - VPADDD 560(SP), Y2, Y2 - VPADDD 624(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - 
VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VMOVDQA (R10), Y7 - VPADDD 464(SP), Y7, Y7 - VPADDD 528(SP), Y1, Y1 - VPADDD 592(SP), Y2, Y2 - VPADDD 656(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - - // Round 2 - VMOVDQA (R10), Y7 - VPADDD 240(SP), Y7, Y7 - VPADDD 272(SP), Y1, Y1 - VPADDD 400(SP), Y2, Y2 - VPADDD 304(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 368(SP), Y7, Y7 - VPADDD 496(SP), Y1, Y1 - VPADDD 176(SP), Y2, Y2 - VPADDD 592(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 208(SP), Y7, Y7 - VPADDD 560(SP), Y1, Y1 - VPADDD 464(SP), Y2, Y2 - VPADDD 656(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, 
Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VMOVDQA (R10), Y7 - VPADDD 528(SP), Y7, Y7 - VPADDD 336(SP), Y1, Y1 - VPADDD 624(SP), Y2, Y2 - VPADDD 432(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - - // Round 3 - VMOVDQA (R10), Y7 - VPADDD 272(SP), Y7, Y7 - VPADDD 496(SP), Y1, Y1 - VPADDD 592(SP), Y2, Y2 - VPADDD 400(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 304(SP), Y7, Y7 - VPADDD 560(SP), Y1, Y1 - VPADDD 240(SP), Y2, Y2 - VPADDD 624(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 368(SP), Y7, Y7 - VPADDD 464(SP), Y1, Y1 - VPADDD 528(SP), Y2, Y2 - VPADDD 432(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - 
VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VMOVDQA (R10), Y7 - VPADDD 336(SP), Y7, Y7 - VPADDD 176(SP), Y1, Y1 - VPADDD 656(SP), Y2, Y2 - VPADDD 208(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - - // Round 4 - VMOVDQA (R10), Y7 - VPADDD 496(SP), Y7, Y7 - VPADDD 560(SP), Y1, Y1 - VPADDD 624(SP), Y2, Y2 - VPADDD 592(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 400(SP), Y7, Y7 - VPADDD 464(SP), Y1, Y1 - VPADDD 272(SP), Y2, Y2 - VPADDD 656(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 304(SP), Y7, Y7 - VPADDD 528(SP), Y1, Y1 - VPADDD 336(SP), Y2, Y2 - VPADDD 208(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, 
Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VMOVDQA (R10), Y7 - VPADDD 176(SP), Y7, Y7 - VPADDD 240(SP), Y1, Y1 - VPADDD 432(SP), Y2, Y2 - VPADDD 368(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - - // Round 5 - VMOVDQA (R10), Y7 - VPADDD 560(SP), Y7, Y7 - VPADDD 464(SP), Y1, Y1 - VPADDD 656(SP), Y2, Y2 - VPADDD 624(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 592(SP), Y7, Y7 - VPADDD 528(SP), Y1, Y1 - VPADDD 496(SP), Y2, Y2 - VPADDD 432(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 400(SP), Y7, Y7 - VPADDD 336(SP), Y1, Y1 - VPADDD 176(SP), Y2, Y2 - VPADDD 368(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - 
VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VMOVDQA (R10), Y7 - VPADDD 240(SP), Y7, Y7 - VPADDD 272(SP), Y1, Y1 - VPADDD 208(SP), Y2, Y2 - VPADDD 304(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - - // Round 6 - VMOVDQA (R10), Y7 - VPADDD 464(SP), Y7, Y7 - VPADDD 528(SP), Y1, Y1 - VPADDD 432(SP), Y2, Y2 - VPADDD 656(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 624(SP), Y7, Y7 - VPADDD 336(SP), Y1, Y1 - VPADDD 560(SP), Y2, Y2 - VPADDD 208(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 592(SP), Y7, Y7 - VPADDD 176(SP), Y1, Y1 - VPADDD 240(SP), Y2, Y2 - VPADDD 304(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, 
(R10) - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VMOVDQA (R10), Y7 - VPADDD 272(SP), Y7, Y7 - VPADDD 496(SP), Y1, Y1 - VPADDD 368(SP), Y2, Y2 - VPADDD 400(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - - // Round 7 - VMOVDQA (R10), Y7 - VPADDD 528(SP), Y7, Y7 - VPADDD 336(SP), Y1, Y1 - VPADDD 208(SP), Y2, Y2 - VPADDD 432(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x0c, Y4, Y7 - VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 656(SP), Y7, Y7 - VPADDD 176(SP), Y1, Y1 - VPADDD 464(SP), Y2, Y2 - VPADDD 368(SP), Y3, Y3 - VPADDD Y0, Y7, Y7 - VPXOR Y7, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y5, Y2, Y2 - VPXOR Y2, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y6, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y14, Y10, Y10 - VPXOR Y10, Y0, Y0 - VPADDD Y15, Y11, Y11 - VPXOR Y11, Y4, Y4 - VPADDD Y8, Y12, Y12 - VPXOR Y12, Y5, Y5 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y6, Y6 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VMOVDQA (R10), Y7 - VPADDD 624(SP), Y7, Y7 - VPADDD 240(SP), Y1, Y1 - VPADDD 272(SP), Y2, Y2 - VPADDD 400(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x0c, Y4, Y7 - 
VPSLLD $0x14, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x0c, Y5, Y7 - VPSLLD $0x14, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x0c, Y6, Y7 - VPSLLD $0x14, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x0c, Y0, Y7 - VPSLLD $0x14, Y0, Y0 - VPOR Y7, Y0, Y0 - VMOVDQA (R10), Y7 - VPADDD 496(SP), Y7, Y7 - VPADDD 560(SP), Y1, Y1 - VPADDD 304(SP), Y2, Y2 - VPADDD 592(SP), Y3, Y3 - VPADDD Y4, Y7, Y7 - VPXOR Y7, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y5, Y1, Y1 - VPXOR Y1, Y14, Y14 - VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y15, Y15 - VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y8, Y8 - VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 - VPADDD Y9, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPADDD Y14, Y13, Y13 - VPXOR Y13, Y5, Y5 - VPADDD Y15, Y10, Y10 - VPXOR Y10, Y6, Y6 - VPADDD Y8, Y11, Y11 - VPXOR Y11, Y0, Y0 - VMOVDQA Y7, (R10) - VPSRLD $0x07, Y4, Y7 - VPSLLD $0x19, Y4, Y4 - VPOR Y7, Y4, Y4 - VPSRLD $0x07, Y5, Y7 - VPSLLD $0x19, Y5, Y5 - VPOR Y7, Y5, Y5 - VPSRLD $0x07, Y6, Y7 - VPSLLD $0x19, Y6, Y6 - VPOR Y7, Y6, Y6 - VPSRLD $0x07, Y0, Y7 - VPSLLD $0x19, Y0, Y0 - VPOR Y7, Y0, Y0 - - // Finalize rounds - VPXOR Y9, Y6, Y6 - VPXOR (R10), Y10, Y7 - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y2, Y2 - VPXOR Y13, Y3, Y3 - VPXOR Y14, Y0, Y0 - VPXOR Y15, Y4, Y4 - VPXOR Y8, Y5, Y5 - - // Fix up registers for next iteration - VMOVDQU Y7, Y8 - VMOVDQU Y6, Y7 - VMOVDQU Y5, Y6 - VMOVDQU Y4, Y5 - VMOVDQU Y0, Y4 - VMOVDQU Y8, Y0 - - // If we have zero complete chunks, we're done - CMPQ R8, $0x00 - JNE loop_trailer - CMPQ R9, CX - JEQ finalize - -loop_trailer: - // Increment, reset flags, and loop - CMPQ CX, $0x000003c0 - JEQ finalize - ADDQ $0x40, CX - MOVL BX, 64(SP) - JMP loop - -finalize: - // Store result into out - VMOVDQU Y0, (SI) - VMOVDQU Y1, 32(SI) - VMOVDQU Y2, 64(SI) - VMOVDQU Y3, 96(SI) - VMOVDQU Y4, 128(SI) - VMOVDQU Y5, 160(SI) - VMOVDQU Y6, 192(SI) - VMOVDQU Y7, 224(SI) - VZEROUPPER - RET - -// func HashP(left *[32]uint32, right *[32]uint32, flags uint8, key *[8]uint32, out *[32]uint32, n int) -// Requires: AVX, AVX2 -TEXT ·HashP(SB), NOSPLIT, $72-48 - MOVQ left+0(FP), AX - MOVQ right+8(FP), CX - MOVBLZX flags+16(FP), DX - MOVQ key+24(FP), BX - MOVQ out+32(FP), BP - - // Allocate local space and align it - LEAQ 31(SP), SI - MOVQ $0x000000000000001f, DI - NOTQ DI - ANDQ DI, SI - - // Set up flags value - MOVL DX, 64(SP) - - // Perform the rounds - // Round 1 - VPBROADCASTD (BX), Y0 - VPADDD (AX), Y0, Y0 - VPBROADCASTD 4(BX), Y1 - VPADDD 64(AX), Y1, Y1 - VPBROADCASTD 8(BX), Y2 - VPADDD 128(AX), Y2, Y2 - VPBROADCASTD 12(BX), Y3 - VPADDD 192(AX), Y3, Y3 - VPBROADCASTD 16(BX), Y4 - VPADDD Y4, Y0, Y0 - VMOVDQU zero<>+0(SB), Y5 - VPXOR Y0, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPBROADCASTD 20(BX), Y6 - VPADDD Y6, Y1, Y1 - VMOVDQU zero<>+0(SB), Y7 - VPXOR Y1, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPBROADCASTD 24(BX), Y8 - VPADDD Y8, Y2, Y2 - VMOVDQU block_len<>+0(SB), Y9 - VPXOR Y2, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPBROADCASTD 28(BX), Y10 - VPADDD Y10, Y3, Y3 - VPBROADCASTD 64(SP), Y11 - VPXOR Y3, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPBROADCASTD iv<>+0(SB), Y12 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y4, Y4 - VPBROADCASTD iv<>+4(SB), Y13 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y6, Y6 - VPBROADCASTD iv<>+8(SB), Y14 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y8, Y8 - VPBROADCASTD iv<>+12(SB), Y15 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y10, Y10 - VMOVDQA Y0, (SI) - VPSRLD $0x0c, Y4, Y0 - VPSLLD $0x14, Y4, Y4 - VPOR Y0, Y4, Y0 - VPSRLD $0x0c, Y6, Y4 - VPSLLD $0x14, Y6, Y6 - VPOR Y4, 
Y6, Y4 - VPSRLD $0x0c, Y8, Y6 - VPSLLD $0x14, Y8, Y8 - VPOR Y6, Y8, Y6 - VPSRLD $0x0c, Y10, Y8 - VPSLLD $0x14, Y10, Y10 - VPOR Y8, Y10, Y8 - VMOVDQA (SI), Y10 - VPADDD 32(AX), Y10, Y10 - VPADDD 96(AX), Y1, Y1 - VPADDD 160(AX), Y2, Y2 - VPADDD 224(AX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD (CX), Y10, Y10 - VPADDD 64(CX), Y1, Y1 - VPADDD 128(CX), Y2, Y2 - VPADDD 192(CX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, Y6, Y10 - VPSLLD $0x14, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VMOVDQA (SI), Y10 - VPADDD 32(CX), Y10, Y10 - VPADDD 96(CX), Y1, Y1 - VPADDD 160(CX), Y2, Y2 - VPADDD 224(CX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - - // Round 2 - VMOVDQA (SI), Y10 - VPADDD 64(AX), Y10, Y10 - VPADDD 96(AX), Y1, Y1 - VPADDD 224(AX), Y2, Y2 - VPADDD 128(AX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - 
VPSRLD $0x0c, Y6, Y10 - VPSLLD $0x14, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD 192(AX), Y10, Y10 - VPADDD 64(CX), Y1, Y1 - VPADDD (AX), Y2, Y2 - VPADDD 160(CX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD 32(AX), Y10, Y10 - VPADDD 128(CX), Y1, Y1 - VPADDD 32(CX), Y2, Y2 - VPADDD 224(CX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, Y6, Y10 - VPSLLD $0x14, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VMOVDQA (SI), Y10 - VPADDD 96(CX), Y10, Y10 - VPADDD 160(AX), Y1, Y1 - VPADDD 192(CX), Y2, Y2 - VPADDD (CX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - - // Round 3 - VMOVDQA (SI), Y10 - VPADDD 96(AX), Y10, Y10 - VPADDD 64(CX), Y1, Y1 - VPADDD 160(CX), Y2, Y2 - VPADDD 224(AX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, 
Y6, Y10 - VPSLLD $0x14, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD 128(AX), Y10, Y10 - VPADDD 128(CX), Y1, Y1 - VPADDD 64(AX), Y2, Y2 - VPADDD 192(CX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD 192(AX), Y10, Y10 - VPADDD 32(CX), Y1, Y1 - VPADDD 96(CX), Y2, Y2 - VPADDD (CX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, Y6, Y10 - VPSLLD $0x14, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VMOVDQA (SI), Y10 - VPADDD 160(AX), Y10, Y10 - VPADDD (AX), Y1, Y1 - VPADDD 224(CX), Y2, Y2 - VPADDD 32(AX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - - // Round 4 - VMOVDQA (SI), Y10 - VPADDD 64(CX), Y10, Y10 - VPADDD 128(CX), Y1, Y1 - VPADDD 192(CX), Y2, Y2 - VPADDD 160(CX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, Y6, Y10 - 
VPSLLD $0x14, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD 224(AX), Y10, Y10 - VPADDD 32(CX), Y1, Y1 - VPADDD 96(AX), Y2, Y2 - VPADDD 224(CX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD 128(AX), Y10, Y10 - VPADDD 96(CX), Y1, Y1 - VPADDD 160(AX), Y2, Y2 - VPADDD 32(AX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, Y6, Y10 - VPSLLD $0x14, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VMOVDQA (SI), Y10 - VPADDD (AX), Y10, Y10 - VPADDD 64(AX), Y1, Y1 - VPADDD (CX), Y2, Y2 - VPADDD 192(AX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - - // Round 5 - VMOVDQA (SI), Y10 - VPADDD 128(CX), Y10, Y10 - VPADDD 32(CX), Y1, Y1 - VPADDD 224(CX), Y2, Y2 - VPADDD 192(CX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, Y6, Y10 - VPSLLD $0x14, 
Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD 160(CX), Y10, Y10 - VPADDD 96(CX), Y1, Y1 - VPADDD 64(CX), Y2, Y2 - VPADDD (CX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD 224(AX), Y10, Y10 - VPADDD 160(AX), Y1, Y1 - VPADDD (AX), Y2, Y2 - VPADDD 192(AX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, Y6, Y10 - VPSLLD $0x14, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VMOVDQA (SI), Y10 - VPADDD 64(AX), Y10, Y10 - VPADDD 96(AX), Y1, Y1 - VPADDD 32(AX), Y2, Y2 - VPADDD 128(AX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - - // Round 6 - VMOVDQA (SI), Y10 - VPADDD 32(CX), Y10, Y10 - VPADDD 96(CX), Y1, Y1 - VPADDD (CX), Y2, Y2 - VPADDD 224(CX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, Y6, Y10 - VPSLLD $0x14, Y6, Y6 - VPOR Y10, 
Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD 192(CX), Y10, Y10 - VPADDD 160(AX), Y1, Y1 - VPADDD 128(CX), Y2, Y2 - VPADDD 32(AX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD 160(CX), Y10, Y10 - VPADDD (AX), Y1, Y1 - VPADDD 64(AX), Y2, Y2 - VPADDD 128(AX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, Y6, Y10 - VPSLLD $0x14, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VMOVDQA (SI), Y10 - VPADDD 96(AX), Y10, Y10 - VPADDD 64(CX), Y1, Y1 - VPADDD 192(AX), Y2, Y2 - VPADDD 224(AX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - - // Round 7 - VMOVDQA (SI), Y10 - VPADDD 96(CX), Y10, Y10 - VPADDD 160(AX), Y1, Y1 - VPADDD 32(AX), Y2, Y2 - VPADDD (CX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, Y6, Y10 - VPSLLD $0x14, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD 
$0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD 224(CX), Y10, Y10 - VPADDD (AX), Y1, Y1 - VPADDD 32(CX), Y2, Y2 - VPADDD 192(AX), Y3, Y3 - VPADDD Y0, Y10, Y10 - VPXOR Y10, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y4, Y1, Y1 - VPXOR Y1, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y6, Y2, Y2 - VPXOR Y2, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y8, Y3, Y3 - VPXOR Y3, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y5, Y12, Y12 - VPXOR Y12, Y0, Y0 - VPADDD Y7, Y13, Y13 - VPXOR Y13, Y4, Y4 - VPADDD Y9, Y14, Y14 - VPXOR Y14, Y6, Y6 - VPADDD Y11, Y15, Y15 - VPXOR Y15, Y8, Y8 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VMOVDQA (SI), Y10 - VPADDD 192(CX), Y10, Y10 - VPADDD 64(AX), Y1, Y1 - VPADDD 96(AX), Y2, Y2 - VPADDD 224(AX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x0c, Y4, Y10 - VPSLLD $0x14, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x0c, Y6, Y10 - VPSLLD $0x14, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x0c, Y8, Y10 - VPSLLD $0x14, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x0c, Y0, Y10 - VPSLLD $0x14, Y0, Y0 - VPOR Y10, Y0, Y0 - VMOVDQA (SI), Y10 - VPADDD 64(CX), Y10, Y10 - VPADDD 128(CX), Y1, Y1 - VPADDD 128(AX), Y2, Y2 - VPADDD 160(CX), Y3, Y3 - VPADDD Y4, Y10, Y10 - VPXOR Y10, Y11, Y11 - VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 - VPADDD Y6, Y1, Y1 - VPXOR Y1, Y5, Y5 - VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 - VPADDD Y8, Y2, Y2 - VPXOR Y2, Y7, Y7 - VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 - VPADDD Y0, Y3, Y3 - VPXOR Y3, Y9, Y9 - VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 - VPADDD Y11, Y14, Y14 - VPXOR Y14, Y4, Y4 - VPADDD Y5, Y15, Y15 - VPXOR Y15, Y6, Y6 - VPADDD Y7, Y12, Y12 - VPXOR Y12, Y8, Y8 - VPADDD Y9, Y13, Y13 - VPXOR Y13, Y0, Y0 - VMOVDQA Y10, (SI) - VPSRLD $0x07, Y4, Y10 - VPSLLD $0x19, Y4, Y4 - VPOR Y10, Y4, Y4 - VPSRLD $0x07, Y6, Y10 - VPSLLD $0x19, Y6, Y6 - VPOR Y10, Y6, Y6 - VPSRLD $0x07, Y8, Y10 - VPSLLD $0x19, Y8, Y8 - VPOR Y10, Y8, Y8 - VPSRLD $0x07, Y0, Y10 - VPSLLD $0x19, Y0, Y0 - VPOR Y10, Y0, Y0 - - // Finalize - VPXOR (SI), Y12, Y10 - VPXOR Y13, Y1, Y1 - VPXOR Y14, Y2, Y2 - VPXOR Y15, Y3, Y3 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y6, Y5 - VPXOR Y11, Y8, Y6 - - // Store result into out - VMOVDQU Y10, (BP) - VMOVDQU Y1, 32(BP) - VMOVDQU Y2, 64(BP) - VMOVDQU Y3, 96(BP) - VMOVDQU Y0, 128(BP) - VMOVDQU Y4, 160(BP) - VMOVDQU Y5, 192(BP) - VMOVDQU Y6, 224(BP) - VZEROUPPER - RET diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_other.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_other.go deleted file mode 100644 index 613972814..000000000 --- a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_other.go +++ /dev/null @@ -1,13 +0,0 @@ -// +build !amd64 - -package hash_avx2 - -import "github.com/zeebo/blake3/internal/alg/hash/hash_pure" - -func HashF(input *[8192]byte, 
length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) { - hash_pure.HashF(input, length, counter, flags, key, out, chain) -} - -func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int) { - hash_pure.HashP(left, right, flags, key, out, n) -} diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/stubs.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/stubs.go deleted file mode 100644 index 10e949550..000000000 --- a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/stubs.go +++ /dev/null @@ -1,9 +0,0 @@ -// +build amd64 - -package hash_avx2 - -//go:noescape -func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) - -//go:noescape -func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int) diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashf.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashf.go deleted file mode 100644 index 0c6fd63cd..000000000 --- a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashf.go +++ /dev/null @@ -1,56 +0,0 @@ -package hash_pure - -import ( - "unsafe" - - "github.com/zeebo/blake3/internal/alg/compress" - "github.com/zeebo/blake3/internal/consts" - "github.com/zeebo/blake3/internal/utils" -) - -func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) { - var tmp [16]uint32 - - for i := uint64(0); consts.ChunkLen*i < length && i < 8; i++ { - bchain := *key - bflags := flags | consts.Flag_ChunkStart - start := consts.ChunkLen * i - - for n := uint64(0); n < 16; n++ { - if n == 15 { - bflags |= consts.Flag_ChunkEnd - } - if start+64*n >= length { - break - } - if start+64+64*n >= length { - *chain = bchain - } - - var blockPtr *[16]uint32 - if consts.IsLittleEndian { - blockPtr = (*[16]uint32)(unsafe.Pointer(&input[consts.ChunkLen*i+consts.BlockLen*n])) - } else { - var block [16]uint32 - utils.BytesToWords((*[64]uint8)(unsafe.Pointer(&input[consts.ChunkLen*i+consts.BlockLen*n])), &block) - blockPtr = &block - } - - compress.Compress(&bchain, blockPtr, counter, consts.BlockLen, bflags, &tmp) - - bchain = *(*[8]uint32)(unsafe.Pointer(&tmp[0])) - bflags = flags - } - - out[i+0] = bchain[0] - out[i+8] = bchain[1] - out[i+16] = bchain[2] - out[i+24] = bchain[3] - out[i+32] = bchain[4] - out[i+40] = bchain[5] - out[i+48] = bchain[6] - out[i+56] = bchain[7] - - counter++ - } -} diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashp.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashp.go deleted file mode 100644 index bee5d8dd0..000000000 --- a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashp.go +++ /dev/null @@ -1,38 +0,0 @@ -package hash_pure - -import "github.com/zeebo/blake3/internal/alg/compress" - -func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int) { - var tmp [16]uint32 - var block [16]uint32 - - for i := 0; i < n && i < 8; i++ { - block[0] = left[i+0] - block[1] = left[i+8] - block[2] = left[i+16] - block[3] = left[i+24] - block[4] = left[i+32] - block[5] = left[i+40] - block[6] = left[i+48] - block[7] = left[i+56] - block[8] = right[i+0] - block[9] = right[i+8] - block[10] = right[i+16] - block[11] = right[i+24] - block[12] = right[i+32] - block[13] = right[i+40] - block[14] = right[i+48] - block[15] = right[i+56] - - compress.Compress(key, &block, 0, 64, 
flags, &tmp) - - out[i+0] = tmp[0] - out[i+8] = tmp[1] - out[i+16] = tmp[2] - out[i+24] = tmp[3] - out[i+32] = tmp[4] - out[i+40] = tmp[5] - out[i+48] = tmp[6] - out[i+56] = tmp[7] - } -}
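
The deleted hash_avx2 assembly realizes every BLAKE3 G-function rotation either as a VPSRLD/VPSLLD/VPOR triple (shift pair 0x0c/0x14 for a right-rotate by 12, 0x07/0x19 for a right-rotate by 7) or as a VPSHUFB against the rot16_shuf/rot8_shuf tables (byte-granular right-rotates by 16 and 8). The following is a minimal portable sketch of that equivalence, illustrative only and not part of the vendored code:

package main

import (
	"fmt"
	"math/bits"
)

// rotr mirrors one shift/or triple from the assembly:
// VPSRLD $n, X, T; VPSLLD $(32-n), X, X; VPOR T, X, X.
func rotr(x uint32, n uint) uint32 {
	return (x >> n) | (x << (32 - n))
}

func main() {
	x := uint32(0x9e3779b9)
	// Rotations by 16 and 8 are done with byte shuffles in the assembly,
	// 12 and 7 with the shift/or triples; all four are plain right rotations.
	for _, n := range []uint{16, 12, 8, 7} {
		fmt.Println(n, rotr(x, n) == bits.RotateLeft32(x, -int(n)))
	}
}

The assembly applies the same identity to eight 32-bit lanes per Y register at once; the scalar form above is the per-lane equivalent.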
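
Both removed implementations exchange chaining values in a transposed layout: a *[64]uint32 holds eight lanes column-wise, with word w of lane i at index i+8*w. That is why hashf.go and hashp.go store out[i+0], out[i+8], ..., out[i+56], and why the assembly loads 32-byte rows (one Y register per word index). The sketch below converts between that layout and a conventional per-lane view; the helper names are hypothetical and not part of the vendored code:

package main

import "fmt"

// toTransposed packs eight 8-word chaining values into the column-wise
// [64]uint32 layout consumed by the deleted HashF/HashP routines:
// word w of lane i is stored at index i + 8*w.
func toTransposed(lanes *[8][8]uint32) (out [64]uint32) {
	for i := 0; i < 8; i++ {
		for w := 0; w < 8; w++ {
			out[i+8*w] = lanes[i][w]
		}
	}
	return out
}

// fromTransposed recovers one lane's 8-word chaining value.
func fromTransposed(t *[64]uint32, lane int) (cv [8]uint32) {
	for w := 0; w < 8; w++ {
		cv[w] = t[lane+8*w]
	}
	return cv
}

func main() {
	var lanes [8][8]uint32
	for i := range lanes {
		for w := range lanes[i] {
			lanes[i][w] = uint32(i*100 + w)
		}
	}
	t := toTransposed(&lanes)
	fmt.Println(fromTransposed(&t, 3) == lanes[3]) // true
}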