Diffstat (limited to 'vendor/github.com/zeebo/blake3/internal/alg')
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/alg.go                                   18
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/compress/compress.go                     15
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/compress/compress_pure/compress.go      135
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_amd64.s    560
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_other.go     9
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/stubs.go          6
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/hash/hash.go                             23
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_amd64.s            2561
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_other.go             13
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/stubs.go                   9
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashf.go                  56
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashp.go                  38
12 files changed, 0 insertions, 3443 deletions
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/alg.go b/vendor/github.com/zeebo/blake3/internal/alg/alg.go
deleted file mode 100644
index 239fdec5b..000000000
--- a/vendor/github.com/zeebo/blake3/internal/alg/alg.go
+++ /dev/null
@@ -1,18 +0,0 @@
-package alg
-
-import (
- "github.com/zeebo/blake3/internal/alg/compress"
- "github.com/zeebo/blake3/internal/alg/hash"
-)
-
-func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) {
- hash.HashF(input, length, counter, flags, key, out, chain)
-}
-
-func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int) {
- hash.HashP(left, right, flags, key, out, n)
-}
-
-func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32) {
- compress.Compress(chain, block, counter, blen, flags, out)
-}
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress.go b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress.go
deleted file mode 100644
index 0b2685408..000000000
--- a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress.go
+++ /dev/null
@@ -1,15 +0,0 @@
-package compress
-
-import (
- "github.com/zeebo/blake3/internal/alg/compress/compress_pure"
- "github.com/zeebo/blake3/internal/alg/compress/compress_sse41"
- "github.com/zeebo/blake3/internal/consts"
-)
-
-func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32) {
- if consts.HasSSE41 {
- compress_sse41.Compress(chain, block, counter, blen, flags, out)
- } else {
- compress_pure.Compress(chain, block, counter, blen, flags, out)
- }
-}
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_pure/compress.go b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_pure/compress.go
deleted file mode 100644
index 66ea1fb75..000000000
--- a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_pure/compress.go
+++ /dev/null
@@ -1,135 +0,0 @@
-package compress_pure
-
-import (
- "math/bits"
-
- "github.com/zeebo/blake3/internal/consts"
-)
-
-func Compress(
- chain *[8]uint32,
- block *[16]uint32,
- counter uint64,
- blen uint32,
- flags uint32,
- out *[16]uint32,
-) {
-
- *out = [16]uint32{
- chain[0], chain[1], chain[2], chain[3],
- chain[4], chain[5], chain[6], chain[7],
- consts.IV0, consts.IV1, consts.IV2, consts.IV3,
- uint32(counter), uint32(counter >> 32), blen, flags,
- }
-
- rcompress(out, block)
-}
-
-func g(a, b, c, d, mx, my uint32) (uint32, uint32, uint32, uint32) {
- a += b + mx
- d = bits.RotateLeft32(d^a, -16)
- c += d
- b = bits.RotateLeft32(b^c, -12)
- a += b + my
- d = bits.RotateLeft32(d^a, -8)
- c += d
- b = bits.RotateLeft32(b^c, -7)
- return a, b, c, d
-}
-
-func rcompress(s *[16]uint32, m *[16]uint32) {
- const (
- a = 10
- b = 11
- c = 12
- d = 13
- e = 14
- f = 15
- )
-
- s0, s1, s2, s3 := s[0+0], s[0+1], s[0+2], s[0+3]
- s4, s5, s6, s7 := s[0+4], s[0+5], s[0+6], s[0+7]
- s8, s9, sa, sb := s[8+0], s[8+1], s[8+2], s[8+3]
- sc, sd, se, sf := s[8+4], s[8+5], s[8+6], s[8+7]
-
- s0, s4, s8, sc = g(s0, s4, s8, sc, m[0], m[1])
- s1, s5, s9, sd = g(s1, s5, s9, sd, m[2], m[3])
- s2, s6, sa, se = g(s2, s6, sa, se, m[4], m[5])
- s3, s7, sb, sf = g(s3, s7, sb, sf, m[6], m[7])
- s0, s5, sa, sf = g(s0, s5, sa, sf, m[8], m[9])
- s1, s6, sb, sc = g(s1, s6, sb, sc, m[a], m[b])
- s2, s7, s8, sd = g(s2, s7, s8, sd, m[c], m[d])
- s3, s4, s9, se = g(s3, s4, s9, se, m[e], m[f])
-
- s0, s4, s8, sc = g(s0, s4, s8, sc, m[2], m[6])
- s1, s5, s9, sd = g(s1, s5, s9, sd, m[3], m[a])
- s2, s6, sa, se = g(s2, s6, sa, se, m[7], m[0])
- s3, s7, sb, sf = g(s3, s7, sb, sf, m[4], m[d])
- s0, s5, sa, sf = g(s0, s5, sa, sf, m[1], m[b])
- s1, s6, sb, sc = g(s1, s6, sb, sc, m[c], m[5])
- s2, s7, s8, sd = g(s2, s7, s8, sd, m[9], m[e])
- s3, s4, s9, se = g(s3, s4, s9, se, m[f], m[8])
-
- s0, s4, s8, sc = g(s0, s4, s8, sc, m[3], m[4])
- s1, s5, s9, sd = g(s1, s5, s9, sd, m[a], m[c])
- s2, s6, sa, se = g(s2, s6, sa, se, m[d], m[2])
- s3, s7, sb, sf = g(s3, s7, sb, sf, m[7], m[e])
- s0, s5, sa, sf = g(s0, s5, sa, sf, m[6], m[5])
- s1, s6, sb, sc = g(s1, s6, sb, sc, m[9], m[0])
- s2, s7, s8, sd = g(s2, s7, s8, sd, m[b], m[f])
- s3, s4, s9, se = g(s3, s4, s9, se, m[8], m[1])
-
- s0, s4, s8, sc = g(s0, s4, s8, sc, m[a], m[7])
- s1, s5, s9, sd = g(s1, s5, s9, sd, m[c], m[9])
- s2, s6, sa, se = g(s2, s6, sa, se, m[e], m[3])
- s3, s7, sb, sf = g(s3, s7, sb, sf, m[d], m[f])
- s0, s5, sa, sf = g(s0, s5, sa, sf, m[4], m[0])
- s1, s6, sb, sc = g(s1, s6, sb, sc, m[b], m[2])
- s2, s7, s8, sd = g(s2, s7, s8, sd, m[5], m[8])
- s3, s4, s9, se = g(s3, s4, s9, se, m[1], m[6])
-
- s0, s4, s8, sc = g(s0, s4, s8, sc, m[c], m[d])
- s1, s5, s9, sd = g(s1, s5, s9, sd, m[9], m[b])
- s2, s6, sa, se = g(s2, s6, sa, se, m[f], m[a])
- s3, s7, sb, sf = g(s3, s7, sb, sf, m[e], m[8])
- s0, s5, sa, sf = g(s0, s5, sa, sf, m[7], m[2])
- s1, s6, sb, sc = g(s1, s6, sb, sc, m[5], m[3])
- s2, s7, s8, sd = g(s2, s7, s8, sd, m[0], m[1])
- s3, s4, s9, se = g(s3, s4, s9, se, m[6], m[4])
-
- s0, s4, s8, sc = g(s0, s4, s8, sc, m[9], m[e])
- s1, s5, s9, sd = g(s1, s5, s9, sd, m[b], m[5])
- s2, s6, sa, se = g(s2, s6, sa, se, m[8], m[c])
- s3, s7, sb, sf = g(s3, s7, sb, sf, m[f], m[1])
- s0, s5, sa, sf = g(s0, s5, sa, sf, m[d], m[3])
- s1, s6, sb, sc = g(s1, s6, sb, sc, m[0], m[a])
- s2, s7, s8, sd = g(s2, s7, s8, sd, m[2], m[6])
- s3, s4, s9, se = g(s3, s4, s9, se, m[4], m[7])
-
- s0, s4, s8, sc = g(s0, s4, s8, sc, m[b], m[f])
- s1, s5, s9, sd = g(s1, s5, s9, sd, m[5], m[0])
- s2, s6, sa, se = g(s2, s6, sa, se, m[1], m[9])
- s3, s7, sb, sf = g(s3, s7, sb, sf, m[8], m[6])
- s0, s5, sa, sf = g(s0, s5, sa, sf, m[e], m[a])
- s1, s6, sb, sc = g(s1, s6, sb, sc, m[2], m[c])
- s2, s7, s8, sd = g(s2, s7, s8, sd, m[3], m[4])
- s3, s4, s9, se = g(s3, s4, s9, se, m[7], m[d])
-
- s[8+0] = s8 ^ s[0]
- s[8+1] = s9 ^ s[1]
- s[8+2] = sa ^ s[2]
- s[8+3] = sb ^ s[3]
- s[8+4] = sc ^ s[4]
- s[8+5] = sd ^ s[5]
- s[8+6] = se ^ s[6]
- s[8+7] = sf ^ s[7]
-
- s[0] = s0 ^ s8
- s[1] = s1 ^ s9
- s[2] = s2 ^ sa
- s[3] = s3 ^ sb
- s[4] = s4 ^ sc
- s[5] = s5 ^ sd
- s[6] = s6 ^ se
- s[7] = s7 ^ sf
-}
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_amd64.s b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_amd64.s
deleted file mode 100644
index 0fedf0b3a..000000000
--- a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_amd64.s
+++ /dev/null
@@ -1,560 +0,0 @@
-// Code generated by command: go run compress.go. DO NOT EDIT.
-
-#include "textflag.h"
-
-DATA iv<>+0(SB)/4, $0x6a09e667
-DATA iv<>+4(SB)/4, $0xbb67ae85
-DATA iv<>+8(SB)/4, $0x3c6ef372
-DATA iv<>+12(SB)/4, $0xa54ff53a
-DATA iv<>+16(SB)/4, $0x510e527f
-DATA iv<>+20(SB)/4, $0x9b05688c
-DATA iv<>+24(SB)/4, $0x1f83d9ab
-DATA iv<>+28(SB)/4, $0x5be0cd19
-GLOBL iv<>(SB), RODATA|NOPTR, $32
-
-DATA rot16_shuf<>+0(SB)/1, $0x02
-DATA rot16_shuf<>+1(SB)/1, $0x03
-DATA rot16_shuf<>+2(SB)/1, $0x00
-DATA rot16_shuf<>+3(SB)/1, $0x01
-DATA rot16_shuf<>+4(SB)/1, $0x06
-DATA rot16_shuf<>+5(SB)/1, $0x07
-DATA rot16_shuf<>+6(SB)/1, $0x04
-DATA rot16_shuf<>+7(SB)/1, $0x05
-DATA rot16_shuf<>+8(SB)/1, $0x0a
-DATA rot16_shuf<>+9(SB)/1, $0x0b
-DATA rot16_shuf<>+10(SB)/1, $0x08
-DATA rot16_shuf<>+11(SB)/1, $0x09
-DATA rot16_shuf<>+12(SB)/1, $0x0e
-DATA rot16_shuf<>+13(SB)/1, $0x0f
-DATA rot16_shuf<>+14(SB)/1, $0x0c
-DATA rot16_shuf<>+15(SB)/1, $0x0d
-DATA rot16_shuf<>+16(SB)/1, $0x12
-DATA rot16_shuf<>+17(SB)/1, $0x13
-DATA rot16_shuf<>+18(SB)/1, $0x10
-DATA rot16_shuf<>+19(SB)/1, $0x11
-DATA rot16_shuf<>+20(SB)/1, $0x16
-DATA rot16_shuf<>+21(SB)/1, $0x17
-DATA rot16_shuf<>+22(SB)/1, $0x14
-DATA rot16_shuf<>+23(SB)/1, $0x15
-DATA rot16_shuf<>+24(SB)/1, $0x1a
-DATA rot16_shuf<>+25(SB)/1, $0x1b
-DATA rot16_shuf<>+26(SB)/1, $0x18
-DATA rot16_shuf<>+27(SB)/1, $0x19
-DATA rot16_shuf<>+28(SB)/1, $0x1e
-DATA rot16_shuf<>+29(SB)/1, $0x1f
-DATA rot16_shuf<>+30(SB)/1, $0x1c
-DATA rot16_shuf<>+31(SB)/1, $0x1d
-GLOBL rot16_shuf<>(SB), RODATA|NOPTR, $32
-
-DATA rot8_shuf<>+0(SB)/1, $0x01
-DATA rot8_shuf<>+1(SB)/1, $0x02
-DATA rot8_shuf<>+2(SB)/1, $0x03
-DATA rot8_shuf<>+3(SB)/1, $0x00
-DATA rot8_shuf<>+4(SB)/1, $0x05
-DATA rot8_shuf<>+5(SB)/1, $0x06
-DATA rot8_shuf<>+6(SB)/1, $0x07
-DATA rot8_shuf<>+7(SB)/1, $0x04
-DATA rot8_shuf<>+8(SB)/1, $0x09
-DATA rot8_shuf<>+9(SB)/1, $0x0a
-DATA rot8_shuf<>+10(SB)/1, $0x0b
-DATA rot8_shuf<>+11(SB)/1, $0x08
-DATA rot8_shuf<>+12(SB)/1, $0x0d
-DATA rot8_shuf<>+13(SB)/1, $0x0e
-DATA rot8_shuf<>+14(SB)/1, $0x0f
-DATA rot8_shuf<>+15(SB)/1, $0x0c
-DATA rot8_shuf<>+16(SB)/1, $0x11
-DATA rot8_shuf<>+17(SB)/1, $0x12
-DATA rot8_shuf<>+18(SB)/1, $0x13
-DATA rot8_shuf<>+19(SB)/1, $0x10
-DATA rot8_shuf<>+20(SB)/1, $0x15
-DATA rot8_shuf<>+21(SB)/1, $0x16
-DATA rot8_shuf<>+22(SB)/1, $0x17
-DATA rot8_shuf<>+23(SB)/1, $0x14
-DATA rot8_shuf<>+24(SB)/1, $0x19
-DATA rot8_shuf<>+25(SB)/1, $0x1a
-DATA rot8_shuf<>+26(SB)/1, $0x1b
-DATA rot8_shuf<>+27(SB)/1, $0x18
-DATA rot8_shuf<>+28(SB)/1, $0x1d
-DATA rot8_shuf<>+29(SB)/1, $0x1e
-DATA rot8_shuf<>+30(SB)/1, $0x1f
-DATA rot8_shuf<>+31(SB)/1, $0x1c
-GLOBL rot8_shuf<>(SB), RODATA|NOPTR, $32
-
-// func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32)
-// Requires: SSE, SSE2, SSE4.1, SSSE3
-TEXT ·Compress(SB), NOSPLIT, $0-40
- MOVQ chain+0(FP), AX
- MOVQ block+8(FP), CX
- MOVQ counter+16(FP), DX
- MOVL blen+24(FP), BX
- MOVL flags+28(FP), BP
- MOVQ out+32(FP), SI
- MOVUPS (AX), X0
- MOVUPS 16(AX), X1
- MOVUPS iv<>+0(SB), X2
- PINSRD $0x00, DX, X3
- SHRQ $0x20, DX
- PINSRD $0x01, DX, X3
- PINSRD $0x02, BX, X3
- PINSRD $0x03, BP, X3
- MOVUPS (CX), X4
- MOVUPS 16(CX), X5
- MOVUPS 32(CX), X6
- MOVUPS 48(CX), X7
- MOVUPS rot16_shuf<>+0(SB), X8
- MOVUPS rot8_shuf<>+0(SB), X9
-
- // round 1
- MOVAPS X4, X10
- SHUFPS $0x88, X5, X10
- PADDD X10, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X8, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X11
- PSRLL $0x0c, X1
- PSLLL $0x14, X11
- POR X11, X1
- MOVAPS X4, X4
- SHUFPS $0xdd, X5, X4
- PADDD X4, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X9, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X5
- PSRLL $0x07, X1
- PSLLL $0x19, X5
- POR X5, X1
- PSHUFD $0x93, X0, X0
- PSHUFD $0x4e, X3, X3
- PSHUFD $0x39, X2, X2
- MOVAPS X6, X5
- SHUFPS $0x88, X7, X5
- SHUFPS $0x93, X5, X5
- PADDD X5, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X8, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X11
- PSRLL $0x0c, X1
- PSLLL $0x14, X11
- POR X11, X1
- MOVAPS X6, X6
- SHUFPS $0xdd, X7, X6
- SHUFPS $0x93, X6, X6
- PADDD X6, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X9, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X7
- PSRLL $0x07, X1
- PSLLL $0x19, X7
- POR X7, X1
- PSHUFD $0x39, X0, X0
- PSHUFD $0x4e, X3, X3
- PSHUFD $0x93, X2, X2
-
- // round 2
- MOVAPS X10, X7
- SHUFPS $0xd6, X4, X7
- SHUFPS $0x39, X7, X7
- PADDD X7, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X8, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X11
- PSRLL $0x0c, X1
- PSLLL $0x14, X11
- POR X11, X1
- MOVAPS X5, X11
- SHUFPS $0xfa, X6, X11
- PSHUFD $0x0f, X10, X10
- PBLENDW $0x33, X10, X11
- PADDD X11, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X9, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X10
- PSRLL $0x07, X1
- PSLLL $0x19, X10
- POR X10, X1
- PSHUFD $0x93, X0, X0
- PSHUFD $0x4e, X3, X3
- PSHUFD $0x39, X2, X2
- MOVAPS X6, X12
- PUNPCKLLQ X4, X12
- PBLENDW $0xc0, X5, X12
- SHUFPS $0xb4, X12, X12
- PADDD X12, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X8, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X10
- PSRLL $0x0c, X1
- PSLLL $0x14, X10
- POR X10, X1
- MOVAPS X4, X10
- PUNPCKHLQ X6, X10
- MOVAPS X5, X4
- PUNPCKLLQ X10, X4
- SHUFPS $0x1e, X4, X4
- PADDD X4, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X9, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X5
- PSRLL $0x07, X1
- PSLLL $0x19, X5
- POR X5, X1
- PSHUFD $0x39, X0, X0
- PSHUFD $0x4e, X3, X3
- PSHUFD $0x93, X2, X2
-
- // round 3
- MOVAPS X7, X5
- SHUFPS $0xd6, X11, X5
- SHUFPS $0x39, X5, X5
- PADDD X5, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X8, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X6
- PSRLL $0x0c, X1
- PSLLL $0x14, X6
- POR X6, X1
- MOVAPS X12, X6
- SHUFPS $0xfa, X4, X6
- PSHUFD $0x0f, X7, X7
- PBLENDW $0x33, X7, X6
- PADDD X6, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X9, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X7
- PSRLL $0x07, X1
- PSLLL $0x19, X7
- POR X7, X1
- PSHUFD $0x93, X0, X0
- PSHUFD $0x4e, X3, X3
- PSHUFD $0x39, X2, X2
- MOVAPS X4, X10
- PUNPCKLLQ X11, X10
- PBLENDW $0xc0, X12, X10
- SHUFPS $0xb4, X10, X10
- PADDD X10, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X8, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X7
- PSRLL $0x0c, X1
- PSLLL $0x14, X7
- POR X7, X1
- MOVAPS X11, X7
- PUNPCKHLQ X4, X7
- MOVAPS X12, X4
- PUNPCKLLQ X7, X4
- SHUFPS $0x1e, X4, X4
- PADDD X4, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X9, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X7
- PSRLL $0x07, X1
- PSLLL $0x19, X7
- POR X7, X1
- PSHUFD $0x39, X0, X0
- PSHUFD $0x4e, X3, X3
- PSHUFD $0x93, X2, X2
-
- // round 4
- MOVAPS X5, X7
- SHUFPS $0xd6, X6, X7
- SHUFPS $0x39, X7, X7
- PADDD X7, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X8, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X11
- PSRLL $0x0c, X1
- PSLLL $0x14, X11
- POR X11, X1
- MOVAPS X10, X11
- SHUFPS $0xfa, X4, X11
- PSHUFD $0x0f, X5, X5
- PBLENDW $0x33, X5, X11
- PADDD X11, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X9, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X5
- PSRLL $0x07, X1
- PSLLL $0x19, X5
- POR X5, X1
- PSHUFD $0x93, X0, X0
- PSHUFD $0x4e, X3, X3
- PSHUFD $0x39, X2, X2
- MOVAPS X4, X12
- PUNPCKLLQ X6, X12
- PBLENDW $0xc0, X10, X12
- SHUFPS $0xb4, X12, X12
- PADDD X12, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X8, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X5
- PSRLL $0x0c, X1
- PSLLL $0x14, X5
- POR X5, X1
- MOVAPS X6, X5
- PUNPCKHLQ X4, X5
- MOVAPS X10, X4
- PUNPCKLLQ X5, X4
- SHUFPS $0x1e, X4, X4
- PADDD X4, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X9, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X5
- PSRLL $0x07, X1
- PSLLL $0x19, X5
- POR X5, X1
- PSHUFD $0x39, X0, X0
- PSHUFD $0x4e, X3, X3
- PSHUFD $0x93, X2, X2
-
- // round 5
- MOVAPS X7, X5
- SHUFPS $0xd6, X11, X5
- SHUFPS $0x39, X5, X5
- PADDD X5, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X8, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X6
- PSRLL $0x0c, X1
- PSLLL $0x14, X6
- POR X6, X1
- MOVAPS X12, X6
- SHUFPS $0xfa, X4, X6
- PSHUFD $0x0f, X7, X7
- PBLENDW $0x33, X7, X6
- PADDD X6, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X9, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X7
- PSRLL $0x07, X1
- PSLLL $0x19, X7
- POR X7, X1
- PSHUFD $0x93, X0, X0
- PSHUFD $0x4e, X3, X3
- PSHUFD $0x39, X2, X2
- MOVAPS X4, X10
- PUNPCKLLQ X11, X10
- PBLENDW $0xc0, X12, X10
- SHUFPS $0xb4, X10, X10
- PADDD X10, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X8, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X7
- PSRLL $0x0c, X1
- PSLLL $0x14, X7
- POR X7, X1
- MOVAPS X11, X7
- PUNPCKHLQ X4, X7
- MOVAPS X12, X4
- PUNPCKLLQ X7, X4
- SHUFPS $0x1e, X4, X4
- PADDD X4, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X9, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X7
- PSRLL $0x07, X1
- PSLLL $0x19, X7
- POR X7, X1
- PSHUFD $0x39, X0, X0
- PSHUFD $0x4e, X3, X3
- PSHUFD $0x93, X2, X2
-
- // round 6
- MOVAPS X5, X7
- SHUFPS $0xd6, X6, X7
- SHUFPS $0x39, X7, X7
- PADDD X7, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X8, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X11
- PSRLL $0x0c, X1
- PSLLL $0x14, X11
- POR X11, X1
- MOVAPS X10, X11
- SHUFPS $0xfa, X4, X11
- PSHUFD $0x0f, X5, X5
- PBLENDW $0x33, X5, X11
- PADDD X11, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X9, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X5
- PSRLL $0x07, X1
- PSLLL $0x19, X5
- POR X5, X1
- PSHUFD $0x93, X0, X0
- PSHUFD $0x4e, X3, X3
- PSHUFD $0x39, X2, X2
- MOVAPS X4, X12
- PUNPCKLLQ X6, X12
- PBLENDW $0xc0, X10, X12
- SHUFPS $0xb4, X12, X12
- PADDD X12, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X8, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X5
- PSRLL $0x0c, X1
- PSLLL $0x14, X5
- POR X5, X1
- MOVAPS X6, X5
- PUNPCKHLQ X4, X5
- MOVAPS X10, X4
- PUNPCKLLQ X5, X4
- SHUFPS $0x1e, X4, X4
- PADDD X4, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X9, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X5
- PSRLL $0x07, X1
- PSLLL $0x19, X5
- POR X5, X1
- PSHUFD $0x39, X0, X0
- PSHUFD $0x4e, X3, X3
- PSHUFD $0x93, X2, X2
-
- // round 7
- MOVAPS X7, X5
- SHUFPS $0xd6, X11, X5
- SHUFPS $0x39, X5, X5
- PADDD X5, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X8, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X5
- PSRLL $0x0c, X1
- PSLLL $0x14, X5
- POR X5, X1
- MOVAPS X12, X5
- SHUFPS $0xfa, X4, X5
- PSHUFD $0x0f, X7, X6
- PBLENDW $0x33, X6, X5
- PADDD X5, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X9, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X5
- PSRLL $0x07, X1
- PSLLL $0x19, X5
- POR X5, X1
- PSHUFD $0x93, X0, X0
- PSHUFD $0x4e, X3, X3
- PSHUFD $0x39, X2, X2
- MOVAPS X4, X5
- PUNPCKLLQ X11, X5
- PBLENDW $0xc0, X12, X5
- SHUFPS $0xb4, X5, X5
- PADDD X5, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X8, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X5
- PSRLL $0x0c, X1
- PSLLL $0x14, X5
- POR X5, X1
- MOVAPS X11, X6
- PUNPCKHLQ X4, X6
- MOVAPS X12, X4
- PUNPCKLLQ X6, X4
- SHUFPS $0x1e, X4, X4
- PADDD X4, X0
- PADDD X1, X0
- PXOR X0, X3
- PSHUFB X9, X3
- PADDD X3, X2
- PXOR X2, X1
- MOVAPS X1, X4
- PSRLL $0x07, X1
- PSLLL $0x19, X4
- POR X4, X1
- PSHUFD $0x39, X0, X0
- PSHUFD $0x4e, X3, X3
- PSHUFD $0x93, X2, X2
-
- // finalize
- PXOR X2, X0
- PXOR X3, X1
- MOVUPS (AX), X4
- PXOR X4, X2
- MOVUPS 16(AX), X4
- PXOR X4, X3
- MOVUPS X0, (SI)
- MOVUPS X1, 16(SI)
- MOVUPS X2, 32(SI)
- MOVUPS X3, 48(SI)
- RET
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_other.go b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_other.go
deleted file mode 100644
index cd63e9740..000000000
--- a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_other.go
+++ /dev/null
@@ -1,9 +0,0 @@
-// +build !amd64
-
-package compress_sse41
-
-import "github.com/zeebo/blake3/internal/alg/compress/compress_pure"
-
-func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32) {
- compress_pure.Compress(chain, block, counter, blen, flags, out)
-}
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/stubs.go b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/stubs.go
deleted file mode 100644
index ffd932d3c..000000000
--- a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/stubs.go
+++ /dev/null
@@ -1,6 +0,0 @@
-// +build amd64
-
-package compress_sse41
-
-//go:noescape
-func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32)
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash.go
deleted file mode 100644
index ac43abb69..000000000
--- a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash.go
+++ /dev/null
@@ -1,23 +0,0 @@
-package hash
-
-import (
- "github.com/zeebo/blake3/internal/alg/hash/hash_avx2"
- "github.com/zeebo/blake3/internal/alg/hash/hash_pure"
- "github.com/zeebo/blake3/internal/consts"
-)
-
-func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) {
- if consts.HasAVX2 && length > 2*consts.ChunkLen {
- hash_avx2.HashF(input, length, counter, flags, key, out, chain)
- } else {
- hash_pure.HashF(input, length, counter, flags, key, out, chain)
- }
-}
-
-func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int) {
- if consts.HasAVX2 && n >= 2 {
- hash_avx2.HashP(left, right, flags, key, out, n)
- } else {
- hash_pure.HashP(left, right, flags, key, out, n)
- }
-}
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_amd64.s b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_amd64.s
deleted file mode 100644
index d7531664b..000000000
--- a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_amd64.s
+++ /dev/null
@@ -1,2561 +0,0 @@
-// Code generated by command: go run main.go. DO NOT EDIT.
-
-#include "textflag.h"
-
-DATA iv<>+0(SB)/4, $0x6a09e667
-DATA iv<>+4(SB)/4, $0xbb67ae85
-DATA iv<>+8(SB)/4, $0x3c6ef372
-DATA iv<>+12(SB)/4, $0xa54ff53a
-DATA iv<>+16(SB)/4, $0x510e527f
-DATA iv<>+20(SB)/4, $0x9b05688c
-DATA iv<>+24(SB)/4, $0x1f83d9ab
-DATA iv<>+28(SB)/4, $0x5be0cd19
-GLOBL iv<>(SB), RODATA|NOPTR, $32
-
-DATA rot16_shuf<>+0(SB)/1, $0x02
-DATA rot16_shuf<>+1(SB)/1, $0x03
-DATA rot16_shuf<>+2(SB)/1, $0x00
-DATA rot16_shuf<>+3(SB)/1, $0x01
-DATA rot16_shuf<>+4(SB)/1, $0x06
-DATA rot16_shuf<>+5(SB)/1, $0x07
-DATA rot16_shuf<>+6(SB)/1, $0x04
-DATA rot16_shuf<>+7(SB)/1, $0x05
-DATA rot16_shuf<>+8(SB)/1, $0x0a
-DATA rot16_shuf<>+9(SB)/1, $0x0b
-DATA rot16_shuf<>+10(SB)/1, $0x08
-DATA rot16_shuf<>+11(SB)/1, $0x09
-DATA rot16_shuf<>+12(SB)/1, $0x0e
-DATA rot16_shuf<>+13(SB)/1, $0x0f
-DATA rot16_shuf<>+14(SB)/1, $0x0c
-DATA rot16_shuf<>+15(SB)/1, $0x0d
-DATA rot16_shuf<>+16(SB)/1, $0x12
-DATA rot16_shuf<>+17(SB)/1, $0x13
-DATA rot16_shuf<>+18(SB)/1, $0x10
-DATA rot16_shuf<>+19(SB)/1, $0x11
-DATA rot16_shuf<>+20(SB)/1, $0x16
-DATA rot16_shuf<>+21(SB)/1, $0x17
-DATA rot16_shuf<>+22(SB)/1, $0x14
-DATA rot16_shuf<>+23(SB)/1, $0x15
-DATA rot16_shuf<>+24(SB)/1, $0x1a
-DATA rot16_shuf<>+25(SB)/1, $0x1b
-DATA rot16_shuf<>+26(SB)/1, $0x18
-DATA rot16_shuf<>+27(SB)/1, $0x19
-DATA rot16_shuf<>+28(SB)/1, $0x1e
-DATA rot16_shuf<>+29(SB)/1, $0x1f
-DATA rot16_shuf<>+30(SB)/1, $0x1c
-DATA rot16_shuf<>+31(SB)/1, $0x1d
-GLOBL rot16_shuf<>(SB), RODATA|NOPTR, $32
-
-DATA rot8_shuf<>+0(SB)/1, $0x01
-DATA rot8_shuf<>+1(SB)/1, $0x02
-DATA rot8_shuf<>+2(SB)/1, $0x03
-DATA rot8_shuf<>+3(SB)/1, $0x00
-DATA rot8_shuf<>+4(SB)/1, $0x05
-DATA rot8_shuf<>+5(SB)/1, $0x06
-DATA rot8_shuf<>+6(SB)/1, $0x07
-DATA rot8_shuf<>+7(SB)/1, $0x04
-DATA rot8_shuf<>+8(SB)/1, $0x09
-DATA rot8_shuf<>+9(SB)/1, $0x0a
-DATA rot8_shuf<>+10(SB)/1, $0x0b
-DATA rot8_shuf<>+11(SB)/1, $0x08
-DATA rot8_shuf<>+12(SB)/1, $0x0d
-DATA rot8_shuf<>+13(SB)/1, $0x0e
-DATA rot8_shuf<>+14(SB)/1, $0x0f
-DATA rot8_shuf<>+15(SB)/1, $0x0c
-DATA rot8_shuf<>+16(SB)/1, $0x11
-DATA rot8_shuf<>+17(SB)/1, $0x12
-DATA rot8_shuf<>+18(SB)/1, $0x13
-DATA rot8_shuf<>+19(SB)/1, $0x10
-DATA rot8_shuf<>+20(SB)/1, $0x15
-DATA rot8_shuf<>+21(SB)/1, $0x16
-DATA rot8_shuf<>+22(SB)/1, $0x17
-DATA rot8_shuf<>+23(SB)/1, $0x14
-DATA rot8_shuf<>+24(SB)/1, $0x19
-DATA rot8_shuf<>+25(SB)/1, $0x1a
-DATA rot8_shuf<>+26(SB)/1, $0x1b
-DATA rot8_shuf<>+27(SB)/1, $0x18
-DATA rot8_shuf<>+28(SB)/1, $0x1d
-DATA rot8_shuf<>+29(SB)/1, $0x1e
-DATA rot8_shuf<>+30(SB)/1, $0x1f
-DATA rot8_shuf<>+31(SB)/1, $0x1c
-GLOBL rot8_shuf<>(SB), RODATA|NOPTR, $32
-
-DATA block_len<>+0(SB)/4, $0x00000040
-DATA block_len<>+4(SB)/4, $0x00000040
-DATA block_len<>+8(SB)/4, $0x00000040
-DATA block_len<>+12(SB)/4, $0x00000040
-DATA block_len<>+16(SB)/4, $0x00000040
-DATA block_len<>+20(SB)/4, $0x00000040
-DATA block_len<>+24(SB)/4, $0x00000040
-DATA block_len<>+28(SB)/4, $0x00000040
-GLOBL block_len<>(SB), RODATA|NOPTR, $32
-
-DATA zero<>+0(SB)/4, $0x00000000
-DATA zero<>+4(SB)/4, $0x00000000
-DATA zero<>+8(SB)/4, $0x00000000
-DATA zero<>+12(SB)/4, $0x00000000
-DATA zero<>+16(SB)/4, $0x00000000
-DATA zero<>+20(SB)/4, $0x00000000
-DATA zero<>+24(SB)/4, $0x00000000
-DATA zero<>+28(SB)/4, $0x00000000
-GLOBL zero<>(SB), RODATA|NOPTR, $32
-
-DATA counter<>+0(SB)/8, $0x0000000000000000
-DATA counter<>+8(SB)/8, $0x0000000000000001
-DATA counter<>+16(SB)/8, $0x0000000000000002
-DATA counter<>+24(SB)/8, $0x0000000000000003
-DATA counter<>+32(SB)/8, $0x0000000000000004
-DATA counter<>+40(SB)/8, $0x0000000000000005
-DATA counter<>+48(SB)/8, $0x0000000000000006
-DATA counter<>+56(SB)/8, $0x0000000000000007
-GLOBL counter<>(SB), RODATA|NOPTR, $64
-
-// func HashF(input *[8192]byte, length uint64, counter uint64, flags uint32, key *[8]uint32, out *[32]uint32, chain *[8]uint32)
-// Requires: AVX, AVX2
-TEXT ·HashF(SB), $688-56
- MOVQ input+0(FP), AX
- MOVQ length+8(FP), CX
- MOVQ counter+16(FP), DX
- MOVL flags+24(FP), BX
- MOVQ key+32(FP), BP
- MOVQ out+40(FP), SI
- MOVQ chain+48(FP), DI
-
- // Allocate local space and align it
- LEAQ 31(SP), R10
- MOVQ $0x000000000000001f, R8
- NOTQ R8
- ANDQ R8, R10
-
- // Skip if the length is zero
- XORQ R8, R8
- XORQ R9, R9
- TESTQ CX, CX
- JZ skip_compute
-
- // Compute complete chunks and blocks
- SUBQ $0x01, CX
- MOVQ CX, R8
- SHRQ $0x0a, R8
- MOVQ CX, R9
- ANDQ $0x000003c0, R9
-
-skip_compute:
-	// Load some params onto the stack (avo improvement?)
- MOVL BX, 64(SP)
- MOVQ DX, 72(SP)
-
- // Load IV into vectors
- VPBROADCASTD (BP), Y0
- VPBROADCASTD 4(BP), Y1
- VPBROADCASTD 8(BP), Y2
- VPBROADCASTD 12(BP), Y3
- VPBROADCASTD 16(BP), Y4
- VPBROADCASTD 20(BP), Y5
- VPBROADCASTD 24(BP), Y6
- VPBROADCASTD 28(BP), Y7
-
- // Build and store counter data on the stack
- VPBROADCASTQ 72(SP), Y8
- VPADDQ counter<>+0(SB), Y8, Y8
- VPBROADCASTQ 72(SP), Y9
- VPADDQ counter<>+32(SB), Y9, Y9
- VPUNPCKLDQ Y9, Y8, Y10
- VPUNPCKHDQ Y9, Y8, Y8
- VPUNPCKLDQ Y8, Y10, Y9
- VPUNPCKHDQ Y8, Y10, Y8
- VPERMQ $0xd8, Y9, Y9
- VPERMQ $0xd8, Y8, Y8
- VMOVDQU Y9, 112(SP)
- VMOVDQU Y8, 144(SP)
-
- // Set up block flags and variables for iteration
- XORQ CX, CX
- ORL $0x01, 64(SP)
-
-loop:
- // Include end flags if last block
- CMPQ CX, $0x000003c0
- JNE round_setup
- ORL $0x02, 64(SP)
-
-round_setup:
- // Load and transpose message vectors
- VMOVDQU (AX)(CX*1), Y8
- VMOVDQU 1024(AX)(CX*1), Y9
- VMOVDQU 2048(AX)(CX*1), Y10
- VMOVDQU 3072(AX)(CX*1), Y11
- VMOVDQU 4096(AX)(CX*1), Y12
- VMOVDQU 5120(AX)(CX*1), Y13
- VMOVDQU 6144(AX)(CX*1), Y14
- VMOVDQU 7168(AX)(CX*1), Y15
- VMOVDQA Y0, (R10)
- VPUNPCKLDQ Y9, Y8, Y0
- VPUNPCKHDQ Y9, Y8, Y8
- VPUNPCKLDQ Y11, Y10, Y9
- VPUNPCKHDQ Y11, Y10, Y10
- VPUNPCKLDQ Y13, Y12, Y11
- VPUNPCKHDQ Y13, Y12, Y12
- VPUNPCKLDQ Y15, Y14, Y13
- VPUNPCKHDQ Y15, Y14, Y14
- VPUNPCKLQDQ Y9, Y0, Y15
- VPUNPCKHQDQ Y9, Y0, Y0
- VPUNPCKLQDQ Y10, Y8, Y9
- VPUNPCKHQDQ Y10, Y8, Y8
- VPUNPCKLQDQ Y13, Y11, Y10
- VPUNPCKHQDQ Y13, Y11, Y11
- VPUNPCKLQDQ Y14, Y12, Y13
- VPUNPCKHQDQ Y14, Y12, Y12
- VINSERTI128 $0x01, X10, Y15, Y14
- VPERM2I128 $0x31, Y10, Y15, Y10
- VINSERTI128 $0x01, X11, Y0, Y15
- VPERM2I128 $0x31, Y11, Y0, Y0
- VINSERTI128 $0x01, X13, Y9, Y11
- VPERM2I128 $0x31, Y13, Y9, Y9
- VINSERTI128 $0x01, X12, Y8, Y13
- VPERM2I128 $0x31, Y12, Y8, Y8
- VMOVDQU Y14, 176(SP)
- VMOVDQU Y15, 208(SP)
- VMOVDQU Y11, 240(SP)
- VMOVDQU Y13, 272(SP)
- VMOVDQU Y10, 304(SP)
- VMOVDQU Y0, 336(SP)
- VMOVDQU Y9, 368(SP)
- VMOVDQU Y8, 400(SP)
- VMOVDQU 32(AX)(CX*1), Y0
- VMOVDQU 1056(AX)(CX*1), Y8
- VMOVDQU 2080(AX)(CX*1), Y9
- VMOVDQU 3104(AX)(CX*1), Y10
- VMOVDQU 4128(AX)(CX*1), Y11
- VMOVDQU 5152(AX)(CX*1), Y12
- VMOVDQU 6176(AX)(CX*1), Y13
- VMOVDQU 7200(AX)(CX*1), Y14
- VPUNPCKLDQ Y8, Y0, Y15
- VPUNPCKHDQ Y8, Y0, Y0
- VPUNPCKLDQ Y10, Y9, Y8
- VPUNPCKHDQ Y10, Y9, Y9
- VPUNPCKLDQ Y12, Y11, Y10
- VPUNPCKHDQ Y12, Y11, Y11
- VPUNPCKLDQ Y14, Y13, Y12
- VPUNPCKHDQ Y14, Y13, Y13
- VPUNPCKLQDQ Y8, Y15, Y14
- VPUNPCKHQDQ Y8, Y15, Y8
- VPUNPCKLQDQ Y9, Y0, Y15
- VPUNPCKHQDQ Y9, Y0, Y0
- VPUNPCKLQDQ Y12, Y10, Y9
- VPUNPCKHQDQ Y12, Y10, Y10
- VPUNPCKLQDQ Y13, Y11, Y12
- VPUNPCKHQDQ Y13, Y11, Y11
- VINSERTI128 $0x01, X9, Y14, Y13
- VPERM2I128 $0x31, Y9, Y14, Y9
- VINSERTI128 $0x01, X10, Y8, Y14
- VPERM2I128 $0x31, Y10, Y8, Y8
- VINSERTI128 $0x01, X12, Y15, Y10
- VPERM2I128 $0x31, Y12, Y15, Y12
- VINSERTI128 $0x01, X11, Y0, Y15
- VPERM2I128 $0x31, Y11, Y0, Y0
- VMOVDQU Y13, 432(SP)
- VMOVDQU Y14, 464(SP)
- VMOVDQU Y10, 496(SP)
- VMOVDQU Y15, 528(SP)
- VMOVDQU Y9, 560(SP)
- VMOVDQU Y8, 592(SP)
- VMOVDQU Y12, 624(SP)
- VMOVDQU Y0, 656(SP)
-
- // Load constants for the round
- VMOVDQA (R10), Y0
- VMOVDQU block_len<>+0(SB), Y8
- VPBROADCASTD 64(SP), Y9
- VPBROADCASTD iv<>+0(SB), Y10
- VPBROADCASTD iv<>+4(SB), Y11
- VPBROADCASTD iv<>+8(SB), Y12
- VPBROADCASTD iv<>+12(SB), Y13
- VMOVDQU 112(SP), Y14
- VMOVDQU 144(SP), Y15
-
- // Save state for partial chunk if necessary
- CMPQ CX, R9
- JNE begin_rounds
- VMOVDQU Y0, 80(SP)
- MOVL 80(SP)(R8*4), DX
- MOVL DX, (DI)
- VMOVDQU Y1, 80(SP)
- MOVL 80(SP)(R8*4), DX
- MOVL DX, 4(DI)
- VMOVDQU Y2, 80(SP)
- MOVL 80(SP)(R8*4), DX
- MOVL DX, 8(DI)
- VMOVDQU Y3, 80(SP)
- MOVL 80(SP)(R8*4), DX
- MOVL DX, 12(DI)
- VMOVDQU Y4, 80(SP)
- MOVL 80(SP)(R8*4), DX
- MOVL DX, 16(DI)
- VMOVDQU Y5, 80(SP)
- MOVL 80(SP)(R8*4), DX
- MOVL DX, 20(DI)
- VMOVDQU Y6, 80(SP)
- MOVL 80(SP)(R8*4), DX
- MOVL DX, 24(DI)
- VMOVDQU Y7, 80(SP)
- MOVL 80(SP)(R8*4), DX
- MOVL DX, 28(DI)
-
-begin_rounds:
- // Perform the rounds
- // Round 1
- VPADDD 176(SP), Y0, Y0
- VPADDD 240(SP), Y1, Y1
- VPADDD 304(SP), Y2, Y2
- VPADDD 368(SP), Y3, Y3
- VPADDD Y4, Y0, Y0
- VPXOR Y0, Y14, Y14
- VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
- VPADDD Y5, Y1, Y1
- VPXOR Y1, Y15, Y15
- VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y8, Y8
- VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
- VPADDD Y7, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y14, Y10, Y10
- VPXOR Y10, Y4, Y4
- VPADDD Y15, Y11, Y11
- VPXOR Y11, Y5, Y5
- VPADDD Y8, Y12, Y12
- VPXOR Y12, Y6, Y6
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y7, Y7
- VMOVDQA Y0, (R10)
- VPSRLD $0x0c, Y4, Y0
- VPSLLD $0x14, Y4, Y4
- VPOR Y0, Y4, Y0
- VPSRLD $0x0c, Y5, Y4
- VPSLLD $0x14, Y5, Y5
- VPOR Y4, Y5, Y4
- VPSRLD $0x0c, Y6, Y5
- VPSLLD $0x14, Y6, Y6
- VPOR Y5, Y6, Y5
- VPSRLD $0x0c, Y7, Y6
- VPSLLD $0x14, Y7, Y7
- VPOR Y6, Y7, Y6
- VMOVDQA (R10), Y7
- VPADDD 208(SP), Y7, Y7
- VPADDD 272(SP), Y1, Y1
- VPADDD 336(SP), Y2, Y2
- VPADDD 400(SP), Y3, Y3
- VPADDD Y0, Y7, Y7
- VPXOR Y7, Y14, Y14
- VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y15, Y15
- VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
- VPADDD Y5, Y2, Y2
- VPXOR Y2, Y8, Y8
- VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
- VPADDD Y6, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y14, Y10, Y10
- VPXOR Y10, Y0, Y0
- VPADDD Y15, Y11, Y11
- VPXOR Y11, Y4, Y4
- VPADDD Y8, Y12, Y12
- VPXOR Y12, Y5, Y5
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y6, Y6
- VMOVDQA Y7, (R10)
- VPSRLD $0x07, Y0, Y7
- VPSLLD $0x19, Y0, Y0
- VPOR Y7, Y0, Y0
- VPSRLD $0x07, Y4, Y7
- VPSLLD $0x19, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x07, Y5, Y7
- VPSLLD $0x19, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x07, Y6, Y7
- VPSLLD $0x19, Y6, Y6
- VPOR Y7, Y6, Y6
- VMOVDQA (R10), Y7
- VPADDD 432(SP), Y7, Y7
- VPADDD 496(SP), Y1, Y1
- VPADDD 560(SP), Y2, Y2
- VPADDD 624(SP), Y3, Y3
- VPADDD Y4, Y7, Y7
- VPXOR Y7, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y5, Y1, Y1
- VPXOR Y1, Y14, Y14
- VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y15, Y15
- VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y8, Y8
- VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
- VPADDD Y9, Y12, Y12
- VPXOR Y12, Y4, Y4
- VPADDD Y14, Y13, Y13
- VPXOR Y13, Y5, Y5
- VPADDD Y15, Y10, Y10
- VPXOR Y10, Y6, Y6
- VPADDD Y8, Y11, Y11
- VPXOR Y11, Y0, Y0
- VMOVDQA Y7, (R10)
- VPSRLD $0x0c, Y4, Y7
- VPSLLD $0x14, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x0c, Y5, Y7
- VPSLLD $0x14, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x0c, Y6, Y7
- VPSLLD $0x14, Y6, Y6
- VPOR Y7, Y6, Y6
- VPSRLD $0x0c, Y0, Y7
- VPSLLD $0x14, Y0, Y0
- VPOR Y7, Y0, Y0
- VMOVDQA (R10), Y7
- VPADDD 464(SP), Y7, Y7
- VPADDD 528(SP), Y1, Y1
- VPADDD 592(SP), Y2, Y2
- VPADDD 656(SP), Y3, Y3
- VPADDD Y4, Y7, Y7
- VPXOR Y7, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y5, Y1, Y1
- VPXOR Y1, Y14, Y14
- VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y15, Y15
- VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y8, Y8
- VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
- VPADDD Y9, Y12, Y12
- VPXOR Y12, Y4, Y4
- VPADDD Y14, Y13, Y13
- VPXOR Y13, Y5, Y5
- VPADDD Y15, Y10, Y10
- VPXOR Y10, Y6, Y6
- VPADDD Y8, Y11, Y11
- VPXOR Y11, Y0, Y0
- VMOVDQA Y7, (R10)
- VPSRLD $0x07, Y4, Y7
- VPSLLD $0x19, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x07, Y5, Y7
- VPSLLD $0x19, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x07, Y6, Y7
- VPSLLD $0x19, Y6, Y6
- VPOR Y7, Y6, Y6
- VPSRLD $0x07, Y0, Y7
- VPSLLD $0x19, Y0, Y0
- VPOR Y7, Y0, Y0
-
- // Round 2
- VMOVDQA (R10), Y7
- VPADDD 240(SP), Y7, Y7
- VPADDD 272(SP), Y1, Y1
- VPADDD 400(SP), Y2, Y2
- VPADDD 304(SP), Y3, Y3
- VPADDD Y0, Y7, Y7
- VPXOR Y7, Y14, Y14
- VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y15, Y15
- VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
- VPADDD Y5, Y2, Y2
- VPXOR Y2, Y8, Y8
- VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
- VPADDD Y6, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y14, Y10, Y10
- VPXOR Y10, Y0, Y0
- VPADDD Y15, Y11, Y11
- VPXOR Y11, Y4, Y4
- VPADDD Y8, Y12, Y12
- VPXOR Y12, Y5, Y5
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y6, Y6
- VMOVDQA Y7, (R10)
- VPSRLD $0x0c, Y0, Y7
- VPSLLD $0x14, Y0, Y0
- VPOR Y7, Y0, Y0
- VPSRLD $0x0c, Y4, Y7
- VPSLLD $0x14, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x0c, Y5, Y7
- VPSLLD $0x14, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x0c, Y6, Y7
- VPSLLD $0x14, Y6, Y6
- VPOR Y7, Y6, Y6
- VMOVDQA (R10), Y7
- VPADDD 368(SP), Y7, Y7
- VPADDD 496(SP), Y1, Y1
- VPADDD 176(SP), Y2, Y2
- VPADDD 592(SP), Y3, Y3
- VPADDD Y0, Y7, Y7
- VPXOR Y7, Y14, Y14
- VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y15, Y15
- VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
- VPADDD Y5, Y2, Y2
- VPXOR Y2, Y8, Y8
- VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
- VPADDD Y6, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y14, Y10, Y10
- VPXOR Y10, Y0, Y0
- VPADDD Y15, Y11, Y11
- VPXOR Y11, Y4, Y4
- VPADDD Y8, Y12, Y12
- VPXOR Y12, Y5, Y5
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y6, Y6
- VMOVDQA Y7, (R10)
- VPSRLD $0x07, Y0, Y7
- VPSLLD $0x19, Y0, Y0
- VPOR Y7, Y0, Y0
- VPSRLD $0x07, Y4, Y7
- VPSLLD $0x19, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x07, Y5, Y7
- VPSLLD $0x19, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x07, Y6, Y7
- VPSLLD $0x19, Y6, Y6
- VPOR Y7, Y6, Y6
- VMOVDQA (R10), Y7
- VPADDD 208(SP), Y7, Y7
- VPADDD 560(SP), Y1, Y1
- VPADDD 464(SP), Y2, Y2
- VPADDD 656(SP), Y3, Y3
- VPADDD Y4, Y7, Y7
- VPXOR Y7, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y5, Y1, Y1
- VPXOR Y1, Y14, Y14
- VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y15, Y15
- VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y8, Y8
- VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
- VPADDD Y9, Y12, Y12
- VPXOR Y12, Y4, Y4
- VPADDD Y14, Y13, Y13
- VPXOR Y13, Y5, Y5
- VPADDD Y15, Y10, Y10
- VPXOR Y10, Y6, Y6
- VPADDD Y8, Y11, Y11
- VPXOR Y11, Y0, Y0
- VMOVDQA Y7, (R10)
- VPSRLD $0x0c, Y4, Y7
- VPSLLD $0x14, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x0c, Y5, Y7
- VPSLLD $0x14, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x0c, Y6, Y7
- VPSLLD $0x14, Y6, Y6
- VPOR Y7, Y6, Y6
- VPSRLD $0x0c, Y0, Y7
- VPSLLD $0x14, Y0, Y0
- VPOR Y7, Y0, Y0
- VMOVDQA (R10), Y7
- VPADDD 528(SP), Y7, Y7
- VPADDD 336(SP), Y1, Y1
- VPADDD 624(SP), Y2, Y2
- VPADDD 432(SP), Y3, Y3
- VPADDD Y4, Y7, Y7
- VPXOR Y7, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y5, Y1, Y1
- VPXOR Y1, Y14, Y14
- VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y15, Y15
- VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y8, Y8
- VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
- VPADDD Y9, Y12, Y12
- VPXOR Y12, Y4, Y4
- VPADDD Y14, Y13, Y13
- VPXOR Y13, Y5, Y5
- VPADDD Y15, Y10, Y10
- VPXOR Y10, Y6, Y6
- VPADDD Y8, Y11, Y11
- VPXOR Y11, Y0, Y0
- VMOVDQA Y7, (R10)
- VPSRLD $0x07, Y4, Y7
- VPSLLD $0x19, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x07, Y5, Y7
- VPSLLD $0x19, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x07, Y6, Y7
- VPSLLD $0x19, Y6, Y6
- VPOR Y7, Y6, Y6
- VPSRLD $0x07, Y0, Y7
- VPSLLD $0x19, Y0, Y0
- VPOR Y7, Y0, Y0
-
- // Round 3
- VMOVDQA (R10), Y7
- VPADDD 272(SP), Y7, Y7
- VPADDD 496(SP), Y1, Y1
- VPADDD 592(SP), Y2, Y2
- VPADDD 400(SP), Y3, Y3
- VPADDD Y0, Y7, Y7
- VPXOR Y7, Y14, Y14
- VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y15, Y15
- VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
- VPADDD Y5, Y2, Y2
- VPXOR Y2, Y8, Y8
- VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
- VPADDD Y6, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y14, Y10, Y10
- VPXOR Y10, Y0, Y0
- VPADDD Y15, Y11, Y11
- VPXOR Y11, Y4, Y4
- VPADDD Y8, Y12, Y12
- VPXOR Y12, Y5, Y5
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y6, Y6
- VMOVDQA Y7, (R10)
- VPSRLD $0x0c, Y0, Y7
- VPSLLD $0x14, Y0, Y0
- VPOR Y7, Y0, Y0
- VPSRLD $0x0c, Y4, Y7
- VPSLLD $0x14, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x0c, Y5, Y7
- VPSLLD $0x14, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x0c, Y6, Y7
- VPSLLD $0x14, Y6, Y6
- VPOR Y7, Y6, Y6
- VMOVDQA (R10), Y7
- VPADDD 304(SP), Y7, Y7
- VPADDD 560(SP), Y1, Y1
- VPADDD 240(SP), Y2, Y2
- VPADDD 624(SP), Y3, Y3
- VPADDD Y0, Y7, Y7
- VPXOR Y7, Y14, Y14
- VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y15, Y15
- VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
- VPADDD Y5, Y2, Y2
- VPXOR Y2, Y8, Y8
- VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
- VPADDD Y6, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y14, Y10, Y10
- VPXOR Y10, Y0, Y0
- VPADDD Y15, Y11, Y11
- VPXOR Y11, Y4, Y4
- VPADDD Y8, Y12, Y12
- VPXOR Y12, Y5, Y5
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y6, Y6
- VMOVDQA Y7, (R10)
- VPSRLD $0x07, Y0, Y7
- VPSLLD $0x19, Y0, Y0
- VPOR Y7, Y0, Y0
- VPSRLD $0x07, Y4, Y7
- VPSLLD $0x19, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x07, Y5, Y7
- VPSLLD $0x19, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x07, Y6, Y7
- VPSLLD $0x19, Y6, Y6
- VPOR Y7, Y6, Y6
- VMOVDQA (R10), Y7
- VPADDD 368(SP), Y7, Y7
- VPADDD 464(SP), Y1, Y1
- VPADDD 528(SP), Y2, Y2
- VPADDD 432(SP), Y3, Y3
- VPADDD Y4, Y7, Y7
- VPXOR Y7, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y5, Y1, Y1
- VPXOR Y1, Y14, Y14
- VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y15, Y15
- VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y8, Y8
- VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
- VPADDD Y9, Y12, Y12
- VPXOR Y12, Y4, Y4
- VPADDD Y14, Y13, Y13
- VPXOR Y13, Y5, Y5
- VPADDD Y15, Y10, Y10
- VPXOR Y10, Y6, Y6
- VPADDD Y8, Y11, Y11
- VPXOR Y11, Y0, Y0
- VMOVDQA Y7, (R10)
- VPSRLD $0x0c, Y4, Y7
- VPSLLD $0x14, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x0c, Y5, Y7
- VPSLLD $0x14, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x0c, Y6, Y7
- VPSLLD $0x14, Y6, Y6
- VPOR Y7, Y6, Y6
- VPSRLD $0x0c, Y0, Y7
- VPSLLD $0x14, Y0, Y0
- VPOR Y7, Y0, Y0
- VMOVDQA (R10), Y7
- VPADDD 336(SP), Y7, Y7
- VPADDD 176(SP), Y1, Y1
- VPADDD 656(SP), Y2, Y2
- VPADDD 208(SP), Y3, Y3
- VPADDD Y4, Y7, Y7
- VPXOR Y7, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y5, Y1, Y1
- VPXOR Y1, Y14, Y14
- VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y15, Y15
- VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y8, Y8
- VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
- VPADDD Y9, Y12, Y12
- VPXOR Y12, Y4, Y4
- VPADDD Y14, Y13, Y13
- VPXOR Y13, Y5, Y5
- VPADDD Y15, Y10, Y10
- VPXOR Y10, Y6, Y6
- VPADDD Y8, Y11, Y11
- VPXOR Y11, Y0, Y0
- VMOVDQA Y7, (R10)
- VPSRLD $0x07, Y4, Y7
- VPSLLD $0x19, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x07, Y5, Y7
- VPSLLD $0x19, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x07, Y6, Y7
- VPSLLD $0x19, Y6, Y6
- VPOR Y7, Y6, Y6
- VPSRLD $0x07, Y0, Y7
- VPSLLD $0x19, Y0, Y0
- VPOR Y7, Y0, Y0
-
- // Round 4
- VMOVDQA (R10), Y7
- VPADDD 496(SP), Y7, Y7
- VPADDD 560(SP), Y1, Y1
- VPADDD 624(SP), Y2, Y2
- VPADDD 592(SP), Y3, Y3
- VPADDD Y0, Y7, Y7
- VPXOR Y7, Y14, Y14
- VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y15, Y15
- VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
- VPADDD Y5, Y2, Y2
- VPXOR Y2, Y8, Y8
- VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
- VPADDD Y6, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y14, Y10, Y10
- VPXOR Y10, Y0, Y0
- VPADDD Y15, Y11, Y11
- VPXOR Y11, Y4, Y4
- VPADDD Y8, Y12, Y12
- VPXOR Y12, Y5, Y5
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y6, Y6
- VMOVDQA Y7, (R10)
- VPSRLD $0x0c, Y0, Y7
- VPSLLD $0x14, Y0, Y0
- VPOR Y7, Y0, Y0
- VPSRLD $0x0c, Y4, Y7
- VPSLLD $0x14, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x0c, Y5, Y7
- VPSLLD $0x14, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x0c, Y6, Y7
- VPSLLD $0x14, Y6, Y6
- VPOR Y7, Y6, Y6
- VMOVDQA (R10), Y7
- VPADDD 400(SP), Y7, Y7
- VPADDD 464(SP), Y1, Y1
- VPADDD 272(SP), Y2, Y2
- VPADDD 656(SP), Y3, Y3
- VPADDD Y0, Y7, Y7
- VPXOR Y7, Y14, Y14
- VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y15, Y15
- VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
- VPADDD Y5, Y2, Y2
- VPXOR Y2, Y8, Y8
- VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
- VPADDD Y6, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y14, Y10, Y10
- VPXOR Y10, Y0, Y0
- VPADDD Y15, Y11, Y11
- VPXOR Y11, Y4, Y4
- VPADDD Y8, Y12, Y12
- VPXOR Y12, Y5, Y5
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y6, Y6
- VMOVDQA Y7, (R10)
- VPSRLD $0x07, Y0, Y7
- VPSLLD $0x19, Y0, Y0
- VPOR Y7, Y0, Y0
- VPSRLD $0x07, Y4, Y7
- VPSLLD $0x19, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x07, Y5, Y7
- VPSLLD $0x19, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x07, Y6, Y7
- VPSLLD $0x19, Y6, Y6
- VPOR Y7, Y6, Y6
- VMOVDQA (R10), Y7
- VPADDD 304(SP), Y7, Y7
- VPADDD 528(SP), Y1, Y1
- VPADDD 336(SP), Y2, Y2
- VPADDD 208(SP), Y3, Y3
- VPADDD Y4, Y7, Y7
- VPXOR Y7, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y5, Y1, Y1
- VPXOR Y1, Y14, Y14
- VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y15, Y15
- VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y8, Y8
- VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
- VPADDD Y9, Y12, Y12
- VPXOR Y12, Y4, Y4
- VPADDD Y14, Y13, Y13
- VPXOR Y13, Y5, Y5
- VPADDD Y15, Y10, Y10
- VPXOR Y10, Y6, Y6
- VPADDD Y8, Y11, Y11
- VPXOR Y11, Y0, Y0
- VMOVDQA Y7, (R10)
- VPSRLD $0x0c, Y4, Y7
- VPSLLD $0x14, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x0c, Y5, Y7
- VPSLLD $0x14, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x0c, Y6, Y7
- VPSLLD $0x14, Y6, Y6
- VPOR Y7, Y6, Y6
- VPSRLD $0x0c, Y0, Y7
- VPSLLD $0x14, Y0, Y0
- VPOR Y7, Y0, Y0
- VMOVDQA (R10), Y7
- VPADDD 176(SP), Y7, Y7
- VPADDD 240(SP), Y1, Y1
- VPADDD 432(SP), Y2, Y2
- VPADDD 368(SP), Y3, Y3
- VPADDD Y4, Y7, Y7
- VPXOR Y7, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y5, Y1, Y1
- VPXOR Y1, Y14, Y14
- VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y15, Y15
- VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y8, Y8
- VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
- VPADDD Y9, Y12, Y12
- VPXOR Y12, Y4, Y4
- VPADDD Y14, Y13, Y13
- VPXOR Y13, Y5, Y5
- VPADDD Y15, Y10, Y10
- VPXOR Y10, Y6, Y6
- VPADDD Y8, Y11, Y11
- VPXOR Y11, Y0, Y0
- VMOVDQA Y7, (R10)
- VPSRLD $0x07, Y4, Y7
- VPSLLD $0x19, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x07, Y5, Y7
- VPSLLD $0x19, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x07, Y6, Y7
- VPSLLD $0x19, Y6, Y6
- VPOR Y7, Y6, Y6
- VPSRLD $0x07, Y0, Y7
- VPSLLD $0x19, Y0, Y0
- VPOR Y7, Y0, Y0
-
- // Round 5
- VMOVDQA (R10), Y7
- VPADDD 560(SP), Y7, Y7
- VPADDD 464(SP), Y1, Y1
- VPADDD 656(SP), Y2, Y2
- VPADDD 624(SP), Y3, Y3
- VPADDD Y0, Y7, Y7
- VPXOR Y7, Y14, Y14
- VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y15, Y15
- VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
- VPADDD Y5, Y2, Y2
- VPXOR Y2, Y8, Y8
- VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
- VPADDD Y6, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y14, Y10, Y10
- VPXOR Y10, Y0, Y0
- VPADDD Y15, Y11, Y11
- VPXOR Y11, Y4, Y4
- VPADDD Y8, Y12, Y12
- VPXOR Y12, Y5, Y5
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y6, Y6
- VMOVDQA Y7, (R10)
- VPSRLD $0x0c, Y0, Y7
- VPSLLD $0x14, Y0, Y0
- VPOR Y7, Y0, Y0
- VPSRLD $0x0c, Y4, Y7
- VPSLLD $0x14, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x0c, Y5, Y7
- VPSLLD $0x14, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x0c, Y6, Y7
- VPSLLD $0x14, Y6, Y6
- VPOR Y7, Y6, Y6
- VMOVDQA (R10), Y7
- VPADDD 592(SP), Y7, Y7
- VPADDD 528(SP), Y1, Y1
- VPADDD 496(SP), Y2, Y2
- VPADDD 432(SP), Y3, Y3
- VPADDD Y0, Y7, Y7
- VPXOR Y7, Y14, Y14
- VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y15, Y15
- VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
- VPADDD Y5, Y2, Y2
- VPXOR Y2, Y8, Y8
- VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
- VPADDD Y6, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y14, Y10, Y10
- VPXOR Y10, Y0, Y0
- VPADDD Y15, Y11, Y11
- VPXOR Y11, Y4, Y4
- VPADDD Y8, Y12, Y12
- VPXOR Y12, Y5, Y5
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y6, Y6
- VMOVDQA Y7, (R10)
- VPSRLD $0x07, Y0, Y7
- VPSLLD $0x19, Y0, Y0
- VPOR Y7, Y0, Y0
- VPSRLD $0x07, Y4, Y7
- VPSLLD $0x19, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x07, Y5, Y7
- VPSLLD $0x19, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x07, Y6, Y7
- VPSLLD $0x19, Y6, Y6
- VPOR Y7, Y6, Y6
- VMOVDQA (R10), Y7
- VPADDD 400(SP), Y7, Y7
- VPADDD 336(SP), Y1, Y1
- VPADDD 176(SP), Y2, Y2
- VPADDD 368(SP), Y3, Y3
- VPADDD Y4, Y7, Y7
- VPXOR Y7, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y5, Y1, Y1
- VPXOR Y1, Y14, Y14
- VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y15, Y15
- VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y8, Y8
- VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
- VPADDD Y9, Y12, Y12
- VPXOR Y12, Y4, Y4
- VPADDD Y14, Y13, Y13
- VPXOR Y13, Y5, Y5
- VPADDD Y15, Y10, Y10
- VPXOR Y10, Y6, Y6
- VPADDD Y8, Y11, Y11
- VPXOR Y11, Y0, Y0
- VMOVDQA Y7, (R10)
- VPSRLD $0x0c, Y4, Y7
- VPSLLD $0x14, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x0c, Y5, Y7
- VPSLLD $0x14, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x0c, Y6, Y7
- VPSLLD $0x14, Y6, Y6
- VPOR Y7, Y6, Y6
- VPSRLD $0x0c, Y0, Y7
- VPSLLD $0x14, Y0, Y0
- VPOR Y7, Y0, Y0
- VMOVDQA (R10), Y7
- VPADDD 240(SP), Y7, Y7
- VPADDD 272(SP), Y1, Y1
- VPADDD 208(SP), Y2, Y2
- VPADDD 304(SP), Y3, Y3
- VPADDD Y4, Y7, Y7
- VPXOR Y7, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y5, Y1, Y1
- VPXOR Y1, Y14, Y14
- VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y15, Y15
- VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y8, Y8
- VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
- VPADDD Y9, Y12, Y12
- VPXOR Y12, Y4, Y4
- VPADDD Y14, Y13, Y13
- VPXOR Y13, Y5, Y5
- VPADDD Y15, Y10, Y10
- VPXOR Y10, Y6, Y6
- VPADDD Y8, Y11, Y11
- VPXOR Y11, Y0, Y0
- VMOVDQA Y7, (R10)
- VPSRLD $0x07, Y4, Y7
- VPSLLD $0x19, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x07, Y5, Y7
- VPSLLD $0x19, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x07, Y6, Y7
- VPSLLD $0x19, Y6, Y6
- VPOR Y7, Y6, Y6
- VPSRLD $0x07, Y0, Y7
- VPSLLD $0x19, Y0, Y0
- VPOR Y7, Y0, Y0
-
- // Round 6
- VMOVDQA (R10), Y7
- VPADDD 464(SP), Y7, Y7
- VPADDD 528(SP), Y1, Y1
- VPADDD 432(SP), Y2, Y2
- VPADDD 656(SP), Y3, Y3
- VPADDD Y0, Y7, Y7
- VPXOR Y7, Y14, Y14
- VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y15, Y15
- VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
- VPADDD Y5, Y2, Y2
- VPXOR Y2, Y8, Y8
- VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
- VPADDD Y6, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y14, Y10, Y10
- VPXOR Y10, Y0, Y0
- VPADDD Y15, Y11, Y11
- VPXOR Y11, Y4, Y4
- VPADDD Y8, Y12, Y12
- VPXOR Y12, Y5, Y5
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y6, Y6
- VMOVDQA Y7, (R10)
- VPSRLD $0x0c, Y0, Y7
- VPSLLD $0x14, Y0, Y0
- VPOR Y7, Y0, Y0
- VPSRLD $0x0c, Y4, Y7
- VPSLLD $0x14, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x0c, Y5, Y7
- VPSLLD $0x14, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x0c, Y6, Y7
- VPSLLD $0x14, Y6, Y6
- VPOR Y7, Y6, Y6
- VMOVDQA (R10), Y7
- VPADDD 624(SP), Y7, Y7
- VPADDD 336(SP), Y1, Y1
- VPADDD 560(SP), Y2, Y2
- VPADDD 208(SP), Y3, Y3
- VPADDD Y0, Y7, Y7
- VPXOR Y7, Y14, Y14
- VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y15, Y15
- VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
- VPADDD Y5, Y2, Y2
- VPXOR Y2, Y8, Y8
- VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
- VPADDD Y6, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y14, Y10, Y10
- VPXOR Y10, Y0, Y0
- VPADDD Y15, Y11, Y11
- VPXOR Y11, Y4, Y4
- VPADDD Y8, Y12, Y12
- VPXOR Y12, Y5, Y5
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y6, Y6
- VMOVDQA Y7, (R10)
- VPSRLD $0x07, Y0, Y7
- VPSLLD $0x19, Y0, Y0
- VPOR Y7, Y0, Y0
- VPSRLD $0x07, Y4, Y7
- VPSLLD $0x19, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x07, Y5, Y7
- VPSLLD $0x19, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x07, Y6, Y7
- VPSLLD $0x19, Y6, Y6
- VPOR Y7, Y6, Y6
- VMOVDQA (R10), Y7
- VPADDD 592(SP), Y7, Y7
- VPADDD 176(SP), Y1, Y1
- VPADDD 240(SP), Y2, Y2
- VPADDD 304(SP), Y3, Y3
- VPADDD Y4, Y7, Y7
- VPXOR Y7, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y5, Y1, Y1
- VPXOR Y1, Y14, Y14
- VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y15, Y15
- VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y8, Y8
- VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
- VPADDD Y9, Y12, Y12
- VPXOR Y12, Y4, Y4
- VPADDD Y14, Y13, Y13
- VPXOR Y13, Y5, Y5
- VPADDD Y15, Y10, Y10
- VPXOR Y10, Y6, Y6
- VPADDD Y8, Y11, Y11
- VPXOR Y11, Y0, Y0
- VMOVDQA Y7, (R10)
- VPSRLD $0x0c, Y4, Y7
- VPSLLD $0x14, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x0c, Y5, Y7
- VPSLLD $0x14, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x0c, Y6, Y7
- VPSLLD $0x14, Y6, Y6
- VPOR Y7, Y6, Y6
- VPSRLD $0x0c, Y0, Y7
- VPSLLD $0x14, Y0, Y0
- VPOR Y7, Y0, Y0
- VMOVDQA (R10), Y7
- VPADDD 272(SP), Y7, Y7
- VPADDD 496(SP), Y1, Y1
- VPADDD 368(SP), Y2, Y2
- VPADDD 400(SP), Y3, Y3
- VPADDD Y4, Y7, Y7
- VPXOR Y7, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y5, Y1, Y1
- VPXOR Y1, Y14, Y14
- VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y15, Y15
- VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y8, Y8
- VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
- VPADDD Y9, Y12, Y12
- VPXOR Y12, Y4, Y4
- VPADDD Y14, Y13, Y13
- VPXOR Y13, Y5, Y5
- VPADDD Y15, Y10, Y10
- VPXOR Y10, Y6, Y6
- VPADDD Y8, Y11, Y11
- VPXOR Y11, Y0, Y0
- VMOVDQA Y7, (R10)
- VPSRLD $0x07, Y4, Y7
- VPSLLD $0x19, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x07, Y5, Y7
- VPSLLD $0x19, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x07, Y6, Y7
- VPSLLD $0x19, Y6, Y6
- VPOR Y7, Y6, Y6
- VPSRLD $0x07, Y0, Y7
- VPSLLD $0x19, Y0, Y0
- VPOR Y7, Y0, Y0
-
- // Round 7
- VMOVDQA (R10), Y7
- VPADDD 528(SP), Y7, Y7
- VPADDD 336(SP), Y1, Y1
- VPADDD 208(SP), Y2, Y2
- VPADDD 432(SP), Y3, Y3
- VPADDD Y0, Y7, Y7
- VPXOR Y7, Y14, Y14
- VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y15, Y15
- VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
- VPADDD Y5, Y2, Y2
- VPXOR Y2, Y8, Y8
- VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
- VPADDD Y6, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y14, Y10, Y10
- VPXOR Y10, Y0, Y0
- VPADDD Y15, Y11, Y11
- VPXOR Y11, Y4, Y4
- VPADDD Y8, Y12, Y12
- VPXOR Y12, Y5, Y5
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y6, Y6
- VMOVDQA Y7, (R10)
- VPSRLD $0x0c, Y0, Y7
- VPSLLD $0x14, Y0, Y0
- VPOR Y7, Y0, Y0
- VPSRLD $0x0c, Y4, Y7
- VPSLLD $0x14, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x0c, Y5, Y7
- VPSLLD $0x14, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x0c, Y6, Y7
- VPSLLD $0x14, Y6, Y6
- VPOR Y7, Y6, Y6
- VMOVDQA (R10), Y7
- VPADDD 656(SP), Y7, Y7
- VPADDD 176(SP), Y1, Y1
- VPADDD 464(SP), Y2, Y2
- VPADDD 368(SP), Y3, Y3
- VPADDD Y0, Y7, Y7
- VPXOR Y7, Y14, Y14
- VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y15, Y15
- VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
- VPADDD Y5, Y2, Y2
- VPXOR Y2, Y8, Y8
- VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
- VPADDD Y6, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y14, Y10, Y10
- VPXOR Y10, Y0, Y0
- VPADDD Y15, Y11, Y11
- VPXOR Y11, Y4, Y4
- VPADDD Y8, Y12, Y12
- VPXOR Y12, Y5, Y5
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y6, Y6
- VMOVDQA Y7, (R10)
- VPSRLD $0x07, Y0, Y7
- VPSLLD $0x19, Y0, Y0
- VPOR Y7, Y0, Y0
- VPSRLD $0x07, Y4, Y7
- VPSLLD $0x19, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x07, Y5, Y7
- VPSLLD $0x19, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x07, Y6, Y7
- VPSLLD $0x19, Y6, Y6
- VPOR Y7, Y6, Y6
- VMOVDQA (R10), Y7
- VPADDD 624(SP), Y7, Y7
- VPADDD 240(SP), Y1, Y1
- VPADDD 272(SP), Y2, Y2
- VPADDD 400(SP), Y3, Y3
- VPADDD Y4, Y7, Y7
- VPXOR Y7, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y5, Y1, Y1
- VPXOR Y1, Y14, Y14
- VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y15, Y15
- VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y8, Y8
- VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
- VPADDD Y9, Y12, Y12
- VPXOR Y12, Y4, Y4
- VPADDD Y14, Y13, Y13
- VPXOR Y13, Y5, Y5
- VPADDD Y15, Y10, Y10
- VPXOR Y10, Y6, Y6
- VPADDD Y8, Y11, Y11
- VPXOR Y11, Y0, Y0
- VMOVDQA Y7, (R10)
- VPSRLD $0x0c, Y4, Y7
- VPSLLD $0x14, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x0c, Y5, Y7
- VPSLLD $0x14, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x0c, Y6, Y7
- VPSLLD $0x14, Y6, Y6
- VPOR Y7, Y6, Y6
- VPSRLD $0x0c, Y0, Y7
- VPSLLD $0x14, Y0, Y0
- VPOR Y7, Y0, Y0
- VMOVDQA (R10), Y7
- VPADDD 496(SP), Y7, Y7
- VPADDD 560(SP), Y1, Y1
- VPADDD 304(SP), Y2, Y2
- VPADDD 592(SP), Y3, Y3
- VPADDD Y4, Y7, Y7
- VPXOR Y7, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y5, Y1, Y1
- VPXOR Y1, Y14, Y14
- VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y15, Y15
- VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y8, Y8
- VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
- VPADDD Y9, Y12, Y12
- VPXOR Y12, Y4, Y4
- VPADDD Y14, Y13, Y13
- VPXOR Y13, Y5, Y5
- VPADDD Y15, Y10, Y10
- VPXOR Y10, Y6, Y6
- VPADDD Y8, Y11, Y11
- VPXOR Y11, Y0, Y0
- VMOVDQA Y7, (R10)
- VPSRLD $0x07, Y4, Y7
- VPSLLD $0x19, Y4, Y4
- VPOR Y7, Y4, Y4
- VPSRLD $0x07, Y5, Y7
- VPSLLD $0x19, Y5, Y5
- VPOR Y7, Y5, Y5
- VPSRLD $0x07, Y6, Y7
- VPSLLD $0x19, Y6, Y6
- VPOR Y7, Y6, Y6
- VPSRLD $0x07, Y0, Y7
- VPSLLD $0x19, Y0, Y0
- VPOR Y7, Y0, Y0
-
- // Finalize rounds
- VPXOR Y9, Y6, Y6
- VPXOR (R10), Y10, Y7
- VPXOR Y11, Y1, Y1
- VPXOR Y12, Y2, Y2
- VPXOR Y13, Y3, Y3
- VPXOR Y14, Y0, Y0
- VPXOR Y15, Y4, Y4
- VPXOR Y8, Y5, Y5
-
- // Fix up registers for next iteration
- VMOVDQU Y7, Y8
- VMOVDQU Y6, Y7
- VMOVDQU Y5, Y6
- VMOVDQU Y4, Y5
- VMOVDQU Y0, Y4
- VMOVDQU Y8, Y0
-
- // If we have zero complete chunks, we're done
- CMPQ R8, $0x00
- JNE loop_trailer
- CMPQ R9, CX
- JEQ finalize
-
-loop_trailer:
- // Increment, reset flags, and loop
- CMPQ CX, $0x000003c0
- JEQ finalize
- ADDQ $0x40, CX
- MOVL BX, 64(SP)
- JMP loop
-
-finalize:
- // Store result into out
- VMOVDQU Y0, (SI)
- VMOVDQU Y1, 32(SI)
- VMOVDQU Y2, 64(SI)
- VMOVDQU Y3, 96(SI)
- VMOVDQU Y4, 128(SI)
- VMOVDQU Y5, 160(SI)
- VMOVDQU Y6, 192(SI)
- VMOVDQU Y7, 224(SI)
- VZEROUPPER
- RET
-
-// func HashP(left *[32]uint32, right *[32]uint32, flags uint8, key *[8]uint32, out *[32]uint32, n int)
-// Requires: AVX, AVX2
-TEXT ·HashP(SB), NOSPLIT, $72-48
- MOVQ left+0(FP), AX
- MOVQ right+8(FP), CX
- MOVBLZX flags+16(FP), DX
- MOVQ key+24(FP), BX
- MOVQ out+32(FP), BP
-
- // Allocate local space and align it
- LEAQ 31(SP), SI
- MOVQ $0x000000000000001f, DI
- NOTQ DI
- ANDQ DI, SI
-
- // Set up flags value
- MOVL DX, 64(SP)
-
- // Perform the rounds
- // Round 1
- VPBROADCASTD (BX), Y0
- VPADDD (AX), Y0, Y0
- VPBROADCASTD 4(BX), Y1
- VPADDD 64(AX), Y1, Y1
- VPBROADCASTD 8(BX), Y2
- VPADDD 128(AX), Y2, Y2
- VPBROADCASTD 12(BX), Y3
- VPADDD 192(AX), Y3, Y3
- VPBROADCASTD 16(BX), Y4
- VPADDD Y4, Y0, Y0
- VMOVDQU zero<>+0(SB), Y5
- VPXOR Y0, Y5, Y5
- VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
- VPBROADCASTD 20(BX), Y6
- VPADDD Y6, Y1, Y1
- VMOVDQU zero<>+0(SB), Y7
- VPXOR Y1, Y7, Y7
- VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
- VPBROADCASTD 24(BX), Y8
- VPADDD Y8, Y2, Y2
- VMOVDQU block_len<>+0(SB), Y9
- VPXOR Y2, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPBROADCASTD 28(BX), Y10
- VPADDD Y10, Y3, Y3
- VPBROADCASTD 64(SP), Y11
- VPXOR Y3, Y11, Y11
- VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
- VPBROADCASTD iv<>+0(SB), Y12
- VPADDD Y5, Y12, Y12
- VPXOR Y12, Y4, Y4
- VPBROADCASTD iv<>+4(SB), Y13
- VPADDD Y7, Y13, Y13
- VPXOR Y13, Y6, Y6
- VPBROADCASTD iv<>+8(SB), Y14
- VPADDD Y9, Y14, Y14
- VPXOR Y14, Y8, Y8
- VPBROADCASTD iv<>+12(SB), Y15
- VPADDD Y11, Y15, Y15
- VPXOR Y15, Y10, Y10
- VMOVDQA Y0, (SI)
- VPSRLD $0x0c, Y4, Y0
- VPSLLD $0x14, Y4, Y4
- VPOR Y0, Y4, Y0
- VPSRLD $0x0c, Y6, Y4
- VPSLLD $0x14, Y6, Y6
- VPOR Y4, Y6, Y4
- VPSRLD $0x0c, Y8, Y6
- VPSLLD $0x14, Y8, Y8
- VPOR Y6, Y8, Y6
- VPSRLD $0x0c, Y10, Y8
- VPSLLD $0x14, Y10, Y10
- VPOR Y8, Y10, Y8
- VMOVDQA (SI), Y10
- VPADDD 32(AX), Y10, Y10
- VPADDD 96(AX), Y1, Y1
- VPADDD 160(AX), Y2, Y2
- VPADDD 224(AX), Y3, Y3
- VPADDD Y0, Y10, Y10
- VPXOR Y10, Y5, Y5
- VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y7, Y7
- VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y8, Y3, Y3
- VPXOR Y3, Y11, Y11
- VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
- VPADDD Y5, Y12, Y12
- VPXOR Y12, Y0, Y0
- VPADDD Y7, Y13, Y13
- VPXOR Y13, Y4, Y4
- VPADDD Y9, Y14, Y14
- VPXOR Y14, Y6, Y6
- VPADDD Y11, Y15, Y15
- VPXOR Y15, Y8, Y8
- VMOVDQA Y10, (SI)
- VPSRLD $0x07, Y0, Y10
- VPSLLD $0x19, Y0, Y0
- VPOR Y10, Y0, Y0
- VPSRLD $0x07, Y4, Y10
- VPSLLD $0x19, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x07, Y6, Y10
- VPSLLD $0x19, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x07, Y8, Y10
- VPSLLD $0x19, Y8, Y8
- VPOR Y10, Y8, Y8
- VMOVDQA (SI), Y10
- VPADDD (CX), Y10, Y10
- VPADDD 64(CX), Y1, Y1
- VPADDD 128(CX), Y2, Y2
- VPADDD 192(CX), Y3, Y3
- VPADDD Y4, Y10, Y10
- VPXOR Y10, Y11, Y11
- VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
- VPADDD Y6, Y1, Y1
- VPXOR Y1, Y5, Y5
- VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
- VPADDD Y8, Y2, Y2
- VPXOR Y2, Y7, Y7
- VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y11, Y14, Y14
- VPXOR Y14, Y4, Y4
- VPADDD Y5, Y15, Y15
- VPXOR Y15, Y6, Y6
- VPADDD Y7, Y12, Y12
- VPXOR Y12, Y8, Y8
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y0, Y0
- VMOVDQA Y10, (SI)
- VPSRLD $0x0c, Y4, Y10
- VPSLLD $0x14, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x0c, Y6, Y10
- VPSLLD $0x14, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x0c, Y8, Y10
- VPSLLD $0x14, Y8, Y8
- VPOR Y10, Y8, Y8
- VPSRLD $0x0c, Y0, Y10
- VPSLLD $0x14, Y0, Y0
- VPOR Y10, Y0, Y0
- VMOVDQA (SI), Y10
- VPADDD 32(CX), Y10, Y10
- VPADDD 96(CX), Y1, Y1
- VPADDD 160(CX), Y2, Y2
- VPADDD 224(CX), Y3, Y3
- VPADDD Y4, Y10, Y10
- VPXOR Y10, Y11, Y11
- VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
- VPADDD Y6, Y1, Y1
- VPXOR Y1, Y5, Y5
- VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
- VPADDD Y8, Y2, Y2
- VPXOR Y2, Y7, Y7
- VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y11, Y14, Y14
- VPXOR Y14, Y4, Y4
- VPADDD Y5, Y15, Y15
- VPXOR Y15, Y6, Y6
- VPADDD Y7, Y12, Y12
- VPXOR Y12, Y8, Y8
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y0, Y0
- VMOVDQA Y10, (SI)
- VPSRLD $0x07, Y4, Y10
- VPSLLD $0x19, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x07, Y6, Y10
- VPSLLD $0x19, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x07, Y8, Y10
- VPSLLD $0x19, Y8, Y8
- VPOR Y10, Y8, Y8
- VPSRLD $0x07, Y0, Y10
- VPSLLD $0x19, Y0, Y0
- VPOR Y10, Y0, Y0
-
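(Note on the message offsets: each message-word row is 32 bytes, one uint32 for each of the 8 parents, so (AX)..224(AX) are words 0-7 from the left children and (CX)..224(CX) are words 8-15 from the right children. Round 1 reads the words in natural order; the later rounds reuse the same registers but fetch the words through the fixed BLAKE3 message permutation, which is why the offsets change from round to round. A sketch of that schedule, assuming the permutation from the public BLAKE3 specification:

    package main

    import "fmt"

    // perm is the fixed BLAKE3 message permutation applied between rounds.
    var perm = [16]int{2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}

    // schedule returns, for each of the 7 rounds, the message word index
    // fed into each of the 16 G-function inputs. Round 0 is the identity
    // order; every later round permutes the previous one.
    func schedule() [7][16]int {
    	var s [7][16]int
    	for i := range s[0] {
    		s[0][i] = i
    	}
    	for r := 1; r < 7; r++ {
    		for i := 0; i < 16; i++ {
    			s[r][i] = s[r-1][perm[i]]
    		}
    	}
    	return s
    }

    func main() {
    	fmt.Println(schedule()[1]) // word order used by "Round 2" below
    }

For example, schedule()[1] starts 2, 6, 3, 10, 7, 0, 4, 13, which matches the 64(AX), 96(AX), 224(AX), 128(AX) and 192(AX), 64(CX), (AX), 160(CX) loads in the next round.)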
- // Round 2
- VMOVDQA (SI), Y10
- VPADDD 64(AX), Y10, Y10
- VPADDD 96(AX), Y1, Y1
- VPADDD 224(AX), Y2, Y2
- VPADDD 128(AX), Y3, Y3
- VPADDD Y0, Y10, Y10
- VPXOR Y10, Y5, Y5
- VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y7, Y7
- VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y8, Y3, Y3
- VPXOR Y3, Y11, Y11
- VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
- VPADDD Y5, Y12, Y12
- VPXOR Y12, Y0, Y0
- VPADDD Y7, Y13, Y13
- VPXOR Y13, Y4, Y4
- VPADDD Y9, Y14, Y14
- VPXOR Y14, Y6, Y6
- VPADDD Y11, Y15, Y15
- VPXOR Y15, Y8, Y8
- VMOVDQA Y10, (SI)
- VPSRLD $0x0c, Y0, Y10
- VPSLLD $0x14, Y0, Y0
- VPOR Y10, Y0, Y0
- VPSRLD $0x0c, Y4, Y10
- VPSLLD $0x14, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x0c, Y6, Y10
- VPSLLD $0x14, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x0c, Y8, Y10
- VPSLLD $0x14, Y8, Y8
- VPOR Y10, Y8, Y8
- VMOVDQA (SI), Y10
- VPADDD 192(AX), Y10, Y10
- VPADDD 64(CX), Y1, Y1
- VPADDD (AX), Y2, Y2
- VPADDD 160(CX), Y3, Y3
- VPADDD Y0, Y10, Y10
- VPXOR Y10, Y5, Y5
- VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y7, Y7
- VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y8, Y3, Y3
- VPXOR Y3, Y11, Y11
- VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
- VPADDD Y5, Y12, Y12
- VPXOR Y12, Y0, Y0
- VPADDD Y7, Y13, Y13
- VPXOR Y13, Y4, Y4
- VPADDD Y9, Y14, Y14
- VPXOR Y14, Y6, Y6
- VPADDD Y11, Y15, Y15
- VPXOR Y15, Y8, Y8
- VMOVDQA Y10, (SI)
- VPSRLD $0x07, Y0, Y10
- VPSLLD $0x19, Y0, Y0
- VPOR Y10, Y0, Y0
- VPSRLD $0x07, Y4, Y10
- VPSLLD $0x19, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x07, Y6, Y10
- VPSLLD $0x19, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x07, Y8, Y10
- VPSLLD $0x19, Y8, Y8
- VPOR Y10, Y8, Y8
- VMOVDQA (SI), Y10
- VPADDD 32(AX), Y10, Y10
- VPADDD 128(CX), Y1, Y1
- VPADDD 32(CX), Y2, Y2
- VPADDD 224(CX), Y3, Y3
- VPADDD Y4, Y10, Y10
- VPXOR Y10, Y11, Y11
- VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
- VPADDD Y6, Y1, Y1
- VPXOR Y1, Y5, Y5
- VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
- VPADDD Y8, Y2, Y2
- VPXOR Y2, Y7, Y7
- VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y11, Y14, Y14
- VPXOR Y14, Y4, Y4
- VPADDD Y5, Y15, Y15
- VPXOR Y15, Y6, Y6
- VPADDD Y7, Y12, Y12
- VPXOR Y12, Y8, Y8
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y0, Y0
- VMOVDQA Y10, (SI)
- VPSRLD $0x0c, Y4, Y10
- VPSLLD $0x14, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x0c, Y6, Y10
- VPSLLD $0x14, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x0c, Y8, Y10
- VPSLLD $0x14, Y8, Y8
- VPOR Y10, Y8, Y8
- VPSRLD $0x0c, Y0, Y10
- VPSLLD $0x14, Y0, Y0
- VPOR Y10, Y0, Y0
- VMOVDQA (SI), Y10
- VPADDD 96(CX), Y10, Y10
- VPADDD 160(AX), Y1, Y1
- VPADDD 192(CX), Y2, Y2
- VPADDD (CX), Y3, Y3
- VPADDD Y4, Y10, Y10
- VPXOR Y10, Y11, Y11
- VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
- VPADDD Y6, Y1, Y1
- VPXOR Y1, Y5, Y5
- VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
- VPADDD Y8, Y2, Y2
- VPXOR Y2, Y7, Y7
- VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y11, Y14, Y14
- VPXOR Y14, Y4, Y4
- VPADDD Y5, Y15, Y15
- VPXOR Y15, Y6, Y6
- VPADDD Y7, Y12, Y12
- VPXOR Y12, Y8, Y8
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y0, Y0
- VMOVDQA Y10, (SI)
- VPSRLD $0x07, Y4, Y10
- VPSLLD $0x19, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x07, Y6, Y10
- VPSLLD $0x19, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x07, Y8, Y10
- VPSLLD $0x19, Y8, Y8
- VPOR Y10, Y8, Y8
- VPSRLD $0x07, Y0, Y10
- VPSLLD $0x19, Y0, Y0
- VPOR Y10, Y0, Y0
-
- // Round 3
- VMOVDQA (SI), Y10
- VPADDD 96(AX), Y10, Y10
- VPADDD 64(CX), Y1, Y1
- VPADDD 160(CX), Y2, Y2
- VPADDD 224(AX), Y3, Y3
- VPADDD Y0, Y10, Y10
- VPXOR Y10, Y5, Y5
- VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y7, Y7
- VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y8, Y3, Y3
- VPXOR Y3, Y11, Y11
- VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
- VPADDD Y5, Y12, Y12
- VPXOR Y12, Y0, Y0
- VPADDD Y7, Y13, Y13
- VPXOR Y13, Y4, Y4
- VPADDD Y9, Y14, Y14
- VPXOR Y14, Y6, Y6
- VPADDD Y11, Y15, Y15
- VPXOR Y15, Y8, Y8
- VMOVDQA Y10, (SI)
- VPSRLD $0x0c, Y0, Y10
- VPSLLD $0x14, Y0, Y0
- VPOR Y10, Y0, Y0
- VPSRLD $0x0c, Y4, Y10
- VPSLLD $0x14, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x0c, Y6, Y10
- VPSLLD $0x14, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x0c, Y8, Y10
- VPSLLD $0x14, Y8, Y8
- VPOR Y10, Y8, Y8
- VMOVDQA (SI), Y10
- VPADDD 128(AX), Y10, Y10
- VPADDD 128(CX), Y1, Y1
- VPADDD 64(AX), Y2, Y2
- VPADDD 192(CX), Y3, Y3
- VPADDD Y0, Y10, Y10
- VPXOR Y10, Y5, Y5
- VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y7, Y7
- VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y8, Y3, Y3
- VPXOR Y3, Y11, Y11
- VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
- VPADDD Y5, Y12, Y12
- VPXOR Y12, Y0, Y0
- VPADDD Y7, Y13, Y13
- VPXOR Y13, Y4, Y4
- VPADDD Y9, Y14, Y14
- VPXOR Y14, Y6, Y6
- VPADDD Y11, Y15, Y15
- VPXOR Y15, Y8, Y8
- VMOVDQA Y10, (SI)
- VPSRLD $0x07, Y0, Y10
- VPSLLD $0x19, Y0, Y0
- VPOR Y10, Y0, Y0
- VPSRLD $0x07, Y4, Y10
- VPSLLD $0x19, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x07, Y6, Y10
- VPSLLD $0x19, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x07, Y8, Y10
- VPSLLD $0x19, Y8, Y8
- VPOR Y10, Y8, Y8
- VMOVDQA (SI), Y10
- VPADDD 192(AX), Y10, Y10
- VPADDD 32(CX), Y1, Y1
- VPADDD 96(CX), Y2, Y2
- VPADDD (CX), Y3, Y3
- VPADDD Y4, Y10, Y10
- VPXOR Y10, Y11, Y11
- VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
- VPADDD Y6, Y1, Y1
- VPXOR Y1, Y5, Y5
- VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
- VPADDD Y8, Y2, Y2
- VPXOR Y2, Y7, Y7
- VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y11, Y14, Y14
- VPXOR Y14, Y4, Y4
- VPADDD Y5, Y15, Y15
- VPXOR Y15, Y6, Y6
- VPADDD Y7, Y12, Y12
- VPXOR Y12, Y8, Y8
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y0, Y0
- VMOVDQA Y10, (SI)
- VPSRLD $0x0c, Y4, Y10
- VPSLLD $0x14, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x0c, Y6, Y10
- VPSLLD $0x14, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x0c, Y8, Y10
- VPSLLD $0x14, Y8, Y8
- VPOR Y10, Y8, Y8
- VPSRLD $0x0c, Y0, Y10
- VPSLLD $0x14, Y0, Y0
- VPOR Y10, Y0, Y0
- VMOVDQA (SI), Y10
- VPADDD 160(AX), Y10, Y10
- VPADDD (AX), Y1, Y1
- VPADDD 224(CX), Y2, Y2
- VPADDD 32(AX), Y3, Y3
- VPADDD Y4, Y10, Y10
- VPXOR Y10, Y11, Y11
- VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
- VPADDD Y6, Y1, Y1
- VPXOR Y1, Y5, Y5
- VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
- VPADDD Y8, Y2, Y2
- VPXOR Y2, Y7, Y7
- VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y11, Y14, Y14
- VPXOR Y14, Y4, Y4
- VPADDD Y5, Y15, Y15
- VPXOR Y15, Y6, Y6
- VPADDD Y7, Y12, Y12
- VPXOR Y12, Y8, Y8
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y0, Y0
- VMOVDQA Y10, (SI)
- VPSRLD $0x07, Y4, Y10
- VPSLLD $0x19, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x07, Y6, Y10
- VPSLLD $0x19, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x07, Y8, Y10
- VPSLLD $0x19, Y8, Y8
- VPOR Y10, Y8, Y8
- VPSRLD $0x07, Y0, Y10
- VPSLLD $0x19, Y0, Y0
- VPOR Y10, Y0, Y0
-
- // Round 4
- VMOVDQA (SI), Y10
- VPADDD 64(CX), Y10, Y10
- VPADDD 128(CX), Y1, Y1
- VPADDD 192(CX), Y2, Y2
- VPADDD 160(CX), Y3, Y3
- VPADDD Y0, Y10, Y10
- VPXOR Y10, Y5, Y5
- VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y7, Y7
- VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y8, Y3, Y3
- VPXOR Y3, Y11, Y11
- VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
- VPADDD Y5, Y12, Y12
- VPXOR Y12, Y0, Y0
- VPADDD Y7, Y13, Y13
- VPXOR Y13, Y4, Y4
- VPADDD Y9, Y14, Y14
- VPXOR Y14, Y6, Y6
- VPADDD Y11, Y15, Y15
- VPXOR Y15, Y8, Y8
- VMOVDQA Y10, (SI)
- VPSRLD $0x0c, Y0, Y10
- VPSLLD $0x14, Y0, Y0
- VPOR Y10, Y0, Y0
- VPSRLD $0x0c, Y4, Y10
- VPSLLD $0x14, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x0c, Y6, Y10
- VPSLLD $0x14, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x0c, Y8, Y10
- VPSLLD $0x14, Y8, Y8
- VPOR Y10, Y8, Y8
- VMOVDQA (SI), Y10
- VPADDD 224(AX), Y10, Y10
- VPADDD 32(CX), Y1, Y1
- VPADDD 96(AX), Y2, Y2
- VPADDD 224(CX), Y3, Y3
- VPADDD Y0, Y10, Y10
- VPXOR Y10, Y5, Y5
- VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y7, Y7
- VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y8, Y3, Y3
- VPXOR Y3, Y11, Y11
- VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
- VPADDD Y5, Y12, Y12
- VPXOR Y12, Y0, Y0
- VPADDD Y7, Y13, Y13
- VPXOR Y13, Y4, Y4
- VPADDD Y9, Y14, Y14
- VPXOR Y14, Y6, Y6
- VPADDD Y11, Y15, Y15
- VPXOR Y15, Y8, Y8
- VMOVDQA Y10, (SI)
- VPSRLD $0x07, Y0, Y10
- VPSLLD $0x19, Y0, Y0
- VPOR Y10, Y0, Y0
- VPSRLD $0x07, Y4, Y10
- VPSLLD $0x19, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x07, Y6, Y10
- VPSLLD $0x19, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x07, Y8, Y10
- VPSLLD $0x19, Y8, Y8
- VPOR Y10, Y8, Y8
- VMOVDQA (SI), Y10
- VPADDD 128(AX), Y10, Y10
- VPADDD 96(CX), Y1, Y1
- VPADDD 160(AX), Y2, Y2
- VPADDD 32(AX), Y3, Y3
- VPADDD Y4, Y10, Y10
- VPXOR Y10, Y11, Y11
- VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
- VPADDD Y6, Y1, Y1
- VPXOR Y1, Y5, Y5
- VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
- VPADDD Y8, Y2, Y2
- VPXOR Y2, Y7, Y7
- VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y11, Y14, Y14
- VPXOR Y14, Y4, Y4
- VPADDD Y5, Y15, Y15
- VPXOR Y15, Y6, Y6
- VPADDD Y7, Y12, Y12
- VPXOR Y12, Y8, Y8
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y0, Y0
- VMOVDQA Y10, (SI)
- VPSRLD $0x0c, Y4, Y10
- VPSLLD $0x14, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x0c, Y6, Y10
- VPSLLD $0x14, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x0c, Y8, Y10
- VPSLLD $0x14, Y8, Y8
- VPOR Y10, Y8, Y8
- VPSRLD $0x0c, Y0, Y10
- VPSLLD $0x14, Y0, Y0
- VPOR Y10, Y0, Y0
- VMOVDQA (SI), Y10
- VPADDD (AX), Y10, Y10
- VPADDD 64(AX), Y1, Y1
- VPADDD (CX), Y2, Y2
- VPADDD 192(AX), Y3, Y3
- VPADDD Y4, Y10, Y10
- VPXOR Y10, Y11, Y11
- VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
- VPADDD Y6, Y1, Y1
- VPXOR Y1, Y5, Y5
- VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
- VPADDD Y8, Y2, Y2
- VPXOR Y2, Y7, Y7
- VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y11, Y14, Y14
- VPXOR Y14, Y4, Y4
- VPADDD Y5, Y15, Y15
- VPXOR Y15, Y6, Y6
- VPADDD Y7, Y12, Y12
- VPXOR Y12, Y8, Y8
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y0, Y0
- VMOVDQA Y10, (SI)
- VPSRLD $0x07, Y4, Y10
- VPSLLD $0x19, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x07, Y6, Y10
- VPSLLD $0x19, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x07, Y8, Y10
- VPSLLD $0x19, Y8, Y8
- VPOR Y10, Y8, Y8
- VPSRLD $0x07, Y0, Y10
- VPSLLD $0x19, Y0, Y0
- VPOR Y10, Y0, Y0
-
- // Round 5
- VMOVDQA (SI), Y10
- VPADDD 128(CX), Y10, Y10
- VPADDD 32(CX), Y1, Y1
- VPADDD 224(CX), Y2, Y2
- VPADDD 192(CX), Y3, Y3
- VPADDD Y0, Y10, Y10
- VPXOR Y10, Y5, Y5
- VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y7, Y7
- VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y8, Y3, Y3
- VPXOR Y3, Y11, Y11
- VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
- VPADDD Y5, Y12, Y12
- VPXOR Y12, Y0, Y0
- VPADDD Y7, Y13, Y13
- VPXOR Y13, Y4, Y4
- VPADDD Y9, Y14, Y14
- VPXOR Y14, Y6, Y6
- VPADDD Y11, Y15, Y15
- VPXOR Y15, Y8, Y8
- VMOVDQA Y10, (SI)
- VPSRLD $0x0c, Y0, Y10
- VPSLLD $0x14, Y0, Y0
- VPOR Y10, Y0, Y0
- VPSRLD $0x0c, Y4, Y10
- VPSLLD $0x14, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x0c, Y6, Y10
- VPSLLD $0x14, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x0c, Y8, Y10
- VPSLLD $0x14, Y8, Y8
- VPOR Y10, Y8, Y8
- VMOVDQA (SI), Y10
- VPADDD 160(CX), Y10, Y10
- VPADDD 96(CX), Y1, Y1
- VPADDD 64(CX), Y2, Y2
- VPADDD (CX), Y3, Y3
- VPADDD Y0, Y10, Y10
- VPXOR Y10, Y5, Y5
- VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y7, Y7
- VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y8, Y3, Y3
- VPXOR Y3, Y11, Y11
- VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
- VPADDD Y5, Y12, Y12
- VPXOR Y12, Y0, Y0
- VPADDD Y7, Y13, Y13
- VPXOR Y13, Y4, Y4
- VPADDD Y9, Y14, Y14
- VPXOR Y14, Y6, Y6
- VPADDD Y11, Y15, Y15
- VPXOR Y15, Y8, Y8
- VMOVDQA Y10, (SI)
- VPSRLD $0x07, Y0, Y10
- VPSLLD $0x19, Y0, Y0
- VPOR Y10, Y0, Y0
- VPSRLD $0x07, Y4, Y10
- VPSLLD $0x19, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x07, Y6, Y10
- VPSLLD $0x19, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x07, Y8, Y10
- VPSLLD $0x19, Y8, Y8
- VPOR Y10, Y8, Y8
- VMOVDQA (SI), Y10
- VPADDD 224(AX), Y10, Y10
- VPADDD 160(AX), Y1, Y1
- VPADDD (AX), Y2, Y2
- VPADDD 192(AX), Y3, Y3
- VPADDD Y4, Y10, Y10
- VPXOR Y10, Y11, Y11
- VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
- VPADDD Y6, Y1, Y1
- VPXOR Y1, Y5, Y5
- VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
- VPADDD Y8, Y2, Y2
- VPXOR Y2, Y7, Y7
- VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y11, Y14, Y14
- VPXOR Y14, Y4, Y4
- VPADDD Y5, Y15, Y15
- VPXOR Y15, Y6, Y6
- VPADDD Y7, Y12, Y12
- VPXOR Y12, Y8, Y8
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y0, Y0
- VMOVDQA Y10, (SI)
- VPSRLD $0x0c, Y4, Y10
- VPSLLD $0x14, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x0c, Y6, Y10
- VPSLLD $0x14, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x0c, Y8, Y10
- VPSLLD $0x14, Y8, Y8
- VPOR Y10, Y8, Y8
- VPSRLD $0x0c, Y0, Y10
- VPSLLD $0x14, Y0, Y0
- VPOR Y10, Y0, Y0
- VMOVDQA (SI), Y10
- VPADDD 64(AX), Y10, Y10
- VPADDD 96(AX), Y1, Y1
- VPADDD 32(AX), Y2, Y2
- VPADDD 128(AX), Y3, Y3
- VPADDD Y4, Y10, Y10
- VPXOR Y10, Y11, Y11
- VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
- VPADDD Y6, Y1, Y1
- VPXOR Y1, Y5, Y5
- VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
- VPADDD Y8, Y2, Y2
- VPXOR Y2, Y7, Y7
- VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y11, Y14, Y14
- VPXOR Y14, Y4, Y4
- VPADDD Y5, Y15, Y15
- VPXOR Y15, Y6, Y6
- VPADDD Y7, Y12, Y12
- VPXOR Y12, Y8, Y8
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y0, Y0
- VMOVDQA Y10, (SI)
- VPSRLD $0x07, Y4, Y10
- VPSLLD $0x19, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x07, Y6, Y10
- VPSLLD $0x19, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x07, Y8, Y10
- VPSLLD $0x19, Y8, Y8
- VPOR Y10, Y8, Y8
- VPSRLD $0x07, Y0, Y10
- VPSLLD $0x19, Y0, Y0
- VPOR Y10, Y0, Y0
-
- // Round 6
- VMOVDQA (SI), Y10
- VPADDD 32(CX), Y10, Y10
- VPADDD 96(CX), Y1, Y1
- VPADDD (CX), Y2, Y2
- VPADDD 224(CX), Y3, Y3
- VPADDD Y0, Y10, Y10
- VPXOR Y10, Y5, Y5
- VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y7, Y7
- VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y8, Y3, Y3
- VPXOR Y3, Y11, Y11
- VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
- VPADDD Y5, Y12, Y12
- VPXOR Y12, Y0, Y0
- VPADDD Y7, Y13, Y13
- VPXOR Y13, Y4, Y4
- VPADDD Y9, Y14, Y14
- VPXOR Y14, Y6, Y6
- VPADDD Y11, Y15, Y15
- VPXOR Y15, Y8, Y8
- VMOVDQA Y10, (SI)
- VPSRLD $0x0c, Y0, Y10
- VPSLLD $0x14, Y0, Y0
- VPOR Y10, Y0, Y0
- VPSRLD $0x0c, Y4, Y10
- VPSLLD $0x14, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x0c, Y6, Y10
- VPSLLD $0x14, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x0c, Y8, Y10
- VPSLLD $0x14, Y8, Y8
- VPOR Y10, Y8, Y8
- VMOVDQA (SI), Y10
- VPADDD 192(CX), Y10, Y10
- VPADDD 160(AX), Y1, Y1
- VPADDD 128(CX), Y2, Y2
- VPADDD 32(AX), Y3, Y3
- VPADDD Y0, Y10, Y10
- VPXOR Y10, Y5, Y5
- VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y7, Y7
- VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y8, Y3, Y3
- VPXOR Y3, Y11, Y11
- VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
- VPADDD Y5, Y12, Y12
- VPXOR Y12, Y0, Y0
- VPADDD Y7, Y13, Y13
- VPXOR Y13, Y4, Y4
- VPADDD Y9, Y14, Y14
- VPXOR Y14, Y6, Y6
- VPADDD Y11, Y15, Y15
- VPXOR Y15, Y8, Y8
- VMOVDQA Y10, (SI)
- VPSRLD $0x07, Y0, Y10
- VPSLLD $0x19, Y0, Y0
- VPOR Y10, Y0, Y0
- VPSRLD $0x07, Y4, Y10
- VPSLLD $0x19, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x07, Y6, Y10
- VPSLLD $0x19, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x07, Y8, Y10
- VPSLLD $0x19, Y8, Y8
- VPOR Y10, Y8, Y8
- VMOVDQA (SI), Y10
- VPADDD 160(CX), Y10, Y10
- VPADDD (AX), Y1, Y1
- VPADDD 64(AX), Y2, Y2
- VPADDD 128(AX), Y3, Y3
- VPADDD Y4, Y10, Y10
- VPXOR Y10, Y11, Y11
- VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
- VPADDD Y6, Y1, Y1
- VPXOR Y1, Y5, Y5
- VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
- VPADDD Y8, Y2, Y2
- VPXOR Y2, Y7, Y7
- VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y11, Y14, Y14
- VPXOR Y14, Y4, Y4
- VPADDD Y5, Y15, Y15
- VPXOR Y15, Y6, Y6
- VPADDD Y7, Y12, Y12
- VPXOR Y12, Y8, Y8
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y0, Y0
- VMOVDQA Y10, (SI)
- VPSRLD $0x0c, Y4, Y10
- VPSLLD $0x14, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x0c, Y6, Y10
- VPSLLD $0x14, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x0c, Y8, Y10
- VPSLLD $0x14, Y8, Y8
- VPOR Y10, Y8, Y8
- VPSRLD $0x0c, Y0, Y10
- VPSLLD $0x14, Y0, Y0
- VPOR Y10, Y0, Y0
- VMOVDQA (SI), Y10
- VPADDD 96(AX), Y10, Y10
- VPADDD 64(CX), Y1, Y1
- VPADDD 192(AX), Y2, Y2
- VPADDD 224(AX), Y3, Y3
- VPADDD Y4, Y10, Y10
- VPXOR Y10, Y11, Y11
- VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
- VPADDD Y6, Y1, Y1
- VPXOR Y1, Y5, Y5
- VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
- VPADDD Y8, Y2, Y2
- VPXOR Y2, Y7, Y7
- VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y11, Y14, Y14
- VPXOR Y14, Y4, Y4
- VPADDD Y5, Y15, Y15
- VPXOR Y15, Y6, Y6
- VPADDD Y7, Y12, Y12
- VPXOR Y12, Y8, Y8
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y0, Y0
- VMOVDQA Y10, (SI)
- VPSRLD $0x07, Y4, Y10
- VPSLLD $0x19, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x07, Y6, Y10
- VPSLLD $0x19, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x07, Y8, Y10
- VPSLLD $0x19, Y8, Y8
- VPOR Y10, Y8, Y8
- VPSRLD $0x07, Y0, Y10
- VPSLLD $0x19, Y0, Y0
- VPOR Y10, Y0, Y0
-
- // Round 7
- VMOVDQA (SI), Y10
- VPADDD 96(CX), Y10, Y10
- VPADDD 160(AX), Y1, Y1
- VPADDD 32(AX), Y2, Y2
- VPADDD (CX), Y3, Y3
- VPADDD Y0, Y10, Y10
- VPXOR Y10, Y5, Y5
- VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y7, Y7
- VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y8, Y3, Y3
- VPXOR Y3, Y11, Y11
- VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
- VPADDD Y5, Y12, Y12
- VPXOR Y12, Y0, Y0
- VPADDD Y7, Y13, Y13
- VPXOR Y13, Y4, Y4
- VPADDD Y9, Y14, Y14
- VPXOR Y14, Y6, Y6
- VPADDD Y11, Y15, Y15
- VPXOR Y15, Y8, Y8
- VMOVDQA Y10, (SI)
- VPSRLD $0x0c, Y0, Y10
- VPSLLD $0x14, Y0, Y0
- VPOR Y10, Y0, Y0
- VPSRLD $0x0c, Y4, Y10
- VPSLLD $0x14, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x0c, Y6, Y10
- VPSLLD $0x14, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x0c, Y8, Y10
- VPSLLD $0x14, Y8, Y8
- VPOR Y10, Y8, Y8
- VMOVDQA (SI), Y10
- VPADDD 224(CX), Y10, Y10
- VPADDD (AX), Y1, Y1
- VPADDD 32(CX), Y2, Y2
- VPADDD 192(AX), Y3, Y3
- VPADDD Y0, Y10, Y10
- VPXOR Y10, Y5, Y5
- VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
- VPADDD Y4, Y1, Y1
- VPXOR Y1, Y7, Y7
- VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
- VPADDD Y6, Y2, Y2
- VPXOR Y2, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y8, Y3, Y3
- VPXOR Y3, Y11, Y11
- VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
- VPADDD Y5, Y12, Y12
- VPXOR Y12, Y0, Y0
- VPADDD Y7, Y13, Y13
- VPXOR Y13, Y4, Y4
- VPADDD Y9, Y14, Y14
- VPXOR Y14, Y6, Y6
- VPADDD Y11, Y15, Y15
- VPXOR Y15, Y8, Y8
- VMOVDQA Y10, (SI)
- VPSRLD $0x07, Y0, Y10
- VPSLLD $0x19, Y0, Y0
- VPOR Y10, Y0, Y0
- VPSRLD $0x07, Y4, Y10
- VPSLLD $0x19, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x07, Y6, Y10
- VPSLLD $0x19, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x07, Y8, Y10
- VPSLLD $0x19, Y8, Y8
- VPOR Y10, Y8, Y8
- VMOVDQA (SI), Y10
- VPADDD 192(CX), Y10, Y10
- VPADDD 64(AX), Y1, Y1
- VPADDD 96(AX), Y2, Y2
- VPADDD 224(AX), Y3, Y3
- VPADDD Y4, Y10, Y10
- VPXOR Y10, Y11, Y11
- VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
- VPADDD Y6, Y1, Y1
- VPXOR Y1, Y5, Y5
- VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
- VPADDD Y8, Y2, Y2
- VPXOR Y2, Y7, Y7
- VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
- VPADDD Y11, Y14, Y14
- VPXOR Y14, Y4, Y4
- VPADDD Y5, Y15, Y15
- VPXOR Y15, Y6, Y6
- VPADDD Y7, Y12, Y12
- VPXOR Y12, Y8, Y8
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y0, Y0
- VMOVDQA Y10, (SI)
- VPSRLD $0x0c, Y4, Y10
- VPSLLD $0x14, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x0c, Y6, Y10
- VPSLLD $0x14, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x0c, Y8, Y10
- VPSLLD $0x14, Y8, Y8
- VPOR Y10, Y8, Y8
- VPSRLD $0x0c, Y0, Y10
- VPSLLD $0x14, Y0, Y0
- VPOR Y10, Y0, Y0
- VMOVDQA (SI), Y10
- VPADDD 64(CX), Y10, Y10
- VPADDD 128(CX), Y1, Y1
- VPADDD 128(AX), Y2, Y2
- VPADDD 160(CX), Y3, Y3
- VPADDD Y4, Y10, Y10
- VPXOR Y10, Y11, Y11
- VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
- VPADDD Y6, Y1, Y1
- VPXOR Y1, Y5, Y5
- VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
- VPADDD Y8, Y2, Y2
- VPXOR Y2, Y7, Y7
- VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
- VPADDD Y0, Y3, Y3
- VPXOR Y3, Y9, Y9
- VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
- VPADDD Y11, Y14, Y14
- VPXOR Y14, Y4, Y4
- VPADDD Y5, Y15, Y15
- VPXOR Y15, Y6, Y6
- VPADDD Y7, Y12, Y12
- VPXOR Y12, Y8, Y8
- VPADDD Y9, Y13, Y13
- VPXOR Y13, Y0, Y0
- VMOVDQA Y10, (SI)
- VPSRLD $0x07, Y4, Y10
- VPSLLD $0x19, Y4, Y4
- VPOR Y10, Y4, Y4
- VPSRLD $0x07, Y6, Y10
- VPSLLD $0x19, Y6, Y6
- VPOR Y10, Y6, Y6
- VPSRLD $0x07, Y8, Y10
- VPSLLD $0x19, Y8, Y8
- VPOR Y10, Y8, Y8
- VPSRLD $0x07, Y0, Y10
- VPSLLD $0x19, Y0, Y0
- VPOR Y10, Y0, Y0
-
- // Finalize
- VPXOR (SI), Y12, Y10
- VPXOR Y13, Y1, Y1
- VPXOR Y14, Y2, Y2
- VPXOR Y15, Y3, Y3
- VPXOR Y5, Y0, Y0
- VPXOR Y7, Y4, Y4
- VPXOR Y9, Y6, Y5
- VPXOR Y11, Y8, Y6
-
- // Store result into out
- VMOVDQU Y10, (BP)
- VMOVDQU Y1, 32(BP)
- VMOVDQU Y2, 64(BP)
- VMOVDQU Y3, 96(BP)
- VMOVDQU Y0, 128(BP)
- VMOVDQU Y4, 160(BP)
- VMOVDQU Y5, 192(BP)
- VMOVDQU Y6, 224(BP)
- VZEROUPPER
- RET
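(Note on the "Finalize" block above: it is the output half of the BLAKE3 compression, XORing the first eight state words with the last eight so that only the eight-word chaining value of each parent remains. Because the state is kept transposed, each YMM register holding one state word for 8 independent parents, the result is written as eight 32-byte rows into out. A scalar sketch of that reduction for a single lane:

    // finalizeCV reduces a 16-word BLAKE3 compression state to its 8-word
    // chaining value, mirroring the VPXOR sequence in the Finalize block
    // (the assembly does the same thing for 8 lanes at once).
    func finalizeCV(v *[16]uint32) (cv [8]uint32) {
    	for i := 0; i < 8; i++ {
    		cv[i] = v[i] ^ v[i+8]
    	}
    	return cv
    }
)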
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_other.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_other.go
deleted file mode 100644
index 613972814..000000000
--- a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_other.go
+++ /dev/null
@@ -1,13 +0,0 @@
-// +build !amd64
-
-package hash_avx2
-
-import "github.com/zeebo/blake3/internal/alg/hash/hash_pure"
-
-func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) {
- hash_pure.HashF(input, length, counter, flags, key, out, chain)
-}
-
-func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int) {
- hash_pure.HashP(left, right, flags, key, out, n)
-}
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/stubs.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/stubs.go
deleted file mode 100644
index 10e949550..000000000
--- a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/stubs.go
+++ /dev/null
@@ -1,9 +0,0 @@
-// +build amd64
-
-package hash_avx2
-
-//go:noescape
-func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32)
-
-//go:noescape
-func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int)
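(Note: impl_other.go and stubs.go form a build-constraint dispatch. On non-amd64 targets the exported functions forward to hash_pure, while on amd64 the same signatures are declared without bodies and satisfied by the assembly above, with //go:noescape asserting that the pointer arguments do not escape through the assembly. The fallback half of that pattern in the newer //go:build syntax would look roughly like the sketch below; this is illustrative, not a file from the repository:

    // Illustrative sketch of the non-amd64 fallback using //go:build;
    // the vendored file above uses the older "// +build" form.

    //go:build !amd64

    package hash_avx2

    import "github.com/zeebo/blake3/internal/alg/hash/hash_pure"

    // HashF forwards to the portable implementation when the AVX2
    // assembly is not compiled in.
    func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) {
    	hash_pure.HashF(input, length, counter, flags, key, out, chain)
    }
)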
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashf.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashf.go
deleted file mode 100644
index 0c6fd63cd..000000000
--- a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashf.go
+++ /dev/null
@@ -1,56 +0,0 @@
-package hash_pure
-
-import (
- "unsafe"
-
- "github.com/zeebo/blake3/internal/alg/compress"
- "github.com/zeebo/blake3/internal/consts"
- "github.com/zeebo/blake3/internal/utils"
-)
-
-func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) {
- var tmp [16]uint32
-
- for i := uint64(0); consts.ChunkLen*i < length && i < 8; i++ {
- bchain := *key
- bflags := flags | consts.Flag_ChunkStart
- start := consts.ChunkLen * i
-
- for n := uint64(0); n < 16; n++ {
- if n == 15 {
- bflags |= consts.Flag_ChunkEnd
- }
- if start+64*n >= length {
- break
- }
- if start+64+64*n >= length {
- *chain = bchain
- }
-
- var blockPtr *[16]uint32
- if consts.IsLittleEndian {
- blockPtr = (*[16]uint32)(unsafe.Pointer(&input[consts.ChunkLen*i+consts.BlockLen*n]))
- } else {
- var block [16]uint32
- utils.BytesToWords((*[64]uint8)(unsafe.Pointer(&input[consts.ChunkLen*i+consts.BlockLen*n])), &block)
- blockPtr = &block
- }
-
- compress.Compress(&bchain, blockPtr, counter, consts.BlockLen, bflags, &tmp)
-
- bchain = *(*[8]uint32)(unsafe.Pointer(&tmp[0]))
- bflags = flags
- }
-
- out[i+0] = bchain[0]
- out[i+8] = bchain[1]
- out[i+16] = bchain[2]
- out[i+24] = bchain[3]
- out[i+32] = bchain[4]
- out[i+40] = bchain[5]
- out[i+48] = bchain[6]
- out[i+56] = bchain[7]
-
- counter++
- }
-}
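(Note on the deleted HashF above: it hashes up to eight whole 1024-byte chunks of the 8192-byte buffer, running each chunk's compression chain with ChunkStart on the first block and ChunkEnd on the sixteenth, and it appears to record in *chain the chaining value entering the last block that starts within length so the caller can finish a trailing partial chunk. Each chunk's 8-word chaining value is written transposed, word j of chunk i at out[i+8*j], which is the column layout the AVX2 HashP kernel consumes. Reading one chaining value back out, as a small hedged helper (the name is illustrative):

    // chainValue extracts the 8-word chaining value of chunk i from the
    // transposed output layout used by HashF, where word j of chunk i is
    // stored at out[i+8*j].
    func chainValue(out *[64]uint32, i int) (cv [8]uint32) {
    	for j := 0; j < 8; j++ {
    		cv[j] = out[i+8*j]
    	}
    	return cv
    }
)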
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashp.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashp.go
deleted file mode 100644
index bee5d8dd0..000000000
--- a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashp.go
+++ /dev/null
@@ -1,38 +0,0 @@
-package hash_pure
-
-import "github.com/zeebo/blake3/internal/alg/compress"
-
-func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int) {
- var tmp [16]uint32
- var block [16]uint32
-
- for i := 0; i < n && i < 8; i++ {
- block[0] = left[i+0]
- block[1] = left[i+8]
- block[2] = left[i+16]
- block[3] = left[i+24]
- block[4] = left[i+32]
- block[5] = left[i+40]
- block[6] = left[i+48]
- block[7] = left[i+56]
- block[8] = right[i+0]
- block[9] = right[i+8]
- block[10] = right[i+16]
- block[11] = right[i+24]
- block[12] = right[i+32]
- block[13] = right[i+40]
- block[14] = right[i+48]
- block[15] = right[i+56]
-
- compress.Compress(key, &block, 0, 64, flags, &tmp)
-
- out[i+0] = tmp[0]
- out[i+8] = tmp[1]
- out[i+16] = tmp[2]
- out[i+24] = tmp[3]
- out[i+32] = tmp[4]
- out[i+40] = tmp[5]
- out[i+48] = tmp[6]
- out[i+56] = tmp[7]
- }
-}
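(Note on the deleted HashP above: it is the parent-node step of the BLAKE3 tree. For each of up to eight parents it concatenates a left and a right child chaining value into a 16-word block and compresses it with the key as the input chain, counter 0 and block length 64, keeping the first eight output words. The buffers use the same transposed lane layout as HashF, so wiring up a single pair looks roughly like the sketch below; the helper name is illustrative, and it would have to be compiled inside the blake3 module since hash_pure is an internal package:

    package blake3 // sketch only; hash_pure is internal to the module

    import "github.com/zeebo/blake3/internal/alg/hash/hash_pure"

    // parentOf shows how one left/right pair of chaining values maps onto
    // the transposed [64]uint32 buffers consumed by HashP, using lane 0
    // and n=1: word j of lane i lives at index i+8*j.
    func parentOf(l, r [8]uint32, flags uint32, key *[8]uint32) [8]uint32 {
    	var left, right, out [64]uint32
    	for j := 0; j < 8; j++ {
    		left[8*j] = l[j]
    		right[8*j] = r[j]
    	}
    	hash_pure.HashP(&left, &right, flags, key, &out, 1)

    	var cv [8]uint32
    	for j := 0; j < 8; j++ {
    		cv[j] = out[8*j]
    	}
    	return cv
    }
)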