author    tsmethurst <tobi.smethurst@protonmail.com>  2022-01-16 18:52:30 +0100
committer tsmethurst <tobi.smethurst@protonmail.com>  2022-01-16 18:52:30 +0100
commit    6f5ccf435585e43a00e3cc50f4bcefac36ada818 (patch)
tree      ba368d27464b79b1e5d010c0662fd3e340bf108e /vendor/github.com/zeebo/blake3/internal/alg/hash
parent    add go-runners to readme (diff)
download  gotosocial-6f5ccf435585e43a00e3cc50f4bcefac36ada818.tar.xz
update dependencies
Diffstat (limited to 'vendor/github.com/zeebo/blake3/internal/alg/hash')
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/hash/hash.go                  |   23
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_amd64.s   | 2561
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_other.go  |   13
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/stubs.go       |    9
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashf.go       |   56
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashp.go       |   38
6 files changed, 2700 insertions, 0 deletions
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash.go
new file mode 100644
index 000000000..ac43abb69
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash.go
@@ -0,0 +1,23 @@
+package hash
+
+import (
+ "github.com/zeebo/blake3/internal/alg/hash/hash_avx2"
+ "github.com/zeebo/blake3/internal/alg/hash/hash_pure"
+ "github.com/zeebo/blake3/internal/consts"
+)
+
+func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) {
+ if consts.HasAVX2 && length > 2*consts.ChunkLen {
+ hash_avx2.HashF(input, length, counter, flags, key, out, chain)
+ } else {
+ hash_pure.HashF(input, length, counter, flags, key, out, chain)
+ }
+}
+
+func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int) {
+ if consts.HasAVX2 && n >= 2 {
+ hash_avx2.HashP(left, right, flags, key, out, n)
+ } else {
+ hash_pure.HashP(left, right, flags, key, out, n)
+ }
+}
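
Note: the sketch below is not part of the commit; it only illustrates the runtime CPU-feature dispatch pattern that hash.go above uses, where consts.HasAVX2 gates the assembly path and a size threshold keeps small inputs on the portable code. The names Sum, sumAVX2 and sumGeneric are hypothetical stand-ins for the real HashF/HashP pair, and golang.org/x/sys/cpu is used here in place of the package's own consts check.

// Illustrative sketch only, assuming golang.org/x/sys/cpu for feature detection.
package dispatch

import "golang.org/x/sys/cpu"

// sumGeneric is the portable fallback, always available.
func sumGeneric(xs []uint32) uint32 {
	var s uint32
	for _, x := range xs {
		s += x
	}
	return s
}

// sumAVX2 stands in for a hand-written assembly routine; here it simply
// reuses the generic code so the sketch compiles and runs everywhere.
func sumAVX2(xs []uint32) uint32 { return sumGeneric(xs) }

// Sum takes the vectorised path only when the CPU supports AVX2 and the
// input is large enough for the setup cost to pay off, mirroring the
// `consts.HasAVX2 && length > 2*consts.ChunkLen` check in HashF above.
func Sum(xs []uint32) uint32 {
	if cpu.X86.HasAVX2 && len(xs) >= 64 {
		return sumAVX2(xs)
	}
	return sumGeneric(xs)
}
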
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_amd64.s b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_amd64.s
new file mode 100644
index 000000000..d7531664b
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_amd64.s
@@ -0,0 +1,2561 @@
+// Code generated by command: go run main.go. DO NOT EDIT.
+
+#include "textflag.h"
+
+DATA iv<>+0(SB)/4, $0x6a09e667
+DATA iv<>+4(SB)/4, $0xbb67ae85
+DATA iv<>+8(SB)/4, $0x3c6ef372
+DATA iv<>+12(SB)/4, $0xa54ff53a
+DATA iv<>+16(SB)/4, $0x510e527f
+DATA iv<>+20(SB)/4, $0x9b05688c
+DATA iv<>+24(SB)/4, $0x1f83d9ab
+DATA iv<>+28(SB)/4, $0x5be0cd19
+GLOBL iv<>(SB), RODATA|NOPTR, $32
+
+DATA rot16_shuf<>+0(SB)/1, $0x02
+DATA rot16_shuf<>+1(SB)/1, $0x03
+DATA rot16_shuf<>+2(SB)/1, $0x00
+DATA rot16_shuf<>+3(SB)/1, $0x01
+DATA rot16_shuf<>+4(SB)/1, $0x06
+DATA rot16_shuf<>+5(SB)/1, $0x07
+DATA rot16_shuf<>+6(SB)/1, $0x04
+DATA rot16_shuf<>+7(SB)/1, $0x05
+DATA rot16_shuf<>+8(SB)/1, $0x0a
+DATA rot16_shuf<>+9(SB)/1, $0x0b
+DATA rot16_shuf<>+10(SB)/1, $0x08
+DATA rot16_shuf<>+11(SB)/1, $0x09
+DATA rot16_shuf<>+12(SB)/1, $0x0e
+DATA rot16_shuf<>+13(SB)/1, $0x0f
+DATA rot16_shuf<>+14(SB)/1, $0x0c
+DATA rot16_shuf<>+15(SB)/1, $0x0d
+DATA rot16_shuf<>+16(SB)/1, $0x12
+DATA rot16_shuf<>+17(SB)/1, $0x13
+DATA rot16_shuf<>+18(SB)/1, $0x10
+DATA rot16_shuf<>+19(SB)/1, $0x11
+DATA rot16_shuf<>+20(SB)/1, $0x16
+DATA rot16_shuf<>+21(SB)/1, $0x17
+DATA rot16_shuf<>+22(SB)/1, $0x14
+DATA rot16_shuf<>+23(SB)/1, $0x15
+DATA rot16_shuf<>+24(SB)/1, $0x1a
+DATA rot16_shuf<>+25(SB)/1, $0x1b
+DATA rot16_shuf<>+26(SB)/1, $0x18
+DATA rot16_shuf<>+27(SB)/1, $0x19
+DATA rot16_shuf<>+28(SB)/1, $0x1e
+DATA rot16_shuf<>+29(SB)/1, $0x1f
+DATA rot16_shuf<>+30(SB)/1, $0x1c
+DATA rot16_shuf<>+31(SB)/1, $0x1d
+GLOBL rot16_shuf<>(SB), RODATA|NOPTR, $32
+
+DATA rot8_shuf<>+0(SB)/1, $0x01
+DATA rot8_shuf<>+1(SB)/1, $0x02
+DATA rot8_shuf<>+2(SB)/1, $0x03
+DATA rot8_shuf<>+3(SB)/1, $0x00
+DATA rot8_shuf<>+4(SB)/1, $0x05
+DATA rot8_shuf<>+5(SB)/1, $0x06
+DATA rot8_shuf<>+6(SB)/1, $0x07
+DATA rot8_shuf<>+7(SB)/1, $0x04
+DATA rot8_shuf<>+8(SB)/1, $0x09
+DATA rot8_shuf<>+9(SB)/1, $0x0a
+DATA rot8_shuf<>+10(SB)/1, $0x0b
+DATA rot8_shuf<>+11(SB)/1, $0x08
+DATA rot8_shuf<>+12(SB)/1, $0x0d
+DATA rot8_shuf<>+13(SB)/1, $0x0e
+DATA rot8_shuf<>+14(SB)/1, $0x0f
+DATA rot8_shuf<>+15(SB)/1, $0x0c
+DATA rot8_shuf<>+16(SB)/1, $0x11
+DATA rot8_shuf<>+17(SB)/1, $0x12
+DATA rot8_shuf<>+18(SB)/1, $0x13
+DATA rot8_shuf<>+19(SB)/1, $0x10
+DATA rot8_shuf<>+20(SB)/1, $0x15
+DATA rot8_shuf<>+21(SB)/1, $0x16
+DATA rot8_shuf<>+22(SB)/1, $0x17
+DATA rot8_shuf<>+23(SB)/1, $0x14
+DATA rot8_shuf<>+24(SB)/1, $0x19
+DATA rot8_shuf<>+25(SB)/1, $0x1a
+DATA rot8_shuf<>+26(SB)/1, $0x1b
+DATA rot8_shuf<>+27(SB)/1, $0x18
+DATA rot8_shuf<>+28(SB)/1, $0x1d
+DATA rot8_shuf<>+29(SB)/1, $0x1e
+DATA rot8_shuf<>+30(SB)/1, $0x1f
+DATA rot8_shuf<>+31(SB)/1, $0x1c
+GLOBL rot8_shuf<>(SB), RODATA|NOPTR, $32
+
+DATA block_len<>+0(SB)/4, $0x00000040
+DATA block_len<>+4(SB)/4, $0x00000040
+DATA block_len<>+8(SB)/4, $0x00000040
+DATA block_len<>+12(SB)/4, $0x00000040
+DATA block_len<>+16(SB)/4, $0x00000040
+DATA block_len<>+20(SB)/4, $0x00000040
+DATA block_len<>+24(SB)/4, $0x00000040
+DATA block_len<>+28(SB)/4, $0x00000040
+GLOBL block_len<>(SB), RODATA|NOPTR, $32
+
+DATA zero<>+0(SB)/4, $0x00000000
+DATA zero<>+4(SB)/4, $0x00000000
+DATA zero<>+8(SB)/4, $0x00000000
+DATA zero<>+12(SB)/4, $0x00000000
+DATA zero<>+16(SB)/4, $0x00000000
+DATA zero<>+20(SB)/4, $0x00000000
+DATA zero<>+24(SB)/4, $0x00000000
+DATA zero<>+28(SB)/4, $0x00000000
+GLOBL zero<>(SB), RODATA|NOPTR, $32
+
+DATA counter<>+0(SB)/8, $0x0000000000000000
+DATA counter<>+8(SB)/8, $0x0000000000000001
+DATA counter<>+16(SB)/8, $0x0000000000000002
+DATA counter<>+24(SB)/8, $0x0000000000000003
+DATA counter<>+32(SB)/8, $0x0000000000000004
+DATA counter<>+40(SB)/8, $0x0000000000000005
+DATA counter<>+48(SB)/8, $0x0000000000000006
+DATA counter<>+56(SB)/8, $0x0000000000000007
+GLOBL counter<>(SB), RODATA|NOPTR, $64
+
+// func HashF(input *[8192]byte, length uint64, counter uint64, flags uint32, key *[8]uint32, out *[32]uint32, chain *[8]uint32)
+// Requires: AVX, AVX2
+TEXT ·HashF(SB), $688-56
+ MOVQ input+0(FP), AX
+ MOVQ length+8(FP), CX
+ MOVQ counter+16(FP), DX
+ MOVL flags+24(FP), BX
+ MOVQ key+32(FP), BP
+ MOVQ out+40(FP), SI
+ MOVQ chain+48(FP), DI
+
+ // Allocate local space and align it
+ LEAQ 31(SP), R10
+ MOVQ $0x000000000000001f, R8
+ NOTQ R8
+ ANDQ R8, R10
+
+ // Skip if the length is zero
+ XORQ R8, R8
+ XORQ R9, R9
+ TESTQ CX, CX
+ JZ skip_compute
+
+ // Compute complete chunks and blocks
+ SUBQ $0x01, CX
+ MOVQ CX, R8
+ SHRQ $0x0a, R8
+ MOVQ CX, R9
+ ANDQ $0x000003c0, R9
+
+skip_compute:
+ // Load some params onto the stack (avo improvement?)
+ MOVL BX, 64(SP)
+ MOVQ DX, 72(SP)
+
+ // Load IV into vectors
+ VPBROADCASTD (BP), Y0
+ VPBROADCASTD 4(BP), Y1
+ VPBROADCASTD 8(BP), Y2
+ VPBROADCASTD 12(BP), Y3
+ VPBROADCASTD 16(BP), Y4
+ VPBROADCASTD 20(BP), Y5
+ VPBROADCASTD 24(BP), Y6
+ VPBROADCASTD 28(BP), Y7
+
+ // Build and store counter data on the stack
+ VPBROADCASTQ 72(SP), Y8
+ VPADDQ counter<>+0(SB), Y8, Y8
+ VPBROADCASTQ 72(SP), Y9
+ VPADDQ counter<>+32(SB), Y9, Y9
+ VPUNPCKLDQ Y9, Y8, Y10
+ VPUNPCKHDQ Y9, Y8, Y8
+ VPUNPCKLDQ Y8, Y10, Y9
+ VPUNPCKHDQ Y8, Y10, Y8
+ VPERMQ $0xd8, Y9, Y9
+ VPERMQ $0xd8, Y8, Y8
+ VMOVDQU Y9, 112(SP)
+ VMOVDQU Y8, 144(SP)
+
+ // Set up block flags and variables for iteration
+ XORQ CX, CX
+ ORL $0x01, 64(SP)
+
+loop:
+ // Include end flags if last block
+ CMPQ CX, $0x000003c0
+ JNE round_setup
+ ORL $0x02, 64(SP)
+
+round_setup:
+ // Load and transpose message vectors
+ VMOVDQU (AX)(CX*1), Y8
+ VMOVDQU 1024(AX)(CX*1), Y9
+ VMOVDQU 2048(AX)(CX*1), Y10
+ VMOVDQU 3072(AX)(CX*1), Y11
+ VMOVDQU 4096(AX)(CX*1), Y12
+ VMOVDQU 5120(AX)(CX*1), Y13
+ VMOVDQU 6144(AX)(CX*1), Y14
+ VMOVDQU 7168(AX)(CX*1), Y15
+ VMOVDQA Y0, (R10)
+ VPUNPCKLDQ Y9, Y8, Y0
+ VPUNPCKHDQ Y9, Y8, Y8
+ VPUNPCKLDQ Y11, Y10, Y9
+ VPUNPCKHDQ Y11, Y10, Y10
+ VPUNPCKLDQ Y13, Y12, Y11
+ VPUNPCKHDQ Y13, Y12, Y12
+ VPUNPCKLDQ Y15, Y14, Y13
+ VPUNPCKHDQ Y15, Y14, Y14
+ VPUNPCKLQDQ Y9, Y0, Y15
+ VPUNPCKHQDQ Y9, Y0, Y0
+ VPUNPCKLQDQ Y10, Y8, Y9
+ VPUNPCKHQDQ Y10, Y8, Y8
+ VPUNPCKLQDQ Y13, Y11, Y10
+ VPUNPCKHQDQ Y13, Y11, Y11
+ VPUNPCKLQDQ Y14, Y12, Y13
+ VPUNPCKHQDQ Y14, Y12, Y12
+ VINSERTI128 $0x01, X10, Y15, Y14
+ VPERM2I128 $0x31, Y10, Y15, Y10
+ VINSERTI128 $0x01, X11, Y0, Y15
+ VPERM2I128 $0x31, Y11, Y0, Y0
+ VINSERTI128 $0x01, X13, Y9, Y11
+ VPERM2I128 $0x31, Y13, Y9, Y9
+ VINSERTI128 $0x01, X12, Y8, Y13
+ VPERM2I128 $0x31, Y12, Y8, Y8
+ VMOVDQU Y14, 176(SP)
+ VMOVDQU Y15, 208(SP)
+ VMOVDQU Y11, 240(SP)
+ VMOVDQU Y13, 272(SP)
+ VMOVDQU Y10, 304(SP)
+ VMOVDQU Y0, 336(SP)
+ VMOVDQU Y9, 368(SP)
+ VMOVDQU Y8, 400(SP)
+ VMOVDQU 32(AX)(CX*1), Y0
+ VMOVDQU 1056(AX)(CX*1), Y8
+ VMOVDQU 2080(AX)(CX*1), Y9
+ VMOVDQU 3104(AX)(CX*1), Y10
+ VMOVDQU 4128(AX)(CX*1), Y11
+ VMOVDQU 5152(AX)(CX*1), Y12
+ VMOVDQU 6176(AX)(CX*1), Y13
+ VMOVDQU 7200(AX)(CX*1), Y14
+ VPUNPCKLDQ Y8, Y0, Y15
+ VPUNPCKHDQ Y8, Y0, Y0
+ VPUNPCKLDQ Y10, Y9, Y8
+ VPUNPCKHDQ Y10, Y9, Y9
+ VPUNPCKLDQ Y12, Y11, Y10
+ VPUNPCKHDQ Y12, Y11, Y11
+ VPUNPCKLDQ Y14, Y13, Y12
+ VPUNPCKHDQ Y14, Y13, Y13
+ VPUNPCKLQDQ Y8, Y15, Y14
+ VPUNPCKHQDQ Y8, Y15, Y8
+ VPUNPCKLQDQ Y9, Y0, Y15
+ VPUNPCKHQDQ Y9, Y0, Y0
+ VPUNPCKLQDQ Y12, Y10, Y9
+ VPUNPCKHQDQ Y12, Y10, Y10
+ VPUNPCKLQDQ Y13, Y11, Y12
+ VPUNPCKHQDQ Y13, Y11, Y11
+ VINSERTI128 $0x01, X9, Y14, Y13
+ VPERM2I128 $0x31, Y9, Y14, Y9
+ VINSERTI128 $0x01, X10, Y8, Y14
+ VPERM2I128 $0x31, Y10, Y8, Y8
+ VINSERTI128 $0x01, X12, Y15, Y10
+ VPERM2I128 $0x31, Y12, Y15, Y12
+ VINSERTI128 $0x01, X11, Y0, Y15
+ VPERM2I128 $0x31, Y11, Y0, Y0
+ VMOVDQU Y13, 432(SP)
+ VMOVDQU Y14, 464(SP)
+ VMOVDQU Y10, 496(SP)
+ VMOVDQU Y15, 528(SP)
+ VMOVDQU Y9, 560(SP)
+ VMOVDQU Y8, 592(SP)
+ VMOVDQU Y12, 624(SP)
+ VMOVDQU Y0, 656(SP)
+
+ // Load constants for the round
+ VMOVDQA (R10), Y0
+ VMOVDQU block_len<>+0(SB), Y8
+ VPBROADCASTD 64(SP), Y9
+ VPBROADCASTD iv<>+0(SB), Y10
+ VPBROADCASTD iv<>+4(SB), Y11
+ VPBROADCASTD iv<>+8(SB), Y12
+ VPBROADCASTD iv<>+12(SB), Y13
+ VMOVDQU 112(SP), Y14
+ VMOVDQU 144(SP), Y15
+
+ // Save state for partial chunk if necessary
+ CMPQ CX, R9
+ JNE begin_rounds
+ VMOVDQU Y0, 80(SP)
+ MOVL 80(SP)(R8*4), DX
+ MOVL DX, (DI)
+ VMOVDQU Y1, 80(SP)
+ MOVL 80(SP)(R8*4), DX
+ MOVL DX, 4(DI)
+ VMOVDQU Y2, 80(SP)
+ MOVL 80(SP)(R8*4), DX
+ MOVL DX, 8(DI)
+ VMOVDQU Y3, 80(SP)
+ MOVL 80(SP)(R8*4), DX
+ MOVL DX, 12(DI)
+ VMOVDQU Y4, 80(SP)
+ MOVL 80(SP)(R8*4), DX
+ MOVL DX, 16(DI)
+ VMOVDQU Y5, 80(SP)
+ MOVL 80(SP)(R8*4), DX
+ MOVL DX, 20(DI)
+ VMOVDQU Y6, 80(SP)
+ MOVL 80(SP)(R8*4), DX
+ MOVL DX, 24(DI)
+ VMOVDQU Y7, 80(SP)
+ MOVL 80(SP)(R8*4), DX
+ MOVL DX, 28(DI)
+
+begin_rounds:
+ // Perform the rounds
+ // Round 1
+ VPADDD 176(SP), Y0, Y0
+ VPADDD 240(SP), Y1, Y1
+ VPADDD 304(SP), Y2, Y2
+ VPADDD 368(SP), Y3, Y3
+ VPADDD Y4, Y0, Y0
+ VPXOR Y0, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y7, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y4, Y4
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y5, Y5
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y6, Y6
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y7, Y7
+ VMOVDQA Y0, (R10)
+ VPSRLD $0x0c, Y4, Y0
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y0, Y4, Y0
+ VPSRLD $0x0c, Y5, Y4
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y4, Y5, Y4
+ VPSRLD $0x0c, Y6, Y5
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y5, Y6, Y5
+ VPSRLD $0x0c, Y7, Y6
+ VPSLLD $0x14, Y7, Y7
+ VPOR Y6, Y7, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 208(SP), Y7, Y7
+ VPADDD 272(SP), Y1, Y1
+ VPADDD 336(SP), Y2, Y2
+ VPADDD 400(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 432(SP), Y7, Y7
+ VPADDD 496(SP), Y1, Y1
+ VPADDD 560(SP), Y2, Y2
+ VPADDD 624(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VMOVDQA (R10), Y7
+ VPADDD 464(SP), Y7, Y7
+ VPADDD 528(SP), Y1, Y1
+ VPADDD 592(SP), Y2, Y2
+ VPADDD 656(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+
+ // Round 2
+ VMOVDQA (R10), Y7
+ VPADDD 240(SP), Y7, Y7
+ VPADDD 272(SP), Y1, Y1
+ VPADDD 400(SP), Y2, Y2
+ VPADDD 304(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 368(SP), Y7, Y7
+ VPADDD 496(SP), Y1, Y1
+ VPADDD 176(SP), Y2, Y2
+ VPADDD 592(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 208(SP), Y7, Y7
+ VPADDD 560(SP), Y1, Y1
+ VPADDD 464(SP), Y2, Y2
+ VPADDD 656(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VMOVDQA (R10), Y7
+ VPADDD 528(SP), Y7, Y7
+ VPADDD 336(SP), Y1, Y1
+ VPADDD 624(SP), Y2, Y2
+ VPADDD 432(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+
+ // Round 3
+ VMOVDQA (R10), Y7
+ VPADDD 272(SP), Y7, Y7
+ VPADDD 496(SP), Y1, Y1
+ VPADDD 592(SP), Y2, Y2
+ VPADDD 400(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 304(SP), Y7, Y7
+ VPADDD 560(SP), Y1, Y1
+ VPADDD 240(SP), Y2, Y2
+ VPADDD 624(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 368(SP), Y7, Y7
+ VPADDD 464(SP), Y1, Y1
+ VPADDD 528(SP), Y2, Y2
+ VPADDD 432(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VMOVDQA (R10), Y7
+ VPADDD 336(SP), Y7, Y7
+ VPADDD 176(SP), Y1, Y1
+ VPADDD 656(SP), Y2, Y2
+ VPADDD 208(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+
+ // Round 4
+ VMOVDQA (R10), Y7
+ VPADDD 496(SP), Y7, Y7
+ VPADDD 560(SP), Y1, Y1
+ VPADDD 624(SP), Y2, Y2
+ VPADDD 592(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 400(SP), Y7, Y7
+ VPADDD 464(SP), Y1, Y1
+ VPADDD 272(SP), Y2, Y2
+ VPADDD 656(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 304(SP), Y7, Y7
+ VPADDD 528(SP), Y1, Y1
+ VPADDD 336(SP), Y2, Y2
+ VPADDD 208(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VMOVDQA (R10), Y7
+ VPADDD 176(SP), Y7, Y7
+ VPADDD 240(SP), Y1, Y1
+ VPADDD 432(SP), Y2, Y2
+ VPADDD 368(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+
+ // Round 5
+ VMOVDQA (R10), Y7
+ VPADDD 560(SP), Y7, Y7
+ VPADDD 464(SP), Y1, Y1
+ VPADDD 656(SP), Y2, Y2
+ VPADDD 624(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 592(SP), Y7, Y7
+ VPADDD 528(SP), Y1, Y1
+ VPADDD 496(SP), Y2, Y2
+ VPADDD 432(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 400(SP), Y7, Y7
+ VPADDD 336(SP), Y1, Y1
+ VPADDD 176(SP), Y2, Y2
+ VPADDD 368(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VMOVDQA (R10), Y7
+ VPADDD 240(SP), Y7, Y7
+ VPADDD 272(SP), Y1, Y1
+ VPADDD 208(SP), Y2, Y2
+ VPADDD 304(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+
+ // Round 6
+ VMOVDQA (R10), Y7
+ VPADDD 464(SP), Y7, Y7
+ VPADDD 528(SP), Y1, Y1
+ VPADDD 432(SP), Y2, Y2
+ VPADDD 656(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 624(SP), Y7, Y7
+ VPADDD 336(SP), Y1, Y1
+ VPADDD 560(SP), Y2, Y2
+ VPADDD 208(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 592(SP), Y7, Y7
+ VPADDD 176(SP), Y1, Y1
+ VPADDD 240(SP), Y2, Y2
+ VPADDD 304(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VMOVDQA (R10), Y7
+ VPADDD 272(SP), Y7, Y7
+ VPADDD 496(SP), Y1, Y1
+ VPADDD 368(SP), Y2, Y2
+ VPADDD 400(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+
+ // Round 7
+ VMOVDQA (R10), Y7
+ VPADDD 528(SP), Y7, Y7
+ VPADDD 336(SP), Y1, Y1
+ VPADDD 208(SP), Y2, Y2
+ VPADDD 432(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 656(SP), Y7, Y7
+ VPADDD 176(SP), Y1, Y1
+ VPADDD 464(SP), Y2, Y2
+ VPADDD 368(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 624(SP), Y7, Y7
+ VPADDD 240(SP), Y1, Y1
+ VPADDD 272(SP), Y2, Y2
+ VPADDD 400(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VMOVDQA (R10), Y7
+ VPADDD 496(SP), Y7, Y7
+ VPADDD 560(SP), Y1, Y1
+ VPADDD 304(SP), Y2, Y2
+ VPADDD 592(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+
+ // Finalize rounds
+ VPXOR Y9, Y6, Y6
+ VPXOR (R10), Y10, Y7
+ VPXOR Y11, Y1, Y1
+ VPXOR Y12, Y2, Y2
+ VPXOR Y13, Y3, Y3
+ VPXOR Y14, Y0, Y0
+ VPXOR Y15, Y4, Y4
+ VPXOR Y8, Y5, Y5
+
+ // Fix up registers for next iteration
+ VMOVDQU Y7, Y8
+ VMOVDQU Y6, Y7
+ VMOVDQU Y5, Y6
+ VMOVDQU Y4, Y5
+ VMOVDQU Y0, Y4
+ VMOVDQU Y8, Y0
+
+ // If we have zero complete chunks, we're done
+ CMPQ R8, $0x00
+ JNE loop_trailer
+ CMPQ R9, CX
+ JEQ finalize
+
+loop_trailer:
+ // Increment, reset flags, and loop
+ CMPQ CX, $0x000003c0
+ JEQ finalize
+ ADDQ $0x40, CX
+ MOVL BX, 64(SP)
+ JMP loop
+
+finalize:
+ // Store result into out
+ VMOVDQU Y0, (SI)
+ VMOVDQU Y1, 32(SI)
+ VMOVDQU Y2, 64(SI)
+ VMOVDQU Y3, 96(SI)
+ VMOVDQU Y4, 128(SI)
+ VMOVDQU Y5, 160(SI)
+ VMOVDQU Y6, 192(SI)
+ VMOVDQU Y7, 224(SI)
+ VZEROUPPER
+ RET
+
+// func HashP(left *[32]uint32, right *[32]uint32, flags uint8, key *[8]uint32, out *[32]uint32, n int)
+// Requires: AVX, AVX2
+TEXT ·HashP(SB), NOSPLIT, $72-48
+ MOVQ left+0(FP), AX
+ MOVQ right+8(FP), CX
+ MOVBLZX flags+16(FP), DX
+ MOVQ key+24(FP), BX
+ MOVQ out+32(FP), BP
+
+ // Allocate local space and align it
+ LEAQ 31(SP), SI
+ MOVQ $0x000000000000001f, DI
+ NOTQ DI
+ ANDQ DI, SI
+
+ // Set up flags value
+ MOVL DX, 64(SP)
+
+ // Perform the rounds
+ // Round 1
+ VPBROADCASTD (BX), Y0
+ VPADDD (AX), Y0, Y0
+ VPBROADCASTD 4(BX), Y1
+ VPADDD 64(AX), Y1, Y1
+ VPBROADCASTD 8(BX), Y2
+ VPADDD 128(AX), Y2, Y2
+ VPBROADCASTD 12(BX), Y3
+ VPADDD 192(AX), Y3, Y3
+ VPBROADCASTD 16(BX), Y4
+ VPADDD Y4, Y0, Y0
+ VMOVDQU zero<>+0(SB), Y5
+ VPXOR Y0, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPBROADCASTD 20(BX), Y6
+ VPADDD Y6, Y1, Y1
+ VMOVDQU zero<>+0(SB), Y7
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPBROADCASTD 24(BX), Y8
+ VPADDD Y8, Y2, Y2
+ VMOVDQU block_len<>+0(SB), Y9
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPBROADCASTD 28(BX), Y10
+ VPADDD Y10, Y3, Y3
+ VPBROADCASTD 64(SP), Y11
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPBROADCASTD iv<>+0(SB), Y12
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPBROADCASTD iv<>+4(SB), Y13
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VPBROADCASTD iv<>+8(SB), Y14
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y8, Y8
+ VPBROADCASTD iv<>+12(SB), Y15
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y10, Y10
+ VMOVDQA Y0, (SI)
+ VPSRLD $0x0c, Y4, Y0
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y0, Y4, Y0
+ VPSRLD $0x0c, Y6, Y4
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y4, Y6, Y4
+ VPSRLD $0x0c, Y8, Y6
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y6, Y8, Y6
+ VPSRLD $0x0c, Y10, Y8
+ VPSLLD $0x14, Y10, Y10
+ VPOR Y8, Y10, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 32(AX), Y10, Y10
+ VPADDD 96(AX), Y1, Y1
+ VPADDD 160(AX), Y2, Y2
+ VPADDD 224(AX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD (CX), Y10, Y10
+ VPADDD 64(CX), Y1, Y1
+ VPADDD 128(CX), Y2, Y2
+ VPADDD 192(CX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VMOVDQA (SI), Y10
+ VPADDD 32(CX), Y10, Y10
+ VPADDD 96(CX), Y1, Y1
+ VPADDD 160(CX), Y2, Y2
+ VPADDD 224(CX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+
+ // Round 2
+ VMOVDQA (SI), Y10
+ VPADDD 64(AX), Y10, Y10
+ VPADDD 96(AX), Y1, Y1
+ VPADDD 224(AX), Y2, Y2
+ VPADDD 128(AX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 192(AX), Y10, Y10
+ VPADDD 64(CX), Y1, Y1
+ VPADDD (AX), Y2, Y2
+ VPADDD 160(CX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 32(AX), Y10, Y10
+ VPADDD 128(CX), Y1, Y1
+ VPADDD 32(CX), Y2, Y2
+ VPADDD 224(CX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VMOVDQA (SI), Y10
+ VPADDD 96(CX), Y10, Y10
+ VPADDD 160(AX), Y1, Y1
+ VPADDD 192(CX), Y2, Y2
+ VPADDD (CX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+
+ // Round 3
+ VMOVDQA (SI), Y10
+ VPADDD 96(AX), Y10, Y10
+ VPADDD 64(CX), Y1, Y1
+ VPADDD 160(CX), Y2, Y2
+ VPADDD 224(AX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 128(AX), Y10, Y10
+ VPADDD 128(CX), Y1, Y1
+ VPADDD 64(AX), Y2, Y2
+ VPADDD 192(CX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 192(AX), Y10, Y10
+ VPADDD 32(CX), Y1, Y1
+ VPADDD 96(CX), Y2, Y2
+ VPADDD (CX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VMOVDQA (SI), Y10
+ VPADDD 160(AX), Y10, Y10
+ VPADDD (AX), Y1, Y1
+ VPADDD 224(CX), Y2, Y2
+ VPADDD 32(AX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+
+ // Round 4
+ VMOVDQA (SI), Y10
+ VPADDD 64(CX), Y10, Y10
+ VPADDD 128(CX), Y1, Y1
+ VPADDD 192(CX), Y2, Y2
+ VPADDD 160(CX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 224(AX), Y10, Y10
+ VPADDD 32(CX), Y1, Y1
+ VPADDD 96(AX), Y2, Y2
+ VPADDD 224(CX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 128(AX), Y10, Y10
+ VPADDD 96(CX), Y1, Y1
+ VPADDD 160(AX), Y2, Y2
+ VPADDD 32(AX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VMOVDQA (SI), Y10
+ VPADDD (AX), Y10, Y10
+ VPADDD 64(AX), Y1, Y1
+ VPADDD (CX), Y2, Y2
+ VPADDD 192(AX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+
+ // Round 5
+ VMOVDQA (SI), Y10
+ VPADDD 128(CX), Y10, Y10
+ VPADDD 32(CX), Y1, Y1
+ VPADDD 224(CX), Y2, Y2
+ VPADDD 192(CX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 160(CX), Y10, Y10
+ VPADDD 96(CX), Y1, Y1
+ VPADDD 64(CX), Y2, Y2
+ VPADDD (CX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 224(AX), Y10, Y10
+ VPADDD 160(AX), Y1, Y1
+ VPADDD (AX), Y2, Y2
+ VPADDD 192(AX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VMOVDQA (SI), Y10
+ VPADDD 64(AX), Y10, Y10
+ VPADDD 96(AX), Y1, Y1
+ VPADDD 32(AX), Y2, Y2
+ VPADDD 128(AX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+
+ // Round 6
+ VMOVDQA (SI), Y10
+ VPADDD 32(CX), Y10, Y10
+ VPADDD 96(CX), Y1, Y1
+ VPADDD (CX), Y2, Y2
+ VPADDD 224(CX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 192(CX), Y10, Y10
+ VPADDD 160(AX), Y1, Y1
+ VPADDD 128(CX), Y2, Y2
+ VPADDD 32(AX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 160(CX), Y10, Y10
+ VPADDD (AX), Y1, Y1
+ VPADDD 64(AX), Y2, Y2
+ VPADDD 128(AX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VMOVDQA (SI), Y10
+ VPADDD 96(AX), Y10, Y10
+ VPADDD 64(CX), Y1, Y1
+ VPADDD 192(AX), Y2, Y2
+ VPADDD 224(AX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+
+ // Round 7
+ VMOVDQA (SI), Y10
+ VPADDD 96(CX), Y10, Y10
+ VPADDD 160(AX), Y1, Y1
+ VPADDD 32(AX), Y2, Y2
+ VPADDD (CX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 224(CX), Y10, Y10
+ VPADDD (AX), Y1, Y1
+ VPADDD 32(CX), Y2, Y2
+ VPADDD 192(AX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 192(CX), Y10, Y10
+ VPADDD 64(AX), Y1, Y1
+ VPADDD 96(AX), Y2, Y2
+ VPADDD 224(AX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VMOVDQA (SI), Y10
+ VPADDD 64(CX), Y10, Y10
+ VPADDD 128(CX), Y1, Y1
+ VPADDD 128(AX), Y2, Y2
+ VPADDD 160(CX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+
+ // Finalize
+ VPXOR (SI), Y12, Y10
+ VPXOR Y13, Y1, Y1
+ VPXOR Y14, Y2, Y2
+ VPXOR Y15, Y3, Y3
+ VPXOR Y5, Y0, Y0
+ VPXOR Y7, Y4, Y4
+ VPXOR Y9, Y6, Y5
+ VPXOR Y11, Y8, Y6
+
+ // Store result into out
+ VMOVDQU Y10, (BP)
+ VMOVDQU Y1, 32(BP)
+ VMOVDQU Y2, 64(BP)
+ VMOVDQU Y3, 96(BP)
+ VMOVDQU Y0, 128(BP)
+ VMOVDQU Y4, 160(BP)
+ VMOVDQU Y5, 192(BP)
+ VMOVDQU Y6, 224(BP)
+ VZEROUPPER
+ RET
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_other.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_other.go
new file mode 100644
index 000000000..613972814
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_other.go
@@ -0,0 +1,13 @@
+// +build !amd64
+
+package hash_avx2
+
+import "github.com/zeebo/blake3/internal/alg/hash/hash_pure"
+
+func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) {
+ hash_pure.HashF(input, length, counter, flags, key, out, chain)
+}
+
+func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int) {
+ hash_pure.HashP(left, right, flags, key, out, n)
+}
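
Aside (not part of the vendored upstream file): this fallback uses the pre-Go 1.17 build-constraint syntax. On newer toolchains the same constraint is normally written with both forms paired, roughly:

//go:build !amd64
// +build !amd64

Either way, the file only forwards HashF and HashP to the portable hash_pure implementations on non-amd64 platforms.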
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/stubs.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/stubs.go
new file mode 100644
index 000000000..10e949550
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/stubs.go
@@ -0,0 +1,9 @@
+// +build amd64
+
+package hash_avx2
+
+//go:noescape
+func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32)
+
+//go:noescape
+func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int)
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashf.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashf.go
new file mode 100644
index 000000000..0c6fd63cd
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashf.go
@@ -0,0 +1,56 @@
+package hash_pure
+
+import (
+ "unsafe"
+
+ "github.com/zeebo/blake3/internal/alg/compress"
+ "github.com/zeebo/blake3/internal/consts"
+ "github.com/zeebo/blake3/internal/utils"
+)
+
+func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) {
+ var tmp [16]uint32
+
+ for i := uint64(0); consts.ChunkLen*i < length && i < 8; i++ {
+ bchain := *key
+ bflags := flags | consts.Flag_ChunkStart
+ start := consts.ChunkLen * i
+
+ for n := uint64(0); n < 16; n++ {
+ if n == 15 {
+ bflags |= consts.Flag_ChunkEnd
+ }
+ if start+64*n >= length {
+ break
+ }
+ if start+64+64*n >= length {
+ *chain = bchain
+ }
+
+ var blockPtr *[16]uint32
+ if consts.IsLittleEndian {
+ blockPtr = (*[16]uint32)(unsafe.Pointer(&input[consts.ChunkLen*i+consts.BlockLen*n]))
+ } else {
+ var block [16]uint32
+ utils.BytesToWords((*[64]uint8)(unsafe.Pointer(&input[consts.ChunkLen*i+consts.BlockLen*n])), &block)
+ blockPtr = &block
+ }
+
+ compress.Compress(&bchain, blockPtr, counter, consts.BlockLen, bflags, &tmp)
+
+ bchain = *(*[8]uint32)(unsafe.Pointer(&tmp[0]))
+ bflags = flags
+ }
+
+ out[i+0] = bchain[0]
+ out[i+8] = bchain[1]
+ out[i+16] = bchain[2]
+ out[i+24] = bchain[3]
+ out[i+32] = bchain[4]
+ out[i+40] = bchain[5]
+ out[i+48] = bchain[6]
+ out[i+56] = bchain[7]
+
+ counter++
+ }
+}
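
The pure-Go HashF above writes its results in a transposed, column-major layout: word w of chunk i's chaining value lands at out[i+8*w], matching the out[i+0], out[i+8], ..., out[i+56] stores in the loop. The sketch below is purely illustrative (chainOf and its package placement are assumptions, not part of the vendored source) and shows how code inside this module could recover the i-th chaining value from that layout:

package hash_pure // hypothetical placement; chainOf is not in the vendored source

// chainOf recovers the 8-word chaining value of entry i from the transposed
// [64]uint32 layout written by HashF (word w of entry i lives at out[i+8*w]).
func chainOf(out *[64]uint32, i int) (cv [8]uint32) {
	for w := 0; w < 8; w++ {
		cv[w] = out[i+8*w]
	}
	return cv
}

The AVX2 path produces the same layout directly: the eight VMOVDQU stores at the end of the assembly routine each write one whole state word for all eight lanes per 32-byte store.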
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashp.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashp.go
new file mode 100644
index 000000000..bee5d8dd0
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashp.go
@@ -0,0 +1,38 @@
+package hash_pure
+
+import "github.com/zeebo/blake3/internal/alg/compress"
+
+func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int) {
+ var tmp [16]uint32
+ var block [16]uint32
+
+ for i := 0; i < n && i < 8; i++ {
+ block[0] = left[i+0]
+ block[1] = left[i+8]
+ block[2] = left[i+16]
+ block[3] = left[i+24]
+ block[4] = left[i+32]
+ block[5] = left[i+40]
+ block[6] = left[i+48]
+ block[7] = left[i+56]
+ block[8] = right[i+0]
+ block[9] = right[i+8]
+ block[10] = right[i+16]
+ block[11] = right[i+24]
+ block[12] = right[i+32]
+ block[13] = right[i+40]
+ block[14] = right[i+48]
+ block[15] = right[i+56]
+
+ compress.Compress(key, &block, 0, 64, flags, &tmp)
+
+ out[i+0] = tmp[0]
+ out[i+8] = tmp[1]
+ out[i+16] = tmp[2]
+ out[i+24] = tmp[3]
+ out[i+32] = tmp[4]
+ out[i+40] = tmp[5]
+ out[i+48] = tmp[6]
+ out[i+56] = tmp[7]
+ }
+}
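
HashP consumes the same column-major layout on input: word w of entry i is read from left[i+8*w] and right[i+8*w], and the parent chaining value is written back at out[i+8*w]. As a rough usage sketch (transpose8, cvs, and the package placement are illustrative assumptions, not part of the vendored code), packing ordinary [8]uint32 chaining values into that layout looks like:

package hash_pure // hypothetical placement; transpose8 is not in the vendored source

// transpose8 packs up to eight 8-word chaining values into the column-major
// [64]uint32 layout that HashP reads (left/right) and HashF writes (out):
// word w of entry i goes to dst[i+8*w].
func transpose8(cvs [][8]uint32, dst *[64]uint32) {
	for i, cv := range cvs {
		if i >= 8 {
			break // HashP and HashF handle at most 8 entries per call
		}
		for w, word := range cv {
			dst[i+8*w] = word
		}
	}
}

A caller would then pass the packed left/right arrays to HashP together with n set to the number of entries, and read parent i back out with the same stride-8 indexing.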