author | 2022-01-16 18:52:30 +0100
---|---
committer | 2022-01-16 18:52:30 +0100
commit | 6f5ccf435585e43a00e3cc50f4bcefac36ada818 (patch)
tree | ba368d27464b79b1e5d010c0662fd3e340bf108e /vendor/github.com/zeebo/blake3/internal/alg/hash
parent | add go-runners to readme (diff)
download | gotosocial-6f5ccf435585e43a00e3cc50f4bcefac36ada818.tar.xz
update dependencies
Diffstat (limited to 'vendor/github.com/zeebo/blake3/internal/alg/hash')
6 files changed, 2700 insertions, 0 deletions
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash.go
new file mode 100644
index 000000000..ac43abb69
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash.go
@@ -0,0 +1,23 @@
+package hash
+
+import (
+	"github.com/zeebo/blake3/internal/alg/hash/hash_avx2"
+	"github.com/zeebo/blake3/internal/alg/hash/hash_pure"
+	"github.com/zeebo/blake3/internal/consts"
+)
+
+func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) {
+	if consts.HasAVX2 && length > 2*consts.ChunkLen {
+		hash_avx2.HashF(input, length, counter, flags, key, out, chain)
+	} else {
+		hash_pure.HashF(input, length, counter, flags, key, out, chain)
+	}
+}
+
+func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int) {
+	if consts.HasAVX2 && n >= 2 {
+		hash_avx2.HashP(left, right, flags, key, out, n)
+	} else {
+		hash_pure.HashP(left, right, flags, key, out, n)
+	}
+}
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_amd64.s b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_amd64.s
new file mode 100644
index 000000000..d7531664b
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_amd64.s
@@ -0,0 +1,2561 @@
+// Code generated by command: go run main.go. DO NOT EDIT.
+
+#include "textflag.h"
+
+DATA iv<>+0(SB)/4, $0x6a09e667
+DATA iv<>+4(SB)/4, $0xbb67ae85
+DATA iv<>+8(SB)/4, $0x3c6ef372
+DATA iv<>+12(SB)/4, $0xa54ff53a
+DATA iv<>+16(SB)/4, $0x510e527f
+DATA iv<>+20(SB)/4, $0x9b05688c
+DATA iv<>+24(SB)/4, $0x1f83d9ab
+DATA iv<>+28(SB)/4, $0x5be0cd19
+GLOBL iv<>(SB), RODATA|NOPTR, $32
+
+DATA rot16_shuf<>+0(SB)/1, $0x02
+DATA rot16_shuf<>+1(SB)/1, $0x03
+DATA rot16_shuf<>+2(SB)/1, $0x00
+DATA rot16_shuf<>+3(SB)/1, $0x01
+DATA rot16_shuf<>+4(SB)/1, $0x06
+DATA rot16_shuf<>+5(SB)/1, $0x07
+DATA rot16_shuf<>+6(SB)/1, $0x04
+DATA rot16_shuf<>+7(SB)/1, $0x05
+DATA rot16_shuf<>+8(SB)/1, $0x0a
+DATA rot16_shuf<>+9(SB)/1, $0x0b
+DATA rot16_shuf<>+10(SB)/1, $0x08
+DATA rot16_shuf<>+11(SB)/1, $0x09
+DATA rot16_shuf<>+12(SB)/1, $0x0e
+DATA rot16_shuf<>+13(SB)/1, $0x0f
+DATA rot16_shuf<>+14(SB)/1, $0x0c
+DATA rot16_shuf<>+15(SB)/1, $0x0d
+DATA rot16_shuf<>+16(SB)/1, $0x12
+DATA rot16_shuf<>+17(SB)/1, $0x13
+DATA rot16_shuf<>+18(SB)/1, $0x10
+DATA rot16_shuf<>+19(SB)/1, $0x11
+DATA rot16_shuf<>+20(SB)/1, $0x16
+DATA rot16_shuf<>+21(SB)/1, $0x17
+DATA rot16_shuf<>+22(SB)/1, $0x14
+DATA rot16_shuf<>+23(SB)/1, $0x15
+DATA rot16_shuf<>+24(SB)/1, $0x1a
+DATA rot16_shuf<>+25(SB)/1, $0x1b
+DATA rot16_shuf<>+26(SB)/1, $0x18
+DATA rot16_shuf<>+27(SB)/1, $0x19
+DATA rot16_shuf<>+28(SB)/1, $0x1e
+DATA rot16_shuf<>+29(SB)/1, $0x1f
+DATA rot16_shuf<>+30(SB)/1, $0x1c
+DATA rot16_shuf<>+31(SB)/1, $0x1d
+GLOBL rot16_shuf<>(SB), RODATA|NOPTR, $32
+
+DATA rot8_shuf<>+0(SB)/1, $0x01
+DATA rot8_shuf<>+1(SB)/1, $0x02
+DATA rot8_shuf<>+2(SB)/1, $0x03
+DATA rot8_shuf<>+3(SB)/1, $0x00
+DATA rot8_shuf<>+4(SB)/1, $0x05
+DATA rot8_shuf<>+5(SB)/1, $0x06
+DATA rot8_shuf<>+6(SB)/1, $0x07
+DATA rot8_shuf<>+7(SB)/1, $0x04
+DATA rot8_shuf<>+8(SB)/1, $0x09
+DATA rot8_shuf<>+9(SB)/1, $0x0a
+DATA rot8_shuf<>+10(SB)/1, $0x0b
+DATA rot8_shuf<>+11(SB)/1, $0x08
+DATA rot8_shuf<>+12(SB)/1, $0x0d
+DATA rot8_shuf<>+13(SB)/1, $0x0e
+DATA rot8_shuf<>+14(SB)/1, $0x0f
+DATA rot8_shuf<>+15(SB)/1, $0x0c
+DATA rot8_shuf<>+16(SB)/1, $0x11
+DATA rot8_shuf<>+17(SB)/1, $0x12
+DATA rot8_shuf<>+18(SB)/1, $0x13
+DATA rot8_shuf<>+19(SB)/1, $0x10
+DATA rot8_shuf<>+20(SB)/1, $0x15
+DATA rot8_shuf<>+21(SB)/1, $0x16
+DATA rot8_shuf<>+22(SB)/1, $0x17
+DATA rot8_shuf<>+23(SB)/1, $0x14
+DATA rot8_shuf<>+24(SB)/1, $0x19
+DATA rot8_shuf<>+25(SB)/1, $0x1a
+DATA rot8_shuf<>+26(SB)/1, $0x1b
+DATA rot8_shuf<>+27(SB)/1, $0x18
+DATA rot8_shuf<>+28(SB)/1, $0x1d
+DATA rot8_shuf<>+29(SB)/1, $0x1e
+DATA rot8_shuf<>+30(SB)/1, $0x1f
+DATA rot8_shuf<>+31(SB)/1, $0x1c
+GLOBL rot8_shuf<>(SB), RODATA|NOPTR, $32
+
+DATA block_len<>+0(SB)/4, $0x00000040
+DATA block_len<>+4(SB)/4, $0x00000040
+DATA block_len<>+8(SB)/4, $0x00000040
+DATA block_len<>+12(SB)/4, $0x00000040
+DATA block_len<>+16(SB)/4, $0x00000040
+DATA block_len<>+20(SB)/4, $0x00000040
+DATA block_len<>+24(SB)/4, $0x00000040
+DATA block_len<>+28(SB)/4, $0x00000040
+GLOBL block_len<>(SB), RODATA|NOPTR, $32
+
+DATA zero<>+0(SB)/4, $0x00000000
+DATA zero<>+4(SB)/4, $0x00000000
+DATA zero<>+8(SB)/4, $0x00000000
+DATA zero<>+12(SB)/4, $0x00000000
+DATA zero<>+16(SB)/4, $0x00000000
+DATA zero<>+20(SB)/4, $0x00000000
+DATA zero<>+24(SB)/4, $0x00000000
+DATA zero<>+28(SB)/4, $0x00000000
+GLOBL zero<>(SB), RODATA|NOPTR, $32
+
+DATA counter<>+0(SB)/8, $0x0000000000000000
+DATA counter<>+8(SB)/8, $0x0000000000000001
+DATA counter<>+16(SB)/8, $0x0000000000000002
+DATA counter<>+24(SB)/8, $0x0000000000000003
+DATA counter<>+32(SB)/8, $0x0000000000000004
+DATA counter<>+40(SB)/8, $0x0000000000000005
+DATA counter<>+48(SB)/8, $0x0000000000000006
+DATA counter<>+56(SB)/8, $0x0000000000000007
+GLOBL counter<>(SB), RODATA|NOPTR, $64
+
+// func HashF(input *[8192]byte, length uint64, counter uint64, flags uint32, key *[8]uint32, out *[32]uint32, chain *[8]uint32)
+// Requires: AVX, AVX2
+TEXT ·HashF(SB), $688-56
+	MOVQ input+0(FP), AX
+	MOVQ length+8(FP), CX
+	MOVQ counter+16(FP), DX
+	MOVL flags+24(FP), BX
+	MOVQ key+32(FP), BP
+	MOVQ out+40(FP), SI
+	MOVQ chain+48(FP), DI
+
+	// Allocate local space and align it
+	LEAQ 31(SP), R10
+	MOVQ $0x000000000000001f, R8
+	NOTQ R8
+	ANDQ R8, R10
+
+	// Skip if the length is zero
+	XORQ R8, R8
+	XORQ R9, R9
+	TESTQ CX, CX
+	JZ skip_compute
+
+	// Compute complete chunks and blocks
+	SUBQ $0x01, CX
+	MOVQ CX, R8
+	SHRQ $0x0a, R8
+	MOVQ CX, R9
+	ANDQ $0x000003c0, R9
+
+skip_compute:
+	// Load some params into the stack (avo improvment?)
+ MOVL BX, 64(SP) + MOVQ DX, 72(SP) + + // Load IV into vectors + VPBROADCASTD (BP), Y0 + VPBROADCASTD 4(BP), Y1 + VPBROADCASTD 8(BP), Y2 + VPBROADCASTD 12(BP), Y3 + VPBROADCASTD 16(BP), Y4 + VPBROADCASTD 20(BP), Y5 + VPBROADCASTD 24(BP), Y6 + VPBROADCASTD 28(BP), Y7 + + // Build and store counter data on the stack + VPBROADCASTQ 72(SP), Y8 + VPADDQ counter<>+0(SB), Y8, Y8 + VPBROADCASTQ 72(SP), Y9 + VPADDQ counter<>+32(SB), Y9, Y9 + VPUNPCKLDQ Y9, Y8, Y10 + VPUNPCKHDQ Y9, Y8, Y8 + VPUNPCKLDQ Y8, Y10, Y9 + VPUNPCKHDQ Y8, Y10, Y8 + VPERMQ $0xd8, Y9, Y9 + VPERMQ $0xd8, Y8, Y8 + VMOVDQU Y9, 112(SP) + VMOVDQU Y8, 144(SP) + + // Set up block flags and variables for iteration + XORQ CX, CX + ORL $0x01, 64(SP) + +loop: + // Include end flags if last block + CMPQ CX, $0x000003c0 + JNE round_setup + ORL $0x02, 64(SP) + +round_setup: + // Load and transpose message vectors + VMOVDQU (AX)(CX*1), Y8 + VMOVDQU 1024(AX)(CX*1), Y9 + VMOVDQU 2048(AX)(CX*1), Y10 + VMOVDQU 3072(AX)(CX*1), Y11 + VMOVDQU 4096(AX)(CX*1), Y12 + VMOVDQU 5120(AX)(CX*1), Y13 + VMOVDQU 6144(AX)(CX*1), Y14 + VMOVDQU 7168(AX)(CX*1), Y15 + VMOVDQA Y0, (R10) + VPUNPCKLDQ Y9, Y8, Y0 + VPUNPCKHDQ Y9, Y8, Y8 + VPUNPCKLDQ Y11, Y10, Y9 + VPUNPCKHDQ Y11, Y10, Y10 + VPUNPCKLDQ Y13, Y12, Y11 + VPUNPCKHDQ Y13, Y12, Y12 + VPUNPCKLDQ Y15, Y14, Y13 + VPUNPCKHDQ Y15, Y14, Y14 + VPUNPCKLQDQ Y9, Y0, Y15 + VPUNPCKHQDQ Y9, Y0, Y0 + VPUNPCKLQDQ Y10, Y8, Y9 + VPUNPCKHQDQ Y10, Y8, Y8 + VPUNPCKLQDQ Y13, Y11, Y10 + VPUNPCKHQDQ Y13, Y11, Y11 + VPUNPCKLQDQ Y14, Y12, Y13 + VPUNPCKHQDQ Y14, Y12, Y12 + VINSERTI128 $0x01, X10, Y15, Y14 + VPERM2I128 $0x31, Y10, Y15, Y10 + VINSERTI128 $0x01, X11, Y0, Y15 + VPERM2I128 $0x31, Y11, Y0, Y0 + VINSERTI128 $0x01, X13, Y9, Y11 + VPERM2I128 $0x31, Y13, Y9, Y9 + VINSERTI128 $0x01, X12, Y8, Y13 + VPERM2I128 $0x31, Y12, Y8, Y8 + VMOVDQU Y14, 176(SP) + VMOVDQU Y15, 208(SP) + VMOVDQU Y11, 240(SP) + VMOVDQU Y13, 272(SP) + VMOVDQU Y10, 304(SP) + VMOVDQU Y0, 336(SP) + VMOVDQU Y9, 368(SP) + VMOVDQU Y8, 400(SP) + VMOVDQU 32(AX)(CX*1), Y0 + VMOVDQU 1056(AX)(CX*1), Y8 + VMOVDQU 2080(AX)(CX*1), Y9 + VMOVDQU 3104(AX)(CX*1), Y10 + VMOVDQU 4128(AX)(CX*1), Y11 + VMOVDQU 5152(AX)(CX*1), Y12 + VMOVDQU 6176(AX)(CX*1), Y13 + VMOVDQU 7200(AX)(CX*1), Y14 + VPUNPCKLDQ Y8, Y0, Y15 + VPUNPCKHDQ Y8, Y0, Y0 + VPUNPCKLDQ Y10, Y9, Y8 + VPUNPCKHDQ Y10, Y9, Y9 + VPUNPCKLDQ Y12, Y11, Y10 + VPUNPCKHDQ Y12, Y11, Y11 + VPUNPCKLDQ Y14, Y13, Y12 + VPUNPCKHDQ Y14, Y13, Y13 + VPUNPCKLQDQ Y8, Y15, Y14 + VPUNPCKHQDQ Y8, Y15, Y8 + VPUNPCKLQDQ Y9, Y0, Y15 + VPUNPCKHQDQ Y9, Y0, Y0 + VPUNPCKLQDQ Y12, Y10, Y9 + VPUNPCKHQDQ Y12, Y10, Y10 + VPUNPCKLQDQ Y13, Y11, Y12 + VPUNPCKHQDQ Y13, Y11, Y11 + VINSERTI128 $0x01, X9, Y14, Y13 + VPERM2I128 $0x31, Y9, Y14, Y9 + VINSERTI128 $0x01, X10, Y8, Y14 + VPERM2I128 $0x31, Y10, Y8, Y8 + VINSERTI128 $0x01, X12, Y15, Y10 + VPERM2I128 $0x31, Y12, Y15, Y12 + VINSERTI128 $0x01, X11, Y0, Y15 + VPERM2I128 $0x31, Y11, Y0, Y0 + VMOVDQU Y13, 432(SP) + VMOVDQU Y14, 464(SP) + VMOVDQU Y10, 496(SP) + VMOVDQU Y15, 528(SP) + VMOVDQU Y9, 560(SP) + VMOVDQU Y8, 592(SP) + VMOVDQU Y12, 624(SP) + VMOVDQU Y0, 656(SP) + + // Load constants for the round + VMOVDQA (R10), Y0 + VMOVDQU block_len<>+0(SB), Y8 + VPBROADCASTD 64(SP), Y9 + VPBROADCASTD iv<>+0(SB), Y10 + VPBROADCASTD iv<>+4(SB), Y11 + VPBROADCASTD iv<>+8(SB), Y12 + VPBROADCASTD iv<>+12(SB), Y13 + VMOVDQU 112(SP), Y14 + VMOVDQU 144(SP), Y15 + + // Save state for partial chunk if necessary + CMPQ CX, R9 + JNE begin_rounds + VMOVDQU Y0, 80(SP) + MOVL 80(SP)(R8*4), DX + MOVL DX, (DI) + VMOVDQU Y1, 
80(SP) + MOVL 80(SP)(R8*4), DX + MOVL DX, 4(DI) + VMOVDQU Y2, 80(SP) + MOVL 80(SP)(R8*4), DX + MOVL DX, 8(DI) + VMOVDQU Y3, 80(SP) + MOVL 80(SP)(R8*4), DX + MOVL DX, 12(DI) + VMOVDQU Y4, 80(SP) + MOVL 80(SP)(R8*4), DX + MOVL DX, 16(DI) + VMOVDQU Y5, 80(SP) + MOVL 80(SP)(R8*4), DX + MOVL DX, 20(DI) + VMOVDQU Y6, 80(SP) + MOVL 80(SP)(R8*4), DX + MOVL DX, 24(DI) + VMOVDQU Y7, 80(SP) + MOVL 80(SP)(R8*4), DX + MOVL DX, 28(DI) + +begin_rounds: + // Perform the rounds + // Round 1 + VPADDD 176(SP), Y0, Y0 + VPADDD 240(SP), Y1, Y1 + VPADDD 304(SP), Y2, Y2 + VPADDD 368(SP), Y3, Y3 + VPADDD Y4, Y0, Y0 + VPXOR Y0, Y14, Y14 + VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 + VPADDD Y5, Y1, Y1 + VPXOR Y1, Y15, Y15 + VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y8, Y8 + VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 + VPADDD Y7, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y14, Y10, Y10 + VPXOR Y10, Y4, Y4 + VPADDD Y15, Y11, Y11 + VPXOR Y11, Y5, Y5 + VPADDD Y8, Y12, Y12 + VPXOR Y12, Y6, Y6 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y7, Y7 + VMOVDQA Y0, (R10) + VPSRLD $0x0c, Y4, Y0 + VPSLLD $0x14, Y4, Y4 + VPOR Y0, Y4, Y0 + VPSRLD $0x0c, Y5, Y4 + VPSLLD $0x14, Y5, Y5 + VPOR Y4, Y5, Y4 + VPSRLD $0x0c, Y6, Y5 + VPSLLD $0x14, Y6, Y6 + VPOR Y5, Y6, Y5 + VPSRLD $0x0c, Y7, Y6 + VPSLLD $0x14, Y7, Y7 + VPOR Y6, Y7, Y6 + VMOVDQA (R10), Y7 + VPADDD 208(SP), Y7, Y7 + VPADDD 272(SP), Y1, Y1 + VPADDD 336(SP), Y2, Y2 + VPADDD 400(SP), Y3, Y3 + VPADDD Y0, Y7, Y7 + VPXOR Y7, Y14, Y14 + VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y15, Y15 + VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 + VPADDD Y5, Y2, Y2 + VPXOR Y2, Y8, Y8 + VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 + VPADDD Y6, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y14, Y10, Y10 + VPXOR Y10, Y0, Y0 + VPADDD Y15, Y11, Y11 + VPXOR Y11, Y4, Y4 + VPADDD Y8, Y12, Y12 + VPXOR Y12, Y5, Y5 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y6, Y6 + VMOVDQA Y7, (R10) + VPSRLD $0x07, Y0, Y7 + VPSLLD $0x19, Y0, Y0 + VPOR Y7, Y0, Y0 + VPSRLD $0x07, Y4, Y7 + VPSLLD $0x19, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x07, Y5, Y7 + VPSLLD $0x19, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x07, Y6, Y7 + VPSLLD $0x19, Y6, Y6 + VPOR Y7, Y6, Y6 + VMOVDQA (R10), Y7 + VPADDD 432(SP), Y7, Y7 + VPADDD 496(SP), Y1, Y1 + VPADDD 560(SP), Y2, Y2 + VPADDD 624(SP), Y3, Y3 + VPADDD Y4, Y7, Y7 + VPXOR Y7, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y5, Y1, Y1 + VPXOR Y1, Y14, Y14 + VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y15, Y15 + VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y8, Y8 + VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 + VPADDD Y9, Y12, Y12 + VPXOR Y12, Y4, Y4 + VPADDD Y14, Y13, Y13 + VPXOR Y13, Y5, Y5 + VPADDD Y15, Y10, Y10 + VPXOR Y10, Y6, Y6 + VPADDD Y8, Y11, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQA Y7, (R10) + VPSRLD $0x0c, Y4, Y7 + VPSLLD $0x14, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x0c, Y5, Y7 + VPSLLD $0x14, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x0c, Y6, Y7 + VPSLLD $0x14, Y6, Y6 + VPOR Y7, Y6, Y6 + VPSRLD $0x0c, Y0, Y7 + VPSLLD $0x14, Y0, Y0 + VPOR Y7, Y0, Y0 + VMOVDQA (R10), Y7 + VPADDD 464(SP), Y7, Y7 + VPADDD 528(SP), Y1, Y1 + VPADDD 592(SP), Y2, Y2 + VPADDD 656(SP), Y3, Y3 + VPADDD Y4, Y7, Y7 + VPXOR Y7, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y5, Y1, Y1 + VPXOR Y1, Y14, Y14 + VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y15, Y15 + VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y8, Y8 + VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 + VPADDD Y9, Y12, Y12 + VPXOR Y12, 
Y4, Y4 + VPADDD Y14, Y13, Y13 + VPXOR Y13, Y5, Y5 + VPADDD Y15, Y10, Y10 + VPXOR Y10, Y6, Y6 + VPADDD Y8, Y11, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQA Y7, (R10) + VPSRLD $0x07, Y4, Y7 + VPSLLD $0x19, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x07, Y5, Y7 + VPSLLD $0x19, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x07, Y6, Y7 + VPSLLD $0x19, Y6, Y6 + VPOR Y7, Y6, Y6 + VPSRLD $0x07, Y0, Y7 + VPSLLD $0x19, Y0, Y0 + VPOR Y7, Y0, Y0 + + // Round 2 + VMOVDQA (R10), Y7 + VPADDD 240(SP), Y7, Y7 + VPADDD 272(SP), Y1, Y1 + VPADDD 400(SP), Y2, Y2 + VPADDD 304(SP), Y3, Y3 + VPADDD Y0, Y7, Y7 + VPXOR Y7, Y14, Y14 + VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y15, Y15 + VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 + VPADDD Y5, Y2, Y2 + VPXOR Y2, Y8, Y8 + VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 + VPADDD Y6, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y14, Y10, Y10 + VPXOR Y10, Y0, Y0 + VPADDD Y15, Y11, Y11 + VPXOR Y11, Y4, Y4 + VPADDD Y8, Y12, Y12 + VPXOR Y12, Y5, Y5 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y6, Y6 + VMOVDQA Y7, (R10) + VPSRLD $0x0c, Y0, Y7 + VPSLLD $0x14, Y0, Y0 + VPOR Y7, Y0, Y0 + VPSRLD $0x0c, Y4, Y7 + VPSLLD $0x14, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x0c, Y5, Y7 + VPSLLD $0x14, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x0c, Y6, Y7 + VPSLLD $0x14, Y6, Y6 + VPOR Y7, Y6, Y6 + VMOVDQA (R10), Y7 + VPADDD 368(SP), Y7, Y7 + VPADDD 496(SP), Y1, Y1 + VPADDD 176(SP), Y2, Y2 + VPADDD 592(SP), Y3, Y3 + VPADDD Y0, Y7, Y7 + VPXOR Y7, Y14, Y14 + VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y15, Y15 + VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 + VPADDD Y5, Y2, Y2 + VPXOR Y2, Y8, Y8 + VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 + VPADDD Y6, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y14, Y10, Y10 + VPXOR Y10, Y0, Y0 + VPADDD Y15, Y11, Y11 + VPXOR Y11, Y4, Y4 + VPADDD Y8, Y12, Y12 + VPXOR Y12, Y5, Y5 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y6, Y6 + VMOVDQA Y7, (R10) + VPSRLD $0x07, Y0, Y7 + VPSLLD $0x19, Y0, Y0 + VPOR Y7, Y0, Y0 + VPSRLD $0x07, Y4, Y7 + VPSLLD $0x19, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x07, Y5, Y7 + VPSLLD $0x19, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x07, Y6, Y7 + VPSLLD $0x19, Y6, Y6 + VPOR Y7, Y6, Y6 + VMOVDQA (R10), Y7 + VPADDD 208(SP), Y7, Y7 + VPADDD 560(SP), Y1, Y1 + VPADDD 464(SP), Y2, Y2 + VPADDD 656(SP), Y3, Y3 + VPADDD Y4, Y7, Y7 + VPXOR Y7, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y5, Y1, Y1 + VPXOR Y1, Y14, Y14 + VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y15, Y15 + VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y8, Y8 + VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 + VPADDD Y9, Y12, Y12 + VPXOR Y12, Y4, Y4 + VPADDD Y14, Y13, Y13 + VPXOR Y13, Y5, Y5 + VPADDD Y15, Y10, Y10 + VPXOR Y10, Y6, Y6 + VPADDD Y8, Y11, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQA Y7, (R10) + VPSRLD $0x0c, Y4, Y7 + VPSLLD $0x14, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x0c, Y5, Y7 + VPSLLD $0x14, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x0c, Y6, Y7 + VPSLLD $0x14, Y6, Y6 + VPOR Y7, Y6, Y6 + VPSRLD $0x0c, Y0, Y7 + VPSLLD $0x14, Y0, Y0 + VPOR Y7, Y0, Y0 + VMOVDQA (R10), Y7 + VPADDD 528(SP), Y7, Y7 + VPADDD 336(SP), Y1, Y1 + VPADDD 624(SP), Y2, Y2 + VPADDD 432(SP), Y3, Y3 + VPADDD Y4, Y7, Y7 + VPXOR Y7, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y5, Y1, Y1 + VPXOR Y1, Y14, Y14 + VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y15, Y15 + VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y8, Y8 + VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 + VPADDD Y9, Y12, Y12 + VPXOR Y12, Y4, Y4 + VPADDD Y14, Y13, Y13 + 
VPXOR Y13, Y5, Y5 + VPADDD Y15, Y10, Y10 + VPXOR Y10, Y6, Y6 + VPADDD Y8, Y11, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQA Y7, (R10) + VPSRLD $0x07, Y4, Y7 + VPSLLD $0x19, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x07, Y5, Y7 + VPSLLD $0x19, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x07, Y6, Y7 + VPSLLD $0x19, Y6, Y6 + VPOR Y7, Y6, Y6 + VPSRLD $0x07, Y0, Y7 + VPSLLD $0x19, Y0, Y0 + VPOR Y7, Y0, Y0 + + // Round 3 + VMOVDQA (R10), Y7 + VPADDD 272(SP), Y7, Y7 + VPADDD 496(SP), Y1, Y1 + VPADDD 592(SP), Y2, Y2 + VPADDD 400(SP), Y3, Y3 + VPADDD Y0, Y7, Y7 + VPXOR Y7, Y14, Y14 + VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y15, Y15 + VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 + VPADDD Y5, Y2, Y2 + VPXOR Y2, Y8, Y8 + VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 + VPADDD Y6, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y14, Y10, Y10 + VPXOR Y10, Y0, Y0 + VPADDD Y15, Y11, Y11 + VPXOR Y11, Y4, Y4 + VPADDD Y8, Y12, Y12 + VPXOR Y12, Y5, Y5 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y6, Y6 + VMOVDQA Y7, (R10) + VPSRLD $0x0c, Y0, Y7 + VPSLLD $0x14, Y0, Y0 + VPOR Y7, Y0, Y0 + VPSRLD $0x0c, Y4, Y7 + VPSLLD $0x14, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x0c, Y5, Y7 + VPSLLD $0x14, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x0c, Y6, Y7 + VPSLLD $0x14, Y6, Y6 + VPOR Y7, Y6, Y6 + VMOVDQA (R10), Y7 + VPADDD 304(SP), Y7, Y7 + VPADDD 560(SP), Y1, Y1 + VPADDD 240(SP), Y2, Y2 + VPADDD 624(SP), Y3, Y3 + VPADDD Y0, Y7, Y7 + VPXOR Y7, Y14, Y14 + VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y15, Y15 + VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 + VPADDD Y5, Y2, Y2 + VPXOR Y2, Y8, Y8 + VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 + VPADDD Y6, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y14, Y10, Y10 + VPXOR Y10, Y0, Y0 + VPADDD Y15, Y11, Y11 + VPXOR Y11, Y4, Y4 + VPADDD Y8, Y12, Y12 + VPXOR Y12, Y5, Y5 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y6, Y6 + VMOVDQA Y7, (R10) + VPSRLD $0x07, Y0, Y7 + VPSLLD $0x19, Y0, Y0 + VPOR Y7, Y0, Y0 + VPSRLD $0x07, Y4, Y7 + VPSLLD $0x19, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x07, Y5, Y7 + VPSLLD $0x19, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x07, Y6, Y7 + VPSLLD $0x19, Y6, Y6 + VPOR Y7, Y6, Y6 + VMOVDQA (R10), Y7 + VPADDD 368(SP), Y7, Y7 + VPADDD 464(SP), Y1, Y1 + VPADDD 528(SP), Y2, Y2 + VPADDD 432(SP), Y3, Y3 + VPADDD Y4, Y7, Y7 + VPXOR Y7, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y5, Y1, Y1 + VPXOR Y1, Y14, Y14 + VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y15, Y15 + VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y8, Y8 + VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 + VPADDD Y9, Y12, Y12 + VPXOR Y12, Y4, Y4 + VPADDD Y14, Y13, Y13 + VPXOR Y13, Y5, Y5 + VPADDD Y15, Y10, Y10 + VPXOR Y10, Y6, Y6 + VPADDD Y8, Y11, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQA Y7, (R10) + VPSRLD $0x0c, Y4, Y7 + VPSLLD $0x14, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x0c, Y5, Y7 + VPSLLD $0x14, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x0c, Y6, Y7 + VPSLLD $0x14, Y6, Y6 + VPOR Y7, Y6, Y6 + VPSRLD $0x0c, Y0, Y7 + VPSLLD $0x14, Y0, Y0 + VPOR Y7, Y0, Y0 + VMOVDQA (R10), Y7 + VPADDD 336(SP), Y7, Y7 + VPADDD 176(SP), Y1, Y1 + VPADDD 656(SP), Y2, Y2 + VPADDD 208(SP), Y3, Y3 + VPADDD Y4, Y7, Y7 + VPXOR Y7, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y5, Y1, Y1 + VPXOR Y1, Y14, Y14 + VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y15, Y15 + VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y8, Y8 + VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 + VPADDD Y9, Y12, Y12 + VPXOR Y12, Y4, Y4 + VPADDD Y14, Y13, Y13 + VPXOR Y13, Y5, Y5 + VPADDD Y15, 
Y10, Y10 + VPXOR Y10, Y6, Y6 + VPADDD Y8, Y11, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQA Y7, (R10) + VPSRLD $0x07, Y4, Y7 + VPSLLD $0x19, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x07, Y5, Y7 + VPSLLD $0x19, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x07, Y6, Y7 + VPSLLD $0x19, Y6, Y6 + VPOR Y7, Y6, Y6 + VPSRLD $0x07, Y0, Y7 + VPSLLD $0x19, Y0, Y0 + VPOR Y7, Y0, Y0 + + // Round 4 + VMOVDQA (R10), Y7 + VPADDD 496(SP), Y7, Y7 + VPADDD 560(SP), Y1, Y1 + VPADDD 624(SP), Y2, Y2 + VPADDD 592(SP), Y3, Y3 + VPADDD Y0, Y7, Y7 + VPXOR Y7, Y14, Y14 + VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y15, Y15 + VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 + VPADDD Y5, Y2, Y2 + VPXOR Y2, Y8, Y8 + VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 + VPADDD Y6, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y14, Y10, Y10 + VPXOR Y10, Y0, Y0 + VPADDD Y15, Y11, Y11 + VPXOR Y11, Y4, Y4 + VPADDD Y8, Y12, Y12 + VPXOR Y12, Y5, Y5 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y6, Y6 + VMOVDQA Y7, (R10) + VPSRLD $0x0c, Y0, Y7 + VPSLLD $0x14, Y0, Y0 + VPOR Y7, Y0, Y0 + VPSRLD $0x0c, Y4, Y7 + VPSLLD $0x14, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x0c, Y5, Y7 + VPSLLD $0x14, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x0c, Y6, Y7 + VPSLLD $0x14, Y6, Y6 + VPOR Y7, Y6, Y6 + VMOVDQA (R10), Y7 + VPADDD 400(SP), Y7, Y7 + VPADDD 464(SP), Y1, Y1 + VPADDD 272(SP), Y2, Y2 + VPADDD 656(SP), Y3, Y3 + VPADDD Y0, Y7, Y7 + VPXOR Y7, Y14, Y14 + VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y15, Y15 + VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 + VPADDD Y5, Y2, Y2 + VPXOR Y2, Y8, Y8 + VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 + VPADDD Y6, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y14, Y10, Y10 + VPXOR Y10, Y0, Y0 + VPADDD Y15, Y11, Y11 + VPXOR Y11, Y4, Y4 + VPADDD Y8, Y12, Y12 + VPXOR Y12, Y5, Y5 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y6, Y6 + VMOVDQA Y7, (R10) + VPSRLD $0x07, Y0, Y7 + VPSLLD $0x19, Y0, Y0 + VPOR Y7, Y0, Y0 + VPSRLD $0x07, Y4, Y7 + VPSLLD $0x19, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x07, Y5, Y7 + VPSLLD $0x19, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x07, Y6, Y7 + VPSLLD $0x19, Y6, Y6 + VPOR Y7, Y6, Y6 + VMOVDQA (R10), Y7 + VPADDD 304(SP), Y7, Y7 + VPADDD 528(SP), Y1, Y1 + VPADDD 336(SP), Y2, Y2 + VPADDD 208(SP), Y3, Y3 + VPADDD Y4, Y7, Y7 + VPXOR Y7, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y5, Y1, Y1 + VPXOR Y1, Y14, Y14 + VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y15, Y15 + VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y8, Y8 + VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 + VPADDD Y9, Y12, Y12 + VPXOR Y12, Y4, Y4 + VPADDD Y14, Y13, Y13 + VPXOR Y13, Y5, Y5 + VPADDD Y15, Y10, Y10 + VPXOR Y10, Y6, Y6 + VPADDD Y8, Y11, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQA Y7, (R10) + VPSRLD $0x0c, Y4, Y7 + VPSLLD $0x14, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x0c, Y5, Y7 + VPSLLD $0x14, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x0c, Y6, Y7 + VPSLLD $0x14, Y6, Y6 + VPOR Y7, Y6, Y6 + VPSRLD $0x0c, Y0, Y7 + VPSLLD $0x14, Y0, Y0 + VPOR Y7, Y0, Y0 + VMOVDQA (R10), Y7 + VPADDD 176(SP), Y7, Y7 + VPADDD 240(SP), Y1, Y1 + VPADDD 432(SP), Y2, Y2 + VPADDD 368(SP), Y3, Y3 + VPADDD Y4, Y7, Y7 + VPXOR Y7, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y5, Y1, Y1 + VPXOR Y1, Y14, Y14 + VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y15, Y15 + VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y8, Y8 + VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 + VPADDD Y9, Y12, Y12 + VPXOR Y12, Y4, Y4 + VPADDD Y14, Y13, Y13 + VPXOR Y13, Y5, Y5 + VPADDD Y15, Y10, Y10 + VPXOR Y10, Y6, Y6 + 
VPADDD Y8, Y11, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQA Y7, (R10) + VPSRLD $0x07, Y4, Y7 + VPSLLD $0x19, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x07, Y5, Y7 + VPSLLD $0x19, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x07, Y6, Y7 + VPSLLD $0x19, Y6, Y6 + VPOR Y7, Y6, Y6 + VPSRLD $0x07, Y0, Y7 + VPSLLD $0x19, Y0, Y0 + VPOR Y7, Y0, Y0 + + // Round 5 + VMOVDQA (R10), Y7 + VPADDD 560(SP), Y7, Y7 + VPADDD 464(SP), Y1, Y1 + VPADDD 656(SP), Y2, Y2 + VPADDD 624(SP), Y3, Y3 + VPADDD Y0, Y7, Y7 + VPXOR Y7, Y14, Y14 + VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y15, Y15 + VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 + VPADDD Y5, Y2, Y2 + VPXOR Y2, Y8, Y8 + VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 + VPADDD Y6, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y14, Y10, Y10 + VPXOR Y10, Y0, Y0 + VPADDD Y15, Y11, Y11 + VPXOR Y11, Y4, Y4 + VPADDD Y8, Y12, Y12 + VPXOR Y12, Y5, Y5 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y6, Y6 + VMOVDQA Y7, (R10) + VPSRLD $0x0c, Y0, Y7 + VPSLLD $0x14, Y0, Y0 + VPOR Y7, Y0, Y0 + VPSRLD $0x0c, Y4, Y7 + VPSLLD $0x14, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x0c, Y5, Y7 + VPSLLD $0x14, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x0c, Y6, Y7 + VPSLLD $0x14, Y6, Y6 + VPOR Y7, Y6, Y6 + VMOVDQA (R10), Y7 + VPADDD 592(SP), Y7, Y7 + VPADDD 528(SP), Y1, Y1 + VPADDD 496(SP), Y2, Y2 + VPADDD 432(SP), Y3, Y3 + VPADDD Y0, Y7, Y7 + VPXOR Y7, Y14, Y14 + VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y15, Y15 + VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 + VPADDD Y5, Y2, Y2 + VPXOR Y2, Y8, Y8 + VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 + VPADDD Y6, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y14, Y10, Y10 + VPXOR Y10, Y0, Y0 + VPADDD Y15, Y11, Y11 + VPXOR Y11, Y4, Y4 + VPADDD Y8, Y12, Y12 + VPXOR Y12, Y5, Y5 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y6, Y6 + VMOVDQA Y7, (R10) + VPSRLD $0x07, Y0, Y7 + VPSLLD $0x19, Y0, Y0 + VPOR Y7, Y0, Y0 + VPSRLD $0x07, Y4, Y7 + VPSLLD $0x19, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x07, Y5, Y7 + VPSLLD $0x19, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x07, Y6, Y7 + VPSLLD $0x19, Y6, Y6 + VPOR Y7, Y6, Y6 + VMOVDQA (R10), Y7 + VPADDD 400(SP), Y7, Y7 + VPADDD 336(SP), Y1, Y1 + VPADDD 176(SP), Y2, Y2 + VPADDD 368(SP), Y3, Y3 + VPADDD Y4, Y7, Y7 + VPXOR Y7, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y5, Y1, Y1 + VPXOR Y1, Y14, Y14 + VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y15, Y15 + VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y8, Y8 + VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 + VPADDD Y9, Y12, Y12 + VPXOR Y12, Y4, Y4 + VPADDD Y14, Y13, Y13 + VPXOR Y13, Y5, Y5 + VPADDD Y15, Y10, Y10 + VPXOR Y10, Y6, Y6 + VPADDD Y8, Y11, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQA Y7, (R10) + VPSRLD $0x0c, Y4, Y7 + VPSLLD $0x14, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x0c, Y5, Y7 + VPSLLD $0x14, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x0c, Y6, Y7 + VPSLLD $0x14, Y6, Y6 + VPOR Y7, Y6, Y6 + VPSRLD $0x0c, Y0, Y7 + VPSLLD $0x14, Y0, Y0 + VPOR Y7, Y0, Y0 + VMOVDQA (R10), Y7 + VPADDD 240(SP), Y7, Y7 + VPADDD 272(SP), Y1, Y1 + VPADDD 208(SP), Y2, Y2 + VPADDD 304(SP), Y3, Y3 + VPADDD Y4, Y7, Y7 + VPXOR Y7, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y5, Y1, Y1 + VPXOR Y1, Y14, Y14 + VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y15, Y15 + VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y8, Y8 + VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 + VPADDD Y9, Y12, Y12 + VPXOR Y12, Y4, Y4 + VPADDD Y14, Y13, Y13 + VPXOR Y13, Y5, Y5 + VPADDD Y15, Y10, Y10 + VPXOR Y10, Y6, Y6 + VPADDD Y8, Y11, Y11 + VPXOR Y11, 
Y0, Y0 + VMOVDQA Y7, (R10) + VPSRLD $0x07, Y4, Y7 + VPSLLD $0x19, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x07, Y5, Y7 + VPSLLD $0x19, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x07, Y6, Y7 + VPSLLD $0x19, Y6, Y6 + VPOR Y7, Y6, Y6 + VPSRLD $0x07, Y0, Y7 + VPSLLD $0x19, Y0, Y0 + VPOR Y7, Y0, Y0 + + // Round 6 + VMOVDQA (R10), Y7 + VPADDD 464(SP), Y7, Y7 + VPADDD 528(SP), Y1, Y1 + VPADDD 432(SP), Y2, Y2 + VPADDD 656(SP), Y3, Y3 + VPADDD Y0, Y7, Y7 + VPXOR Y7, Y14, Y14 + VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y15, Y15 + VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 + VPADDD Y5, Y2, Y2 + VPXOR Y2, Y8, Y8 + VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 + VPADDD Y6, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y14, Y10, Y10 + VPXOR Y10, Y0, Y0 + VPADDD Y15, Y11, Y11 + VPXOR Y11, Y4, Y4 + VPADDD Y8, Y12, Y12 + VPXOR Y12, Y5, Y5 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y6, Y6 + VMOVDQA Y7, (R10) + VPSRLD $0x0c, Y0, Y7 + VPSLLD $0x14, Y0, Y0 + VPOR Y7, Y0, Y0 + VPSRLD $0x0c, Y4, Y7 + VPSLLD $0x14, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x0c, Y5, Y7 + VPSLLD $0x14, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x0c, Y6, Y7 + VPSLLD $0x14, Y6, Y6 + VPOR Y7, Y6, Y6 + VMOVDQA (R10), Y7 + VPADDD 624(SP), Y7, Y7 + VPADDD 336(SP), Y1, Y1 + VPADDD 560(SP), Y2, Y2 + VPADDD 208(SP), Y3, Y3 + VPADDD Y0, Y7, Y7 + VPXOR Y7, Y14, Y14 + VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y15, Y15 + VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 + VPADDD Y5, Y2, Y2 + VPXOR Y2, Y8, Y8 + VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 + VPADDD Y6, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y14, Y10, Y10 + VPXOR Y10, Y0, Y0 + VPADDD Y15, Y11, Y11 + VPXOR Y11, Y4, Y4 + VPADDD Y8, Y12, Y12 + VPXOR Y12, Y5, Y5 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y6, Y6 + VMOVDQA Y7, (R10) + VPSRLD $0x07, Y0, Y7 + VPSLLD $0x19, Y0, Y0 + VPOR Y7, Y0, Y0 + VPSRLD $0x07, Y4, Y7 + VPSLLD $0x19, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x07, Y5, Y7 + VPSLLD $0x19, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x07, Y6, Y7 + VPSLLD $0x19, Y6, Y6 + VPOR Y7, Y6, Y6 + VMOVDQA (R10), Y7 + VPADDD 592(SP), Y7, Y7 + VPADDD 176(SP), Y1, Y1 + VPADDD 240(SP), Y2, Y2 + VPADDD 304(SP), Y3, Y3 + VPADDD Y4, Y7, Y7 + VPXOR Y7, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y5, Y1, Y1 + VPXOR Y1, Y14, Y14 + VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y15, Y15 + VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y8, Y8 + VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 + VPADDD Y9, Y12, Y12 + VPXOR Y12, Y4, Y4 + VPADDD Y14, Y13, Y13 + VPXOR Y13, Y5, Y5 + VPADDD Y15, Y10, Y10 + VPXOR Y10, Y6, Y6 + VPADDD Y8, Y11, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQA Y7, (R10) + VPSRLD $0x0c, Y4, Y7 + VPSLLD $0x14, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x0c, Y5, Y7 + VPSLLD $0x14, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x0c, Y6, Y7 + VPSLLD $0x14, Y6, Y6 + VPOR Y7, Y6, Y6 + VPSRLD $0x0c, Y0, Y7 + VPSLLD $0x14, Y0, Y0 + VPOR Y7, Y0, Y0 + VMOVDQA (R10), Y7 + VPADDD 272(SP), Y7, Y7 + VPADDD 496(SP), Y1, Y1 + VPADDD 368(SP), Y2, Y2 + VPADDD 400(SP), Y3, Y3 + VPADDD Y4, Y7, Y7 + VPXOR Y7, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y5, Y1, Y1 + VPXOR Y1, Y14, Y14 + VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y15, Y15 + VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y8, Y8 + VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 + VPADDD Y9, Y12, Y12 + VPXOR Y12, Y4, Y4 + VPADDD Y14, Y13, Y13 + VPXOR Y13, Y5, Y5 + VPADDD Y15, Y10, Y10 + VPXOR Y10, Y6, Y6 + VPADDD Y8, Y11, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQA Y7, (R10) + VPSRLD 
$0x07, Y4, Y7 + VPSLLD $0x19, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x07, Y5, Y7 + VPSLLD $0x19, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x07, Y6, Y7 + VPSLLD $0x19, Y6, Y6 + VPOR Y7, Y6, Y6 + VPSRLD $0x07, Y0, Y7 + VPSLLD $0x19, Y0, Y0 + VPOR Y7, Y0, Y0 + + // Round 7 + VMOVDQA (R10), Y7 + VPADDD 528(SP), Y7, Y7 + VPADDD 336(SP), Y1, Y1 + VPADDD 208(SP), Y2, Y2 + VPADDD 432(SP), Y3, Y3 + VPADDD Y0, Y7, Y7 + VPXOR Y7, Y14, Y14 + VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y15, Y15 + VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 + VPADDD Y5, Y2, Y2 + VPXOR Y2, Y8, Y8 + VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 + VPADDD Y6, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y14, Y10, Y10 + VPXOR Y10, Y0, Y0 + VPADDD Y15, Y11, Y11 + VPXOR Y11, Y4, Y4 + VPADDD Y8, Y12, Y12 + VPXOR Y12, Y5, Y5 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y6, Y6 + VMOVDQA Y7, (R10) + VPSRLD $0x0c, Y0, Y7 + VPSLLD $0x14, Y0, Y0 + VPOR Y7, Y0, Y0 + VPSRLD $0x0c, Y4, Y7 + VPSLLD $0x14, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x0c, Y5, Y7 + VPSLLD $0x14, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x0c, Y6, Y7 + VPSLLD $0x14, Y6, Y6 + VPOR Y7, Y6, Y6 + VMOVDQA (R10), Y7 + VPADDD 656(SP), Y7, Y7 + VPADDD 176(SP), Y1, Y1 + VPADDD 464(SP), Y2, Y2 + VPADDD 368(SP), Y3, Y3 + VPADDD Y0, Y7, Y7 + VPXOR Y7, Y14, Y14 + VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y15, Y15 + VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 + VPADDD Y5, Y2, Y2 + VPXOR Y2, Y8, Y8 + VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 + VPADDD Y6, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y14, Y10, Y10 + VPXOR Y10, Y0, Y0 + VPADDD Y15, Y11, Y11 + VPXOR Y11, Y4, Y4 + VPADDD Y8, Y12, Y12 + VPXOR Y12, Y5, Y5 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y6, Y6 + VMOVDQA Y7, (R10) + VPSRLD $0x07, Y0, Y7 + VPSLLD $0x19, Y0, Y0 + VPOR Y7, Y0, Y0 + VPSRLD $0x07, Y4, Y7 + VPSLLD $0x19, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x07, Y5, Y7 + VPSLLD $0x19, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x07, Y6, Y7 + VPSLLD $0x19, Y6, Y6 + VPOR Y7, Y6, Y6 + VMOVDQA (R10), Y7 + VPADDD 624(SP), Y7, Y7 + VPADDD 240(SP), Y1, Y1 + VPADDD 272(SP), Y2, Y2 + VPADDD 400(SP), Y3, Y3 + VPADDD Y4, Y7, Y7 + VPXOR Y7, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y5, Y1, Y1 + VPXOR Y1, Y14, Y14 + VPSHUFB rot16_shuf<>+0(SB), Y14, Y14 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y15, Y15 + VPSHUFB rot16_shuf<>+0(SB), Y15, Y15 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y8, Y8 + VPSHUFB rot16_shuf<>+0(SB), Y8, Y8 + VPADDD Y9, Y12, Y12 + VPXOR Y12, Y4, Y4 + VPADDD Y14, Y13, Y13 + VPXOR Y13, Y5, Y5 + VPADDD Y15, Y10, Y10 + VPXOR Y10, Y6, Y6 + VPADDD Y8, Y11, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQA Y7, (R10) + VPSRLD $0x0c, Y4, Y7 + VPSLLD $0x14, Y4, Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x0c, Y5, Y7 + VPSLLD $0x14, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x0c, Y6, Y7 + VPSLLD $0x14, Y6, Y6 + VPOR Y7, Y6, Y6 + VPSRLD $0x0c, Y0, Y7 + VPSLLD $0x14, Y0, Y0 + VPOR Y7, Y0, Y0 + VMOVDQA (R10), Y7 + VPADDD 496(SP), Y7, Y7 + VPADDD 560(SP), Y1, Y1 + VPADDD 304(SP), Y2, Y2 + VPADDD 592(SP), Y3, Y3 + VPADDD Y4, Y7, Y7 + VPXOR Y7, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y5, Y1, Y1 + VPXOR Y1, Y14, Y14 + VPSHUFB rot8_shuf<>+0(SB), Y14, Y14 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y15, Y15 + VPSHUFB rot8_shuf<>+0(SB), Y15, Y15 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y8, Y8 + VPSHUFB rot8_shuf<>+0(SB), Y8, Y8 + VPADDD Y9, Y12, Y12 + VPXOR Y12, Y4, Y4 + VPADDD Y14, Y13, Y13 + VPXOR Y13, Y5, Y5 + VPADDD Y15, Y10, Y10 + VPXOR Y10, Y6, Y6 + VPADDD Y8, Y11, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQA Y7, (R10) + VPSRLD $0x07, Y4, Y7 + VPSLLD $0x19, Y4, 
Y4 + VPOR Y7, Y4, Y4 + VPSRLD $0x07, Y5, Y7 + VPSLLD $0x19, Y5, Y5 + VPOR Y7, Y5, Y5 + VPSRLD $0x07, Y6, Y7 + VPSLLD $0x19, Y6, Y6 + VPOR Y7, Y6, Y6 + VPSRLD $0x07, Y0, Y7 + VPSLLD $0x19, Y0, Y0 + VPOR Y7, Y0, Y0 + + // Finalize rounds + VPXOR Y9, Y6, Y6 + VPXOR (R10), Y10, Y7 + VPXOR Y11, Y1, Y1 + VPXOR Y12, Y2, Y2 + VPXOR Y13, Y3, Y3 + VPXOR Y14, Y0, Y0 + VPXOR Y15, Y4, Y4 + VPXOR Y8, Y5, Y5 + + // Fix up registers for next iteration + VMOVDQU Y7, Y8 + VMOVDQU Y6, Y7 + VMOVDQU Y5, Y6 + VMOVDQU Y4, Y5 + VMOVDQU Y0, Y4 + VMOVDQU Y8, Y0 + + // If we have zero complete chunks, we're done + CMPQ R8, $0x00 + JNE loop_trailer + CMPQ R9, CX + JEQ finalize + +loop_trailer: + // Increment, reset flags, and loop + CMPQ CX, $0x000003c0 + JEQ finalize + ADDQ $0x40, CX + MOVL BX, 64(SP) + JMP loop + +finalize: + // Store result into out + VMOVDQU Y0, (SI) + VMOVDQU Y1, 32(SI) + VMOVDQU Y2, 64(SI) + VMOVDQU Y3, 96(SI) + VMOVDQU Y4, 128(SI) + VMOVDQU Y5, 160(SI) + VMOVDQU Y6, 192(SI) + VMOVDQU Y7, 224(SI) + VZEROUPPER + RET + +// func HashP(left *[32]uint32, right *[32]uint32, flags uint8, key *[8]uint32, out *[32]uint32, n int) +// Requires: AVX, AVX2 +TEXT ·HashP(SB), NOSPLIT, $72-48 + MOVQ left+0(FP), AX + MOVQ right+8(FP), CX + MOVBLZX flags+16(FP), DX + MOVQ key+24(FP), BX + MOVQ out+32(FP), BP + + // Allocate local space and align it + LEAQ 31(SP), SI + MOVQ $0x000000000000001f, DI + NOTQ DI + ANDQ DI, SI + + // Set up flags value + MOVL DX, 64(SP) + + // Perform the rounds + // Round 1 + VPBROADCASTD (BX), Y0 + VPADDD (AX), Y0, Y0 + VPBROADCASTD 4(BX), Y1 + VPADDD 64(AX), Y1, Y1 + VPBROADCASTD 8(BX), Y2 + VPADDD 128(AX), Y2, Y2 + VPBROADCASTD 12(BX), Y3 + VPADDD 192(AX), Y3, Y3 + VPBROADCASTD 16(BX), Y4 + VPADDD Y4, Y0, Y0 + VMOVDQU zero<>+0(SB), Y5 + VPXOR Y0, Y5, Y5 + VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 + VPBROADCASTD 20(BX), Y6 + VPADDD Y6, Y1, Y1 + VMOVDQU zero<>+0(SB), Y7 + VPXOR Y1, Y7, Y7 + VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 + VPBROADCASTD 24(BX), Y8 + VPADDD Y8, Y2, Y2 + VMOVDQU block_len<>+0(SB), Y9 + VPXOR Y2, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPBROADCASTD 28(BX), Y10 + VPADDD Y10, Y3, Y3 + VPBROADCASTD 64(SP), Y11 + VPXOR Y3, Y11, Y11 + VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 + VPBROADCASTD iv<>+0(SB), Y12 + VPADDD Y5, Y12, Y12 + VPXOR Y12, Y4, Y4 + VPBROADCASTD iv<>+4(SB), Y13 + VPADDD Y7, Y13, Y13 + VPXOR Y13, Y6, Y6 + VPBROADCASTD iv<>+8(SB), Y14 + VPADDD Y9, Y14, Y14 + VPXOR Y14, Y8, Y8 + VPBROADCASTD iv<>+12(SB), Y15 + VPADDD Y11, Y15, Y15 + VPXOR Y15, Y10, Y10 + VMOVDQA Y0, (SI) + VPSRLD $0x0c, Y4, Y0 + VPSLLD $0x14, Y4, Y4 + VPOR Y0, Y4, Y0 + VPSRLD $0x0c, Y6, Y4 + VPSLLD $0x14, Y6, Y6 + VPOR Y4, Y6, Y4 + VPSRLD $0x0c, Y8, Y6 + VPSLLD $0x14, Y8, Y8 + VPOR Y6, Y8, Y6 + VPSRLD $0x0c, Y10, Y8 + VPSLLD $0x14, Y10, Y10 + VPOR Y8, Y10, Y8 + VMOVDQA (SI), Y10 + VPADDD 32(AX), Y10, Y10 + VPADDD 96(AX), Y1, Y1 + VPADDD 160(AX), Y2, Y2 + VPADDD 224(AX), Y3, Y3 + VPADDD Y0, Y10, Y10 + VPXOR Y10, Y5, Y5 + VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y7, Y7 + VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y8, Y3, Y3 + VPXOR Y3, Y11, Y11 + VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 + VPADDD Y5, Y12, Y12 + VPXOR Y12, Y0, Y0 + VPADDD Y7, Y13, Y13 + VPXOR Y13, Y4, Y4 + VPADDD Y9, Y14, Y14 + VPXOR Y14, Y6, Y6 + VPADDD Y11, Y15, Y15 + VPXOR Y15, Y8, Y8 + VMOVDQA Y10, (SI) + VPSRLD $0x07, Y0, Y10 + VPSLLD $0x19, Y0, Y0 + VPOR Y10, Y0, Y0 + VPSRLD $0x07, Y4, Y10 + VPSLLD $0x19, Y4, Y4 + VPOR Y10, Y4, Y4 + 
VPSRLD $0x07, Y6, Y10 + VPSLLD $0x19, Y6, Y6 + VPOR Y10, Y6, Y6 + VPSRLD $0x07, Y8, Y10 + VPSLLD $0x19, Y8, Y8 + VPOR Y10, Y8, Y8 + VMOVDQA (SI), Y10 + VPADDD (CX), Y10, Y10 + VPADDD 64(CX), Y1, Y1 + VPADDD 128(CX), Y2, Y2 + VPADDD 192(CX), Y3, Y3 + VPADDD Y4, Y10, Y10 + VPXOR Y10, Y11, Y11 + VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 + VPADDD Y6, Y1, Y1 + VPXOR Y1, Y5, Y5 + VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 + VPADDD Y8, Y2, Y2 + VPXOR Y2, Y7, Y7 + VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y11, Y14, Y14 + VPXOR Y14, Y4, Y4 + VPADDD Y5, Y15, Y15 + VPXOR Y15, Y6, Y6 + VPADDD Y7, Y12, Y12 + VPXOR Y12, Y8, Y8 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y0, Y0 + VMOVDQA Y10, (SI) + VPSRLD $0x0c, Y4, Y10 + VPSLLD $0x14, Y4, Y4 + VPOR Y10, Y4, Y4 + VPSRLD $0x0c, Y6, Y10 + VPSLLD $0x14, Y6, Y6 + VPOR Y10, Y6, Y6 + VPSRLD $0x0c, Y8, Y10 + VPSLLD $0x14, Y8, Y8 + VPOR Y10, Y8, Y8 + VPSRLD $0x0c, Y0, Y10 + VPSLLD $0x14, Y0, Y0 + VPOR Y10, Y0, Y0 + VMOVDQA (SI), Y10 + VPADDD 32(CX), Y10, Y10 + VPADDD 96(CX), Y1, Y1 + VPADDD 160(CX), Y2, Y2 + VPADDD 224(CX), Y3, Y3 + VPADDD Y4, Y10, Y10 + VPXOR Y10, Y11, Y11 + VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 + VPADDD Y6, Y1, Y1 + VPXOR Y1, Y5, Y5 + VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 + VPADDD Y8, Y2, Y2 + VPXOR Y2, Y7, Y7 + VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y11, Y14, Y14 + VPXOR Y14, Y4, Y4 + VPADDD Y5, Y15, Y15 + VPXOR Y15, Y6, Y6 + VPADDD Y7, Y12, Y12 + VPXOR Y12, Y8, Y8 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y0, Y0 + VMOVDQA Y10, (SI) + VPSRLD $0x07, Y4, Y10 + VPSLLD $0x19, Y4, Y4 + VPOR Y10, Y4, Y4 + VPSRLD $0x07, Y6, Y10 + VPSLLD $0x19, Y6, Y6 + VPOR Y10, Y6, Y6 + VPSRLD $0x07, Y8, Y10 + VPSLLD $0x19, Y8, Y8 + VPOR Y10, Y8, Y8 + VPSRLD $0x07, Y0, Y10 + VPSLLD $0x19, Y0, Y0 + VPOR Y10, Y0, Y0 + + // Round 2 + VMOVDQA (SI), Y10 + VPADDD 64(AX), Y10, Y10 + VPADDD 96(AX), Y1, Y1 + VPADDD 224(AX), Y2, Y2 + VPADDD 128(AX), Y3, Y3 + VPADDD Y0, Y10, Y10 + VPXOR Y10, Y5, Y5 + VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y7, Y7 + VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y8, Y3, Y3 + VPXOR Y3, Y11, Y11 + VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 + VPADDD Y5, Y12, Y12 + VPXOR Y12, Y0, Y0 + VPADDD Y7, Y13, Y13 + VPXOR Y13, Y4, Y4 + VPADDD Y9, Y14, Y14 + VPXOR Y14, Y6, Y6 + VPADDD Y11, Y15, Y15 + VPXOR Y15, Y8, Y8 + VMOVDQA Y10, (SI) + VPSRLD $0x0c, Y0, Y10 + VPSLLD $0x14, Y0, Y0 + VPOR Y10, Y0, Y0 + VPSRLD $0x0c, Y4, Y10 + VPSLLD $0x14, Y4, Y4 + VPOR Y10, Y4, Y4 + VPSRLD $0x0c, Y6, Y10 + VPSLLD $0x14, Y6, Y6 + VPOR Y10, Y6, Y6 + VPSRLD $0x0c, Y8, Y10 + VPSLLD $0x14, Y8, Y8 + VPOR Y10, Y8, Y8 + VMOVDQA (SI), Y10 + VPADDD 192(AX), Y10, Y10 + VPADDD 64(CX), Y1, Y1 + VPADDD (AX), Y2, Y2 + VPADDD 160(CX), Y3, Y3 + VPADDD Y0, Y10, Y10 + VPXOR Y10, Y5, Y5 + VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y7, Y7 + VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y8, Y3, Y3 + VPXOR Y3, Y11, Y11 + VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 + VPADDD Y5, Y12, Y12 + VPXOR Y12, Y0, Y0 + VPADDD Y7, Y13, Y13 + VPXOR Y13, Y4, Y4 + VPADDD Y9, Y14, Y14 + VPXOR Y14, Y6, Y6 + VPADDD Y11, Y15, Y15 + VPXOR Y15, Y8, Y8 + VMOVDQA Y10, (SI) + VPSRLD $0x07, Y0, Y10 + VPSLLD $0x19, Y0, Y0 + VPOR Y10, Y0, Y0 + VPSRLD $0x07, Y4, Y10 + VPSLLD $0x19, Y4, Y4 + VPOR Y10, Y4, Y4 + VPSRLD $0x07, 
Y6, Y10 + VPSLLD $0x19, Y6, Y6 + VPOR Y10, Y6, Y6 + VPSRLD $0x07, Y8, Y10 + VPSLLD $0x19, Y8, Y8 + VPOR Y10, Y8, Y8 + VMOVDQA (SI), Y10 + VPADDD 32(AX), Y10, Y10 + VPADDD 128(CX), Y1, Y1 + VPADDD 32(CX), Y2, Y2 + VPADDD 224(CX), Y3, Y3 + VPADDD Y4, Y10, Y10 + VPXOR Y10, Y11, Y11 + VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 + VPADDD Y6, Y1, Y1 + VPXOR Y1, Y5, Y5 + VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 + VPADDD Y8, Y2, Y2 + VPXOR Y2, Y7, Y7 + VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y11, Y14, Y14 + VPXOR Y14, Y4, Y4 + VPADDD Y5, Y15, Y15 + VPXOR Y15, Y6, Y6 + VPADDD Y7, Y12, Y12 + VPXOR Y12, Y8, Y8 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y0, Y0 + VMOVDQA Y10, (SI) + VPSRLD $0x0c, Y4, Y10 + VPSLLD $0x14, Y4, Y4 + VPOR Y10, Y4, Y4 + VPSRLD $0x0c, Y6, Y10 + VPSLLD $0x14, Y6, Y6 + VPOR Y10, Y6, Y6 + VPSRLD $0x0c, Y8, Y10 + VPSLLD $0x14, Y8, Y8 + VPOR Y10, Y8, Y8 + VPSRLD $0x0c, Y0, Y10 + VPSLLD $0x14, Y0, Y0 + VPOR Y10, Y0, Y0 + VMOVDQA (SI), Y10 + VPADDD 96(CX), Y10, Y10 + VPADDD 160(AX), Y1, Y1 + VPADDD 192(CX), Y2, Y2 + VPADDD (CX), Y3, Y3 + VPADDD Y4, Y10, Y10 + VPXOR Y10, Y11, Y11 + VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 + VPADDD Y6, Y1, Y1 + VPXOR Y1, Y5, Y5 + VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 + VPADDD Y8, Y2, Y2 + VPXOR Y2, Y7, Y7 + VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y11, Y14, Y14 + VPXOR Y14, Y4, Y4 + VPADDD Y5, Y15, Y15 + VPXOR Y15, Y6, Y6 + VPADDD Y7, Y12, Y12 + VPXOR Y12, Y8, Y8 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y0, Y0 + VMOVDQA Y10, (SI) + VPSRLD $0x07, Y4, Y10 + VPSLLD $0x19, Y4, Y4 + VPOR Y10, Y4, Y4 + VPSRLD $0x07, Y6, Y10 + VPSLLD $0x19, Y6, Y6 + VPOR Y10, Y6, Y6 + VPSRLD $0x07, Y8, Y10 + VPSLLD $0x19, Y8, Y8 + VPOR Y10, Y8, Y8 + VPSRLD $0x07, Y0, Y10 + VPSLLD $0x19, Y0, Y0 + VPOR Y10, Y0, Y0 + + // Round 3 + VMOVDQA (SI), Y10 + VPADDD 96(AX), Y10, Y10 + VPADDD 64(CX), Y1, Y1 + VPADDD 160(CX), Y2, Y2 + VPADDD 224(AX), Y3, Y3 + VPADDD Y0, Y10, Y10 + VPXOR Y10, Y5, Y5 + VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y7, Y7 + VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y8, Y3, Y3 + VPXOR Y3, Y11, Y11 + VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 + VPADDD Y5, Y12, Y12 + VPXOR Y12, Y0, Y0 + VPADDD Y7, Y13, Y13 + VPXOR Y13, Y4, Y4 + VPADDD Y9, Y14, Y14 + VPXOR Y14, Y6, Y6 + VPADDD Y11, Y15, Y15 + VPXOR Y15, Y8, Y8 + VMOVDQA Y10, (SI) + VPSRLD $0x0c, Y0, Y10 + VPSLLD $0x14, Y0, Y0 + VPOR Y10, Y0, Y0 + VPSRLD $0x0c, Y4, Y10 + VPSLLD $0x14, Y4, Y4 + VPOR Y10, Y4, Y4 + VPSRLD $0x0c, Y6, Y10 + VPSLLD $0x14, Y6, Y6 + VPOR Y10, Y6, Y6 + VPSRLD $0x0c, Y8, Y10 + VPSLLD $0x14, Y8, Y8 + VPOR Y10, Y8, Y8 + VMOVDQA (SI), Y10 + VPADDD 128(AX), Y10, Y10 + VPADDD 128(CX), Y1, Y1 + VPADDD 64(AX), Y2, Y2 + VPADDD 192(CX), Y3, Y3 + VPADDD Y0, Y10, Y10 + VPXOR Y10, Y5, Y5 + VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y7, Y7 + VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y8, Y3, Y3 + VPXOR Y3, Y11, Y11 + VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 + VPADDD Y5, Y12, Y12 + VPXOR Y12, Y0, Y0 + VPADDD Y7, Y13, Y13 + VPXOR Y13, Y4, Y4 + VPADDD Y9, Y14, Y14 + VPXOR Y14, Y6, Y6 + VPADDD Y11, Y15, Y15 + VPXOR Y15, Y8, Y8 + VMOVDQA Y10, (SI) + VPSRLD $0x07, Y0, Y10 + VPSLLD $0x19, Y0, Y0 + VPOR Y10, Y0, Y0 + VPSRLD $0x07, Y4, Y10 + VPSLLD $0x19, Y4, Y4 + VPOR Y10, Y4, Y4 + VPSRLD $0x07, Y6, Y10 + 
VPSLLD $0x19, Y6, Y6 + VPOR Y10, Y6, Y6 + VPSRLD $0x07, Y8, Y10 + VPSLLD $0x19, Y8, Y8 + VPOR Y10, Y8, Y8 + VMOVDQA (SI), Y10 + VPADDD 192(AX), Y10, Y10 + VPADDD 32(CX), Y1, Y1 + VPADDD 96(CX), Y2, Y2 + VPADDD (CX), Y3, Y3 + VPADDD Y4, Y10, Y10 + VPXOR Y10, Y11, Y11 + VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 + VPADDD Y6, Y1, Y1 + VPXOR Y1, Y5, Y5 + VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 + VPADDD Y8, Y2, Y2 + VPXOR Y2, Y7, Y7 + VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y11, Y14, Y14 + VPXOR Y14, Y4, Y4 + VPADDD Y5, Y15, Y15 + VPXOR Y15, Y6, Y6 + VPADDD Y7, Y12, Y12 + VPXOR Y12, Y8, Y8 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y0, Y0 + VMOVDQA Y10, (SI) + VPSRLD $0x0c, Y4, Y10 + VPSLLD $0x14, Y4, Y4 + VPOR Y10, Y4, Y4 + VPSRLD $0x0c, Y6, Y10 + VPSLLD $0x14, Y6, Y6 + VPOR Y10, Y6, Y6 + VPSRLD $0x0c, Y8, Y10 + VPSLLD $0x14, Y8, Y8 + VPOR Y10, Y8, Y8 + VPSRLD $0x0c, Y0, Y10 + VPSLLD $0x14, Y0, Y0 + VPOR Y10, Y0, Y0 + VMOVDQA (SI), Y10 + VPADDD 160(AX), Y10, Y10 + VPADDD (AX), Y1, Y1 + VPADDD 224(CX), Y2, Y2 + VPADDD 32(AX), Y3, Y3 + VPADDD Y4, Y10, Y10 + VPXOR Y10, Y11, Y11 + VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 + VPADDD Y6, Y1, Y1 + VPXOR Y1, Y5, Y5 + VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 + VPADDD Y8, Y2, Y2 + VPXOR Y2, Y7, Y7 + VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y11, Y14, Y14 + VPXOR Y14, Y4, Y4 + VPADDD Y5, Y15, Y15 + VPXOR Y15, Y6, Y6 + VPADDD Y7, Y12, Y12 + VPXOR Y12, Y8, Y8 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y0, Y0 + VMOVDQA Y10, (SI) + VPSRLD $0x07, Y4, Y10 + VPSLLD $0x19, Y4, Y4 + VPOR Y10, Y4, Y4 + VPSRLD $0x07, Y6, Y10 + VPSLLD $0x19, Y6, Y6 + VPOR Y10, Y6, Y6 + VPSRLD $0x07, Y8, Y10 + VPSLLD $0x19, Y8, Y8 + VPOR Y10, Y8, Y8 + VPSRLD $0x07, Y0, Y10 + VPSLLD $0x19, Y0, Y0 + VPOR Y10, Y0, Y0 + + // Round 4 + VMOVDQA (SI), Y10 + VPADDD 64(CX), Y10, Y10 + VPADDD 128(CX), Y1, Y1 + VPADDD 192(CX), Y2, Y2 + VPADDD 160(CX), Y3, Y3 + VPADDD Y0, Y10, Y10 + VPXOR Y10, Y5, Y5 + VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y7, Y7 + VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y8, Y3, Y3 + VPXOR Y3, Y11, Y11 + VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 + VPADDD Y5, Y12, Y12 + VPXOR Y12, Y0, Y0 + VPADDD Y7, Y13, Y13 + VPXOR Y13, Y4, Y4 + VPADDD Y9, Y14, Y14 + VPXOR Y14, Y6, Y6 + VPADDD Y11, Y15, Y15 + VPXOR Y15, Y8, Y8 + VMOVDQA Y10, (SI) + VPSRLD $0x0c, Y0, Y10 + VPSLLD $0x14, Y0, Y0 + VPOR Y10, Y0, Y0 + VPSRLD $0x0c, Y4, Y10 + VPSLLD $0x14, Y4, Y4 + VPOR Y10, Y4, Y4 + VPSRLD $0x0c, Y6, Y10 + VPSLLD $0x14, Y6, Y6 + VPOR Y10, Y6, Y6 + VPSRLD $0x0c, Y8, Y10 + VPSLLD $0x14, Y8, Y8 + VPOR Y10, Y8, Y8 + VMOVDQA (SI), Y10 + VPADDD 224(AX), Y10, Y10 + VPADDD 32(CX), Y1, Y1 + VPADDD 96(AX), Y2, Y2 + VPADDD 224(CX), Y3, Y3 + VPADDD Y0, Y10, Y10 + VPXOR Y10, Y5, Y5 + VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y7, Y7 + VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y8, Y3, Y3 + VPXOR Y3, Y11, Y11 + VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 + VPADDD Y5, Y12, Y12 + VPXOR Y12, Y0, Y0 + VPADDD Y7, Y13, Y13 + VPXOR Y13, Y4, Y4 + VPADDD Y9, Y14, Y14 + VPXOR Y14, Y6, Y6 + VPADDD Y11, Y15, Y15 + VPXOR Y15, Y8, Y8 + VMOVDQA Y10, (SI) + VPSRLD $0x07, Y0, Y10 + VPSLLD $0x19, Y0, Y0 + VPOR Y10, Y0, Y0 + VPSRLD $0x07, Y4, Y10 + VPSLLD $0x19, Y4, Y4 + VPOR Y10, Y4, Y4 + VPSRLD $0x07, Y6, Y10 + VPSLLD $0x19, 
Y6, Y6 + VPOR Y10, Y6, Y6 + VPSRLD $0x07, Y8, Y10 + VPSLLD $0x19, Y8, Y8 + VPOR Y10, Y8, Y8 + VMOVDQA (SI), Y10 + VPADDD 128(AX), Y10, Y10 + VPADDD 96(CX), Y1, Y1 + VPADDD 160(AX), Y2, Y2 + VPADDD 32(AX), Y3, Y3 + VPADDD Y4, Y10, Y10 + VPXOR Y10, Y11, Y11 + VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 + VPADDD Y6, Y1, Y1 + VPXOR Y1, Y5, Y5 + VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 + VPADDD Y8, Y2, Y2 + VPXOR Y2, Y7, Y7 + VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y11, Y14, Y14 + VPXOR Y14, Y4, Y4 + VPADDD Y5, Y15, Y15 + VPXOR Y15, Y6, Y6 + VPADDD Y7, Y12, Y12 + VPXOR Y12, Y8, Y8 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y0, Y0 + VMOVDQA Y10, (SI) + VPSRLD $0x0c, Y4, Y10 + VPSLLD $0x14, Y4, Y4 + VPOR Y10, Y4, Y4 + VPSRLD $0x0c, Y6, Y10 + VPSLLD $0x14, Y6, Y6 + VPOR Y10, Y6, Y6 + VPSRLD $0x0c, Y8, Y10 + VPSLLD $0x14, Y8, Y8 + VPOR Y10, Y8, Y8 + VPSRLD $0x0c, Y0, Y10 + VPSLLD $0x14, Y0, Y0 + VPOR Y10, Y0, Y0 + VMOVDQA (SI), Y10 + VPADDD (AX), Y10, Y10 + VPADDD 64(AX), Y1, Y1 + VPADDD (CX), Y2, Y2 + VPADDD 192(AX), Y3, Y3 + VPADDD Y4, Y10, Y10 + VPXOR Y10, Y11, Y11 + VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 + VPADDD Y6, Y1, Y1 + VPXOR Y1, Y5, Y5 + VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 + VPADDD Y8, Y2, Y2 + VPXOR Y2, Y7, Y7 + VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y11, Y14, Y14 + VPXOR Y14, Y4, Y4 + VPADDD Y5, Y15, Y15 + VPXOR Y15, Y6, Y6 + VPADDD Y7, Y12, Y12 + VPXOR Y12, Y8, Y8 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y0, Y0 + VMOVDQA Y10, (SI) + VPSRLD $0x07, Y4, Y10 + VPSLLD $0x19, Y4, Y4 + VPOR Y10, Y4, Y4 + VPSRLD $0x07, Y6, Y10 + VPSLLD $0x19, Y6, Y6 + VPOR Y10, Y6, Y6 + VPSRLD $0x07, Y8, Y10 + VPSLLD $0x19, Y8, Y8 + VPOR Y10, Y8, Y8 + VPSRLD $0x07, Y0, Y10 + VPSLLD $0x19, Y0, Y0 + VPOR Y10, Y0, Y0 + + // Round 5 + VMOVDQA (SI), Y10 + VPADDD 128(CX), Y10, Y10 + VPADDD 32(CX), Y1, Y1 + VPADDD 224(CX), Y2, Y2 + VPADDD 192(CX), Y3, Y3 + VPADDD Y0, Y10, Y10 + VPXOR Y10, Y5, Y5 + VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y7, Y7 + VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y8, Y3, Y3 + VPXOR Y3, Y11, Y11 + VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 + VPADDD Y5, Y12, Y12 + VPXOR Y12, Y0, Y0 + VPADDD Y7, Y13, Y13 + VPXOR Y13, Y4, Y4 + VPADDD Y9, Y14, Y14 + VPXOR Y14, Y6, Y6 + VPADDD Y11, Y15, Y15 + VPXOR Y15, Y8, Y8 + VMOVDQA Y10, (SI) + VPSRLD $0x0c, Y0, Y10 + VPSLLD $0x14, Y0, Y0 + VPOR Y10, Y0, Y0 + VPSRLD $0x0c, Y4, Y10 + VPSLLD $0x14, Y4, Y4 + VPOR Y10, Y4, Y4 + VPSRLD $0x0c, Y6, Y10 + VPSLLD $0x14, Y6, Y6 + VPOR Y10, Y6, Y6 + VPSRLD $0x0c, Y8, Y10 + VPSLLD $0x14, Y8, Y8 + VPOR Y10, Y8, Y8 + VMOVDQA (SI), Y10 + VPADDD 160(CX), Y10, Y10 + VPADDD 96(CX), Y1, Y1 + VPADDD 64(CX), Y2, Y2 + VPADDD (CX), Y3, Y3 + VPADDD Y0, Y10, Y10 + VPXOR Y10, Y5, Y5 + VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y7, Y7 + VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y8, Y3, Y3 + VPXOR Y3, Y11, Y11 + VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 + VPADDD Y5, Y12, Y12 + VPXOR Y12, Y0, Y0 + VPADDD Y7, Y13, Y13 + VPXOR Y13, Y4, Y4 + VPADDD Y9, Y14, Y14 + VPXOR Y14, Y6, Y6 + VPADDD Y11, Y15, Y15 + VPXOR Y15, Y8, Y8 + VMOVDQA Y10, (SI) + VPSRLD $0x07, Y0, Y10 + VPSLLD $0x19, Y0, Y0 + VPOR Y10, Y0, Y0 + VPSRLD $0x07, Y4, Y10 + VPSLLD $0x19, Y4, Y4 + VPOR Y10, Y4, Y4 + VPSRLD $0x07, Y6, Y10 + VPSLLD $0x19, Y6, Y6 + VPOR Y10, 
Y6, Y6 + VPSRLD $0x07, Y8, Y10 + VPSLLD $0x19, Y8, Y8 + VPOR Y10, Y8, Y8 + VMOVDQA (SI), Y10 + VPADDD 224(AX), Y10, Y10 + VPADDD 160(AX), Y1, Y1 + VPADDD (AX), Y2, Y2 + VPADDD 192(AX), Y3, Y3 + VPADDD Y4, Y10, Y10 + VPXOR Y10, Y11, Y11 + VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 + VPADDD Y6, Y1, Y1 + VPXOR Y1, Y5, Y5 + VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 + VPADDD Y8, Y2, Y2 + VPXOR Y2, Y7, Y7 + VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y11, Y14, Y14 + VPXOR Y14, Y4, Y4 + VPADDD Y5, Y15, Y15 + VPXOR Y15, Y6, Y6 + VPADDD Y7, Y12, Y12 + VPXOR Y12, Y8, Y8 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y0, Y0 + VMOVDQA Y10, (SI) + VPSRLD $0x0c, Y4, Y10 + VPSLLD $0x14, Y4, Y4 + VPOR Y10, Y4, Y4 + VPSRLD $0x0c, Y6, Y10 + VPSLLD $0x14, Y6, Y6 + VPOR Y10, Y6, Y6 + VPSRLD $0x0c, Y8, Y10 + VPSLLD $0x14, Y8, Y8 + VPOR Y10, Y8, Y8 + VPSRLD $0x0c, Y0, Y10 + VPSLLD $0x14, Y0, Y0 + VPOR Y10, Y0, Y0 + VMOVDQA (SI), Y10 + VPADDD 64(AX), Y10, Y10 + VPADDD 96(AX), Y1, Y1 + VPADDD 32(AX), Y2, Y2 + VPADDD 128(AX), Y3, Y3 + VPADDD Y4, Y10, Y10 + VPXOR Y10, Y11, Y11 + VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 + VPADDD Y6, Y1, Y1 + VPXOR Y1, Y5, Y5 + VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 + VPADDD Y8, Y2, Y2 + VPXOR Y2, Y7, Y7 + VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y11, Y14, Y14 + VPXOR Y14, Y4, Y4 + VPADDD Y5, Y15, Y15 + VPXOR Y15, Y6, Y6 + VPADDD Y7, Y12, Y12 + VPXOR Y12, Y8, Y8 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y0, Y0 + VMOVDQA Y10, (SI) + VPSRLD $0x07, Y4, Y10 + VPSLLD $0x19, Y4, Y4 + VPOR Y10, Y4, Y4 + VPSRLD $0x07, Y6, Y10 + VPSLLD $0x19, Y6, Y6 + VPOR Y10, Y6, Y6 + VPSRLD $0x07, Y8, Y10 + VPSLLD $0x19, Y8, Y8 + VPOR Y10, Y8, Y8 + VPSRLD $0x07, Y0, Y10 + VPSLLD $0x19, Y0, Y0 + VPOR Y10, Y0, Y0 + + // Round 6 + VMOVDQA (SI), Y10 + VPADDD 32(CX), Y10, Y10 + VPADDD 96(CX), Y1, Y1 + VPADDD (CX), Y2, Y2 + VPADDD 224(CX), Y3, Y3 + VPADDD Y0, Y10, Y10 + VPXOR Y10, Y5, Y5 + VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y7, Y7 + VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y8, Y3, Y3 + VPXOR Y3, Y11, Y11 + VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 + VPADDD Y5, Y12, Y12 + VPXOR Y12, Y0, Y0 + VPADDD Y7, Y13, Y13 + VPXOR Y13, Y4, Y4 + VPADDD Y9, Y14, Y14 + VPXOR Y14, Y6, Y6 + VPADDD Y11, Y15, Y15 + VPXOR Y15, Y8, Y8 + VMOVDQA Y10, (SI) + VPSRLD $0x0c, Y0, Y10 + VPSLLD $0x14, Y0, Y0 + VPOR Y10, Y0, Y0 + VPSRLD $0x0c, Y4, Y10 + VPSLLD $0x14, Y4, Y4 + VPOR Y10, Y4, Y4 + VPSRLD $0x0c, Y6, Y10 + VPSLLD $0x14, Y6, Y6 + VPOR Y10, Y6, Y6 + VPSRLD $0x0c, Y8, Y10 + VPSLLD $0x14, Y8, Y8 + VPOR Y10, Y8, Y8 + VMOVDQA (SI), Y10 + VPADDD 192(CX), Y10, Y10 + VPADDD 160(AX), Y1, Y1 + VPADDD 128(CX), Y2, Y2 + VPADDD 32(AX), Y3, Y3 + VPADDD Y0, Y10, Y10 + VPXOR Y10, Y5, Y5 + VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y7, Y7 + VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y8, Y3, Y3 + VPXOR Y3, Y11, Y11 + VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 + VPADDD Y5, Y12, Y12 + VPXOR Y12, Y0, Y0 + VPADDD Y7, Y13, Y13 + VPXOR Y13, Y4, Y4 + VPADDD Y9, Y14, Y14 + VPXOR Y14, Y6, Y6 + VPADDD Y11, Y15, Y15 + VPXOR Y15, Y8, Y8 + VMOVDQA Y10, (SI) + VPSRLD $0x07, Y0, Y10 + VPSLLD $0x19, Y0, Y0 + VPOR Y10, Y0, Y0 + VPSRLD $0x07, Y4, Y10 + VPSLLD $0x19, Y4, Y4 + VPOR Y10, Y4, Y4 + VPSRLD $0x07, Y6, Y10 + VPSLLD $0x19, Y6, Y6 + VPOR Y10, Y6, Y6 + VPSRLD 
$0x07, Y8, Y10 + VPSLLD $0x19, Y8, Y8 + VPOR Y10, Y8, Y8 + VMOVDQA (SI), Y10 + VPADDD 160(CX), Y10, Y10 + VPADDD (AX), Y1, Y1 + VPADDD 64(AX), Y2, Y2 + VPADDD 128(AX), Y3, Y3 + VPADDD Y4, Y10, Y10 + VPXOR Y10, Y11, Y11 + VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 + VPADDD Y6, Y1, Y1 + VPXOR Y1, Y5, Y5 + VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 + VPADDD Y8, Y2, Y2 + VPXOR Y2, Y7, Y7 + VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y11, Y14, Y14 + VPXOR Y14, Y4, Y4 + VPADDD Y5, Y15, Y15 + VPXOR Y15, Y6, Y6 + VPADDD Y7, Y12, Y12 + VPXOR Y12, Y8, Y8 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y0, Y0 + VMOVDQA Y10, (SI) + VPSRLD $0x0c, Y4, Y10 + VPSLLD $0x14, Y4, Y4 + VPOR Y10, Y4, Y4 + VPSRLD $0x0c, Y6, Y10 + VPSLLD $0x14, Y6, Y6 + VPOR Y10, Y6, Y6 + VPSRLD $0x0c, Y8, Y10 + VPSLLD $0x14, Y8, Y8 + VPOR Y10, Y8, Y8 + VPSRLD $0x0c, Y0, Y10 + VPSLLD $0x14, Y0, Y0 + VPOR Y10, Y0, Y0 + VMOVDQA (SI), Y10 + VPADDD 96(AX), Y10, Y10 + VPADDD 64(CX), Y1, Y1 + VPADDD 192(AX), Y2, Y2 + VPADDD 224(AX), Y3, Y3 + VPADDD Y4, Y10, Y10 + VPXOR Y10, Y11, Y11 + VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 + VPADDD Y6, Y1, Y1 + VPXOR Y1, Y5, Y5 + VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 + VPADDD Y8, Y2, Y2 + VPXOR Y2, Y7, Y7 + VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 + VPADDD Y0, Y3, Y3 + VPXOR Y3, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y11, Y14, Y14 + VPXOR Y14, Y4, Y4 + VPADDD Y5, Y15, Y15 + VPXOR Y15, Y6, Y6 + VPADDD Y7, Y12, Y12 + VPXOR Y12, Y8, Y8 + VPADDD Y9, Y13, Y13 + VPXOR Y13, Y0, Y0 + VMOVDQA Y10, (SI) + VPSRLD $0x07, Y4, Y10 + VPSLLD $0x19, Y4, Y4 + VPOR Y10, Y4, Y4 + VPSRLD $0x07, Y6, Y10 + VPSLLD $0x19, Y6, Y6 + VPOR Y10, Y6, Y6 + VPSRLD $0x07, Y8, Y10 + VPSLLD $0x19, Y8, Y8 + VPOR Y10, Y8, Y8 + VPSRLD $0x07, Y0, Y10 + VPSLLD $0x19, Y0, Y0 + VPOR Y10, Y0, Y0 + + // Round 7 + VMOVDQA (SI), Y10 + VPADDD 96(CX), Y10, Y10 + VPADDD 160(AX), Y1, Y1 + VPADDD 32(AX), Y2, Y2 + VPADDD (CX), Y3, Y3 + VPADDD Y0, Y10, Y10 + VPXOR Y10, Y5, Y5 + VPSHUFB rot16_shuf<>+0(SB), Y5, Y5 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y7, Y7 + VPSHUFB rot16_shuf<>+0(SB), Y7, Y7 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y9, Y9 + VPSHUFB rot16_shuf<>+0(SB), Y9, Y9 + VPADDD Y8, Y3, Y3 + VPXOR Y3, Y11, Y11 + VPSHUFB rot16_shuf<>+0(SB), Y11, Y11 + VPADDD Y5, Y12, Y12 + VPXOR Y12, Y0, Y0 + VPADDD Y7, Y13, Y13 + VPXOR Y13, Y4, Y4 + VPADDD Y9, Y14, Y14 + VPXOR Y14, Y6, Y6 + VPADDD Y11, Y15, Y15 + VPXOR Y15, Y8, Y8 + VMOVDQA Y10, (SI) + VPSRLD $0x0c, Y0, Y10 + VPSLLD $0x14, Y0, Y0 + VPOR Y10, Y0, Y0 + VPSRLD $0x0c, Y4, Y10 + VPSLLD $0x14, Y4, Y4 + VPOR Y10, Y4, Y4 + VPSRLD $0x0c, Y6, Y10 + VPSLLD $0x14, Y6, Y6 + VPOR Y10, Y6, Y6 + VPSRLD $0x0c, Y8, Y10 + VPSLLD $0x14, Y8, Y8 + VPOR Y10, Y8, Y8 + VMOVDQA (SI), Y10 + VPADDD 224(CX), Y10, Y10 + VPADDD (AX), Y1, Y1 + VPADDD 32(CX), Y2, Y2 + VPADDD 192(AX), Y3, Y3 + VPADDD Y0, Y10, Y10 + VPXOR Y10, Y5, Y5 + VPSHUFB rot8_shuf<>+0(SB), Y5, Y5 + VPADDD Y4, Y1, Y1 + VPXOR Y1, Y7, Y7 + VPSHUFB rot8_shuf<>+0(SB), Y7, Y7 + VPADDD Y6, Y2, Y2 + VPXOR Y2, Y9, Y9 + VPSHUFB rot8_shuf<>+0(SB), Y9, Y9 + VPADDD Y8, Y3, Y3 + VPXOR Y3, Y11, Y11 + VPSHUFB rot8_shuf<>+0(SB), Y11, Y11 + VPADDD Y5, Y12, Y12 + VPXOR Y12, Y0, Y0 + VPADDD Y7, Y13, Y13 + VPXOR Y13, Y4, Y4 + VPADDD Y9, Y14, Y14 + VPXOR Y14, Y6, Y6 + VPADDD Y11, Y15, Y15 + VPXOR Y15, Y8, Y8 + VMOVDQA Y10, (SI) + VPSRLD $0x07, Y0, Y10 + VPSLLD $0x19, Y0, Y0 + VPOR Y10, Y0, Y0 + VPSRLD $0x07, Y4, Y10 + VPSLLD $0x19, Y4, Y4 + VPOR Y10, Y4, Y4 + VPSRLD $0x07, Y6, Y10 + VPSLLD $0x19, Y6, Y6 + VPOR Y10, Y6, Y6 + VPSRLD $0x07, Y8, Y10 + 
+	VPSLLD $0x19, Y8, Y8
+	VPOR Y10, Y8, Y8
+	VMOVDQA (SI), Y10
+	VPADDD 192(CX), Y10, Y10
+	VPADDD 64(AX), Y1, Y1
+	VPADDD 96(AX), Y2, Y2
+	VPADDD 224(AX), Y3, Y3
+	VPADDD Y4, Y10, Y10
+	VPXOR Y10, Y11, Y11
+	VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+	VPADDD Y6, Y1, Y1
+	VPXOR Y1, Y5, Y5
+	VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+	VPADDD Y8, Y2, Y2
+	VPXOR Y2, Y7, Y7
+	VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+	VPADDD Y0, Y3, Y3
+	VPXOR Y3, Y9, Y9
+	VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+	VPADDD Y11, Y14, Y14
+	VPXOR Y14, Y4, Y4
+	VPADDD Y5, Y15, Y15
+	VPXOR Y15, Y6, Y6
+	VPADDD Y7, Y12, Y12
+	VPXOR Y12, Y8, Y8
+	VPADDD Y9, Y13, Y13
+	VPXOR Y13, Y0, Y0
+	VMOVDQA Y10, (SI)
+	VPSRLD $0x0c, Y4, Y10
+	VPSLLD $0x14, Y4, Y4
+	VPOR Y10, Y4, Y4
+	VPSRLD $0x0c, Y6, Y10
+	VPSLLD $0x14, Y6, Y6
+	VPOR Y10, Y6, Y6
+	VPSRLD $0x0c, Y8, Y10
+	VPSLLD $0x14, Y8, Y8
+	VPOR Y10, Y8, Y8
+	VPSRLD $0x0c, Y0, Y10
+	VPSLLD $0x14, Y0, Y0
+	VPOR Y10, Y0, Y0
+	VMOVDQA (SI), Y10
+	VPADDD 64(CX), Y10, Y10
+	VPADDD 128(CX), Y1, Y1
+	VPADDD 128(AX), Y2, Y2
+	VPADDD 160(CX), Y3, Y3
+	VPADDD Y4, Y10, Y10
+	VPXOR Y10, Y11, Y11
+	VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+	VPADDD Y6, Y1, Y1
+	VPXOR Y1, Y5, Y5
+	VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+	VPADDD Y8, Y2, Y2
+	VPXOR Y2, Y7, Y7
+	VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+	VPADDD Y0, Y3, Y3
+	VPXOR Y3, Y9, Y9
+	VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+	VPADDD Y11, Y14, Y14
+	VPXOR Y14, Y4, Y4
+	VPADDD Y5, Y15, Y15
+	VPXOR Y15, Y6, Y6
+	VPADDD Y7, Y12, Y12
+	VPXOR Y12, Y8, Y8
+	VPADDD Y9, Y13, Y13
+	VPXOR Y13, Y0, Y0
+	VMOVDQA Y10, (SI)
+	VPSRLD $0x07, Y4, Y10
+	VPSLLD $0x19, Y4, Y4
+	VPOR Y10, Y4, Y4
+	VPSRLD $0x07, Y6, Y10
+	VPSLLD $0x19, Y6, Y6
+	VPOR Y10, Y6, Y6
+	VPSRLD $0x07, Y8, Y10
+	VPSLLD $0x19, Y8, Y8
+	VPOR Y10, Y8, Y8
+	VPSRLD $0x07, Y0, Y10
+	VPSLLD $0x19, Y0, Y0
+	VPOR Y10, Y0, Y0
+
+	// Finalize
+	VPXOR (SI), Y12, Y10
+	VPXOR Y13, Y1, Y1
+	VPXOR Y14, Y2, Y2
+	VPXOR Y15, Y3, Y3
+	VPXOR Y5, Y0, Y0
+	VPXOR Y7, Y4, Y4
+	VPXOR Y9, Y6, Y5
+	VPXOR Y11, Y8, Y6
+
+	// Store result into out
+	VMOVDQU Y10, (BP)
+	VMOVDQU Y1, 32(BP)
+	VMOVDQU Y2, 64(BP)
+	VMOVDQU Y3, 96(BP)
+	VMOVDQU Y0, 128(BP)
+	VMOVDQU Y4, 160(BP)
+	VMOVDQU Y5, 192(BP)
+	VMOVDQU Y6, 224(BP)
+	VZEROUPPER
+	RET
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_other.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_other.go
new file mode 100644
index 000000000..613972814
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_other.go
@@ -0,0 +1,13 @@
+// +build !amd64
+
+package hash_avx2
+
+import "github.com/zeebo/blake3/internal/alg/hash/hash_pure"
+
+func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) {
+	hash_pure.HashF(input, length, counter, flags, key, out, chain)
+}
+
+func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int) {
+	hash_pure.HashP(left, right, flags, key, out, n)
+}
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/stubs.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/stubs.go
new file mode 100644
index 000000000..10e949550
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/stubs.go
@@ -0,0 +1,9 @@
+// +build amd64
+
+package hash_avx2
+
+//go:noescape
+func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32)
+
+//go:noescape
+func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int)
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashf.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashf.go
new file mode 100644
index 000000000..0c6fd63cd
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashf.go
@@ -0,0 +1,56 @@
+package hash_pure
+
+import (
+	"unsafe"
+
+	"github.com/zeebo/blake3/internal/alg/compress"
+	"github.com/zeebo/blake3/internal/consts"
+	"github.com/zeebo/blake3/internal/utils"
+)
+
+func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) {
+	var tmp [16]uint32
+
+	for i := uint64(0); consts.ChunkLen*i < length && i < 8; i++ {
+		bchain := *key
+		bflags := flags | consts.Flag_ChunkStart
+		start := consts.ChunkLen * i
+
+		for n := uint64(0); n < 16; n++ {
+			if n == 15 {
+				bflags |= consts.Flag_ChunkEnd
+			}
+			if start+64*n >= length {
+				break
+			}
+			if start+64+64*n >= length {
+				*chain = bchain
+			}
+
+			var blockPtr *[16]uint32
+			if consts.IsLittleEndian {
+				blockPtr = (*[16]uint32)(unsafe.Pointer(&input[consts.ChunkLen*i+consts.BlockLen*n]))
+			} else {
+				var block [16]uint32
+				utils.BytesToWords((*[64]uint8)(unsafe.Pointer(&input[consts.ChunkLen*i+consts.BlockLen*n])), &block)
+				blockPtr = &block
+			}
+
+			compress.Compress(&bchain, blockPtr, counter, consts.BlockLen, bflags, &tmp)
+
+			bchain = *(*[8]uint32)(unsafe.Pointer(&tmp[0]))
+			bflags = flags
+		}
+
+		out[i+0] = bchain[0]
+		out[i+8] = bchain[1]
+		out[i+16] = bchain[2]
+		out[i+24] = bchain[3]
+		out[i+32] = bchain[4]
+		out[i+40] = bchain[5]
+		out[i+48] = bchain[6]
+		out[i+56] = bchain[7]
+
+		counter++
+	}
+}
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashp.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashp.go
new file mode 100644
index 000000000..bee5d8dd0
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashp.go
@@ -0,0 +1,38 @@
+package hash_pure
+
+import "github.com/zeebo/blake3/internal/alg/compress"
+
+func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int) {
+	var tmp [16]uint32
+	var block [16]uint32
+
+	for i := 0; i < n && i < 8; i++ {
+		block[0] = left[i+0]
+		block[1] = left[i+8]
+		block[2] = left[i+16]
+		block[3] = left[i+24]
+		block[4] = left[i+32]
+		block[5] = left[i+40]
+		block[6] = left[i+48]
+		block[7] = left[i+56]
+		block[8] = right[i+0]
+		block[9] = right[i+8]
+		block[10] = right[i+16]
+		block[11] = right[i+24]
+		block[12] = right[i+32]
+		block[13] = right[i+40]
+		block[14] = right[i+48]
+		block[15] = right[i+56]
+
+		compress.Compress(key, &block, 0, 64, flags, &tmp)
+
+		out[i+0] = tmp[0]
+		out[i+8] = tmp[1]
+		out[i+16] = tmp[2]
+		out[i+24] = tmp[3]
+		out[i+32] = tmp[4]
+		out[i+40] = tmp[5]
+		out[i+48] = tmp[6]
+		out[i+56] = tmp[7]
+	}
+}
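Editor's note on the output layout used above: both HashF and HashP write their 8 lanes transposed, so word w of lane i lands at out[i+8*w]. That is what the out[i+0], out[i+8], ..., out[i+56] stores in hashf.go and hashp.go implement, and it is consistent with the eight 32-byte VMOVDQU stores in the AVX2 "Store result into out" block. The snippet below is a minimal, hypothetical sketch of that layout only; laneChain, main, and the example values are not part of the vendored package.

package main

import "fmt"

// laneChain gathers the 8-word chaining value of lane i from a
// transposed [64]uint32 buffer of the kind HashF/HashP fill in:
// word w of lane i lives at index i+8*w.
func laneChain(out *[64]uint32, i int) [8]uint32 {
	var chain [8]uint32
	for w := 0; w < 8; w++ {
		chain[w] = out[i+8*w]
	}
	return chain
}

func main() {
	var out [64]uint32
	// Fill lane 3 the same way HashF stores a chaining value:
	// word w goes to index 3+8*w.
	for w := uint32(0); w < 8; w++ {
		out[3+8*w] = 0xabc00000 + w
	}
	fmt.Printf("%08x\n", laneChain(&out, 3))
}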