-rw-r--r--  go.mod                                                          4
-rw-r--r--  go.sum                                                         12
-rw-r--r--  vendor/golang.org/x/crypto/argon2/blamka_amd64.s             2972
-rw-r--r--  vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s       5167
-rw-r--r--  vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s           1681
-rw-r--r--  vendor/golang.org/x/crypto/blake2s/blake2s_amd64.s           2571
-rw-r--r--  vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s      133
-rw-r--r--  vendor/golang.org/x/sys/cpu/cpu.go                             19
-rw-r--r--  vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go                 2
-rw-r--r--  vendor/golang.org/x/sys/cpu/cpu_linux_riscv64.go              137
-rw-r--r--  vendor/golang.org/x/sys/cpu/cpu_riscv64.go                     11
-rw-r--r--  vendor/golang.org/x/sys/unix/mkerrors.sh                        1
-rw-r--r--  vendor/golang.org/x/sys/unix/syscall_darwin.go                 37
-rw-r--r--  vendor/golang.org/x/sys/unix/syscall_hurd.go                    1
-rw-r--r--  vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go            7
-rw-r--r--  vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go            7
-rw-r--r--  vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go               2
-rw-r--r--  vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go          20
-rw-r--r--  vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s            5
-rw-r--r--  vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go          20
-rw-r--r--  vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s            5
-rw-r--r--  vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go            13
-rw-r--r--  vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go            13
-rw-r--r--  vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go              1
-rw-r--r--  vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go            1
-rw-r--r--  vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go              1
-rw-r--r--  vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go            1
-rw-r--r--  vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go          1
-rw-r--r--  vendor/golang.org/x/sys/unix/ztypes_linux.go                    2
-rw-r--r--  vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go           33
-rw-r--r--  vendor/golang.org/x/sys/windows/syscall_windows.go              4
-rw-r--r--  vendor/golang.org/x/sys/windows/types_windows.go                1
-rw-r--r--  vendor/golang.org/x/sys/windows/zsyscall_windows.go            38
-rw-r--r--  vendor/modules.txt                                              4
34 files changed, 11278 insertions, 1649 deletions
diff --git a/go.mod b/go.mod
index 1ba0df6ac..0e9d80478 100644
--- a/go.mod
+++ b/go.mod
@@ -73,7 +73,7 @@ require (
go.opentelemetry.io/otel/sdk/metric v1.29.0
go.opentelemetry.io/otel/trace v1.29.0
go.uber.org/automaxprocs v1.5.3
- golang.org/x/crypto v0.26.0
+ golang.org/x/crypto v0.27.0
golang.org/x/image v0.20.0
golang.org/x/net v0.28.0
golang.org/x/oauth2 v0.23.0
@@ -215,7 +215,7 @@ require (
golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 // indirect
golang.org/x/mod v0.18.0 // indirect
golang.org/x/sync v0.8.0 // indirect
- golang.org/x/sys v0.24.0 // indirect
+ golang.org/x/sys v0.25.0 // indirect
golang.org/x/tools v0.22.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240822170219-fc7c04adadcd // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240822170219-fc7c04adadcd // indirect
diff --git a/go.sum b/go.sum
index 509d6074a..22383824d 100644
--- a/go.sum
+++ b/go.sum
@@ -674,8 +674,8 @@ golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8U
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.3.0/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4=
-golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw=
-golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54=
+golang.org/x/crypto v0.27.0 h1:GXm2NjJrPaiv/h1tb2UH8QfgC/hOf/+z0p6PT8o1w7A=
+golang.org/x/crypto v0.27.0/go.mod h1:1Xngt8kV6Dvbssa53Ziq6Eqn0HqbZi5Z6R0ZpwQzt70=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8=
@@ -806,13 +806,13 @@ golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg=
-golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34=
+golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc=
-golang.org/x/term v0.23.0 h1:F6D4vR+EHoL9/sWAWgAR1H2DcHr4PareCbAaCo1RpuU=
-golang.org/x/term v0.23.0/go.mod h1:DgV24QBUrK6jhZXl+20l6UWznPlwAHm1Q1mGHtydmSk=
+golang.org/x/term v0.24.0 h1:Mh5cbb+Zk2hqqXNO7S1iTjEphVL+jb8ZWaqh/g+JWkM=
+golang.org/x/term v0.24.0/go.mod h1:lOBK/LVxemqiMij05LGJ0tzNr8xlmwBRJ81PX6wVLH8=
golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
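The go.mod and go.sum hunks above record a routine vendored bump: golang.org/x/crypto v0.26.0 → v0.27.0 and golang.org/x/sys v0.24.0 → v0.25.0, with golang.org/x/term v0.23.0 → v0.24.0 also moving in go.sum. A bump like this is typically produced with `go get golang.org/x/crypto@v0.27.0 golang.org/x/sys@v0.25.0` followed by `go mod tidy` and `go mod vendor`, though the exact commands are an assumption; the diff itself does not record them. Each `h1:` entry in go.sum is a dirhash-format hash of the module zip (the `/go.mod h1:` entries hash only the go.mod file). A minimal sketch of recomputing one such hash, assuming golang.org/x/mod is available; the cache path is hypothetical:

```go
package main

import (
	"fmt"
	"log"

	"golang.org/x/mod/sumdb/dirhash"
)

func main() {
	// Hypothetical path: module zips normally live under GOMODCACHE/cache/download.
	zip := "/path/to/modcache/cache/download/golang.org/x/crypto/@v/v0.27.0.zip"
	h, err := dirhash.HashZip(zip, dirhash.Hash1) // Hash1 is the "h1:" algorithm
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(h) // should match the h1: line recorded in go.sum
}
```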
diff --git a/vendor/golang.org/x/crypto/argon2/blamka_amd64.s b/vendor/golang.org/x/crypto/argon2/blamka_amd64.s
index 6713accac..c3895478e 100644
--- a/vendor/golang.org/x/crypto/argon2/blamka_amd64.s
+++ b/vendor/golang.org/x/crypto/argon2/blamka_amd64.s
@@ -1,243 +1,2791 @@
-// Copyright 2017 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Code generated by command: go run blamka_amd64.go -out ../blamka_amd64.s -pkg argon2. DO NOT EDIT.
//go:build amd64 && gc && !purego
#include "textflag.h"
-DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
-DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
-GLOBL ·c40<>(SB), (NOPTR+RODATA), $16
-
-DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
-DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
-GLOBL ·c48<>(SB), (NOPTR+RODATA), $16
-
-#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
- MOVO v4, t1; \
- MOVO v5, v4; \
- MOVO t1, v5; \
- MOVO v6, t1; \
- PUNPCKLQDQ v6, t2; \
- PUNPCKHQDQ v7, v6; \
- PUNPCKHQDQ t2, v6; \
- PUNPCKLQDQ v7, t2; \
- MOVO t1, v7; \
- MOVO v2, t1; \
- PUNPCKHQDQ t2, v7; \
- PUNPCKLQDQ v3, t2; \
- PUNPCKHQDQ t2, v2; \
- PUNPCKLQDQ t1, t2; \
- PUNPCKHQDQ t2, v3
-
-#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
- MOVO v4, t1; \
- MOVO v5, v4; \
- MOVO t1, v5; \
- MOVO v2, t1; \
- PUNPCKLQDQ v2, t2; \
- PUNPCKHQDQ v3, v2; \
- PUNPCKHQDQ t2, v2; \
- PUNPCKLQDQ v3, t2; \
- MOVO t1, v3; \
- MOVO v6, t1; \
- PUNPCKHQDQ t2, v3; \
- PUNPCKLQDQ v7, t2; \
- PUNPCKHQDQ t2, v6; \
- PUNPCKLQDQ t1, t2; \
- PUNPCKHQDQ t2, v7
-
-#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48) \
- MOVO v0, t0; \
- PMULULQ v2, t0; \
- PADDQ v2, v0; \
- PADDQ t0, v0; \
- PADDQ t0, v0; \
- PXOR v0, v6; \
- PSHUFD $0xB1, v6, v6; \
- MOVO v4, t0; \
- PMULULQ v6, t0; \
- PADDQ v6, v4; \
- PADDQ t0, v4; \
- PADDQ t0, v4; \
- PXOR v4, v2; \
- PSHUFB c40, v2; \
- MOVO v0, t0; \
- PMULULQ v2, t0; \
- PADDQ v2, v0; \
- PADDQ t0, v0; \
- PADDQ t0, v0; \
- PXOR v0, v6; \
- PSHUFB c48, v6; \
- MOVO v4, t0; \
- PMULULQ v6, t0; \
- PADDQ v6, v4; \
- PADDQ t0, v4; \
- PADDQ t0, v4; \
- PXOR v4, v2; \
- MOVO v2, t0; \
- PADDQ v2, t0; \
- PSRLQ $63, v2; \
- PXOR t0, v2; \
- MOVO v1, t0; \
- PMULULQ v3, t0; \
- PADDQ v3, v1; \
- PADDQ t0, v1; \
- PADDQ t0, v1; \
- PXOR v1, v7; \
- PSHUFD $0xB1, v7, v7; \
- MOVO v5, t0; \
- PMULULQ v7, t0; \
- PADDQ v7, v5; \
- PADDQ t0, v5; \
- PADDQ t0, v5; \
- PXOR v5, v3; \
- PSHUFB c40, v3; \
- MOVO v1, t0; \
- PMULULQ v3, t0; \
- PADDQ v3, v1; \
- PADDQ t0, v1; \
- PADDQ t0, v1; \
- PXOR v1, v7; \
- PSHUFB c48, v7; \
- MOVO v5, t0; \
- PMULULQ v7, t0; \
- PADDQ v7, v5; \
- PADDQ t0, v5; \
- PADDQ t0, v5; \
- PXOR v5, v3; \
- MOVO v3, t0; \
- PADDQ v3, t0; \
- PSRLQ $63, v3; \
- PXOR t0, v3
-
-#define LOAD_MSG_0(block, off) \
- MOVOU 8*(off+0)(block), X0; \
- MOVOU 8*(off+2)(block), X1; \
- MOVOU 8*(off+4)(block), X2; \
- MOVOU 8*(off+6)(block), X3; \
- MOVOU 8*(off+8)(block), X4; \
- MOVOU 8*(off+10)(block), X5; \
- MOVOU 8*(off+12)(block), X6; \
- MOVOU 8*(off+14)(block), X7
-
-#define STORE_MSG_0(block, off) \
- MOVOU X0, 8*(off+0)(block); \
- MOVOU X1, 8*(off+2)(block); \
- MOVOU X2, 8*(off+4)(block); \
- MOVOU X3, 8*(off+6)(block); \
- MOVOU X4, 8*(off+8)(block); \
- MOVOU X5, 8*(off+10)(block); \
- MOVOU X6, 8*(off+12)(block); \
- MOVOU X7, 8*(off+14)(block)
-
-#define LOAD_MSG_1(block, off) \
- MOVOU 8*off+0*8(block), X0; \
- MOVOU 8*off+16*8(block), X1; \
- MOVOU 8*off+32*8(block), X2; \
- MOVOU 8*off+48*8(block), X3; \
- MOVOU 8*off+64*8(block), X4; \
- MOVOU 8*off+80*8(block), X5; \
- MOVOU 8*off+96*8(block), X6; \
- MOVOU 8*off+112*8(block), X7
-
-#define STORE_MSG_1(block, off) \
- MOVOU X0, 8*off+0*8(block); \
- MOVOU X1, 8*off+16*8(block); \
- MOVOU X2, 8*off+32*8(block); \
- MOVOU X3, 8*off+48*8(block); \
- MOVOU X4, 8*off+64*8(block); \
- MOVOU X5, 8*off+80*8(block); \
- MOVOU X6, 8*off+96*8(block); \
- MOVOU X7, 8*off+112*8(block)
-
-#define BLAMKA_ROUND_0(block, off, t0, t1, c40, c48) \
- LOAD_MSG_0(block, off); \
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
- SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1); \
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
- SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1); \
- STORE_MSG_0(block, off)
-
-#define BLAMKA_ROUND_1(block, off, t0, t1, c40, c48) \
- LOAD_MSG_1(block, off); \
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
- SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1); \
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \
- SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1); \
- STORE_MSG_1(block, off)
-
// func blamkaSSE4(b *block)
-TEXT ·blamkaSSE4(SB), 4, $0-8
- MOVQ b+0(FP), AX
-
- MOVOU ·c40<>(SB), X10
- MOVOU ·c48<>(SB), X11
+// Requires: SSE2, SSSE3
+TEXT ·blamkaSSE4(SB), NOSPLIT, $0-8
+ MOVQ b+0(FP), AX
+ MOVOU ·c40<>+0(SB), X10
+ MOVOU ·c48<>+0(SB), X11
+ MOVOU (AX), X0
+ MOVOU 16(AX), X1
+ MOVOU 32(AX), X2
+ MOVOU 48(AX), X3
+ MOVOU 64(AX), X4
+ MOVOU 80(AX), X5
+ MOVOU 96(AX), X6
+ MOVOU 112(AX), X7
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ MOVOU X2, 32(AX)
+ MOVOU X3, 48(AX)
+ MOVOU X4, 64(AX)
+ MOVOU X5, 80(AX)
+ MOVOU X6, 96(AX)
+ MOVOU X7, 112(AX)
+ MOVOU 128(AX), X0
+ MOVOU 144(AX), X1
+ MOVOU 160(AX), X2
+ MOVOU 176(AX), X3
+ MOVOU 192(AX), X4
+ MOVOU 208(AX), X5
+ MOVOU 224(AX), X6
+ MOVOU 240(AX), X7
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVOU X0, 128(AX)
+ MOVOU X1, 144(AX)
+ MOVOU X2, 160(AX)
+ MOVOU X3, 176(AX)
+ MOVOU X4, 192(AX)
+ MOVOU X5, 208(AX)
+ MOVOU X6, 224(AX)
+ MOVOU X7, 240(AX)
+ MOVOU 256(AX), X0
+ MOVOU 272(AX), X1
+ MOVOU 288(AX), X2
+ MOVOU 304(AX), X3
+ MOVOU 320(AX), X4
+ MOVOU 336(AX), X5
+ MOVOU 352(AX), X6
+ MOVOU 368(AX), X7
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVOU X0, 256(AX)
+ MOVOU X1, 272(AX)
+ MOVOU X2, 288(AX)
+ MOVOU X3, 304(AX)
+ MOVOU X4, 320(AX)
+ MOVOU X5, 336(AX)
+ MOVOU X6, 352(AX)
+ MOVOU X7, 368(AX)
+ MOVOU 384(AX), X0
+ MOVOU 400(AX), X1
+ MOVOU 416(AX), X2
+ MOVOU 432(AX), X3
+ MOVOU 448(AX), X4
+ MOVOU 464(AX), X5
+ MOVOU 480(AX), X6
+ MOVOU 496(AX), X7
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVOU X0, 384(AX)
+ MOVOU X1, 400(AX)
+ MOVOU X2, 416(AX)
+ MOVOU X3, 432(AX)
+ MOVOU X4, 448(AX)
+ MOVOU X5, 464(AX)
+ MOVOU X6, 480(AX)
+ MOVOU X7, 496(AX)
+ MOVOU 512(AX), X0
+ MOVOU 528(AX), X1
+ MOVOU 544(AX), X2
+ MOVOU 560(AX), X3
+ MOVOU 576(AX), X4
+ MOVOU 592(AX), X5
+ MOVOU 608(AX), X6
+ MOVOU 624(AX), X7
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVOU X0, 512(AX)
+ MOVOU X1, 528(AX)
+ MOVOU X2, 544(AX)
+ MOVOU X3, 560(AX)
+ MOVOU X4, 576(AX)
+ MOVOU X5, 592(AX)
+ MOVOU X6, 608(AX)
+ MOVOU X7, 624(AX)
+ MOVOU 640(AX), X0
+ MOVOU 656(AX), X1
+ MOVOU 672(AX), X2
+ MOVOU 688(AX), X3
+ MOVOU 704(AX), X4
+ MOVOU 720(AX), X5
+ MOVOU 736(AX), X6
+ MOVOU 752(AX), X7
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVOU X0, 640(AX)
+ MOVOU X1, 656(AX)
+ MOVOU X2, 672(AX)
+ MOVOU X3, 688(AX)
+ MOVOU X4, 704(AX)
+ MOVOU X5, 720(AX)
+ MOVOU X6, 736(AX)
+ MOVOU X7, 752(AX)
+ MOVOU 768(AX), X0
+ MOVOU 784(AX), X1
+ MOVOU 800(AX), X2
+ MOVOU 816(AX), X3
+ MOVOU 832(AX), X4
+ MOVOU 848(AX), X5
+ MOVOU 864(AX), X6
+ MOVOU 880(AX), X7
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVOU X0, 768(AX)
+ MOVOU X1, 784(AX)
+ MOVOU X2, 800(AX)
+ MOVOU X3, 816(AX)
+ MOVOU X4, 832(AX)
+ MOVOU X5, 848(AX)
+ MOVOU X6, 864(AX)
+ MOVOU X7, 880(AX)
+ MOVOU 896(AX), X0
+ MOVOU 912(AX), X1
+ MOVOU 928(AX), X2
+ MOVOU 944(AX), X3
+ MOVOU 960(AX), X4
+ MOVOU 976(AX), X5
+ MOVOU 992(AX), X6
+ MOVOU 1008(AX), X7
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVOU X0, 896(AX)
+ MOVOU X1, 912(AX)
+ MOVOU X2, 928(AX)
+ MOVOU X3, 944(AX)
+ MOVOU X4, 960(AX)
+ MOVOU X5, 976(AX)
+ MOVOU X6, 992(AX)
+ MOVOU X7, 1008(AX)
+ MOVOU (AX), X0
+ MOVOU 128(AX), X1
+ MOVOU 256(AX), X2
+ MOVOU 384(AX), X3
+ MOVOU 512(AX), X4
+ MOVOU 640(AX), X5
+ MOVOU 768(AX), X6
+ MOVOU 896(AX), X7
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVOU X0, (AX)
+ MOVOU X1, 128(AX)
+ MOVOU X2, 256(AX)
+ MOVOU X3, 384(AX)
+ MOVOU X4, 512(AX)
+ MOVOU X5, 640(AX)
+ MOVOU X6, 768(AX)
+ MOVOU X7, 896(AX)
+ MOVOU 16(AX), X0
+ MOVOU 144(AX), X1
+ MOVOU 272(AX), X2
+ MOVOU 400(AX), X3
+ MOVOU 528(AX), X4
+ MOVOU 656(AX), X5
+ MOVOU 784(AX), X6
+ MOVOU 912(AX), X7
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVOU X0, 16(AX)
+ MOVOU X1, 144(AX)
+ MOVOU X2, 272(AX)
+ MOVOU X3, 400(AX)
+ MOVOU X4, 528(AX)
+ MOVOU X5, 656(AX)
+ MOVOU X6, 784(AX)
+ MOVOU X7, 912(AX)
+ MOVOU 32(AX), X0
+ MOVOU 160(AX), X1
+ MOVOU 288(AX), X2
+ MOVOU 416(AX), X3
+ MOVOU 544(AX), X4
+ MOVOU 672(AX), X5
+ MOVOU 800(AX), X6
+ MOVOU 928(AX), X7
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVOU X0, 32(AX)
+ MOVOU X1, 160(AX)
+ MOVOU X2, 288(AX)
+ MOVOU X3, 416(AX)
+ MOVOU X4, 544(AX)
+ MOVOU X5, 672(AX)
+ MOVOU X6, 800(AX)
+ MOVOU X7, 928(AX)
+ MOVOU 48(AX), X0
+ MOVOU 176(AX), X1
+ MOVOU 304(AX), X2
+ MOVOU 432(AX), X3
+ MOVOU 560(AX), X4
+ MOVOU 688(AX), X5
+ MOVOU 816(AX), X6
+ MOVOU 944(AX), X7
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVOU X0, 48(AX)
+ MOVOU X1, 176(AX)
+ MOVOU X2, 304(AX)
+ MOVOU X3, 432(AX)
+ MOVOU X4, 560(AX)
+ MOVOU X5, 688(AX)
+ MOVOU X6, 816(AX)
+ MOVOU X7, 944(AX)
+ MOVOU 64(AX), X0
+ MOVOU 192(AX), X1
+ MOVOU 320(AX), X2
+ MOVOU 448(AX), X3
+ MOVOU 576(AX), X4
+ MOVOU 704(AX), X5
+ MOVOU 832(AX), X6
+ MOVOU 960(AX), X7
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVOU X0, 64(AX)
+ MOVOU X1, 192(AX)
+ MOVOU X2, 320(AX)
+ MOVOU X3, 448(AX)
+ MOVOU X4, 576(AX)
+ MOVOU X5, 704(AX)
+ MOVOU X6, 832(AX)
+ MOVOU X7, 960(AX)
+ MOVOU 80(AX), X0
+ MOVOU 208(AX), X1
+ MOVOU 336(AX), X2
+ MOVOU 464(AX), X3
+ MOVOU 592(AX), X4
+ MOVOU 720(AX), X5
+ MOVOU 848(AX), X6
+ MOVOU 976(AX), X7
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVOU X0, 80(AX)
+ MOVOU X1, 208(AX)
+ MOVOU X2, 336(AX)
+ MOVOU X3, 464(AX)
+ MOVOU X4, 592(AX)
+ MOVOU X5, 720(AX)
+ MOVOU X6, 848(AX)
+ MOVOU X7, 976(AX)
+ MOVOU 96(AX), X0
+ MOVOU 224(AX), X1
+ MOVOU 352(AX), X2
+ MOVOU 480(AX), X3
+ MOVOU 608(AX), X4
+ MOVOU 736(AX), X5
+ MOVOU 864(AX), X6
+ MOVOU 992(AX), X7
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVOU X0, 96(AX)
+ MOVOU X1, 224(AX)
+ MOVOU X2, 352(AX)
+ MOVOU X3, 480(AX)
+ MOVOU X4, 608(AX)
+ MOVOU X5, 736(AX)
+ MOVOU X6, 864(AX)
+ MOVOU X7, 992(AX)
+ MOVOU 112(AX), X0
+ MOVOU 240(AX), X1
+ MOVOU 368(AX), X2
+ MOVOU 496(AX), X3
+ MOVOU 624(AX), X4
+ MOVOU 752(AX), X5
+ MOVOU 880(AX), X6
+ MOVOU 1008(AX), X7
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFD $0xb1, X6, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ PSHUFB X10, X2
+ MOVO X0, X8
+ PMULULQ X2, X8
+ PADDQ X2, X0
+ PADDQ X8, X0
+ PADDQ X8, X0
+ PXOR X0, X6
+ PSHUFB X11, X6
+ MOVO X4, X8
+ PMULULQ X6, X8
+ PADDQ X6, X4
+ PADDQ X8, X4
+ PADDQ X8, X4
+ PXOR X4, X2
+ MOVO X2, X8
+ PADDQ X2, X8
+ PSRLQ $0x3f, X2
+ PXOR X8, X2
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFD $0xb1, X7, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ PSHUFB X10, X3
+ MOVO X1, X8
+ PMULULQ X3, X8
+ PADDQ X3, X1
+ PADDQ X8, X1
+ PADDQ X8, X1
+ PXOR X1, X7
+ PSHUFB X11, X7
+ MOVO X5, X8
+ PMULULQ X7, X8
+ PADDQ X7, X5
+ PADDQ X8, X5
+ PADDQ X8, X5
+ PXOR X5, X3
+ MOVO X3, X8
+ PADDQ X3, X8
+ PSRLQ $0x3f, X3
+ PXOR X8, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVOU X0, 112(AX)
+ MOVOU X1, 240(AX)
+ MOVOU X2, 368(AX)
+ MOVOU X3, 496(AX)
+ MOVOU X4, 624(AX)
+ MOVOU X5, 752(AX)
+ MOVOU X6, 880(AX)
+ MOVOU X7, 1008(AX)
+ RET
- BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11)
- BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11)
- BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11)
- BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11)
- BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11)
- BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11)
- BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11)
- BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11)
+DATA ·c40<>+0(SB)/8, $0x0201000706050403
+DATA ·c40<>+8(SB)/8, $0x0a09080f0e0d0c0b
+GLOBL ·c40<>(SB), RODATA|NOPTR, $16
- BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11)
- BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11)
- BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11)
- BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11)
- BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11)
- BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11)
- BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11)
- BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11)
- RET
+DATA ·c48<>+0(SB)/8, $0x0100070605040302
+DATA ·c48<>+8(SB)/8, $0x09080f0e0d0c0b0a
+GLOBL ·c48<>(SB), RODATA|NOPTR, $16
-// func mixBlocksSSE2(out, a, b, c *block)
-TEXT ·mixBlocksSSE2(SB), 4, $0-32
+// func mixBlocksSSE2(out *block, a *block, b *block, c *block)
+// Requires: SSE2
+TEXT ·mixBlocksSSE2(SB), NOSPLIT, $0-32
MOVQ out+0(FP), DX
MOVQ a+8(FP), AX
MOVQ b+16(FP), BX
MOVQ c+24(FP), CX
- MOVQ $128, DI
+ MOVQ $0x00000080, DI
loop:
- MOVOU 0(AX), X0
- MOVOU 0(BX), X1
- MOVOU 0(CX), X2
+ MOVOU (AX), X0
+ MOVOU (BX), X1
+ MOVOU (CX), X2
PXOR X1, X0
PXOR X2, X0
- MOVOU X0, 0(DX)
- ADDQ $16, AX
- ADDQ $16, BX
- ADDQ $16, CX
- ADDQ $16, DX
- SUBQ $2, DI
+ MOVOU X0, (DX)
+ ADDQ $0x10, AX
+ ADDQ $0x10, BX
+ ADDQ $0x10, CX
+ ADDQ $0x10, DX
+ SUBQ $0x02, DI
JA loop
RET
-// func xorBlocksSSE2(out, a, b, c *block)
-TEXT ·xorBlocksSSE2(SB), 4, $0-32
+// func xorBlocksSSE2(out *block, a *block, b *block, c *block)
+// Requires: SSE2
+TEXT ·xorBlocksSSE2(SB), NOSPLIT, $0-32
MOVQ out+0(FP), DX
MOVQ a+8(FP), AX
MOVQ b+16(FP), BX
MOVQ c+24(FP), CX
- MOVQ $128, DI
+ MOVQ $0x00000080, DI
loop:
- MOVOU 0(AX), X0
- MOVOU 0(BX), X1
- MOVOU 0(CX), X2
- MOVOU 0(DX), X3
+ MOVOU (AX), X0
+ MOVOU (BX), X1
+ MOVOU (CX), X2
+ MOVOU (DX), X3
PXOR X1, X0
PXOR X2, X0
PXOR X3, X0
- MOVOU X0, 0(DX)
- ADDQ $16, AX
- ADDQ $16, BX
- ADDQ $16, CX
- ADDQ $16, DX
- SUBQ $2, DI
+ MOVOU X0, (DX)
+ ADDQ $0x10, AX
+ ADDQ $0x10, BX
+ ADDQ $0x10, CX
+ ADDQ $0x10, DX
+ SUBQ $0x02, DI
JA loop
RET
diff --git a/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s b/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s
index 9ae8206c2..f75162e03 100644
--- a/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s
+++ b/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s
@@ -1,722 +1,4517 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Code generated by command: go run blake2bAVX2_amd64_asm.go -out ../../blake2bAVX2_amd64.s -pkg blake2b. DO NOT EDIT.
//go:build amd64 && gc && !purego
#include "textflag.h"
-DATA ·AVX2_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
-DATA ·AVX2_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
-DATA ·AVX2_iv0<>+0x10(SB)/8, $0x3c6ef372fe94f82b
-DATA ·AVX2_iv0<>+0x18(SB)/8, $0xa54ff53a5f1d36f1
-GLOBL ·AVX2_iv0<>(SB), (NOPTR+RODATA), $32
-
-DATA ·AVX2_iv1<>+0x00(SB)/8, $0x510e527fade682d1
-DATA ·AVX2_iv1<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
-DATA ·AVX2_iv1<>+0x10(SB)/8, $0x1f83d9abfb41bd6b
-DATA ·AVX2_iv1<>+0x18(SB)/8, $0x5be0cd19137e2179
-GLOBL ·AVX2_iv1<>(SB), (NOPTR+RODATA), $32
-
-DATA ·AVX2_c40<>+0x00(SB)/8, $0x0201000706050403
-DATA ·AVX2_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
-DATA ·AVX2_c40<>+0x10(SB)/8, $0x0201000706050403
-DATA ·AVX2_c40<>+0x18(SB)/8, $0x0a09080f0e0d0c0b
-GLOBL ·AVX2_c40<>(SB), (NOPTR+RODATA), $32
-
-DATA ·AVX2_c48<>+0x00(SB)/8, $0x0100070605040302
-DATA ·AVX2_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
-DATA ·AVX2_c48<>+0x10(SB)/8, $0x0100070605040302
-DATA ·AVX2_c48<>+0x18(SB)/8, $0x09080f0e0d0c0b0a
-GLOBL ·AVX2_c48<>(SB), (NOPTR+RODATA), $32
-
-DATA ·AVX_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
-DATA ·AVX_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
-GLOBL ·AVX_iv0<>(SB), (NOPTR+RODATA), $16
-
-DATA ·AVX_iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
-DATA ·AVX_iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
-GLOBL ·AVX_iv1<>(SB), (NOPTR+RODATA), $16
-
-DATA ·AVX_iv2<>+0x00(SB)/8, $0x510e527fade682d1
-DATA ·AVX_iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
-GLOBL ·AVX_iv2<>(SB), (NOPTR+RODATA), $16
-
-DATA ·AVX_iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
-DATA ·AVX_iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
-GLOBL ·AVX_iv3<>(SB), (NOPTR+RODATA), $16
-
-DATA ·AVX_c40<>+0x00(SB)/8, $0x0201000706050403
-DATA ·AVX_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
-GLOBL ·AVX_c40<>(SB), (NOPTR+RODATA), $16
-
-DATA ·AVX_c48<>+0x00(SB)/8, $0x0100070605040302
-DATA ·AVX_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
-GLOBL ·AVX_c48<>(SB), (NOPTR+RODATA), $16
-
-#define VPERMQ_0x39_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x39
-#define VPERMQ_0x93_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x93
-#define VPERMQ_0x4E_Y2_Y2 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0x4e
-#define VPERMQ_0x93_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x93
-#define VPERMQ_0x39_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x39
-
-#define ROUND_AVX2(m0, m1, m2, m3, t, c40, c48) \
- VPADDQ m0, Y0, Y0; \
- VPADDQ Y1, Y0, Y0; \
- VPXOR Y0, Y3, Y3; \
- VPSHUFD $-79, Y3, Y3; \
- VPADDQ Y3, Y2, Y2; \
- VPXOR Y2, Y1, Y1; \
- VPSHUFB c40, Y1, Y1; \
- VPADDQ m1, Y0, Y0; \
- VPADDQ Y1, Y0, Y0; \
- VPXOR Y0, Y3, Y3; \
- VPSHUFB c48, Y3, Y3; \
- VPADDQ Y3, Y2, Y2; \
- VPXOR Y2, Y1, Y1; \
- VPADDQ Y1, Y1, t; \
- VPSRLQ $63, Y1, Y1; \
- VPXOR t, Y1, Y1; \
- VPERMQ_0x39_Y1_Y1; \
- VPERMQ_0x4E_Y2_Y2; \
- VPERMQ_0x93_Y3_Y3; \
- VPADDQ m2, Y0, Y0; \
- VPADDQ Y1, Y0, Y0; \
- VPXOR Y0, Y3, Y3; \
- VPSHUFD $-79, Y3, Y3; \
- VPADDQ Y3, Y2, Y2; \
- VPXOR Y2, Y1, Y1; \
- VPSHUFB c40, Y1, Y1; \
- VPADDQ m3, Y0, Y0; \
- VPADDQ Y1, Y0, Y0; \
- VPXOR Y0, Y3, Y3; \
- VPSHUFB c48, Y3, Y3; \
- VPADDQ Y3, Y2, Y2; \
- VPXOR Y2, Y1, Y1; \
- VPADDQ Y1, Y1, t; \
- VPSRLQ $63, Y1, Y1; \
- VPXOR t, Y1, Y1; \
- VPERMQ_0x39_Y3_Y3; \
- VPERMQ_0x4E_Y2_Y2; \
- VPERMQ_0x93_Y1_Y1
-
-#define VMOVQ_SI_X11_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x1E
-#define VMOVQ_SI_X12_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x26
-#define VMOVQ_SI_X13_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x2E
-#define VMOVQ_SI_X14_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x36
-#define VMOVQ_SI_X15_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x3E
-
-#define VMOVQ_SI_X11(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x5E; BYTE $n
-#define VMOVQ_SI_X12(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x66; BYTE $n
-#define VMOVQ_SI_X13(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x6E; BYTE $n
-#define VMOVQ_SI_X14(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x76; BYTE $n
-#define VMOVQ_SI_X15(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x7E; BYTE $n
-
-#define VPINSRQ_1_SI_X11_0 BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x1E; BYTE $0x01
-#define VPINSRQ_1_SI_X12_0 BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x26; BYTE $0x01
-#define VPINSRQ_1_SI_X13_0 BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x2E; BYTE $0x01
-#define VPINSRQ_1_SI_X14_0 BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x36; BYTE $0x01
-#define VPINSRQ_1_SI_X15_0 BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x3E; BYTE $0x01
-
-#define VPINSRQ_1_SI_X11(n) BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x5E; BYTE $n; BYTE $0x01
-#define VPINSRQ_1_SI_X12(n) BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x66; BYTE $n; BYTE $0x01
-#define VPINSRQ_1_SI_X13(n) BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x6E; BYTE $n; BYTE $0x01
-#define VPINSRQ_1_SI_X14(n) BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x76; BYTE $n; BYTE $0x01
-#define VPINSRQ_1_SI_X15(n) BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x7E; BYTE $n; BYTE $0x01
-
-#define VMOVQ_R8_X15 BYTE $0xC4; BYTE $0x41; BYTE $0xF9; BYTE $0x6E; BYTE $0xF8
-#define VPINSRQ_1_R9_X15 BYTE $0xC4; BYTE $0x43; BYTE $0x81; BYTE $0x22; BYTE $0xF9; BYTE $0x01
-
-// load msg: Y12 = (i0, i1, i2, i3)
-// i0, i1, i2, i3 must not be 0
-#define LOAD_MSG_AVX2_Y12(i0, i1, i2, i3) \
- VMOVQ_SI_X12(i0*8); \
- VMOVQ_SI_X11(i2*8); \
- VPINSRQ_1_SI_X12(i1*8); \
- VPINSRQ_1_SI_X11(i3*8); \
- VINSERTI128 $1, X11, Y12, Y12
-
-// load msg: Y13 = (i0, i1, i2, i3)
-// i0, i1, i2, i3 must not be 0
-#define LOAD_MSG_AVX2_Y13(i0, i1, i2, i3) \
- VMOVQ_SI_X13(i0*8); \
- VMOVQ_SI_X11(i2*8); \
- VPINSRQ_1_SI_X13(i1*8); \
- VPINSRQ_1_SI_X11(i3*8); \
- VINSERTI128 $1, X11, Y13, Y13
-
-// load msg: Y14 = (i0, i1, i2, i3)
-// i0, i1, i2, i3 must not be 0
-#define LOAD_MSG_AVX2_Y14(i0, i1, i2, i3) \
- VMOVQ_SI_X14(i0*8); \
- VMOVQ_SI_X11(i2*8); \
- VPINSRQ_1_SI_X14(i1*8); \
- VPINSRQ_1_SI_X11(i3*8); \
- VINSERTI128 $1, X11, Y14, Y14
-
-// load msg: Y15 = (i0, i1, i2, i3)
-// i0, i1, i2, i3 must not be 0
-#define LOAD_MSG_AVX2_Y15(i0, i1, i2, i3) \
- VMOVQ_SI_X15(i0*8); \
- VMOVQ_SI_X11(i2*8); \
- VPINSRQ_1_SI_X15(i1*8); \
- VPINSRQ_1_SI_X11(i3*8); \
- VINSERTI128 $1, X11, Y15, Y15
-
-#define LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() \
- VMOVQ_SI_X12_0; \
- VMOVQ_SI_X11(4*8); \
- VPINSRQ_1_SI_X12(2*8); \
- VPINSRQ_1_SI_X11(6*8); \
- VINSERTI128 $1, X11, Y12, Y12; \
- LOAD_MSG_AVX2_Y13(1, 3, 5, 7); \
- LOAD_MSG_AVX2_Y14(8, 10, 12, 14); \
- LOAD_MSG_AVX2_Y15(9, 11, 13, 15)
-
-#define LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() \
- LOAD_MSG_AVX2_Y12(14, 4, 9, 13); \
- LOAD_MSG_AVX2_Y13(10, 8, 15, 6); \
- VMOVQ_SI_X11(11*8); \
- VPSHUFD $0x4E, 0*8(SI), X14; \
- VPINSRQ_1_SI_X11(5*8); \
- VINSERTI128 $1, X11, Y14, Y14; \
- LOAD_MSG_AVX2_Y15(12, 2, 7, 3)
-
-#define LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() \
- VMOVQ_SI_X11(5*8); \
- VMOVDQU 11*8(SI), X12; \
- VPINSRQ_1_SI_X11(15*8); \
- VINSERTI128 $1, X11, Y12, Y12; \
- VMOVQ_SI_X13(8*8); \
- VMOVQ_SI_X11(2*8); \
- VPINSRQ_1_SI_X13_0; \
- VPINSRQ_1_SI_X11(13*8); \
- VINSERTI128 $1, X11, Y13, Y13; \
- LOAD_MSG_AVX2_Y14(10, 3, 7, 9); \
- LOAD_MSG_AVX2_Y15(14, 6, 1, 4)
-
-#define LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() \
- LOAD_MSG_AVX2_Y12(7, 3, 13, 11); \
- LOAD_MSG_AVX2_Y13(9, 1, 12, 14); \
- LOAD_MSG_AVX2_Y14(2, 5, 4, 15); \
- VMOVQ_SI_X15(6*8); \
- VMOVQ_SI_X11_0; \
- VPINSRQ_1_SI_X15(10*8); \
- VPINSRQ_1_SI_X11(8*8); \
- VINSERTI128 $1, X11, Y15, Y15
-
-#define LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() \
- LOAD_MSG_AVX2_Y12(9, 5, 2, 10); \
- VMOVQ_SI_X13_0; \
- VMOVQ_SI_X11(4*8); \
- VPINSRQ_1_SI_X13(7*8); \
- VPINSRQ_1_SI_X11(15*8); \
- VINSERTI128 $1, X11, Y13, Y13; \
- LOAD_MSG_AVX2_Y14(14, 11, 6, 3); \
- LOAD_MSG_AVX2_Y15(1, 12, 8, 13)
-
-#define LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() \
- VMOVQ_SI_X12(2*8); \
- VMOVQ_SI_X11_0; \
- VPINSRQ_1_SI_X12(6*8); \
- VPINSRQ_1_SI_X11(8*8); \
- VINSERTI128 $1, X11, Y12, Y12; \
- LOAD_MSG_AVX2_Y13(12, 10, 11, 3); \
- LOAD_MSG_AVX2_Y14(4, 7, 15, 1); \
- LOAD_MSG_AVX2_Y15(13, 5, 14, 9)
-
-#define LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() \
- LOAD_MSG_AVX2_Y12(12, 1, 14, 4); \
- LOAD_MSG_AVX2_Y13(5, 15, 13, 10); \
- VMOVQ_SI_X14_0; \
- VPSHUFD $0x4E, 8*8(SI), X11; \
- VPINSRQ_1_SI_X14(6*8); \
- VINSERTI128 $1, X11, Y14, Y14; \
- LOAD_MSG_AVX2_Y15(7, 3, 2, 11)
-
-#define LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() \
- LOAD_MSG_AVX2_Y12(13, 7, 12, 3); \
- LOAD_MSG_AVX2_Y13(11, 14, 1, 9); \
- LOAD_MSG_AVX2_Y14(5, 15, 8, 2); \
- VMOVQ_SI_X15_0; \
- VMOVQ_SI_X11(6*8); \
- VPINSRQ_1_SI_X15(4*8); \
- VPINSRQ_1_SI_X11(10*8); \
- VINSERTI128 $1, X11, Y15, Y15
-
-#define LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() \
- VMOVQ_SI_X12(6*8); \
- VMOVQ_SI_X11(11*8); \
- VPINSRQ_1_SI_X12(14*8); \
- VPINSRQ_1_SI_X11_0; \
- VINSERTI128 $1, X11, Y12, Y12; \
- LOAD_MSG_AVX2_Y13(15, 9, 3, 8); \
- VMOVQ_SI_X11(1*8); \
- VMOVDQU 12*8(SI), X14; \
- VPINSRQ_1_SI_X11(10*8); \
- VINSERTI128 $1, X11, Y14, Y14; \
- VMOVQ_SI_X15(2*8); \
- VMOVDQU 4*8(SI), X11; \
- VPINSRQ_1_SI_X15(7*8); \
- VINSERTI128 $1, X11, Y15, Y15
-
-#define LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() \
- LOAD_MSG_AVX2_Y12(10, 8, 7, 1); \
- VMOVQ_SI_X13(2*8); \
- VPSHUFD $0x4E, 5*8(SI), X11; \
- VPINSRQ_1_SI_X13(4*8); \
- VINSERTI128 $1, X11, Y13, Y13; \
- LOAD_MSG_AVX2_Y14(15, 9, 3, 13); \
- VMOVQ_SI_X15(11*8); \
- VMOVQ_SI_X11(12*8); \
- VPINSRQ_1_SI_X15(14*8); \
- VPINSRQ_1_SI_X11_0; \
- VINSERTI128 $1, X11, Y15, Y15
-
// func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
-TEXT ·hashBlocksAVX2(SB), 4, $320-48 // frame size = 288 + 32 byte alignment
- MOVQ h+0(FP), AX
- MOVQ c+8(FP), BX
- MOVQ flag+16(FP), CX
- MOVQ blocks_base+24(FP), SI
- MOVQ blocks_len+32(FP), DI
-
- MOVQ SP, DX
- ADDQ $31, DX
- ANDQ $~31, DX
-
- MOVQ CX, 16(DX)
- XORQ CX, CX
- MOVQ CX, 24(DX)
-
- VMOVDQU ·AVX2_c40<>(SB), Y4
- VMOVDQU ·AVX2_c48<>(SB), Y5
-
- VMOVDQU 0(AX), Y8
+// Requires: AVX, AVX2
+TEXT ·hashBlocksAVX2(SB), NOSPLIT, $320-48
+ MOVQ h+0(FP), AX
+ MOVQ c+8(FP), BX
+ MOVQ flag+16(FP), CX
+ MOVQ blocks_base+24(FP), SI
+ MOVQ blocks_len+32(FP), DI
+ MOVQ SP, DX
+ ADDQ $+31, DX
+ ANDQ $-32, DX
+ MOVQ CX, 16(DX)
+ XORQ CX, CX
+ MOVQ CX, 24(DX)
+ VMOVDQU ·AVX2_c40<>+0(SB), Y4
+ VMOVDQU ·AVX2_c48<>+0(SB), Y5
+ VMOVDQU (AX), Y8
VMOVDQU 32(AX), Y9
- VMOVDQU ·AVX2_iv0<>(SB), Y6
- VMOVDQU ·AVX2_iv1<>(SB), Y7
-
- MOVQ 0(BX), R8
- MOVQ 8(BX), R9
- MOVQ R9, 8(DX)
+ VMOVDQU ·AVX2_iv0<>+0(SB), Y6
+ VMOVDQU ·AVX2_iv1<>+0(SB), Y7
+ MOVQ (BX), R8
+ MOVQ 8(BX), R9
+ MOVQ R9, 8(DX)
loop:
- ADDQ $128, R8
- MOVQ R8, 0(DX)
- CMPQ R8, $128
+ ADDQ $0x80, R8
+ MOVQ R8, (DX)
+ CMPQ R8, $0x80
JGE noinc
INCQ R9
MOVQ R9, 8(DX)
noinc:
- VMOVDQA Y8, Y0
- VMOVDQA Y9, Y1
- VMOVDQA Y6, Y2
- VPXOR 0(DX), Y7, Y3
-
- LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15()
- VMOVDQA Y12, 32(DX)
- VMOVDQA Y13, 64(DX)
- VMOVDQA Y14, 96(DX)
- VMOVDQA Y15, 128(DX)
- ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
- LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3()
- VMOVDQA Y12, 160(DX)
- VMOVDQA Y13, 192(DX)
- VMOVDQA Y14, 224(DX)
- VMOVDQA Y15, 256(DX)
-
- ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
- LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4()
- ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
- LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8()
- ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
- LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13()
- ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
- LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9()
- ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
- LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11()
- ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
- LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10()
- ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
- LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5()
- ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
- LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0()
- ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
-
- ROUND_AVX2(32(DX), 64(DX), 96(DX), 128(DX), Y10, Y4, Y5)
- ROUND_AVX2(160(DX), 192(DX), 224(DX), 256(DX), Y10, Y4, Y5)
-
- VPXOR Y0, Y8, Y8
- VPXOR Y1, Y9, Y9
- VPXOR Y2, Y8, Y8
- VPXOR Y3, Y9, Y9
-
- LEAQ 128(SI), SI
- SUBQ $128, DI
- JNE loop
-
- MOVQ R8, 0(BX)
- MOVQ R9, 8(BX)
-
- VMOVDQU Y8, 0(AX)
- VMOVDQU Y9, 32(AX)
+ VMOVDQA Y8, Y0
+ VMOVDQA Y9, Y1
+ VMOVDQA Y6, Y2
+ VPXOR (DX), Y7, Y3
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x26
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x20
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x10
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x30
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y12, Y12
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x6e
+ BYTE $0x08
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x28
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x18
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x38
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y13, Y13
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x76
+ BYTE $0x40
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x60
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x50
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x70
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y14, Y14
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x7e
+ BYTE $0x48
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x68
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x58
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x78
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y15, Y15
+ VMOVDQA Y12, 32(DX)
+ VMOVDQA Y13, 64(DX)
+ VMOVDQA Y14, 96(DX)
+ VMOVDQA Y15, 128(DX)
+ VPADDQ Y12, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFD $-79, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPSHUFB Y4, Y1, Y1
+ VPADDQ Y13, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFB Y5, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPADDQ Y1, Y1, Y10
+ VPSRLQ $0x3f, Y1, Y1
+ VPXOR Y10, Y1, Y1
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xc9
+ BYTE $0x39
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xd2
+ BYTE $0x4e
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xdb
+ BYTE $0x93
+ VPADDQ Y14, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFD $-79, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPSHUFB Y4, Y1, Y1
+ VPADDQ Y15, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFB Y5, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPADDQ Y1, Y1, Y10
+ VPSRLQ $0x3f, Y1, Y1
+ VPXOR Y10, Y1, Y1
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xdb
+ BYTE $0x39
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xd2
+ BYTE $0x4e
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xc9
+ BYTE $0x93
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x66
+ BYTE $0x70
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x48
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x20
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x68
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y12, Y12
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x6e
+ BYTE $0x50
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x78
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x40
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x30
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y13, Y13
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x58
+ VPSHUFD $0x4e, (SI), X14
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x28
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y14, Y14
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x7e
+ BYTE $0x60
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x38
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x10
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x18
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y15, Y15
+ VMOVDQA Y12, 160(DX)
+ VMOVDQA Y13, 192(DX)
+ VMOVDQA Y14, 224(DX)
+ VMOVDQA Y15, 256(DX)
+ VPADDQ Y12, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFD $-79, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPSHUFB Y4, Y1, Y1
+ VPADDQ Y13, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFB Y5, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPADDQ Y1, Y1, Y10
+ VPSRLQ $0x3f, Y1, Y1
+ VPXOR Y10, Y1, Y1
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xc9
+ BYTE $0x39
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xd2
+ BYTE $0x4e
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xdb
+ BYTE $0x93
+ VPADDQ Y14, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFD $-79, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPSHUFB Y4, Y1, Y1
+ VPADDQ Y15, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFB Y5, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPADDQ Y1, Y1, Y10
+ VPSRLQ $0x3f, Y1, Y1
+ VPXOR Y10, Y1, Y1
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xdb
+ BYTE $0x39
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xd2
+ BYTE $0x4e
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xc9
+ BYTE $0x93
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x28
+ VMOVDQU 88(SI), X12
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x78
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y12, Y12
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x6e
+ BYTE $0x40
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x10
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x2e
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x68
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y13, Y13
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x76
+ BYTE $0x50
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x38
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x18
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x48
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y14, Y14
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x7e
+ BYTE $0x70
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x08
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x30
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x20
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y15, Y15
+ VPADDQ Y12, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFD $-79, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPSHUFB Y4, Y1, Y1
+ VPADDQ Y13, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFB Y5, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPADDQ Y1, Y1, Y10
+ VPSRLQ $0x3f, Y1, Y1
+ VPXOR Y10, Y1, Y1
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xc9
+ BYTE $0x39
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xd2
+ BYTE $0x4e
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xdb
+ BYTE $0x93
+ VPADDQ Y14, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFD $-79, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPSHUFB Y4, Y1, Y1
+ VPADDQ Y15, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFB Y5, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPADDQ Y1, Y1, Y10
+ VPSRLQ $0x3f, Y1, Y1
+ VPXOR Y10, Y1, Y1
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xdb
+ BYTE $0x39
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xd2
+ BYTE $0x4e
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xc9
+ BYTE $0x93
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x66
+ BYTE $0x38
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x68
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x18
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x58
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y12, Y12
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x6e
+ BYTE $0x48
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x60
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x08
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x70
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y13, Y13
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x76
+ BYTE $0x10
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x20
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x28
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x78
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y14, Y14
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x7e
+ BYTE $0x30
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x1e
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x50
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x40
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y15, Y15
+ VPADDQ Y12, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFD $-79, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPSHUFB Y4, Y1, Y1
+ VPADDQ Y13, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFB Y5, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPADDQ Y1, Y1, Y10
+ VPSRLQ $0x3f, Y1, Y1
+ VPXOR Y10, Y1, Y1
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xc9
+ BYTE $0x39
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xd2
+ BYTE $0x4e
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xdb
+ BYTE $0x93
+ VPADDQ Y14, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFD $-79, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPSHUFB Y4, Y1, Y1
+ VPADDQ Y15, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFB Y5, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPADDQ Y1, Y1, Y10
+ VPSRLQ $0x3f, Y1, Y1
+ VPXOR Y10, Y1, Y1
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xdb
+ BYTE $0x39
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xd2
+ BYTE $0x4e
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xc9
+ BYTE $0x93
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x66
+ BYTE $0x48
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x10
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x28
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x50
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y12, Y12
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x2e
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x20
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x38
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x78
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y13, Y13
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x76
+ BYTE $0x70
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x30
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x58
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x18
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y14, Y14
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x7e
+ BYTE $0x08
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x40
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x60
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x68
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y15, Y15
+ VPADDQ Y12, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFD $-79, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPSHUFB Y4, Y1, Y1
+ VPADDQ Y13, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFB Y5, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPADDQ Y1, Y1, Y10
+ VPSRLQ $0x3f, Y1, Y1
+ VPXOR Y10, Y1, Y1
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xc9
+ BYTE $0x39
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xd2
+ BYTE $0x4e
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xdb
+ BYTE $0x93
+ VPADDQ Y14, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFD $-79, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPSHUFB Y4, Y1, Y1
+ VPADDQ Y15, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFB Y5, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPADDQ Y1, Y1, Y10
+ VPSRLQ $0x3f, Y1, Y1
+ VPXOR Y10, Y1, Y1
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xdb
+ BYTE $0x39
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xd2
+ BYTE $0x4e
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xc9
+ BYTE $0x93
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x66
+ BYTE $0x10
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x1e
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x30
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x40
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y12, Y12
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x6e
+ BYTE $0x60
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x58
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x50
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x18
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y13, Y13
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x76
+ BYTE $0x20
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x78
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x38
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x08
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y14, Y14
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x7e
+ BYTE $0x68
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x70
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x28
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x48
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y15, Y15
+ VPADDQ Y12, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFD $-79, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPSHUFB Y4, Y1, Y1
+ VPADDQ Y13, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFB Y5, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPADDQ Y1, Y1, Y10
+ VPSRLQ $0x3f, Y1, Y1
+ VPXOR Y10, Y1, Y1
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xc9
+ BYTE $0x39
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xd2
+ BYTE $0x4e
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xdb
+ BYTE $0x93
+ VPADDQ Y14, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFD $-79, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPSHUFB Y4, Y1, Y1
+ VPADDQ Y15, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFB Y5, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPADDQ Y1, Y1, Y10
+ VPSRLQ $0x3f, Y1, Y1
+ VPXOR Y10, Y1, Y1
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xdb
+ BYTE $0x39
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xd2
+ BYTE $0x4e
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xc9
+ BYTE $0x93
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x66
+ BYTE $0x60
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x70
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x08
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x20
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y12, Y12
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x6e
+ BYTE $0x28
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x68
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x78
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x50
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y13, Y13
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x36
+ VPSHUFD $0x4e, 64(SI), X11
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x30
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y14, Y14
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x7e
+ BYTE $0x38
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x10
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x18
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x58
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y15, Y15
+ VPADDQ Y12, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFD $-79, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPSHUFB Y4, Y1, Y1
+ VPADDQ Y13, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFB Y5, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPADDQ Y1, Y1, Y10
+ VPSRLQ $0x3f, Y1, Y1
+ VPXOR Y10, Y1, Y1
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xc9
+ BYTE $0x39
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xd2
+ BYTE $0x4e
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xdb
+ BYTE $0x93
+ VPADDQ Y14, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFD $-79, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPSHUFB Y4, Y1, Y1
+ VPADDQ Y15, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFB Y5, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPADDQ Y1, Y1, Y10
+ VPSRLQ $0x3f, Y1, Y1
+ VPXOR Y10, Y1, Y1
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xdb
+ BYTE $0x39
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xd2
+ BYTE $0x4e
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xc9
+ BYTE $0x93
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x66
+ BYTE $0x68
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x60
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x38
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x18
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y12, Y12
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x6e
+ BYTE $0x58
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x08
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x70
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x48
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y13, Y13
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x76
+ BYTE $0x28
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x40
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x78
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x10
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y14, Y14
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x3e
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x30
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x20
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x50
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y15, Y15
+ VPADDQ Y12, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFD $-79, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPSHUFB Y4, Y1, Y1
+ VPADDQ Y13, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFB Y5, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPADDQ Y1, Y1, Y10
+ VPSRLQ $0x3f, Y1, Y1
+ VPXOR Y10, Y1, Y1
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xc9
+ BYTE $0x39
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xd2
+ BYTE $0x4e
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xdb
+ BYTE $0x93
+ VPADDQ Y14, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFD $-79, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPSHUFB Y4, Y1, Y1
+ VPADDQ Y15, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFB Y5, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPADDQ Y1, Y1, Y10
+ VPSRLQ $0x3f, Y1, Y1
+ VPXOR Y10, Y1, Y1
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xdb
+ BYTE $0x39
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xd2
+ BYTE $0x4e
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xc9
+ BYTE $0x93
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x66
+ BYTE $0x30
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x58
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x70
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x1e
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y12, Y12
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x6e
+ BYTE $0x78
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x18
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x48
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x40
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y13, Y13
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x08
+ VMOVDQU 96(SI), X14
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x50
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y14, Y14
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x7e
+ BYTE $0x10
+ VMOVDQU 32(SI), X11
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x38
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y15, Y15
+ VPADDQ Y12, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFD $-79, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPSHUFB Y4, Y1, Y1
+ VPADDQ Y13, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFB Y5, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPADDQ Y1, Y1, Y10
+ VPSRLQ $0x3f, Y1, Y1
+ VPXOR Y10, Y1, Y1
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xc9
+ BYTE $0x39
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xd2
+ BYTE $0x4e
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xdb
+ BYTE $0x93
+ VPADDQ Y14, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFD $-79, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPSHUFB Y4, Y1, Y1
+ VPADDQ Y15, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFB Y5, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPADDQ Y1, Y1, Y10
+ VPSRLQ $0x3f, Y1, Y1
+ VPXOR Y10, Y1, Y1
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xdb
+ BYTE $0x39
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xd2
+ BYTE $0x4e
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xc9
+ BYTE $0x93
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x66
+ BYTE $0x50
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x38
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x40
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x08
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y12, Y12
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x6e
+ BYTE $0x10
+ VPSHUFD $0x4e, 40(SI), X11
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x20
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y13, Y13
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x76
+ BYTE $0x78
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x18
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x48
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x5e
+ BYTE $0x68
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y14, Y14
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x7e
+ BYTE $0x58
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x5e
+ BYTE $0x60
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x70
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0xa1
+ BYTE $0x22
+ BYTE $0x1e
+ BYTE $0x01
+ VINSERTI128 $0x01, X11, Y15, Y15
+ VPADDQ Y12, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFD $-79, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPSHUFB Y4, Y1, Y1
+ VPADDQ Y13, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFB Y5, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPADDQ Y1, Y1, Y10
+ VPSRLQ $0x3f, Y1, Y1
+ VPXOR Y10, Y1, Y1
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xc9
+ BYTE $0x39
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xd2
+ BYTE $0x4e
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xdb
+ BYTE $0x93
+ VPADDQ Y14, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFD $-79, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPSHUFB Y4, Y1, Y1
+ VPADDQ Y15, Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFB Y5, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPADDQ Y1, Y1, Y10
+ VPSRLQ $0x3f, Y1, Y1
+ VPXOR Y10, Y1, Y1
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xdb
+ BYTE $0x39
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xd2
+ BYTE $0x4e
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xc9
+ BYTE $0x93
+ VPADDQ 32(DX), Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFD $-79, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPSHUFB Y4, Y1, Y1
+ VPADDQ 64(DX), Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFB Y5, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPADDQ Y1, Y1, Y10
+ VPSRLQ $0x3f, Y1, Y1
+ VPXOR Y10, Y1, Y1
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xc9
+ BYTE $0x39
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xd2
+ BYTE $0x4e
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xdb
+ BYTE $0x93
+ VPADDQ 96(DX), Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFD $-79, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPSHUFB Y4, Y1, Y1
+ VPADDQ 128(DX), Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFB Y5, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPADDQ Y1, Y1, Y10
+ VPSRLQ $0x3f, Y1, Y1
+ VPXOR Y10, Y1, Y1
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xdb
+ BYTE $0x39
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xd2
+ BYTE $0x4e
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xc9
+ BYTE $0x93
+ VPADDQ 160(DX), Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFD $-79, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPSHUFB Y4, Y1, Y1
+ VPADDQ 192(DX), Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFB Y5, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPADDQ Y1, Y1, Y10
+ VPSRLQ $0x3f, Y1, Y1
+ VPXOR Y10, Y1, Y1
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xc9
+ BYTE $0x39
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xd2
+ BYTE $0x4e
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xdb
+ BYTE $0x93
+ VPADDQ 224(DX), Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFD $-79, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPSHUFB Y4, Y1, Y1
+ VPADDQ 256(DX), Y0, Y0
+ VPADDQ Y1, Y0, Y0
+ VPXOR Y0, Y3, Y3
+ VPSHUFB Y5, Y3, Y3
+ VPADDQ Y3, Y2, Y2
+ VPXOR Y2, Y1, Y1
+ VPADDQ Y1, Y1, Y10
+ VPSRLQ $0x3f, Y1, Y1
+ VPXOR Y10, Y1, Y1
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xdb
+ BYTE $0x39
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xd2
+ BYTE $0x4e
+ BYTE $0xc4
+ BYTE $0xe3
+ BYTE $0xfd
+ BYTE $0x00
+ BYTE $0xc9
+ BYTE $0x93
+ VPXOR Y0, Y8, Y8
+ VPXOR Y1, Y9, Y9
+ VPXOR Y2, Y8, Y8
+ VPXOR Y3, Y9, Y9
+ LEAQ 128(SI), SI
+ SUBQ $0x80, DI
+ JNE loop
+ MOVQ R8, (BX)
+ MOVQ R9, 8(BX)
+ VMOVDQU Y8, (AX)
+ VMOVDQU Y9, 32(AX)
VZEROUPPER
-
RET
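// The unrolled body of hashBlocksAVX2 is the BLAKE2b G function on
// four words at a time: VPSHUFD $-79 ($0xb1) rotates each 64-bit word
// right by 32, the AVX2_c40/AVX2_c48 shuffles rotate right by 24 and
// 16, and the VPADDQ/VPSRLQ $0x3f/VPXOR triple rotates right by 63.
// The raw BYTE runs encode VPERMQ/VMOVQ/VPINSRQ forms (compare the
// byte-encoded VPERMQ_*/VMOVQ_SI_* macros removed from this file) as
// literal machine code. A minimal Go sketch of one G step, with
// message words mx, my and math/bits imported (an illustration, not
// part of the package):
//
//	func g(a, b, c, d, mx, my uint64) (uint64, uint64, uint64, uint64) {
//		a += b + mx
//		d = bits.RotateLeft64(d^a, -32) // negative count = rotate right
//		c += d
//		b = bits.RotateLeft64(b^c, -24)
//		a += b + my
//		d = bits.RotateLeft64(d^a, -16)
//		c += d
//		b = bits.RotateLeft64(b^c, -63)
//		return a, b, c, d
//	}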
-#define VPUNPCKLQDQ_X2_X2_X15 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xFA
-#define VPUNPCKLQDQ_X3_X3_X15 BYTE $0xC5; BYTE $0x61; BYTE $0x6C; BYTE $0xFB
-#define VPUNPCKLQDQ_X7_X7_X15 BYTE $0xC5; BYTE $0x41; BYTE $0x6C; BYTE $0xFF
-#define VPUNPCKLQDQ_X13_X13_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x11; BYTE $0x6C; BYTE $0xFD
-#define VPUNPCKLQDQ_X14_X14_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x09; BYTE $0x6C; BYTE $0xFE
-
-#define VPUNPCKHQDQ_X15_X2_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x69; BYTE $0x6D; BYTE $0xD7
-#define VPUNPCKHQDQ_X15_X3_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xDF
-#define VPUNPCKHQDQ_X15_X6_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x49; BYTE $0x6D; BYTE $0xF7
-#define VPUNPCKHQDQ_X15_X7_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xFF
-#define VPUNPCKHQDQ_X15_X3_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xD7
-#define VPUNPCKHQDQ_X15_X7_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xF7
-#define VPUNPCKHQDQ_X15_X13_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xDF
-#define VPUNPCKHQDQ_X15_X13_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xFF
-
-#define SHUFFLE_AVX() \
- VMOVDQA X6, X13; \
- VMOVDQA X2, X14; \
- VMOVDQA X4, X6; \
- VPUNPCKLQDQ_X13_X13_X15; \
- VMOVDQA X5, X4; \
- VMOVDQA X6, X5; \
- VPUNPCKHQDQ_X15_X7_X6; \
- VPUNPCKLQDQ_X7_X7_X15; \
- VPUNPCKHQDQ_X15_X13_X7; \
- VPUNPCKLQDQ_X3_X3_X15; \
- VPUNPCKHQDQ_X15_X2_X2; \
- VPUNPCKLQDQ_X14_X14_X15; \
- VPUNPCKHQDQ_X15_X3_X3; \
-
-#define SHUFFLE_AVX_INV() \
- VMOVDQA X2, X13; \
- VMOVDQA X4, X14; \
- VPUNPCKLQDQ_X2_X2_X15; \
- VMOVDQA X5, X4; \
- VPUNPCKHQDQ_X15_X3_X2; \
- VMOVDQA X14, X5; \
- VPUNPCKLQDQ_X3_X3_X15; \
- VMOVDQA X6, X14; \
- VPUNPCKHQDQ_X15_X13_X3; \
- VPUNPCKLQDQ_X7_X7_X15; \
- VPUNPCKHQDQ_X15_X6_X6; \
- VPUNPCKLQDQ_X14_X14_X15; \
- VPUNPCKHQDQ_X15_X7_X7; \
-
-#define HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
- VPADDQ m0, v0, v0; \
- VPADDQ v2, v0, v0; \
- VPADDQ m1, v1, v1; \
- VPADDQ v3, v1, v1; \
- VPXOR v0, v6, v6; \
- VPXOR v1, v7, v7; \
- VPSHUFD $-79, v6, v6; \
- VPSHUFD $-79, v7, v7; \
- VPADDQ v6, v4, v4; \
- VPADDQ v7, v5, v5; \
- VPXOR v4, v2, v2; \
- VPXOR v5, v3, v3; \
- VPSHUFB c40, v2, v2; \
- VPSHUFB c40, v3, v3; \
- VPADDQ m2, v0, v0; \
- VPADDQ v2, v0, v0; \
- VPADDQ m3, v1, v1; \
- VPADDQ v3, v1, v1; \
- VPXOR v0, v6, v6; \
- VPXOR v1, v7, v7; \
- VPSHUFB c48, v6, v6; \
- VPSHUFB c48, v7, v7; \
- VPADDQ v6, v4, v4; \
- VPADDQ v7, v5, v5; \
- VPXOR v4, v2, v2; \
- VPXOR v5, v3, v3; \
- VPADDQ v2, v2, t0; \
- VPSRLQ $63, v2, v2; \
- VPXOR t0, v2, v2; \
- VPADDQ v3, v3, t0; \
- VPSRLQ $63, v3, v3; \
- VPXOR t0, v3, v3
-
-// load msg: X12 = (i0, i1), X13 = (i2, i3), X14 = (i4, i5), X15 = (i6, i7)
-// i0, i1, i2, i3, i4, i5, i6, i7 must not be 0
-#define LOAD_MSG_AVX(i0, i1, i2, i3, i4, i5, i6, i7) \
- VMOVQ_SI_X12(i0*8); \
- VMOVQ_SI_X13(i2*8); \
- VMOVQ_SI_X14(i4*8); \
- VMOVQ_SI_X15(i6*8); \
- VPINSRQ_1_SI_X12(i1*8); \
- VPINSRQ_1_SI_X13(i3*8); \
- VPINSRQ_1_SI_X14(i5*8); \
- VPINSRQ_1_SI_X15(i7*8)
-
-// load msg: X12 = (0, 2), X13 = (4, 6), X14 = (1, 3), X15 = (5, 7)
-#define LOAD_MSG_AVX_0_2_4_6_1_3_5_7() \
- VMOVQ_SI_X12_0; \
- VMOVQ_SI_X13(4*8); \
- VMOVQ_SI_X14(1*8); \
- VMOVQ_SI_X15(5*8); \
- VPINSRQ_1_SI_X12(2*8); \
- VPINSRQ_1_SI_X13(6*8); \
- VPINSRQ_1_SI_X14(3*8); \
- VPINSRQ_1_SI_X15(7*8)
-
-// load msg: X12 = (1, 0), X13 = (11, 5), X14 = (12, 2), X15 = (7, 3)
-#define LOAD_MSG_AVX_1_0_11_5_12_2_7_3() \
- VPSHUFD $0x4E, 0*8(SI), X12; \
- VMOVQ_SI_X13(11*8); \
- VMOVQ_SI_X14(12*8); \
- VMOVQ_SI_X15(7*8); \
- VPINSRQ_1_SI_X13(5*8); \
- VPINSRQ_1_SI_X14(2*8); \
- VPINSRQ_1_SI_X15(3*8)
-
-// load msg: X12 = (11, 12), X13 = (5, 15), X14 = (8, 0), X15 = (2, 13)
-#define LOAD_MSG_AVX_11_12_5_15_8_0_2_13() \
- VMOVDQU 11*8(SI), X12; \
- VMOVQ_SI_X13(5*8); \
- VMOVQ_SI_X14(8*8); \
- VMOVQ_SI_X15(2*8); \
- VPINSRQ_1_SI_X13(15*8); \
- VPINSRQ_1_SI_X14_0; \
- VPINSRQ_1_SI_X15(13*8)
-
-// load msg: X12 = (2, 5), X13 = (4, 15), X14 = (6, 10), X15 = (0, 8)
-#define LOAD_MSG_AVX_2_5_4_15_6_10_0_8() \
- VMOVQ_SI_X12(2*8); \
- VMOVQ_SI_X13(4*8); \
- VMOVQ_SI_X14(6*8); \
- VMOVQ_SI_X15_0; \
- VPINSRQ_1_SI_X12(5*8); \
- VPINSRQ_1_SI_X13(15*8); \
- VPINSRQ_1_SI_X14(10*8); \
- VPINSRQ_1_SI_X15(8*8)
+DATA ·AVX2_c40<>+0(SB)/8, $0x0201000706050403
+DATA ·AVX2_c40<>+8(SB)/8, $0x0a09080f0e0d0c0b
+DATA ·AVX2_c40<>+16(SB)/8, $0x0201000706050403
+DATA ·AVX2_c40<>+24(SB)/8, $0x0a09080f0e0d0c0b
+GLOBL ·AVX2_c40<>(SB), RODATA|NOPTR, $32
-// load msg: X12 = (9, 5), X13 = (2, 10), X14 = (0, 7), X15 = (4, 15)
-#define LOAD_MSG_AVX_9_5_2_10_0_7_4_15() \
- VMOVQ_SI_X12(9*8); \
- VMOVQ_SI_X13(2*8); \
- VMOVQ_SI_X14_0; \
- VMOVQ_SI_X15(4*8); \
- VPINSRQ_1_SI_X12(5*8); \
- VPINSRQ_1_SI_X13(10*8); \
- VPINSRQ_1_SI_X14(7*8); \
- VPINSRQ_1_SI_X15(15*8)
+DATA ·AVX2_c48<>+0(SB)/8, $0x0100070605040302
+DATA ·AVX2_c48<>+8(SB)/8, $0x09080f0e0d0c0b0a
+DATA ·AVX2_c48<>+16(SB)/8, $0x0100070605040302
+DATA ·AVX2_c48<>+24(SB)/8, $0x09080f0e0d0c0b0a
+GLOBL ·AVX2_c48<>(SB), RODATA|NOPTR, $32
-// load msg: X12 = (2, 6), X13 = (0, 8), X14 = (12, 10), X15 = (11, 3)
-#define LOAD_MSG_AVX_2_6_0_8_12_10_11_3() \
- VMOVQ_SI_X12(2*8); \
- VMOVQ_SI_X13_0; \
- VMOVQ_SI_X14(12*8); \
- VMOVQ_SI_X15(11*8); \
- VPINSRQ_1_SI_X12(6*8); \
- VPINSRQ_1_SI_X13(8*8); \
- VPINSRQ_1_SI_X14(10*8); \
- VPINSRQ_1_SI_X15(3*8)
+DATA ·AVX2_iv0<>+0(SB)/8, $0x6a09e667f3bcc908
+DATA ·AVX2_iv0<>+8(SB)/8, $0xbb67ae8584caa73b
+DATA ·AVX2_iv0<>+16(SB)/8, $0x3c6ef372fe94f82b
+DATA ·AVX2_iv0<>+24(SB)/8, $0xa54ff53a5f1d36f1
+GLOBL ·AVX2_iv0<>(SB), RODATA|NOPTR, $32
-// load msg: X12 = (0, 6), X13 = (9, 8), X14 = (7, 3), X15 = (2, 11)
-#define LOAD_MSG_AVX_0_6_9_8_7_3_2_11() \
- MOVQ 0*8(SI), X12; \
- VPSHUFD $0x4E, 8*8(SI), X13; \
- MOVQ 7*8(SI), X14; \
- MOVQ 2*8(SI), X15; \
- VPINSRQ_1_SI_X12(6*8); \
- VPINSRQ_1_SI_X14(3*8); \
- VPINSRQ_1_SI_X15(11*8)
-
-// load msg: X12 = (6, 14), X13 = (11, 0), X14 = (15, 9), X15 = (3, 8)
-#define LOAD_MSG_AVX_6_14_11_0_15_9_3_8() \
- MOVQ 6*8(SI), X12; \
- MOVQ 11*8(SI), X13; \
- MOVQ 15*8(SI), X14; \
- MOVQ 3*8(SI), X15; \
- VPINSRQ_1_SI_X12(14*8); \
- VPINSRQ_1_SI_X13_0; \
- VPINSRQ_1_SI_X14(9*8); \
- VPINSRQ_1_SI_X15(8*8)
-
-// load msg: X12 = (5, 15), X13 = (8, 2), X14 = (0, 4), X15 = (6, 10)
-#define LOAD_MSG_AVX_5_15_8_2_0_4_6_10() \
- MOVQ 5*8(SI), X12; \
- MOVQ 8*8(SI), X13; \
- MOVQ 0*8(SI), X14; \
- MOVQ 6*8(SI), X15; \
- VPINSRQ_1_SI_X12(15*8); \
- VPINSRQ_1_SI_X13(2*8); \
- VPINSRQ_1_SI_X14(4*8); \
- VPINSRQ_1_SI_X15(10*8)
-
-// load msg: X12 = (12, 13), X13 = (1, 10), X14 = (2, 7), X15 = (4, 5)
-#define LOAD_MSG_AVX_12_13_1_10_2_7_4_5() \
- VMOVDQU 12*8(SI), X12; \
- MOVQ 1*8(SI), X13; \
- MOVQ 2*8(SI), X14; \
- VPINSRQ_1_SI_X13(10*8); \
- VPINSRQ_1_SI_X14(7*8); \
- VMOVDQU 4*8(SI), X15
-
-// load msg: X12 = (15, 9), X13 = (3, 13), X14 = (11, 14), X15 = (12, 0)
-#define LOAD_MSG_AVX_15_9_3_13_11_14_12_0() \
- MOVQ 15*8(SI), X12; \
- MOVQ 3*8(SI), X13; \
- MOVQ 11*8(SI), X14; \
- MOVQ 12*8(SI), X15; \
- VPINSRQ_1_SI_X12(9*8); \
- VPINSRQ_1_SI_X13(13*8); \
- VPINSRQ_1_SI_X14(14*8); \
- VPINSRQ_1_SI_X15_0
+DATA ·AVX2_iv1<>+0(SB)/8, $0x510e527fade682d1
+DATA ·AVX2_iv1<>+8(SB)/8, $0x9b05688c2b3e6c1f
+DATA ·AVX2_iv1<>+16(SB)/8, $0x1f83d9abfb41bd6b
+DATA ·AVX2_iv1<>+24(SB)/8, $0x5be0cd19137e2179
+GLOBL ·AVX2_iv1<>(SB), RODATA|NOPTR, $32
// func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
-TEXT ·hashBlocksAVX(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
- MOVQ h+0(FP), AX
- MOVQ c+8(FP), BX
- MOVQ flag+16(FP), CX
- MOVQ blocks_base+24(FP), SI
- MOVQ blocks_len+32(FP), DI
-
- MOVQ SP, R10
- ADDQ $15, R10
- ANDQ $~15, R10
-
- VMOVDQU ·AVX_c40<>(SB), X0
- VMOVDQU ·AVX_c48<>(SB), X1
+// Requires: AVX, SSE2
+TEXT ·hashBlocksAVX(SB), NOSPLIT, $288-48
+ MOVQ h+0(FP), AX
+ MOVQ c+8(FP), BX
+ MOVQ flag+16(FP), CX
+ MOVQ blocks_base+24(FP), SI
+ MOVQ blocks_len+32(FP), DI
+ MOVQ SP, R10
+ ADDQ $0x0f, R10
+ ANDQ $-16, R10
+ VMOVDQU ·AVX_c40<>+0(SB), X0
+ VMOVDQU ·AVX_c48<>+0(SB), X1
VMOVDQA X0, X8
VMOVDQA X1, X9
-
- VMOVDQU ·AVX_iv3<>(SB), X0
- VMOVDQA X0, 0(R10)
- XORQ CX, 0(R10) // 0(R10) = ·AVX_iv3 ^ (CX || 0)
-
- VMOVDQU 0(AX), X10
+ VMOVDQU ·AVX_iv3<>+0(SB), X0
+ VMOVDQA X0, (R10)
+ XORQ CX, (R10)
+ VMOVDQU (AX), X10
VMOVDQU 16(AX), X11
VMOVDQU 32(AX), X2
VMOVDQU 48(AX), X3
-
- MOVQ 0(BX), R8
- MOVQ 8(BX), R9
+ MOVQ (BX), R8
+ MOVQ 8(BX), R9
loop:
- ADDQ $128, R8
- CMPQ R8, $128
+ ADDQ $0x80, R8
+ CMPQ R8, $0x80
JGE noinc
INCQ R9
noinc:
- VMOVQ_R8_X15
- VPINSRQ_1_R9_X15
-
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0xf9
+ BYTE $0x6e
+ BYTE $0xf8
+ BYTE $0xc4
+ BYTE $0x43
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0xf9
+ BYTE $0x01
VMOVDQA X10, X0
VMOVDQA X11, X1
- VMOVDQU ·AVX_iv0<>(SB), X4
- VMOVDQU ·AVX_iv1<>(SB), X5
- VMOVDQU ·AVX_iv2<>(SB), X6
-
+ VMOVDQU ·AVX_iv0<>+0(SB), X4
+ VMOVDQU ·AVX_iv1<>+0(SB), X5
+ VMOVDQU ·AVX_iv2<>+0(SB), X6
VPXOR X15, X6, X6
- VMOVDQA 0(R10), X7
-
- LOAD_MSG_AVX_0_2_4_6_1_3_5_7()
+ VMOVDQA (R10), X7
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x26
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x6e
+ BYTE $0x20
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x76
+ BYTE $0x08
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x7e
+ BYTE $0x28
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x10
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x30
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x18
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x38
+ BYTE $0x01
VMOVDQA X12, 16(R10)
VMOVDQA X13, 32(R10)
VMOVDQA X14, 48(R10)
VMOVDQA X15, 64(R10)
- HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
- SHUFFLE_AVX()
- LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15)
+ VPADDQ X12, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X13, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFD $-79, X6, X6
+ VPSHUFD $-79, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPSHUFB X8, X2, X2
+ VPSHUFB X8, X3, X3
+ VPADDQ X14, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X15, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFB X9, X6, X6
+ VPSHUFB X9, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPADDQ X2, X2, X15
+ VPSRLQ $0x3f, X2, X2
+ VPXOR X15, X2, X2
+ VPADDQ X3, X3, X15
+ VPSRLQ $0x3f, X3, X3
+ VPXOR X15, X3, X3
+ VMOVDQA X6, X13
+ VMOVDQA X2, X14
+ VMOVDQA X4, X6
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x11
+ BYTE $0x6c
+ BYTE $0xfd
+ VMOVDQA X5, X4
+ VMOVDQA X6, X5
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x41
+ BYTE $0x6d
+ BYTE $0xf7
+ BYTE $0xc5
+ BYTE $0x41
+ BYTE $0x6c
+ BYTE $0xff
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x11
+ BYTE $0x6d
+ BYTE $0xff
+ BYTE $0xc5
+ BYTE $0x61
+ BYTE $0x6c
+ BYTE $0xfb
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x69
+ BYTE $0x6d
+ BYTE $0xd7
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x09
+ BYTE $0x6c
+ BYTE $0xfe
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x61
+ BYTE $0x6d
+ BYTE $0xdf
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x66
+ BYTE $0x40
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x6e
+ BYTE $0x60
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x76
+ BYTE $0x48
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x7e
+ BYTE $0x68
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x50
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x70
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x58
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x78
+ BYTE $0x01
VMOVDQA X12, 80(R10)
VMOVDQA X13, 96(R10)
VMOVDQA X14, 112(R10)
VMOVDQA X15, 128(R10)
- HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
- SHUFFLE_AVX_INV()
-
- LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6)
+ VPADDQ X12, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X13, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFD $-79, X6, X6
+ VPSHUFD $-79, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPSHUFB X8, X2, X2
+ VPSHUFB X8, X3, X3
+ VPADDQ X14, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X15, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFB X9, X6, X6
+ VPSHUFB X9, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPADDQ X2, X2, X15
+ VPSRLQ $0x3f, X2, X2
+ VPXOR X15, X2, X2
+ VPADDQ X3, X3, X15
+ VPSRLQ $0x3f, X3, X3
+ VPXOR X15, X3, X3
+ VMOVDQA X2, X13
+ VMOVDQA X4, X14
+ BYTE $0xc5
+ BYTE $0x69
+ BYTE $0x6c
+ BYTE $0xfa
+ VMOVDQA X5, X4
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x61
+ BYTE $0x6d
+ BYTE $0xd7
+ VMOVDQA X14, X5
+ BYTE $0xc5
+ BYTE $0x61
+ BYTE $0x6c
+ BYTE $0xfb
+ VMOVDQA X6, X14
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x11
+ BYTE $0x6d
+ BYTE $0xdf
+ BYTE $0xc5
+ BYTE $0x41
+ BYTE $0x6c
+ BYTE $0xff
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x49
+ BYTE $0x6d
+ BYTE $0xf7
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x09
+ BYTE $0x6c
+ BYTE $0xfe
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x41
+ BYTE $0x6d
+ BYTE $0xff
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x66
+ BYTE $0x70
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x6e
+ BYTE $0x48
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x76
+ BYTE $0x50
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x7e
+ BYTE $0x78
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x20
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x68
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x40
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x30
+ BYTE $0x01
VMOVDQA X12, 144(R10)
VMOVDQA X13, 160(R10)
VMOVDQA X14, 176(R10)
VMOVDQA X15, 192(R10)
- HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
- SHUFFLE_AVX()
- LOAD_MSG_AVX_1_0_11_5_12_2_7_3()
+ VPADDQ X12, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X13, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFD $-79, X6, X6
+ VPSHUFD $-79, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPSHUFB X8, X2, X2
+ VPSHUFB X8, X3, X3
+ VPADDQ X14, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X15, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFB X9, X6, X6
+ VPSHUFB X9, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPADDQ X2, X2, X15
+ VPSRLQ $0x3f, X2, X2
+ VPXOR X15, X2, X2
+ VPADDQ X3, X3, X15
+ VPSRLQ $0x3f, X3, X3
+ VPXOR X15, X3, X3
+ VMOVDQA X6, X13
+ VMOVDQA X2, X14
+ VMOVDQA X4, X6
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x11
+ BYTE $0x6c
+ BYTE $0xfd
+ VMOVDQA X5, X4
+ VMOVDQA X6, X5
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x41
+ BYTE $0x6d
+ BYTE $0xf7
+ BYTE $0xc5
+ BYTE $0x41
+ BYTE $0x6c
+ BYTE $0xff
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x11
+ BYTE $0x6d
+ BYTE $0xff
+ BYTE $0xc5
+ BYTE $0x61
+ BYTE $0x6c
+ BYTE $0xfb
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x69
+ BYTE $0x6d
+ BYTE $0xd7
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x09
+ BYTE $0x6c
+ BYTE $0xfe
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x61
+ BYTE $0x6d
+ BYTE $0xdf
+ VPSHUFD $0x4e, (SI), X12
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x6e
+ BYTE $0x58
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x76
+ BYTE $0x60
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x7e
+ BYTE $0x38
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x28
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x10
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x18
+ BYTE $0x01
VMOVDQA X12, 208(R10)
VMOVDQA X13, 224(R10)
VMOVDQA X14, 240(R10)
VMOVDQA X15, 256(R10)
- HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
- SHUFFLE_AVX_INV()
-
- LOAD_MSG_AVX_11_12_5_15_8_0_2_13()
- HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
- SHUFFLE_AVX()
- LOAD_MSG_AVX(10, 3, 7, 9, 14, 6, 1, 4)
- HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
- SHUFFLE_AVX_INV()
-
- LOAD_MSG_AVX(7, 3, 13, 11, 9, 1, 12, 14)
- HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
- SHUFFLE_AVX()
- LOAD_MSG_AVX_2_5_4_15_6_10_0_8()
- HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
- SHUFFLE_AVX_INV()
-
- LOAD_MSG_AVX_9_5_2_10_0_7_4_15()
- HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
- SHUFFLE_AVX()
- LOAD_MSG_AVX(14, 11, 6, 3, 1, 12, 8, 13)
- HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
- SHUFFLE_AVX_INV()
-
- LOAD_MSG_AVX_2_6_0_8_12_10_11_3()
- HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
- SHUFFLE_AVX()
- LOAD_MSG_AVX(4, 7, 15, 1, 13, 5, 14, 9)
- HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
- SHUFFLE_AVX_INV()
-
- LOAD_MSG_AVX(12, 1, 14, 4, 5, 15, 13, 10)
- HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
- SHUFFLE_AVX()
- LOAD_MSG_AVX_0_6_9_8_7_3_2_11()
- HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
- SHUFFLE_AVX_INV()
-
- LOAD_MSG_AVX(13, 7, 12, 3, 11, 14, 1, 9)
- HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
- SHUFFLE_AVX()
- LOAD_MSG_AVX_5_15_8_2_0_4_6_10()
- HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
- SHUFFLE_AVX_INV()
-
- LOAD_MSG_AVX_6_14_11_0_15_9_3_8()
- HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
- SHUFFLE_AVX()
- LOAD_MSG_AVX_12_13_1_10_2_7_4_5()
- HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
- SHUFFLE_AVX_INV()
-
- LOAD_MSG_AVX(10, 8, 7, 1, 2, 4, 6, 5)
- HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
- SHUFFLE_AVX()
- LOAD_MSG_AVX_15_9_3_13_11_14_12_0()
- HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
- SHUFFLE_AVX_INV()
-
- HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X15, X8, X9)
- SHUFFLE_AVX()
- HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X15, X8, X9)
- SHUFFLE_AVX_INV()
-
- HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X15, X8, X9)
- SHUFFLE_AVX()
- HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X15, X8, X9)
- SHUFFLE_AVX_INV()
-
+ VPADDQ X12, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X13, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFD $-79, X6, X6
+ VPSHUFD $-79, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPSHUFB X8, X2, X2
+ VPSHUFB X8, X3, X3
+ VPADDQ X14, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X15, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFB X9, X6, X6
+ VPSHUFB X9, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPADDQ X2, X2, X15
+ VPSRLQ $0x3f, X2, X2
+ VPXOR X15, X2, X2
+ VPADDQ X3, X3, X15
+ VPSRLQ $0x3f, X3, X3
+ VPXOR X15, X3, X3
+ VMOVDQA X2, X13
+ VMOVDQA X4, X14
+ BYTE $0xc5
+ BYTE $0x69
+ BYTE $0x6c
+ BYTE $0xfa
+ VMOVDQA X5, X4
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x61
+ BYTE $0x6d
+ BYTE $0xd7
+ VMOVDQA X14, X5
+ BYTE $0xc5
+ BYTE $0x61
+ BYTE $0x6c
+ BYTE $0xfb
+ VMOVDQA X6, X14
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x11
+ BYTE $0x6d
+ BYTE $0xdf
+ BYTE $0xc5
+ BYTE $0x41
+ BYTE $0x6c
+ BYTE $0xff
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x49
+ BYTE $0x6d
+ BYTE $0xf7
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x09
+ BYTE $0x6c
+ BYTE $0xfe
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x41
+ BYTE $0x6d
+ BYTE $0xff
+ VMOVDQU 88(SI), X12
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x6e
+ BYTE $0x28
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x76
+ BYTE $0x40
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x7e
+ BYTE $0x10
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x78
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x36
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x68
+ BYTE $0x01
+ VPADDQ X12, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X13, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFD $-79, X6, X6
+ VPSHUFD $-79, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPSHUFB X8, X2, X2
+ VPSHUFB X8, X3, X3
+ VPADDQ X14, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X15, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFB X9, X6, X6
+ VPSHUFB X9, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPADDQ X2, X2, X15
+ VPSRLQ $0x3f, X2, X2
+ VPXOR X15, X2, X2
+ VPADDQ X3, X3, X15
+ VPSRLQ $0x3f, X3, X3
+ VPXOR X15, X3, X3
+ VMOVDQA X6, X13
+ VMOVDQA X2, X14
+ VMOVDQA X4, X6
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x11
+ BYTE $0x6c
+ BYTE $0xfd
+ VMOVDQA X5, X4
+ VMOVDQA X6, X5
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x41
+ BYTE $0x6d
+ BYTE $0xf7
+ BYTE $0xc5
+ BYTE $0x41
+ BYTE $0x6c
+ BYTE $0xff
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x11
+ BYTE $0x6d
+ BYTE $0xff
+ BYTE $0xc5
+ BYTE $0x61
+ BYTE $0x6c
+ BYTE $0xfb
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x69
+ BYTE $0x6d
+ BYTE $0xd7
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x09
+ BYTE $0x6c
+ BYTE $0xfe
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x61
+ BYTE $0x6d
+ BYTE $0xdf
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x66
+ BYTE $0x50
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x6e
+ BYTE $0x38
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x76
+ BYTE $0x70
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x7e
+ BYTE $0x08
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x18
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x48
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x30
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x20
+ BYTE $0x01
+ VPADDQ X12, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X13, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFD $-79, X6, X6
+ VPSHUFD $-79, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPSHUFB X8, X2, X2
+ VPSHUFB X8, X3, X3
+ VPADDQ X14, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X15, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFB X9, X6, X6
+ VPSHUFB X9, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPADDQ X2, X2, X15
+ VPSRLQ $0x3f, X2, X2
+ VPXOR X15, X2, X2
+ VPADDQ X3, X3, X15
+ VPSRLQ $0x3f, X3, X3
+ VPXOR X15, X3, X3
+ VMOVDQA X2, X13
+ VMOVDQA X4, X14
+ BYTE $0xc5
+ BYTE $0x69
+ BYTE $0x6c
+ BYTE $0xfa
+ VMOVDQA X5, X4
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x61
+ BYTE $0x6d
+ BYTE $0xd7
+ VMOVDQA X14, X5
+ BYTE $0xc5
+ BYTE $0x61
+ BYTE $0x6c
+ BYTE $0xfb
+ VMOVDQA X6, X14
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x11
+ BYTE $0x6d
+ BYTE $0xdf
+ BYTE $0xc5
+ BYTE $0x41
+ BYTE $0x6c
+ BYTE $0xff
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x49
+ BYTE $0x6d
+ BYTE $0xf7
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x09
+ BYTE $0x6c
+ BYTE $0xfe
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x41
+ BYTE $0x6d
+ BYTE $0xff
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x66
+ BYTE $0x38
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x6e
+ BYTE $0x68
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x76
+ BYTE $0x48
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x7e
+ BYTE $0x60
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x18
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x58
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x08
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x70
+ BYTE $0x01
+ VPADDQ X12, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X13, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFD $-79, X6, X6
+ VPSHUFD $-79, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPSHUFB X8, X2, X2
+ VPSHUFB X8, X3, X3
+ VPADDQ X14, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X15, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFB X9, X6, X6
+ VPSHUFB X9, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPADDQ X2, X2, X15
+ VPSRLQ $0x3f, X2, X2
+ VPXOR X15, X2, X2
+ VPADDQ X3, X3, X15
+ VPSRLQ $0x3f, X3, X3
+ VPXOR X15, X3, X3
+ VMOVDQA X6, X13
+ VMOVDQA X2, X14
+ VMOVDQA X4, X6
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x11
+ BYTE $0x6c
+ BYTE $0xfd
+ VMOVDQA X5, X4
+ VMOVDQA X6, X5
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x41
+ BYTE $0x6d
+ BYTE $0xf7
+ BYTE $0xc5
+ BYTE $0x41
+ BYTE $0x6c
+ BYTE $0xff
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x11
+ BYTE $0x6d
+ BYTE $0xff
+ BYTE $0xc5
+ BYTE $0x61
+ BYTE $0x6c
+ BYTE $0xfb
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x69
+ BYTE $0x6d
+ BYTE $0xd7
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x09
+ BYTE $0x6c
+ BYTE $0xfe
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x61
+ BYTE $0x6d
+ BYTE $0xdf
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x66
+ BYTE $0x10
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x6e
+ BYTE $0x20
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x76
+ BYTE $0x30
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x3e
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x28
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x78
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x50
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x40
+ BYTE $0x01
+ VPADDQ X12, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X13, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFD $-79, X6, X6
+ VPSHUFD $-79, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPSHUFB X8, X2, X2
+ VPSHUFB X8, X3, X3
+ VPADDQ X14, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X15, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFB X9, X6, X6
+ VPSHUFB X9, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPADDQ X2, X2, X15
+ VPSRLQ $0x3f, X2, X2
+ VPXOR X15, X2, X2
+ VPADDQ X3, X3, X15
+ VPSRLQ $0x3f, X3, X3
+ VPXOR X15, X3, X3
+ VMOVDQA X2, X13
+ VMOVDQA X4, X14
+ BYTE $0xc5
+ BYTE $0x69
+ BYTE $0x6c
+ BYTE $0xfa
+ VMOVDQA X5, X4
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x61
+ BYTE $0x6d
+ BYTE $0xd7
+ VMOVDQA X14, X5
+ BYTE $0xc5
+ BYTE $0x61
+ BYTE $0x6c
+ BYTE $0xfb
+ VMOVDQA X6, X14
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x11
+ BYTE $0x6d
+ BYTE $0xdf
+ BYTE $0xc5
+ BYTE $0x41
+ BYTE $0x6c
+ BYTE $0xff
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x49
+ BYTE $0x6d
+ BYTE $0xf7
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x09
+ BYTE $0x6c
+ BYTE $0xfe
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x41
+ BYTE $0x6d
+ BYTE $0xff
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x66
+ BYTE $0x48
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x6e
+ BYTE $0x10
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x36
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x7e
+ BYTE $0x20
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x28
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x50
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x38
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x78
+ BYTE $0x01
+ VPADDQ X12, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X13, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFD $-79, X6, X6
+ VPSHUFD $-79, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPSHUFB X8, X2, X2
+ VPSHUFB X8, X3, X3
+ VPADDQ X14, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X15, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFB X9, X6, X6
+ VPSHUFB X9, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPADDQ X2, X2, X15
+ VPSRLQ $0x3f, X2, X2
+ VPXOR X15, X2, X2
+ VPADDQ X3, X3, X15
+ VPSRLQ $0x3f, X3, X3
+ VPXOR X15, X3, X3
+ VMOVDQA X6, X13
+ VMOVDQA X2, X14
+ VMOVDQA X4, X6
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x11
+ BYTE $0x6c
+ BYTE $0xfd
+ VMOVDQA X5, X4
+ VMOVDQA X6, X5
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x41
+ BYTE $0x6d
+ BYTE $0xf7
+ BYTE $0xc5
+ BYTE $0x41
+ BYTE $0x6c
+ BYTE $0xff
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x11
+ BYTE $0x6d
+ BYTE $0xff
+ BYTE $0xc5
+ BYTE $0x61
+ BYTE $0x6c
+ BYTE $0xfb
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x69
+ BYTE $0x6d
+ BYTE $0xd7
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x09
+ BYTE $0x6c
+ BYTE $0xfe
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x61
+ BYTE $0x6d
+ BYTE $0xdf
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x66
+ BYTE $0x70
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x6e
+ BYTE $0x30
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x76
+ BYTE $0x08
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x7e
+ BYTE $0x40
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x58
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x18
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x60
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x68
+ BYTE $0x01
+ VPADDQ X12, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X13, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFD $-79, X6, X6
+ VPSHUFD $-79, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPSHUFB X8, X2, X2
+ VPSHUFB X8, X3, X3
+ VPADDQ X14, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X15, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFB X9, X6, X6
+ VPSHUFB X9, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPADDQ X2, X2, X15
+ VPSRLQ $0x3f, X2, X2
+ VPXOR X15, X2, X2
+ VPADDQ X3, X3, X15
+ VPSRLQ $0x3f, X3, X3
+ VPXOR X15, X3, X3
+ VMOVDQA X2, X13
+ VMOVDQA X4, X14
+ BYTE $0xc5
+ BYTE $0x69
+ BYTE $0x6c
+ BYTE $0xfa
+ VMOVDQA X5, X4
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x61
+ BYTE $0x6d
+ BYTE $0xd7
+ VMOVDQA X14, X5
+ BYTE $0xc5
+ BYTE $0x61
+ BYTE $0x6c
+ BYTE $0xfb
+ VMOVDQA X6, X14
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x11
+ BYTE $0x6d
+ BYTE $0xdf
+ BYTE $0xc5
+ BYTE $0x41
+ BYTE $0x6c
+ BYTE $0xff
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x49
+ BYTE $0x6d
+ BYTE $0xf7
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x09
+ BYTE $0x6c
+ BYTE $0xfe
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x41
+ BYTE $0x6d
+ BYTE $0xff
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x66
+ BYTE $0x10
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x2e
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x76
+ BYTE $0x60
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x7e
+ BYTE $0x58
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x30
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x40
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x50
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x18
+ BYTE $0x01
+ VPADDQ X12, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X13, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFD $-79, X6, X6
+ VPSHUFD $-79, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPSHUFB X8, X2, X2
+ VPSHUFB X8, X3, X3
+ VPADDQ X14, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X15, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFB X9, X6, X6
+ VPSHUFB X9, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPADDQ X2, X2, X15
+ VPSRLQ $0x3f, X2, X2
+ VPXOR X15, X2, X2
+ VPADDQ X3, X3, X15
+ VPSRLQ $0x3f, X3, X3
+ VPXOR X15, X3, X3
+ VMOVDQA X6, X13
+ VMOVDQA X2, X14
+ VMOVDQA X4, X6
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x11
+ BYTE $0x6c
+ BYTE $0xfd
+ VMOVDQA X5, X4
+ VMOVDQA X6, X5
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x41
+ BYTE $0x6d
+ BYTE $0xf7
+ BYTE $0xc5
+ BYTE $0x41
+ BYTE $0x6c
+ BYTE $0xff
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x11
+ BYTE $0x6d
+ BYTE $0xff
+ BYTE $0xc5
+ BYTE $0x61
+ BYTE $0x6c
+ BYTE $0xfb
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x69
+ BYTE $0x6d
+ BYTE $0xd7
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x09
+ BYTE $0x6c
+ BYTE $0xfe
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x61
+ BYTE $0x6d
+ BYTE $0xdf
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x66
+ BYTE $0x20
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x6e
+ BYTE $0x78
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x76
+ BYTE $0x68
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x7e
+ BYTE $0x70
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x38
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x08
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x28
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x48
+ BYTE $0x01
+ VPADDQ X12, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X13, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFD $-79, X6, X6
+ VPSHUFD $-79, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPSHUFB X8, X2, X2
+ VPSHUFB X8, X3, X3
+ VPADDQ X14, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X15, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFB X9, X6, X6
+ VPSHUFB X9, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPADDQ X2, X2, X15
+ VPSRLQ $0x3f, X2, X2
+ VPXOR X15, X2, X2
+ VPADDQ X3, X3, X15
+ VPSRLQ $0x3f, X3, X3
+ VPXOR X15, X3, X3
+ VMOVDQA X2, X13
+ VMOVDQA X4, X14
+ BYTE $0xc5
+ BYTE $0x69
+ BYTE $0x6c
+ BYTE $0xfa
+ VMOVDQA X5, X4
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x61
+ BYTE $0x6d
+ BYTE $0xd7
+ VMOVDQA X14, X5
+ BYTE $0xc5
+ BYTE $0x61
+ BYTE $0x6c
+ BYTE $0xfb
+ VMOVDQA X6, X14
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x11
+ BYTE $0x6d
+ BYTE $0xdf
+ BYTE $0xc5
+ BYTE $0x41
+ BYTE $0x6c
+ BYTE $0xff
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x49
+ BYTE $0x6d
+ BYTE $0xf7
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x09
+ BYTE $0x6c
+ BYTE $0xfe
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x41
+ BYTE $0x6d
+ BYTE $0xff
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x66
+ BYTE $0x60
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x6e
+ BYTE $0x70
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x76
+ BYTE $0x28
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x7e
+ BYTE $0x68
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x08
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x20
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x78
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x50
+ BYTE $0x01
+ VPADDQ X12, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X13, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFD $-79, X6, X6
+ VPSHUFD $-79, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPSHUFB X8, X2, X2
+ VPSHUFB X8, X3, X3
+ VPADDQ X14, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X15, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFB X9, X6, X6
+ VPSHUFB X9, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPADDQ X2, X2, X15
+ VPSRLQ $0x3f, X2, X2
+ VPXOR X15, X2, X2
+ VPADDQ X3, X3, X15
+ VPSRLQ $0x3f, X3, X3
+ VPXOR X15, X3, X3
+ VMOVDQA X6, X13
+ VMOVDQA X2, X14
+ VMOVDQA X4, X6
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x11
+ BYTE $0x6c
+ BYTE $0xfd
+ VMOVDQA X5, X4
+ VMOVDQA X6, X5
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x41
+ BYTE $0x6d
+ BYTE $0xf7
+ BYTE $0xc5
+ BYTE $0x41
+ BYTE $0x6c
+ BYTE $0xff
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x11
+ BYTE $0x6d
+ BYTE $0xff
+ BYTE $0xc5
+ BYTE $0x61
+ BYTE $0x6c
+ BYTE $0xfb
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x69
+ BYTE $0x6d
+ BYTE $0xd7
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x09
+ BYTE $0x6c
+ BYTE $0xfe
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x61
+ BYTE $0x6d
+ BYTE $0xdf
+ MOVQ (SI), X12
+ VPSHUFD $0x4e, 64(SI), X13
+ MOVQ 56(SI), X14
+ MOVQ 16(SI), X15
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x30
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x18
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x58
+ BYTE $0x01
+ VPADDQ X12, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X13, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFD $-79, X6, X6
+ VPSHUFD $-79, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPSHUFB X8, X2, X2
+ VPSHUFB X8, X3, X3
+ VPADDQ X14, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X15, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFB X9, X6, X6
+ VPSHUFB X9, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPADDQ X2, X2, X15
+ VPSRLQ $0x3f, X2, X2
+ VPXOR X15, X2, X2
+ VPADDQ X3, X3, X15
+ VPSRLQ $0x3f, X3, X3
+ VPXOR X15, X3, X3
+ VMOVDQA X2, X13
+ VMOVDQA X4, X14
+ BYTE $0xc5
+ BYTE $0x69
+ BYTE $0x6c
+ BYTE $0xfa
+ VMOVDQA X5, X4
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x61
+ BYTE $0x6d
+ BYTE $0xd7
+ VMOVDQA X14, X5
+ BYTE $0xc5
+ BYTE $0x61
+ BYTE $0x6c
+ BYTE $0xfb
+ VMOVDQA X6, X14
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x11
+ BYTE $0x6d
+ BYTE $0xdf
+ BYTE $0xc5
+ BYTE $0x41
+ BYTE $0x6c
+ BYTE $0xff
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x49
+ BYTE $0x6d
+ BYTE $0xf7
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x09
+ BYTE $0x6c
+ BYTE $0xfe
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x41
+ BYTE $0x6d
+ BYTE $0xff
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x66
+ BYTE $0x68
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x6e
+ BYTE $0x60
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x76
+ BYTE $0x58
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x7e
+ BYTE $0x08
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x38
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x18
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x70
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x48
+ BYTE $0x01
+ VPADDQ X12, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X13, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFD $-79, X6, X6
+ VPSHUFD $-79, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPSHUFB X8, X2, X2
+ VPSHUFB X8, X3, X3
+ VPADDQ X14, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X15, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFB X9, X6, X6
+ VPSHUFB X9, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPADDQ X2, X2, X15
+ VPSRLQ $0x3f, X2, X2
+ VPXOR X15, X2, X2
+ VPADDQ X3, X3, X15
+ VPSRLQ $0x3f, X3, X3
+ VPXOR X15, X3, X3
+ VMOVDQA X6, X13
+ VMOVDQA X2, X14
+ VMOVDQA X4, X6
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x11
+ BYTE $0x6c
+ BYTE $0xfd
+ VMOVDQA X5, X4
+ VMOVDQA X6, X5
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x41
+ BYTE $0x6d
+ BYTE $0xf7
+ BYTE $0xc5
+ BYTE $0x41
+ BYTE $0x6c
+ BYTE $0xff
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x11
+ BYTE $0x6d
+ BYTE $0xff
+ BYTE $0xc5
+ BYTE $0x61
+ BYTE $0x6c
+ BYTE $0xfb
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x69
+ BYTE $0x6d
+ BYTE $0xd7
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x09
+ BYTE $0x6c
+ BYTE $0xfe
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x61
+ BYTE $0x6d
+ BYTE $0xdf
+ MOVQ 40(SI), X12
+ MOVQ 64(SI), X13
+ MOVQ (SI), X14
+ MOVQ 48(SI), X15
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x78
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x10
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x20
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x50
+ BYTE $0x01
+ VPADDQ X12, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X13, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFD $-79, X6, X6
+ VPSHUFD $-79, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPSHUFB X8, X2, X2
+ VPSHUFB X8, X3, X3
+ VPADDQ X14, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X15, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFB X9, X6, X6
+ VPSHUFB X9, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPADDQ X2, X2, X15
+ VPSRLQ $0x3f, X2, X2
+ VPXOR X15, X2, X2
+ VPADDQ X3, X3, X15
+ VPSRLQ $0x3f, X3, X3
+ VPXOR X15, X3, X3
+ VMOVDQA X2, X13
+ VMOVDQA X4, X14
+ BYTE $0xc5
+ BYTE $0x69
+ BYTE $0x6c
+ BYTE $0xfa
+ VMOVDQA X5, X4
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x61
+ BYTE $0x6d
+ BYTE $0xd7
+ VMOVDQA X14, X5
+ BYTE $0xc5
+ BYTE $0x61
+ BYTE $0x6c
+ BYTE $0xfb
+ VMOVDQA X6, X14
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x11
+ BYTE $0x6d
+ BYTE $0xdf
+ BYTE $0xc5
+ BYTE $0x41
+ BYTE $0x6c
+ BYTE $0xff
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x49
+ BYTE $0x6d
+ BYTE $0xf7
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x09
+ BYTE $0x6c
+ BYTE $0xfe
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x41
+ BYTE $0x6d
+ BYTE $0xff
+ MOVQ 48(SI), X12
+ MOVQ 88(SI), X13
+ MOVQ 120(SI), X14
+ MOVQ 24(SI), X15
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x70
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x2e
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x48
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x40
+ BYTE $0x01
+ VPADDQ X12, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X13, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFD $-79, X6, X6
+ VPSHUFD $-79, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPSHUFB X8, X2, X2
+ VPSHUFB X8, X3, X3
+ VPADDQ X14, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X15, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFB X9, X6, X6
+ VPSHUFB X9, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPADDQ X2, X2, X15
+ VPSRLQ $0x3f, X2, X2
+ VPXOR X15, X2, X2
+ VPADDQ X3, X3, X15
+ VPSRLQ $0x3f, X3, X3
+ VPXOR X15, X3, X3
+ VMOVDQA X6, X13
+ VMOVDQA X2, X14
+ VMOVDQA X4, X6
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x11
+ BYTE $0x6c
+ BYTE $0xfd
+ VMOVDQA X5, X4
+ VMOVDQA X6, X5
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x41
+ BYTE $0x6d
+ BYTE $0xf7
+ BYTE $0xc5
+ BYTE $0x41
+ BYTE $0x6c
+ BYTE $0xff
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x11
+ BYTE $0x6d
+ BYTE $0xff
+ BYTE $0xc5
+ BYTE $0x61
+ BYTE $0x6c
+ BYTE $0xfb
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x69
+ BYTE $0x6d
+ BYTE $0xd7
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x09
+ BYTE $0x6c
+ BYTE $0xfe
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x61
+ BYTE $0x6d
+ BYTE $0xdf
+ VMOVDQU 96(SI), X12
+ MOVQ 8(SI), X13
+ MOVQ 16(SI), X14
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x50
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x38
+ BYTE $0x01
+ VMOVDQU 32(SI), X15
+ VPADDQ X12, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X13, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFD $-79, X6, X6
+ VPSHUFD $-79, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPSHUFB X8, X2, X2
+ VPSHUFB X8, X3, X3
+ VPADDQ X14, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X15, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFB X9, X6, X6
+ VPSHUFB X9, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPADDQ X2, X2, X15
+ VPSRLQ $0x3f, X2, X2
+ VPXOR X15, X2, X2
+ VPADDQ X3, X3, X15
+ VPSRLQ $0x3f, X3, X3
+ VPXOR X15, X3, X3
+ VMOVDQA X2, X13
+ VMOVDQA X4, X14
+ BYTE $0xc5
+ BYTE $0x69
+ BYTE $0x6c
+ BYTE $0xfa
+ VMOVDQA X5, X4
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x61
+ BYTE $0x6d
+ BYTE $0xd7
+ VMOVDQA X14, X5
+ BYTE $0xc5
+ BYTE $0x61
+ BYTE $0x6c
+ BYTE $0xfb
+ VMOVDQA X6, X14
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x11
+ BYTE $0x6d
+ BYTE $0xdf
+ BYTE $0xc5
+ BYTE $0x41
+ BYTE $0x6c
+ BYTE $0xff
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x49
+ BYTE $0x6d
+ BYTE $0xf7
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x09
+ BYTE $0x6c
+ BYTE $0xfe
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x41
+ BYTE $0x6d
+ BYTE $0xff
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x66
+ BYTE $0x50
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x6e
+ BYTE $0x38
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x76
+ BYTE $0x10
+ BYTE $0xc5
+ BYTE $0x7a
+ BYTE $0x7e
+ BYTE $0x7e
+ BYTE $0x30
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x40
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x08
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x20
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x7e
+ BYTE $0x28
+ BYTE $0x01
+ VPADDQ X12, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X13, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFD $-79, X6, X6
+ VPSHUFD $-79, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPSHUFB X8, X2, X2
+ VPSHUFB X8, X3, X3
+ VPADDQ X14, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X15, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFB X9, X6, X6
+ VPSHUFB X9, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPADDQ X2, X2, X15
+ VPSRLQ $0x3f, X2, X2
+ VPXOR X15, X2, X2
+ VPADDQ X3, X3, X15
+ VPSRLQ $0x3f, X3, X3
+ VPXOR X15, X3, X3
+ VMOVDQA X6, X13
+ VMOVDQA X2, X14
+ VMOVDQA X4, X6
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x11
+ BYTE $0x6c
+ BYTE $0xfd
+ VMOVDQA X5, X4
+ VMOVDQA X6, X5
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x41
+ BYTE $0x6d
+ BYTE $0xf7
+ BYTE $0xc5
+ BYTE $0x41
+ BYTE $0x6c
+ BYTE $0xff
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x11
+ BYTE $0x6d
+ BYTE $0xff
+ BYTE $0xc5
+ BYTE $0x61
+ BYTE $0x6c
+ BYTE $0xfb
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x69
+ BYTE $0x6d
+ BYTE $0xd7
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x09
+ BYTE $0x6c
+ BYTE $0xfe
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x61
+ BYTE $0x6d
+ BYTE $0xdf
+ MOVQ 120(SI), X12
+ MOVQ 24(SI), X13
+ MOVQ 88(SI), X14
+ MOVQ 96(SI), X15
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x99
+ BYTE $0x22
+ BYTE $0x66
+ BYTE $0x48
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x91
+ BYTE $0x22
+ BYTE $0x6e
+ BYTE $0x68
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x89
+ BYTE $0x22
+ BYTE $0x76
+ BYTE $0x70
+ BYTE $0x01
+ BYTE $0xc4
+ BYTE $0x63
+ BYTE $0x81
+ BYTE $0x22
+ BYTE $0x3e
+ BYTE $0x01
+ VPADDQ X12, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X13, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFD $-79, X6, X6
+ VPSHUFD $-79, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPSHUFB X8, X2, X2
+ VPSHUFB X8, X3, X3
+ VPADDQ X14, X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ X15, X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFB X9, X6, X6
+ VPSHUFB X9, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPADDQ X2, X2, X15
+ VPSRLQ $0x3f, X2, X2
+ VPXOR X15, X2, X2
+ VPADDQ X3, X3, X15
+ VPSRLQ $0x3f, X3, X3
+ VPXOR X15, X3, X3
+ VMOVDQA X2, X13
+ VMOVDQA X4, X14
+ BYTE $0xc5
+ BYTE $0x69
+ BYTE $0x6c
+ BYTE $0xfa
+ VMOVDQA X5, X4
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x61
+ BYTE $0x6d
+ BYTE $0xd7
+ VMOVDQA X14, X5
+ BYTE $0xc5
+ BYTE $0x61
+ BYTE $0x6c
+ BYTE $0xfb
+ VMOVDQA X6, X14
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x11
+ BYTE $0x6d
+ BYTE $0xdf
+ BYTE $0xc5
+ BYTE $0x41
+ BYTE $0x6c
+ BYTE $0xff
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x49
+ BYTE $0x6d
+ BYTE $0xf7
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x09
+ BYTE $0x6c
+ BYTE $0xfe
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x41
+ BYTE $0x6d
+ BYTE $0xff
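+ // The last two of the twelve rounds replay the message vectors cached
+ // at 16..256(R10) earlier in the loop, since BLAKE2b's sigma schedule
+ // repeats the first two rounds' permutations there.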
+ VPADDQ 16(R10), X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ 32(R10), X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFD $-79, X6, X6
+ VPSHUFD $-79, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPSHUFB X8, X2, X2
+ VPSHUFB X8, X3, X3
+ VPADDQ 48(R10), X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ 64(R10), X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFB X9, X6, X6
+ VPSHUFB X9, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPADDQ X2, X2, X15
+ VPSRLQ $0x3f, X2, X2
+ VPXOR X15, X2, X2
+ VPADDQ X3, X3, X15
+ VPSRLQ $0x3f, X3, X3
+ VPXOR X15, X3, X3
+ VMOVDQA X6, X13
+ VMOVDQA X2, X14
+ VMOVDQA X4, X6
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x11
+ BYTE $0x6c
+ BYTE $0xfd
+ VMOVDQA X5, X4
+ VMOVDQA X6, X5
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x41
+ BYTE $0x6d
+ BYTE $0xf7
+ BYTE $0xc5
+ BYTE $0x41
+ BYTE $0x6c
+ BYTE $0xff
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x11
+ BYTE $0x6d
+ BYTE $0xff
+ BYTE $0xc5
+ BYTE $0x61
+ BYTE $0x6c
+ BYTE $0xfb
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x69
+ BYTE $0x6d
+ BYTE $0xd7
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x09
+ BYTE $0x6c
+ BYTE $0xfe
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x61
+ BYTE $0x6d
+ BYTE $0xdf
+ VPADDQ 80(R10), X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ 96(R10), X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFD $-79, X6, X6
+ VPSHUFD $-79, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPSHUFB X8, X2, X2
+ VPSHUFB X8, X3, X3
+ VPADDQ 112(R10), X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ 128(R10), X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFB X9, X6, X6
+ VPSHUFB X9, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPADDQ X2, X2, X15
+ VPSRLQ $0x3f, X2, X2
+ VPXOR X15, X2, X2
+ VPADDQ X3, X3, X15
+ VPSRLQ $0x3f, X3, X3
+ VPXOR X15, X3, X3
+ VMOVDQA X2, X13
+ VMOVDQA X4, X14
+ BYTE $0xc5
+ BYTE $0x69
+ BYTE $0x6c
+ BYTE $0xfa
+ VMOVDQA X5, X4
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x61
+ BYTE $0x6d
+ BYTE $0xd7
+ VMOVDQA X14, X5
+ BYTE $0xc5
+ BYTE $0x61
+ BYTE $0x6c
+ BYTE $0xfb
+ VMOVDQA X6, X14
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x11
+ BYTE $0x6d
+ BYTE $0xdf
+ BYTE $0xc5
+ BYTE $0x41
+ BYTE $0x6c
+ BYTE $0xff
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x49
+ BYTE $0x6d
+ BYTE $0xf7
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x09
+ BYTE $0x6c
+ BYTE $0xfe
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x41
+ BYTE $0x6d
+ BYTE $0xff
+ VPADDQ 144(R10), X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ 160(R10), X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFD $-79, X6, X6
+ VPSHUFD $-79, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPSHUFB X8, X2, X2
+ VPSHUFB X8, X3, X3
+ VPADDQ 176(R10), X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ 192(R10), X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFB X9, X6, X6
+ VPSHUFB X9, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPADDQ X2, X2, X15
+ VPSRLQ $0x3f, X2, X2
+ VPXOR X15, X2, X2
+ VPADDQ X3, X3, X15
+ VPSRLQ $0x3f, X3, X3
+ VPXOR X15, X3, X3
+ VMOVDQA X6, X13
+ VMOVDQA X2, X14
+ VMOVDQA X4, X6
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x11
+ BYTE $0x6c
+ BYTE $0xfd
+ VMOVDQA X5, X4
+ VMOVDQA X6, X5
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x41
+ BYTE $0x6d
+ BYTE $0xf7
+ BYTE $0xc5
+ BYTE $0x41
+ BYTE $0x6c
+ BYTE $0xff
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x11
+ BYTE $0x6d
+ BYTE $0xff
+ BYTE $0xc5
+ BYTE $0x61
+ BYTE $0x6c
+ BYTE $0xfb
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x69
+ BYTE $0x6d
+ BYTE $0xd7
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x09
+ BYTE $0x6c
+ BYTE $0xfe
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x61
+ BYTE $0x6d
+ BYTE $0xdf
+ VPADDQ 208(R10), X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ 224(R10), X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFD $-79, X6, X6
+ VPSHUFD $-79, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPSHUFB X8, X2, X2
+ VPSHUFB X8, X3, X3
+ VPADDQ 240(R10), X0, X0
+ VPADDQ X2, X0, X0
+ VPADDQ 256(R10), X1, X1
+ VPADDQ X3, X1, X1
+ VPXOR X0, X6, X6
+ VPXOR X1, X7, X7
+ VPSHUFB X9, X6, X6
+ VPSHUFB X9, X7, X7
+ VPADDQ X6, X4, X4
+ VPADDQ X7, X5, X5
+ VPXOR X4, X2, X2
+ VPXOR X5, X3, X3
+ VPADDQ X2, X2, X15
+ VPSRLQ $0x3f, X2, X2
+ VPXOR X15, X2, X2
+ VPADDQ X3, X3, X15
+ VPSRLQ $0x3f, X3, X3
+ VPXOR X15, X3, X3
+ VMOVDQA X2, X13
+ VMOVDQA X4, X14
+ BYTE $0xc5
+ BYTE $0x69
+ BYTE $0x6c
+ BYTE $0xfa
+ VMOVDQA X5, X4
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x61
+ BYTE $0x6d
+ BYTE $0xd7
+ VMOVDQA X14, X5
+ BYTE $0xc5
+ BYTE $0x61
+ BYTE $0x6c
+ BYTE $0xfb
+ VMOVDQA X6, X14
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x11
+ BYTE $0x6d
+ BYTE $0xdf
+ BYTE $0xc5
+ BYTE $0x41
+ BYTE $0x6c
+ BYTE $0xff
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x49
+ BYTE $0x6d
+ BYTE $0xf7
+ BYTE $0xc4
+ BYTE $0x41
+ BYTE $0x09
+ BYTE $0x6c
+ BYTE $0xfe
+ BYTE $0xc4
+ BYTE $0xc1
+ BYTE $0x41
+ BYTE $0x6d
+ BYTE $0xff
VMOVDQU 32(AX), X14
VMOVDQU 48(AX), X15
VPXOR X0, X10, X10
@@ -729,16 +4524,36 @@ noinc:
VPXOR X7, X15, X3
VMOVDQU X2, 32(AX)
VMOVDQU X3, 48(AX)
+ LEAQ 128(SI), SI
+ SUBQ $0x80, DI
+ JNE loop
+ VMOVDQU X10, (AX)
+ VMOVDQU X11, 16(AX)
+ MOVQ R8, (BX)
+ MOVQ R9, 8(BX)
+ VZEROUPPER
+ RET
- LEAQ 128(SI), SI
- SUBQ $128, DI
- JNE loop
+DATA ·AVX_c40<>+0(SB)/8, $0x0201000706050403
+DATA ·AVX_c40<>+8(SB)/8, $0x0a09080f0e0d0c0b
+GLOBL ·AVX_c40<>(SB), RODATA|NOPTR, $16
- VMOVDQU X10, 0(AX)
- VMOVDQU X11, 16(AX)
+DATA ·AVX_c48<>+0(SB)/8, $0x0100070605040302
+DATA ·AVX_c48<>+8(SB)/8, $0x09080f0e0d0c0b0a
+GLOBL ·AVX_c48<>(SB), RODATA|NOPTR, $16
- MOVQ R8, 0(BX)
- MOVQ R9, 8(BX)
- VZEROUPPER
+DATA ·AVX_iv3<>+0(SB)/8, $0x1f83d9abfb41bd6b
+DATA ·AVX_iv3<>+8(SB)/8, $0x5be0cd19137e2179
+GLOBL ·AVX_iv3<>(SB), RODATA|NOPTR, $16
- RET
+DATA ·AVX_iv0<>+0(SB)/8, $0x6a09e667f3bcc908
+DATA ·AVX_iv0<>+8(SB)/8, $0xbb67ae8584caa73b
+GLOBL ·AVX_iv0<>(SB), RODATA|NOPTR, $16
+
+DATA ·AVX_iv1<>+0(SB)/8, $0x3c6ef372fe94f82b
+DATA ·AVX_iv1<>+8(SB)/8, $0xa54ff53a5f1d36f1
+GLOBL ·AVX_iv1<>(SB), RODATA|NOPTR, $16
+
+DATA ·AVX_iv2<>+0(SB)/8, $0x510e527fade682d1
+DATA ·AVX_iv2<>+8(SB)/8, $0x9b05688c2b3e6c1f
+GLOBL ·AVX_iv2<>(SB), RODATA|NOPTR, $16
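
The regenerated file above replaces the hand-written macro assembly with
generator output; the appended DATA/GLOBL constants are the standard BLAKE2b
IV words (AVX_iv0..AVX_iv3) plus the PSHUFB masks (AVX_c40/AVX_c48) that
implement the G function's rotate-right-by-24 and -by-16 steps. For
orientation, a minimal caller-side sketch, assuming only the public
golang.org/x/crypto/blake2b API, which selects among the hashBlocks*
routines at package init based on CPU feature detection:

	package main

	import (
		"fmt"

		"golang.org/x/crypto/blake2b"
	)

	func main() {
		// One-shot digest; dispatches internally to the fastest
		// available hashBlocks* routine (AVX2, AVX, SSE4, or pure Go).
		sum := blake2b.Sum256([]byte("hello world"))
		fmt.Printf("%x\n", sum)

		// Keyed (MAC) mode: New256 accepts a key of up to 64 bytes.
		h, err := blake2b.New256([]byte("secret key"))
		if err != nil {
			panic(err)
		}
		h.Write([]byte("hello world"))
		fmt.Printf("%x\n", h.Sum(nil))
	}
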
diff --git a/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s b/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s
index adfac00c1..9a0ce2124 100644
--- a/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s
+++ b/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s
@@ -1,278 +1,1441 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Code generated by command: go run blake2b_amd64_asm.go -out ../../blake2b_amd64.s -pkg blake2b. DO NOT EDIT.
//go:build amd64 && gc && !purego
#include "textflag.h"
-DATA ·iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
-DATA ·iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
-GLOBL ·iv0<>(SB), (NOPTR+RODATA), $16
-
-DATA ·iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
-DATA ·iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
-GLOBL ·iv1<>(SB), (NOPTR+RODATA), $16
-
-DATA ·iv2<>+0x00(SB)/8, $0x510e527fade682d1
-DATA ·iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
-GLOBL ·iv2<>(SB), (NOPTR+RODATA), $16
-
-DATA ·iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
-DATA ·iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
-GLOBL ·iv3<>(SB), (NOPTR+RODATA), $16
-
-DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
-DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
-GLOBL ·c40<>(SB), (NOPTR+RODATA), $16
-
-DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
-DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
-GLOBL ·c48<>(SB), (NOPTR+RODATA), $16
-
-#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
- MOVO v4, t1; \
- MOVO v5, v4; \
- MOVO t1, v5; \
- MOVO v6, t1; \
- PUNPCKLQDQ v6, t2; \
- PUNPCKHQDQ v7, v6; \
- PUNPCKHQDQ t2, v6; \
- PUNPCKLQDQ v7, t2; \
- MOVO t1, v7; \
- MOVO v2, t1; \
- PUNPCKHQDQ t2, v7; \
- PUNPCKLQDQ v3, t2; \
- PUNPCKHQDQ t2, v2; \
- PUNPCKLQDQ t1, t2; \
- PUNPCKHQDQ t2, v3
-
-#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
- MOVO v4, t1; \
- MOVO v5, v4; \
- MOVO t1, v5; \
- MOVO v2, t1; \
- PUNPCKLQDQ v2, t2; \
- PUNPCKHQDQ v3, v2; \
- PUNPCKHQDQ t2, v2; \
- PUNPCKLQDQ v3, t2; \
- MOVO t1, v3; \
- MOVO v6, t1; \
- PUNPCKHQDQ t2, v3; \
- PUNPCKLQDQ v7, t2; \
- PUNPCKHQDQ t2, v6; \
- PUNPCKLQDQ t1, t2; \
- PUNPCKHQDQ t2, v7
-
-#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
- PADDQ m0, v0; \
- PADDQ m1, v1; \
- PADDQ v2, v0; \
- PADDQ v3, v1; \
- PXOR v0, v6; \
- PXOR v1, v7; \
- PSHUFD $0xB1, v6, v6; \
- PSHUFD $0xB1, v7, v7; \
- PADDQ v6, v4; \
- PADDQ v7, v5; \
- PXOR v4, v2; \
- PXOR v5, v3; \
- PSHUFB c40, v2; \
- PSHUFB c40, v3; \
- PADDQ m2, v0; \
- PADDQ m3, v1; \
- PADDQ v2, v0; \
- PADDQ v3, v1; \
- PXOR v0, v6; \
- PXOR v1, v7; \
- PSHUFB c48, v6; \
- PSHUFB c48, v7; \
- PADDQ v6, v4; \
- PADDQ v7, v5; \
- PXOR v4, v2; \
- PXOR v5, v3; \
- MOVOU v2, t0; \
- PADDQ v2, t0; \
- PSRLQ $63, v2; \
- PXOR t0, v2; \
- MOVOU v3, t0; \
- PADDQ v3, t0; \
- PSRLQ $63, v3; \
- PXOR t0, v3
-
-#define LOAD_MSG(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7) \
- MOVQ i0*8(src), m0; \
- PINSRQ $1, i1*8(src), m0; \
- MOVQ i2*8(src), m1; \
- PINSRQ $1, i3*8(src), m1; \
- MOVQ i4*8(src), m2; \
- PINSRQ $1, i5*8(src), m2; \
- MOVQ i6*8(src), m3; \
- PINSRQ $1, i7*8(src), m3
-
// func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
-TEXT ·hashBlocksSSE4(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
- MOVQ h+0(FP), AX
- MOVQ c+8(FP), BX
- MOVQ flag+16(FP), CX
- MOVQ blocks_base+24(FP), SI
- MOVQ blocks_len+32(FP), DI
-
- MOVQ SP, R10
- ADDQ $15, R10
- ANDQ $~15, R10
-
- MOVOU ·iv3<>(SB), X0
- MOVO X0, 0(R10)
- XORQ CX, 0(R10) // 0(R10) = ·iv3 ^ (CX || 0)
-
- MOVOU ·c40<>(SB), X13
- MOVOU ·c48<>(SB), X14
-
- MOVOU 0(AX), X12
+// Requires: SSE2, SSE4.1, SSSE3
+TEXT ·hashBlocksSSE4(SB), NOSPLIT, $288-48
+ MOVQ h+0(FP), AX
+ MOVQ c+8(FP), BX
+ MOVQ flag+16(FP), CX
+ MOVQ blocks_base+24(FP), SI
+ MOVQ blocks_len+32(FP), DI
+ MOVQ SP, R10
+ ADDQ $0x0f, R10
+ ANDQ $-16, R10
+ MOVOU ·iv3<>+0(SB), X0
+ MOVO X0, (R10)
+ XORQ CX, (R10)
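+	// (R10) = ·iv3 ^ (CX || 0)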
+ MOVOU ·c40<>+0(SB), X13
+ MOVOU ·c48<>+0(SB), X14
+ MOVOU (AX), X12
MOVOU 16(AX), X15
-
- MOVQ 0(BX), R8
- MOVQ 8(BX), R9
+ MOVQ (BX), R8
+ MOVQ 8(BX), R9
loop:
- ADDQ $128, R8
- CMPQ R8, $128
+ ADDQ $0x80, R8
+ CMPQ R8, $0x80
JGE noinc
INCQ R9
noinc:
- MOVQ R8, X8
- PINSRQ $1, R9, X8
-
- MOVO X12, X0
- MOVO X15, X1
- MOVOU 32(AX), X2
- MOVOU 48(AX), X3
- MOVOU ·iv0<>(SB), X4
- MOVOU ·iv1<>(SB), X5
- MOVOU ·iv2<>(SB), X6
-
- PXOR X8, X6
- MOVO 0(R10), X7
-
- LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7)
- MOVO X8, 16(R10)
- MOVO X9, 32(R10)
- MOVO X10, 48(R10)
- MOVO X11, 64(R10)
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
- SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
- LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15)
- MOVO X8, 80(R10)
- MOVO X9, 96(R10)
- MOVO X10, 112(R10)
- MOVO X11, 128(R10)
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
- SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
- LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6)
- MOVO X8, 144(R10)
- MOVO X9, 160(R10)
- MOVO X10, 176(R10)
- MOVO X11, 192(R10)
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
- SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
- LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3)
- MOVO X8, 208(R10)
- MOVO X9, 224(R10)
- MOVO X10, 240(R10)
- MOVO X11, 256(R10)
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
- SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
- LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13)
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
- SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
- LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4)
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
- SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
- LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14)
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
- SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
- LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8)
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
- SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
- LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15)
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
- SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
- LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13)
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
- SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
- LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3)
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
- SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
- LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9)
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
- SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
- LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10)
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
- SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
- LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11)
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
- SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
- LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9)
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
- SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
- LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10)
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
- SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
- LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8)
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
- SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
- LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5)
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
- SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
- LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5)
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
- SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
- LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0)
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
- SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X11, X13, X14)
- SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X11, X13, X14)
- SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
+ MOVQ R8, X8
+ PINSRQ $0x01, R9, X8
+ MOVO X12, X0
+ MOVO X15, X1
+ MOVOU 32(AX), X2
+ MOVOU 48(AX), X3
+ MOVOU ·iv0<>+0(SB), X4
+ MOVOU ·iv1<>+0(SB), X5
+ MOVOU ·iv2<>+0(SB), X6
+ PXOR X8, X6
+ MOVO (R10), X7
+ MOVQ (SI), X8
+ PINSRQ $0x01, 16(SI), X8
+ MOVQ 32(SI), X9
+ PINSRQ $0x01, 48(SI), X9
+ MOVQ 8(SI), X10
+ PINSRQ $0x01, 24(SI), X10
+ MOVQ 40(SI), X11
+ PINSRQ $0x01, 56(SI), X11
+ MOVO X8, 16(R10)
+ MOVO X9, 32(R10)
+ MOVO X10, 48(R10)
+ MOVO X11, 64(R10)
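+	// The message vectors for the first two rounds are cached at
+	// 16..256(R10); the final two rounds reuse them (BLAKE2b's sigma
+	// schedule repeats sigma[0] and sigma[1] there).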
+ PADDQ X8, X0
+ PADDQ X9, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFD $0xb1, X6, X6
+ PSHUFD $0xb1, X7, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ PSHUFB X13, X2
+ PSHUFB X13, X3
+ PADDQ X10, X0
+ PADDQ X11, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFB X14, X6
+ PSHUFB X14, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ MOVOU X2, X11
+ PADDQ X2, X11
+ PSRLQ $0x3f, X2
+ PXOR X11, X2
+ MOVOU X3, X11
+ PADDQ X3, X11
+ PSRLQ $0x3f, X3
+ PXOR X11, X3
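+	// Rotations in the half-round above: PSHUFD $0xb1 = rotr 32, PSHUFB
+	// with the c40/c48 masks in X13/X14 = rotr 24 and rotr 16, and the
+	// MOVOU/PADDQ/PSRLQ $0x3f/PXOR tail = rotr 63. The following
+	// MOVO/PUNPCKLQDQ/PUNPCKHQDQ block (formerly SHUFFLE) diagonalizes
+	// the state for the round's second half.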
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVQ 64(SI), X8
+ PINSRQ $0x01, 80(SI), X8
+ MOVQ 96(SI), X9
+ PINSRQ $0x01, 112(SI), X9
+ MOVQ 72(SI), X10
+ PINSRQ $0x01, 88(SI), X10
+ MOVQ 104(SI), X11
+ PINSRQ $0x01, 120(SI), X11
+ MOVO X8, 80(R10)
+ MOVO X9, 96(R10)
+ MOVO X10, 112(R10)
+ MOVO X11, 128(R10)
+ PADDQ X8, X0
+ PADDQ X9, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFD $0xb1, X6, X6
+ PSHUFD $0xb1, X7, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ PSHUFB X13, X2
+ PSHUFB X13, X3
+ PADDQ X10, X0
+ PADDQ X11, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFB X14, X6
+ PSHUFB X14, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ MOVOU X2, X11
+ PADDQ X2, X11
+ PSRLQ $0x3f, X2
+ PXOR X11, X2
+ MOVOU X3, X11
+ PADDQ X3, X11
+ PSRLQ $0x3f, X3
+ PXOR X11, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVQ 112(SI), X8
+ PINSRQ $0x01, 32(SI), X8
+ MOVQ 72(SI), X9
+ PINSRQ $0x01, 104(SI), X9
+ MOVQ 80(SI), X10
+ PINSRQ $0x01, 64(SI), X10
+ MOVQ 120(SI), X11
+ PINSRQ $0x01, 48(SI), X11
+ MOVO X8, 144(R10)
+ MOVO X9, 160(R10)
+ MOVO X10, 176(R10)
+ MOVO X11, 192(R10)
+ PADDQ X8, X0
+ PADDQ X9, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFD $0xb1, X6, X6
+ PSHUFD $0xb1, X7, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ PSHUFB X13, X2
+ PSHUFB X13, X3
+ PADDQ X10, X0
+ PADDQ X11, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFB X14, X6
+ PSHUFB X14, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ MOVOU X2, X11
+ PADDQ X2, X11
+ PSRLQ $0x3f, X2
+ PXOR X11, X2
+ MOVOU X3, X11
+ PADDQ X3, X11
+ PSRLQ $0x3f, X3
+ PXOR X11, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVQ 8(SI), X8
+ PINSRQ $0x01, (SI), X8
+ MOVQ 88(SI), X9
+ PINSRQ $0x01, 40(SI), X9
+ MOVQ 96(SI), X10
+ PINSRQ $0x01, 16(SI), X10
+ MOVQ 56(SI), X11
+ PINSRQ $0x01, 24(SI), X11
+ MOVO X8, 208(R10)
+ MOVO X9, 224(R10)
+ MOVO X10, 240(R10)
+ MOVO X11, 256(R10)
+ PADDQ X8, X0
+ PADDQ X9, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFD $0xb1, X6, X6
+ PSHUFD $0xb1, X7, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ PSHUFB X13, X2
+ PSHUFB X13, X3
+ PADDQ X10, X0
+ PADDQ X11, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFB X14, X6
+ PSHUFB X14, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ MOVOU X2, X11
+ PADDQ X2, X11
+ PSRLQ $0x3f, X2
+ PXOR X11, X2
+ MOVOU X3, X11
+ PADDQ X3, X11
+ PSRLQ $0x3f, X3
+ PXOR X11, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVQ 88(SI), X8
+ PINSRQ $0x01, 96(SI), X8
+ MOVQ 40(SI), X9
+ PINSRQ $0x01, 120(SI), X9
+ MOVQ 64(SI), X10
+ PINSRQ $0x01, (SI), X10
+ MOVQ 16(SI), X11
+ PINSRQ $0x01, 104(SI), X11
+ PADDQ X8, X0
+ PADDQ X9, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFD $0xb1, X6, X6
+ PSHUFD $0xb1, X7, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ PSHUFB X13, X2
+ PSHUFB X13, X3
+ PADDQ X10, X0
+ PADDQ X11, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFB X14, X6
+ PSHUFB X14, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ MOVOU X2, X11
+ PADDQ X2, X11
+ PSRLQ $0x3f, X2
+ PXOR X11, X2
+ MOVOU X3, X11
+ PADDQ X3, X11
+ PSRLQ $0x3f, X3
+ PXOR X11, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVQ 80(SI), X8
+ PINSRQ $0x01, 24(SI), X8
+ MOVQ 56(SI), X9
+ PINSRQ $0x01, 72(SI), X9
+ MOVQ 112(SI), X10
+ PINSRQ $0x01, 48(SI), X10
+ MOVQ 8(SI), X11
+ PINSRQ $0x01, 32(SI), X11
+ PADDQ X8, X0
+ PADDQ X9, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFD $0xb1, X6, X6
+ PSHUFD $0xb1, X7, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ PSHUFB X13, X2
+ PSHUFB X13, X3
+ PADDQ X10, X0
+ PADDQ X11, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFB X14, X6
+ PSHUFB X14, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ MOVOU X2, X11
+ PADDQ X2, X11
+ PSRLQ $0x3f, X2
+ PXOR X11, X2
+ MOVOU X3, X11
+ PADDQ X3, X11
+ PSRLQ $0x3f, X3
+ PXOR X11, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVQ 56(SI), X8
+ PINSRQ $0x01, 24(SI), X8
+ MOVQ 104(SI), X9
+ PINSRQ $0x01, 88(SI), X9
+ MOVQ 72(SI), X10
+ PINSRQ $0x01, 8(SI), X10
+ MOVQ 96(SI), X11
+ PINSRQ $0x01, 112(SI), X11
+ PADDQ X8, X0
+ PADDQ X9, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFD $0xb1, X6, X6
+ PSHUFD $0xb1, X7, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ PSHUFB X13, X2
+ PSHUFB X13, X3
+ PADDQ X10, X0
+ PADDQ X11, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFB X14, X6
+ PSHUFB X14, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ MOVOU X2, X11
+ PADDQ X2, X11
+ PSRLQ $0x3f, X2
+ PXOR X11, X2
+ MOVOU X3, X11
+ PADDQ X3, X11
+ PSRLQ $0x3f, X3
+ PXOR X11, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVQ 16(SI), X8
+ PINSRQ $0x01, 40(SI), X8
+ MOVQ 32(SI), X9
+ PINSRQ $0x01, 120(SI), X9
+ MOVQ 48(SI), X10
+ PINSRQ $0x01, 80(SI), X10
+ MOVQ (SI), X11
+ PINSRQ $0x01, 64(SI), X11
+ PADDQ X8, X0
+ PADDQ X9, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFD $0xb1, X6, X6
+ PSHUFD $0xb1, X7, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ PSHUFB X13, X2
+ PSHUFB X13, X3
+ PADDQ X10, X0
+ PADDQ X11, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFB X14, X6
+ PSHUFB X14, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ MOVOU X2, X11
+ PADDQ X2, X11
+ PSRLQ $0x3f, X2
+ PXOR X11, X2
+ MOVOU X3, X11
+ PADDQ X3, X11
+ PSRLQ $0x3f, X3
+ PXOR X11, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVQ 72(SI), X8
+ PINSRQ $0x01, 40(SI), X8
+ MOVQ 16(SI), X9
+ PINSRQ $0x01, 80(SI), X9
+ MOVQ (SI), X10
+ PINSRQ $0x01, 56(SI), X10
+ MOVQ 32(SI), X11
+ PINSRQ $0x01, 120(SI), X11
+ PADDQ X8, X0
+ PADDQ X9, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFD $0xb1, X6, X6
+ PSHUFD $0xb1, X7, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ PSHUFB X13, X2
+ PSHUFB X13, X3
+ PADDQ X10, X0
+ PADDQ X11, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFB X14, X6
+ PSHUFB X14, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ MOVOU X2, X11
+ PADDQ X2, X11
+ PSRLQ $0x3f, X2
+ PXOR X11, X2
+ MOVOU X3, X11
+ PADDQ X3, X11
+ PSRLQ $0x3f, X3
+ PXOR X11, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVQ 112(SI), X8
+ PINSRQ $0x01, 88(SI), X8
+ MOVQ 48(SI), X9
+ PINSRQ $0x01, 24(SI), X9
+ MOVQ 8(SI), X10
+ PINSRQ $0x01, 96(SI), X10
+ MOVQ 64(SI), X11
+ PINSRQ $0x01, 104(SI), X11
+ PADDQ X8, X0
+ PADDQ X9, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFD $0xb1, X6, X6
+ PSHUFD $0xb1, X7, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ PSHUFB X13, X2
+ PSHUFB X13, X3
+ PADDQ X10, X0
+ PADDQ X11, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFB X14, X6
+ PSHUFB X14, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ MOVOU X2, X11
+ PADDQ X2, X11
+ PSRLQ $0x3f, X2
+ PXOR X11, X2
+ MOVOU X3, X11
+ PADDQ X3, X11
+ PSRLQ $0x3f, X3
+ PXOR X11, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVQ 16(SI), X8
+ PINSRQ $0x01, 48(SI), X8
+ MOVQ (SI), X9
+ PINSRQ $0x01, 64(SI), X9
+ MOVQ 96(SI), X10
+ PINSRQ $0x01, 80(SI), X10
+ MOVQ 88(SI), X11
+ PINSRQ $0x01, 24(SI), X11
+ PADDQ X8, X0
+ PADDQ X9, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFD $0xb1, X6, X6
+ PSHUFD $0xb1, X7, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ PSHUFB X13, X2
+ PSHUFB X13, X3
+ PADDQ X10, X0
+ PADDQ X11, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFB X14, X6
+ PSHUFB X14, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ MOVOU X2, X11
+ PADDQ X2, X11
+ PSRLQ $0x3f, X2
+ PXOR X11, X2
+ MOVOU X3, X11
+ PADDQ X3, X11
+ PSRLQ $0x3f, X3
+ PXOR X11, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVQ 32(SI), X8
+ PINSRQ $0x01, 56(SI), X8
+ MOVQ 120(SI), X9
+ PINSRQ $0x01, 8(SI), X9
+ MOVQ 104(SI), X10
+ PINSRQ $0x01, 40(SI), X10
+ MOVQ 112(SI), X11
+ PINSRQ $0x01, 72(SI), X11
+ PADDQ X8, X0
+ PADDQ X9, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFD $0xb1, X6, X6
+ PSHUFD $0xb1, X7, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ PSHUFB X13, X2
+ PSHUFB X13, X3
+ PADDQ X10, X0
+ PADDQ X11, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFB X14, X6
+ PSHUFB X14, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ MOVOU X2, X11
+ PADDQ X2, X11
+ PSRLQ $0x3f, X2
+ PXOR X11, X2
+ MOVOU X3, X11
+ PADDQ X3, X11
+ PSRLQ $0x3f, X3
+ PXOR X11, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVQ 96(SI), X8
+ PINSRQ $0x01, 8(SI), X8
+ MOVQ 112(SI), X9
+ PINSRQ $0x01, 32(SI), X9
+ MOVQ 40(SI), X10
+ PINSRQ $0x01, 120(SI), X10
+ MOVQ 104(SI), X11
+ PINSRQ $0x01, 80(SI), X11
+ PADDQ X8, X0
+ PADDQ X9, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFD $0xb1, X6, X6
+ PSHUFD $0xb1, X7, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ PSHUFB X13, X2
+ PSHUFB X13, X3
+ PADDQ X10, X0
+ PADDQ X11, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFB X14, X6
+ PSHUFB X14, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ MOVOU X2, X11
+ PADDQ X2, X11
+ PSRLQ $0x3f, X2
+ PXOR X11, X2
+ MOVOU X3, X11
+ PADDQ X3, X11
+ PSRLQ $0x3f, X3
+ PXOR X11, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVQ (SI), X8
+ PINSRQ $0x01, 48(SI), X8
+ MOVQ 72(SI), X9
+ PINSRQ $0x01, 64(SI), X9
+ MOVQ 56(SI), X10
+ PINSRQ $0x01, 24(SI), X10
+ MOVQ 16(SI), X11
+ PINSRQ $0x01, 88(SI), X11
+ PADDQ X8, X0
+ PADDQ X9, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFD $0xb1, X6, X6
+ PSHUFD $0xb1, X7, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ PSHUFB X13, X2
+ PSHUFB X13, X3
+ PADDQ X10, X0
+ PADDQ X11, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFB X14, X6
+ PSHUFB X14, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ MOVOU X2, X11
+ PADDQ X2, X11
+ PSRLQ $0x3f, X2
+ PXOR X11, X2
+ MOVOU X3, X11
+ PADDQ X3, X11
+ PSRLQ $0x3f, X3
+ PXOR X11, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVQ 104(SI), X8
+ PINSRQ $0x01, 56(SI), X8
+ MOVQ 96(SI), X9
+ PINSRQ $0x01, 24(SI), X9
+ MOVQ 88(SI), X10
+ PINSRQ $0x01, 112(SI), X10
+ MOVQ 8(SI), X11
+ PINSRQ $0x01, 72(SI), X11
+ PADDQ X8, X0
+ PADDQ X9, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFD $0xb1, X6, X6
+ PSHUFD $0xb1, X7, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ PSHUFB X13, X2
+ PSHUFB X13, X3
+ PADDQ X10, X0
+ PADDQ X11, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFB X14, X6
+ PSHUFB X14, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ MOVOU X2, X11
+ PADDQ X2, X11
+ PSRLQ $0x3f, X2
+ PXOR X11, X2
+ MOVOU X3, X11
+ PADDQ X3, X11
+ PSRLQ $0x3f, X3
+ PXOR X11, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVQ 40(SI), X8
+ PINSRQ $0x01, 120(SI), X8
+ MOVQ 64(SI), X9
+ PINSRQ $0x01, 16(SI), X9
+ MOVQ (SI), X10
+ PINSRQ $0x01, 32(SI), X10
+ MOVQ 48(SI), X11
+ PINSRQ $0x01, 80(SI), X11
+ PADDQ X8, X0
+ PADDQ X9, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFD $0xb1, X6, X6
+ PSHUFD $0xb1, X7, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ PSHUFB X13, X2
+ PSHUFB X13, X3
+ PADDQ X10, X0
+ PADDQ X11, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFB X14, X6
+ PSHUFB X14, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ MOVOU X2, X11
+ PADDQ X2, X11
+ PSRLQ $0x3f, X2
+ PXOR X11, X2
+ MOVOU X3, X11
+ PADDQ X3, X11
+ PSRLQ $0x3f, X3
+ PXOR X11, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVQ 48(SI), X8
+ PINSRQ $0x01, 112(SI), X8
+ MOVQ 88(SI), X9
+ PINSRQ $0x01, (SI), X9
+ MOVQ 120(SI), X10
+ PINSRQ $0x01, 72(SI), X10
+ MOVQ 24(SI), X11
+ PINSRQ $0x01, 64(SI), X11
+ PADDQ X8, X0
+ PADDQ X9, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFD $0xb1, X6, X6
+ PSHUFD $0xb1, X7, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ PSHUFB X13, X2
+ PSHUFB X13, X3
+ PADDQ X10, X0
+ PADDQ X11, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFB X14, X6
+ PSHUFB X14, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ MOVOU X2, X11
+ PADDQ X2, X11
+ PSRLQ $0x3f, X2
+ PXOR X11, X2
+ MOVOU X3, X11
+ PADDQ X3, X11
+ PSRLQ $0x3f, X3
+ PXOR X11, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVQ 96(SI), X8
+ PINSRQ $0x01, 104(SI), X8
+ MOVQ 8(SI), X9
+ PINSRQ $0x01, 80(SI), X9
+ MOVQ 16(SI), X10
+ PINSRQ $0x01, 56(SI), X10
+ MOVQ 32(SI), X11
+ PINSRQ $0x01, 40(SI), X11
+ PADDQ X8, X0
+ PADDQ X9, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFD $0xb1, X6, X6
+ PSHUFD $0xb1, X7, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ PSHUFB X13, X2
+ PSHUFB X13, X3
+ PADDQ X10, X0
+ PADDQ X11, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFB X14, X6
+ PSHUFB X14, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ MOVOU X2, X11
+ PADDQ X2, X11
+ PSRLQ $0x3f, X2
+ PXOR X11, X2
+ MOVOU X3, X11
+ PADDQ X3, X11
+ PSRLQ $0x3f, X3
+ PXOR X11, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ MOVQ 80(SI), X8
+ PINSRQ $0x01, 64(SI), X8
+ MOVQ 56(SI), X9
+ PINSRQ $0x01, 8(SI), X9
+ MOVQ 16(SI), X10
+ PINSRQ $0x01, 32(SI), X10
+ MOVQ 48(SI), X11
+ PINSRQ $0x01, 40(SI), X11
+ PADDQ X8, X0
+ PADDQ X9, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFD $0xb1, X6, X6
+ PSHUFD $0xb1, X7, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ PSHUFB X13, X2
+ PSHUFB X13, X3
+ PADDQ X10, X0
+ PADDQ X11, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFB X14, X6
+ PSHUFB X14, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ MOVOU X2, X11
+ PADDQ X2, X11
+ PSRLQ $0x3f, X2
+ PXOR X11, X2
+ MOVOU X3, X11
+ PADDQ X3, X11
+ PSRLQ $0x3f, X3
+ PXOR X11, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ MOVQ 120(SI), X8
+ PINSRQ $0x01, 72(SI), X8
+ MOVQ 24(SI), X9
+ PINSRQ $0x01, 104(SI), X9
+ MOVQ 88(SI), X10
+ PINSRQ $0x01, 112(SI), X10
+ MOVQ 96(SI), X11
+ PINSRQ $0x01, (SI), X11
+ PADDQ X8, X0
+ PADDQ X9, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFD $0xb1, X6, X6
+ PSHUFD $0xb1, X7, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ PSHUFB X13, X2
+ PSHUFB X13, X3
+ PADDQ X10, X0
+ PADDQ X11, X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFB X14, X6
+ PSHUFB X14, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ MOVOU X2, X11
+ PADDQ X2, X11
+ PSRLQ $0x3f, X2
+ PXOR X11, X2
+ MOVOU X3, X11
+ PADDQ X3, X11
+ PSRLQ $0x3f, X3
+ PXOR X11, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
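+ // Final two rounds: BLAKE2b's sigma schedule repeats sigma[0] and sigma[1]
+ // for rounds 10 and 11, so the message vectors are taken from a block
+ // precomputed earlier at R10 rather than re-gathered with MOVQ/PINSRQ.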
+ PADDQ 16(R10), X0
+ PADDQ 32(R10), X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFD $0xb1, X6, X6
+ PSHUFD $0xb1, X7, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ PSHUFB X13, X2
+ PSHUFB X13, X3
+ PADDQ 48(R10), X0
+ PADDQ 64(R10), X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFB X14, X6
+ PSHUFB X14, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ MOVOU X2, X11
+ PADDQ X2, X11
+ PSRLQ $0x3f, X2
+ PXOR X11, X2
+ MOVOU X3, X11
+ PADDQ X3, X11
+ PSRLQ $0x3f, X3
+ PXOR X11, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ PADDQ 80(R10), X0
+ PADDQ 96(R10), X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFD $0xb1, X6, X6
+ PSHUFD $0xb1, X7, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ PSHUFB X13, X2
+ PSHUFB X13, X3
+ PADDQ 112(R10), X0
+ PADDQ 128(R10), X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFB X14, X6
+ PSHUFB X14, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ MOVOU X2, X11
+ PADDQ X2, X11
+ PSRLQ $0x3f, X2
+ PXOR X11, X2
+ MOVOU X3, X11
+ PADDQ X3, X11
+ PSRLQ $0x3f, X3
+ PXOR X11, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
+ PADDQ 144(R10), X0
+ PADDQ 160(R10), X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFD $0xb1, X6, X6
+ PSHUFD $0xb1, X7, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ PSHUFB X13, X2
+ PSHUFB X13, X3
+ PADDQ 176(R10), X0
+ PADDQ 192(R10), X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFB X14, X6
+ PSHUFB X14, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ MOVOU X2, X11
+ PADDQ X2, X11
+ PSRLQ $0x3f, X2
+ PXOR X11, X2
+ MOVOU X3, X11
+ PADDQ X3, X11
+ PSRLQ $0x3f, X3
+ PXOR X11, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X6, X8
+ PUNPCKLQDQ X6, X9
+ PUNPCKHQDQ X7, X6
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X7, X9
+ MOVO X8, X7
+ MOVO X2, X8
+ PUNPCKHQDQ X9, X7
+ PUNPCKLQDQ X3, X9
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X3
+ PADDQ 208(R10), X0
+ PADDQ 224(R10), X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFD $0xb1, X6, X6
+ PSHUFD $0xb1, X7, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ PSHUFB X13, X2
+ PSHUFB X13, X3
+ PADDQ 240(R10), X0
+ PADDQ 256(R10), X1
+ PADDQ X2, X0
+ PADDQ X3, X1
+ PXOR X0, X6
+ PXOR X1, X7
+ PSHUFB X14, X6
+ PSHUFB X14, X7
+ PADDQ X6, X4
+ PADDQ X7, X5
+ PXOR X4, X2
+ PXOR X5, X3
+ MOVOU X2, X11
+ PADDQ X2, X11
+ PSRLQ $0x3f, X2
+ PXOR X11, X2
+ MOVOU X3, X11
+ PADDQ X3, X11
+ PSRLQ $0x3f, X3
+ PXOR X11, X3
+ MOVO X4, X8
+ MOVO X5, X4
+ MOVO X8, X5
+ MOVO X2, X8
+ PUNPCKLQDQ X2, X9
+ PUNPCKHQDQ X3, X2
+ PUNPCKHQDQ X9, X2
+ PUNPCKLQDQ X3, X9
+ MOVO X8, X3
+ MOVO X6, X8
+ PUNPCKHQDQ X9, X3
+ PUNPCKLQDQ X7, X9
+ PUNPCKHQDQ X9, X6
+ PUNPCKLQDQ X8, X9
+ PUNPCKHQDQ X9, X7
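+ // Feed-forward: h[i] ^= v[i] ^ v[i+8]. X12/X15 carry h[0:4] across blocks;
+ // h[4:8] is reloaded from 32/48(AX), mixed, and stored each block.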
+ MOVOU 32(AX), X10
+ MOVOU 48(AX), X11
+ PXOR X0, X12
+ PXOR X1, X15
+ PXOR X2, X10
+ PXOR X3, X11
+ PXOR X4, X12
+ PXOR X5, X15
+ PXOR X6, X10
+ PXOR X7, X11
+ MOVOU X10, 32(AX)
+ MOVOU X11, 48(AX)
+ LEAQ 128(SI), SI
+ SUBQ $0x80, DI
+ JNE loop
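+ // All blocks processed: write back h[0:4] and the 128-bit byte counter.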
+ MOVOU X12, (AX)
+ MOVOU X15, 16(AX)
+ MOVQ R8, (BX)
+ MOVQ R9, 8(BX)
+ RET
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X11, X13, X14)
- SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
- HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X11, X13, X14)
- SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
+DATA ·iv3<>+0(SB)/8, $0x1f83d9abfb41bd6b
+DATA ·iv3<>+8(SB)/8, $0x5be0cd19137e2179
+GLOBL ·iv3<>(SB), RODATA|NOPTR, $16
- MOVOU 32(AX), X10
- MOVOU 48(AX), X11
- PXOR X0, X12
- PXOR X1, X15
- PXOR X2, X10
- PXOR X3, X11
- PXOR X4, X12
- PXOR X5, X15
- PXOR X6, X10
- PXOR X7, X11
- MOVOU X10, 32(AX)
- MOVOU X11, 48(AX)
+DATA ·c40<>+0(SB)/8, $0x0201000706050403
+DATA ·c40<>+8(SB)/8, $0x0a09080f0e0d0c0b
+GLOBL ·c40<>(SB), RODATA|NOPTR, $16
- LEAQ 128(SI), SI
- SUBQ $128, DI
- JNE loop
+DATA ·c48<>+0(SB)/8, $0x0100070605040302
+DATA ·c48<>+8(SB)/8, $0x09080f0e0d0c0b0a
+GLOBL ·c48<>(SB), RODATA|NOPTR, $16
- MOVOU X12, 0(AX)
- MOVOU X15, 16(AX)
+DATA ·iv0<>+0(SB)/8, $0x6a09e667f3bcc908
+DATA ·iv0<>+8(SB)/8, $0xbb67ae8584caa73b
+GLOBL ·iv0<>(SB), RODATA|NOPTR, $16
- MOVQ R8, 0(BX)
- MOVQ R9, 8(BX)
+DATA ·iv1<>+0(SB)/8, $0x3c6ef372fe94f82b
+DATA ·iv1<>+8(SB)/8, $0xa54ff53a5f1d36f1
+GLOBL ·iv1<>(SB), RODATA|NOPTR, $16
- RET
+DATA ·iv2<>+0(SB)/8, $0x510e527fade682d1
+DATA ·iv2<>+8(SB)/8, $0x9b05688c2b3e6c1f
+GLOBL ·iv2<>(SB), RODATA|NOPTR, $16
diff --git a/vendor/golang.org/x/crypto/blake2s/blake2s_amd64.s b/vendor/golang.org/x/crypto/blake2s/blake2s_amd64.s
index fe4b818a3..57d510fc0 100644
--- a/vendor/golang.org/x/crypto/blake2s/blake2s_amd64.s
+++ b/vendor/golang.org/x/crypto/blake2s/blake2s_amd64.s
@@ -1,432 +1,2173 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Code generated by command: go run blake2s_amd64_asm.go -out ../blake2s_amd64.s -pkg blake2s. DO NOT EDIT.
//go:build amd64 && gc && !purego
#include "textflag.h"
-DATA iv0<>+0x00(SB)/4, $0x6a09e667
-DATA iv0<>+0x04(SB)/4, $0xbb67ae85
-DATA iv0<>+0x08(SB)/4, $0x3c6ef372
-DATA iv0<>+0x0c(SB)/4, $0xa54ff53a
-GLOBL iv0<>(SB), (NOPTR+RODATA), $16
-
-DATA iv1<>+0x00(SB)/4, $0x510e527f
-DATA iv1<>+0x04(SB)/4, $0x9b05688c
-DATA iv1<>+0x08(SB)/4, $0x1f83d9ab
-DATA iv1<>+0x0c(SB)/4, $0x5be0cd19
-GLOBL iv1<>(SB), (NOPTR+RODATA), $16
-
-DATA rol16<>+0x00(SB)/8, $0x0504070601000302
-DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
-GLOBL rol16<>(SB), (NOPTR+RODATA), $16
-
-DATA rol8<>+0x00(SB)/8, $0x0407060500030201
-DATA rol8<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
-GLOBL rol8<>(SB), (NOPTR+RODATA), $16
-
-DATA counter<>+0x00(SB)/8, $0x40
-DATA counter<>+0x08(SB)/8, $0x0
-GLOBL counter<>(SB), (NOPTR+RODATA), $16
-
-#define ROTL_SSE2(n, t, v) \
- MOVO v, t; \
- PSLLL $n, t; \
- PSRLL $(32-n), v; \
- PXOR t, v
-
-#define ROTL_SSSE3(c, v) \
- PSHUFB c, v
-
-#define ROUND_SSE2(v0, v1, v2, v3, m0, m1, m2, m3, t) \
- PADDL m0, v0; \
- PADDL v1, v0; \
- PXOR v0, v3; \
- ROTL_SSE2(16, t, v3); \
- PADDL v3, v2; \
- PXOR v2, v1; \
- ROTL_SSE2(20, t, v1); \
- PADDL m1, v0; \
- PADDL v1, v0; \
- PXOR v0, v3; \
- ROTL_SSE2(24, t, v3); \
- PADDL v3, v2; \
- PXOR v2, v1; \
- ROTL_SSE2(25, t, v1); \
- PSHUFL $0x39, v1, v1; \
- PSHUFL $0x4E, v2, v2; \
- PSHUFL $0x93, v3, v3; \
- PADDL m2, v0; \
- PADDL v1, v0; \
- PXOR v0, v3; \
- ROTL_SSE2(16, t, v3); \
- PADDL v3, v2; \
- PXOR v2, v1; \
- ROTL_SSE2(20, t, v1); \
- PADDL m3, v0; \
- PADDL v1, v0; \
- PXOR v0, v3; \
- ROTL_SSE2(24, t, v3); \
- PADDL v3, v2; \
- PXOR v2, v1; \
- ROTL_SSE2(25, t, v1); \
- PSHUFL $0x39, v3, v3; \
- PSHUFL $0x4E, v2, v2; \
- PSHUFL $0x93, v1, v1
-
-#define ROUND_SSSE3(v0, v1, v2, v3, m0, m1, m2, m3, t, c16, c8) \
- PADDL m0, v0; \
- PADDL v1, v0; \
- PXOR v0, v3; \
- ROTL_SSSE3(c16, v3); \
- PADDL v3, v2; \
- PXOR v2, v1; \
- ROTL_SSE2(20, t, v1); \
- PADDL m1, v0; \
- PADDL v1, v0; \
- PXOR v0, v3; \
- ROTL_SSSE3(c8, v3); \
- PADDL v3, v2; \
- PXOR v2, v1; \
- ROTL_SSE2(25, t, v1); \
- PSHUFL $0x39, v1, v1; \
- PSHUFL $0x4E, v2, v2; \
- PSHUFL $0x93, v3, v3; \
- PADDL m2, v0; \
- PADDL v1, v0; \
- PXOR v0, v3; \
- ROTL_SSSE3(c16, v3); \
- PADDL v3, v2; \
- PXOR v2, v1; \
- ROTL_SSE2(20, t, v1); \
- PADDL m3, v0; \
- PADDL v1, v0; \
- PXOR v0, v3; \
- ROTL_SSSE3(c8, v3); \
- PADDL v3, v2; \
- PXOR v2, v1; \
- ROTL_SSE2(25, t, v1); \
- PSHUFL $0x39, v3, v3; \
- PSHUFL $0x4E, v2, v2; \
- PSHUFL $0x93, v1, v1
-
-
-#define LOAD_MSG_SSE4(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15) \
- MOVL i0*4(src), m0; \
- PINSRD $1, i1*4(src), m0; \
- PINSRD $2, i2*4(src), m0; \
- PINSRD $3, i3*4(src), m0; \
- MOVL i4*4(src), m1; \
- PINSRD $1, i5*4(src), m1; \
- PINSRD $2, i6*4(src), m1; \
- PINSRD $3, i7*4(src), m1; \
- MOVL i8*4(src), m2; \
- PINSRD $1, i9*4(src), m2; \
- PINSRD $2, i10*4(src), m2; \
- PINSRD $3, i11*4(src), m2; \
- MOVL i12*4(src), m3; \
- PINSRD $1, i13*4(src), m3; \
- PINSRD $2, i14*4(src), m3; \
- PINSRD $3, i15*4(src), m3
+// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
+// Requires: SSE2
+TEXT ·hashBlocksSSE2(SB), $672-48
+ MOVQ h+0(FP), AX
+ MOVQ c+8(FP), BX
+ MOVL flag+16(FP), CX
+ MOVQ blocks_base+24(FP), SI
+ MOVQ blocks_len+32(FP), DX
+ MOVQ SP, BP
+ ADDQ $0x0f, BP
+ ANDQ $-16, BP
+ MOVQ (BX), R9
+ MOVQ R9, (BP)
+ MOVQ CX, 8(BP)
+ MOVOU (AX), X0
+ MOVOU 16(AX), X1
+ MOVOU iv0<>+0(SB), X2
+ MOVOU iv1<>+0(SB), X3
+ MOVOU counter<>+0(SB), X12
+ MOVOU rol16<>+0(SB), X13
+ MOVOU rol8<>+0(SB), X14
+ MOVO (BP), X15
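+ // Prologue: align BP to 16 bytes for the on-stack message schedule, stash
+ // the counter and flag at (BP), and load h, the IVs, the counter increment,
+ // and the byte-rotation masks into fixed registers.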
-#define PRECOMPUTE_MSG(dst, off, src, R8, R9, R10, R11, R12, R13, R14, R15) \
- MOVQ 0*4(src), R8; \
- MOVQ 2*4(src), R9; \
- MOVQ 4*4(src), R10; \
- MOVQ 6*4(src), R11; \
- MOVQ 8*4(src), R12; \
- MOVQ 10*4(src), R13; \
- MOVQ 12*4(src), R14; \
- MOVQ 14*4(src), R15; \
- \
- MOVL R8, 0*4+off+0(dst); \
- MOVL R8, 9*4+off+64(dst); \
- MOVL R8, 5*4+off+128(dst); \
- MOVL R8, 14*4+off+192(dst); \
- MOVL R8, 4*4+off+256(dst); \
- MOVL R8, 2*4+off+320(dst); \
- MOVL R8, 8*4+off+384(dst); \
- MOVL R8, 12*4+off+448(dst); \
- MOVL R8, 3*4+off+512(dst); \
- MOVL R8, 15*4+off+576(dst); \
- SHRQ $32, R8; \
- MOVL R8, 4*4+off+0(dst); \
- MOVL R8, 8*4+off+64(dst); \
- MOVL R8, 14*4+off+128(dst); \
- MOVL R8, 5*4+off+192(dst); \
- MOVL R8, 12*4+off+256(dst); \
- MOVL R8, 11*4+off+320(dst); \
- MOVL R8, 1*4+off+384(dst); \
- MOVL R8, 6*4+off+448(dst); \
- MOVL R8, 10*4+off+512(dst); \
- MOVL R8, 3*4+off+576(dst); \
- \
- MOVL R9, 1*4+off+0(dst); \
- MOVL R9, 13*4+off+64(dst); \
- MOVL R9, 6*4+off+128(dst); \
- MOVL R9, 8*4+off+192(dst); \
- MOVL R9, 2*4+off+256(dst); \
- MOVL R9, 0*4+off+320(dst); \
- MOVL R9, 14*4+off+384(dst); \
- MOVL R9, 11*4+off+448(dst); \
- MOVL R9, 12*4+off+512(dst); \
- MOVL R9, 4*4+off+576(dst); \
- SHRQ $32, R9; \
- MOVL R9, 5*4+off+0(dst); \
- MOVL R9, 15*4+off+64(dst); \
- MOVL R9, 9*4+off+128(dst); \
- MOVL R9, 1*4+off+192(dst); \
- MOVL R9, 11*4+off+256(dst); \
- MOVL R9, 7*4+off+320(dst); \
- MOVL R9, 13*4+off+384(dst); \
- MOVL R9, 3*4+off+448(dst); \
- MOVL R9, 6*4+off+512(dst); \
- MOVL R9, 10*4+off+576(dst); \
- \
- MOVL R10, 2*4+off+0(dst); \
- MOVL R10, 1*4+off+64(dst); \
- MOVL R10, 15*4+off+128(dst); \
- MOVL R10, 10*4+off+192(dst); \
- MOVL R10, 6*4+off+256(dst); \
- MOVL R10, 8*4+off+320(dst); \
- MOVL R10, 3*4+off+384(dst); \
- MOVL R10, 13*4+off+448(dst); \
- MOVL R10, 14*4+off+512(dst); \
- MOVL R10, 5*4+off+576(dst); \
- SHRQ $32, R10; \
- MOVL R10, 6*4+off+0(dst); \
- MOVL R10, 11*4+off+64(dst); \
- MOVL R10, 2*4+off+128(dst); \
- MOVL R10, 9*4+off+192(dst); \
- MOVL R10, 1*4+off+256(dst); \
- MOVL R10, 13*4+off+320(dst); \
- MOVL R10, 4*4+off+384(dst); \
- MOVL R10, 8*4+off+448(dst); \
- MOVL R10, 15*4+off+512(dst); \
- MOVL R10, 7*4+off+576(dst); \
- \
- MOVL R11, 3*4+off+0(dst); \
- MOVL R11, 7*4+off+64(dst); \
- MOVL R11, 13*4+off+128(dst); \
- MOVL R11, 12*4+off+192(dst); \
- MOVL R11, 10*4+off+256(dst); \
- MOVL R11, 1*4+off+320(dst); \
- MOVL R11, 9*4+off+384(dst); \
- MOVL R11, 14*4+off+448(dst); \
- MOVL R11, 0*4+off+512(dst); \
- MOVL R11, 6*4+off+576(dst); \
- SHRQ $32, R11; \
- MOVL R11, 7*4+off+0(dst); \
- MOVL R11, 14*4+off+64(dst); \
- MOVL R11, 10*4+off+128(dst); \
- MOVL R11, 0*4+off+192(dst); \
- MOVL R11, 5*4+off+256(dst); \
- MOVL R11, 9*4+off+320(dst); \
- MOVL R11, 12*4+off+384(dst); \
- MOVL R11, 1*4+off+448(dst); \
- MOVL R11, 13*4+off+512(dst); \
- MOVL R11, 2*4+off+576(dst); \
- \
- MOVL R12, 8*4+off+0(dst); \
- MOVL R12, 5*4+off+64(dst); \
- MOVL R12, 4*4+off+128(dst); \
- MOVL R12, 15*4+off+192(dst); \
- MOVL R12, 14*4+off+256(dst); \
- MOVL R12, 3*4+off+320(dst); \
- MOVL R12, 11*4+off+384(dst); \
- MOVL R12, 10*4+off+448(dst); \
- MOVL R12, 7*4+off+512(dst); \
- MOVL R12, 1*4+off+576(dst); \
- SHRQ $32, R12; \
- MOVL R12, 12*4+off+0(dst); \
- MOVL R12, 2*4+off+64(dst); \
- MOVL R12, 11*4+off+128(dst); \
- MOVL R12, 4*4+off+192(dst); \
- MOVL R12, 0*4+off+256(dst); \
- MOVL R12, 15*4+off+320(dst); \
- MOVL R12, 10*4+off+384(dst); \
- MOVL R12, 7*4+off+448(dst); \
- MOVL R12, 5*4+off+512(dst); \
- MOVL R12, 9*4+off+576(dst); \
- \
- MOVL R13, 9*4+off+0(dst); \
- MOVL R13, 4*4+off+64(dst); \
- MOVL R13, 8*4+off+128(dst); \
- MOVL R13, 13*4+off+192(dst); \
- MOVL R13, 3*4+off+256(dst); \
- MOVL R13, 5*4+off+320(dst); \
- MOVL R13, 7*4+off+384(dst); \
- MOVL R13, 15*4+off+448(dst); \
- MOVL R13, 11*4+off+512(dst); \
- MOVL R13, 0*4+off+576(dst); \
- SHRQ $32, R13; \
- MOVL R13, 13*4+off+0(dst); \
- MOVL R13, 10*4+off+64(dst); \
- MOVL R13, 0*4+off+128(dst); \
- MOVL R13, 3*4+off+192(dst); \
- MOVL R13, 9*4+off+256(dst); \
- MOVL R13, 6*4+off+320(dst); \
- MOVL R13, 15*4+off+384(dst); \
- MOVL R13, 4*4+off+448(dst); \
- MOVL R13, 2*4+off+512(dst); \
- MOVL R13, 12*4+off+576(dst); \
- \
- MOVL R14, 10*4+off+0(dst); \
- MOVL R14, 12*4+off+64(dst); \
- MOVL R14, 1*4+off+128(dst); \
- MOVL R14, 6*4+off+192(dst); \
- MOVL R14, 13*4+off+256(dst); \
- MOVL R14, 4*4+off+320(dst); \
- MOVL R14, 0*4+off+384(dst); \
- MOVL R14, 2*4+off+448(dst); \
- MOVL R14, 8*4+off+512(dst); \
- MOVL R14, 14*4+off+576(dst); \
- SHRQ $32, R14; \
- MOVL R14, 14*4+off+0(dst); \
- MOVL R14, 3*4+off+64(dst); \
- MOVL R14, 7*4+off+128(dst); \
- MOVL R14, 2*4+off+192(dst); \
- MOVL R14, 15*4+off+256(dst); \
- MOVL R14, 12*4+off+320(dst); \
- MOVL R14, 6*4+off+384(dst); \
- MOVL R14, 0*4+off+448(dst); \
- MOVL R14, 9*4+off+512(dst); \
- MOVL R14, 11*4+off+576(dst); \
- \
- MOVL R15, 11*4+off+0(dst); \
- MOVL R15, 0*4+off+64(dst); \
- MOVL R15, 12*4+off+128(dst); \
- MOVL R15, 7*4+off+192(dst); \
- MOVL R15, 8*4+off+256(dst); \
- MOVL R15, 14*4+off+320(dst); \
- MOVL R15, 2*4+off+384(dst); \
- MOVL R15, 5*4+off+448(dst); \
- MOVL R15, 1*4+off+512(dst); \
- MOVL R15, 13*4+off+576(dst); \
- SHRQ $32, R15; \
- MOVL R15, 15*4+off+0(dst); \
- MOVL R15, 6*4+off+64(dst); \
- MOVL R15, 3*4+off+128(dst); \
- MOVL R15, 11*4+off+192(dst); \
- MOVL R15, 7*4+off+256(dst); \
- MOVL R15, 10*4+off+320(dst); \
- MOVL R15, 5*4+off+384(dst); \
- MOVL R15, 9*4+off+448(dst); \
- MOVL R15, 4*4+off+512(dst); \
- MOVL R15, 8*4+off+576(dst)
+loop:
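+ // Per block: v[0:8] = h, v[8:16] = IV, then bump the byte counter by 64
+ // (one block) and fold the counter/flag words into v[12:16].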
+ MOVO X0, X4
+ MOVO X1, X5
+ MOVO X2, X6
+ MOVO X3, X7
+ PADDQ X12, X15
+ PXOR X15, X7
+ MOVQ (SI), R8
+ MOVQ 8(SI), R9
+ MOVQ 16(SI), R10
+ MOVQ 24(SI), R11
+ MOVQ 32(SI), R12
+ MOVQ 40(SI), R13
+ MOVQ 48(SI), R14
+ MOVQ 56(SI), R15
+ MOVL R8, 16(BP)
+ MOVL R8, 116(BP)
+ MOVL R8, 164(BP)
+ MOVL R8, 264(BP)
+ MOVL R8, 288(BP)
+ MOVL R8, 344(BP)
+ MOVL R8, 432(BP)
+ MOVL R8, 512(BP)
+ MOVL R8, 540(BP)
+ MOVL R8, 652(BP)
+ SHRQ $0x20, R8
+ MOVL R8, 32(BP)
+ MOVL R8, 112(BP)
+ MOVL R8, 200(BP)
+ MOVL R8, 228(BP)
+ MOVL R8, 320(BP)
+ MOVL R8, 380(BP)
+ MOVL R8, 404(BP)
+ MOVL R8, 488(BP)
+ MOVL R8, 568(BP)
+ MOVL R8, 604(BP)
+ MOVL R9, 20(BP)
+ MOVL R9, 132(BP)
+ MOVL R9, 168(BP)
+ MOVL R9, 240(BP)
+ MOVL R9, 280(BP)
+ MOVL R9, 336(BP)
+ MOVL R9, 456(BP)
+ MOVL R9, 508(BP)
+ MOVL R9, 576(BP)
+ MOVL R9, 608(BP)
+ SHRQ $0x20, R9
+ MOVL R9, 36(BP)
+ MOVL R9, 140(BP)
+ MOVL R9, 180(BP)
+ MOVL R9, 212(BP)
+ MOVL R9, 316(BP)
+ MOVL R9, 364(BP)
+ MOVL R9, 452(BP)
+ MOVL R9, 476(BP)
+ MOVL R9, 552(BP)
+ MOVL R9, 632(BP)
+ MOVL R10, 24(BP)
+ MOVL R10, 84(BP)
+ MOVL R10, 204(BP)
+ MOVL R10, 248(BP)
+ MOVL R10, 296(BP)
+ MOVL R10, 368(BP)
+ MOVL R10, 412(BP)
+ MOVL R10, 516(BP)
+ MOVL R10, 584(BP)
+ MOVL R10, 612(BP)
+ SHRQ $0x20, R10
+ MOVL R10, 40(BP)
+ MOVL R10, 124(BP)
+ MOVL R10, 152(BP)
+ MOVL R10, 244(BP)
+ MOVL R10, 276(BP)
+ MOVL R10, 388(BP)
+ MOVL R10, 416(BP)
+ MOVL R10, 496(BP)
+ MOVL R10, 588(BP)
+ MOVL R10, 620(BP)
+ MOVL R11, 28(BP)
+ MOVL R11, 108(BP)
+ MOVL R11, 196(BP)
+ MOVL R11, 256(BP)
+ MOVL R11, 312(BP)
+ MOVL R11, 340(BP)
+ MOVL R11, 436(BP)
+ MOVL R11, 520(BP)
+ MOVL R11, 528(BP)
+ MOVL R11, 616(BP)
+ SHRQ $0x20, R11
+ MOVL R11, 44(BP)
+ MOVL R11, 136(BP)
+ MOVL R11, 184(BP)
+ MOVL R11, 208(BP)
+ MOVL R11, 292(BP)
+ MOVL R11, 372(BP)
+ MOVL R11, 448(BP)
+ MOVL R11, 468(BP)
+ MOVL R11, 580(BP)
+ MOVL R11, 600(BP)
+ MOVL R12, 48(BP)
+ MOVL R12, 100(BP)
+ MOVL R12, 160(BP)
+ MOVL R12, 268(BP)
+ MOVL R12, 328(BP)
+ MOVL R12, 348(BP)
+ MOVL R12, 444(BP)
+ MOVL R12, 504(BP)
+ MOVL R12, 556(BP)
+ MOVL R12, 596(BP)
+ SHRQ $0x20, R12
+ MOVL R12, 64(BP)
+ MOVL R12, 88(BP)
+ MOVL R12, 188(BP)
+ MOVL R12, 224(BP)
+ MOVL R12, 272(BP)
+ MOVL R12, 396(BP)
+ MOVL R12, 440(BP)
+ MOVL R12, 492(BP)
+ MOVL R12, 548(BP)
+ MOVL R12, 628(BP)
+ MOVL R13, 52(BP)
+ MOVL R13, 96(BP)
+ MOVL R13, 176(BP)
+ MOVL R13, 260(BP)
+ MOVL R13, 284(BP)
+ MOVL R13, 356(BP)
+ MOVL R13, 428(BP)
+ MOVL R13, 524(BP)
+ MOVL R13, 572(BP)
+ MOVL R13, 592(BP)
+ SHRQ $0x20, R13
+ MOVL R13, 68(BP)
+ MOVL R13, 120(BP)
+ MOVL R13, 144(BP)
+ MOVL R13, 220(BP)
+ MOVL R13, 308(BP)
+ MOVL R13, 360(BP)
+ MOVL R13, 460(BP)
+ MOVL R13, 480(BP)
+ MOVL R13, 536(BP)
+ MOVL R13, 640(BP)
+ MOVL R14, 56(BP)
+ MOVL R14, 128(BP)
+ MOVL R14, 148(BP)
+ MOVL R14, 232(BP)
+ MOVL R14, 324(BP)
+ MOVL R14, 352(BP)
+ MOVL R14, 400(BP)
+ MOVL R14, 472(BP)
+ MOVL R14, 560(BP)
+ MOVL R14, 648(BP)
+ SHRQ $0x20, R14
+ MOVL R14, 72(BP)
+ MOVL R14, 92(BP)
+ MOVL R14, 172(BP)
+ MOVL R14, 216(BP)
+ MOVL R14, 332(BP)
+ MOVL R14, 384(BP)
+ MOVL R14, 424(BP)
+ MOVL R14, 464(BP)
+ MOVL R14, 564(BP)
+ MOVL R14, 636(BP)
+ MOVL R15, 60(BP)
+ MOVL R15, 80(BP)
+ MOVL R15, 192(BP)
+ MOVL R15, 236(BP)
+ MOVL R15, 304(BP)
+ MOVL R15, 392(BP)
+ MOVL R15, 408(BP)
+ MOVL R15, 484(BP)
+ MOVL R15, 532(BP)
+ MOVL R15, 644(BP)
+ SHRQ $0x20, R15
+ MOVL R15, 76(BP)
+ MOVL R15, 104(BP)
+ MOVL R15, 156(BP)
+ MOVL R15, 252(BP)
+ MOVL R15, 300(BP)
+ MOVL R15, 376(BP)
+ MOVL R15, 420(BP)
+ MOVL R15, 500(BP)
+ MOVL R15, 544(BP)
+ MOVL R15, 624(BP)
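+ // The stores above lay out all ten rounds' message schedules (the sigma
+ // permutations) in the 640-byte stack area; each round below reads four
+ // 16-byte vectors from it. Rotations use the plain SSE2 pattern:
+ // MOVO/PSLLL $(32-n)/PSRLL $n/PXOR implements rotr32(x, n).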
+ PADDL 16(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x10, X8
+ PSRLL $0x10, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 32(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x18, X8
+ PSRLL $0x08, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL 48(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x10, X8
+ PSRLL $0x10, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 64(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x18, X8
+ PSRLL $0x08, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ PADDL 80(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x10, X8
+ PSRLL $0x10, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 96(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x18, X8
+ PSRLL $0x08, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL 112(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x10, X8
+ PSRLL $0x10, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 128(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x18, X8
+ PSRLL $0x08, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ PADDL 144(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x10, X8
+ PSRLL $0x10, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 160(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x18, X8
+ PSRLL $0x08, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL 176(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x10, X8
+ PSRLL $0x10, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 192(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x18, X8
+ PSRLL $0x08, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ PADDL 208(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x10, X8
+ PSRLL $0x10, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 224(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x18, X8
+ PSRLL $0x08, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL 240(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x10, X8
+ PSRLL $0x10, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 256(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x18, X8
+ PSRLL $0x08, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ PADDL 272(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x10, X8
+ PSRLL $0x10, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 288(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x18, X8
+ PSRLL $0x08, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL 304(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x10, X8
+ PSRLL $0x10, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 320(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x18, X8
+ PSRLL $0x08, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ PADDL 336(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x10, X8
+ PSRLL $0x10, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 352(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x18, X8
+ PSRLL $0x08, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL 368(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x10, X8
+ PSRLL $0x10, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 384(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x18, X8
+ PSRLL $0x08, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ PADDL 400(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x10, X8
+ PSRLL $0x10, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 416(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x18, X8
+ PSRLL $0x08, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL 432(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x10, X8
+ PSRLL $0x10, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 448(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x18, X8
+ PSRLL $0x08, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ PADDL 464(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x10, X8
+ PSRLL $0x10, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 480(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x18, X8
+ PSRLL $0x08, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL 496(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x10, X8
+ PSRLL $0x10, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 512(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x18, X8
+ PSRLL $0x08, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ PADDL 528(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x10, X8
+ PSRLL $0x10, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 544(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x18, X8
+ PSRLL $0x08, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL 560(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x10, X8
+ PSRLL $0x10, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 576(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x18, X8
+ PSRLL $0x08, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ PADDL 592(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x10, X8
+ PSRLL $0x10, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 608(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x18, X8
+ PSRLL $0x08, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL 624(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x10, X8
+ PSRLL $0x10, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 640(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ MOVO X7, X8
+ PSLLL $0x18, X8
+ PSRLL $0x08, X7
+ PXOR X8, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
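+ // Feed-forward: h[0:4] ^= v[0:4] ^ v[8:12]; h[4:8] ^= v[4:8] ^ v[12:16].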
+ PXOR X4, X0
+ PXOR X5, X1
+ PXOR X6, X0
+ PXOR X7, X1
+ LEAQ 64(SI), SI
+ SUBQ $0x40, DX
+ JNE loop
+ MOVO X15, (BP)
+ MOVQ (BP), R9
+ MOVQ R9, (BX)
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
+ RET
-#define BLAKE2s_SSE2() \
- PRECOMPUTE_MSG(BP, 16, SI, R8, R9, R10, R11, R12, R13, R14, R15); \
- ROUND_SSE2(X4, X5, X6, X7, 16(BP), 32(BP), 48(BP), 64(BP), X8); \
- ROUND_SSE2(X4, X5, X6, X7, 16+64(BP), 32+64(BP), 48+64(BP), 64+64(BP), X8); \
- ROUND_SSE2(X4, X5, X6, X7, 16+128(BP), 32+128(BP), 48+128(BP), 64+128(BP), X8); \
- ROUND_SSE2(X4, X5, X6, X7, 16+192(BP), 32+192(BP), 48+192(BP), 64+192(BP), X8); \
- ROUND_SSE2(X4, X5, X6, X7, 16+256(BP), 32+256(BP), 48+256(BP), 64+256(BP), X8); \
- ROUND_SSE2(X4, X5, X6, X7, 16+320(BP), 32+320(BP), 48+320(BP), 64+320(BP), X8); \
- ROUND_SSE2(X4, X5, X6, X7, 16+384(BP), 32+384(BP), 48+384(BP), 64+384(BP), X8); \
- ROUND_SSE2(X4, X5, X6, X7, 16+448(BP), 32+448(BP), 48+448(BP), 64+448(BP), X8); \
- ROUND_SSE2(X4, X5, X6, X7, 16+512(BP), 32+512(BP), 48+512(BP), 64+512(BP), X8); \
- ROUND_SSE2(X4, X5, X6, X7, 16+576(BP), 32+576(BP), 48+576(BP), 64+576(BP), X8)
+DATA iv0<>+0(SB)/4, $0x6a09e667
+DATA iv0<>+4(SB)/4, $0xbb67ae85
+DATA iv0<>+8(SB)/4, $0x3c6ef372
+DATA iv0<>+12(SB)/4, $0xa54ff53a
+GLOBL iv0<>(SB), RODATA|NOPTR, $16
-#define BLAKE2s_SSSE3() \
- PRECOMPUTE_MSG(BP, 16, SI, R8, R9, R10, R11, R12, R13, R14, R15); \
- ROUND_SSSE3(X4, X5, X6, X7, 16(BP), 32(BP), 48(BP), 64(BP), X8, X13, X14); \
- ROUND_SSSE3(X4, X5, X6, X7, 16+64(BP), 32+64(BP), 48+64(BP), 64+64(BP), X8, X13, X14); \
- ROUND_SSSE3(X4, X5, X6, X7, 16+128(BP), 32+128(BP), 48+128(BP), 64+128(BP), X8, X13, X14); \
- ROUND_SSSE3(X4, X5, X6, X7, 16+192(BP), 32+192(BP), 48+192(BP), 64+192(BP), X8, X13, X14); \
- ROUND_SSSE3(X4, X5, X6, X7, 16+256(BP), 32+256(BP), 48+256(BP), 64+256(BP), X8, X13, X14); \
- ROUND_SSSE3(X4, X5, X6, X7, 16+320(BP), 32+320(BP), 48+320(BP), 64+320(BP), X8, X13, X14); \
- ROUND_SSSE3(X4, X5, X6, X7, 16+384(BP), 32+384(BP), 48+384(BP), 64+384(BP), X8, X13, X14); \
- ROUND_SSSE3(X4, X5, X6, X7, 16+448(BP), 32+448(BP), 48+448(BP), 64+448(BP), X8, X13, X14); \
- ROUND_SSSE3(X4, X5, X6, X7, 16+512(BP), 32+512(BP), 48+512(BP), 64+512(BP), X8, X13, X14); \
- ROUND_SSSE3(X4, X5, X6, X7, 16+576(BP), 32+576(BP), 48+576(BP), 64+576(BP), X8, X13, X14)
+DATA iv1<>+0(SB)/4, $0x510e527f
+DATA iv1<>+4(SB)/4, $0x9b05688c
+DATA iv1<>+8(SB)/4, $0x1f83d9ab
+DATA iv1<>+12(SB)/4, $0x5be0cd19
+GLOBL iv1<>(SB), RODATA|NOPTR, $16
-#define BLAKE2s_SSE4() \
- LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15); \
- ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \
- LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3); \
- ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \
- LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4); \
- ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \
- LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8); \
- ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \
- LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13); \
- ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \
- LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9); \
- ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \
- LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11); \
- ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \
- LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10); \
- ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \
- LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5); \
- ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14); \
- LOAD_MSG_SSE4(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0); \
- ROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10, X11, X8, X13, X14)
+DATA counter<>+0(SB)/8, $0x0000000000000040
+DATA counter<>+8(SB)/8, $0x0000000000000000
+GLOBL counter<>(SB), RODATA|NOPTR, $16
-#define HASH_BLOCKS(h, c, flag, blocks_base, blocks_len, BLAKE2s_FUNC) \
- MOVQ h, AX; \
- MOVQ c, BX; \
- MOVL flag, CX; \
- MOVQ blocks_base, SI; \
- MOVQ blocks_len, DX; \
- \
- MOVQ SP, BP; \
- ADDQ $15, BP; \
- ANDQ $~15, BP; \
- \
- MOVQ 0(BX), R9; \
- MOVQ R9, 0(BP); \
- MOVQ CX, 8(BP); \
- \
- MOVOU 0(AX), X0; \
- MOVOU 16(AX), X1; \
- MOVOU iv0<>(SB), X2; \
- MOVOU iv1<>(SB), X3 \
- \
- MOVOU counter<>(SB), X12; \
- MOVOU rol16<>(SB), X13; \
- MOVOU rol8<>(SB), X14; \
- MOVO 0(BP), X15; \
- \
- loop: \
- MOVO X0, X4; \
- MOVO X1, X5; \
- MOVO X2, X6; \
- MOVO X3, X7; \
- \
- PADDQ X12, X15; \
- PXOR X15, X7; \
- \
- BLAKE2s_FUNC(); \
- \
- PXOR X4, X0; \
- PXOR X5, X1; \
- PXOR X6, X0; \
- PXOR X7, X1; \
- \
- LEAQ 64(SI), SI; \
- SUBQ $64, DX; \
- JNE loop; \
- \
- MOVO X15, 0(BP); \
- MOVQ 0(BP), R9; \
- MOVQ R9, 0(BX); \
- \
- MOVOU X0, 0(AX); \
- MOVOU X1, 16(AX)
+DATA rol16<>+0(SB)/8, $0x0504070601000302
+DATA rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a
+GLOBL rol16<>(SB), RODATA|NOPTR, $16
-// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
-TEXT ·hashBlocksSSE2(SB), 0, $672-48 // frame = 656 + 16 byte alignment
- HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSE2)
- RET
+DATA rol8<>+0(SB)/8, $0x0407060500030201
+DATA rol8<>+8(SB)/8, $0x0c0f0e0d080b0a09
+GLOBL rol8<>(SB), RODATA|NOPTR, $16
// func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
-TEXT ·hashBlocksSSSE3(SB), 0, $672-48 // frame = 656 + 16 byte alignment
- HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSSE3)
+// Requires: SSE2, SSSE3
+TEXT ·hashBlocksSSSE3(SB), $672-48
+ MOVQ h+0(FP), AX
+ MOVQ c+8(FP), BX
+ MOVL flag+16(FP), CX
+ MOVQ blocks_base+24(FP), SI
+ MOVQ blocks_len+32(FP), DX
+ MOVQ SP, BP
+ ADDQ $0x0f, BP
+ ANDQ $-16, BP
+ MOVQ (BX), R9
+ MOVQ R9, (BP)
+ MOVQ CX, 8(BP)
+ MOVOU (AX), X0
+ MOVOU 16(AX), X1
+ MOVOU iv0<>+0(SB), X2
+ MOVOU iv1<>+0(SB), X3
+ MOVOU counter<>+0(SB), X12
+ MOVOU rol16<>+0(SB), X13
+ MOVOU rol8<>+0(SB), X14
+ MOVO (BP), X15
+
+loop:
+ MOVO X0, X4
+ MOVO X1, X5
+ MOVO X2, X6
+ MOVO X3, X7
+ PADDQ X12, X15
+ PXOR X15, X7
+ MOVQ (SI), R8
+ MOVQ 8(SI), R9
+ MOVQ 16(SI), R10
+ MOVQ 24(SI), R11
+ MOVQ 32(SI), R12
+ MOVQ 40(SI), R13
+ MOVQ 48(SI), R14
+ MOVQ 56(SI), R15
+ MOVL R8, 16(BP)
+ MOVL R8, 116(BP)
+ MOVL R8, 164(BP)
+ MOVL R8, 264(BP)
+ MOVL R8, 288(BP)
+ MOVL R8, 344(BP)
+ MOVL R8, 432(BP)
+ MOVL R8, 512(BP)
+ MOVL R8, 540(BP)
+ MOVL R8, 652(BP)
+ SHRQ $0x20, R8
+ MOVL R8, 32(BP)
+ MOVL R8, 112(BP)
+ MOVL R8, 200(BP)
+ MOVL R8, 228(BP)
+ MOVL R8, 320(BP)
+ MOVL R8, 380(BP)
+ MOVL R8, 404(BP)
+ MOVL R8, 488(BP)
+ MOVL R8, 568(BP)
+ MOVL R8, 604(BP)
+ MOVL R9, 20(BP)
+ MOVL R9, 132(BP)
+ MOVL R9, 168(BP)
+ MOVL R9, 240(BP)
+ MOVL R9, 280(BP)
+ MOVL R9, 336(BP)
+ MOVL R9, 456(BP)
+ MOVL R9, 508(BP)
+ MOVL R9, 576(BP)
+ MOVL R9, 608(BP)
+ SHRQ $0x20, R9
+ MOVL R9, 36(BP)
+ MOVL R9, 140(BP)
+ MOVL R9, 180(BP)
+ MOVL R9, 212(BP)
+ MOVL R9, 316(BP)
+ MOVL R9, 364(BP)
+ MOVL R9, 452(BP)
+ MOVL R9, 476(BP)
+ MOVL R9, 552(BP)
+ MOVL R9, 632(BP)
+ MOVL R10, 24(BP)
+ MOVL R10, 84(BP)
+ MOVL R10, 204(BP)
+ MOVL R10, 248(BP)
+ MOVL R10, 296(BP)
+ MOVL R10, 368(BP)
+ MOVL R10, 412(BP)
+ MOVL R10, 516(BP)
+ MOVL R10, 584(BP)
+ MOVL R10, 612(BP)
+ SHRQ $0x20, R10
+ MOVL R10, 40(BP)
+ MOVL R10, 124(BP)
+ MOVL R10, 152(BP)
+ MOVL R10, 244(BP)
+ MOVL R10, 276(BP)
+ MOVL R10, 388(BP)
+ MOVL R10, 416(BP)
+ MOVL R10, 496(BP)
+ MOVL R10, 588(BP)
+ MOVL R10, 620(BP)
+ MOVL R11, 28(BP)
+ MOVL R11, 108(BP)
+ MOVL R11, 196(BP)
+ MOVL R11, 256(BP)
+ MOVL R11, 312(BP)
+ MOVL R11, 340(BP)
+ MOVL R11, 436(BP)
+ MOVL R11, 520(BP)
+ MOVL R11, 528(BP)
+ MOVL R11, 616(BP)
+ SHRQ $0x20, R11
+ MOVL R11, 44(BP)
+ MOVL R11, 136(BP)
+ MOVL R11, 184(BP)
+ MOVL R11, 208(BP)
+ MOVL R11, 292(BP)
+ MOVL R11, 372(BP)
+ MOVL R11, 448(BP)
+ MOVL R11, 468(BP)
+ MOVL R11, 580(BP)
+ MOVL R11, 600(BP)
+ MOVL R12, 48(BP)
+ MOVL R12, 100(BP)
+ MOVL R12, 160(BP)
+ MOVL R12, 268(BP)
+ MOVL R12, 328(BP)
+ MOVL R12, 348(BP)
+ MOVL R12, 444(BP)
+ MOVL R12, 504(BP)
+ MOVL R12, 556(BP)
+ MOVL R12, 596(BP)
+ SHRQ $0x20, R12
+ MOVL R12, 64(BP)
+ MOVL R12, 88(BP)
+ MOVL R12, 188(BP)
+ MOVL R12, 224(BP)
+ MOVL R12, 272(BP)
+ MOVL R12, 396(BP)
+ MOVL R12, 440(BP)
+ MOVL R12, 492(BP)
+ MOVL R12, 548(BP)
+ MOVL R12, 628(BP)
+ MOVL R13, 52(BP)
+ MOVL R13, 96(BP)
+ MOVL R13, 176(BP)
+ MOVL R13, 260(BP)
+ MOVL R13, 284(BP)
+ MOVL R13, 356(BP)
+ MOVL R13, 428(BP)
+ MOVL R13, 524(BP)
+ MOVL R13, 572(BP)
+ MOVL R13, 592(BP)
+ SHRQ $0x20, R13
+ MOVL R13, 68(BP)
+ MOVL R13, 120(BP)
+ MOVL R13, 144(BP)
+ MOVL R13, 220(BP)
+ MOVL R13, 308(BP)
+ MOVL R13, 360(BP)
+ MOVL R13, 460(BP)
+ MOVL R13, 480(BP)
+ MOVL R13, 536(BP)
+ MOVL R13, 640(BP)
+ MOVL R14, 56(BP)
+ MOVL R14, 128(BP)
+ MOVL R14, 148(BP)
+ MOVL R14, 232(BP)
+ MOVL R14, 324(BP)
+ MOVL R14, 352(BP)
+ MOVL R14, 400(BP)
+ MOVL R14, 472(BP)
+ MOVL R14, 560(BP)
+ MOVL R14, 648(BP)
+ SHRQ $0x20, R14
+ MOVL R14, 72(BP)
+ MOVL R14, 92(BP)
+ MOVL R14, 172(BP)
+ MOVL R14, 216(BP)
+ MOVL R14, 332(BP)
+ MOVL R14, 384(BP)
+ MOVL R14, 424(BP)
+ MOVL R14, 464(BP)
+ MOVL R14, 564(BP)
+ MOVL R14, 636(BP)
+ MOVL R15, 60(BP)
+ MOVL R15, 80(BP)
+ MOVL R15, 192(BP)
+ MOVL R15, 236(BP)
+ MOVL R15, 304(BP)
+ MOVL R15, 392(BP)
+ MOVL R15, 408(BP)
+ MOVL R15, 484(BP)
+ MOVL R15, 532(BP)
+ MOVL R15, 644(BP)
+ SHRQ $0x20, R15
+ MOVL R15, 76(BP)
+ MOVL R15, 104(BP)
+ MOVL R15, 156(BP)
+ MOVL R15, 252(BP)
+ MOVL R15, 300(BP)
+ MOVL R15, 376(BP)
+ MOVL R15, 420(BP)
+ MOVL R15, 500(BP)
+ MOVL R15, 544(BP)
+ MOVL R15, 624(BP)
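+ // Same schedule as the SSE2 variant; the only difference in the rounds
+ // below is that the 16- and 8-bit rotations are single PSHUFBs with the
+ // rol16/rol8 masks (X13/X14) instead of the two-shift-plus-XOR sequence.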
+ PADDL 16(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 32(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL 48(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 64(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ PADDL 80(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 96(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL 112(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 128(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ PADDL 144(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 160(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL 176(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 192(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ PADDL 208(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 224(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL 240(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 256(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ PADDL 272(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 288(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL 304(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 320(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ PADDL 336(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 352(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL 368(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 384(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ PADDL 400(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 416(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL 432(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 448(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ PADDL 464(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 480(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL 496(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 512(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ PADDL 528(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 544(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL 560(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 576(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ PADDL 592(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 608(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL 624(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL 640(BP), X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ PXOR X4, X0
+ PXOR X5, X1
+ PXOR X6, X0
+ PXOR X7, X1
+ LEAQ 64(SI), SI
+ SUBQ $0x40, DX
+ JNE loop
+ MOVO X15, (BP)
+ MOVQ (BP), R9
+ MOVQ R9, (BX)
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
RET
// func hashBlocksSSE4(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
-TEXT ·hashBlocksSSE4(SB), 0, $32-48 // frame = 16 + 16 byte alignment
- HASH_BLOCKS(h+0(FP), c+8(FP), flag+16(FP), blocks_base+24(FP), blocks_len+32(FP), BLAKE2s_SSE4)
+// Requires: SSE2, SSE4.1, SSSE3
+TEXT ·hashBlocksSSE4(SB), $32-48
+ MOVQ h+0(FP), AX
+ MOVQ c+8(FP), BX
+ MOVL flag+16(FP), CX
+ MOVQ blocks_base+24(FP), SI
+ MOVQ blocks_len+32(FP), DX
+ MOVQ SP, BP
+ ADDQ $0x0f, BP
+ ANDQ $-16, BP
+ MOVQ (BX), R9
+ MOVQ R9, (BP)
+ MOVQ CX, 8(BP)
+ MOVOU (AX), X0
+ MOVOU 16(AX), X1
+ MOVOU iv0<>+0(SB), X2
+ MOVOU iv1<>+0(SB), X3
+ MOVOU counter<>+0(SB), X12
+ MOVOU rol16<>+0(SB), X13
+ MOVOU rol8<>+0(SB), X14
+ MOVO (BP), X15
+
+loop:
+ MOVO X0, X4
+ MOVO X1, X5
+ MOVO X2, X6
+ MOVO X3, X7
+ PADDQ X12, X15
+ PXOR X15, X7
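+ // The SSE4.1 variant skips the stack precompute entirely (hence the
+ // 32-byte frame instead of 672): each round gathers its message words
+ // straight from the block with MOVL/PINSRD in sigma order.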
+ MOVL (SI), X8
+ PINSRD $0x01, 8(SI), X8
+ PINSRD $0x02, 16(SI), X8
+ PINSRD $0x03, 24(SI), X8
+ MOVL 4(SI), X9
+ PINSRD $0x01, 12(SI), X9
+ PINSRD $0x02, 20(SI), X9
+ PINSRD $0x03, 28(SI), X9
+ MOVL 32(SI), X10
+ PINSRD $0x01, 40(SI), X10
+ PINSRD $0x02, 48(SI), X10
+ PINSRD $0x03, 56(SI), X10
+ MOVL 36(SI), X11
+ PINSRD $0x01, 44(SI), X11
+ PINSRD $0x02, 52(SI), X11
+ PINSRD $0x03, 60(SI), X11
+ PADDL X8, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL X9, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL X10, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL X11, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ MOVL 56(SI), X8
+ PINSRD $0x01, 16(SI), X8
+ PINSRD $0x02, 36(SI), X8
+ PINSRD $0x03, 52(SI), X8
+ MOVL 40(SI), X9
+ PINSRD $0x01, 32(SI), X9
+ PINSRD $0x02, 60(SI), X9
+ PINSRD $0x03, 24(SI), X9
+ MOVL 4(SI), X10
+ PINSRD $0x01, (SI), X10
+ PINSRD $0x02, 44(SI), X10
+ PINSRD $0x03, 20(SI), X10
+ MOVL 48(SI), X11
+ PINSRD $0x01, 8(SI), X11
+ PINSRD $0x02, 28(SI), X11
+ PINSRD $0x03, 12(SI), X11
+ PADDL X8, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL X9, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL X10, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL X11, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ MOVL 44(SI), X8
+ PINSRD $0x01, 48(SI), X8
+ PINSRD $0x02, 20(SI), X8
+ PINSRD $0x03, 60(SI), X8
+ MOVL 32(SI), X9
+ PINSRD $0x01, (SI), X9
+ PINSRD $0x02, 8(SI), X9
+ PINSRD $0x03, 52(SI), X9
+ MOVL 40(SI), X10
+ PINSRD $0x01, 12(SI), X10
+ PINSRD $0x02, 28(SI), X10
+ PINSRD $0x03, 36(SI), X10
+ MOVL 56(SI), X11
+ PINSRD $0x01, 24(SI), X11
+ PINSRD $0x02, 4(SI), X11
+ PINSRD $0x03, 16(SI), X11
+ PADDL X8, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL X9, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL X10, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL X11, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ MOVL 28(SI), X8
+ PINSRD $0x01, 12(SI), X8
+ PINSRD $0x02, 52(SI), X8
+ PINSRD $0x03, 44(SI), X8
+ MOVL 36(SI), X9
+ PINSRD $0x01, 4(SI), X9
+ PINSRD $0x02, 48(SI), X9
+ PINSRD $0x03, 56(SI), X9
+ MOVL 8(SI), X10
+ PINSRD $0x01, 20(SI), X10
+ PINSRD $0x02, 16(SI), X10
+ PINSRD $0x03, 60(SI), X10
+ MOVL 24(SI), X11
+ PINSRD $0x01, 40(SI), X11
+ PINSRD $0x02, (SI), X11
+ PINSRD $0x03, 32(SI), X11
+ PADDL X8, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL X9, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL X10, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL X11, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ MOVL 36(SI), X8
+ PINSRD $0x01, 20(SI), X8
+ PINSRD $0x02, 8(SI), X8
+ PINSRD $0x03, 40(SI), X8
+ MOVL (SI), X9
+ PINSRD $0x01, 28(SI), X9
+ PINSRD $0x02, 16(SI), X9
+ PINSRD $0x03, 60(SI), X9
+ MOVL 56(SI), X10
+ PINSRD $0x01, 44(SI), X10
+ PINSRD $0x02, 24(SI), X10
+ PINSRD $0x03, 12(SI), X10
+ MOVL 4(SI), X11
+ PINSRD $0x01, 48(SI), X11
+ PINSRD $0x02, 32(SI), X11
+ PINSRD $0x03, 52(SI), X11
+ PADDL X8, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL X9, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL X10, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL X11, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ MOVL 8(SI), X8
+ PINSRD $0x01, 24(SI), X8
+ PINSRD $0x02, (SI), X8
+ PINSRD $0x03, 32(SI), X8
+ MOVL 48(SI), X9
+ PINSRD $0x01, 40(SI), X9
+ PINSRD $0x02, 44(SI), X9
+ PINSRD $0x03, 12(SI), X9
+ MOVL 16(SI), X10
+ PINSRD $0x01, 28(SI), X10
+ PINSRD $0x02, 60(SI), X10
+ PINSRD $0x03, 4(SI), X10
+ MOVL 52(SI), X11
+ PINSRD $0x01, 20(SI), X11
+ PINSRD $0x02, 56(SI), X11
+ PINSRD $0x03, 36(SI), X11
+ PADDL X8, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL X9, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL X10, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL X11, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ MOVL 48(SI), X8
+ PINSRD $0x01, 4(SI), X8
+ PINSRD $0x02, 56(SI), X8
+ PINSRD $0x03, 16(SI), X8
+ MOVL 20(SI), X9
+ PINSRD $0x01, 60(SI), X9
+ PINSRD $0x02, 52(SI), X9
+ PINSRD $0x03, 40(SI), X9
+ MOVL (SI), X10
+ PINSRD $0x01, 24(SI), X10
+ PINSRD $0x02, 36(SI), X10
+ PINSRD $0x03, 32(SI), X10
+ MOVL 28(SI), X11
+ PINSRD $0x01, 12(SI), X11
+ PINSRD $0x02, 8(SI), X11
+ PINSRD $0x03, 44(SI), X11
+ PADDL X8, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL X9, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL X10, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL X11, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ MOVL 52(SI), X8
+ PINSRD $0x01, 28(SI), X8
+ PINSRD $0x02, 48(SI), X8
+ PINSRD $0x03, 12(SI), X8
+ MOVL 44(SI), X9
+ PINSRD $0x01, 56(SI), X9
+ PINSRD $0x02, 4(SI), X9
+ PINSRD $0x03, 36(SI), X9
+ MOVL 20(SI), X10
+ PINSRD $0x01, 60(SI), X10
+ PINSRD $0x02, 32(SI), X10
+ PINSRD $0x03, 8(SI), X10
+ MOVL (SI), X11
+ PINSRD $0x01, 16(SI), X11
+ PINSRD $0x02, 24(SI), X11
+ PINSRD $0x03, 40(SI), X11
+ PADDL X8, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL X9, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL X10, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL X11, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ MOVL 24(SI), X8
+ PINSRD $0x01, 56(SI), X8
+ PINSRD $0x02, 44(SI), X8
+ PINSRD $0x03, (SI), X8
+ MOVL 60(SI), X9
+ PINSRD $0x01, 36(SI), X9
+ PINSRD $0x02, 12(SI), X9
+ PINSRD $0x03, 32(SI), X9
+ MOVL 48(SI), X10
+ PINSRD $0x01, 52(SI), X10
+ PINSRD $0x02, 4(SI), X10
+ PINSRD $0x03, 40(SI), X10
+ MOVL 8(SI), X11
+ PINSRD $0x01, 28(SI), X11
+ PINSRD $0x02, 16(SI), X11
+ PINSRD $0x03, 20(SI), X11
+ PADDL X8, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL X9, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL X10, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL X11, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ MOVL 40(SI), X8
+ PINSRD $0x01, 32(SI), X8
+ PINSRD $0x02, 28(SI), X8
+ PINSRD $0x03, 4(SI), X8
+ MOVL 8(SI), X9
+ PINSRD $0x01, 16(SI), X9
+ PINSRD $0x02, 24(SI), X9
+ PINSRD $0x03, 20(SI), X9
+ MOVL 60(SI), X10
+ PINSRD $0x01, 36(SI), X10
+ PINSRD $0x02, 12(SI), X10
+ PINSRD $0x03, 52(SI), X10
+ MOVL 44(SI), X11
+ PINSRD $0x01, 56(SI), X11
+ PINSRD $0x02, 48(SI), X11
+ PINSRD $0x03, (SI), X11
+ PADDL X8, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL X9, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X5, X5
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X7, X7
+ PADDL X10, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X13, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x14, X8
+ PSRLL $0x0c, X5
+ PXOR X8, X5
+ PADDL X11, X4
+ PADDL X5, X4
+ PXOR X4, X7
+ PSHUFB X14, X7
+ PADDL X7, X6
+ PXOR X6, X5
+ MOVO X5, X8
+ PSLLL $0x19, X8
+ PSRLL $0x07, X5
+ PXOR X8, X5
+ PSHUFL $0x39, X7, X7
+ PSHUFL $0x4e, X6, X6
+ PSHUFL $0x93, X5, X5
+ PXOR X4, X0
+ PXOR X5, X1
+ PXOR X6, X0
+ PXOR X7, X1
+ LEAQ 64(SI), SI
+ SUBQ $0x40, DX
+ JNE loop
+ MOVO X15, (BP)
+ MOVQ (BP), R9
+ MOVQ R9, (BX)
+ MOVOU X0, (AX)
+ MOVOU X1, 16(AX)
RET
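
The unrolled SSE rounds above compute the BLAKE2s compression function four lanes at a time: the 16- and 8-bit rotations use the rol16/rol8 PSHUFB masks loaded into X13/X14, and the 12- and 7-bit rotations use the paired PSLLL/PSRLL shifts. As a readability aid only (not part of the vendored file), here is a scalar sketch of the RFC 7693 mixing function G that each such instruction group implements:

package sketch

import "math/bits"

// g is the scalar BLAKE2s quarter-round. The SIMD code above runs four
// of these in parallel on the 4x4 state, using the PSHUFL row rotations
// to alternate between the column and diagonal steps.
func g(v *[16]uint32, a, b, c, d int, x, y uint32) {
	v[a] += v[b] + x
	v[d] = bits.RotateLeft32(v[d]^v[a], -16) // PSHUFB rol16 mask
	v[c] += v[d]
	v[b] = bits.RotateLeft32(v[b]^v[c], -12) // PSLLL $0x14 / PSRLL $0x0c
	v[a] += v[b] + y
	v[d] = bits.RotateLeft32(v[d]^v[a], -8) // PSHUFB rol8 mask
	v[c] += v[d]
	v[b] = bits.RotateLeft32(v[b]^v[c], -7) // PSLLL $0x19 / PSRLL $0x07
}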
diff --git a/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s b/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s
index e0d3c6475..133757384 100644
--- a/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s
+++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s
@@ -1,108 +1,93 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Code generated by command: go run sum_amd64_asm.go -out ../sum_amd64.s -pkg poly1305. DO NOT EDIT.
//go:build gc && !purego
-#include "textflag.h"
-
-#define POLY1305_ADD(msg, h0, h1, h2) \
- ADDQ 0(msg), h0; \
- ADCQ 8(msg), h1; \
- ADCQ $1, h2; \
- LEAQ 16(msg), msg
-
-#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \
- MOVQ r0, AX; \
- MULQ h0; \
- MOVQ AX, t0; \
- MOVQ DX, t1; \
- MOVQ r0, AX; \
- MULQ h1; \
- ADDQ AX, t1; \
- ADCQ $0, DX; \
- MOVQ r0, t2; \
- IMULQ h2, t2; \
- ADDQ DX, t2; \
- \
- MOVQ r1, AX; \
- MULQ h0; \
- ADDQ AX, t1; \
- ADCQ $0, DX; \
- MOVQ DX, h0; \
- MOVQ r1, t3; \
- IMULQ h2, t3; \
- MOVQ r1, AX; \
- MULQ h1; \
- ADDQ AX, t2; \
- ADCQ DX, t3; \
- ADDQ h0, t2; \
- ADCQ $0, t3; \
- \
- MOVQ t0, h0; \
- MOVQ t1, h1; \
- MOVQ t2, h2; \
- ANDQ $3, h2; \
- MOVQ t2, t0; \
- ANDQ $0xFFFFFFFFFFFFFFFC, t0; \
- ADDQ t0, h0; \
- ADCQ t3, h1; \
- ADCQ $0, h2; \
- SHRQ $2, t3, t2; \
- SHRQ $2, t3; \
- ADDQ t2, h0; \
- ADCQ t3, h1; \
- ADCQ $0, h2
-
-// func update(state *[7]uint64, msg []byte)
+// func update(state *macState, msg []byte)
TEXT ·update(SB), $0-32
MOVQ state+0(FP), DI
MOVQ msg_base+8(FP), SI
MOVQ msg_len+16(FP), R15
-
- MOVQ 0(DI), R8 // h0
- MOVQ 8(DI), R9 // h1
- MOVQ 16(DI), R10 // h2
- MOVQ 24(DI), R11 // r0
- MOVQ 32(DI), R12 // r1
-
- CMPQ R15, $16
+ MOVQ (DI), R8
+ MOVQ 8(DI), R9
+ MOVQ 16(DI), R10
+ MOVQ 24(DI), R11
+ MOVQ 32(DI), R12
+ CMPQ R15, $0x10
JB bytes_between_0_and_15
loop:
- POLY1305_ADD(SI, R8, R9, R10)
+ ADDQ (SI), R8
+ ADCQ 8(SI), R9
+ ADCQ $0x01, R10
+ LEAQ 16(SI), SI
multiply:
- POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14)
- SUBQ $16, R15
- CMPQ R15, $16
- JAE loop
+ MOVQ R11, AX
+ MULQ R8
+ MOVQ AX, BX
+ MOVQ DX, CX
+ MOVQ R11, AX
+ MULQ R9
+ ADDQ AX, CX
+ ADCQ $0x00, DX
+ MOVQ R11, R13
+ IMULQ R10, R13
+ ADDQ DX, R13
+ MOVQ R12, AX
+ MULQ R8
+ ADDQ AX, CX
+ ADCQ $0x00, DX
+ MOVQ DX, R8
+ MOVQ R12, R14
+ IMULQ R10, R14
+ MOVQ R12, AX
+ MULQ R9
+ ADDQ AX, R13
+ ADCQ DX, R14
+ ADDQ R8, R13
+ ADCQ $0x00, R14
+ MOVQ BX, R8
+ MOVQ CX, R9
+ MOVQ R13, R10
+ ANDQ $0x03, R10
+ MOVQ R13, BX
+ ANDQ $-4, BX
+ ADDQ BX, R8
+ ADCQ R14, R9
+ ADCQ $0x00, R10
+ SHRQ $0x02, R14, R13
+ SHRQ $0x02, R14
+ ADDQ R13, R8
+ ADCQ R14, R9
+ ADCQ $0x00, R10
+ SUBQ $0x10, R15
+ CMPQ R15, $0x10
+ JAE loop
bytes_between_0_and_15:
TESTQ R15, R15
JZ done
- MOVQ $1, BX
+ MOVQ $0x00000001, BX
XORQ CX, CX
XORQ R13, R13
ADDQ R15, SI
flush_buffer:
- SHLQ $8, BX, CX
- SHLQ $8, BX
+ SHLQ $0x08, BX, CX
+ SHLQ $0x08, BX
MOVB -1(SI), R13
XORQ R13, BX
DECQ SI
DECQ R15
JNZ flush_buffer
-
ADDQ BX, R8
ADCQ CX, R9
- ADCQ $0, R10
- MOVQ $16, R15
+ ADCQ $0x00, R10
+ MOVQ $0x00000010, R15
JMP multiply
done:
- MOVQ R8, 0(DI)
+ MOVQ R8, (DI)
MOVQ R9, 8(DI)
MOVQ R10, 16(DI)
RET
diff --git a/vendor/golang.org/x/sys/cpu/cpu.go b/vendor/golang.org/x/sys/cpu/cpu.go
index ec07aab05..02609d5b2 100644
--- a/vendor/golang.org/x/sys/cpu/cpu.go
+++ b/vendor/golang.org/x/sys/cpu/cpu.go
@@ -201,6 +201,25 @@ var S390X struct {
_ CacheLinePad
}
+// RISCV64 contains the supported CPU features and performance characteristics for riscv64
+// platforms. The booleans in RISCV64, with the exception of HasFastMisaligned, indicate
+// the presence of RISC-V extensions.
+//
+// It is safe to assume that all the RV64G extensions are supported and so they are omitted from
+// this structure. As riscv64 Go programs require at least RV64G, the code that populates
+// this structure cannot run successfully if some of the RV64G extensions are missing.
+// The struct is padded to avoid false sharing.
+var RISCV64 struct {
+ _ CacheLinePad
+ HasFastMisaligned bool // Fast misaligned accesses
+ HasC bool // Compressed instruction-set extension
+ HasV bool // Vector extension compatible with RVV 1.0
+ HasZba bool // Address generation instructions extension
+ HasZbb bool // Basic bit-manipulation extension
+ HasZbs bool // Single-bit instructions extension
+ _ CacheLinePad
+}
+
func init() {
archInit()
initOptions()
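
A minimal consumer-side sketch of the new flags (illustrative only; the printed strings are placeholders): code selecting an optimized path can branch on cpu.RISCV64 at startup.

package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

func main() {
	// HasV is only set when the kernel reports RVV 1.0 support on
	// every core, so it is safe to use for code-path selection.
	if cpu.RISCV64.HasV && cpu.RISCV64.HasZbb {
		fmt.Println("fast path: vector + bit-manipulation extensions")
	} else {
		fmt.Println("portable scalar path")
	}
}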
diff --git a/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go b/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go
index cd63e7335..7d902b684 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-//go:build linux && !arm && !arm64 && !mips64 && !mips64le && !ppc64 && !ppc64le && !s390x
+//go:build linux && !arm && !arm64 && !mips64 && !mips64le && !ppc64 && !ppc64le && !s390x && !riscv64
package cpu
diff --git a/vendor/golang.org/x/sys/cpu/cpu_linux_riscv64.go b/vendor/golang.org/x/sys/cpu/cpu_linux_riscv64.go
new file mode 100644
index 000000000..cb4a0c572
--- /dev/null
+++ b/vendor/golang.org/x/sys/cpu/cpu_linux_riscv64.go
@@ -0,0 +1,137 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cpu
+
+import (
+ "syscall"
+ "unsafe"
+)
+
+// RISC-V extension discovery code for Linux. The approach here is to first try the riscv_hwprobe
+// syscall, falling back to HWCAP to check for the C extension if riscv_hwprobe is not available.
+//
+// A note on detection of the Vector extension using HWCAP.
+//
+// Support for the Vector extension version 1.0 was added to the Linux kernel in release 6.5.
+// Support for the riscv_hwprobe syscall was added in 6.4. It follows that if the riscv_hwprobe
+// syscall is not available then neither is the Vector extension (which needs kernel support).
+// The riscv_hwprobe syscall should then be all we need to detect the Vector extension.
+// However, some RISC-V board manufacturers ship boards with an older kernel on top of which
+// they have back-ported various versions of the Vector extension patches but not the riscv_hwprobe
+// patches. These kernels advertise support for the Vector extension using HWCAP. Falling
+// back to HWCAP to detect the Vector extension, if riscv_hwprobe is not available, or simply not
+// bothering with riscv_hwprobe at all and just using HWCAP may then seem like an attractive option.
+//
+// Unfortunately, simply checking the 'V' bit in AT_HWCAP will not work as this bit is used by
+// RISC-V board and cloud instance providers to mean different things. The Lichee Pi 4A board
+// and the Scaleway RV1 cloud instances use the 'V' bit to advertise their support for the unratified
+// 0.7.1 version of the Vector Specification. The Banana Pi BPI-F3 and the CanMV-K230 board use
+// it to advertise support for 1.0 of the Vector extension. Versions 0.7.1 and 1.0 of the Vector
+// extension are binary incompatible. HWCAP can then not be used in isolation to populate the
+// HasV field as this field indicates that the underlying CPU is compatible with RVV 1.0.
+//
+// There is a way at runtime to distinguish between versions 0.7.1 and 1.0 of the Vector
+// specification by issuing a RVV 1.0 vsetvli instruction and checking the vill bit of the vtype
+// register. This check would allow us to safely detect version 1.0 of the Vector extension
+// with HWCAP, if riscv_hwprobe were not available. However, the check cannot
+// be added until the assembler supports the Vector instructions.
+//
+// Note the riscv_hwprobe syscall does not suffer from these ambiguities by design as all of the
+// extensions it advertises support for are explicitly versioned. It's also worth noting that
+// the riscv_hwprobe syscall is the only way to detect multi-letter RISC-V extensions, e.g., Zba.
+// These cannot be detected using HWCAP and so riscv_hwprobe must be used to detect the majority
+// of RISC-V extensions.
+//
+// Please see https://docs.kernel.org/arch/riscv/hwprobe.html for more information.
+
+// golang.org/x/sys/cpu is not allowed to depend on golang.org/x/sys/unix so we must
+// reproduce the constants, types and functions needed to make the riscv_hwprobe syscall
+// here.
+
+const (
+ // Copied from golang.org/x/sys/unix/ztypes_linux_riscv64.go.
+ riscv_HWPROBE_KEY_IMA_EXT_0 = 0x4
+ riscv_HWPROBE_IMA_C = 0x2
+ riscv_HWPROBE_IMA_V = 0x4
+ riscv_HWPROBE_EXT_ZBA = 0x8
+ riscv_HWPROBE_EXT_ZBB = 0x10
+ riscv_HWPROBE_EXT_ZBS = 0x20
+ riscv_HWPROBE_KEY_CPUPERF_0 = 0x5
+ riscv_HWPROBE_MISALIGNED_FAST = 0x3
+ riscv_HWPROBE_MISALIGNED_MASK = 0x7
+)
+
+const (
+ // sys_RISCV_HWPROBE is copied from golang.org/x/sys/unix/zsysnum_linux_riscv64.go.
+ sys_RISCV_HWPROBE = 258
+)
+
+// riscvHWProbePairs is copied from golang.org/x/sys/unix/ztypes_linux_riscv64.go.
+type riscvHWProbePairs struct {
+ key int64
+ value uint64
+}
+
+const (
+ // CPU features
+ hwcap_RISCV_ISA_C = 1 << ('C' - 'A')
+)
+
+func doinit() {
+ // A slice of key/value pair structures is passed to the RISCVHWProbe syscall. The key
+ // field should be initialised with one of the key constants defined above, e.g.,
+ // RISCV_HWPROBE_KEY_IMA_EXT_0. The syscall will set the value field to the appropriate value.
+ // If the kernel does not recognise a key it will set the key field to -1 and the value field to 0.
+
+ pairs := []riscvHWProbePairs{
+ {riscv_HWPROBE_KEY_IMA_EXT_0, 0},
+ {riscv_HWPROBE_KEY_CPUPERF_0, 0},
+ }
+
+ // This call only indicates that extensions are supported if they are implemented on all cores.
+ if riscvHWProbe(pairs, 0) {
+ if pairs[0].key != -1 {
+ v := uint(pairs[0].value)
+ RISCV64.HasC = isSet(v, riscv_HWPROBE_IMA_C)
+ RISCV64.HasV = isSet(v, riscv_HWPROBE_IMA_V)
+ RISCV64.HasZba = isSet(v, riscv_HWPROBE_EXT_ZBA)
+ RISCV64.HasZbb = isSet(v, riscv_HWPROBE_EXT_ZBB)
+ RISCV64.HasZbs = isSet(v, riscv_HWPROBE_EXT_ZBS)
+ }
+ if pairs[1].key != -1 {
+ v := pairs[1].value & riscv_HWPROBE_MISALIGNED_MASK
+ RISCV64.HasFastMisaligned = v == riscv_HWPROBE_MISALIGNED_FAST
+ }
+ }
+
+ // Let's double check with HWCAP if the C extension does not appear to be supported.
+ // This may happen if we're running on a kernel older than 6.4.
+
+ if !RISCV64.HasC {
+ RISCV64.HasC = isSet(hwCap, hwcap_RISCV_ISA_C)
+ }
+}
+
+func isSet(hwc uint, value uint) bool {
+ return hwc&value != 0
+}
+
+// riscvHWProbe is a simplified version of the generated wrapper function found in
+// golang.org/x/sys/unix/zsyscall_linux_riscv64.go. We simplify it by removing the
+// cpuCount and cpus parameters which we do not need. We always want to pass 0 for
+// these parameters here so the kernel only reports the extensions that are present
+// on all cores.
+func riscvHWProbe(pairs []riscvHWProbePairs, flags uint) bool {
+ var _zero uintptr
+ var p0 unsafe.Pointer
+ if len(pairs) > 0 {
+ p0 = unsafe.Pointer(&pairs[0])
+ } else {
+ p0 = unsafe.Pointer(&_zero)
+ }
+
+ _, _, e1 := syscall.Syscall6(sys_RISCV_HWPROBE, uintptr(p0), uintptr(len(pairs)), uintptr(0), uintptr(0), uintptr(flags), 0)
+ return e1 == 0
+}
diff --git a/vendor/golang.org/x/sys/cpu/cpu_riscv64.go b/vendor/golang.org/x/sys/cpu/cpu_riscv64.go
index 7f0c79c00..aca3199c9 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_riscv64.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_riscv64.go
@@ -8,4 +8,13 @@ package cpu
const cacheLineSize = 64
-func initOptions() {}
+func initOptions() {
+ options = []option{
+ {Name: "fastmisaligned", Feature: &RISCV64.HasFastMisaligned},
+ {Name: "c", Feature: &RISCV64.HasC},
+ {Name: "v", Feature: &RISCV64.HasV},
+ {Name: "zba", Feature: &RISCV64.HasZba},
+ {Name: "zbb", Feature: &RISCV64.HasZbb},
+ {Name: "zbs", Feature: &RISCV64.HasZbs},
+ }
+}
diff --git a/vendor/golang.org/x/sys/unix/mkerrors.sh b/vendor/golang.org/x/sys/unix/mkerrors.sh
index d07dd09eb..e14b766a3 100644
--- a/vendor/golang.org/x/sys/unix/mkerrors.sh
+++ b/vendor/golang.org/x/sys/unix/mkerrors.sh
@@ -552,6 +552,7 @@ ccflags="$@"
$2 !~ /^RTC_VL_(ACCURACY|BACKUP|DATA)/ &&
$2 ~ /^(NETLINK|NLM|NLMSG|NLA|IFA|IFAN|RT|RTC|RTCF|RTN|RTPROT|RTNH|ARPHRD|ETH_P|NETNSA)_/ ||
$2 ~ /^SOCK_|SK_DIAG_|SKNLGRP_$/ ||
+ $2 ~ /^(CONNECT|SAE)_/ ||
$2 ~ /^FIORDCHK$/ ||
$2 ~ /^SIOC/ ||
$2 ~ /^TIOC/ ||
diff --git a/vendor/golang.org/x/sys/unix/syscall_darwin.go b/vendor/golang.org/x/sys/unix/syscall_darwin.go
index 2d15200ad..099867dee 100644
--- a/vendor/golang.org/x/sys/unix/syscall_darwin.go
+++ b/vendor/golang.org/x/sys/unix/syscall_darwin.go
@@ -566,6 +566,43 @@ func PthreadFchdir(fd int) (err error) {
return pthread_fchdir_np(fd)
}
+// Connectx calls connectx(2) to initiate a connection on a socket.
+//
+// srcIf, srcAddr, and dstAddr are filled into a [SaEndpoints] struct and passed as the endpoints argument.
+//
+// - srcIf is the optional source interface index. 0 means unspecified.
+// - srcAddr is the optional source address. nil means unspecified.
+// - dstAddr is the destination address.
+//
+// On success, Connectx returns the number of bytes enqueued for transmission.
+func Connectx(fd int, srcIf uint32, srcAddr, dstAddr Sockaddr, associd SaeAssocID, flags uint32, iov []Iovec, connid *SaeConnID) (n uintptr, err error) {
+ endpoints := SaEndpoints{
+ Srcif: srcIf,
+ }
+
+ if srcAddr != nil {
+ addrp, addrlen, err := srcAddr.sockaddr()
+ if err != nil {
+ return 0, err
+ }
+ endpoints.Srcaddr = (*RawSockaddr)(addrp)
+ endpoints.Srcaddrlen = uint32(addrlen)
+ }
+
+ if dstAddr != nil {
+ addrp, addrlen, err := dstAddr.sockaddr()
+ if err != nil {
+ return 0, err
+ }
+ endpoints.Dstaddr = (*RawSockaddr)(addrp)
+ endpoints.Dstaddrlen = uint32(addrlen)
+ }
+
+ err = connectx(fd, &endpoints, associd, flags, iov, &n, connid)
+ return
+}
+
+//sys connectx(fd int, endpoints *SaEndpoints, associd SaeAssocID, flags uint32, iov []Iovec, n *uintptr, connid *SaeConnID) (err error)
//sys sendfile(infd int, outfd int, offset int64, len *int64, hdtr unsafe.Pointer, flags int) (err error)
//sys shmat(id int, addr uintptr, flag int) (ret uintptr, err error)
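
A hedged usage sketch of the new wrapper (darwin-only; the destination address is a placeholder from the TEST-NET range): enqueueing idempotent data with the connection attempt, in the style of TCP Fast Open.

package main

import "golang.org/x/sys/unix"

// dialWithData is a sketch only: it initiates a connection and asks the
// kernel to enqueue payload alongside it. payload must be non-empty
// here, and error handling is minimal.
func dialWithData(payload []byte) (int, error) {
	fd, err := unix.Socket(unix.AF_INET, unix.SOCK_STREAM, 0)
	if err != nil {
		return 0, err
	}
	defer unix.Close(fd)

	dst := &unix.SockaddrInet4{Port: 443, Addr: [4]byte{192, 0, 2, 1}}
	iov := []unix.Iovec{{Base: &payload[0], Len: uint64(len(payload))}}

	// No source interface or address; any association; nil connid.
	n, err := unix.Connectx(fd, 0, nil, dst,
		unix.SAE_ASSOCID_ANY, unix.CONNECT_DATA_IDEMPOTENT, iov, nil)
	return int(n), err
}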
diff --git a/vendor/golang.org/x/sys/unix/syscall_hurd.go b/vendor/golang.org/x/sys/unix/syscall_hurd.go
index ba46651f8..a6a2d2fc2 100644
--- a/vendor/golang.org/x/sys/unix/syscall_hurd.go
+++ b/vendor/golang.org/x/sys/unix/syscall_hurd.go
@@ -11,6 +11,7 @@ package unix
int ioctl(int, unsigned long int, uintptr_t);
*/
import "C"
+import "unsafe"
func ioctl(fd int, req uint, arg uintptr) (err error) {
r0, er := C.ioctl(C.int(fd), C.ulong(req), C.uintptr_t(arg))
diff --git a/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go
index 4308ac177..d73c4652e 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go
@@ -237,6 +237,9 @@ const (
CLOCK_UPTIME_RAW_APPROX = 0x9
CLONE_NOFOLLOW = 0x1
CLONE_NOOWNERCOPY = 0x2
+ CONNECT_DATA_AUTHENTICATED = 0x4
+ CONNECT_DATA_IDEMPOTENT = 0x2
+ CONNECT_RESUME_ON_READ_WRITE = 0x1
CR0 = 0x0
CR1 = 0x1000
CR2 = 0x2000
@@ -1265,6 +1268,10 @@ const (
RTV_SSTHRESH = 0x20
RUSAGE_CHILDREN = -0x1
RUSAGE_SELF = 0x0
+ SAE_ASSOCID_ALL = 0xffffffff
+ SAE_ASSOCID_ANY = 0x0
+ SAE_CONNID_ALL = 0xffffffff
+ SAE_CONNID_ANY = 0x0
SCM_CREDS = 0x3
SCM_RIGHTS = 0x1
SCM_TIMESTAMP = 0x2
diff --git a/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go
index c8068a7a1..4a55a4005 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go
@@ -237,6 +237,9 @@ const (
CLOCK_UPTIME_RAW_APPROX = 0x9
CLONE_NOFOLLOW = 0x1
CLONE_NOOWNERCOPY = 0x2
+ CONNECT_DATA_AUTHENTICATED = 0x4
+ CONNECT_DATA_IDEMPOTENT = 0x2
+ CONNECT_RESUME_ON_READ_WRITE = 0x1
CR0 = 0x0
CR1 = 0x1000
CR2 = 0x2000
@@ -1265,6 +1268,10 @@ const (
RTV_SSTHRESH = 0x20
RUSAGE_CHILDREN = -0x1
RUSAGE_SELF = 0x0
+ SAE_ASSOCID_ALL = 0xffffffff
+ SAE_ASSOCID_ANY = 0x0
+ SAE_CONNID_ALL = 0xffffffff
+ SAE_CONNID_ANY = 0x0
SCM_CREDS = 0x3
SCM_RIGHTS = 0x1
SCM_TIMESTAMP = 0x2
diff --git a/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go b/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go
index da08b2ab3..1ec2b1407 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go
@@ -581,6 +581,8 @@ const (
AT_EMPTY_PATH = 0x1000
AT_REMOVEDIR = 0x200
RENAME_NOREPLACE = 1 << 0
+ ST_RDONLY = 1
+ ST_NOSUID = 2
)
const (
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go
index b622533ef..24b346e1a 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go
@@ -841,6 +841,26 @@ var libc_pthread_fchdir_np_trampoline_addr uintptr
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+func connectx(fd int, endpoints *SaEndpoints, associd SaeAssocID, flags uint32, iov []Iovec, n *uintptr, connid *SaeConnID) (err error) {
+ var _p0 unsafe.Pointer
+ if len(iov) > 0 {
+ _p0 = unsafe.Pointer(&iov[0])
+ } else {
+ _p0 = unsafe.Pointer(&_zero)
+ }
+ _, _, e1 := syscall_syscall9(libc_connectx_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(endpoints)), uintptr(associd), uintptr(flags), uintptr(_p0), uintptr(len(iov)), uintptr(unsafe.Pointer(n)), uintptr(unsafe.Pointer(connid)), 0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+var libc_connectx_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_connectx connectx "/usr/lib/libSystem.B.dylib"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
func sendfile(infd int, outfd int, offset int64, len *int64, hdtr unsafe.Pointer, flags int) (err error) {
_, _, e1 := syscall_syscall6(libc_sendfile_trampoline_addr, uintptr(infd), uintptr(outfd), uintptr(offset), uintptr(unsafe.Pointer(len)), uintptr(hdtr), uintptr(flags))
if e1 != 0 {
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s
index cfe6646ba..ebd213100 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s
@@ -248,6 +248,11 @@ TEXT libc_pthread_fchdir_np_trampoline<>(SB),NOSPLIT,$0-0
GLOBL ·libc_pthread_fchdir_np_trampoline_addr(SB), RODATA, $8
DATA ·libc_pthread_fchdir_np_trampoline_addr(SB)/8, $libc_pthread_fchdir_np_trampoline<>(SB)
+TEXT libc_connectx_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_connectx(SB)
+GLOBL ·libc_connectx_trampoline_addr(SB), RODATA, $8
+DATA ·libc_connectx_trampoline_addr(SB)/8, $libc_connectx_trampoline<>(SB)
+
TEXT libc_sendfile_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_sendfile(SB)
GLOBL ·libc_sendfile_trampoline_addr(SB), RODATA, $8
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go
index 13f624f69..824b9c2d5 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go
@@ -841,6 +841,26 @@ var libc_pthread_fchdir_np_trampoline_addr uintptr
// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+func connectx(fd int, endpoints *SaEndpoints, associd SaeAssocID, flags uint32, iov []Iovec, n *uintptr, connid *SaeConnID) (err error) {
+ var _p0 unsafe.Pointer
+ if len(iov) > 0 {
+ _p0 = unsafe.Pointer(&iov[0])
+ } else {
+ _p0 = unsafe.Pointer(&_zero)
+ }
+ _, _, e1 := syscall_syscall9(libc_connectx_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(endpoints)), uintptr(associd), uintptr(flags), uintptr(_p0), uintptr(len(iov)), uintptr(unsafe.Pointer(n)), uintptr(unsafe.Pointer(connid)), 0)
+ if e1 != 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
+var libc_connectx_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_connectx connectx "/usr/lib/libSystem.B.dylib"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
func sendfile(infd int, outfd int, offset int64, len *int64, hdtr unsafe.Pointer, flags int) (err error) {
_, _, e1 := syscall_syscall6(libc_sendfile_trampoline_addr, uintptr(infd), uintptr(outfd), uintptr(offset), uintptr(unsafe.Pointer(len)), uintptr(hdtr), uintptr(flags))
if e1 != 0 {
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s
index fe222b75d..4f178a229 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s
@@ -248,6 +248,11 @@ TEXT libc_pthread_fchdir_np_trampoline<>(SB),NOSPLIT,$0-0
GLOBL ·libc_pthread_fchdir_np_trampoline_addr(SB), RODATA, $8
DATA ·libc_pthread_fchdir_np_trampoline_addr(SB)/8, $libc_pthread_fchdir_np_trampoline<>(SB)
+TEXT libc_connectx_trampoline<>(SB),NOSPLIT,$0-0
+ JMP libc_connectx(SB)
+GLOBL ·libc_connectx_trampoline_addr(SB), RODATA, $8
+DATA ·libc_connectx_trampoline_addr(SB)/8, $libc_connectx_trampoline<>(SB)
+
TEXT libc_sendfile_trampoline<>(SB),NOSPLIT,$0-0
JMP libc_sendfile(SB)
GLOBL ·libc_sendfile_trampoline_addr(SB), RODATA, $8
diff --git a/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go
index 091d107f3..d003c3d43 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go
@@ -306,6 +306,19 @@ type XVSockPgen struct {
type _Socklen uint32
+type SaeAssocID uint32
+
+type SaeConnID uint32
+
+type SaEndpoints struct {
+ Srcif uint32
+ Srcaddr *RawSockaddr
+ Srcaddrlen uint32
+ Dstaddr *RawSockaddr
+ Dstaddrlen uint32
+ _ [4]byte
+}
+
type Xucred struct {
Version uint32
Uid uint32
diff --git a/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go
index 28ff4ef74..0d45a941a 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go
@@ -306,6 +306,19 @@ type XVSockPgen struct {
type _Socklen uint32
+type SaeAssocID uint32
+
+type SaeConnID uint32
+
+type SaEndpoints struct {
+ Srcif uint32
+ Srcaddr *RawSockaddr
+ Srcaddrlen uint32
+ Dstaddr *RawSockaddr
+ Dstaddrlen uint32
+ _ [4]byte
+}
+
type Xucred struct {
Version uint32
Uid uint32
diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go
index 6cbd094a3..51e13eb05 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go
@@ -625,6 +625,7 @@ const (
POLLRDNORM = 0x40
POLLWRBAND = 0x100
POLLWRNORM = 0x4
+ POLLRDHUP = 0x4000
)
type CapRights struct {
diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go
index 7c03b6ee7..d002d8ef3 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go
@@ -630,6 +630,7 @@ const (
POLLRDNORM = 0x40
POLLWRBAND = 0x100
POLLWRNORM = 0x4
+ POLLRDHUP = 0x4000
)
type CapRights struct {
diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go
index 422107ee8..3f863d898 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go
@@ -616,6 +616,7 @@ const (
POLLRDNORM = 0x40
POLLWRBAND = 0x100
POLLWRNORM = 0x4
+ POLLRDHUP = 0x4000
)
type CapRights struct {
diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go
index 505a12acf..61c729310 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go
@@ -610,6 +610,7 @@ const (
POLLRDNORM = 0x40
POLLWRBAND = 0x100
POLLWRNORM = 0x4
+ POLLRDHUP = 0x4000
)
type CapRights struct {
diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go
index cc986c790..b5d17414f 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go
@@ -612,6 +612,7 @@ const (
POLLRDNORM = 0x40
POLLWRBAND = 0x100
POLLWRNORM = 0x4
+ POLLRDHUP = 0x4000
)
type CapRights struct {
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux.go b/vendor/golang.org/x/sys/unix/ztypes_linux.go
index 7f1961b90..9f2550dc3 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_linux.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux.go
@@ -2486,7 +2486,7 @@ type XDPMmapOffsets struct {
type XDPUmemReg struct {
Addr uint64
Len uint64
- Chunk_size uint32
+ Size uint32
Headroom uint32
Flags uint32
Tx_metadata_len uint32
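
Note that renaming Chunk_size to Size is a source-breaking change for AF_XDP callers constructing this struct. A hypothetical sketch of the updated usage (the 2048-byte chunk size is a placeholder):

package main

import "golang.org/x/sys/unix"

// newUmemReg shows the renamed field; addr and length would come from
// the caller's mmap'd UMEM area.
func newUmemReg(addr, length uint64) unix.XDPUmemReg {
	return unix.XDPUmemReg{
		Addr:     addr,
		Len:      length,
		Size:     2048, // chunk size, previously named Chunk_size
		Headroom: 0,
	}
}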
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go
index 15adc0414..ad05b51a6 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go
@@ -727,6 +727,37 @@ const (
RISCV_HWPROBE_EXT_ZBA = 0x8
RISCV_HWPROBE_EXT_ZBB = 0x10
RISCV_HWPROBE_EXT_ZBS = 0x20
+ RISCV_HWPROBE_EXT_ZICBOZ = 0x40
+ RISCV_HWPROBE_EXT_ZBC = 0x80
+ RISCV_HWPROBE_EXT_ZBKB = 0x100
+ RISCV_HWPROBE_EXT_ZBKC = 0x200
+ RISCV_HWPROBE_EXT_ZBKX = 0x400
+ RISCV_HWPROBE_EXT_ZKND = 0x800
+ RISCV_HWPROBE_EXT_ZKNE = 0x1000
+ RISCV_HWPROBE_EXT_ZKNH = 0x2000
+ RISCV_HWPROBE_EXT_ZKSED = 0x4000
+ RISCV_HWPROBE_EXT_ZKSH = 0x8000
+ RISCV_HWPROBE_EXT_ZKT = 0x10000
+ RISCV_HWPROBE_EXT_ZVBB = 0x20000
+ RISCV_HWPROBE_EXT_ZVBC = 0x40000
+ RISCV_HWPROBE_EXT_ZVKB = 0x80000
+ RISCV_HWPROBE_EXT_ZVKG = 0x100000
+ RISCV_HWPROBE_EXT_ZVKNED = 0x200000
+ RISCV_HWPROBE_EXT_ZVKNHA = 0x400000
+ RISCV_HWPROBE_EXT_ZVKNHB = 0x800000
+ RISCV_HWPROBE_EXT_ZVKSED = 0x1000000
+ RISCV_HWPROBE_EXT_ZVKSH = 0x2000000
+ RISCV_HWPROBE_EXT_ZVKT = 0x4000000
+ RISCV_HWPROBE_EXT_ZFH = 0x8000000
+ RISCV_HWPROBE_EXT_ZFHMIN = 0x10000000
+ RISCV_HWPROBE_EXT_ZIHINTNTL = 0x20000000
+ RISCV_HWPROBE_EXT_ZVFH = 0x40000000
+ RISCV_HWPROBE_EXT_ZVFHMIN = 0x80000000
+ RISCV_HWPROBE_EXT_ZFA = 0x100000000
+ RISCV_HWPROBE_EXT_ZTSO = 0x200000000
+ RISCV_HWPROBE_EXT_ZACAS = 0x400000000
+ RISCV_HWPROBE_EXT_ZICOND = 0x800000000
+ RISCV_HWPROBE_EXT_ZIHINTPAUSE = 0x1000000000
RISCV_HWPROBE_KEY_CPUPERF_0 = 0x5
RISCV_HWPROBE_MISALIGNED_UNKNOWN = 0x0
RISCV_HWPROBE_MISALIGNED_EMULATED = 0x1
@@ -734,4 +765,6 @@ const (
RISCV_HWPROBE_MISALIGNED_FAST = 0x3
RISCV_HWPROBE_MISALIGNED_UNSUPPORTED = 0x4
RISCV_HWPROBE_MISALIGNED_MASK = 0x7
+ RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE = 0x6
+ RISCV_HWPROBE_WHICH_CPUS = 0x1
)
diff --git a/vendor/golang.org/x/sys/windows/syscall_windows.go b/vendor/golang.org/x/sys/windows/syscall_windows.go
index 1fa34fd17..5cee9a314 100644
--- a/vendor/golang.org/x/sys/windows/syscall_windows.go
+++ b/vendor/golang.org/x/sys/windows/syscall_windows.go
@@ -313,6 +313,10 @@ func NewCallbackCDecl(fn interface{}) uintptr {
//sys SetConsoleMode(console Handle, mode uint32) (err error) = kernel32.SetConsoleMode
//sys GetConsoleScreenBufferInfo(console Handle, info *ConsoleScreenBufferInfo) (err error) = kernel32.GetConsoleScreenBufferInfo
//sys setConsoleCursorPosition(console Handle, position uint32) (err error) = kernel32.SetConsoleCursorPosition
+//sys GetConsoleCP() (cp uint32, err error) = kernel32.GetConsoleCP
+//sys GetConsoleOutputCP() (cp uint32, err error) = kernel32.GetConsoleOutputCP
+//sys SetConsoleCP(cp uint32) (err error) = kernel32.SetConsoleCP
+//sys SetConsoleOutputCP(cp uint32) (err error) = kernel32.SetConsoleOutputCP
//sys WriteConsole(console Handle, buf *uint16, towrite uint32, written *uint32, reserved *byte) (err error) = kernel32.WriteConsoleW
//sys ReadConsole(console Handle, buf *uint16, toread uint32, read *uint32, inputControl *byte) (err error) = kernel32.ReadConsoleW
//sys resizePseudoConsole(pconsole Handle, size uint32) (hr error) = kernel32.ResizePseudoConsole
diff --git a/vendor/golang.org/x/sys/windows/types_windows.go b/vendor/golang.org/x/sys/windows/types_windows.go
index 3f03b3d57..7b97a154c 100644
--- a/vendor/golang.org/x/sys/windows/types_windows.go
+++ b/vendor/golang.org/x/sys/windows/types_windows.go
@@ -1060,6 +1060,7 @@ const (
SIO_GET_EXTENSION_FUNCTION_POINTER = IOC_INOUT | IOC_WS2 | 6
SIO_KEEPALIVE_VALS = IOC_IN | IOC_VENDOR | 4
SIO_UDP_CONNRESET = IOC_IN | IOC_VENDOR | 12
+ SIO_UDP_NETRESET = IOC_IN | IOC_VENDOR | 15
// cf. http://support.microsoft.com/default.aspx?scid=kb;en-us;257460
diff --git a/vendor/golang.org/x/sys/windows/zsyscall_windows.go b/vendor/golang.org/x/sys/windows/zsyscall_windows.go
index 9bb979a3e..4c2e1bdc0 100644
--- a/vendor/golang.org/x/sys/windows/zsyscall_windows.go
+++ b/vendor/golang.org/x/sys/windows/zsyscall_windows.go
@@ -247,7 +247,9 @@ var (
procGetCommandLineW = modkernel32.NewProc("GetCommandLineW")
procGetComputerNameExW = modkernel32.NewProc("GetComputerNameExW")
procGetComputerNameW = modkernel32.NewProc("GetComputerNameW")
+ procGetConsoleCP = modkernel32.NewProc("GetConsoleCP")
procGetConsoleMode = modkernel32.NewProc("GetConsoleMode")
+ procGetConsoleOutputCP = modkernel32.NewProc("GetConsoleOutputCP")
procGetConsoleScreenBufferInfo = modkernel32.NewProc("GetConsoleScreenBufferInfo")
procGetCurrentDirectoryW = modkernel32.NewProc("GetCurrentDirectoryW")
procGetCurrentProcessId = modkernel32.NewProc("GetCurrentProcessId")
@@ -347,8 +349,10 @@ var (
procSetCommMask = modkernel32.NewProc("SetCommMask")
procSetCommState = modkernel32.NewProc("SetCommState")
procSetCommTimeouts = modkernel32.NewProc("SetCommTimeouts")
+ procSetConsoleCP = modkernel32.NewProc("SetConsoleCP")
procSetConsoleCursorPosition = modkernel32.NewProc("SetConsoleCursorPosition")
procSetConsoleMode = modkernel32.NewProc("SetConsoleMode")
+ procSetConsoleOutputCP = modkernel32.NewProc("SetConsoleOutputCP")
procSetCurrentDirectoryW = modkernel32.NewProc("SetCurrentDirectoryW")
procSetDefaultDllDirectories = modkernel32.NewProc("SetDefaultDllDirectories")
procSetDllDirectoryW = modkernel32.NewProc("SetDllDirectoryW")
@@ -2162,6 +2166,15 @@ func GetComputerName(buf *uint16, n *uint32) (err error) {
return
}
+func GetConsoleCP() (cp uint32, err error) {
+ r0, _, e1 := syscall.Syscall(procGetConsoleCP.Addr(), 0, 0, 0, 0)
+ cp = uint32(r0)
+ if cp == 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
func GetConsoleMode(console Handle, mode *uint32) (err error) {
r1, _, e1 := syscall.Syscall(procGetConsoleMode.Addr(), 2, uintptr(console), uintptr(unsafe.Pointer(mode)), 0)
if r1 == 0 {
@@ -2170,6 +2183,15 @@ func GetConsoleMode(console Handle, mode *uint32) (err error) {
return
}
+func GetConsoleOutputCP() (cp uint32, err error) {
+ r0, _, e1 := syscall.Syscall(procGetConsoleOutputCP.Addr(), 0, 0, 0, 0)
+ cp = uint32(r0)
+ if cp == 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
func GetConsoleScreenBufferInfo(console Handle, info *ConsoleScreenBufferInfo) (err error) {
r1, _, e1 := syscall.Syscall(procGetConsoleScreenBufferInfo.Addr(), 2, uintptr(console), uintptr(unsafe.Pointer(info)), 0)
if r1 == 0 {
@@ -3038,6 +3060,14 @@ func SetCommTimeouts(handle Handle, timeouts *CommTimeouts) (err error) {
return
}
+func SetConsoleCP(cp uint32) (err error) {
+ r1, _, e1 := syscall.Syscall(procSetConsoleCP.Addr(), 1, uintptr(cp), 0, 0)
+ if r1 == 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
func setConsoleCursorPosition(console Handle, position uint32) (err error) {
r1, _, e1 := syscall.Syscall(procSetConsoleCursorPosition.Addr(), 2, uintptr(console), uintptr(position), 0)
if r1 == 0 {
@@ -3054,6 +3084,14 @@ func SetConsoleMode(console Handle, mode uint32) (err error) {
return
}
+func SetConsoleOutputCP(cp uint32) (err error) {
+ r1, _, e1 := syscall.Syscall(procSetConsoleOutputCP.Addr(), 1, uintptr(cp), 0, 0)
+ if r1 == 0 {
+ err = errnoErr(e1)
+ }
+ return
+}
+
func SetCurrentDirectory(path *uint16) (err error) {
r1, _, e1 := syscall.Syscall(procSetCurrentDirectoryW.Addr(), 1, uintptr(unsafe.Pointer(path)), 0, 0)
if r1 == 0 {
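
A short sketch of how the new console code-page wrappers compose (windows-only; 65001 is the UTF-8 code page):

package main

import "golang.org/x/sys/windows"

// withUTF8Output switches the console output code page to UTF-8 for the
// duration of f, then restores the previous code page.
func withUTF8Output(f func()) error {
	old, err := windows.GetConsoleOutputCP()
	if err != nil {
		return err
	}
	if err := windows.SetConsoleOutputCP(65001); err != nil { // CP_UTF8
		return err
	}
	defer windows.SetConsoleOutputCP(old)
	f()
	return nil
}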
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 3ac4c6056..c3a88fe55 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -1063,7 +1063,7 @@ go.uber.org/multierr
# golang.org/x/arch v0.8.0
## explicit; go 1.18
golang.org/x/arch/x86/x86asm
-# golang.org/x/crypto v0.26.0
+# golang.org/x/crypto v0.27.0
## explicit; go 1.20
golang.org/x/crypto/acme
golang.org/x/crypto/acme/autocert
@@ -1129,7 +1129,7 @@ golang.org/x/oauth2/internal
## explicit; go 1.18
golang.org/x/sync/errgroup
golang.org/x/sync/semaphore
-# golang.org/x/sys v0.24.0
+# golang.org/x/sys v0.25.0
## explicit; go 1.18
golang.org/x/sys/cpu
golang.org/x/sys/unix