Diffstat (limited to 'vendor/github.com')
-rw-r--r--  vendor/github.com/zeebo/blake3/.gitignore |    6
-rw-r--r--  vendor/github.com/zeebo/blake3/LICENSE |  125
-rw-r--r--  vendor/github.com/zeebo/blake3/Makefile |   11
-rw-r--r--  vendor/github.com/zeebo/blake3/README.md |   77
-rw-r--r--  vendor/github.com/zeebo/blake3/api.go |  166
-rw-r--r--  vendor/github.com/zeebo/blake3/blake3.go |  285
-rw-r--r--  vendor/github.com/zeebo/blake3/digest.go |  100
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/alg.go |   18
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/compress/compress.go |   15
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/compress/compress_pure/compress.go |  135
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_amd64.s |  560
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_other.go |    9
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/stubs.go |    6
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/hash/hash.go |   23
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_amd64.s | 2561
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_other.go |   13
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/stubs.go |    9
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashf.go |   56
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashp.go |   38
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/consts/consts.go |   29
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/consts/cpu.go |   17
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/consts/cpu_big.go |    5
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/consts/cpu_little.go |    5
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/consts/cpu_other.go |    7
-rw-r--r--  vendor/github.com/zeebo/blake3/internal/utils/utils.go |   60
25 files changed, 4336 insertions(+), 0 deletions(-)
diff --git a/vendor/github.com/zeebo/blake3/.gitignore b/vendor/github.com/zeebo/blake3/.gitignore
new file mode 100644
index 000000000..c6bfdf2c3
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/.gitignore
@@ -0,0 +1,6 @@
+*.pprof
+*.test
+*.txt
+*.out
+
+/upstream
diff --git a/vendor/github.com/zeebo/blake3/LICENSE b/vendor/github.com/zeebo/blake3/LICENSE
new file mode 100644
index 000000000..3a63575d3
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/LICENSE
@@ -0,0 +1,125 @@
+This work is released into the public domain with CC0 1.0.
+
+-------------------------------------------------------------------------------
+
+Creative Commons Legal Code
+
+CC0 1.0 Universal
+
+ CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
+ LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
+ ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
+ INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
+ REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
+ PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
+ THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
+ HEREUNDER.
+
+Statement of Purpose
+
+The laws of most jurisdictions throughout the world automatically confer
+exclusive Copyright and Related Rights (defined below) upon the creator
+and subsequent owner(s) (each and all, an "owner") of an original work of
+authorship and/or a database (each, a "Work").
+
+Certain owners wish to permanently relinquish those rights to a Work for
+the purpose of contributing to a commons of creative, cultural and
+scientific works ("Commons") that the public can reliably and without fear
+of later claims of infringement build upon, modify, incorporate in other
+works, reuse and redistribute as freely as possible in any form whatsoever
+and for any purposes, including without limitation commercial purposes.
+These owners may contribute to the Commons to promote the ideal of a free
+culture and the further production of creative, cultural and scientific
+works, or to gain reputation or greater distribution for their Work in
+part through the use and efforts of others.
+
+For these and/or other purposes and motivations, and without any
+expectation of additional consideration or compensation, the person
+associating CC0 with a Work (the "Affirmer"), to the extent that he or she
+is an owner of Copyright and Related Rights in the Work, voluntarily
+elects to apply CC0 to the Work and publicly distribute the Work under its
+terms, with knowledge of his or her Copyright and Related Rights in the
+Work and the meaning and intended legal effect of CC0 on those rights.
+
+1. Copyright and Related Rights. A Work made available under CC0 may be
+protected by copyright and related or neighboring rights ("Copyright and
+Related Rights"). Copyright and Related Rights include, but are not
+limited to, the following:
+
+ i. the right to reproduce, adapt, distribute, perform, display,
+ communicate, and translate a Work;
+ ii. moral rights retained by the original author(s) and/or performer(s);
+iii. publicity and privacy rights pertaining to a person's image or
+ likeness depicted in a Work;
+ iv. rights protecting against unfair competition in regards to a Work,
+ subject to the limitations in paragraph 4(a), below;
+ v. rights protecting the extraction, dissemination, use and reuse of data
+ in a Work;
+ vi. database rights (such as those arising under Directive 96/9/EC of the
+ European Parliament and of the Council of 11 March 1996 on the legal
+ protection of databases, and under any national implementation
+ thereof, including any amended or successor version of such
+ directive); and
+vii. other similar, equivalent or corresponding rights throughout the
+ world based on applicable law or treaty, and any national
+ implementations thereof.
+
+2. Waiver. To the greatest extent permitted by, but not in contravention
+of, applicable law, Affirmer hereby overtly, fully, permanently,
+irrevocably and unconditionally waives, abandons, and surrenders all of
+Affirmer's Copyright and Related Rights and associated claims and causes
+of action, whether now known or unknown (including existing as well as
+future claims and causes of action), in the Work (i) in all territories
+worldwide, (ii) for the maximum duration provided by applicable law or
+treaty (including future time extensions), (iii) in any current or future
+medium and for any number of copies, and (iv) for any purpose whatsoever,
+including without limitation commercial, advertising or promotional
+purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
+member of the public at large and to the detriment of Affirmer's heirs and
+successors, fully intending that such Waiver shall not be subject to
+revocation, rescission, cancellation, termination, or any other legal or
+equitable action to disrupt the quiet enjoyment of the Work by the public
+as contemplated by Affirmer's express Statement of Purpose.
+
+3. Public License Fallback. Should any part of the Waiver for any reason
+be judged legally invalid or ineffective under applicable law, then the
+Waiver shall be preserved to the maximum extent permitted taking into
+account Affirmer's express Statement of Purpose. In addition, to the
+extent the Waiver is so judged Affirmer hereby grants to each affected
+person a royalty-free, non transferable, non sublicensable, non exclusive,
+irrevocable and unconditional license to exercise Affirmer's Copyright and
+Related Rights in the Work (i) in all territories worldwide, (ii) for the
+maximum duration provided by applicable law or treaty (including future
+time extensions), (iii) in any current or future medium and for any number
+of copies, and (iv) for any purpose whatsoever, including without
+limitation commercial, advertising or promotional purposes (the
+"License"). The License shall be deemed effective as of the date CC0 was
+applied by Affirmer to the Work. Should any part of the License for any
+reason be judged legally invalid or ineffective under applicable law, such
+partial invalidity or ineffectiveness shall not invalidate the remainder
+of the License, and in such case Affirmer hereby affirms that he or she
+will not (i) exercise any of his or her remaining Copyright and Related
+Rights in the Work or (ii) assert any associated claims and causes of
+action with respect to the Work, in either case contrary to Affirmer's
+express Statement of Purpose.
+
+4. Limitations and Disclaimers.
+
+ a. No trademark or patent rights held by Affirmer are waived, abandoned,
+ surrendered, licensed or otherwise affected by this document.
+ b. Affirmer offers the Work as-is and makes no representations or
+ warranties of any kind concerning the Work, express, implied,
+ statutory or otherwise, including without limitation warranties of
+ title, merchantability, fitness for a particular purpose, non
+ infringement, or the absence of latent or other defects, accuracy, or
+ the present or absence of errors, whether or not discoverable, all to
+ the greatest extent permissible under applicable law.
+ c. Affirmer disclaims responsibility for clearing rights of other persons
+ that may apply to the Work or any use thereof, including without
+ limitation any person's Copyright and Related Rights in the Work.
+ Further, Affirmer disclaims responsibility for obtaining any necessary
+ consents, permissions or other rights required for any use of the
+ Work.
+ d. Affirmer understands and acknowledges that Creative Commons is not a
+ party to this document and has no duty or obligation with respect to
+ this CC0 or use of the Work.
diff --git a/vendor/github.com/zeebo/blake3/Makefile b/vendor/github.com/zeebo/blake3/Makefile
new file mode 100644
index 000000000..f98f0f093
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/Makefile
@@ -0,0 +1,11 @@
+asm: internal/alg/hash/hash_avx2/impl_amd64.s internal/alg/compress/compress_sse41/impl_amd64.s
+
+internal/alg/hash/hash_avx2/impl_amd64.s: avo/avx2/*.go
+ ( cd avo; go run ./avx2 ) > internal/alg/hash/hash_avx2/impl_amd64.s
+
+internal/alg/compress/compress_sse41/impl_amd64.s: avo/sse41/*.go
+ ( cd avo; go run ./sse41 ) > internal/alg/compress/compress_sse41/impl_amd64.s
+
+.PHONY: test
+test:
+ go test -race -bench=. -benchtime=1x
diff --git a/vendor/github.com/zeebo/blake3/README.md b/vendor/github.com/zeebo/blake3/README.md
new file mode 100644
index 000000000..0a0f2e186
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/README.md
@@ -0,0 +1,77 @@
+# BLAKE3
+
+<p>
+ <a href="https://pkg.go.dev/github.com/zeebo/blake3"><img src="https://img.shields.io/badge/doc-reference-007d9b?logo=go&style=flat-square" alt="go.dev" /></a>
+ <a href="https://goreportcard.com/report/github.com/zeebo/blake3"><img src="https://goreportcard.com/badge/github.com/zeebo/blake3?style=flat-square" alt="Go Report Card" /></a>
+ <a href="https://sourcegraph.com/github.com/zeebo/blake3?badge"><img src="https://sourcegraph.com/github.com/zeebo/blake3/-/badge.svg?style=flat-square" alt="SourceGraph" /></a>
+</p>
+
+Pure Go implementation of [BLAKE3](https://blake3.io) with AVX2 and SSE4.1 acceleration.
+
+Special thanks to the excellent [avo](https://github.com/mmcloughlin/avo) for making it much easier to write the vectorized versions.
+
+# Benchmarks
+
+## Caveats
+
+This library makes different design decisions from the upstream Rust crate around internal buffering. Specifically, because it does not target embedded systems and does not support multithreading, it does its own internal buffering. This means that a user does not have to worry about providing large enough buffers to get the best possible performance, but it does worse on smaller input sizes. So some notes:
+
+- The Rust benchmarks below are all single-threaded to match this Go implementation.
+- I made no attempt to get precise measurements (CPU throttling, noisy environment, etc.), so please benchmark on your own systems.
+- These benchmarks were run on an i7-6700K, which does not support AVX-512, so the Rust implementation is limited to AVX2 at sizes above 8 kib.
+- I tried my best to make them benchmark the same thing, but who knows? :smile:
+
+## Charts
+
+When hashing a large buffer in a single call, both libraries avoid most data copying and use vectorized instructions to hash as fast as possible, so they perform similarly.
+
+![Large Full Buffer](/assets/large-full-buffer.svg)
+
+For incremental writes, you must provide the Rust version with buffers large enough for it to use vectorized instructions. This Go library performs consistently regardless of the size passed into the update function.
+
+![Incremental](/assets/incremental.svg)
+
+The downside of internal buffering is most apparent with small sizes, as most of the time is spent initializing the hasher state. In terms of hashing rate, the difference is 3-4x, but in an absolute sense it's ~100ns (see tables below). If you wish to hash a large number of very small strings and you care about those nanoseconds, be sure to use the Reset method to avoid re-initializing the state (a usage sketch follows the tables below).
+
+![Small Full Buffer](/assets/small-full-buffer.svg)
+
+## Timing Tables
+
+### Small
+
+| Size | Full Buffer | Reset | | Full Buffer Rate | Reset Rate |
+|--------|-------------|------------|-|------------------|--------------|
+| 64 b | `205ns` | `86.5ns` | | `312MB/s` | `740MB/s` |
+| 256 b | `364ns` | `250ns` | | `703MB/s` | `1.03GB/s` |
+| 512 b | `575ns` | `468ns` | | `892MB/s` | `1.10GB/s` |
+| 768 b | `795ns` | `682ns` | | `967MB/s` | `1.13GB/s` |
+
+### Large
+
+| Size | Incremental | Full Buffer | Reset | | Incremental Rate | Full Buffer Rate | Reset Rate |
+|----------|-------------|-------------|------------|-|------------------|------------------|--------------|
+| 1 kib | `1.02µs` | `1.01µs` | `891ns` | | `1.00GB/s` | `1.01GB/s` | `1.15GB/s` |
+| 2 kib | `2.11µs` | `2.07µs` | `1.95µs` | | `968MB/s` | `990MB/s` | `1.05GB/s` |
+| 4 kib | `2.28µs` | `2.15µs` | `2.05µs` | | `1.80GB/s` | `1.90GB/s` | `2.00GB/s` |
+| 8 kib | `2.64µs` | `2.52µs` | `2.44µs` | | `3.11GB/s` | `3.25GB/s` | `3.36GB/s` |
+| 16 kib | `4.93µs` | `4.54µs` | `4.48µs` | | `3.33GB/s` | `3.61GB/s` | `3.66GB/s` |
+| 32 kib | `9.41µs` | `8.62µs` | `8.54µs` | | `3.48GB/s` | `3.80GB/s` | `3.84GB/s` |
+| 64 kib | `18.2µs` | `16.7µs` | `16.6µs` | | `3.59GB/s` | `3.91GB/s` | `3.94GB/s` |
+| 128 kib | `36.3µs` | `32.9µs` | `33.1µs` | | `3.61GB/s` | `3.99GB/s` | `3.96GB/s` |
+| 256 kib | `72.5µs` | `65.7µs` | `66.0µs` | | `3.62GB/s` | `3.99GB/s` | `3.97GB/s` |
+| 512 kib | `145µs` | `131µs` | `132µs` | | `3.60GB/s` | `4.00GB/s` | `3.97GB/s` |
+| 1024 kib | `290µs` | `262µs` | `262µs` | | `3.62GB/s` | `4.00GB/s` | `4.00GB/s` |
+
+### No ASM
+
+| Size | Incremental | Full Buffer | Reset | | Incremental Rate | Full Buffer Rate | Reset Rate |
+|----------|-------------|-------------|------------|-|------------------|------------------|-------------|
+| 64 b | `253ns` | `254ns` | `134ns` | | `253MB/s` | `252MB/s` | `478MB/s` |
+| 256 b | `553ns` | `557ns` | `441ns` | | `463MB/s` | `459MB/s` | `580MB/s` |
+| 512 b | `948ns` | `953ns` | `841ns` | | `540MB/s` | `538MB/s` | `609MB/s` |
+| 768 b | `1.38µs` | `1.40µs` | `1.35µs` | | `558MB/s` | `547MB/s` | `570MB/s` |
+| 1 kib | `1.77µs` | `1.77µs` | `1.70µs` | | `577MB/s` | `580MB/s` | `602MB/s` |
+| | | | | | | | |
+| 1024 kib | `880µs` | `883µs` | `878µs` | | `596MB/s` | `595MB/s` | `598MB/s` |
+
+The speed caps out at around 1 kib, so most rows have been elided from the presentation.
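As the caveats above note, reusing a single Hasher with Reset avoids paying the state-initialization cost for every small input. A minimal sketch of that pattern against the API vendored below (the input strings are placeholders):

```go
package main

import (
	"fmt"

	"github.com/zeebo/blake3"
)

func main() {
	inputs := []string{"alpha", "beta", "gamma"} // placeholder inputs

	// Reuse one Hasher instead of allocating a new one per input;
	// Reset returns it to the freshly-created state.
	h := blake3.New()
	for _, in := range inputs {
		h.Reset()
		_, _ = h.WriteString(in)
		fmt.Printf("%s: %x\n", in, h.Sum(nil)) // 32-byte digest by default
	}
}
```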
diff --git a/vendor/github.com/zeebo/blake3/api.go b/vendor/github.com/zeebo/blake3/api.go
new file mode 100644
index 000000000..5de263f08
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/api.go
@@ -0,0 +1,166 @@
+// Package blake3 provides an SSE4.1/AVX2 accelerated BLAKE3 implementation.
+package blake3
+
+import (
+ "errors"
+
+ "github.com/zeebo/blake3/internal/consts"
+ "github.com/zeebo/blake3/internal/utils"
+)
+
+// Hasher is a hash.Hash for BLAKE3.
+type Hasher struct {
+ size int
+ h hasher
+}
+
+// New returns a new Hasher that has a digest size of 32 bytes.
+//
+// If you need more or less output bytes than that, use the Digest method.
+func New() *Hasher {
+ return &Hasher{
+ size: 32,
+ h: hasher{
+ key: consts.IV,
+ },
+ }
+}
+
+// NewKeyed returns a new Hasher that uses the 32 byte input key and has
+// a digest size of 32 bytes.
+//
+// If you need more or less output bytes than that, use the Digest method.
+func NewKeyed(key []byte) (*Hasher, error) {
+ if len(key) != 32 {
+ return nil, errors.New("invalid key size")
+ }
+
+ h := &Hasher{
+ size: 32,
+ h: hasher{
+ flags: consts.Flag_Keyed,
+ },
+ }
+ utils.KeyFromBytes(key, &h.h.key)
+
+ return h, nil
+}
+
+// DeriveKey derives a key based on reusable key material of any
+// length, in the given context. The key will be stored in out, using
+// all of its current length.
+//
+// Context strings must be hardcoded constants, and the recommended
+// format is "[application] [commit timestamp] [purpose]", e.g.,
+// "example.com 2019-12-25 16:18:03 session tokens v1".
+func DeriveKey(context string, material []byte, out []byte) {
+ h := NewDeriveKey(context)
+ _, _ = h.Write(material)
+ _, _ = h.Digest().Read(out)
+}
+
+// NewDeriveKey returns a Hasher that is initialized with the context
+// string. See DeriveKey for details. It has a digest size of 32 bytes.
+//
+// If you need more or less output bytes than that, use the Digest method.
+func NewDeriveKey(context string) *Hasher {
+ // hash the context string and use that instead of IV
+ h := &Hasher{
+ size: 32,
+ h: hasher{
+ key: consts.IV,
+ flags: consts.Flag_DeriveKeyContext,
+ },
+ }
+
+ var buf [32]byte
+ _, _ = h.WriteString(context)
+ _, _ = h.Digest().Read(buf[:])
+
+ h.Reset()
+ utils.KeyFromBytes(buf[:], &h.h.key)
+ h.h.flags = consts.Flag_DeriveKeyMaterial
+
+ return h
+}
+
+// Write implements part of the hash.Hash interface. It never returns an error.
+func (h *Hasher) Write(p []byte) (int, error) {
+ h.h.update(p)
+ return len(p), nil
+}
+
+// WriteString is like Write but specialized to strings to avoid allocations.
+func (h *Hasher) WriteString(p string) (int, error) {
+ h.h.updateString(p)
+ return len(p), nil
+}
+
+// Reset implements part of the hash.Hash interface. It causes the Hasher to
+// act as if it was newly created.
+func (h *Hasher) Reset() {
+ h.h.reset()
+}
+
+// Clone returns a new Hasher with the same internal state.
+//
+// Modifying the resulting Hasher will not modify the original Hasher, and vice versa.
+func (h *Hasher) Clone() *Hasher {
+ return &Hasher{size: h.size, h: h.h}
+}
+
+// Size implements part of the hash.Hash interface. It returns the number of
+// bytes the hash will output in Sum.
+func (h *Hasher) Size() int {
+ return h.size
+}
+
+// BlockSize implements part of the hash.Hash interface. It returns the most
+// natural size to write to the Hasher.
+func (h *Hasher) BlockSize() int {
+ // TODO: is there a downside to picking this large size?
+ return 8192
+}
+
+// Sum implements part of the hash.Hash interface. It appends the digest of
+// the Hasher to the provided buffer and returns it.
+func (h *Hasher) Sum(b []byte) []byte {
+ if top := len(b) + h.size; top <= cap(b) && top >= len(b) {
+ h.h.finalize(b[len(b):top])
+ return b[:top]
+ }
+
+ tmp := make([]byte, h.size)
+ h.h.finalize(tmp)
+ return append(b, tmp...)
+}
+
+// Digest takes a snapshot of the hash state and returns an object that can
+// be used to read and seek through 2^64 bytes of digest output.
+func (h *Hasher) Digest() *Digest {
+ var d Digest
+ h.h.finalizeDigest(&d)
+ return &d
+}
+
+// Sum256 returns the first 256 bits of the unkeyed digest of the data.
+func Sum256(data []byte) (sum [32]byte) {
+ out := Sum512(data)
+ copy(sum[:], out[:32])
+ return sum
+}
+
+// Sum512 returns the first 512 bits of the unkeyed digest of the data.
+func Sum512(data []byte) (sum [64]byte) {
+ if len(data) <= consts.ChunkLen {
+ var d Digest
+ compressAll(&d, data, 0, consts.IV)
+ _, _ = d.Read(sum[:])
+ return sum
+ } else {
+ h := hasher{key: consts.IV}
+ h.update(data)
+ h.finalize(sum[:])
+ return sum
+ }
+}
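For reference, a short sketch exercising the three modes exposed above: unkeyed one-shot hashing, keyed hashing, and key derivation. The data and the all-zero key are placeholders; a real key must still be exactly 32 bytes.

```go
package main

import (
	"fmt"

	"github.com/zeebo/blake3"
)

func main() {
	data := []byte("hello, blake3") // placeholder input

	// Unkeyed one-shot hashing.
	sum := blake3.Sum256(data)
	fmt.Printf("sum256:  %x\n", sum)

	// Keyed hashing: NewKeyed rejects any key that is not 32 bytes.
	key := make([]byte, 32)
	h, err := blake3.NewKeyed(key)
	if err != nil {
		panic(err)
	}
	_, _ = h.Write(data)
	fmt.Printf("keyed:   %x\n", h.Sum(nil))

	// Key derivation: the context string should be a hardcoded constant,
	// as described in the DeriveKey documentation.
	out := make([]byte, 64) // derive 64 bytes of key material
	blake3.DeriveKey("example.com 2019-12-25 16:18:03 session tokens v1", data, out)
	fmt.Printf("derived: %x\n", out)
}
```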
diff --git a/vendor/github.com/zeebo/blake3/blake3.go b/vendor/github.com/zeebo/blake3/blake3.go
new file mode 100644
index 000000000..98dedcabe
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/blake3.go
@@ -0,0 +1,285 @@
+package blake3
+
+import (
+ "math/bits"
+ "unsafe"
+
+ "github.com/zeebo/blake3/internal/alg"
+ "github.com/zeebo/blake3/internal/consts"
+ "github.com/zeebo/blake3/internal/utils"
+)
+
+//
+// hasher contains state for a blake3 hash
+//
+
+type hasher struct {
+ len uint64
+ chunks uint64
+ flags uint32
+ key [8]uint32
+ stack cvstack
+ buf [8192]byte
+}
+
+func (a *hasher) reset() {
+ a.len = 0
+ a.chunks = 0
+ a.stack.occ = 0
+ a.stack.lvls = [8]uint8{}
+ a.stack.bufn = 0
+}
+
+func (a *hasher) update(buf []byte) {
+ // relies on the first two words of a string being the same as a slice
+ a.updateString(*(*string)(unsafe.Pointer(&buf)))
+}
+
+func (a *hasher) updateString(buf string) {
+ var input *[8192]byte
+
+ for len(buf) > 0 {
+ if a.len == 0 && len(buf) > 8192 {
+ // relies on the data pointer being the first word in the string header
+ input = (*[8192]byte)(*(*unsafe.Pointer)(unsafe.Pointer(&buf)))
+ buf = buf[8192:]
+ } else if a.len < 8192 {
+ n := copy(a.buf[a.len:], buf)
+ a.len += uint64(n)
+ buf = buf[n:]
+ continue
+ } else {
+ input = &a.buf
+ }
+
+ a.consume(input)
+ a.len = 0
+ a.chunks += 8
+ }
+}
+
+func (a *hasher) consume(input *[8192]byte) {
+ var out chainVector
+ var chain [8]uint32
+ alg.HashF(input, 8192, a.chunks, a.flags, &a.key, &out, &chain)
+ a.stack.pushN(0, &out, 8, a.flags, &a.key)
+}
+
+func (a *hasher) finalize(p []byte) {
+ var d Digest
+ a.finalizeDigest(&d)
+ _, _ = d.Read(p)
+}
+
+func (a *hasher) finalizeDigest(d *Digest) {
+ if a.chunks == 0 && a.len <= consts.ChunkLen {
+ compressAll(d, a.buf[:a.len], a.flags, a.key)
+ return
+ }
+
+ d.chain = a.key
+ d.flags = a.flags | consts.Flag_ChunkEnd
+
+ if a.len > 64 {
+ var buf chainVector
+ alg.HashF(&a.buf, a.len, a.chunks, a.flags, &a.key, &buf, &d.chain)
+
+ if a.len > consts.ChunkLen {
+ complete := (a.len - 1) / consts.ChunkLen
+ a.stack.pushN(0, &buf, int(complete), a.flags, &a.key)
+ a.chunks += complete
+ a.len = uint64(copy(a.buf[:], a.buf[complete*consts.ChunkLen:a.len]))
+ }
+ }
+
+ if a.len <= 64 {
+ d.flags |= consts.Flag_ChunkStart
+ }
+
+ d.counter = a.chunks
+ d.blen = uint32(a.len) % 64
+
+ base := a.len / 64 * 64
+ if a.len > 0 && d.blen == 0 {
+ d.blen = 64
+ base -= 64
+ }
+
+ if consts.IsLittleEndian {
+ copy((*[64]byte)(unsafe.Pointer(&d.block[0]))[:], a.buf[base:a.len])
+ } else {
+ var tmp [64]byte
+ copy(tmp[:], a.buf[base:a.len])
+ utils.BytesToWords(&tmp, &d.block)
+ }
+
+ for a.stack.bufn > 0 {
+ a.stack.flush(a.flags, &a.key)
+ }
+
+ var tmp [16]uint32
+ for occ := a.stack.occ; occ != 0; occ &= occ - 1 {
+ col := uint(bits.TrailingZeros64(occ)) % 64
+
+ alg.Compress(&d.chain, &d.block, d.counter, d.blen, d.flags, &tmp)
+
+ *(*[8]uint32)(unsafe.Pointer(&d.block[0])) = a.stack.stack[col]
+ *(*[8]uint32)(unsafe.Pointer(&d.block[8])) = *(*[8]uint32)(unsafe.Pointer(&tmp[0]))
+
+ if occ == a.stack.occ {
+ d.chain = a.key
+ d.counter = 0
+ d.blen = consts.BlockLen
+ d.flags = a.flags | consts.Flag_Parent
+ }
+ }
+
+ d.flags |= consts.Flag_Root
+}
+
+//
+// chain value stack
+//
+
+type chainVector = [64]uint32
+
+type cvstack struct {
+ occ uint64 // which levels in stack are occupied
+ lvls [8]uint8 // what level the buf input was in
+ bufn int // how many pairs are loaded into buf
+ buf [2]chainVector
+ stack [64][8]uint32
+}
+
+func (a *cvstack) pushN(l uint8, cv *chainVector, n int, flags uint32, key *[8]uint32) {
+ for i := 0; i < n; i++ {
+ a.pushL(l, cv, i)
+ for a.bufn == 8 {
+ a.flush(flags, key)
+ }
+ }
+}
+
+func (a *cvstack) pushL(l uint8, cv *chainVector, n int) {
+ bit := uint64(1) << (l & 63)
+ if a.occ&bit == 0 {
+ readChain(cv, n, &a.stack[l&63])
+ a.occ ^= bit
+ return
+ }
+
+ a.lvls[a.bufn&7] = l
+ writeChain(&a.stack[l&63], &a.buf[0], a.bufn)
+ copyChain(cv, n, &a.buf[1], a.bufn)
+ a.bufn++
+ a.occ ^= bit
+}
+
+func (a *cvstack) flush(flags uint32, key *[8]uint32) {
+ var out chainVector
+ alg.HashP(&a.buf[0], &a.buf[1], flags|consts.Flag_Parent, key, &out, a.bufn)
+
+ bufn, lvls := a.bufn, a.lvls
+ a.bufn, a.lvls = 0, [8]uint8{}
+
+ for i := 0; i < bufn; i++ {
+ a.pushL(lvls[i]+1, &out, i)
+ }
+}
+
+//
+// helpers to deal with reading/writing transposed values
+//
+
+func copyChain(in *chainVector, icol int, out *chainVector, ocol int) {
+ type u = uintptr
+ type p = unsafe.Pointer
+ type a = *uint32
+
+ i := p(u(p(in)) + u(icol*4))
+ o := p(u(p(out)) + u(ocol*4))
+
+ *a(p(u(o) + 0*32)) = *a(p(u(i) + 0*32))
+ *a(p(u(o) + 1*32)) = *a(p(u(i) + 1*32))
+ *a(p(u(o) + 2*32)) = *a(p(u(i) + 2*32))
+ *a(p(u(o) + 3*32)) = *a(p(u(i) + 3*32))
+ *a(p(u(o) + 4*32)) = *a(p(u(i) + 4*32))
+ *a(p(u(o) + 5*32)) = *a(p(u(i) + 5*32))
+ *a(p(u(o) + 6*32)) = *a(p(u(i) + 6*32))
+ *a(p(u(o) + 7*32)) = *a(p(u(i) + 7*32))
+}
+
+func readChain(in *chainVector, col int, out *[8]uint32) {
+ type u = uintptr
+ type p = unsafe.Pointer
+ type a = *uint32
+
+ i := p(u(p(in)) + u(col*4))
+
+ out[0] = *a(p(u(i) + 0*32))
+ out[1] = *a(p(u(i) + 1*32))
+ out[2] = *a(p(u(i) + 2*32))
+ out[3] = *a(p(u(i) + 3*32))
+ out[4] = *a(p(u(i) + 4*32))
+ out[5] = *a(p(u(i) + 5*32))
+ out[6] = *a(p(u(i) + 6*32))
+ out[7] = *a(p(u(i) + 7*32))
+}
+
+func writeChain(in *[8]uint32, out *chainVector, col int) {
+ type u = uintptr
+ type p = unsafe.Pointer
+ type a = *uint32
+
+ o := p(u(p(out)) + u(col*4))
+
+ *a(p(u(o) + 0*32)) = in[0]
+ *a(p(u(o) + 1*32)) = in[1]
+ *a(p(u(o) + 2*32)) = in[2]
+ *a(p(u(o) + 3*32)) = in[3]
+ *a(p(u(o) + 4*32)) = in[4]
+ *a(p(u(o) + 5*32)) = in[5]
+ *a(p(u(o) + 6*32)) = in[6]
+ *a(p(u(o) + 7*32)) = in[7]
+}
+
+//
+// compress <= chunkLen bytes in one shot
+//
+
+func compressAll(d *Digest, in []byte, flags uint32, key [8]uint32) {
+ var compressed [16]uint32
+
+ d.chain = key
+ d.flags = flags | consts.Flag_ChunkStart
+
+ for len(in) > 64 {
+ buf := (*[64]byte)(unsafe.Pointer(&in[0]))
+
+ var block *[16]uint32
+ if consts.IsLittleEndian {
+ block = (*[16]uint32)(unsafe.Pointer(buf))
+ } else {
+ block = &d.block
+ utils.BytesToWords(buf, block)
+ }
+
+ alg.Compress(&d.chain, block, 0, consts.BlockLen, d.flags, &compressed)
+
+ d.chain = *(*[8]uint32)(unsafe.Pointer(&compressed[0]))
+ d.flags &^= consts.Flag_ChunkStart
+
+ in = in[64:]
+ }
+
+ if consts.IsLittleEndian {
+ copy((*[64]byte)(unsafe.Pointer(&d.block[0]))[:], in)
+ } else {
+ var tmp [64]byte
+ copy(tmp[:], in)
+ utils.BytesToWords(&tmp, &d.block)
+ }
+
+ d.blen = uint32(len(in))
+ d.flags |= consts.Flag_ChunkEnd | consts.Flag_Root
+}
diff --git a/vendor/github.com/zeebo/blake3/digest.go b/vendor/github.com/zeebo/blake3/digest.go
new file mode 100644
index 000000000..58365d5ab
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/digest.go
@@ -0,0 +1,100 @@
+package blake3
+
+import (
+ "fmt"
+ "io"
+ "unsafe"
+
+ "github.com/zeebo/blake3/internal/alg"
+ "github.com/zeebo/blake3/internal/consts"
+ "github.com/zeebo/blake3/internal/utils"
+)
+
+// Digest captures the state of a Hasher allowing reading and seeking through
+// the output stream.
+type Digest struct {
+ counter uint64
+ chain [8]uint32
+ block [16]uint32
+ blen uint32
+ flags uint32
+ buf [16]uint32
+ bufn int
+}
+
+// Read reads data from the hasher into p. It always fills the entire buffer and
+// never errors. The stream will wrap around when reading past 2^64 bytes.
+func (d *Digest) Read(p []byte) (n int, err error) {
+ n = len(p)
+
+ if d.bufn > 0 {
+ n := d.slowCopy(p)
+ p = p[n:]
+ d.bufn -= n
+ }
+
+ for len(p) >= 64 {
+ d.fillBuf()
+
+ if consts.IsLittleEndian {
+ *(*[64]byte)(unsafe.Pointer(&p[0])) = *(*[64]byte)(unsafe.Pointer(&d.buf[0]))
+ } else {
+ utils.WordsToBytes(&d.buf, p)
+ }
+
+ p = p[64:]
+ d.bufn = 0
+ }
+
+ if len(p) == 0 {
+ return n, nil
+ }
+
+ d.fillBuf()
+ d.bufn -= d.slowCopy(p)
+
+ return n, nil
+}
+
+// Seek sets the position to the provided location. Only SeekStart and
+// SeekCurrent are allowed.
+func (d *Digest) Seek(offset int64, whence int) (int64, error) {
+ switch whence {
+ case io.SeekStart:
+ case io.SeekEnd:
+ return 0, fmt.Errorf("seek from end not supported")
+ case io.SeekCurrent:
+ offset += int64(consts.BlockLen*d.counter) - int64(d.bufn)
+ default:
+ return 0, fmt.Errorf("invalid whence: %d", whence)
+ }
+ if offset < 0 {
+ return 0, fmt.Errorf("seek before start")
+ }
+ d.setPosition(uint64(offset))
+ return offset, nil
+}
+
+func (d *Digest) setPosition(pos uint64) {
+ d.counter = pos / consts.BlockLen
+ d.fillBuf()
+ d.bufn -= int(pos % consts.BlockLen)
+}
+
+func (d *Digest) slowCopy(p []byte) (n int) {
+ off := uint(consts.BlockLen-d.bufn) % consts.BlockLen
+ if consts.IsLittleEndian {
+ n = copy(p, (*[consts.BlockLen]byte)(unsafe.Pointer(&d.buf[0]))[off:])
+ } else {
+ var tmp [consts.BlockLen]byte
+ utils.WordsToBytes(&d.buf, tmp[:])
+ n = copy(p, tmp[off:])
+ }
+ return n
+}
+
+func (d *Digest) fillBuf() {
+ alg.Compress(&d.chain, &d.block, d.counter, d.blen, d.flags, &d.buf)
+ d.counter++
+ d.bufn = consts.BlockLen
+}
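A small usage sketch of the Digest stream (placeholder input): Read always fills the buffer, and Seek with io.SeekStart repositions the stream, so re-reading from the start reproduces the same bytes.

```go
package main

import (
	"bytes"
	"fmt"
	"io"

	"github.com/zeebo/blake3"
)

func main() {
	h := blake3.New()
	_, _ = h.Write([]byte("some input")) // placeholder input

	// Digest snapshots the hasher state and exposes the extendable output.
	d := h.Digest()

	first := make([]byte, 64)
	_, _ = d.Read(first)

	// Seek back to the start and read again; the stream is deterministic.
	_, _ = d.Seek(0, io.SeekStart)
	again := make([]byte, 64)
	_, _ = d.Read(again)

	fmt.Println(bytes.Equal(first, again)) // true
}
```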
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/alg.go b/vendor/github.com/zeebo/blake3/internal/alg/alg.go
new file mode 100644
index 000000000..239fdec5b
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/alg.go
@@ -0,0 +1,18 @@
+package alg
+
+import (
+ "github.com/zeebo/blake3/internal/alg/compress"
+ "github.com/zeebo/blake3/internal/alg/hash"
+)
+
+func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) {
+ hash.HashF(input, length, counter, flags, key, out, chain)
+}
+
+func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int) {
+ hash.HashP(left, right, flags, key, out, n)
+}
+
+func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32) {
+ compress.Compress(chain, block, counter, blen, flags, out)
+}
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress.go b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress.go
new file mode 100644
index 000000000..0b2685408
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress.go
@@ -0,0 +1,15 @@
+package compress
+
+import (
+ "github.com/zeebo/blake3/internal/alg/compress/compress_pure"
+ "github.com/zeebo/blake3/internal/alg/compress/compress_sse41"
+ "github.com/zeebo/blake3/internal/consts"
+)
+
+func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32) {
+ if consts.HasSSE41 {
+ compress_sse41.Compress(chain, block, counter, blen, flags, out)
+ } else {
+ compress_pure.Compress(chain, block, counter, blen, flags, out)
+ }
+}
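The SSE4.1 path is selected at runtime through consts.HasSSE41, which this package populates in internal/consts (not part of this hunk). Purely as an illustration of the same feature-gated dispatch pattern, here is a sketch using golang.org/x/sys/cpu; it is not how this package detects CPU features, and the two compress functions are hypothetical stand-ins:

```go
package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

// sse41Compress and pureCompress are hypothetical stand-ins for the
// accelerated and pure-Go implementations.
func sse41Compress() { fmt.Println("sse4.1 path") }
func pureCompress() { fmt.Println("pure go path") }

func main() {
	// cpu.X86 fields are false on non-x86 platforms, so the pure path
	// is taken automatically there.
	if cpu.X86.HasSSE41 {
		sse41Compress()
	} else {
		pureCompress()
	}
}
```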
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_pure/compress.go b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_pure/compress.go
new file mode 100644
index 000000000..66ea1fb75
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_pure/compress.go
@@ -0,0 +1,135 @@
+package compress_pure
+
+import (
+ "math/bits"
+
+ "github.com/zeebo/blake3/internal/consts"
+)
+
+func Compress(
+ chain *[8]uint32,
+ block *[16]uint32,
+ counter uint64,
+ blen uint32,
+ flags uint32,
+ out *[16]uint32,
+) {
+
+ *out = [16]uint32{
+ chain[0], chain[1], chain[2], chain[3],
+ chain[4], chain[5], chain[6], chain[7],
+ consts.IV0, consts.IV1, consts.IV2, consts.IV3,
+ uint32(counter), uint32(counter >> 32), blen, flags,
+ }
+
+ rcompress(out, block)
+}
+
+func g(a, b, c, d, mx, my uint32) (uint32, uint32, uint32, uint32) {
+ a += b + mx
+ d = bits.RotateLeft32(d^a, -16)
+ c += d
+ b = bits.RotateLeft32(b^c, -12)
+ a += b + my
+ d = bits.RotateLeft32(d^a, -8)
+ c += d
+ b = bits.RotateLeft32(b^c, -7)
+ return a, b, c, d
+}
+
+func rcompress(s *[16]uint32, m *[16]uint32) {
+ const (
+ a = 10
+ b = 11
+ c = 12
+ d = 13
+ e = 14
+ f = 15
+ )
+
+ s0, s1, s2, s3 := s[0+0], s[0+1], s[0+2], s[0+3]
+ s4, s5, s6, s7 := s[0+4], s[0+5], s[0+6], s[0+7]
+ s8, s9, sa, sb := s[8+0], s[8+1], s[8+2], s[8+3]
+ sc, sd, se, sf := s[8+4], s[8+5], s[8+6], s[8+7]
+
+ s0, s4, s8, sc = g(s0, s4, s8, sc, m[0], m[1])
+ s1, s5, s9, sd = g(s1, s5, s9, sd, m[2], m[3])
+ s2, s6, sa, se = g(s2, s6, sa, se, m[4], m[5])
+ s3, s7, sb, sf = g(s3, s7, sb, sf, m[6], m[7])
+ s0, s5, sa, sf = g(s0, s5, sa, sf, m[8], m[9])
+ s1, s6, sb, sc = g(s1, s6, sb, sc, m[a], m[b])
+ s2, s7, s8, sd = g(s2, s7, s8, sd, m[c], m[d])
+ s3, s4, s9, se = g(s3, s4, s9, se, m[e], m[f])
+
+ s0, s4, s8, sc = g(s0, s4, s8, sc, m[2], m[6])
+ s1, s5, s9, sd = g(s1, s5, s9, sd, m[3], m[a])
+ s2, s6, sa, se = g(s2, s6, sa, se, m[7], m[0])
+ s3, s7, sb, sf = g(s3, s7, sb, sf, m[4], m[d])
+ s0, s5, sa, sf = g(s0, s5, sa, sf, m[1], m[b])
+ s1, s6, sb, sc = g(s1, s6, sb, sc, m[c], m[5])
+ s2, s7, s8, sd = g(s2, s7, s8, sd, m[9], m[e])
+ s3, s4, s9, se = g(s3, s4, s9, se, m[f], m[8])
+
+ s0, s4, s8, sc = g(s0, s4, s8, sc, m[3], m[4])
+ s1, s5, s9, sd = g(s1, s5, s9, sd, m[a], m[c])
+ s2, s6, sa, se = g(s2, s6, sa, se, m[d], m[2])
+ s3, s7, sb, sf = g(s3, s7, sb, sf, m[7], m[e])
+ s0, s5, sa, sf = g(s0, s5, sa, sf, m[6], m[5])
+ s1, s6, sb, sc = g(s1, s6, sb, sc, m[9], m[0])
+ s2, s7, s8, sd = g(s2, s7, s8, sd, m[b], m[f])
+ s3, s4, s9, se = g(s3, s4, s9, se, m[8], m[1])
+
+ s0, s4, s8, sc = g(s0, s4, s8, sc, m[a], m[7])
+ s1, s5, s9, sd = g(s1, s5, s9, sd, m[c], m[9])
+ s2, s6, sa, se = g(s2, s6, sa, se, m[e], m[3])
+ s3, s7, sb, sf = g(s3, s7, sb, sf, m[d], m[f])
+ s0, s5, sa, sf = g(s0, s5, sa, sf, m[4], m[0])
+ s1, s6, sb, sc = g(s1, s6, sb, sc, m[b], m[2])
+ s2, s7, s8, sd = g(s2, s7, s8, sd, m[5], m[8])
+ s3, s4, s9, se = g(s3, s4, s9, se, m[1], m[6])
+
+ s0, s4, s8, sc = g(s0, s4, s8, sc, m[c], m[d])
+ s1, s5, s9, sd = g(s1, s5, s9, sd, m[9], m[b])
+ s2, s6, sa, se = g(s2, s6, sa, se, m[f], m[a])
+ s3, s7, sb, sf = g(s3, s7, sb, sf, m[e], m[8])
+ s0, s5, sa, sf = g(s0, s5, sa, sf, m[7], m[2])
+ s1, s6, sb, sc = g(s1, s6, sb, sc, m[5], m[3])
+ s2, s7, s8, sd = g(s2, s7, s8, sd, m[0], m[1])
+ s3, s4, s9, se = g(s3, s4, s9, se, m[6], m[4])
+
+ s0, s4, s8, sc = g(s0, s4, s8, sc, m[9], m[e])
+ s1, s5, s9, sd = g(s1, s5, s9, sd, m[b], m[5])
+ s2, s6, sa, se = g(s2, s6, sa, se, m[8], m[c])
+ s3, s7, sb, sf = g(s3, s7, sb, sf, m[f], m[1])
+ s0, s5, sa, sf = g(s0, s5, sa, sf, m[d], m[3])
+ s1, s6, sb, sc = g(s1, s6, sb, sc, m[0], m[a])
+ s2, s7, s8, sd = g(s2, s7, s8, sd, m[2], m[6])
+ s3, s4, s9, se = g(s3, s4, s9, se, m[4], m[7])
+
+ s0, s4, s8, sc = g(s0, s4, s8, sc, m[b], m[f])
+ s1, s5, s9, sd = g(s1, s5, s9, sd, m[5], m[0])
+ s2, s6, sa, se = g(s2, s6, sa, se, m[1], m[9])
+ s3, s7, sb, sf = g(s3, s7, sb, sf, m[8], m[6])
+ s0, s5, sa, sf = g(s0, s5, sa, sf, m[e], m[a])
+ s1, s6, sb, sc = g(s1, s6, sb, sc, m[2], m[c])
+ s2, s7, s8, sd = g(s2, s7, s8, sd, m[3], m[4])
+ s3, s4, s9, se = g(s3, s4, s9, se, m[7], m[d])
+
+ s[8+0] = s8 ^ s[0]
+ s[8+1] = s9 ^ s[1]
+ s[8+2] = sa ^ s[2]
+ s[8+3] = sb ^ s[3]
+ s[8+4] = sc ^ s[4]
+ s[8+5] = sd ^ s[5]
+ s[8+6] = se ^ s[6]
+ s[8+7] = sf ^ s[7]
+
+ s[0] = s0 ^ s8
+ s[1] = s1 ^ s9
+ s[2] = s2 ^ sa
+ s[3] = s3 ^ sb
+ s[4] = s4 ^ sc
+ s[5] = s5 ^ sd
+ s[6] = s6 ^ se
+ s[7] = s7 ^ sf
+}
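The g function above is the BLAKE3 quarter-round, with the right rotations by 16, 12, 8, and 7 bits written as negative arguments to bits.RotateLeft32. A tiny standalone check of that equivalence (the sample value is arbitrary):

```go
package main

import (
	"fmt"
	"math/bits"
)

func main() {
	x := uint32(0x01234567) // arbitrary sample value

	// A negative rotate-left count is a rotate-right, so these agree.
	viaRotate := bits.RotateLeft32(x, -12)
	manual := x>>12 | x<<20

	fmt.Printf("%08x %08x equal=%v\n", viaRotate, manual, viaRotate == manual)
}
```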
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_amd64.s b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_amd64.s
new file mode 100644
index 000000000..0fedf0b3a
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_amd64.s
@@ -0,0 +1,560 @@
+// Code generated by command: go run compress.go. DO NOT EDIT.
+
+#include "textflag.h"
+
+DATA iv<>+0(SB)/4, $0x6a09e667
+DATA iv<>+4(SB)/4, $0xbb67ae85
+DATA iv<>+8(SB)/4, $0x3c6ef372
+DATA iv<>+12(SB)/4, $0xa54ff53a
+DATA iv<>+16(SB)/4, $0x510e527f
+DATA iv<>+20(SB)/4, $0x9b05688c
+DATA iv<>+24(SB)/4, $0x1f83d9ab
+DATA iv<>+28(SB)/4, $0x5be0cd19
+GLOBL iv<>(SB), RODATA|NOPTR, $32
+
+DATA rot16_shuf<>+0(SB)/1, $0x02
+DATA rot16_shuf<>+1(SB)/1, $0x03
+DATA rot16_shuf<>+2(SB)/1, $0x00
+DATA rot16_shuf<>+3(SB)/1, $0x01
+DATA rot16_shuf<>+4(SB)/1, $0x06
+DATA rot16_shuf<>+5(SB)/1, $0x07
+DATA rot16_shuf<>+6(SB)/1, $0x04
+DATA rot16_shuf<>+7(SB)/1, $0x05
+DATA rot16_shuf<>+8(SB)/1, $0x0a
+DATA rot16_shuf<>+9(SB)/1, $0x0b
+DATA rot16_shuf<>+10(SB)/1, $0x08
+DATA rot16_shuf<>+11(SB)/1, $0x09
+DATA rot16_shuf<>+12(SB)/1, $0x0e
+DATA rot16_shuf<>+13(SB)/1, $0x0f
+DATA rot16_shuf<>+14(SB)/1, $0x0c
+DATA rot16_shuf<>+15(SB)/1, $0x0d
+DATA rot16_shuf<>+16(SB)/1, $0x12
+DATA rot16_shuf<>+17(SB)/1, $0x13
+DATA rot16_shuf<>+18(SB)/1, $0x10
+DATA rot16_shuf<>+19(SB)/1, $0x11
+DATA rot16_shuf<>+20(SB)/1, $0x16
+DATA rot16_shuf<>+21(SB)/1, $0x17
+DATA rot16_shuf<>+22(SB)/1, $0x14
+DATA rot16_shuf<>+23(SB)/1, $0x15
+DATA rot16_shuf<>+24(SB)/1, $0x1a
+DATA rot16_shuf<>+25(SB)/1, $0x1b
+DATA rot16_shuf<>+26(SB)/1, $0x18
+DATA rot16_shuf<>+27(SB)/1, $0x19
+DATA rot16_shuf<>+28(SB)/1, $0x1e
+DATA rot16_shuf<>+29(SB)/1, $0x1f
+DATA rot16_shuf<>+30(SB)/1, $0x1c
+DATA rot16_shuf<>+31(SB)/1, $0x1d
+GLOBL rot16_shuf<>(SB), RODATA|NOPTR, $32
+
+DATA rot8_shuf<>+0(SB)/1, $0x01
+DATA rot8_shuf<>+1(SB)/1, $0x02
+DATA rot8_shuf<>+2(SB)/1, $0x03
+DATA rot8_shuf<>+3(SB)/1, $0x00
+DATA rot8_shuf<>+4(SB)/1, $0x05
+DATA rot8_shuf<>+5(SB)/1, $0x06
+DATA rot8_shuf<>+6(SB)/1, $0x07
+DATA rot8_shuf<>+7(SB)/1, $0x04
+DATA rot8_shuf<>+8(SB)/1, $0x09
+DATA rot8_shuf<>+9(SB)/1, $0x0a
+DATA rot8_shuf<>+10(SB)/1, $0x0b
+DATA rot8_shuf<>+11(SB)/1, $0x08
+DATA rot8_shuf<>+12(SB)/1, $0x0d
+DATA rot8_shuf<>+13(SB)/1, $0x0e
+DATA rot8_shuf<>+14(SB)/1, $0x0f
+DATA rot8_shuf<>+15(SB)/1, $0x0c
+DATA rot8_shuf<>+16(SB)/1, $0x11
+DATA rot8_shuf<>+17(SB)/1, $0x12
+DATA rot8_shuf<>+18(SB)/1, $0x13
+DATA rot8_shuf<>+19(SB)/1, $0x10
+DATA rot8_shuf<>+20(SB)/1, $0x15
+DATA rot8_shuf<>+21(SB)/1, $0x16
+DATA rot8_shuf<>+22(SB)/1, $0x17
+DATA rot8_shuf<>+23(SB)/1, $0x14
+DATA rot8_shuf<>+24(SB)/1, $0x19
+DATA rot8_shuf<>+25(SB)/1, $0x1a
+DATA rot8_shuf<>+26(SB)/1, $0x1b
+DATA rot8_shuf<>+27(SB)/1, $0x18
+DATA rot8_shuf<>+28(SB)/1, $0x1d
+DATA rot8_shuf<>+29(SB)/1, $0x1e
+DATA rot8_shuf<>+30(SB)/1, $0x1f
+DATA rot8_shuf<>+31(SB)/1, $0x1c
+GLOBL rot8_shuf<>(SB), RODATA|NOPTR, $32
+
+// func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32)
+// Requires: SSE, SSE2, SSE4.1, SSSE3
+TEXT ·Compress(SB), NOSPLIT, $0-40
+ MOVQ chain+0(FP), AX
+ MOVQ block+8(FP), CX
+ MOVQ counter+16(FP), DX
+ MOVL blen+24(FP), BX
+ MOVL flags+28(FP), BP
+ MOVQ out+32(FP), SI
+ MOVUPS (AX), X0
+ MOVUPS 16(AX), X1
+ MOVUPS iv<>+0(SB), X2
+ PINSRD $0x00, DX, X3
+ SHRQ $0x20, DX
+ PINSRD $0x01, DX, X3
+ PINSRD $0x02, BX, X3
+ PINSRD $0x03, BP, X3
+ MOVUPS (CX), X4
+ MOVUPS 16(CX), X5
+ MOVUPS 32(CX), X6
+ MOVUPS 48(CX), X7
+ MOVUPS rot16_shuf<>+0(SB), X8
+ MOVUPS rot8_shuf<>+0(SB), X9
+
+ // round 1
+ MOVAPS X4, X10
+ SHUFPS $0x88, X5, X10
+ PADDD X10, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X11
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X11
+ POR X11, X1
+ MOVAPS X4, X4
+ SHUFPS $0xdd, X5, X4
+ PADDD X4, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X5
+ PSRLL $0x07, X1
+ PSLLL $0x19, X5
+ POR X5, X1
+ PSHUFD $0x93, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x39, X2, X2
+ MOVAPS X6, X5
+ SHUFPS $0x88, X7, X5
+ SHUFPS $0x93, X5, X5
+ PADDD X5, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X11
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X11
+ POR X11, X1
+ MOVAPS X6, X6
+ SHUFPS $0xdd, X7, X6
+ SHUFPS $0x93, X6, X6
+ PADDD X6, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X7
+ PSRLL $0x07, X1
+ PSLLL $0x19, X7
+ POR X7, X1
+ PSHUFD $0x39, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x93, X2, X2
+
+ // round 2
+ MOVAPS X10, X7
+ SHUFPS $0xd6, X4, X7
+ SHUFPS $0x39, X7, X7
+ PADDD X7, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X11
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X11
+ POR X11, X1
+ MOVAPS X5, X11
+ SHUFPS $0xfa, X6, X11
+ PSHUFD $0x0f, X10, X10
+ PBLENDW $0x33, X10, X11
+ PADDD X11, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X10
+ PSRLL $0x07, X1
+ PSLLL $0x19, X10
+ POR X10, X1
+ PSHUFD $0x93, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x39, X2, X2
+ MOVAPS X6, X12
+ PUNPCKLLQ X4, X12
+ PBLENDW $0xc0, X5, X12
+ SHUFPS $0xb4, X12, X12
+ PADDD X12, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X10
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X10
+ POR X10, X1
+ MOVAPS X4, X10
+ PUNPCKHLQ X6, X10
+ MOVAPS X5, X4
+ PUNPCKLLQ X10, X4
+ SHUFPS $0x1e, X4, X4
+ PADDD X4, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X5
+ PSRLL $0x07, X1
+ PSLLL $0x19, X5
+ POR X5, X1
+ PSHUFD $0x39, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x93, X2, X2
+
+ // round 3
+ MOVAPS X7, X5
+ SHUFPS $0xd6, X11, X5
+ SHUFPS $0x39, X5, X5
+ PADDD X5, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X6
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X6
+ POR X6, X1
+ MOVAPS X12, X6
+ SHUFPS $0xfa, X4, X6
+ PSHUFD $0x0f, X7, X7
+ PBLENDW $0x33, X7, X6
+ PADDD X6, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X7
+ PSRLL $0x07, X1
+ PSLLL $0x19, X7
+ POR X7, X1
+ PSHUFD $0x93, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x39, X2, X2
+ MOVAPS X4, X10
+ PUNPCKLLQ X11, X10
+ PBLENDW $0xc0, X12, X10
+ SHUFPS $0xb4, X10, X10
+ PADDD X10, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X7
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X7
+ POR X7, X1
+ MOVAPS X11, X7
+ PUNPCKHLQ X4, X7
+ MOVAPS X12, X4
+ PUNPCKLLQ X7, X4
+ SHUFPS $0x1e, X4, X4
+ PADDD X4, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X7
+ PSRLL $0x07, X1
+ PSLLL $0x19, X7
+ POR X7, X1
+ PSHUFD $0x39, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x93, X2, X2
+
+ // round 4
+ MOVAPS X5, X7
+ SHUFPS $0xd6, X6, X7
+ SHUFPS $0x39, X7, X7
+ PADDD X7, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X11
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X11
+ POR X11, X1
+ MOVAPS X10, X11
+ SHUFPS $0xfa, X4, X11
+ PSHUFD $0x0f, X5, X5
+ PBLENDW $0x33, X5, X11
+ PADDD X11, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X5
+ PSRLL $0x07, X1
+ PSLLL $0x19, X5
+ POR X5, X1
+ PSHUFD $0x93, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x39, X2, X2
+ MOVAPS X4, X12
+ PUNPCKLLQ X6, X12
+ PBLENDW $0xc0, X10, X12
+ SHUFPS $0xb4, X12, X12
+ PADDD X12, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X5
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X5
+ POR X5, X1
+ MOVAPS X6, X5
+ PUNPCKHLQ X4, X5
+ MOVAPS X10, X4
+ PUNPCKLLQ X5, X4
+ SHUFPS $0x1e, X4, X4
+ PADDD X4, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X5
+ PSRLL $0x07, X1
+ PSLLL $0x19, X5
+ POR X5, X1
+ PSHUFD $0x39, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x93, X2, X2
+
+ // round 5
+ MOVAPS X7, X5
+ SHUFPS $0xd6, X11, X5
+ SHUFPS $0x39, X5, X5
+ PADDD X5, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X6
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X6
+ POR X6, X1
+ MOVAPS X12, X6
+ SHUFPS $0xfa, X4, X6
+ PSHUFD $0x0f, X7, X7
+ PBLENDW $0x33, X7, X6
+ PADDD X6, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X7
+ PSRLL $0x07, X1
+ PSLLL $0x19, X7
+ POR X7, X1
+ PSHUFD $0x93, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x39, X2, X2
+ MOVAPS X4, X10
+ PUNPCKLLQ X11, X10
+ PBLENDW $0xc0, X12, X10
+ SHUFPS $0xb4, X10, X10
+ PADDD X10, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X7
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X7
+ POR X7, X1
+ MOVAPS X11, X7
+ PUNPCKHLQ X4, X7
+ MOVAPS X12, X4
+ PUNPCKLLQ X7, X4
+ SHUFPS $0x1e, X4, X4
+ PADDD X4, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X7
+ PSRLL $0x07, X1
+ PSLLL $0x19, X7
+ POR X7, X1
+ PSHUFD $0x39, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x93, X2, X2
+
+ // round 6
+ MOVAPS X5, X7
+ SHUFPS $0xd6, X6, X7
+ SHUFPS $0x39, X7, X7
+ PADDD X7, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X11
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X11
+ POR X11, X1
+ MOVAPS X10, X11
+ SHUFPS $0xfa, X4, X11
+ PSHUFD $0x0f, X5, X5
+ PBLENDW $0x33, X5, X11
+ PADDD X11, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X5
+ PSRLL $0x07, X1
+ PSLLL $0x19, X5
+ POR X5, X1
+ PSHUFD $0x93, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x39, X2, X2
+ MOVAPS X4, X12
+ PUNPCKLLQ X6, X12
+ PBLENDW $0xc0, X10, X12
+ SHUFPS $0xb4, X12, X12
+ PADDD X12, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X5
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X5
+ POR X5, X1
+ MOVAPS X6, X5
+ PUNPCKHLQ X4, X5
+ MOVAPS X10, X4
+ PUNPCKLLQ X5, X4
+ SHUFPS $0x1e, X4, X4
+ PADDD X4, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X5
+ PSRLL $0x07, X1
+ PSLLL $0x19, X5
+ POR X5, X1
+ PSHUFD $0x39, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x93, X2, X2
+
+ // round 7
+ MOVAPS X7, X5
+ SHUFPS $0xd6, X11, X5
+ SHUFPS $0x39, X5, X5
+ PADDD X5, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X5
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X5
+ POR X5, X1
+ MOVAPS X12, X5
+ SHUFPS $0xfa, X4, X5
+ PSHUFD $0x0f, X7, X6
+ PBLENDW $0x33, X6, X5
+ PADDD X5, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X5
+ PSRLL $0x07, X1
+ PSLLL $0x19, X5
+ POR X5, X1
+ PSHUFD $0x93, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x39, X2, X2
+ MOVAPS X4, X5
+ PUNPCKLLQ X11, X5
+ PBLENDW $0xc0, X12, X5
+ SHUFPS $0xb4, X5, X5
+ PADDD X5, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X8, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X5
+ PSRLL $0x0c, X1
+ PSLLL $0x14, X5
+ POR X5, X1
+ MOVAPS X11, X6
+ PUNPCKHLQ X4, X6
+ MOVAPS X12, X4
+ PUNPCKLLQ X6, X4
+ SHUFPS $0x1e, X4, X4
+ PADDD X4, X0
+ PADDD X1, X0
+ PXOR X0, X3
+ PSHUFB X9, X3
+ PADDD X3, X2
+ PXOR X2, X1
+ MOVAPS X1, X4
+ PSRLL $0x07, X1
+ PSLLL $0x19, X4
+ POR X4, X1
+ PSHUFD $0x39, X0, X0
+ PSHUFD $0x4e, X3, X3
+ PSHUFD $0x93, X2, X2
+
+ // finalize
+ PXOR X2, X0
+ PXOR X3, X1
+ MOVUPS (AX), X4
+ PXOR X4, X2
+ MOVUPS 16(AX), X4
+ PXOR X4, X3
+ MOVUPS X0, (SI)
+ MOVUPS X1, 16(SI)
+ MOVUPS X2, 32(SI)
+ MOVUPS X3, 48(SI)
+ RET
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_other.go b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_other.go
new file mode 100644
index 000000000..cd63e9740
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/impl_other.go
@@ -0,0 +1,9 @@
+// +build !amd64
+
+package compress_sse41
+
+import "github.com/zeebo/blake3/internal/alg/compress/compress_pure"
+
+func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32) {
+ compress_pure.Compress(chain, block, counter, blen, flags, out)
+}
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/stubs.go b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/stubs.go
new file mode 100644
index 000000000..ffd932d3c
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/compress/compress_sse41/stubs.go
@@ -0,0 +1,6 @@
+// +build amd64
+
+package compress_sse41
+
+//go:noescape
+func Compress(chain *[8]uint32, block *[16]uint32, counter uint64, blen uint32, flags uint32, out *[16]uint32)
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash.go
new file mode 100644
index 000000000..ac43abb69
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash.go
@@ -0,0 +1,23 @@
+package hash
+
+import (
+ "github.com/zeebo/blake3/internal/alg/hash/hash_avx2"
+ "github.com/zeebo/blake3/internal/alg/hash/hash_pure"
+ "github.com/zeebo/blake3/internal/consts"
+)
+
+func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) {
+ if consts.HasAVX2 && length > 2*consts.ChunkLen {
+ hash_avx2.HashF(input, length, counter, flags, key, out, chain)
+ } else {
+ hash_pure.HashF(input, length, counter, flags, key, out, chain)
+ }
+}
+
+func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int) {
+ if consts.HasAVX2 && n >= 2 {
+ hash_avx2.HashP(left, right, flags, key, out, n)
+ } else {
+ hash_pure.HashP(left, right, flags, key, out, n)
+ }
+}
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_amd64.s b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_amd64.s
new file mode 100644
index 000000000..d7531664b
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_amd64.s
@@ -0,0 +1,2561 @@
+// Code generated by command: go run main.go. DO NOT EDIT.
+
+#include "textflag.h"
+
+DATA iv<>+0(SB)/4, $0x6a09e667
+DATA iv<>+4(SB)/4, $0xbb67ae85
+DATA iv<>+8(SB)/4, $0x3c6ef372
+DATA iv<>+12(SB)/4, $0xa54ff53a
+DATA iv<>+16(SB)/4, $0x510e527f
+DATA iv<>+20(SB)/4, $0x9b05688c
+DATA iv<>+24(SB)/4, $0x1f83d9ab
+DATA iv<>+28(SB)/4, $0x5be0cd19
+GLOBL iv<>(SB), RODATA|NOPTR, $32
+
+DATA rot16_shuf<>+0(SB)/1, $0x02
+DATA rot16_shuf<>+1(SB)/1, $0x03
+DATA rot16_shuf<>+2(SB)/1, $0x00
+DATA rot16_shuf<>+3(SB)/1, $0x01
+DATA rot16_shuf<>+4(SB)/1, $0x06
+DATA rot16_shuf<>+5(SB)/1, $0x07
+DATA rot16_shuf<>+6(SB)/1, $0x04
+DATA rot16_shuf<>+7(SB)/1, $0x05
+DATA rot16_shuf<>+8(SB)/1, $0x0a
+DATA rot16_shuf<>+9(SB)/1, $0x0b
+DATA rot16_shuf<>+10(SB)/1, $0x08
+DATA rot16_shuf<>+11(SB)/1, $0x09
+DATA rot16_shuf<>+12(SB)/1, $0x0e
+DATA rot16_shuf<>+13(SB)/1, $0x0f
+DATA rot16_shuf<>+14(SB)/1, $0x0c
+DATA rot16_shuf<>+15(SB)/1, $0x0d
+DATA rot16_shuf<>+16(SB)/1, $0x12
+DATA rot16_shuf<>+17(SB)/1, $0x13
+DATA rot16_shuf<>+18(SB)/1, $0x10
+DATA rot16_shuf<>+19(SB)/1, $0x11
+DATA rot16_shuf<>+20(SB)/1, $0x16
+DATA rot16_shuf<>+21(SB)/1, $0x17
+DATA rot16_shuf<>+22(SB)/1, $0x14
+DATA rot16_shuf<>+23(SB)/1, $0x15
+DATA rot16_shuf<>+24(SB)/1, $0x1a
+DATA rot16_shuf<>+25(SB)/1, $0x1b
+DATA rot16_shuf<>+26(SB)/1, $0x18
+DATA rot16_shuf<>+27(SB)/1, $0x19
+DATA rot16_shuf<>+28(SB)/1, $0x1e
+DATA rot16_shuf<>+29(SB)/1, $0x1f
+DATA rot16_shuf<>+30(SB)/1, $0x1c
+DATA rot16_shuf<>+31(SB)/1, $0x1d
+GLOBL rot16_shuf<>(SB), RODATA|NOPTR, $32
+
+DATA rot8_shuf<>+0(SB)/1, $0x01
+DATA rot8_shuf<>+1(SB)/1, $0x02
+DATA rot8_shuf<>+2(SB)/1, $0x03
+DATA rot8_shuf<>+3(SB)/1, $0x00
+DATA rot8_shuf<>+4(SB)/1, $0x05
+DATA rot8_shuf<>+5(SB)/1, $0x06
+DATA rot8_shuf<>+6(SB)/1, $0x07
+DATA rot8_shuf<>+7(SB)/1, $0x04
+DATA rot8_shuf<>+8(SB)/1, $0x09
+DATA rot8_shuf<>+9(SB)/1, $0x0a
+DATA rot8_shuf<>+10(SB)/1, $0x0b
+DATA rot8_shuf<>+11(SB)/1, $0x08
+DATA rot8_shuf<>+12(SB)/1, $0x0d
+DATA rot8_shuf<>+13(SB)/1, $0x0e
+DATA rot8_shuf<>+14(SB)/1, $0x0f
+DATA rot8_shuf<>+15(SB)/1, $0x0c
+DATA rot8_shuf<>+16(SB)/1, $0x11
+DATA rot8_shuf<>+17(SB)/1, $0x12
+DATA rot8_shuf<>+18(SB)/1, $0x13
+DATA rot8_shuf<>+19(SB)/1, $0x10
+DATA rot8_shuf<>+20(SB)/1, $0x15
+DATA rot8_shuf<>+21(SB)/1, $0x16
+DATA rot8_shuf<>+22(SB)/1, $0x17
+DATA rot8_shuf<>+23(SB)/1, $0x14
+DATA rot8_shuf<>+24(SB)/1, $0x19
+DATA rot8_shuf<>+25(SB)/1, $0x1a
+DATA rot8_shuf<>+26(SB)/1, $0x1b
+DATA rot8_shuf<>+27(SB)/1, $0x18
+DATA rot8_shuf<>+28(SB)/1, $0x1d
+DATA rot8_shuf<>+29(SB)/1, $0x1e
+DATA rot8_shuf<>+30(SB)/1, $0x1f
+DATA rot8_shuf<>+31(SB)/1, $0x1c
+GLOBL rot8_shuf<>(SB), RODATA|NOPTR, $32
+
+DATA block_len<>+0(SB)/4, $0x00000040
+DATA block_len<>+4(SB)/4, $0x00000040
+DATA block_len<>+8(SB)/4, $0x00000040
+DATA block_len<>+12(SB)/4, $0x00000040
+DATA block_len<>+16(SB)/4, $0x00000040
+DATA block_len<>+20(SB)/4, $0x00000040
+DATA block_len<>+24(SB)/4, $0x00000040
+DATA block_len<>+28(SB)/4, $0x00000040
+GLOBL block_len<>(SB), RODATA|NOPTR, $32
+
+DATA zero<>+0(SB)/4, $0x00000000
+DATA zero<>+4(SB)/4, $0x00000000
+DATA zero<>+8(SB)/4, $0x00000000
+DATA zero<>+12(SB)/4, $0x00000000
+DATA zero<>+16(SB)/4, $0x00000000
+DATA zero<>+20(SB)/4, $0x00000000
+DATA zero<>+24(SB)/4, $0x00000000
+DATA zero<>+28(SB)/4, $0x00000000
+GLOBL zero<>(SB), RODATA|NOPTR, $32
+
+DATA counter<>+0(SB)/8, $0x0000000000000000
+DATA counter<>+8(SB)/8, $0x0000000000000001
+DATA counter<>+16(SB)/8, $0x0000000000000002
+DATA counter<>+24(SB)/8, $0x0000000000000003
+DATA counter<>+32(SB)/8, $0x0000000000000004
+DATA counter<>+40(SB)/8, $0x0000000000000005
+DATA counter<>+48(SB)/8, $0x0000000000000006
+DATA counter<>+56(SB)/8, $0x0000000000000007
+GLOBL counter<>(SB), RODATA|NOPTR, $64
+
+// func HashF(input *[8192]byte, length uint64, counter uint64, flags uint32, key *[8]uint32, out *[32]uint32, chain *[8]uint32)
+// Requires: AVX, AVX2
+TEXT ·HashF(SB), $688-56
+ MOVQ input+0(FP), AX
+ MOVQ length+8(FP), CX
+ MOVQ counter+16(FP), DX
+ MOVL flags+24(FP), BX
+ MOVQ key+32(FP), BP
+ MOVQ out+40(FP), SI
+ MOVQ chain+48(FP), DI
+
+ // Allocate local space and align it
+ LEAQ 31(SP), R10
+ MOVQ $0x000000000000001f, R8
+ NOTQ R8
+ ANDQ R8, R10
+
+ // Skip if the length is zero
+ XORQ R8, R8
+ XORQ R9, R9
+ TESTQ CX, CX
+ JZ skip_compute
+
+ // Compute complete chunks and blocks
+ SUBQ $0x01, CX
+ MOVQ CX, R8
+ SHRQ $0x0a, R8
+ MOVQ CX, R9
+ ANDQ $0x000003c0, R9
+
+skip_compute:
+ // Load some params into the stack (avo improvment?)
+ MOVL BX, 64(SP)
+ MOVQ DX, 72(SP)
+
+ // Load IV into vectors
+ VPBROADCASTD (BP), Y0
+ VPBROADCASTD 4(BP), Y1
+ VPBROADCASTD 8(BP), Y2
+ VPBROADCASTD 12(BP), Y3
+ VPBROADCASTD 16(BP), Y4
+ VPBROADCASTD 20(BP), Y5
+ VPBROADCASTD 24(BP), Y6
+ VPBROADCASTD 28(BP), Y7
+
+ // Build and store counter data on the stack
+ VPBROADCASTQ 72(SP), Y8
+ VPADDQ counter<>+0(SB), Y8, Y8
+ VPBROADCASTQ 72(SP), Y9
+ VPADDQ counter<>+32(SB), Y9, Y9
+ VPUNPCKLDQ Y9, Y8, Y10
+ VPUNPCKHDQ Y9, Y8, Y8
+ VPUNPCKLDQ Y8, Y10, Y9
+ VPUNPCKHDQ Y8, Y10, Y8
+ VPERMQ $0xd8, Y9, Y9
+ VPERMQ $0xd8, Y8, Y8
+ VMOVDQU Y9, 112(SP)
+ VMOVDQU Y8, 144(SP)
+
+ // Set up block flags and variables for iteration
+ XORQ CX, CX
+ ORL $0x01, 64(SP)
+
+loop:
+ // Include end flags if last block
+ CMPQ CX, $0x000003c0
+ JNE round_setup
+ ORL $0x02, 64(SP)
+
+round_setup:
+ // Load and transpose message vectors
+ VMOVDQU (AX)(CX*1), Y8
+ VMOVDQU 1024(AX)(CX*1), Y9
+ VMOVDQU 2048(AX)(CX*1), Y10
+ VMOVDQU 3072(AX)(CX*1), Y11
+ VMOVDQU 4096(AX)(CX*1), Y12
+ VMOVDQU 5120(AX)(CX*1), Y13
+ VMOVDQU 6144(AX)(CX*1), Y14
+ VMOVDQU 7168(AX)(CX*1), Y15
+ VMOVDQA Y0, (R10)
+ VPUNPCKLDQ Y9, Y8, Y0
+ VPUNPCKHDQ Y9, Y8, Y8
+ VPUNPCKLDQ Y11, Y10, Y9
+ VPUNPCKHDQ Y11, Y10, Y10
+ VPUNPCKLDQ Y13, Y12, Y11
+ VPUNPCKHDQ Y13, Y12, Y12
+ VPUNPCKLDQ Y15, Y14, Y13
+ VPUNPCKHDQ Y15, Y14, Y14
+ VPUNPCKLQDQ Y9, Y0, Y15
+ VPUNPCKHQDQ Y9, Y0, Y0
+ VPUNPCKLQDQ Y10, Y8, Y9
+ VPUNPCKHQDQ Y10, Y8, Y8
+ VPUNPCKLQDQ Y13, Y11, Y10
+ VPUNPCKHQDQ Y13, Y11, Y11
+ VPUNPCKLQDQ Y14, Y12, Y13
+ VPUNPCKHQDQ Y14, Y12, Y12
+ VINSERTI128 $0x01, X10, Y15, Y14
+ VPERM2I128 $0x31, Y10, Y15, Y10
+ VINSERTI128 $0x01, X11, Y0, Y15
+ VPERM2I128 $0x31, Y11, Y0, Y0
+ VINSERTI128 $0x01, X13, Y9, Y11
+ VPERM2I128 $0x31, Y13, Y9, Y9
+ VINSERTI128 $0x01, X12, Y8, Y13
+ VPERM2I128 $0x31, Y12, Y8, Y8
+ VMOVDQU Y14, 176(SP)
+ VMOVDQU Y15, 208(SP)
+ VMOVDQU Y11, 240(SP)
+ VMOVDQU Y13, 272(SP)
+ VMOVDQU Y10, 304(SP)
+ VMOVDQU Y0, 336(SP)
+ VMOVDQU Y9, 368(SP)
+ VMOVDQU Y8, 400(SP)
+ VMOVDQU 32(AX)(CX*1), Y0
+ VMOVDQU 1056(AX)(CX*1), Y8
+ VMOVDQU 2080(AX)(CX*1), Y9
+ VMOVDQU 3104(AX)(CX*1), Y10
+ VMOVDQU 4128(AX)(CX*1), Y11
+ VMOVDQU 5152(AX)(CX*1), Y12
+ VMOVDQU 6176(AX)(CX*1), Y13
+ VMOVDQU 7200(AX)(CX*1), Y14
+ VPUNPCKLDQ Y8, Y0, Y15
+ VPUNPCKHDQ Y8, Y0, Y0
+ VPUNPCKLDQ Y10, Y9, Y8
+ VPUNPCKHDQ Y10, Y9, Y9
+ VPUNPCKLDQ Y12, Y11, Y10
+ VPUNPCKHDQ Y12, Y11, Y11
+ VPUNPCKLDQ Y14, Y13, Y12
+ VPUNPCKHDQ Y14, Y13, Y13
+ VPUNPCKLQDQ Y8, Y15, Y14
+ VPUNPCKHQDQ Y8, Y15, Y8
+ VPUNPCKLQDQ Y9, Y0, Y15
+ VPUNPCKHQDQ Y9, Y0, Y0
+ VPUNPCKLQDQ Y12, Y10, Y9
+ VPUNPCKHQDQ Y12, Y10, Y10
+ VPUNPCKLQDQ Y13, Y11, Y12
+ VPUNPCKHQDQ Y13, Y11, Y11
+ VINSERTI128 $0x01, X9, Y14, Y13
+ VPERM2I128 $0x31, Y9, Y14, Y9
+ VINSERTI128 $0x01, X10, Y8, Y14
+ VPERM2I128 $0x31, Y10, Y8, Y8
+ VINSERTI128 $0x01, X12, Y15, Y10
+ VPERM2I128 $0x31, Y12, Y15, Y12
+ VINSERTI128 $0x01, X11, Y0, Y15
+ VPERM2I128 $0x31, Y11, Y0, Y0
+ VMOVDQU Y13, 432(SP)
+ VMOVDQU Y14, 464(SP)
+ VMOVDQU Y10, 496(SP)
+ VMOVDQU Y15, 528(SP)
+ VMOVDQU Y9, 560(SP)
+ VMOVDQU Y8, 592(SP)
+ VMOVDQU Y12, 624(SP)
+ VMOVDQU Y0, 656(SP)
+
+ // Load constants for the round
+ VMOVDQA (R10), Y0
+ VMOVDQU block_len<>+0(SB), Y8
+ VPBROADCASTD 64(SP), Y9
+ VPBROADCASTD iv<>+0(SB), Y10
+ VPBROADCASTD iv<>+4(SB), Y11
+ VPBROADCASTD iv<>+8(SB), Y12
+ VPBROADCASTD iv<>+12(SB), Y13
+ VMOVDQU 112(SP), Y14
+ VMOVDQU 144(SP), Y15
+
+ // Save state for partial chunk if necessary
+ CMPQ CX, R9
+ JNE begin_rounds
+ VMOVDQU Y0, 80(SP)
+ MOVL 80(SP)(R8*4), DX
+ MOVL DX, (DI)
+ VMOVDQU Y1, 80(SP)
+ MOVL 80(SP)(R8*4), DX
+ MOVL DX, 4(DI)
+ VMOVDQU Y2, 80(SP)
+ MOVL 80(SP)(R8*4), DX
+ MOVL DX, 8(DI)
+ VMOVDQU Y3, 80(SP)
+ MOVL 80(SP)(R8*4), DX
+ MOVL DX, 12(DI)
+ VMOVDQU Y4, 80(SP)
+ MOVL 80(SP)(R8*4), DX
+ MOVL DX, 16(DI)
+ VMOVDQU Y5, 80(SP)
+ MOVL 80(SP)(R8*4), DX
+ MOVL DX, 20(DI)
+ VMOVDQU Y6, 80(SP)
+ MOVL 80(SP)(R8*4), DX
+ MOVL DX, 24(DI)
+ VMOVDQU Y7, 80(SP)
+ MOVL 80(SP)(R8*4), DX
+ MOVL DX, 28(DI)
+
+begin_rounds:
+ // Perform the rounds
+ // Round 1
+ VPADDD 176(SP), Y0, Y0
+ VPADDD 240(SP), Y1, Y1
+ VPADDD 304(SP), Y2, Y2
+ VPADDD 368(SP), Y3, Y3
+ VPADDD Y4, Y0, Y0
+ VPXOR Y0, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y7, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y4, Y4
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y5, Y5
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y6, Y6
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y7, Y7
+ VMOVDQA Y0, (R10)
+ VPSRLD $0x0c, Y4, Y0
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y0, Y4, Y0
+ VPSRLD $0x0c, Y5, Y4
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y4, Y5, Y4
+ VPSRLD $0x0c, Y6, Y5
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y5, Y6, Y5
+ VPSRLD $0x0c, Y7, Y6
+ VPSLLD $0x14, Y7, Y7
+ VPOR Y6, Y7, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 208(SP), Y7, Y7
+ VPADDD 272(SP), Y1, Y1
+ VPADDD 336(SP), Y2, Y2
+ VPADDD 400(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 432(SP), Y7, Y7
+ VPADDD 496(SP), Y1, Y1
+ VPADDD 560(SP), Y2, Y2
+ VPADDD 624(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VMOVDQA (R10), Y7
+ VPADDD 464(SP), Y7, Y7
+ VPADDD 528(SP), Y1, Y1
+ VPADDD 592(SP), Y2, Y2
+ VPADDD 656(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+
+ // Round 2
+ VMOVDQA (R10), Y7
+ VPADDD 240(SP), Y7, Y7
+ VPADDD 272(SP), Y1, Y1
+ VPADDD 400(SP), Y2, Y2
+ VPADDD 304(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 368(SP), Y7, Y7
+ VPADDD 496(SP), Y1, Y1
+ VPADDD 176(SP), Y2, Y2
+ VPADDD 592(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 208(SP), Y7, Y7
+ VPADDD 560(SP), Y1, Y1
+ VPADDD 464(SP), Y2, Y2
+ VPADDD 656(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VMOVDQA (R10), Y7
+ VPADDD 528(SP), Y7, Y7
+ VPADDD 336(SP), Y1, Y1
+ VPADDD 624(SP), Y2, Y2
+ VPADDD 432(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+
+ // Round 3
+ VMOVDQA (R10), Y7
+ VPADDD 272(SP), Y7, Y7
+ VPADDD 496(SP), Y1, Y1
+ VPADDD 592(SP), Y2, Y2
+ VPADDD 400(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 304(SP), Y7, Y7
+ VPADDD 560(SP), Y1, Y1
+ VPADDD 240(SP), Y2, Y2
+ VPADDD 624(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 368(SP), Y7, Y7
+ VPADDD 464(SP), Y1, Y1
+ VPADDD 528(SP), Y2, Y2
+ VPADDD 432(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VMOVDQA (R10), Y7
+ VPADDD 336(SP), Y7, Y7
+ VPADDD 176(SP), Y1, Y1
+ VPADDD 656(SP), Y2, Y2
+ VPADDD 208(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+
+ // Round 4
+ VMOVDQA (R10), Y7
+ VPADDD 496(SP), Y7, Y7
+ VPADDD 560(SP), Y1, Y1
+ VPADDD 624(SP), Y2, Y2
+ VPADDD 592(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 400(SP), Y7, Y7
+ VPADDD 464(SP), Y1, Y1
+ VPADDD 272(SP), Y2, Y2
+ VPADDD 656(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 304(SP), Y7, Y7
+ VPADDD 528(SP), Y1, Y1
+ VPADDD 336(SP), Y2, Y2
+ VPADDD 208(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VMOVDQA (R10), Y7
+ VPADDD 176(SP), Y7, Y7
+ VPADDD 240(SP), Y1, Y1
+ VPADDD 432(SP), Y2, Y2
+ VPADDD 368(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+
+ // Round 5
+ VMOVDQA (R10), Y7
+ VPADDD 560(SP), Y7, Y7
+ VPADDD 464(SP), Y1, Y1
+ VPADDD 656(SP), Y2, Y2
+ VPADDD 624(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 592(SP), Y7, Y7
+ VPADDD 528(SP), Y1, Y1
+ VPADDD 496(SP), Y2, Y2
+ VPADDD 432(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 400(SP), Y7, Y7
+ VPADDD 336(SP), Y1, Y1
+ VPADDD 176(SP), Y2, Y2
+ VPADDD 368(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VMOVDQA (R10), Y7
+ VPADDD 240(SP), Y7, Y7
+ VPADDD 272(SP), Y1, Y1
+ VPADDD 208(SP), Y2, Y2
+ VPADDD 304(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+
+ // Round 6
+ VMOVDQA (R10), Y7
+ VPADDD 464(SP), Y7, Y7
+ VPADDD 528(SP), Y1, Y1
+ VPADDD 432(SP), Y2, Y2
+ VPADDD 656(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 624(SP), Y7, Y7
+ VPADDD 336(SP), Y1, Y1
+ VPADDD 560(SP), Y2, Y2
+ VPADDD 208(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 592(SP), Y7, Y7
+ VPADDD 176(SP), Y1, Y1
+ VPADDD 240(SP), Y2, Y2
+ VPADDD 304(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VMOVDQA (R10), Y7
+ VPADDD 272(SP), Y7, Y7
+ VPADDD 496(SP), Y1, Y1
+ VPADDD 368(SP), Y2, Y2
+ VPADDD 400(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+
+ // Round 7
+ VMOVDQA (R10), Y7
+ VPADDD 528(SP), Y7, Y7
+ VPADDD 336(SP), Y1, Y1
+ VPADDD 208(SP), Y2, Y2
+ VPADDD 432(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 656(SP), Y7, Y7
+ VPADDD 176(SP), Y1, Y1
+ VPADDD 464(SP), Y2, Y2
+ VPADDD 368(SP), Y3, Y3
+ VPADDD Y0, Y7, Y7
+ VPXOR Y7, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y5, Y2, Y2
+ VPXOR Y2, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y6, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y14, Y10, Y10
+ VPXOR Y10, Y0, Y0
+ VPADDD Y15, Y11, Y11
+ VPXOR Y11, Y4, Y4
+ VPADDD Y8, Y12, Y12
+ VPXOR Y12, Y5, Y5
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VMOVDQA (R10), Y7
+ VPADDD 624(SP), Y7, Y7
+ VPADDD 240(SP), Y1, Y1
+ VPADDD 272(SP), Y2, Y2
+ VPADDD 400(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot16_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot16_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot16_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x0c, Y4, Y7
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x0c, Y5, Y7
+ VPSLLD $0x14, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x0c, Y6, Y7
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x0c, Y0, Y7
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y7, Y0, Y0
+ VMOVDQA (R10), Y7
+ VPADDD 496(SP), Y7, Y7
+ VPADDD 560(SP), Y1, Y1
+ VPADDD 304(SP), Y2, Y2
+ VPADDD 592(SP), Y3, Y3
+ VPADDD Y4, Y7, Y7
+ VPXOR Y7, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y5, Y1, Y1
+ VPXOR Y1, Y14, Y14
+ VPSHUFB rot8_shuf<>+0(SB), Y14, Y14
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y15, Y15
+ VPSHUFB rot8_shuf<>+0(SB), Y15, Y15
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y8, Y8
+ VPSHUFB rot8_shuf<>+0(SB), Y8, Y8
+ VPADDD Y9, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPADDD Y14, Y13, Y13
+ VPXOR Y13, Y5, Y5
+ VPADDD Y15, Y10, Y10
+ VPXOR Y10, Y6, Y6
+ VPADDD Y8, Y11, Y11
+ VPXOR Y11, Y0, Y0
+ VMOVDQA Y7, (R10)
+ VPSRLD $0x07, Y4, Y7
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y7, Y4, Y4
+ VPSRLD $0x07, Y5, Y7
+ VPSLLD $0x19, Y5, Y5
+ VPOR Y7, Y5, Y5
+ VPSRLD $0x07, Y6, Y7
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y7, Y6, Y6
+ VPSRLD $0x07, Y0, Y7
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y7, Y0, Y0
+
+ // Finalize rounds
+ VPXOR Y9, Y6, Y6
+ VPXOR (R10), Y10, Y7
+ VPXOR Y11, Y1, Y1
+ VPXOR Y12, Y2, Y2
+ VPXOR Y13, Y3, Y3
+ VPXOR Y14, Y0, Y0
+ VPXOR Y15, Y4, Y4
+ VPXOR Y8, Y5, Y5
+
+ // Fix up registers for next iteration
+ VMOVDQU Y7, Y8
+ VMOVDQU Y6, Y7
+ VMOVDQU Y5, Y6
+ VMOVDQU Y4, Y5
+ VMOVDQU Y0, Y4
+ VMOVDQU Y8, Y0
+
+	// If the input fits in a single chunk and this was its final block, we're done
+ CMPQ R8, $0x00
+ JNE loop_trailer
+ CMPQ R9, CX
+ JEQ finalize
+
+loop_trailer:
+ // Increment, reset flags, and loop
+ CMPQ CX, $0x000003c0
+ JEQ finalize
+ ADDQ $0x40, CX
+ MOVL BX, 64(SP)
+ JMP loop
+
+finalize:
+ // Store result into out
+ VMOVDQU Y0, (SI)
+ VMOVDQU Y1, 32(SI)
+ VMOVDQU Y2, 64(SI)
+ VMOVDQU Y3, 96(SI)
+ VMOVDQU Y4, 128(SI)
+ VMOVDQU Y5, 160(SI)
+ VMOVDQU Y6, 192(SI)
+ VMOVDQU Y7, 224(SI)
+ VZEROUPPER
+ RET
+
+// func HashP(left *[64]uint32, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int)
+// Requires: AVX, AVX2
+TEXT ·HashP(SB), NOSPLIT, $72-48
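+	// One parent compression per 32-bit lane: each Y register holds the same
+	// state word for 8 independent parents, message word w of parent i is read
+	// from left/right at index i+8*w, and one state row is parked in the
+	// 32-byte-aligned scratch slot at (SI) to free a register for the rotates.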
+ MOVQ left+0(FP), AX
+ MOVQ right+8(FP), CX
+ MOVBLZX flags+16(FP), DX
+ MOVQ key+24(FP), BX
+ MOVQ out+32(FP), BP
+
+ // Allocate local space and align it
+ LEAQ 31(SP), SI
+ MOVQ $0x000000000000001f, DI
+ NOTQ DI
+ ANDQ DI, SI
+
+ // Set up flags value
+ MOVL DX, 64(SP)
+
+ // Perform the rounds
+ // Round 1
+ VPBROADCASTD (BX), Y0
+ VPADDD (AX), Y0, Y0
+ VPBROADCASTD 4(BX), Y1
+ VPADDD 64(AX), Y1, Y1
+ VPBROADCASTD 8(BX), Y2
+ VPADDD 128(AX), Y2, Y2
+ VPBROADCASTD 12(BX), Y3
+ VPADDD 192(AX), Y3, Y3
+ VPBROADCASTD 16(BX), Y4
+ VPADDD Y4, Y0, Y0
+ VMOVDQU zero<>+0(SB), Y5
+ VPXOR Y0, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPBROADCASTD 20(BX), Y6
+ VPADDD Y6, Y1, Y1
+ VMOVDQU zero<>+0(SB), Y7
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPBROADCASTD 24(BX), Y8
+ VPADDD Y8, Y2, Y2
+ VMOVDQU block_len<>+0(SB), Y9
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPBROADCASTD 28(BX), Y10
+ VPADDD Y10, Y3, Y3
+ VPBROADCASTD 64(SP), Y11
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPBROADCASTD iv<>+0(SB), Y12
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y4, Y4
+ VPBROADCASTD iv<>+4(SB), Y13
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y6, Y6
+ VPBROADCASTD iv<>+8(SB), Y14
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y8, Y8
+ VPBROADCASTD iv<>+12(SB), Y15
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y10, Y10
+ VMOVDQA Y0, (SI)
+ VPSRLD $0x0c, Y4, Y0
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y0, Y4, Y0
+ VPSRLD $0x0c, Y6, Y4
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y4, Y6, Y4
+ VPSRLD $0x0c, Y8, Y6
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y6, Y8, Y6
+ VPSRLD $0x0c, Y10, Y8
+ VPSLLD $0x14, Y10, Y10
+ VPOR Y8, Y10, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 32(AX), Y10, Y10
+ VPADDD 96(AX), Y1, Y1
+ VPADDD 160(AX), Y2, Y2
+ VPADDD 224(AX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD (CX), Y10, Y10
+ VPADDD 64(CX), Y1, Y1
+ VPADDD 128(CX), Y2, Y2
+ VPADDD 192(CX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VMOVDQA (SI), Y10
+ VPADDD 32(CX), Y10, Y10
+ VPADDD 96(CX), Y1, Y1
+ VPADDD 160(CX), Y2, Y2
+ VPADDD 224(CX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+
+ // Round 2
+ VMOVDQA (SI), Y10
+ VPADDD 64(AX), Y10, Y10
+ VPADDD 96(AX), Y1, Y1
+ VPADDD 224(AX), Y2, Y2
+ VPADDD 128(AX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 192(AX), Y10, Y10
+ VPADDD 64(CX), Y1, Y1
+ VPADDD (AX), Y2, Y2
+ VPADDD 160(CX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 32(AX), Y10, Y10
+ VPADDD 128(CX), Y1, Y1
+ VPADDD 32(CX), Y2, Y2
+ VPADDD 224(CX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VMOVDQA (SI), Y10
+ VPADDD 96(CX), Y10, Y10
+ VPADDD 160(AX), Y1, Y1
+ VPADDD 192(CX), Y2, Y2
+ VPADDD (CX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+
+ // Round 3
+ VMOVDQA (SI), Y10
+ VPADDD 96(AX), Y10, Y10
+ VPADDD 64(CX), Y1, Y1
+ VPADDD 160(CX), Y2, Y2
+ VPADDD 224(AX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 128(AX), Y10, Y10
+ VPADDD 128(CX), Y1, Y1
+ VPADDD 64(AX), Y2, Y2
+ VPADDD 192(CX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 192(AX), Y10, Y10
+ VPADDD 32(CX), Y1, Y1
+ VPADDD 96(CX), Y2, Y2
+ VPADDD (CX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VMOVDQA (SI), Y10
+ VPADDD 160(AX), Y10, Y10
+ VPADDD (AX), Y1, Y1
+ VPADDD 224(CX), Y2, Y2
+ VPADDD 32(AX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+
+ // Round 4
+ VMOVDQA (SI), Y10
+ VPADDD 64(CX), Y10, Y10
+ VPADDD 128(CX), Y1, Y1
+ VPADDD 192(CX), Y2, Y2
+ VPADDD 160(CX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 224(AX), Y10, Y10
+ VPADDD 32(CX), Y1, Y1
+ VPADDD 96(AX), Y2, Y2
+ VPADDD 224(CX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 128(AX), Y10, Y10
+ VPADDD 96(CX), Y1, Y1
+ VPADDD 160(AX), Y2, Y2
+ VPADDD 32(AX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VMOVDQA (SI), Y10
+ VPADDD (AX), Y10, Y10
+ VPADDD 64(AX), Y1, Y1
+ VPADDD (CX), Y2, Y2
+ VPADDD 192(AX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+
+ // Round 5
+ VMOVDQA (SI), Y10
+ VPADDD 128(CX), Y10, Y10
+ VPADDD 32(CX), Y1, Y1
+ VPADDD 224(CX), Y2, Y2
+ VPADDD 192(CX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 160(CX), Y10, Y10
+ VPADDD 96(CX), Y1, Y1
+ VPADDD 64(CX), Y2, Y2
+ VPADDD (CX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 224(AX), Y10, Y10
+ VPADDD 160(AX), Y1, Y1
+ VPADDD (AX), Y2, Y2
+ VPADDD 192(AX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VMOVDQA (SI), Y10
+ VPADDD 64(AX), Y10, Y10
+ VPADDD 96(AX), Y1, Y1
+ VPADDD 32(AX), Y2, Y2
+ VPADDD 128(AX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+
+ // Round 6
+ VMOVDQA (SI), Y10
+ VPADDD 32(CX), Y10, Y10
+ VPADDD 96(CX), Y1, Y1
+ VPADDD (CX), Y2, Y2
+ VPADDD 224(CX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 192(CX), Y10, Y10
+ VPADDD 160(AX), Y1, Y1
+ VPADDD 128(CX), Y2, Y2
+ VPADDD 32(AX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 160(CX), Y10, Y10
+ VPADDD (AX), Y1, Y1
+ VPADDD 64(AX), Y2, Y2
+ VPADDD 128(AX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VMOVDQA (SI), Y10
+ VPADDD 96(AX), Y10, Y10
+ VPADDD 64(CX), Y1, Y1
+ VPADDD 192(AX), Y2, Y2
+ VPADDD 224(AX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+
+ // Round 7
+ VMOVDQA (SI), Y10
+ VPADDD 96(CX), Y10, Y10
+ VPADDD 160(AX), Y1, Y1
+ VPADDD 32(AX), Y2, Y2
+ VPADDD (CX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 224(CX), Y10, Y10
+ VPADDD (AX), Y1, Y1
+ VPADDD 32(CX), Y2, Y2
+ VPADDD 192(AX), Y3, Y3
+ VPADDD Y0, Y10, Y10
+ VPXOR Y10, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y4, Y1, Y1
+ VPXOR Y1, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y6, Y2, Y2
+ VPXOR Y2, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y8, Y3, Y3
+ VPXOR Y3, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y5, Y12, Y12
+ VPXOR Y12, Y0, Y0
+ VPADDD Y7, Y13, Y13
+ VPXOR Y13, Y4, Y4
+ VPADDD Y9, Y14, Y14
+ VPXOR Y14, Y6, Y6
+ VPADDD Y11, Y15, Y15
+ VPXOR Y15, Y8, Y8
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VMOVDQA (SI), Y10
+ VPADDD 192(CX), Y10, Y10
+ VPADDD 64(AX), Y1, Y1
+ VPADDD 96(AX), Y2, Y2
+ VPADDD 224(AX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot16_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot16_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot16_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot16_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x0c, Y4, Y10
+ VPSLLD $0x14, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x0c, Y6, Y10
+ VPSLLD $0x14, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x0c, Y8, Y10
+ VPSLLD $0x14, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x0c, Y0, Y10
+ VPSLLD $0x14, Y0, Y0
+ VPOR Y10, Y0, Y0
+ VMOVDQA (SI), Y10
+ VPADDD 64(CX), Y10, Y10
+ VPADDD 128(CX), Y1, Y1
+ VPADDD 128(AX), Y2, Y2
+ VPADDD 160(CX), Y3, Y3
+ VPADDD Y4, Y10, Y10
+ VPXOR Y10, Y11, Y11
+ VPSHUFB rot8_shuf<>+0(SB), Y11, Y11
+ VPADDD Y6, Y1, Y1
+ VPXOR Y1, Y5, Y5
+ VPSHUFB rot8_shuf<>+0(SB), Y5, Y5
+ VPADDD Y8, Y2, Y2
+ VPXOR Y2, Y7, Y7
+ VPSHUFB rot8_shuf<>+0(SB), Y7, Y7
+ VPADDD Y0, Y3, Y3
+ VPXOR Y3, Y9, Y9
+ VPSHUFB rot8_shuf<>+0(SB), Y9, Y9
+ VPADDD Y11, Y14, Y14
+ VPXOR Y14, Y4, Y4
+ VPADDD Y5, Y15, Y15
+ VPXOR Y15, Y6, Y6
+ VPADDD Y7, Y12, Y12
+ VPXOR Y12, Y8, Y8
+ VPADDD Y9, Y13, Y13
+ VPXOR Y13, Y0, Y0
+ VMOVDQA Y10, (SI)
+ VPSRLD $0x07, Y4, Y10
+ VPSLLD $0x19, Y4, Y4
+ VPOR Y10, Y4, Y4
+ VPSRLD $0x07, Y6, Y10
+ VPSLLD $0x19, Y6, Y6
+ VPOR Y10, Y6, Y6
+ VPSRLD $0x07, Y8, Y10
+ VPSLLD $0x19, Y8, Y8
+ VPOR Y10, Y8, Y8
+ VPSRLD $0x07, Y0, Y10
+ VPSLLD $0x19, Y0, Y0
+ VPOR Y10, Y0, Y0
+
+ // Finalize
+ VPXOR (SI), Y12, Y10
+ VPXOR Y13, Y1, Y1
+ VPXOR Y14, Y2, Y2
+ VPXOR Y15, Y3, Y3
+ VPXOR Y5, Y0, Y0
+ VPXOR Y7, Y4, Y4
+ VPXOR Y9, Y6, Y5
+ VPXOR Y11, Y8, Y6
+
+ // Store result into out
+ VMOVDQU Y10, (BP)
+ VMOVDQU Y1, 32(BP)
+ VMOVDQU Y2, 64(BP)
+ VMOVDQU Y3, 96(BP)
+ VMOVDQU Y0, 128(BP)
+ VMOVDQU Y4, 160(BP)
+ VMOVDQU Y5, 192(BP)
+ VMOVDQU Y6, 224(BP)
+ VZEROUPPER
+ RET
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_other.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_other.go
new file mode 100644
index 000000000..613972814
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/impl_other.go
@@ -0,0 +1,13 @@
+// +build !amd64
+
+package hash_avx2
+
+import "github.com/zeebo/blake3/internal/alg/hash/hash_pure"
+
+func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) {
+ hash_pure.HashF(input, length, counter, flags, key, out, chain)
+}
+
+func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int) {
+ hash_pure.HashP(left, right, flags, key, out, n)
+}
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/stubs.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/stubs.go
new file mode 100644
index 000000000..10e949550
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_avx2/stubs.go
@@ -0,0 +1,9 @@
+// +build amd64
+
+package hash_avx2
+
+//go:noescape
+func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32)
+
+//go:noescape
+func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int)
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashf.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashf.go
new file mode 100644
index 000000000..0c6fd63cd
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashf.go
@@ -0,0 +1,56 @@
+package hash_pure
+
+import (
+ "unsafe"
+
+ "github.com/zeebo/blake3/internal/alg/compress"
+ "github.com/zeebo/blake3/internal/consts"
+ "github.com/zeebo/blake3/internal/utils"
+)
+
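+// HashF hashes up to 8 chunks of input (1024 bytes each, the last possibly
+// partial), writing each chunk's chaining value into out in transposed form:
+// word w of chunk i lands at out[i+8*w]. For the chunk that contains the end
+// of the input, the chaining value from just before its final block is also
+// saved into *chain.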
+func HashF(input *[8192]byte, length, counter uint64, flags uint32, key *[8]uint32, out *[64]uint32, chain *[8]uint32) {
+ var tmp [16]uint32
+
+ for i := uint64(0); consts.ChunkLen*i < length && i < 8; i++ {
+ bchain := *key
+ bflags := flags | consts.Flag_ChunkStart
+ start := consts.ChunkLen * i
+
+ for n := uint64(0); n < 16; n++ {
+ if n == 15 {
+ bflags |= consts.Flag_ChunkEnd
+ }
+ if start+64*n >= length {
+ break
+ }
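+			// Snapshot the chaining value before compressing the block that
+			// reaches the end of the input, so the final (possibly partial)
+			// chunk can be finished by the caller.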
+ if start+64+64*n >= length {
+ *chain = bchain
+ }
+
+ var blockPtr *[16]uint32
+ if consts.IsLittleEndian {
+ blockPtr = (*[16]uint32)(unsafe.Pointer(&input[consts.ChunkLen*i+consts.BlockLen*n]))
+ } else {
+ var block [16]uint32
+ utils.BytesToWords((*[64]uint8)(unsafe.Pointer(&input[consts.ChunkLen*i+consts.BlockLen*n])), &block)
+ blockPtr = &block
+ }
+
+ compress.Compress(&bchain, blockPtr, counter, consts.BlockLen, bflags, &tmp)
+
+ bchain = *(*[8]uint32)(unsafe.Pointer(&tmp[0]))
+ bflags = flags
+ }
+
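+		// Store this chunk's chaining value transposed: word w goes to
+		// out[i+8*w], the same layout produced by the assembly kernels.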
+ out[i+0] = bchain[0]
+ out[i+8] = bchain[1]
+ out[i+16] = bchain[2]
+ out[i+24] = bchain[3]
+ out[i+32] = bchain[4]
+ out[i+40] = bchain[5]
+ out[i+48] = bchain[6]
+ out[i+56] = bchain[7]
+
+ counter++
+ }
+}
diff --git a/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashp.go b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashp.go
new file mode 100644
index 000000000..bee5d8dd0
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/alg/hash/hash_pure/hashp.go
@@ -0,0 +1,38 @@
+package hash_pure
+
+import "github.com/zeebo/blake3/internal/alg/compress"
+
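+// HashP compresses up to 8 parent nodes: column i (stride 8) of left and right
+// forms the 16-word block for parent i, which is compressed with key as the
+// chaining value, and the first 8 output words are stored back into out in the
+// same transposed layout.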
+func HashP(left, right *[64]uint32, flags uint32, key *[8]uint32, out *[64]uint32, n int) {
+ var tmp [16]uint32
+ var block [16]uint32
+
+ for i := 0; i < n && i < 8; i++ {
+ block[0] = left[i+0]
+ block[1] = left[i+8]
+ block[2] = left[i+16]
+ block[3] = left[i+24]
+ block[4] = left[i+32]
+ block[5] = left[i+40]
+ block[6] = left[i+48]
+ block[7] = left[i+56]
+ block[8] = right[i+0]
+ block[9] = right[i+8]
+ block[10] = right[i+16]
+ block[11] = right[i+24]
+ block[12] = right[i+32]
+ block[13] = right[i+40]
+ block[14] = right[i+48]
+ block[15] = right[i+56]
+
+ compress.Compress(key, &block, 0, 64, flags, &tmp)
+
+ out[i+0] = tmp[0]
+ out[i+8] = tmp[1]
+ out[i+16] = tmp[2]
+ out[i+24] = tmp[3]
+ out[i+32] = tmp[4]
+ out[i+40] = tmp[5]
+ out[i+48] = tmp[6]
+ out[i+56] = tmp[7]
+ }
+}
diff --git a/vendor/github.com/zeebo/blake3/internal/consts/consts.go b/vendor/github.com/zeebo/blake3/internal/consts/consts.go
new file mode 100644
index 000000000..89f08fe10
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/consts/consts.go
@@ -0,0 +1,29 @@
+package consts
+
+var IV = [...]uint32{IV0, IV1, IV2, IV3, IV4, IV5, IV6, IV7}
+
+const (
+ IV0 = 0x6A09E667
+ IV1 = 0xBB67AE85
+ IV2 = 0x3C6EF372
+ IV3 = 0xA54FF53A
+ IV4 = 0x510E527F
+ IV5 = 0x9B05688C
+ IV6 = 0x1F83D9AB
+ IV7 = 0x5BE0CD19
+)
+
+const (
+ Flag_ChunkStart uint32 = 1 << 0
+ Flag_ChunkEnd uint32 = 1 << 1
+ Flag_Parent uint32 = 1 << 2
+ Flag_Root uint32 = 1 << 3
+ Flag_Keyed uint32 = 1 << 4
+ Flag_DeriveKeyContext uint32 = 1 << 5
+ Flag_DeriveKeyMaterial uint32 = 1 << 6
+)
+
+const (
+ BlockLen = 64
+ ChunkLen = 1024
+)
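
These sizes give 16 blocks per chunk (ChunkLen/BlockLen), and the flags are OR-ed together per block: the first block of a chunk carries Flag_ChunkStart, the last carries Flag_ChunkEnd, and a message small enough to fit in a single block of a single chunk carries both plus Flag_Root. A small illustrative snippet (assumed to be compiled inside the module, since the package is internal):

package main

import (
	"fmt"

	"github.com/zeebo/blake3/internal/consts"
)

func main() {
	fmt.Println(consts.ChunkLen / consts.BlockLen) // 16 blocks per chunk

	// A one-block message is both the start and the end of its chunk, and
	// as the only chunk it is also the root of the tree.
	flags := consts.Flag_ChunkStart | consts.Flag_ChunkEnd | consts.Flag_Root
	fmt.Printf("%#x\n", flags) // 0xb
}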
diff --git a/vendor/github.com/zeebo/blake3/internal/consts/cpu.go b/vendor/github.com/zeebo/blake3/internal/consts/cpu.go
new file mode 100644
index 000000000..1eebff943
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/consts/cpu.go
@@ -0,0 +1,17 @@
+package consts
+
+import (
+ "os"
+
+ "golang.org/x/sys/cpu"
+)
+
+var (
+ HasAVX2 = cpu.X86.HasAVX2 &&
+ os.Getenv("BLAKE3_DISABLE_AVX2") == "" &&
+ os.Getenv("BLAKE3_PUREGO") == ""
+
+ HasSSE41 = cpu.X86.HasSSE41 &&
+ os.Getenv("BLAKE3_DISABLE_SSE41") == "" &&
+ os.Getenv("BLAKE3_PUREGO") == ""
+)
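
These variables are evaluated once at package initialization, so BLAKE3_DISABLE_AVX2, BLAKE3_DISABLE_SSE41, and BLAKE3_PUREGO must be set in the environment before the process starts. A trivial check of which code paths end up enabled (assumed to be compiled inside the module):

package main

import (
	"fmt"

	"github.com/zeebo/blake3/internal/consts"
)

func main() {
	// Both report false when the process is started with BLAKE3_PUREGO set.
	fmt.Println("AVX2:", consts.HasAVX2, "SSE4.1:", consts.HasSSE41)
}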
diff --git a/vendor/github.com/zeebo/blake3/internal/consts/cpu_big.go b/vendor/github.com/zeebo/blake3/internal/consts/cpu_big.go
new file mode 100644
index 000000000..fb730464f
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/consts/cpu_big.go
@@ -0,0 +1,5 @@
+// +build mips mips64 ppc64 s390x
+
+package consts
+
+const IsLittleEndian = false
diff --git a/vendor/github.com/zeebo/blake3/internal/consts/cpu_little.go b/vendor/github.com/zeebo/blake3/internal/consts/cpu_little.go
new file mode 100644
index 000000000..1bae02a74
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/consts/cpu_little.go
@@ -0,0 +1,5 @@
+// +build amd64 386 arm arm64 mipsle mips64le ppc64le riscv64 wasm
+
+package consts
+
+const IsLittleEndian = true
diff --git a/vendor/github.com/zeebo/blake3/internal/consts/cpu_other.go b/vendor/github.com/zeebo/blake3/internal/consts/cpu_other.go
new file mode 100644
index 000000000..5f7407a6a
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/consts/cpu_other.go
@@ -0,0 +1,7 @@
+// +build !mips,!mips64,!ppc64,!s390x,!amd64,!386,!arm,!arm64,!mipsle,!mips64le,!ppc64le,!riscv64,!wasm
+
+package consts
+
+import "unsafe"
+
+var IsLittleEndian = *(*uint16)(unsafe.Pointer(&[2]byte{0, 1})) != 1
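
On platforms not matched by the explicit build tags above, endianness is probed at run time: the bytes {0, 1} read as a uint16 are 0x0100 (256) on a little-endian machine and 0x0001 (1) on a big-endian one. The same probe in isolation:

package main

import (
	"fmt"
	"unsafe"
)

func main() {
	v := *(*uint16)(unsafe.Pointer(&[2]byte{0, 1}))
	fmt.Println(v, "little-endian:", v != 1) // prints "256 little-endian: true" on little-endian hosts
}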
diff --git a/vendor/github.com/zeebo/blake3/internal/utils/utils.go b/vendor/github.com/zeebo/blake3/internal/utils/utils.go
new file mode 100644
index 000000000..0b36f0f0f
--- /dev/null
+++ b/vendor/github.com/zeebo/blake3/internal/utils/utils.go
@@ -0,0 +1,60 @@
+package utils
+
+import (
+ "encoding/binary"
+ "unsafe"
+)
+
+func SliceToArray32(bytes []byte) *[32]uint8 { return (*[32]uint8)(unsafe.Pointer(&bytes[0])) }
+func SliceToArray64(bytes []byte) *[64]uint8 { return (*[64]uint8)(unsafe.Pointer(&bytes[0])) }
+
+func BytesToWords(bytes *[64]uint8, words *[16]uint32) {
+ words[0] = binary.LittleEndian.Uint32(bytes[0*4:])
+ words[1] = binary.LittleEndian.Uint32(bytes[1*4:])
+ words[2] = binary.LittleEndian.Uint32(bytes[2*4:])
+ words[3] = binary.LittleEndian.Uint32(bytes[3*4:])
+ words[4] = binary.LittleEndian.Uint32(bytes[4*4:])
+ words[5] = binary.LittleEndian.Uint32(bytes[5*4:])
+ words[6] = binary.LittleEndian.Uint32(bytes[6*4:])
+ words[7] = binary.LittleEndian.Uint32(bytes[7*4:])
+ words[8] = binary.LittleEndian.Uint32(bytes[8*4:])
+ words[9] = binary.LittleEndian.Uint32(bytes[9*4:])
+ words[10] = binary.LittleEndian.Uint32(bytes[10*4:])
+ words[11] = binary.LittleEndian.Uint32(bytes[11*4:])
+ words[12] = binary.LittleEndian.Uint32(bytes[12*4:])
+ words[13] = binary.LittleEndian.Uint32(bytes[13*4:])
+ words[14] = binary.LittleEndian.Uint32(bytes[14*4:])
+ words[15] = binary.LittleEndian.Uint32(bytes[15*4:])
+}
+
+func WordsToBytes(words *[16]uint32, bytes []byte) {
+ bytes = bytes[:64]
+ binary.LittleEndian.PutUint32(bytes[0*4:1*4], words[0])
+ binary.LittleEndian.PutUint32(bytes[1*4:2*4], words[1])
+ binary.LittleEndian.PutUint32(bytes[2*4:3*4], words[2])
+ binary.LittleEndian.PutUint32(bytes[3*4:4*4], words[3])
+ binary.LittleEndian.PutUint32(bytes[4*4:5*4], words[4])
+ binary.LittleEndian.PutUint32(bytes[5*4:6*4], words[5])
+ binary.LittleEndian.PutUint32(bytes[6*4:7*4], words[6])
+ binary.LittleEndian.PutUint32(bytes[7*4:8*4], words[7])
+ binary.LittleEndian.PutUint32(bytes[8*4:9*4], words[8])
+ binary.LittleEndian.PutUint32(bytes[9*4:10*4], words[9])
+ binary.LittleEndian.PutUint32(bytes[10*4:11*4], words[10])
+ binary.LittleEndian.PutUint32(bytes[11*4:12*4], words[11])
+ binary.LittleEndian.PutUint32(bytes[12*4:13*4], words[12])
+ binary.LittleEndian.PutUint32(bytes[13*4:14*4], words[13])
+ binary.LittleEndian.PutUint32(bytes[14*4:15*4], words[14])
+ binary.LittleEndian.PutUint32(bytes[15*4:16*4], words[15])
+}
+
+func KeyFromBytes(key []byte, out *[8]uint32) {
+ key = key[:32]
+ out[0] = binary.LittleEndian.Uint32(key[0:])
+ out[1] = binary.LittleEndian.Uint32(key[4:])
+ out[2] = binary.LittleEndian.Uint32(key[8:])
+ out[3] = binary.LittleEndian.Uint32(key[12:])
+ out[4] = binary.LittleEndian.Uint32(key[16:])
+ out[5] = binary.LittleEndian.Uint32(key[20:])
+ out[6] = binary.LittleEndian.Uint32(key[24:])
+ out[7] = binary.LittleEndian.Uint32(key[28:])
+}
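
BytesToWords and WordsToBytes are inverse little-endian (de)serializers for one 64-byte block, and KeyFromBytes does the same for a 32-byte key. A minimal round-trip sketch (assumed to be compiled inside the module, since the package is internal):

package main

import (
	"bytes"
	"fmt"

	"github.com/zeebo/blake3/internal/utils"
)

func main() {
	in := make([]byte, 64)
	for i := range in {
		in[i] = byte(i)
	}

	var words [16]uint32
	utils.BytesToWords(utils.SliceToArray64(in), &words)

	out := make([]byte, 64)
	utils.WordsToBytes(&words, out)

	fmt.Println(bytes.Equal(in, out)) // true: the conversions round-trip
}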