Diffstat (limited to 'vendor/github.com')
-rw-r--r--  vendor/github.com/zeebo/xxh3/.gitignore                      6
-rw-r--r--  vendor/github.com/zeebo/xxh3/LICENSE                        25
-rw-r--r--  vendor/github.com/zeebo/xxh3/Makefile                       27
-rw-r--r--  vendor/github.com/zeebo/xxh3/README.md                      38
-rw-r--r--  vendor/github.com/zeebo/xxh3/_compat.c                      39
-rw-r--r--  vendor/github.com/zeebo/xxh3/accum_generic.go              542
-rw-r--r--  vendor/github.com/zeebo/xxh3/accum_stubs_amd64.go           40
-rw-r--r--  vendor/github.com/zeebo/xxh3/accum_stubs_other.go           25
-rw-r--r--  vendor/github.com/zeebo/xxh3/accum_vector_avx512_amd64.s   379
-rw-r--r--  vendor/github.com/zeebo/xxh3/accum_vector_avx_amd64.s      586
-rw-r--r--  vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s     1236
-rw-r--r--  vendor/github.com/zeebo/xxh3/consts.go                      97
-rw-r--r--  vendor/github.com/zeebo/xxh3/hash128.go                    253
-rw-r--r--  vendor/github.com/zeebo/xxh3/hash128_seed.go               264
-rw-r--r--  vendor/github.com/zeebo/xxh3/hash64.go                     126
-rw-r--r--  vendor/github.com/zeebo/xxh3/hash64_seed.go                134
-rw-r--r--  vendor/github.com/zeebo/xxh3/hasher.go                     239
-rw-r--r--  vendor/github.com/zeebo/xxh3/utils.go                      129
18 files changed, 4185 insertions(+), 0 deletions(-)
diff --git a/vendor/github.com/zeebo/xxh3/.gitignore b/vendor/github.com/zeebo/xxh3/.gitignore
new file mode 100644
index 000000000..928e12f53
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/.gitignore
@@ -0,0 +1,6 @@
+upstream
+*.pprof
+xxh3.test
+.vscode
+*.txt
+_compat
diff --git a/vendor/github.com/zeebo/xxh3/LICENSE b/vendor/github.com/zeebo/xxh3/LICENSE
new file mode 100644
index 000000000..477f8e5e1
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/LICENSE
@@ -0,0 +1,25 @@
+xxHash Library
+Copyright (c) 2012-2014, Yann Collet
+Copyright (c) 2019, Jeff Wendling
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice, this
+ list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/github.com/zeebo/xxh3/Makefile b/vendor/github.com/zeebo/xxh3/Makefile
new file mode 100644
index 000000000..8bd78c482
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/Makefile
@@ -0,0 +1,27 @@
+.PHONY: all vet
+all: genasm _compat
+
+genasm: avo/avx.go avo/sse.go
+ cd ./avo; go generate gen.go
+
+clean:
+ rm accum_vector_avx_amd64.s
+ rm accum_vector_sse_amd64.s
+ rm _compat
+
+upstream/xxhash.o: upstream/xxhash.h
+ ( cd upstream && make )
+
+_compat: _compat.c upstream/xxhash.o
+ gcc -o _compat _compat.c ./upstream/xxhash.o
+
+vet:
+ GOOS=linux GOARCH=386 GO386=softfloat go vet ./...
+ GOOS=windows GOARCH=386 GO386=softfloat go vet ./...
+ GOOS=linux GOARCH=amd64 go vet ./...
+ GOOS=windows GOARCH=amd64 go vet ./...
+ GOOS=darwin GOARCH=amd64 go vet ./...
+ GOOS=linux GOARCH=arm go vet ./...
+ GOOS=linux GOARCH=arm64 go vet ./...
+ GOOS=windows GOARCH=arm64 go vet ./...
+ GOOS=darwin GOARCH=arm64 go vet ./...
\ No newline at end of file
diff --git a/vendor/github.com/zeebo/xxh3/README.md b/vendor/github.com/zeebo/xxh3/README.md
new file mode 100644
index 000000000..4633fc03a
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/README.md
@@ -0,0 +1,38 @@
+# XXH3
+[![GoDoc](https://godoc.org/github.com/zeebo/xxh3?status.svg)](https://godoc.org/github.com/zeebo/xxh3)
+[![Sourcegraph](https://sourcegraph.com/github.com/zeebo/xxh3/-/badge.svg)](https://sourcegraph.com/github.com/zeebo/xxh3?badge)
+[![Go Report Card](https://goreportcard.com/badge/github.com/zeebo/xxh3)](https://goreportcard.com/report/github.com/zeebo/xxh3)
+
+This package is a port of the [xxh3](https://github.com/Cyan4973/xxHash) library to Go.
+
+Upstream has fixed the output as of v0.8.0, and this package matches that.
+
+---
+
+# Benchmarks
+
+Run on my `i7-8850H CPU @ 2.60GHz`
+
+## Small Sizes
+
+| Bytes | Rate |
+|-----------|--------------------------------------|
+|` 0 ` |` 0.74 ns/op ` |
+|` 1-3 ` |` 4.19 ns/op (0.24 GB/s - 0.71 GB/s) `|
+|` 4-8 ` |` 4.16 ns/op (0.97 GB/s - 1.98 GB/s) `|
+|` 9-16 ` |` 4.46 ns/op (2.02 GB/s - 3.58 GB/s) `|
+|` 17-32 ` |` 6.22 ns/op (2.76 GB/s - 5.15 GB/s) `|
+|` 33-64 ` |` 8.00 ns/op (4.13 GB/s - 8.13 GB/s) `|
+|` 65-96 ` |` 11.0 ns/op (5.91 GB/s - 8.84 GB/s) `|
+|` 97-128 ` |` 12.8 ns/op (7.68 GB/s - 10.0 GB/s) `|
+
+## Large Sizes
+
+| Bytes | Rate | SSE2 Rate | AVX2 Rate |
+|---------|--------------------------|--------------------------|--------------------------|
+|` 129 ` |` 13.6 ns/op (9.45 GB/s) `| | |
+|` 240 ` |` 23.8 ns/op (10.1 GB/s) `| | |
+|` 241 ` |` 40.5 ns/op (5.97 GB/s) `|` 23.3 ns/op (10.4 GB/s) `|` 20.1 ns/op (12.0 GB/s) `|
+|` 512 ` |` 69.8 ns/op (7.34 GB/s) `|` 30.4 ns/op (16.9 GB/s) `|` 24.7 ns/op (20.7 GB/s) `|
+|` 1024 ` |` 132 ns/op (7.77 GB/s) `|` 48.9 ns/op (20.9 GB/s) `|` 37.7 ns/op (27.2 GB/s) `|
+|` 100KB `|` 13.0 us/op (7.88 GB/s) `|` 4.05 us/op (25.3 GB/s) `|` 2.31 us/op (44.3 GB/s) `|
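
For reference, a minimal usage sketch of the package as vendored here, assuming the exported Hash, HashString, and Hash128 functions (and the Hi/Lo fields of the Uint128 result) match this version's public API:

package main

import (
	"fmt"

	"github.com/zeebo/xxh3"
)

func main() {
	data := []byte("hello, xxh3")

	h64 := xxh3.Hash(data)          // 64-bit hash of a byte slice
	hs := xxh3.HashString("hello")  // 64-bit hash of a string
	h128 := xxh3.Hash128(data)      // 128-bit hash; Uint128 has Hi and Lo halves

	fmt.Printf("64-bit:  %#016x\n", h64)
	fmt.Printf("string:  %#016x\n", hs)
	fmt.Printf("128-bit: %016x%016x\n", h128.Hi, h128.Lo)
}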
diff --git a/vendor/github.com/zeebo/xxh3/_compat.c b/vendor/github.com/zeebo/xxh3/_compat.c
new file mode 100644
index 000000000..fda9f36ff
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/_compat.c
@@ -0,0 +1,39 @@
+#include "upstream/xxhash.h"
+#include <stdio.h>
+
+int main() {
+ unsigned char buf[4096];
+ for (int i = 0; i < 4096; i++) {
+ buf[i] = (unsigned char)((i+1)%251);
+ }
+
+ printf("var testVecs64 = []uint64{\n");
+ for (int i = 0; i < 4096; i++) {
+ if (i % 4 == 0) {
+ printf("\t");
+ }
+
+ uint64_t h = XXH3_64bits(buf, (size_t)i);
+ printf("0x%lx, ", h);
+
+ if (i % 4 == 3) {
+ printf("\n\t");
+ }
+ }
+ printf("}\n\n");
+
+ printf("var testVecs128 = [][2]uint64{\n");
+ for (int i = 0; i < 4096; i++) {
+ if (i % 4 == 0) {
+ printf("\t");
+ }
+
+ XXH128_hash_t h = XXH3_128bits(buf, (size_t)i);
+ printf("{0x%lx, 0x%lx}, ", h.high64, h.low64);
+
+ if (i % 4 == 3) {
+ printf("\n");
+ }
+ }
+ printf("}\n\n");
+}
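
_compat.c emits Go source for cross-checking this port against the upstream C implementation; a sketch of the consuming test, assuming the generated testVecs64 array is compiled into the package's test files (the test itself is not part of this diff):

func TestVectors64(t *testing.T) {
	// Same buffer initialization as _compat.c: buf[i] = (i+1) % 251.
	buf := make([]byte, 4096)
	for i := range buf {
		buf[i] = byte((i + 1) % 251)
	}
	// _compat.c hashes every prefix length from 0 to 4095.
	for n, want := range testVecs64 {
		if got := Hash(buf[:n]); got != want {
			t.Fatalf("length %d: got %#x, want %#x", n, got, want)
		}
	}
}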
diff --git a/vendor/github.com/zeebo/xxh3/accum_generic.go b/vendor/github.com/zeebo/xxh3/accum_generic.go
new file mode 100644
index 000000000..b1be78507
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/accum_generic.go
@@ -0,0 +1,542 @@
+package xxh3
+
+// avx512Switch is the size at which the avx512 code is used.
+// Bigger blocks benefit more.
+const avx512Switch = 1 << 10
+
+func accumScalar(accs *[8]u64, p, secret ptr, l u64) {
+ if secret != key {
+ accumScalarSeed(accs, p, secret, l)
+ return
+ }
+ for l > _block {
+ k := secret
+
+ // accs
+ for i := 0; i < 16; i++ {
+ dv0 := readU64(p, 8*0)
+ dk0 := dv0 ^ readU64(k, 8*0)
+ accs[1] += dv0
+ accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
+
+ dv1 := readU64(p, 8*1)
+ dk1 := dv1 ^ readU64(k, 8*1)
+ accs[0] += dv1
+ accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
+
+ dv2 := readU64(p, 8*2)
+ dk2 := dv2 ^ readU64(k, 8*2)
+ accs[3] += dv2
+ accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
+
+ dv3 := readU64(p, 8*3)
+ dk3 := dv3 ^ readU64(k, 8*3)
+ accs[2] += dv3
+ accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
+
+ dv4 := readU64(p, 8*4)
+ dk4 := dv4 ^ readU64(k, 8*4)
+ accs[5] += dv4
+ accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
+
+ dv5 := readU64(p, 8*5)
+ dk5 := dv5 ^ readU64(k, 8*5)
+ accs[4] += dv5
+ accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
+
+ dv6 := readU64(p, 8*6)
+ dk6 := dv6 ^ readU64(k, 8*6)
+ accs[7] += dv6
+ accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
+
+ dv7 := readU64(p, 8*7)
+ dk7 := dv7 ^ readU64(k, 8*7)
+ accs[6] += dv7
+ accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
+
+ l -= _stripe
+ if l > 0 {
+ p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
+ }
+ }
+
+ // scramble accs
+ accs[0] ^= accs[0] >> 47
+ accs[0] ^= key64_128
+ accs[0] *= prime32_1
+
+ accs[1] ^= accs[1] >> 47
+ accs[1] ^= key64_136
+ accs[1] *= prime32_1
+
+ accs[2] ^= accs[2] >> 47
+ accs[2] ^= key64_144
+ accs[2] *= prime32_1
+
+ accs[3] ^= accs[3] >> 47
+ accs[3] ^= key64_152
+ accs[3] *= prime32_1
+
+ accs[4] ^= accs[4] >> 47
+ accs[4] ^= key64_160
+ accs[4] *= prime32_1
+
+ accs[5] ^= accs[5] >> 47
+ accs[5] ^= key64_168
+ accs[5] *= prime32_1
+
+ accs[6] ^= accs[6] >> 47
+ accs[6] ^= key64_176
+ accs[6] *= prime32_1
+
+ accs[7] ^= accs[7] >> 47
+ accs[7] ^= key64_184
+ accs[7] *= prime32_1
+ }
+
+ if l > 0 {
+ t, k := (l-1)/_stripe, secret
+
+ for i := u64(0); i < t; i++ {
+ dv0 := readU64(p, 8*0)
+ dk0 := dv0 ^ readU64(k, 8*0)
+ accs[1] += dv0
+ accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
+
+ dv1 := readU64(p, 8*1)
+ dk1 := dv1 ^ readU64(k, 8*1)
+ accs[0] += dv1
+ accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
+
+ dv2 := readU64(p, 8*2)
+ dk2 := dv2 ^ readU64(k, 8*2)
+ accs[3] += dv2
+ accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
+
+ dv3 := readU64(p, 8*3)
+ dk3 := dv3 ^ readU64(k, 8*3)
+ accs[2] += dv3
+ accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
+
+ dv4 := readU64(p, 8*4)
+ dk4 := dv4 ^ readU64(k, 8*4)
+ accs[5] += dv4
+ accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
+
+ dv5 := readU64(p, 8*5)
+ dk5 := dv5 ^ readU64(k, 8*5)
+ accs[4] += dv5
+ accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
+
+ dv6 := readU64(p, 8*6)
+ dk6 := dv6 ^ readU64(k, 8*6)
+ accs[7] += dv6
+ accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
+
+ dv7 := readU64(p, 8*7)
+ dk7 := dv7 ^ readU64(k, 8*7)
+ accs[6] += dv7
+ accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
+
+ l -= _stripe
+ if l > 0 {
+ p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
+ }
+ }
+
+ if l > 0 {
+ p = ptr(ui(p) - uintptr(_stripe-l))
+
+ dv0 := readU64(p, 8*0)
+ dk0 := dv0 ^ key64_121
+ accs[1] += dv0
+ accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
+
+ dv1 := readU64(p, 8*1)
+ dk1 := dv1 ^ key64_129
+ accs[0] += dv1
+ accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
+
+ dv2 := readU64(p, 8*2)
+ dk2 := dv2 ^ key64_137
+ accs[3] += dv2
+ accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
+
+ dv3 := readU64(p, 8*3)
+ dk3 := dv3 ^ key64_145
+ accs[2] += dv3
+ accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
+
+ dv4 := readU64(p, 8*4)
+ dk4 := dv4 ^ key64_153
+ accs[5] += dv4
+ accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
+
+ dv5 := readU64(p, 8*5)
+ dk5 := dv5 ^ key64_161
+ accs[4] += dv5
+ accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
+
+ dv6 := readU64(p, 8*6)
+ dk6 := dv6 ^ key64_169
+ accs[7] += dv6
+ accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
+
+ dv7 := readU64(p, 8*7)
+ dk7 := dv7 ^ key64_177
+ accs[6] += dv7
+ accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
+ }
+ }
+}
+
+func accumBlockScalar(accs *[8]u64, p, secret ptr) {
+ if secret != key {
+ accumBlockScalarSeed(accs, p, secret)
+ return
+ }
+ // accs
+ for i := 0; i < 16; i++ {
+ dv0 := readU64(p, 8*0)
+ dk0 := dv0 ^ readU64(secret, 8*0)
+ accs[1] += dv0
+ accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
+
+ dv1 := readU64(p, 8*1)
+ dk1 := dv1 ^ readU64(secret, 8*1)
+ accs[0] += dv1
+ accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
+
+ dv2 := readU64(p, 8*2)
+ dk2 := dv2 ^ readU64(secret, 8*2)
+ accs[3] += dv2
+ accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
+
+ dv3 := readU64(p, 8*3)
+ dk3 := dv3 ^ readU64(secret, 8*3)
+ accs[2] += dv3
+ accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
+
+ dv4 := readU64(p, 8*4)
+ dk4 := dv4 ^ readU64(secret, 8*4)
+ accs[5] += dv4
+ accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
+
+ dv5 := readU64(p, 8*5)
+ dk5 := dv5 ^ readU64(secret, 8*5)
+ accs[4] += dv5
+ accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
+
+ dv6 := readU64(p, 8*6)
+ dk6 := dv6 ^ readU64(secret, 8*6)
+ accs[7] += dv6
+ accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
+
+ dv7 := readU64(p, 8*7)
+ dk7 := dv7 ^ readU64(secret, 8*7)
+ accs[6] += dv7
+ accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
+
+ p, secret = ptr(ui(p)+_stripe), ptr(ui(secret)+8)
+ }
+
+ // scramble accs
+ accs[0] ^= accs[0] >> 47
+ accs[0] ^= key64_128
+ accs[0] *= prime32_1
+
+ accs[1] ^= accs[1] >> 47
+ accs[1] ^= key64_136
+ accs[1] *= prime32_1
+
+ accs[2] ^= accs[2] >> 47
+ accs[2] ^= key64_144
+ accs[2] *= prime32_1
+
+ accs[3] ^= accs[3] >> 47
+ accs[3] ^= key64_152
+ accs[3] *= prime32_1
+
+ accs[4] ^= accs[4] >> 47
+ accs[4] ^= key64_160
+ accs[4] *= prime32_1
+
+ accs[5] ^= accs[5] >> 47
+ accs[5] ^= key64_168
+ accs[5] *= prime32_1
+
+ accs[6] ^= accs[6] >> 47
+ accs[6] ^= key64_176
+ accs[6] *= prime32_1
+
+ accs[7] ^= accs[7] >> 47
+ accs[7] ^= key64_184
+ accs[7] *= prime32_1
+}
+
+// accumScalarSeed should be used with custom key.
+func accumScalarSeed(accs *[8]u64, p, secret ptr, l u64) {
+ for l > _block {
+ k := secret
+
+ // accs
+ for i := 0; i < 16; i++ {
+ dv0 := readU64(p, 8*0)
+ dk0 := dv0 ^ readU64(k, 8*0)
+ accs[1] += dv0
+ accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
+
+ dv1 := readU64(p, 8*1)
+ dk1 := dv1 ^ readU64(k, 8*1)
+ accs[0] += dv1
+ accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
+
+ dv2 := readU64(p, 8*2)
+ dk2 := dv2 ^ readU64(k, 8*2)
+ accs[3] += dv2
+ accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
+
+ dv3 := readU64(p, 8*3)
+ dk3 := dv3 ^ readU64(k, 8*3)
+ accs[2] += dv3
+ accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
+
+ dv4 := readU64(p, 8*4)
+ dk4 := dv4 ^ readU64(k, 8*4)
+ accs[5] += dv4
+ accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
+
+ dv5 := readU64(p, 8*5)
+ dk5 := dv5 ^ readU64(k, 8*5)
+ accs[4] += dv5
+ accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
+
+ dv6 := readU64(p, 8*6)
+ dk6 := dv6 ^ readU64(k, 8*6)
+ accs[7] += dv6
+ accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
+
+ dv7 := readU64(p, 8*7)
+ dk7 := dv7 ^ readU64(k, 8*7)
+ accs[6] += dv7
+ accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
+
+ l -= _stripe
+ if l > 0 {
+ p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
+ }
+ }
+
+ // scramble accs
+ accs[0] ^= accs[0] >> 47
+ accs[0] ^= readU64(secret, 128)
+ accs[0] *= prime32_1
+
+ accs[1] ^= accs[1] >> 47
+ accs[1] ^= readU64(secret, 136)
+ accs[1] *= prime32_1
+
+ accs[2] ^= accs[2] >> 47
+ accs[2] ^= readU64(secret, 144)
+ accs[2] *= prime32_1
+
+ accs[3] ^= accs[3] >> 47
+ accs[3] ^= readU64(secret, 152)
+ accs[3] *= prime32_1
+
+ accs[4] ^= accs[4] >> 47
+ accs[4] ^= readU64(secret, 160)
+ accs[4] *= prime32_1
+
+ accs[5] ^= accs[5] >> 47
+ accs[5] ^= readU64(secret, 168)
+ accs[5] *= prime32_1
+
+ accs[6] ^= accs[6] >> 47
+ accs[6] ^= readU64(secret, 176)
+ accs[6] *= prime32_1
+
+ accs[7] ^= accs[7] >> 47
+ accs[7] ^= readU64(secret, 184)
+ accs[7] *= prime32_1
+ }
+
+ if l > 0 {
+ t, k := (l-1)/_stripe, secret
+
+ for i := u64(0); i < t; i++ {
+ dv0 := readU64(p, 8*0)
+ dk0 := dv0 ^ readU64(k, 8*0)
+ accs[1] += dv0
+ accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
+
+ dv1 := readU64(p, 8*1)
+ dk1 := dv1 ^ readU64(k, 8*1)
+ accs[0] += dv1
+ accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
+
+ dv2 := readU64(p, 8*2)
+ dk2 := dv2 ^ readU64(k, 8*2)
+ accs[3] += dv2
+ accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
+
+ dv3 := readU64(p, 8*3)
+ dk3 := dv3 ^ readU64(k, 8*3)
+ accs[2] += dv3
+ accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
+
+ dv4 := readU64(p, 8*4)
+ dk4 := dv4 ^ readU64(k, 8*4)
+ accs[5] += dv4
+ accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
+
+ dv5 := readU64(p, 8*5)
+ dk5 := dv5 ^ readU64(k, 8*5)
+ accs[4] += dv5
+ accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
+
+ dv6 := readU64(p, 8*6)
+ dk6 := dv6 ^ readU64(k, 8*6)
+ accs[7] += dv6
+ accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
+
+ dv7 := readU64(p, 8*7)
+ dk7 := dv7 ^ readU64(k, 8*7)
+ accs[6] += dv7
+ accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
+
+ l -= _stripe
+ if l > 0 {
+ p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
+ }
+ }
+
+ if l > 0 {
+ p = ptr(ui(p) - uintptr(_stripe-l))
+
+ dv0 := readU64(p, 8*0)
+ dk0 := dv0 ^ readU64(secret, 121)
+ accs[1] += dv0
+ accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
+
+ dv1 := readU64(p, 8*1)
+ dk1 := dv1 ^ readU64(secret, 129)
+ accs[0] += dv1
+ accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
+
+ dv2 := readU64(p, 8*2)
+ dk2 := dv2 ^ readU64(secret, 137)
+ accs[3] += dv2
+ accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
+
+ dv3 := readU64(p, 8*3)
+ dk3 := dv3 ^ readU64(secret, 145)
+ accs[2] += dv3
+ accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
+
+ dv4 := readU64(p, 8*4)
+ dk4 := dv4 ^ readU64(secret, 153)
+ accs[5] += dv4
+ accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
+
+ dv5 := readU64(p, 8*5)
+ dk5 := dv5 ^ readU64(secret, 161)
+ accs[4] += dv5
+ accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
+
+ dv6 := readU64(p, 8*6)
+ dk6 := dv6 ^ readU64(secret, 169)
+ accs[7] += dv6
+ accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
+
+ dv7 := readU64(p, 8*7)
+ dk7 := dv7 ^ readU64(secret, 177)
+ accs[6] += dv7
+ accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
+ }
+ }
+}
+
+// accumBlockScalarSeed should be used with custom key.
+func accumBlockScalarSeed(accs *[8]u64, p, secret ptr) {
+ // accs
+ {
+ secret := secret
+ for i := 0; i < 16; i++ {
+ dv0 := readU64(p, 8*0)
+ dk0 := dv0 ^ readU64(secret, 8*0)
+ accs[1] += dv0
+ accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
+
+ dv1 := readU64(p, 8*1)
+ dk1 := dv1 ^ readU64(secret, 8*1)
+ accs[0] += dv1
+ accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
+
+ dv2 := readU64(p, 8*2)
+ dk2 := dv2 ^ readU64(secret, 8*2)
+ accs[3] += dv2
+ accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
+
+ dv3 := readU64(p, 8*3)
+ dk3 := dv3 ^ readU64(secret, 8*3)
+ accs[2] += dv3
+ accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
+
+ dv4 := readU64(p, 8*4)
+ dk4 := dv4 ^ readU64(secret, 8*4)
+ accs[5] += dv4
+ accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
+
+ dv5 := readU64(p, 8*5)
+ dk5 := dv5 ^ readU64(secret, 8*5)
+ accs[4] += dv5
+ accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
+
+ dv6 := readU64(p, 8*6)
+ dk6 := dv6 ^ readU64(secret, 8*6)
+ accs[7] += dv6
+ accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
+
+ dv7 := readU64(p, 8*7)
+ dk7 := dv7 ^ readU64(secret, 8*7)
+ accs[6] += dv7
+ accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
+
+ p, secret = ptr(ui(p)+_stripe), ptr(ui(secret)+8)
+ }
+ }
+
+ // scramble accs
+ accs[0] ^= accs[0] >> 47
+ accs[0] ^= readU64(secret, 128)
+ accs[0] *= prime32_1
+
+ accs[1] ^= accs[1] >> 47
+ accs[1] ^= readU64(secret, 136)
+ accs[1] *= prime32_1
+
+ accs[2] ^= accs[2] >> 47
+ accs[2] ^= readU64(secret, 144)
+ accs[2] *= prime32_1
+
+ accs[3] ^= accs[3] >> 47
+ accs[3] ^= readU64(secret, 152)
+ accs[3] *= prime32_1
+
+ accs[4] ^= accs[4] >> 47
+ accs[4] ^= readU64(secret, 160)
+ accs[4] *= prime32_1
+
+ accs[5] ^= accs[5] >> 47
+ accs[5] ^= readU64(secret, 168)
+ accs[5] *= prime32_1
+
+ accs[6] ^= accs[6] >> 47
+ accs[6] ^= readU64(secret, 176)
+ accs[6] *= prime32_1
+
+ accs[7] ^= accs[7] >> 47
+ accs[7] ^= readU64(secret, 184)
+ accs[7] *= prime32_1
+}
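
The unrolled bodies above all repeat one two-lane accumulate round and one per-lane scramble round; isolating them makes the structure easier to see. A scalar sketch (hypothetical helpers, not part of the package):

// accumRound is one lane of the accumulate step: mix the input word with a
// key word, multiply the 32-bit halves of the mix, and fold the raw input
// into the neighboring (swapped) accumulator.
func accumRound(accs *[8]uint64, i int, dv, kv uint64) {
	dk := dv ^ kv
	accs[i^1] += dv                           // swapped lane gets the raw input
	accs[i] += (dk & 0xffffffff) * (dk >> 32) // 32x32 -> 64-bit multiply
}

// scrambleRound is the end-of-block scramble applied to each accumulator:
// xorshift by 47, mix in a key word, multiply by the 32-bit prime.
func scrambleRound(acc, kv uint64) uint64 {
	acc ^= acc >> 47
	acc ^= kv
	return acc * 0x9e3779b1 // prime32_1
}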
diff --git a/vendor/github.com/zeebo/xxh3/accum_stubs_amd64.go b/vendor/github.com/zeebo/xxh3/accum_stubs_amd64.go
new file mode 100644
index 000000000..9baff6c41
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/accum_stubs_amd64.go
@@ -0,0 +1,40 @@
+package xxh3
+
+import (
+ "unsafe"
+
+ "github.com/klauspost/cpuid/v2"
+)
+
+var (
+ hasAVX2 = cpuid.CPU.Has(cpuid.AVX2)
+ hasSSE2 = cpuid.CPU.Has(cpuid.SSE2) // Always true on amd64
+ hasAVX512 = cpuid.CPU.Has(cpuid.AVX512F)
+)
+
+//go:noescape
+func accumAVX2(acc *[8]u64, data, key unsafe.Pointer, len u64)
+
+//go:noescape
+func accumAVX512(acc *[8]u64, data, key unsafe.Pointer, len u64)
+
+//go:noescape
+func accumSSE(acc *[8]u64, data, key unsafe.Pointer, len u64)
+
+//go:noescape
+func accumBlockAVX2(acc *[8]u64, data, key unsafe.Pointer)
+
+//go:noescape
+func accumBlockSSE(acc *[8]u64, data, key unsafe.Pointer)
+
+func withOverrides(avx512, avx2, sse2 bool, cb func()) {
+ avx512Orig, avx2Orig, sse2Orig := hasAVX512, hasAVX2, hasSSE2
+ hasAVX512, hasAVX2, hasSSE2 = avx512, avx2, sse2
+ defer func() { hasAVX512, hasAVX2, hasSSE2 = avx512Orig, avx2Orig, sse2Orig }()
+ cb()
+}
+
+func withAVX512(cb func()) { withOverrides(hasAVX512, false, false, cb) }
+func withAVX2(cb func()) { withOverrides(false, hasAVX2, false, cb) }
+func withSSE2(cb func()) { withOverrides(false, false, hasSSE2, cb) }
+func withGeneric(cb func()) { withOverrides(false, false, false, cb) }
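
The withOverrides helpers let tests pin a single dispatch path; a sketch of how a test might check that every vector path agrees with the generic fallback (hypothetical test; Hash is assumed to dispatch on the has* flags):

func TestPathsAgree(t *testing.T) {
	data := make([]byte, 4096)
	for i := range data {
		data[i] = byte(i)
	}

	var want uint64
	withGeneric(func() { want = Hash(data) }) // scalar reference result

	for name, with := range map[string]func(func()){
		"sse2":   withSSE2,
		"avx2":   withAVX2,
		"avx512": withAVX512,
	} {
		with(func() {
			if got := Hash(data); got != want {
				t.Errorf("%s path: got %#x, want %#x", name, got, want)
			}
		})
	}
}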
diff --git a/vendor/github.com/zeebo/xxh3/accum_stubs_other.go b/vendor/github.com/zeebo/xxh3/accum_stubs_other.go
new file mode 100644
index 000000000..93bf6258a
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/accum_stubs_other.go
@@ -0,0 +1,25 @@
+//go:build !amd64
+// +build !amd64
+
+package xxh3
+
+import (
+ "unsafe"
+)
+
+const (
+ hasAVX2 = false
+ hasSSE2 = false
+ hasAVX512 = false
+)
+
+func accumAVX2(acc *[8]u64, data, key unsafe.Pointer, len u64) { panic("unreachable") }
+func accumSSE(acc *[8]u64, data, key unsafe.Pointer, len u64) { panic("unreachable") }
+func accumBlockAVX2(acc *[8]u64, data, key unsafe.Pointer) { panic("unreachable") }
+func accumBlockSSE(acc *[8]u64, data, key unsafe.Pointer) { panic("unreachable") }
+func accumAVX512(acc *[8]u64, data, key unsafe.Pointer, len u64) { panic("unreachable") }
+
+func withAVX512(cb func()) { cb() }
+func withAVX2(cb func()) { cb() }
+func withSSE2(cb func()) { cb() }
+func withGeneric(cb func()) { cb() }
diff --git a/vendor/github.com/zeebo/xxh3/accum_vector_avx512_amd64.s b/vendor/github.com/zeebo/xxh3/accum_vector_avx512_amd64.s
new file mode 100644
index 000000000..cfaf9f0a7
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/accum_vector_avx512_amd64.s
@@ -0,0 +1,379 @@
+// Code generated by command: go run gen.go -avx512 -out ../accum_vector_avx512_amd64.s -pkg xxh3. DO NOT EDIT.
+
+#include "textflag.h"
+
+DATA prime_avx512<>+0(SB)/8, $0x000000009e3779b1
+DATA prime_avx512<>+8(SB)/8, $0x000000009e3779b1
+DATA prime_avx512<>+16(SB)/8, $0x000000009e3779b1
+DATA prime_avx512<>+24(SB)/8, $0x000000009e3779b1
+DATA prime_avx512<>+32(SB)/8, $0x000000009e3779b1
+DATA prime_avx512<>+40(SB)/8, $0x000000009e3779b1
+DATA prime_avx512<>+48(SB)/8, $0x000000009e3779b1
+DATA prime_avx512<>+56(SB)/8, $0x000000009e3779b1
+GLOBL prime_avx512<>(SB), RODATA|NOPTR, $64
+
+// func accumAVX512(acc *[8]uint64, data *byte, key *byte, len uint64)
+// Requires: AVX, AVX512F, MMX+
+TEXT ·accumAVX512(SB), NOSPLIT, $0-32
+ MOVQ acc+0(FP), AX
+ MOVQ data+8(FP), CX
+ MOVQ key+16(FP), DX
+ MOVQ len+24(FP), BX
+ VMOVDQU64 (AX), Z1
+ VMOVDQU64 prime_avx512<>+0(SB), Z0
+ VMOVDQU64 (DX), Z2
+ VMOVDQU64 8(DX), Z3
+ VMOVDQU64 16(DX), Z4
+ VMOVDQU64 24(DX), Z5
+ VMOVDQU64 32(DX), Z6
+ VMOVDQU64 40(DX), Z7
+ VMOVDQU64 48(DX), Z8
+ VMOVDQU64 56(DX), Z9
+ VMOVDQU64 64(DX), Z10
+ VMOVDQU64 72(DX), Z11
+ VMOVDQU64 80(DX), Z12
+ VMOVDQU64 88(DX), Z13
+ VMOVDQU64 96(DX), Z14
+ VMOVDQU64 104(DX), Z15
+ VMOVDQU64 112(DX), Z16
+ VMOVDQU64 120(DX), Z17
+ VMOVDQU64 128(DX), Z18
+ VMOVDQU64 121(DX), Z19
+
+accum_large:
+ CMPQ BX, $0x00000400
+ JLE accum
+ VMOVDQU64 (CX), Z20
+ PREFETCHT0 1024(CX)
+ VPXORD Z2, Z20, Z21
+ VPSHUFD $0x31, Z21, Z22
+ VPMULUDQ Z21, Z22, Z21
+ VPSHUFD $0x4e, Z20, Z20
+ VPADDQ Z1, Z20, Z1
+ VPADDQ Z1, Z21, Z1
+ VMOVDQU64 64(CX), Z20
+ PREFETCHT0 1088(CX)
+ VPXORD Z3, Z20, Z21
+ VPSHUFD $0x31, Z21, Z22
+ VPMULUDQ Z21, Z22, Z21
+ VPSHUFD $0x4e, Z20, Z20
+ VPADDQ Z1, Z20, Z1
+ VPADDQ Z1, Z21, Z1
+ VMOVDQU64 128(CX), Z20
+ PREFETCHT0 1152(CX)
+ VPXORD Z4, Z20, Z21
+ VPSHUFD $0x31, Z21, Z22
+ VPMULUDQ Z21, Z22, Z21
+ VPSHUFD $0x4e, Z20, Z20
+ VPADDQ Z1, Z20, Z1
+ VPADDQ Z1, Z21, Z1
+ VMOVDQU64 192(CX), Z20
+ PREFETCHT0 1216(CX)
+ VPXORD Z5, Z20, Z21
+ VPSHUFD $0x31, Z21, Z22
+ VPMULUDQ Z21, Z22, Z21
+ VPSHUFD $0x4e, Z20, Z20
+ VPADDQ Z1, Z20, Z1
+ VPADDQ Z1, Z21, Z1
+ VMOVDQU64 256(CX), Z20
+ PREFETCHT0 1280(CX)
+ VPXORD Z6, Z20, Z21
+ VPSHUFD $0x31, Z21, Z22
+ VPMULUDQ Z21, Z22, Z21
+ VPSHUFD $0x4e, Z20, Z20
+ VPADDQ Z1, Z20, Z1
+ VPADDQ Z1, Z21, Z1
+ VMOVDQU64 320(CX), Z20
+ PREFETCHT0 1344(CX)
+ VPXORD Z7, Z20, Z21
+ VPSHUFD $0x31, Z21, Z22
+ VPMULUDQ Z21, Z22, Z21
+ VPSHUFD $0x4e, Z20, Z20
+ VPADDQ Z1, Z20, Z1
+ VPADDQ Z1, Z21, Z1
+ VMOVDQU64 384(CX), Z20
+ PREFETCHT0 1408(CX)
+ VPXORD Z8, Z20, Z21
+ VPSHUFD $0x31, Z21, Z22
+ VPMULUDQ Z21, Z22, Z21
+ VPSHUFD $0x4e, Z20, Z20
+ VPADDQ Z1, Z20, Z1
+ VPADDQ Z1, Z21, Z1
+ VMOVDQU64 448(CX), Z20
+ PREFETCHT0 1472(CX)
+ VPXORD Z9, Z20, Z21
+ VPSHUFD $0x31, Z21, Z22
+ VPMULUDQ Z21, Z22, Z21
+ VPSHUFD $0x4e, Z20, Z20
+ VPADDQ Z1, Z20, Z1
+ VPADDQ Z1, Z21, Z1
+ VMOVDQU64 512(CX), Z20
+ PREFETCHT0 1536(CX)
+ VPXORD Z10, Z20, Z21
+ VPSHUFD $0x31, Z21, Z22
+ VPMULUDQ Z21, Z22, Z21
+ VPSHUFD $0x4e, Z20, Z20
+ VPADDQ Z1, Z20, Z1
+ VPADDQ Z1, Z21, Z1
+ VMOVDQU64 576(CX), Z20
+ PREFETCHT0 1600(CX)
+ VPXORD Z11, Z20, Z21
+ VPSHUFD $0x31, Z21, Z22
+ VPMULUDQ Z21, Z22, Z21
+ VPSHUFD $0x4e, Z20, Z20
+ VPADDQ Z1, Z20, Z1
+ VPADDQ Z1, Z21, Z1
+ VMOVDQU64 640(CX), Z20
+ PREFETCHT0 1664(CX)
+ VPXORD Z12, Z20, Z21
+ VPSHUFD $0x31, Z21, Z22
+ VPMULUDQ Z21, Z22, Z21
+ VPSHUFD $0x4e, Z20, Z20
+ VPADDQ Z1, Z20, Z1
+ VPADDQ Z1, Z21, Z1
+ VMOVDQU64 704(CX), Z20
+ PREFETCHT0 1728(CX)
+ VPXORD Z13, Z20, Z21
+ VPSHUFD $0x31, Z21, Z22
+ VPMULUDQ Z21, Z22, Z21
+ VPSHUFD $0x4e, Z20, Z20
+ VPADDQ Z1, Z20, Z1
+ VPADDQ Z1, Z21, Z1
+ VMOVDQU64 768(CX), Z20
+ PREFETCHT0 1792(CX)
+ VPXORD Z14, Z20, Z21
+ VPSHUFD $0x31, Z21, Z22
+ VPMULUDQ Z21, Z22, Z21
+ VPSHUFD $0x4e, Z20, Z20
+ VPADDQ Z1, Z20, Z1
+ VPADDQ Z1, Z21, Z1
+ VMOVDQU64 832(CX), Z20
+ PREFETCHT0 1856(CX)
+ VPXORD Z15, Z20, Z21
+ VPSHUFD $0x31, Z21, Z22
+ VPMULUDQ Z21, Z22, Z21
+ VPSHUFD $0x4e, Z20, Z20
+ VPADDQ Z1, Z20, Z1
+ VPADDQ Z1, Z21, Z1
+ VMOVDQU64 896(CX), Z20
+ PREFETCHT0 1920(CX)
+ VPXORD Z16, Z20, Z21
+ VPSHUFD $0x31, Z21, Z22
+ VPMULUDQ Z21, Z22, Z21
+ VPSHUFD $0x4e, Z20, Z20
+ VPADDQ Z1, Z20, Z1
+ VPADDQ Z1, Z21, Z1
+ VMOVDQU64 960(CX), Z20
+ PREFETCHT0 1984(CX)
+ VPXORD Z17, Z20, Z21
+ VPSHUFD $0x31, Z21, Z22
+ VPMULUDQ Z21, Z22, Z21
+ VPSHUFD $0x4e, Z20, Z20
+ VPADDQ Z1, Z20, Z1
+ VPADDQ Z1, Z21, Z1
+ ADDQ $0x00000400, CX
+ SUBQ $0x00000400, BX
+ VPSRLQ $0x2f, Z1, Z20
+ VPTERNLOGD $0x96, Z1, Z18, Z20
+ VPMULUDQ Z0, Z20, Z1
+ VPSHUFD $0xf5, Z20, Z20
+ VPMULUDQ Z0, Z20, Z20
+ VPSLLQ $0x20, Z20, Z20
+ VPADDQ Z1, Z20, Z1
+ JMP accum_large
+
+accum:
+ CMPQ BX, $0x40
+ JLE finalize
+ VMOVDQU64 (CX), Z0
+ VPXORD Z2, Z0, Z2
+ VPSHUFD $0x31, Z2, Z18
+ VPMULUDQ Z2, Z18, Z2
+ VPSHUFD $0x4e, Z0, Z0
+ VPADDQ Z1, Z0, Z1
+ VPADDQ Z1, Z2, Z1
+ ADDQ $0x00000040, CX
+ SUBQ $0x00000040, BX
+ CMPQ BX, $0x40
+ JLE finalize
+ VMOVDQU64 (CX), Z0
+ VPXORD Z3, Z0, Z2
+ VPSHUFD $0x31, Z2, Z3
+ VPMULUDQ Z2, Z3, Z2
+ VPSHUFD $0x4e, Z0, Z0
+ VPADDQ Z1, Z0, Z1
+ VPADDQ Z1, Z2, Z1
+ ADDQ $0x00000040, CX
+ SUBQ $0x00000040, BX
+ CMPQ BX, $0x40
+ JLE finalize
+ VMOVDQU64 (CX), Z0
+ VPXORD Z4, Z0, Z2
+ VPSHUFD $0x31, Z2, Z3
+ VPMULUDQ Z2, Z3, Z2
+ VPSHUFD $0x4e, Z0, Z0
+ VPADDQ Z1, Z0, Z1
+ VPADDQ Z1, Z2, Z1
+ ADDQ $0x00000040, CX
+ SUBQ $0x00000040, BX
+ CMPQ BX, $0x40
+ JLE finalize
+ VMOVDQU64 (CX), Z0
+ VPXORD Z5, Z0, Z2
+ VPSHUFD $0x31, Z2, Z3
+ VPMULUDQ Z2, Z3, Z2
+ VPSHUFD $0x4e, Z0, Z0
+ VPADDQ Z1, Z0, Z1
+ VPADDQ Z1, Z2, Z1
+ ADDQ $0x00000040, CX
+ SUBQ $0x00000040, BX
+ CMPQ BX, $0x40
+ JLE finalize
+ VMOVDQU64 (CX), Z0
+ VPXORD Z6, Z0, Z2
+ VPSHUFD $0x31, Z2, Z3
+ VPMULUDQ Z2, Z3, Z2
+ VPSHUFD $0x4e, Z0, Z0
+ VPADDQ Z1, Z0, Z1
+ VPADDQ Z1, Z2, Z1
+ ADDQ $0x00000040, CX
+ SUBQ $0x00000040, BX
+ CMPQ BX, $0x40
+ JLE finalize
+ VMOVDQU64 (CX), Z0
+ VPXORD Z7, Z0, Z2
+ VPSHUFD $0x31, Z2, Z3
+ VPMULUDQ Z2, Z3, Z2
+ VPSHUFD $0x4e, Z0, Z0
+ VPADDQ Z1, Z0, Z1
+ VPADDQ Z1, Z2, Z1
+ ADDQ $0x00000040, CX
+ SUBQ $0x00000040, BX
+ CMPQ BX, $0x40
+ JLE finalize
+ VMOVDQU64 (CX), Z0
+ VPXORD Z8, Z0, Z2
+ VPSHUFD $0x31, Z2, Z3
+ VPMULUDQ Z2, Z3, Z2
+ VPSHUFD $0x4e, Z0, Z0
+ VPADDQ Z1, Z0, Z1
+ VPADDQ Z1, Z2, Z1
+ ADDQ $0x00000040, CX
+ SUBQ $0x00000040, BX
+ CMPQ BX, $0x40
+ JLE finalize
+ VMOVDQU64 (CX), Z0
+ VPXORD Z9, Z0, Z2
+ VPSHUFD $0x31, Z2, Z3
+ VPMULUDQ Z2, Z3, Z2
+ VPSHUFD $0x4e, Z0, Z0
+ VPADDQ Z1, Z0, Z1
+ VPADDQ Z1, Z2, Z1
+ ADDQ $0x00000040, CX
+ SUBQ $0x00000040, BX
+ CMPQ BX, $0x40
+ JLE finalize
+ VMOVDQU64 (CX), Z0
+ VPXORD Z10, Z0, Z2
+ VPSHUFD $0x31, Z2, Z3
+ VPMULUDQ Z2, Z3, Z2
+ VPSHUFD $0x4e, Z0, Z0
+ VPADDQ Z1, Z0, Z1
+ VPADDQ Z1, Z2, Z1
+ ADDQ $0x00000040, CX
+ SUBQ $0x00000040, BX
+ CMPQ BX, $0x40
+ JLE finalize
+ VMOVDQU64 (CX), Z0
+ VPXORD Z11, Z0, Z2
+ VPSHUFD $0x31, Z2, Z3
+ VPMULUDQ Z2, Z3, Z2
+ VPSHUFD $0x4e, Z0, Z0
+ VPADDQ Z1, Z0, Z1
+ VPADDQ Z1, Z2, Z1
+ ADDQ $0x00000040, CX
+ SUBQ $0x00000040, BX
+ CMPQ BX, $0x40
+ JLE finalize
+ VMOVDQU64 (CX), Z0
+ VPXORD Z12, Z0, Z2
+ VPSHUFD $0x31, Z2, Z3
+ VPMULUDQ Z2, Z3, Z2
+ VPSHUFD $0x4e, Z0, Z0
+ VPADDQ Z1, Z0, Z1
+ VPADDQ Z1, Z2, Z1
+ ADDQ $0x00000040, CX
+ SUBQ $0x00000040, BX
+ CMPQ BX, $0x40
+ JLE finalize
+ VMOVDQU64 (CX), Z0
+ VPXORD Z13, Z0, Z2
+ VPSHUFD $0x31, Z2, Z3
+ VPMULUDQ Z2, Z3, Z2
+ VPSHUFD $0x4e, Z0, Z0
+ VPADDQ Z1, Z0, Z1
+ VPADDQ Z1, Z2, Z1
+ ADDQ $0x00000040, CX
+ SUBQ $0x00000040, BX
+ CMPQ BX, $0x40
+ JLE finalize
+ VMOVDQU64 (CX), Z0
+ VPXORD Z14, Z0, Z2
+ VPSHUFD $0x31, Z2, Z3
+ VPMULUDQ Z2, Z3, Z2
+ VPSHUFD $0x4e, Z0, Z0
+ VPADDQ Z1, Z0, Z1
+ VPADDQ Z1, Z2, Z1
+ ADDQ $0x00000040, CX
+ SUBQ $0x00000040, BX
+ CMPQ BX, $0x40
+ JLE finalize
+ VMOVDQU64 (CX), Z0
+ VPXORD Z15, Z0, Z2
+ VPSHUFD $0x31, Z2, Z3
+ VPMULUDQ Z2, Z3, Z2
+ VPSHUFD $0x4e, Z0, Z0
+ VPADDQ Z1, Z0, Z1
+ VPADDQ Z1, Z2, Z1
+ ADDQ $0x00000040, CX
+ SUBQ $0x00000040, BX
+ CMPQ BX, $0x40
+ JLE finalize
+ VMOVDQU64 (CX), Z0
+ VPXORD Z16, Z0, Z2
+ VPSHUFD $0x31, Z2, Z3
+ VPMULUDQ Z2, Z3, Z2
+ VPSHUFD $0x4e, Z0, Z0
+ VPADDQ Z1, Z0, Z1
+ VPADDQ Z1, Z2, Z1
+ ADDQ $0x00000040, CX
+ SUBQ $0x00000040, BX
+ CMPQ BX, $0x40
+ JLE finalize
+ VMOVDQU64 (CX), Z0
+ VPXORD Z17, Z0, Z2
+ VPSHUFD $0x31, Z2, Z3
+ VPMULUDQ Z2, Z3, Z2
+ VPSHUFD $0x4e, Z0, Z0
+ VPADDQ Z1, Z0, Z1
+ VPADDQ Z1, Z2, Z1
+ ADDQ $0x00000040, CX
+ SUBQ $0x00000040, BX
+
+finalize:
+ CMPQ BX, $0x00
+ JE return
+ SUBQ $0x40, CX
+ ADDQ BX, CX
+ VMOVDQU64 (CX), Z0
+ VPXORD Z19, Z0, Z2
+ VPSHUFD $0x31, Z2, Z3
+ VPMULUDQ Z2, Z3, Z2
+ VPSHUFD $0x4e, Z0, Z0
+ VPADDQ Z1, Z0, Z1
+ VPADDQ Z1, Z2, Z1
+
+return:
+ VMOVDQU64 Z1, (AX)
+ VZEROUPPER
+ RET
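
Reading the avo output: each VMOVDQU64/VPXORD/VPSHUFD/VPMULUDQ/VPADDQ group is the scalar accumulate round vectorized eight lanes at a time (VPSHUFD $0x31 pairs each mixed word with its own high half for the 32x32 multiply; VPSHUFD $0x4e swaps lanes for the raw-input add), and the VPSRLQ/VPTERNLOGD block is the scramble, with $0x96 encoding a three-way XOR. The multiply by the 32-bit prime is split into two 32x32 products because VPMULUDQ has no 64x64 form; a scalar sketch of that identity:

// mulPrime32 multiplies a 64-bit accumulator by the 32-bit prime using only
// 32x32 -> 64-bit multiplies, mirroring the VPMULUDQ/VPSLLQ/VPADDQ sequence:
//   acc*p mod 2^64 == lo32(acc)*p + (hi32(acc)*p << 32)
func mulPrime32(acc uint64) uint64 {
	const prime32_1 = 0x9e3779b1
	lo := (acc & 0xffffffff) * prime32_1
	hi := (acc >> 32) * prime32_1
	return lo + hi<<32
}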
diff --git a/vendor/github.com/zeebo/xxh3/accum_vector_avx_amd64.s b/vendor/github.com/zeebo/xxh3/accum_vector_avx_amd64.s
new file mode 100644
index 000000000..b53c1521f
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/accum_vector_avx_amd64.s
@@ -0,0 +1,586 @@
+// Code generated by command: go run gen.go -avx -out ../accum_vector_avx_amd64.s -pkg xxh3. DO NOT EDIT.
+
+#include "textflag.h"
+
+DATA prime_avx<>+0(SB)/8, $0x000000009e3779b1
+DATA prime_avx<>+8(SB)/8, $0x000000009e3779b1
+DATA prime_avx<>+16(SB)/8, $0x000000009e3779b1
+DATA prime_avx<>+24(SB)/8, $0x000000009e3779b1
+GLOBL prime_avx<>(SB), RODATA|NOPTR, $32
+
+// func accumAVX2(acc *[8]uint64, data *byte, key *byte, len uint64)
+// Requires: AVX, AVX2, MMX+
+TEXT ·accumAVX2(SB), NOSPLIT, $0-32
+ MOVQ acc+0(FP), AX
+ MOVQ data+8(FP), CX
+ MOVQ key+16(FP), DX
+ MOVQ key+16(FP), BX
+ MOVQ len+24(FP), SI
+ VMOVDQU (AX), Y1
+ VMOVDQU 32(AX), Y2
+ VMOVDQU prime_avx<>+0(SB), Y0
+
+accum_large:
+ CMPQ SI, $0x00000400
+ JLE accum
+ VMOVDQU (CX), Y3
+ VMOVDQU 32(CX), Y6
+ PREFETCHT0 512(CX)
+ VPXOR (DX), Y3, Y4
+ VPXOR 32(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 64(CX), Y3
+ VMOVDQU 96(CX), Y6
+ PREFETCHT0 576(CX)
+ VPXOR 8(DX), Y3, Y4
+ VPXOR 40(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 128(CX), Y3
+ VMOVDQU 160(CX), Y6
+ PREFETCHT0 640(CX)
+ VPXOR 16(DX), Y3, Y4
+ VPXOR 48(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 192(CX), Y3
+ VMOVDQU 224(CX), Y6
+ PREFETCHT0 704(CX)
+ VPXOR 24(DX), Y3, Y4
+ VPXOR 56(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 256(CX), Y3
+ VMOVDQU 288(CX), Y6
+ PREFETCHT0 768(CX)
+ VPXOR 32(DX), Y3, Y4
+ VPXOR 64(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 320(CX), Y3
+ VMOVDQU 352(CX), Y6
+ PREFETCHT0 832(CX)
+ VPXOR 40(DX), Y3, Y4
+ VPXOR 72(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 384(CX), Y3
+ VMOVDQU 416(CX), Y6
+ PREFETCHT0 896(CX)
+ VPXOR 48(DX), Y3, Y4
+ VPXOR 80(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 448(CX), Y3
+ VMOVDQU 480(CX), Y6
+ PREFETCHT0 960(CX)
+ VPXOR 56(DX), Y3, Y4
+ VPXOR 88(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 512(CX), Y3
+ VMOVDQU 544(CX), Y6
+ PREFETCHT0 1024(CX)
+ VPXOR 64(DX), Y3, Y4
+ VPXOR 96(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 576(CX), Y3
+ VMOVDQU 608(CX), Y6
+ PREFETCHT0 1088(CX)
+ VPXOR 72(DX), Y3, Y4
+ VPXOR 104(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 640(CX), Y3
+ VMOVDQU 672(CX), Y6
+ PREFETCHT0 1152(CX)
+ VPXOR 80(DX), Y3, Y4
+ VPXOR 112(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 704(CX), Y3
+ VMOVDQU 736(CX), Y6
+ PREFETCHT0 1216(CX)
+ VPXOR 88(DX), Y3, Y4
+ VPXOR 120(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 768(CX), Y3
+ VMOVDQU 800(CX), Y6
+ PREFETCHT0 1280(CX)
+ VPXOR 96(DX), Y3, Y4
+ VPXOR 128(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 832(CX), Y3
+ VMOVDQU 864(CX), Y6
+ PREFETCHT0 1344(CX)
+ VPXOR 104(DX), Y3, Y4
+ VPXOR 136(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 896(CX), Y3
+ VMOVDQU 928(CX), Y6
+ PREFETCHT0 1408(CX)
+ VPXOR 112(DX), Y3, Y4
+ VPXOR 144(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 960(CX), Y3
+ VMOVDQU 992(CX), Y6
+ PREFETCHT0 1472(CX)
+ VPXOR 120(DX), Y3, Y4
+ VPXOR 152(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ ADDQ $0x00000400, CX
+ SUBQ $0x00000400, SI
+ VPSRLQ $0x2f, Y1, Y3
+ VPXOR Y1, Y3, Y3
+ VPXOR 128(DX), Y3, Y3
+ VPMULUDQ Y0, Y3, Y1
+ VPSHUFD $0xf5, Y3, Y3
+ VPMULUDQ Y0, Y3, Y3
+ VPSLLQ $0x20, Y3, Y3
+ VPADDQ Y1, Y3, Y1
+ VPSRLQ $0x2f, Y2, Y3
+ VPXOR Y2, Y3, Y3
+ VPXOR 160(DX), Y3, Y3
+ VPMULUDQ Y0, Y3, Y2
+ VPSHUFD $0xf5, Y3, Y3
+ VPMULUDQ Y0, Y3, Y3
+ VPSLLQ $0x20, Y3, Y3
+ VPADDQ Y2, Y3, Y2
+ JMP accum_large
+
+accum:
+ CMPQ SI, $0x40
+ JLE finalize
+ VMOVDQU (CX), Y0
+ VMOVDQU 32(CX), Y5
+ VPXOR (BX), Y0, Y3
+ VPXOR 32(BX), Y5, Y6
+ VPSHUFD $0x31, Y3, Y4
+ VPSHUFD $0x31, Y6, Y7
+ VPMULUDQ Y3, Y4, Y3
+ VPMULUDQ Y6, Y7, Y6
+ VPSHUFD $0x4e, Y0, Y0
+ VPSHUFD $0x4e, Y5, Y5
+ VPADDQ Y1, Y0, Y1
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y2, Y5, Y2
+ VPADDQ Y2, Y6, Y2
+ ADDQ $0x00000040, CX
+ SUBQ $0x00000040, SI
+ ADDQ $0x00000008, BX
+ JMP accum
+
+finalize:
+ CMPQ SI, $0x00
+ JE return
+ SUBQ $0x40, CX
+ ADDQ SI, CX
+ VMOVDQU (CX), Y0
+ VMOVDQU 32(CX), Y5
+ VPXOR 121(DX), Y0, Y3
+ VPXOR 153(DX), Y5, Y6
+ VPSHUFD $0x31, Y3, Y4
+ VPSHUFD $0x31, Y6, Y7
+ VPMULUDQ Y3, Y4, Y3
+ VPMULUDQ Y6, Y7, Y6
+ VPSHUFD $0x4e, Y0, Y0
+ VPSHUFD $0x4e, Y5, Y5
+ VPADDQ Y1, Y0, Y1
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y2, Y5, Y2
+ VPADDQ Y2, Y6, Y2
+
+return:
+ VMOVDQU Y1, (AX)
+ VMOVDQU Y2, 32(AX)
+ VZEROUPPER
+ RET
+
+// func accumBlockAVX2(acc *[8]uint64, data *byte, key *byte)
+// Requires: AVX, AVX2
+TEXT ·accumBlockAVX2(SB), NOSPLIT, $0-24
+ MOVQ acc+0(FP), AX
+ MOVQ data+8(FP), CX
+ MOVQ key+16(FP), DX
+ VMOVDQU (AX), Y1
+ VMOVDQU 32(AX), Y2
+ VMOVDQU prime_avx<>+0(SB), Y0
+ VMOVDQU (CX), Y3
+ VMOVDQU 32(CX), Y6
+ VPXOR (DX), Y3, Y4
+ VPXOR 32(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 64(CX), Y3
+ VMOVDQU 96(CX), Y6
+ VPXOR 8(DX), Y3, Y4
+ VPXOR 40(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 128(CX), Y3
+ VMOVDQU 160(CX), Y6
+ VPXOR 16(DX), Y3, Y4
+ VPXOR 48(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 192(CX), Y3
+ VMOVDQU 224(CX), Y6
+ VPXOR 24(DX), Y3, Y4
+ VPXOR 56(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 256(CX), Y3
+ VMOVDQU 288(CX), Y6
+ VPXOR 32(DX), Y3, Y4
+ VPXOR 64(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 320(CX), Y3
+ VMOVDQU 352(CX), Y6
+ VPXOR 40(DX), Y3, Y4
+ VPXOR 72(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 384(CX), Y3
+ VMOVDQU 416(CX), Y6
+ VPXOR 48(DX), Y3, Y4
+ VPXOR 80(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 448(CX), Y3
+ VMOVDQU 480(CX), Y6
+ VPXOR 56(DX), Y3, Y4
+ VPXOR 88(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 512(CX), Y3
+ VMOVDQU 544(CX), Y6
+ VPXOR 64(DX), Y3, Y4
+ VPXOR 96(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 576(CX), Y3
+ VMOVDQU 608(CX), Y6
+ VPXOR 72(DX), Y3, Y4
+ VPXOR 104(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 640(CX), Y3
+ VMOVDQU 672(CX), Y6
+ VPXOR 80(DX), Y3, Y4
+ VPXOR 112(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 704(CX), Y3
+ VMOVDQU 736(CX), Y6
+ VPXOR 88(DX), Y3, Y4
+ VPXOR 120(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 768(CX), Y3
+ VMOVDQU 800(CX), Y6
+ VPXOR 96(DX), Y3, Y4
+ VPXOR 128(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 832(CX), Y3
+ VMOVDQU 864(CX), Y6
+ VPXOR 104(DX), Y3, Y4
+ VPXOR 136(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 896(CX), Y3
+ VMOVDQU 928(CX), Y6
+ VPXOR 112(DX), Y3, Y4
+ VPXOR 144(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 960(CX), Y3
+ VMOVDQU 992(CX), Y6
+ VPXOR 120(DX), Y3, Y4
+ VPXOR 152(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VPSRLQ $0x2f, Y1, Y3
+ VPXOR Y1, Y3, Y3
+ VPXOR 128(DX), Y3, Y3
+ VPMULUDQ Y0, Y3, Y1
+ VPSHUFD $0xf5, Y3, Y3
+ VPMULUDQ Y0, Y3, Y3
+ VPSLLQ $0x20, Y3, Y3
+ VPADDQ Y1, Y3, Y1
+ VPSRLQ $0x2f, Y2, Y3
+ VPXOR Y2, Y3, Y3
+ VPXOR 160(DX), Y3, Y3
+ VPMULUDQ Y0, Y3, Y2
+ VPSHUFD $0xf5, Y3, Y3
+ VPMULUDQ Y0, Y3, Y3
+ VPSLLQ $0x20, Y3, Y3
+ VPADDQ Y2, Y3, Y2
+ VMOVDQU Y1, (AX)
+ VMOVDQU Y2, 32(AX)
+ VZEROUPPER
+ RET
diff --git a/vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s b/vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s
new file mode 100644
index 000000000..ba670e560
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s
@@ -0,0 +1,1236 @@
+// Code generated by command: go run gen.go -sse -out ../accum_vector_sse_amd64.s -pkg xxh3. DO NOT EDIT.
+
+#include "textflag.h"
+
+DATA prime_sse<>+0(SB)/4, $0x9e3779b1
+DATA prime_sse<>+4(SB)/4, $0x9e3779b1
+DATA prime_sse<>+8(SB)/4, $0x9e3779b1
+DATA prime_sse<>+12(SB)/4, $0x9e3779b1
+GLOBL prime_sse<>(SB), RODATA|NOPTR, $16
+
+// func accumSSE(acc *[8]uint64, data *byte, key *byte, len uint64)
+// Requires: SSE2
+TEXT ·accumSSE(SB), NOSPLIT, $0-32
+ MOVQ acc+0(FP), AX
+ MOVQ data+8(FP), CX
+ MOVQ key+16(FP), DX
+ MOVQ key+16(FP), BX
+ MOVQ len+24(FP), SI
+ MOVOU (AX), X1
+ MOVOU 16(AX), X2
+ MOVOU 32(AX), X3
+ MOVOU 48(AX), X4
+ MOVOU prime_sse<>+0(SB), X0
+
+accum_large:
+ CMPQ SI, $0x00000400
+ JLE accum
+ MOVOU (CX), X5
+ MOVOU (DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 16(CX), X5
+ MOVOU 16(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 32(CX), X5
+ MOVOU 32(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 48(CX), X5
+ MOVOU 48(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 64(CX), X5
+ MOVOU 8(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 80(CX), X5
+ MOVOU 24(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 96(CX), X5
+ MOVOU 40(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 112(CX), X5
+ MOVOU 56(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 128(CX), X5
+ MOVOU 16(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 144(CX), X5
+ MOVOU 32(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 160(CX), X5
+ MOVOU 48(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 176(CX), X5
+ MOVOU 64(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 192(CX), X5
+ MOVOU 24(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 208(CX), X5
+ MOVOU 40(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 224(CX), X5
+ MOVOU 56(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 240(CX), X5
+ MOVOU 72(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 256(CX), X5
+ MOVOU 32(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 272(CX), X5
+ MOVOU 48(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 288(CX), X5
+ MOVOU 64(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 304(CX), X5
+ MOVOU 80(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 320(CX), X5
+ MOVOU 40(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 336(CX), X5
+ MOVOU 56(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 352(CX), X5
+ MOVOU 72(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 368(CX), X5
+ MOVOU 88(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 384(CX), X5
+ MOVOU 48(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 400(CX), X5
+ MOVOU 64(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 416(CX), X5
+ MOVOU 80(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 432(CX), X5
+ MOVOU 96(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 448(CX), X5
+ MOVOU 56(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 464(CX), X5
+ MOVOU 72(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 480(CX), X5
+ MOVOU 88(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 496(CX), X5
+ MOVOU 104(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 512(CX), X5
+ MOVOU 64(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 528(CX), X5
+ MOVOU 80(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 544(CX), X5
+ MOVOU 96(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 560(CX), X5
+ MOVOU 112(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 576(CX), X5
+ MOVOU 72(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 592(CX), X5
+ MOVOU 88(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 608(CX), X5
+ MOVOU 104(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 624(CX), X5
+ MOVOU 120(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 640(CX), X5
+ MOVOU 80(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 656(CX), X5
+ MOVOU 96(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 672(CX), X5
+ MOVOU 112(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 688(CX), X5
+ MOVOU 128(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 704(CX), X5
+ MOVOU 88(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 720(CX), X5
+ MOVOU 104(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 736(CX), X5
+ MOVOU 120(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 752(CX), X5
+ MOVOU 136(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 768(CX), X5
+ MOVOU 96(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 784(CX), X5
+ MOVOU 112(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 800(CX), X5
+ MOVOU 128(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 816(CX), X5
+ MOVOU 144(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 832(CX), X5
+ MOVOU 104(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 848(CX), X5
+ MOVOU 120(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 864(CX), X5
+ MOVOU 136(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 880(CX), X5
+ MOVOU 152(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 896(CX), X5
+ MOVOU 112(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 912(CX), X5
+ MOVOU 128(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 928(CX), X5
+ MOVOU 144(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 944(CX), X5
+ MOVOU 160(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 960(CX), X5
+ MOVOU 120(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 976(CX), X5
+ MOVOU 136(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 992(CX), X5
+ MOVOU 152(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 1008(CX), X5
+ MOVOU 168(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ ADDQ $0x00000400, CX
+ SUBQ $0x00000400, SI
+ MOVOU X1, X5
+ PSRLQ $0x2f, X5
+ PXOR X5, X1
+ MOVOU 128(DX), X5
+ PXOR X5, X1
+ PSHUFD $0xf5, X1, X5
+ PMULULQ X0, X1
+ PMULULQ X0, X5
+ PSLLQ $0x20, X5
+ PADDQ X5, X1
+ MOVOU X2, X5
+ PSRLQ $0x2f, X5
+ PXOR X5, X2
+ MOVOU 144(DX), X5
+ PXOR X5, X2
+ PSHUFD $0xf5, X2, X5
+ PMULULQ X0, X2
+ PMULULQ X0, X5
+ PSLLQ $0x20, X5
+ PADDQ X5, X2
+ MOVOU X3, X5
+ PSRLQ $0x2f, X5
+ PXOR X5, X3
+ MOVOU 160(DX), X5
+ PXOR X5, X3
+ PSHUFD $0xf5, X3, X5
+ PMULULQ X0, X3
+ PMULULQ X0, X5
+ PSLLQ $0x20, X5
+ PADDQ X5, X3
+ MOVOU X4, X5
+ PSRLQ $0x2f, X5
+ PXOR X5, X4
+ MOVOU 176(DX), X5
+ PXOR X5, X4
+ PSHUFD $0xf5, X4, X5
+ PMULULQ X0, X4
+ PMULULQ X0, X5
+ PSLLQ $0x20, X5
+ PADDQ X5, X4
+ JMP accum_large
+
+accum:
+ CMPQ SI, $0x40
+ JLE finalize
+ MOVOU (CX), X0
+ MOVOU (BX), X5
+ PXOR X0, X5
+ PSHUFD $0x31, X5, X6
+ PMULULQ X5, X6
+ PSHUFD $0x4e, X0, X0
+ PADDQ X0, X1
+ PADDQ X6, X1
+ MOVOU 16(CX), X0
+ MOVOU 16(BX), X5
+ PXOR X0, X5
+ PSHUFD $0x31, X5, X6
+ PMULULQ X5, X6
+ PSHUFD $0x4e, X0, X0
+ PADDQ X0, X2
+ PADDQ X6, X2
+ MOVOU 32(CX), X0
+ MOVOU 32(BX), X5
+ PXOR X0, X5
+ PSHUFD $0x31, X5, X6
+ PMULULQ X5, X6
+ PSHUFD $0x4e, X0, X0
+ PADDQ X0, X3
+ PADDQ X6, X3
+ MOVOU 48(CX), X0
+ MOVOU 48(BX), X5
+ PXOR X0, X5
+ PSHUFD $0x31, X5, X6
+ PMULULQ X5, X6
+ PSHUFD $0x4e, X0, X0
+ PADDQ X0, X4
+ PADDQ X6, X4
+ ADDQ $0x00000040, CX
+ SUBQ $0x00000040, SI
+ ADDQ $0x00000008, BX
+ JMP accum
+
+finalize:
+ CMPQ SI, $0x00
+ JE return
+ SUBQ $0x40, CX
+ ADDQ SI, CX
+ MOVOU (CX), X0
+ MOVOU 121(DX), X5
+ PXOR X0, X5
+ PSHUFD $0x31, X5, X6
+ PMULULQ X5, X6
+ PSHUFD $0x4e, X0, X0
+ PADDQ X0, X1
+ PADDQ X6, X1
+ MOVOU 16(CX), X0
+ MOVOU 137(DX), X5
+ PXOR X0, X5
+ PSHUFD $0x31, X5, X6
+ PMULULQ X5, X6
+ PSHUFD $0x4e, X0, X0
+ PADDQ X0, X2
+ PADDQ X6, X2
+ MOVOU 32(CX), X0
+ MOVOU 153(DX), X5
+ PXOR X0, X5
+ PSHUFD $0x31, X5, X6
+ PMULULQ X5, X6
+ PSHUFD $0x4e, X0, X0
+ PADDQ X0, X3
+ PADDQ X6, X3
+ MOVOU 48(CX), X0
+ MOVOU 169(DX), X5
+ PXOR X0, X5
+ PSHUFD $0x31, X5, X6
+ PMULULQ X5, X6
+ PSHUFD $0x4e, X0, X0
+ PADDQ X0, X4
+ PADDQ X6, X4
+
+return:
+ MOVOU X1, (AX)
+ MOVOU X2, 16(AX)
+ MOVOU X3, 32(AX)
+ MOVOU X4, 48(AX)
+ RET
+
+// func accumBlockSSE(acc *[8]uint64, data *byte, key *byte)
+// Requires: SSE2
+TEXT ·accumBlockSSE(SB), NOSPLIT, $0-24
+ MOVQ acc+0(FP), AX
+ MOVQ data+8(FP), CX
+ MOVQ key+16(FP), DX
+ MOVOU (AX), X1
+ MOVOU 16(AX), X2
+ MOVOU 32(AX), X3
+ MOVOU 48(AX), X4
+ MOVOU prime_sse<>+0(SB), X0
+ MOVOU (CX), X5
+ MOVOU (DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 16(CX), X5
+ MOVOU 16(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 32(CX), X5
+ MOVOU 32(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 48(CX), X5
+ MOVOU 48(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 64(CX), X5
+ MOVOU 8(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 80(CX), X5
+ MOVOU 24(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 96(CX), X5
+ MOVOU 40(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 112(CX), X5
+ MOVOU 56(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 128(CX), X5
+ MOVOU 16(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 144(CX), X5
+ MOVOU 32(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 160(CX), X5
+ MOVOU 48(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 176(CX), X5
+ MOVOU 64(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 192(CX), X5
+ MOVOU 24(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 208(CX), X5
+ MOVOU 40(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 224(CX), X5
+ MOVOU 56(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 240(CX), X5
+ MOVOU 72(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 256(CX), X5
+ MOVOU 32(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 272(CX), X5
+ MOVOU 48(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 288(CX), X5
+ MOVOU 64(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 304(CX), X5
+ MOVOU 80(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 320(CX), X5
+ MOVOU 40(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 336(CX), X5
+ MOVOU 56(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 352(CX), X5
+ MOVOU 72(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 368(CX), X5
+ MOVOU 88(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 384(CX), X5
+ MOVOU 48(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 400(CX), X5
+ MOVOU 64(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 416(CX), X5
+ MOVOU 80(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 432(CX), X5
+ MOVOU 96(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 448(CX), X5
+ MOVOU 56(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 464(CX), X5
+ MOVOU 72(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 480(CX), X5
+ MOVOU 88(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 496(CX), X5
+ MOVOU 104(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 512(CX), X5
+ MOVOU 64(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 528(CX), X5
+ MOVOU 80(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 544(CX), X5
+ MOVOU 96(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 560(CX), X5
+ MOVOU 112(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 576(CX), X5
+ MOVOU 72(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 592(CX), X5
+ MOVOU 88(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 608(CX), X5
+ MOVOU 104(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 624(CX), X5
+ MOVOU 120(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 640(CX), X5
+ MOVOU 80(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 656(CX), X5
+ MOVOU 96(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 672(CX), X5
+ MOVOU 112(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 688(CX), X5
+ MOVOU 128(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 704(CX), X5
+ MOVOU 88(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 720(CX), X5
+ MOVOU 104(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 736(CX), X5
+ MOVOU 120(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 752(CX), X5
+ MOVOU 136(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 768(CX), X5
+ MOVOU 96(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 784(CX), X5
+ MOVOU 112(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 800(CX), X5
+ MOVOU 128(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 816(CX), X5
+ MOVOU 144(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 832(CX), X5
+ MOVOU 104(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 848(CX), X5
+ MOVOU 120(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 864(CX), X5
+ MOVOU 136(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 880(CX), X5
+ MOVOU 152(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 896(CX), X5
+ MOVOU 112(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 912(CX), X5
+ MOVOU 128(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 928(CX), X5
+ MOVOU 144(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 944(CX), X5
+ MOVOU 160(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 960(CX), X5
+ MOVOU 120(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 976(CX), X5
+ MOVOU 136(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 992(CX), X5
+ MOVOU 152(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 1008(CX), X5
+ MOVOU 168(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
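+	// End-of-block scramble: acc = (acc ^ acc>>47 ^ key[128:192]) * PRIME32_1.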
+ MOVOU X1, X5
+ PSRLQ $0x2f, X5
+ PXOR X5, X1
+ MOVOU 128(DX), X5
+ PXOR X5, X1
+ PSHUFD $0xf5, X1, X5
+ PMULULQ X0, X1
+ PMULULQ X0, X5
+ PSLLQ $0x20, X5
+ PADDQ X5, X1
+ MOVOU X2, X5
+ PSRLQ $0x2f, X5
+ PXOR X5, X2
+ MOVOU 144(DX), X5
+ PXOR X5, X2
+ PSHUFD $0xf5, X2, X5
+ PMULULQ X0, X2
+ PMULULQ X0, X5
+ PSLLQ $0x20, X5
+ PADDQ X5, X2
+ MOVOU X3, X5
+ PSRLQ $0x2f, X5
+ PXOR X5, X3
+ MOVOU 160(DX), X5
+ PXOR X5, X3
+ PSHUFD $0xf5, X3, X5
+ PMULULQ X0, X3
+ PMULULQ X0, X5
+ PSLLQ $0x20, X5
+ PADDQ X5, X3
+ MOVOU X4, X5
+ PSRLQ $0x2f, X5
+ PXOR X5, X4
+ MOVOU 176(DX), X5
+ PXOR X5, X4
+ PSHUFD $0xf5, X4, X5
+ PMULULQ X0, X4
+ PMULULQ X0, X5
+ PSLLQ $0x20, X5
+ PADDQ X5, X4
+ MOVOU X1, (AX)
+ MOVOU X2, 16(AX)
+ MOVOU X3, 32(AX)
+ MOVOU X4, 48(AX)
+ RET
diff --git a/vendor/github.com/zeebo/xxh3/consts.go b/vendor/github.com/zeebo/xxh3/consts.go
new file mode 100644
index 000000000..39ef6e179
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/consts.go
@@ -0,0 +1,97 @@
+package xxh3
+
+const (
+ _stripe = 64
+ _block = 1024
+
+ prime32_1 = 2654435761
+ prime32_2 = 2246822519
+ prime32_3 = 3266489917
+
+ prime64_1 = 11400714785074694791
+ prime64_2 = 14029467366897019727
+ prime64_3 = 1609587929392839161
+ prime64_4 = 9650029242287828579
+ prime64_5 = 2870177450012600261
+)
+
+var key = ptr(&[...]u8{
+ 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe /* 8 */, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, /* 16 */
+ 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb /* 24 */, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, /* 32 */
+ 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78 /* 40 */, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, /* 48 */
+ 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e /* 56 */, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, /* 64 */
+ 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb /* 72 */, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, /* 80 */
+ 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e /* 88 */, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, /* 96 */
+ 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f /* 104 */, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, /* 112 */
+ 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31 /* 120 */, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, /* 128 */
+ 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3 /* 136 */, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, /* 144 */
+ 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49 /* 152 */, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, /* 160 */
+ 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc /* 168 */, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, /* 176 */
+ 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28 /* 184 */, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, /* 192 */
+})
+
+const (
+ key64_000 u64 = 0xbe4ba423396cfeb8
+ key64_008 u64 = 0x1cad21f72c81017c
+ key64_016 u64 = 0xdb979083e96dd4de
+ key64_024 u64 = 0x1f67b3b7a4a44072
+ key64_032 u64 = 0x78e5c0cc4ee679cb
+ key64_040 u64 = 0x2172ffcc7dd05a82
+ key64_048 u64 = 0x8e2443f7744608b8
+ key64_056 u64 = 0x4c263a81e69035e0
+ key64_064 u64 = 0xcb00c391bb52283c
+ key64_072 u64 = 0xa32e531b8b65d088
+ key64_080 u64 = 0x4ef90da297486471
+ key64_088 u64 = 0xd8acdea946ef1938
+ key64_096 u64 = 0x3f349ce33f76faa8
+ key64_104 u64 = 0x1d4f0bc7c7bbdcf9
+ key64_112 u64 = 0x3159b4cd4be0518a
+ key64_120 u64 = 0x647378d9c97e9fc8
+ key64_128 u64 = 0xc3ebd33483acc5ea
+ key64_136 u64 = 0xeb6313faffa081c5
+ key64_144 u64 = 0x49daf0b751dd0d17
+ key64_152 u64 = 0x9e68d429265516d3
+ key64_160 u64 = 0xfca1477d58be162b
+ key64_168 u64 = 0xce31d07ad1b8f88f
+ key64_176 u64 = 0x280416958f3acb45
+ key64_184 u64 = 0x7e404bbbcafbd7af
+
+ key64_103 u64 = 0x4f0bc7c7bbdcf93f
+ key64_111 u64 = 0x59b4cd4be0518a1d
+ key64_119 u64 = 0x7378d9c97e9fc831
+ key64_127 u64 = 0xebd33483acc5ea64
+
+ key64_121 u64 = 0xea647378d9c97e9f
+ key64_129 u64 = 0xc5c3ebd33483acc5
+ key64_137 u64 = 0x17eb6313faffa081
+ key64_145 u64 = 0xd349daf0b751dd0d
+ key64_153 u64 = 0x2b9e68d429265516
+ key64_161 u64 = 0x8ffca1477d58be16
+ key64_169 u64 = 0x45ce31d07ad1b8f8
+ key64_177 u64 = 0xaf280416958f3acb
+
+ key64_011 = 0x6dd4de1cad21f72c
+ key64_019 = 0xa44072db979083e9
+ key64_027 = 0xe679cb1f67b3b7a4
+ key64_035 = 0xd05a8278e5c0cc4e
+ key64_043 = 0x4608b82172ffcc7d
+ key64_051 = 0x9035e08e2443f774
+ key64_059 = 0x52283c4c263a81e6
+ key64_067 = 0x65d088cb00c391bb
+
+ key64_117 = 0xd9c97e9fc83159b4
+ key64_125 = 0x3483acc5ea647378
+ key64_133 = 0xfaffa081c5c3ebd3
+ key64_141 = 0xb751dd0d17eb6313
+ key64_149 = 0x29265516d349daf0
+ key64_157 = 0x7d58be162b9e68d4
+ key64_165 = 0x7ad1b8f88ffca147
+ key64_173 = 0x958f3acb45ce31d0
+)
+
+const (
+ key32_000 u32 = 0xbe4ba423
+ key32_004 u32 = 0x396cfeb8
+ key32_008 u32 = 0x1cad21f7
+ key32_012 u32 = 0x2c81017c
+)
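+
+// The key64_NNN and key32_NNN constants above are precomputed little-endian
+// reads of the key array at byte offset NNN, e.g. key64_000 == readU64(key, 0)
+// and key64_011 == readU64(key, 11).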
diff --git a/vendor/github.com/zeebo/xxh3/hash128.go b/vendor/github.com/zeebo/xxh3/hash128.go
new file mode 100644
index 000000000..0040a21bb
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/hash128.go
@@ -0,0 +1,253 @@
+package xxh3
+
+import (
+ "math/bits"
+)
+
+// Hash128 returns the 128-bit hash of the byte slice.
+func Hash128(b []byte) Uint128 {
+ return hashAny128(*(*str)(ptr(&b)))
+}
+
+// HashString128 returns the 128-bit hash of the string.
+func HashString128(s string) Uint128 {
+ return hashAny128(*(*str)(ptr(&s)))
+}
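+
+// Illustrative usage sketch (not part of the upstream API surface):
+//
+//	u := Hash128([]byte("data"))
+//	fmt.Printf("%016x%016x\n", u.Hi, u.Lo) // canonical big-endian rendering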
+
+func hashAny128(s str) (acc u128) {
+ p, l := s.p, s.l
+
+ switch {
+ case l <= 16:
+ switch {
+ case l > 8: // 9-16
+ const bitflipl = key64_032 ^ key64_040
+ const bitfliph = key64_048 ^ key64_056
+
+ input_lo := readU64(p, 0)
+ input_hi := readU64(p, ui(l)-8)
+
+ m128_h, m128_l := bits.Mul64(input_lo^input_hi^bitflipl, prime64_1)
+
+ m128_l += uint64(l-1) << 54
+ input_hi ^= bitfliph
+
+ m128_h += input_hi + uint64(uint32(input_hi))*(prime32_2-1)
+
+ m128_l ^= bits.ReverseBytes64(m128_h)
+
+ acc.Hi, acc.Lo = bits.Mul64(m128_l, prime64_2)
+ acc.Hi += m128_h * prime64_2
+
+ acc.Lo = xxh3Avalanche(acc.Lo)
+ acc.Hi = xxh3Avalanche(acc.Hi)
+
+ return acc
+
+ case l > 3: // 4-8
+ const bitflip = key64_016 ^ key64_024
+
+ input_lo := readU32(p, 0)
+ input_hi := readU32(p, ui(l)-4)
+ input_64 := u64(input_lo) + u64(input_hi)<<32
+ keyed := input_64 ^ bitflip
+
+ acc.Hi, acc.Lo = bits.Mul64(keyed, prime64_1+(uint64(l)<<2))
+
+ acc.Hi += acc.Lo << 1
+ acc.Lo ^= acc.Hi >> 3
+
+ acc.Lo ^= acc.Lo >> 35
+ acc.Lo *= 0x9fb21c651e98df25
+ acc.Lo ^= acc.Lo >> 28
+ acc.Hi = xxh3Avalanche(acc.Hi)
+
+ return acc
+
+ case l == 3: // 3
+ c12 := u64(readU16(p, 0))
+ c3 := u64(readU8(p, 2))
+ acc.Lo = c12<<16 + c3 + 3<<8
+
+ case l > 1: // 2
+ c12 := u64(readU16(p, 0))
+ acc.Lo = c12*(1<<24+1)>>8 + 2<<8
+
+ case l == 1: // 1
+ c1 := u64(readU8(p, 0))
+ acc.Lo = c1*(1<<24+1<<16+1) + 1<<8
+
+ default: // 0
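+			// Precomputed seed == 0 case of Hash128Seed's empty-input path:
+			// Hi = xxh64_avalanche(key64_080 ^ key64_088),
+			// Lo = xxh64_avalanche(key64_064 ^ key64_072).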
+ return u128{0x99aa06d3014798d8, 0x6001c324468d497f}
+ }
+
+ acc.Hi = uint64(bits.RotateLeft32(bits.ReverseBytes32(uint32(acc.Lo)), 13))
+ acc.Lo ^= uint64(key32_000 ^ key32_004)
+ acc.Hi ^= uint64(key32_008 ^ key32_012)
+
+ acc.Lo = xxh64AvalancheSmall(acc.Lo)
+ acc.Hi = xxh64AvalancheSmall(acc.Hi)
+
+ return acc
+
+ case l <= 128:
+ acc.Lo = u64(l) * prime64_1
+
+ if l > 32 {
+ if l > 64 {
+ if l > 96 {
+ in8, in7 := readU64(p, ui(l)-8*8), readU64(p, ui(l)-7*8)
+ i6, i7 := readU64(p, 6*8), readU64(p, 7*8)
+
+ acc.Hi += mulFold64(in8^key64_112, in7^key64_120)
+ acc.Hi ^= i6 + i7
+ acc.Lo += mulFold64(i6^key64_096, i7^key64_104)
+ acc.Lo ^= in8 + in7
+
+ } // 96
+
+ in6, in5 := readU64(p, ui(l)-6*8), readU64(p, ui(l)-5*8)
+ i4, i5 := readU64(p, 4*8), readU64(p, 5*8)
+
+ acc.Hi += mulFold64(in6^key64_080, in5^key64_088)
+ acc.Hi ^= i4 + i5
+ acc.Lo += mulFold64(i4^key64_064, i5^key64_072)
+ acc.Lo ^= in6 + in5
+
+ } // 64
+
+ in4, in3 := readU64(p, ui(l)-4*8), readU64(p, ui(l)-3*8)
+ i2, i3 := readU64(p, 2*8), readU64(p, 3*8)
+
+ acc.Hi += mulFold64(in4^key64_048, in3^key64_056)
+ acc.Hi ^= i2 + i3
+ acc.Lo += mulFold64(i2^key64_032, i3^key64_040)
+ acc.Lo ^= in4 + in3
+
+ } // 32
+
+ in2, in1 := readU64(p, ui(l)-2*8), readU64(p, ui(l)-1*8)
+ i0, i1 := readU64(p, 0*8), readU64(p, 1*8)
+
+ acc.Hi += mulFold64(in2^key64_016, in1^key64_024)
+ acc.Hi ^= i0 + i1
+ acc.Lo += mulFold64(i0^key64_000, i1^key64_008)
+ acc.Lo ^= in2 + in1
+
+ acc.Hi, acc.Lo = (acc.Lo*prime64_1)+(acc.Hi*prime64_4)+(u64(l)*prime64_2), acc.Hi+acc.Lo
+
+ acc.Hi = -xxh3Avalanche(acc.Hi)
+ acc.Lo = xxh3Avalanche(acc.Lo)
+
+ return acc
+
+ case l <= 240:
+ acc.Lo = u64(l) * prime64_1
+
+ {
+ i0, i1, i2, i3 := readU64(p, 0*8), readU64(p, 1*8), readU64(p, 2*8), readU64(p, 3*8)
+
+ acc.Hi += mulFold64(i2^key64_016, i3^key64_024)
+ acc.Hi ^= i0 + i1
+ acc.Lo += mulFold64(i0^key64_000, i1^key64_008)
+ acc.Lo ^= i2 + i3
+ }
+
+ {
+ i0, i1, i2, i3 := readU64(p, 4*8), readU64(p, 5*8), readU64(p, 6*8), readU64(p, 7*8)
+
+ acc.Hi += mulFold64(i2^key64_048, i3^key64_056)
+ acc.Hi ^= i0 + i1
+ acc.Lo += mulFold64(i0^key64_032, i1^key64_040)
+ acc.Lo ^= i2 + i3
+ }
+
+ {
+ i0, i1, i2, i3 := readU64(p, 8*8), readU64(p, 9*8), readU64(p, 10*8), readU64(p, 11*8)
+
+ acc.Hi += mulFold64(i2^key64_080, i3^key64_088)
+ acc.Hi ^= i0 + i1
+ acc.Lo += mulFold64(i0^key64_064, i1^key64_072)
+ acc.Lo ^= i2 + i3
+ }
+
+ {
+ i0, i1, i2, i3 := readU64(p, 12*8), readU64(p, 13*8), readU64(p, 14*8), readU64(p, 15*8)
+
+ acc.Hi += mulFold64(i2^key64_112, i3^key64_120)
+ acc.Hi ^= i0 + i1
+ acc.Lo += mulFold64(i0^key64_096, i1^key64_104)
+ acc.Lo ^= i2 + i3
+ }
+
+ // avalanche
+ acc.Hi = xxh3Avalanche(acc.Hi)
+ acc.Lo = xxh3Avalanche(acc.Lo)
+
+ // trailing groups after 128
+ top := ui(l) &^ 31
+ for i := ui(4 * 32); i < top; i += 32 {
+ i0, i1, i2, i3 := readU64(p, i+0), readU64(p, i+8), readU64(p, i+16), readU64(p, i+24)
+ k0, k1, k2, k3 := readU64(key, i-125), readU64(key, i-117), readU64(key, i-109), readU64(key, i-101)
+
+ acc.Hi += mulFold64(i2^k2, i3^k3)
+ acc.Hi ^= i0 + i1
+ acc.Lo += mulFold64(i0^k0, i1^k1)
+ acc.Lo ^= i2 + i3
+ }
+
+ // last 32 bytes
+ {
+ i0, i1, i2, i3 := readU64(p, ui(l)-32), readU64(p, ui(l)-24), readU64(p, ui(l)-16), readU64(p, ui(l)-8)
+
+ acc.Hi += mulFold64(i0^key64_119, i1^key64_127)
+ acc.Hi ^= i2 + i3
+ acc.Lo += mulFold64(i2^key64_103, i3^key64_111)
+ acc.Lo ^= i0 + i1
+ }
+
+ acc.Hi, acc.Lo = (acc.Lo*prime64_1)+(acc.Hi*prime64_4)+(u64(l)*prime64_2), acc.Hi+acc.Lo
+
+ acc.Hi = -xxh3Avalanche(acc.Hi)
+ acc.Lo = xxh3Avalanche(acc.Lo)
+
+ return acc
+
+ default:
+ acc.Lo = u64(l) * prime64_1
+ acc.Hi = ^(u64(l) * prime64_2)
+
+ accs := [8]u64{
+ prime32_3, prime64_1, prime64_2, prime64_3,
+ prime64_4, prime32_2, prime64_5, prime32_1,
+ }
+
+ if hasAVX512 && l >= avx512Switch {
+ accumAVX512(&accs, p, key, u64(l))
+ } else if hasAVX2 {
+ accumAVX2(&accs, p, key, u64(l))
+ } else if hasSSE2 {
+ accumSSE(&accs, p, key, u64(l))
+ } else {
+ accumScalar(&accs, p, key, u64(l))
+ }
+
+ // merge accs
+ acc.Lo += mulFold64(accs[0]^key64_011, accs[1]^key64_019)
+ acc.Hi += mulFold64(accs[0]^key64_117, accs[1]^key64_125)
+
+ acc.Lo += mulFold64(accs[2]^key64_027, accs[3]^key64_035)
+ acc.Hi += mulFold64(accs[2]^key64_133, accs[3]^key64_141)
+
+ acc.Lo += mulFold64(accs[4]^key64_043, accs[5]^key64_051)
+ acc.Hi += mulFold64(accs[4]^key64_149, accs[5]^key64_157)
+
+ acc.Lo += mulFold64(accs[6]^key64_059, accs[7]^key64_067)
+ acc.Hi += mulFold64(accs[6]^key64_165, accs[7]^key64_173)
+
+ acc.Lo = xxh3Avalanche(acc.Lo)
+ acc.Hi = xxh3Avalanche(acc.Hi)
+
+ return acc
+ }
+}
diff --git a/vendor/github.com/zeebo/xxh3/hash128_seed.go b/vendor/github.com/zeebo/xxh3/hash128_seed.go
new file mode 100644
index 000000000..358009be3
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/hash128_seed.go
@@ -0,0 +1,264 @@
+package xxh3
+
+import (
+ "math/bits"
+)
+
+// Hash128Seed returns the 128-bit hash of the byte slice with the given seed.
+func Hash128Seed(b []byte, seed uint64) Uint128 {
+ return hashAny128Seed(*(*str)(ptr(&b)), seed)
+}
+
+// HashString128Seed returns the 128-bit hash of the string with the given seed.
+func HashString128Seed(s string, seed uint64) Uint128 {
+ return hashAny128Seed(*(*str)(ptr(&s)), seed)
+}
+
+func hashAny128Seed(s str, seed uint64) (acc u128) {
+ p, l := s.p, s.l
+
+ switch {
+ case l <= 16:
+ switch {
+ case l > 8: // 9-16
+ bitflipl := (key64_032 ^ key64_040) - seed
+ bitfliph := (key64_048 ^ key64_056) + seed
+
+ input_lo := readU64(p, 0)
+ input_hi := readU64(p, ui(l)-8)
+
+ m128_h, m128_l := bits.Mul64(input_lo^input_hi^bitflipl, prime64_1)
+
+ m128_l += uint64(l-1) << 54
+ input_hi ^= bitfliph
+
+ m128_h += input_hi + uint64(uint32(input_hi))*(prime32_2-1)
+
+ m128_l ^= bits.ReverseBytes64(m128_h)
+
+ acc.Hi, acc.Lo = bits.Mul64(m128_l, prime64_2)
+ acc.Hi += m128_h * prime64_2
+
+ acc.Lo = xxh3Avalanche(acc.Lo)
+ acc.Hi = xxh3Avalanche(acc.Hi)
+
+ return acc
+
+ case l > 3: // 4-8
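+			// XXH3's 4-8 byte seed tweak: fold the byteswapped low 32 bits
+			// of the seed into its high 32 bits.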
+ seed ^= u64(bits.ReverseBytes32(u32(seed))) << 32
+ bitflip := (key64_016 ^ key64_024) + seed
+ input_lo := readU32(p, 0)
+ input_hi := readU32(p, ui(l)-4)
+ input_64 := u64(input_lo) + u64(input_hi)<<32
+ keyed := input_64 ^ bitflip
+
+ acc.Hi, acc.Lo = bits.Mul64(keyed, prime64_1+(uint64(l)<<2))
+
+ acc.Hi += acc.Lo << 1
+ acc.Lo ^= acc.Hi >> 3
+
+ acc.Lo ^= acc.Lo >> 35
+ acc.Lo *= 0x9fb21c651e98df25
+ acc.Lo ^= acc.Lo >> 28
+ acc.Hi = xxh3Avalanche(acc.Hi)
+
+ return acc
+
+ case l == 3: // 3
+ c12 := u64(readU16(p, 0))
+ c3 := u64(readU8(p, 2))
+ acc.Lo = c12<<16 + c3 + 3<<8
+
+ case l > 1: // 2
+ c12 := u64(readU16(p, 0))
+ acc.Lo = c12*(1<<24+1)>>8 + 2<<8
+
+ case l == 1: // 1
+ c1 := u64(readU8(p, 0))
+ acc.Lo = c1*(1<<24+1<<16+1) + 1<<8
+
+ default: // 0
+ bitflipl := key64_064 ^ key64_072 ^ seed
+ bitfliph := key64_080 ^ key64_088 ^ seed
+ return u128{Lo: xxh64AvalancheFull(bitflipl), Hi: xxh64AvalancheFull(bitfliph)}
+ }
+
+ acc.Hi = uint64(bits.RotateLeft32(bits.ReverseBytes32(uint32(acc.Lo)), 13))
+ acc.Lo ^= uint64(key32_000^key32_004) + seed
+ acc.Hi ^= uint64(key32_008^key32_012) - seed
+
+ acc.Lo = xxh64AvalancheFull(acc.Lo)
+ acc.Hi = xxh64AvalancheFull(acc.Hi)
+
+ return acc
+
+ case l <= 128:
+ acc.Lo = u64(l) * prime64_1
+
+ if l > 32 {
+ if l > 64 {
+ if l > 96 {
+ in8, in7 := readU64(p, ui(l)-8*8), readU64(p, ui(l)-7*8)
+ i6, i7 := readU64(p, 6*8), readU64(p, 7*8)
+
+ acc.Hi += mulFold64(in8^(key64_112+seed), in7^(key64_120-seed))
+ acc.Hi ^= i6 + i7
+ acc.Lo += mulFold64(i6^(key64_096+seed), i7^(key64_104-seed))
+ acc.Lo ^= in8 + in7
+
+ } // 96
+
+ in6, in5 := readU64(p, ui(l)-6*8), readU64(p, ui(l)-5*8)
+ i4, i5 := readU64(p, 4*8), readU64(p, 5*8)
+
+ acc.Hi += mulFold64(in6^(key64_080+seed), in5^(key64_088-seed))
+ acc.Hi ^= i4 + i5
+ acc.Lo += mulFold64(i4^(key64_064+seed), i5^(key64_072-seed))
+ acc.Lo ^= in6 + in5
+
+ } // 64
+
+ in4, in3 := readU64(p, ui(l)-4*8), readU64(p, ui(l)-3*8)
+ i2, i3 := readU64(p, 2*8), readU64(p, 3*8)
+
+ acc.Hi += mulFold64(in4^(key64_048+seed), in3^(key64_056-seed))
+ acc.Hi ^= i2 + i3
+ acc.Lo += mulFold64(i2^(key64_032+seed), i3^(key64_040-seed))
+ acc.Lo ^= in4 + in3
+
+ } // 32
+
+ in2, in1 := readU64(p, ui(l)-2*8), readU64(p, ui(l)-1*8)
+ i0, i1 := readU64(p, 0*8), readU64(p, 1*8)
+
+ acc.Hi += mulFold64(in2^(key64_016+seed), in1^(key64_024-seed))
+ acc.Hi ^= i0 + i1
+ acc.Lo += mulFold64(i0^(key64_000+seed), i1^(key64_008-seed))
+ acc.Lo ^= in2 + in1
+
+ acc.Hi, acc.Lo = (acc.Lo*prime64_1)+(acc.Hi*prime64_4)+((u64(l)-seed)*prime64_2), acc.Hi+acc.Lo
+
+ acc.Hi = -xxh3Avalanche(acc.Hi)
+ acc.Lo = xxh3Avalanche(acc.Lo)
+
+ return acc
+
+ case l <= 240:
+ acc.Lo = u64(l) * prime64_1
+
+ {
+ i0, i1, i2, i3 := readU64(p, 0*8), readU64(p, 1*8), readU64(p, 2*8), readU64(p, 3*8)
+
+ acc.Hi += mulFold64(i2^(key64_016+seed), i3^(key64_024-seed))
+ acc.Hi ^= i0 + i1
+ acc.Lo += mulFold64(i0^(key64_000+seed), i1^(key64_008-seed))
+ acc.Lo ^= i2 + i3
+ }
+
+ {
+ i0, i1, i2, i3 := readU64(p, 4*8), readU64(p, 5*8), readU64(p, 6*8), readU64(p, 7*8)
+
+ acc.Hi += mulFold64(i2^(key64_048+seed), i3^(key64_056-seed))
+ acc.Hi ^= i0 + i1
+ acc.Lo += mulFold64(i0^(key64_032+seed), i1^(key64_040-seed))
+ acc.Lo ^= i2 + i3
+ }
+
+ {
+ i0, i1, i2, i3 := readU64(p, 8*8), readU64(p, 9*8), readU64(p, 10*8), readU64(p, 11*8)
+
+ acc.Hi += mulFold64(i2^(key64_080+seed), i3^(key64_088-seed))
+ acc.Hi ^= i0 + i1
+ acc.Lo += mulFold64(i0^(key64_064+seed), i1^(key64_072-seed))
+ acc.Lo ^= i2 + i3
+ }
+
+ {
+ i0, i1, i2, i3 := readU64(p, 12*8), readU64(p, 13*8), readU64(p, 14*8), readU64(p, 15*8)
+
+ acc.Hi += mulFold64(i2^(key64_112+seed), i3^(key64_120-seed))
+ acc.Hi ^= i0 + i1
+ acc.Lo += mulFold64(i0^(key64_096+seed), i1^(key64_104-seed))
+ acc.Lo ^= i2 + i3
+ }
+
+ // avalanche
+ acc.Hi = xxh3Avalanche(acc.Hi)
+ acc.Lo = xxh3Avalanche(acc.Lo)
+
+ // trailing groups after 128
+ top := ui(l) &^ 31
+ for i := ui(4 * 32); i < top; i += 32 {
+ i0, i1, i2, i3 := readU64(p, i+0), readU64(p, i+8), readU64(p, i+16), readU64(p, i+24)
+ k0, k1, k2, k3 := readU64(key, i-125)+seed, readU64(key, i-117)-seed, readU64(key, i-109)+seed, readU64(key, i-101)-seed
+
+ acc.Hi += mulFold64(i2^k2, i3^k3)
+ acc.Hi ^= i0 + i1
+ acc.Lo += mulFold64(i0^k0, i1^k1)
+ acc.Lo ^= i2 + i3
+ }
+
+ // last 32 bytes
+ {
+ i0, i1, i2, i3 := readU64(p, ui(l)-32), readU64(p, ui(l)-24), readU64(p, ui(l)-16), readU64(p, ui(l)-8)
+
+ seed := 0 - seed
+ acc.Hi += mulFold64(i0^(key64_119+seed), i1^(key64_127-seed))
+ acc.Hi ^= i2 + i3
+ acc.Lo += mulFold64(i2^(key64_103+seed), i3^(key64_111-seed))
+ acc.Lo ^= i0 + i1
+ }
+
+ acc.Hi, acc.Lo = (acc.Lo*prime64_1)+(acc.Hi*prime64_4)+((u64(l)-seed)*prime64_2), acc.Hi+acc.Lo
+
+ acc.Hi = -xxh3Avalanche(acc.Hi)
+ acc.Lo = xxh3Avalanche(acc.Lo)
+
+ return acc
+
+ default:
+ acc.Lo = u64(l) * prime64_1
+ acc.Hi = ^(u64(l) * prime64_2)
+
+ secret := key
+ if seed != 0 {
+ secret = ptr(&[secretSize]byte{})
+ initSecret(secret, seed)
+ }
+
+ accs := [8]u64{
+ prime32_3, prime64_1, prime64_2, prime64_3,
+ prime64_4, prime32_2, prime64_5, prime32_1,
+ }
+
+ if hasAVX512 && l >= avx512Switch {
+ accumAVX512(&accs, p, secret, u64(l))
+ } else if hasAVX2 {
+ accumAVX2(&accs, p, secret, u64(l))
+ } else if hasSSE2 {
+ accumSSE(&accs, p, secret, u64(l))
+ } else {
+ accumScalar(&accs, p, secret, u64(l))
+ }
+
+ // merge accs
+ const hi_off = 117 - 11
+
+ acc.Lo += mulFold64(accs[0]^readU64(secret, 11), accs[1]^readU64(secret, 19))
+ acc.Hi += mulFold64(accs[0]^readU64(secret, 11+hi_off), accs[1]^readU64(secret, 19+hi_off))
+
+ acc.Lo += mulFold64(accs[2]^readU64(secret, 27), accs[3]^readU64(secret, 35))
+ acc.Hi += mulFold64(accs[2]^readU64(secret, 27+hi_off), accs[3]^readU64(secret, 35+hi_off))
+
+ acc.Lo += mulFold64(accs[4]^readU64(secret, 43), accs[5]^readU64(secret, 51))
+ acc.Hi += mulFold64(accs[4]^readU64(secret, 43+hi_off), accs[5]^readU64(secret, 51+hi_off))
+
+ acc.Lo += mulFold64(accs[6]^readU64(secret, 59), accs[7]^readU64(secret, 67))
+ acc.Hi += mulFold64(accs[6]^readU64(secret, 59+hi_off), accs[7]^readU64(secret, 67+hi_off))
+
+ acc.Lo = xxh3Avalanche(acc.Lo)
+ acc.Hi = xxh3Avalanche(acc.Hi)
+
+ return acc
+ }
+}
diff --git a/vendor/github.com/zeebo/xxh3/hash64.go b/vendor/github.com/zeebo/xxh3/hash64.go
new file mode 100644
index 000000000..13aab9585
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/hash64.go
@@ -0,0 +1,126 @@
+package xxh3
+
+import "math/bits"
+
+// Hash returns the hash of the byte slice.
+func Hash(b []byte) uint64 {
+ return hashAny(*(*str)(ptr(&b)))
+}
+
+// HashString returns the hash of the string.
+func HashString(s string) uint64 {
+ return hashAny(*(*str)(ptr(&s)))
+}
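+
+// Illustrative usage sketch (not part of the upstream API surface):
+//
+//	sum := Hash([]byte("data")) // one-shot over bytes
+//	same := HashString("data")  // same result, without copying
+//	_ = sum == same             // always true for equal contents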
+
+func hashAny(s str) (acc u64) {
+ p, l := s.p, s.l
+
+ switch {
+ case l <= 16:
+ switch {
+ case l > 8: // 9-16
+ inputlo := readU64(p, 0) ^ (key64_024 ^ key64_032)
+ inputhi := readU64(p, ui(l)-8) ^ (key64_040 ^ key64_048)
+ folded := mulFold64(inputlo, inputhi)
+ return xxh3Avalanche(u64(l) + bits.ReverseBytes64(inputlo) + inputhi + folded)
+
+ case l > 3: // 4-8
+ input1 := readU32(p, 0)
+ input2 := readU32(p, ui(l)-4)
+ input64 := u64(input2) + u64(input1)<<32
+ keyed := input64 ^ (key64_008 ^ key64_016)
+ return rrmxmx(keyed, u64(l))
+
+ case l == 3: // 3
+ c12 := u64(readU16(p, 0))
+ c3 := u64(readU8(p, 2))
+ acc = c12<<16 + c3 + 3<<8
+
+ case l > 1: // 2
+ c12 := u64(readU16(p, 0))
+ acc = c12*(1<<24+1)>>8 + 2<<8
+
+ case l == 1: // 1
+ c1 := u64(readU8(p, 0))
+ acc = c1*(1<<24+1<<16+1) + 1<<8
+
+ default: // 0
+ return 0x2d06800538d394c2 // xxh_avalanche(key64_056 ^ key64_064)
+ }
+
+ acc ^= u64(key32_000 ^ key32_004)
+ return xxhAvalancheSmall(acc)
+
+ case l <= 128:
+ acc = u64(l) * prime64_1
+
+ if l > 32 {
+ if l > 64 {
+ if l > 96 {
+ acc += mulFold64(readU64(p, 6*8)^key64_096, readU64(p, 7*8)^key64_104)
+ acc += mulFold64(readU64(p, ui(l)-8*8)^key64_112, readU64(p, ui(l)-7*8)^key64_120)
+ } // 96
+ acc += mulFold64(readU64(p, 4*8)^key64_064, readU64(p, 5*8)^key64_072)
+ acc += mulFold64(readU64(p, ui(l)-6*8)^key64_080, readU64(p, ui(l)-5*8)^key64_088)
+ } // 64
+ acc += mulFold64(readU64(p, 2*8)^key64_032, readU64(p, 3*8)^key64_040)
+ acc += mulFold64(readU64(p, ui(l)-4*8)^key64_048, readU64(p, ui(l)-3*8)^key64_056)
+ } // 32
+ acc += mulFold64(readU64(p, 0*8)^key64_000, readU64(p, 1*8)^key64_008)
+ acc += mulFold64(readU64(p, ui(l)-2*8)^key64_016, readU64(p, ui(l)-1*8)^key64_024)
+
+ return xxh3Avalanche(acc)
+
+ case l <= 240:
+ acc = u64(l) * prime64_1
+
+ acc += mulFold64(readU64(p, 0*16+0)^key64_000, readU64(p, 0*16+8)^key64_008)
+ acc += mulFold64(readU64(p, 1*16+0)^key64_016, readU64(p, 1*16+8)^key64_024)
+ acc += mulFold64(readU64(p, 2*16+0)^key64_032, readU64(p, 2*16+8)^key64_040)
+ acc += mulFold64(readU64(p, 3*16+0)^key64_048, readU64(p, 3*16+8)^key64_056)
+ acc += mulFold64(readU64(p, 4*16+0)^key64_064, readU64(p, 4*16+8)^key64_072)
+ acc += mulFold64(readU64(p, 5*16+0)^key64_080, readU64(p, 5*16+8)^key64_088)
+ acc += mulFold64(readU64(p, 6*16+0)^key64_096, readU64(p, 6*16+8)^key64_104)
+ acc += mulFold64(readU64(p, 7*16+0)^key64_112, readU64(p, 7*16+8)^key64_120)
+
+ // avalanche
+ acc = xxh3Avalanche(acc)
+
+ // trailing groups after 128
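+	// (each 16-byte group reads the key at i-125 = (i-128)+3, XXH3's
+	// 3-byte midsize start offset)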
+ top := ui(l) &^ 15
+ for i := ui(8 * 16); i < top; i += 16 {
+ acc += mulFold64(readU64(p, i+0)^readU64(key, i-125), readU64(p, i+8)^readU64(key, i-117))
+ }
+
+ // last 16 bytes
+ acc += mulFold64(readU64(p, ui(l)-16)^key64_119, readU64(p, ui(l)-8)^key64_127)
+
+ return xxh3Avalanche(acc)
+
+ default:
+ acc = u64(l) * prime64_1
+
+ accs := [8]u64{
+ prime32_3, prime64_1, prime64_2, prime64_3,
+ prime64_4, prime32_2, prime64_5, prime32_1,
+ }
+
+ if hasAVX512 && l >= avx512Switch {
+ accumAVX512(&accs, p, key, u64(l))
+ } else if hasAVX2 {
+ accumAVX2(&accs, p, key, u64(l))
+ } else if hasSSE2 {
+ accumSSE(&accs, p, key, u64(l))
+ } else {
+ accumScalar(&accs, p, key, u64(l))
+ }
+
+ // merge accs
+ acc += mulFold64(accs[0]^key64_011, accs[1]^key64_019)
+ acc += mulFold64(accs[2]^key64_027, accs[3]^key64_035)
+ acc += mulFold64(accs[4]^key64_043, accs[5]^key64_051)
+ acc += mulFold64(accs[6]^key64_059, accs[7]^key64_067)
+
+ return xxh3Avalanche(acc)
+ }
+}
diff --git a/vendor/github.com/zeebo/xxh3/hash64_seed.go b/vendor/github.com/zeebo/xxh3/hash64_seed.go
new file mode 100644
index 000000000..429994c36
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/hash64_seed.go
@@ -0,0 +1,134 @@
+package xxh3
+
+import "math/bits"
+
+// HashSeed returns the hash of the byte slice with the given seed.
+func HashSeed(b []byte, seed uint64) uint64 {
+	return hashAnySeed(*(*str)(ptr(&b)), seed)
+}
+
+// HashStringSeed returns the hash of the string with the given seed.
+func HashStringSeed(s string, seed uint64) uint64 {
+ return hashAnySeed(*(*str)(ptr(&s)), seed)
+}
+
+func hashAnySeed(s str, seed uint64) (acc u64) {
+ p, l := s.p, s.l
+
+ switch {
+ case l <= 16:
+ switch {
+ case l > 8:
+ inputlo := readU64(p, 0) ^ (key64_024 ^ key64_032 + seed)
+ inputhi := readU64(p, ui(l)-8) ^ (key64_040 ^ key64_048 - seed)
+ folded := mulFold64(inputlo, inputhi)
+ return xxh3Avalanche(u64(l) + bits.ReverseBytes64(inputlo) + inputhi + folded)
+
+ case l > 3:
+ seed ^= u64(bits.ReverseBytes32(u32(seed))) << 32
+ input1 := readU32(p, 0)
+ input2 := readU32(p, ui(l)-4)
+ input64 := u64(input2) + u64(input1)<<32
+ keyed := input64 ^ (key64_008 ^ key64_016 - seed)
+ return rrmxmx(keyed, u64(l))
+
+ case l == 3: // 3
+ c12 := u64(readU16(p, 0))
+ c3 := u64(readU8(p, 2))
+ acc = c12<<16 + c3 + 3<<8
+
+ case l > 1: // 2
+ c12 := u64(readU16(p, 0))
+ acc = c12*(1<<24+1)>>8 + 2<<8
+
+ case l == 1: // 1
+ c1 := u64(readU8(p, 0))
+ acc = c1*(1<<24+1<<16+1) + 1<<8
+
+ default:
+ return xxhAvalancheSmall(seed ^ key64_056 ^ key64_064)
+ }
+
+ acc ^= u64(key32_000^key32_004) + seed
+ return xxhAvalancheSmall(acc)
+
+ case l <= 128:
+ acc = u64(l) * prime64_1
+
+ if l > 32 {
+ if l > 64 {
+ if l > 96 {
+ acc += mulFold64(readU64(p, 6*8)^(key64_096+seed), readU64(p, 7*8)^(key64_104-seed))
+ acc += mulFold64(readU64(p, ui(l)-8*8)^(key64_112+seed), readU64(p, ui(l)-7*8)^(key64_120-seed))
+ } // 96
+ acc += mulFold64(readU64(p, 4*8)^(key64_064+seed), readU64(p, 5*8)^(key64_072-seed))
+ acc += mulFold64(readU64(p, ui(l)-6*8)^(key64_080+seed), readU64(p, ui(l)-5*8)^(key64_088-seed))
+ } // 64
+ acc += mulFold64(readU64(p, 2*8)^(key64_032+seed), readU64(p, 3*8)^(key64_040-seed))
+ acc += mulFold64(readU64(p, ui(l)-4*8)^(key64_048+seed), readU64(p, ui(l)-3*8)^(key64_056-seed))
+ } // 32
+ acc += mulFold64(readU64(p, 0*8)^(key64_000+seed), readU64(p, 1*8)^(key64_008-seed))
+ acc += mulFold64(readU64(p, ui(l)-2*8)^(key64_016+seed), readU64(p, ui(l)-1*8)^(key64_024-seed))
+
+ return xxh3Avalanche(acc)
+
+ case l <= 240:
+ acc = u64(l) * prime64_1
+
+ acc += mulFold64(readU64(p, 0*16+0)^(key64_000+seed), readU64(p, 0*16+8)^(key64_008-seed))
+ acc += mulFold64(readU64(p, 1*16+0)^(key64_016+seed), readU64(p, 1*16+8)^(key64_024-seed))
+ acc += mulFold64(readU64(p, 2*16+0)^(key64_032+seed), readU64(p, 2*16+8)^(key64_040-seed))
+ acc += mulFold64(readU64(p, 3*16+0)^(key64_048+seed), readU64(p, 3*16+8)^(key64_056-seed))
+ acc += mulFold64(readU64(p, 4*16+0)^(key64_064+seed), readU64(p, 4*16+8)^(key64_072-seed))
+ acc += mulFold64(readU64(p, 5*16+0)^(key64_080+seed), readU64(p, 5*16+8)^(key64_088-seed))
+ acc += mulFold64(readU64(p, 6*16+0)^(key64_096+seed), readU64(p, 6*16+8)^(key64_104-seed))
+ acc += mulFold64(readU64(p, 7*16+0)^(key64_112+seed), readU64(p, 7*16+8)^(key64_120-seed))
+
+ // avalanche
+ acc = xxh3Avalanche(acc)
+
+ // trailing groups after 128
+ top := ui(l) &^ 15
+ for i := ui(8 * 16); i < top; i += 16 {
+ acc += mulFold64(readU64(p, i+0)^(readU64(key, i-125)+seed), readU64(p, i+8)^(readU64(key, i-117)-seed))
+ }
+
+ // last 16 bytes
+ acc += mulFold64(readU64(p, ui(l)-16)^(key64_119+seed), readU64(p, ui(l)-8)^(key64_127-seed))
+
+ return xxh3Avalanche(acc)
+
+ default:
+ acc = u64(l) * prime64_1
+
+ secret := key
+ if seed != 0 {
+ secret = ptr(&[secretSize]byte{})
+ initSecret(secret, seed)
+ }
+
+ accs := [8]u64{
+ prime32_3, prime64_1, prime64_2, prime64_3,
+ prime64_4, prime32_2, prime64_5, prime32_1,
+ }
+
+ if hasAVX512 && l >= avx512Switch {
+ accumAVX512(&accs, p, secret, u64(l))
+ } else if hasAVX2 {
+ accumAVX2(&accs, p, secret, u64(l))
+ } else if hasSSE2 {
+ accumSSE(&accs, p, secret, u64(l))
+ } else {
+ accumScalarSeed(&accs, p, secret, u64(l))
+ }
+
+ // merge accs
+ acc += mulFold64(accs[0]^readU64(secret, 11), accs[1]^readU64(secret, 19))
+ acc += mulFold64(accs[2]^readU64(secret, 27), accs[3]^readU64(secret, 35))
+ acc += mulFold64(accs[4]^readU64(secret, 43), accs[5]^readU64(secret, 51))
+ acc += mulFold64(accs[6]^readU64(secret, 59), accs[7]^readU64(secret, 67))
+
+ return xxh3Avalanche(acc)
+ }
+}
diff --git a/vendor/github.com/zeebo/xxh3/hasher.go b/vendor/github.com/zeebo/xxh3/hasher.go
new file mode 100644
index 000000000..d9789980a
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/hasher.go
@@ -0,0 +1,239 @@
+package xxh3
+
+import (
+ "encoding/binary"
+ "hash"
+)
+
+// Hasher implements the hash.Hash and hash.Hash64 interfaces.
+type Hasher struct {
+ acc [8]u64
+ blk u64
+ len u64
+ key ptr
+ buf [_block + _stripe]byte
+ seed u64
+}
+
+var (
+ _ hash.Hash = (*Hasher)(nil)
+ _ hash.Hash64 = (*Hasher)(nil)
+)
+
+// New returns a new Hasher that implements the hash.Hash interface.
+func New() *Hasher {
+ return new(Hasher)
+}
+
+// NewSeed returns a new Hasher, initialized with the given seed, that
+// implements the hash.Hash interface.
+func NewSeed(seed uint64) *Hasher {
+ var h Hasher
+ h.Reset()
+ h.seed = seed
+ h.key = key
+
+	// Only initialize the secret once, not on every reset.
+ if seed != 0 {
+ h.key = ptr(&[secretSize]byte{})
+ initSecret(h.key, seed)
+ }
+ return &h
+}
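+
+// Illustrative usage sketch (not part of the upstream API surface):
+//
+//	h := New()
+//	_, _ = h.Write([]byte("hello "))
+//	_, _ = h.Write([]byte("world"))
+//	sum := h.Sum64() // expected to match Hash([]byte("hello world"))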
+
+// Reset resets the Hash to its initial state.
+func (h *Hasher) Reset() {
+ h.acc = [8]u64{
+ prime32_3, prime64_1, prime64_2, prime64_3,
+ prime64_4, prime32_2, prime64_5, prime32_1,
+ }
+ h.blk = 0
+ h.len = 0
+}
+
+// BlockSize returns the hash's underlying block size.
+// The Write method will accept any amount of data, but
+// it may operate more efficiently if all writes are a
+// multiple of the block size.
+func (h *Hasher) BlockSize() int { return _stripe }
+
+// Size returns the number of bytes Sum will return.
+func (h *Hasher) Size() int { return 8 }
+
+// Sum appends the current hash to b and returns the resulting slice.
+// It does not change the underlying hash state.
+func (h *Hasher) Sum(b []byte) []byte {
+ var tmp [8]byte
+ binary.BigEndian.PutUint64(tmp[:], h.Sum64())
+ return append(b, tmp[:]...)
+}
+
+// Write adds more data to the running hash.
+// It never returns an error.
+func (h *Hasher) Write(buf []byte) (int, error) {
+ h.update(buf)
+ return len(buf), nil
+}
+
+// WriteString adds more data to the running hash.
+// It never returns an error.
+func (h *Hasher) WriteString(buf string) (int, error) {
+ h.updateString(buf)
+ return len(buf), nil
+}
+
+func (h *Hasher) update(buf []byte) {
+ // relies on the data pointer being the first word in the string header
+ h.updateString(*(*string)(ptr(&buf)))
+}
+
+func (h *Hasher) updateString(buf string) {
+ if h.key == nil {
+ h.key = key
+ h.Reset()
+ }
+
+	// On the first write, while the input still holds more than a full
+	// buffer, process blocks directly from it without copying.
+ for h.len == 0 && len(buf) > len(h.buf) {
+ if hasAVX2 {
+ accumBlockAVX2(&h.acc, *(*ptr)(ptr(&buf)), h.key)
+ } else if hasSSE2 {
+ accumBlockSSE(&h.acc, *(*ptr)(ptr(&buf)), h.key)
+ } else {
+ accumBlockScalar(&h.acc, *(*ptr)(ptr(&buf)), h.key)
+ }
+ buf = buf[_block:]
+ h.blk++
+ }
+
+ for len(buf) > 0 {
+ if h.len < u64(len(h.buf)) {
+ n := copy(h.buf[h.len:], buf)
+ h.len += u64(n)
+ buf = buf[n:]
+ continue
+ }
+
+ if hasAVX2 {
+ accumBlockAVX2(&h.acc, ptr(&h.buf), h.key)
+ } else if hasSSE2 {
+ accumBlockSSE(&h.acc, ptr(&h.buf), h.key)
+ } else {
+ accumBlockScalar(&h.acc, ptr(&h.buf), h.key)
+ }
+
+ h.blk++
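+		// The buffer deliberately holds back one stripe: the block above is
+		// accumulated, and the remaining 64 bytes move to the front so the
+		// finalize path always has trailing data to fold in.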
+ h.len = _stripe
+ copy(h.buf[:_stripe], h.buf[_block:])
+ }
+}
+
+// Sum64 returns the 64-bit hash of the written data.
+func (h *Hasher) Sum64() uint64 {
+ if h.key == nil {
+ h.key = key
+ h.Reset()
+ }
+
+ if h.blk == 0 {
+ if h.seed == 0 {
+ return Hash(h.buf[:h.len])
+ }
+ return HashSeed(h.buf[:h.len], h.seed)
+ }
+
+ l := h.blk*_block + h.len
+ acc := l * prime64_1
+ accs := h.acc
+
+ if h.len > 0 {
+ // We are only ever doing 1 block here, so no avx512.
+ if hasAVX2 {
+ accumAVX2(&accs, ptr(&h.buf[0]), h.key, h.len)
+ } else if hasSSE2 {
+ accumSSE(&accs, ptr(&h.buf[0]), h.key, h.len)
+ } else {
+ accumScalar(&accs, ptr(&h.buf[0]), h.key, h.len)
+ }
+ }
+
+ if h.seed == 0 {
+ acc += mulFold64(accs[0]^key64_011, accs[1]^key64_019)
+ acc += mulFold64(accs[2]^key64_027, accs[3]^key64_035)
+ acc += mulFold64(accs[4]^key64_043, accs[5]^key64_051)
+ acc += mulFold64(accs[6]^key64_059, accs[7]^key64_067)
+ } else {
+ secret := h.key
+ acc += mulFold64(accs[0]^readU64(secret, 11), accs[1]^readU64(secret, 19))
+ acc += mulFold64(accs[2]^readU64(secret, 27), accs[3]^readU64(secret, 35))
+ acc += mulFold64(accs[4]^readU64(secret, 43), accs[5]^readU64(secret, 51))
+ acc += mulFold64(accs[6]^readU64(secret, 59), accs[7]^readU64(secret, 67))
+ }
+
+ acc = xxh3Avalanche(acc)
+
+ return acc
+}
+
+// Sum128 returns the 128-bit hash of the written data.
+func (h *Hasher) Sum128() Uint128 {
+ if h.key == nil {
+ h.key = key
+ h.Reset()
+ }
+
+ if h.blk == 0 {
+ if h.seed == 0 {
+ return Hash128(h.buf[:h.len])
+ }
+ return Hash128Seed(h.buf[:h.len], h.seed)
+ }
+
+ l := h.blk*_block + h.len
+ acc := Uint128{Lo: l * prime64_1, Hi: ^(l * prime64_2)}
+ accs := h.acc
+
+ if h.len > 0 {
+ // We are only ever doing 1 block here, so no avx512.
+ if hasAVX2 {
+ accumAVX2(&accs, ptr(&h.buf[0]), h.key, h.len)
+ } else if hasSSE2 {
+ accumSSE(&accs, ptr(&h.buf[0]), h.key, h.len)
+ } else {
+ accumScalar(&accs, ptr(&h.buf[0]), h.key, h.len)
+ }
+ }
+
+ if h.seed == 0 {
+ acc.Lo += mulFold64(accs[0]^key64_011, accs[1]^key64_019)
+ acc.Hi += mulFold64(accs[0]^key64_117, accs[1]^key64_125)
+
+ acc.Lo += mulFold64(accs[2]^key64_027, accs[3]^key64_035)
+ acc.Hi += mulFold64(accs[2]^key64_133, accs[3]^key64_141)
+
+ acc.Lo += mulFold64(accs[4]^key64_043, accs[5]^key64_051)
+ acc.Hi += mulFold64(accs[4]^key64_149, accs[5]^key64_157)
+
+ acc.Lo += mulFold64(accs[6]^key64_059, accs[7]^key64_067)
+ acc.Hi += mulFold64(accs[6]^key64_165, accs[7]^key64_173)
+ } else {
+ secret := h.key
+ const hi_off = 117 - 11
+
+ acc.Lo += mulFold64(accs[0]^readU64(secret, 11), accs[1]^readU64(secret, 19))
+ acc.Hi += mulFold64(accs[0]^readU64(secret, 11+hi_off), accs[1]^readU64(secret, 19+hi_off))
+
+ acc.Lo += mulFold64(accs[2]^readU64(secret, 27), accs[3]^readU64(secret, 35))
+ acc.Hi += mulFold64(accs[2]^readU64(secret, 27+hi_off), accs[3]^readU64(secret, 35+hi_off))
+
+ acc.Lo += mulFold64(accs[4]^readU64(secret, 43), accs[5]^readU64(secret, 51))
+ acc.Hi += mulFold64(accs[4]^readU64(secret, 43+hi_off), accs[5]^readU64(secret, 51+hi_off))
+
+ acc.Lo += mulFold64(accs[6]^readU64(secret, 59), accs[7]^readU64(secret, 67))
+ acc.Hi += mulFold64(accs[6]^readU64(secret, 59+hi_off), accs[7]^readU64(secret, 67+hi_off))
+ }
+
+ acc.Lo = xxh3Avalanche(acc.Lo)
+ acc.Hi = xxh3Avalanche(acc.Hi)
+
+ return acc
+}
diff --git a/vendor/github.com/zeebo/xxh3/utils.go b/vendor/github.com/zeebo/xxh3/utils.go
new file mode 100644
index 000000000..a837e68a6
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/utils.go
@@ -0,0 +1,129 @@
+package xxh3
+
+import (
+ "math/bits"
+ "unsafe"
+)
+
+// Uint128 is a 128-bit value.
+// The actual value can be thought of as u.Hi<<64 | u.Lo.
+type Uint128 struct {
+ Hi, Lo uint64
+}
+
+// Bytes returns the uint128 as an array of bytes in canonical form (big-endian encoded).
+func (u Uint128) Bytes() [16]byte {
+ return [16]byte{
+ byte(u.Hi >> 0x38), byte(u.Hi >> 0x30), byte(u.Hi >> 0x28), byte(u.Hi >> 0x20),
+ byte(u.Hi >> 0x18), byte(u.Hi >> 0x10), byte(u.Hi >> 0x08), byte(u.Hi),
+ byte(u.Lo >> 0x38), byte(u.Lo >> 0x30), byte(u.Lo >> 0x28), byte(u.Lo >> 0x20),
+ byte(u.Lo >> 0x18), byte(u.Lo >> 0x10), byte(u.Lo >> 0x08), byte(u.Lo),
+ }
+}
+
+type (
+ ptr = unsafe.Pointer
+ ui = uintptr
+
+ u8 = uint8
+ u32 = uint32
+ u64 = uint64
+ u128 = Uint128
+)
+
+type str struct {
+ p ptr
+ l uint
+}
+
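+// The helpers below perform unaligned little-endian loads and stores through
+// raw pointers, equivalent to encoding/binary's LittleEndian routines on a
+// byte slice.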
+func readU8(p ptr, o ui) uint8 {
+ return *(*uint8)(ptr(ui(p) + o))
+}
+
+func readU16(p ptr, o ui) uint16 {
+ b := (*[2]byte)(ptr(ui(p) + o))
+ return uint16(b[0]) | uint16(b[1])<<8
+}
+
+func readU32(p ptr, o ui) uint32 {
+ b := (*[4]byte)(ptr(ui(p) + o))
+ return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func readU64(p ptr, o ui) uint64 {
+ b := (*[8]byte)(ptr(ui(p) + o))
+ return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+ uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
+
+func writeU64(p ptr, o ui, v u64) {
+ b := (*[8]byte)(ptr(ui(p) + o))
+ b[0] = byte(v)
+ b[1] = byte(v >> 8)
+ b[2] = byte(v >> 16)
+ b[3] = byte(v >> 24)
+ b[4] = byte(v >> 32)
+ b[5] = byte(v >> 40)
+ b[6] = byte(v >> 48)
+ b[7] = byte(v >> 56)
+}
+
+const secretSize = 192
+
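+// initSecret derives the per-seed secret from the default key: for each
+// 16-byte chunk, the seed is added to the low 8 bytes and subtracted from
+// the high 8 bytes.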
+func initSecret(secret ptr, seed u64) {
+ for i := ui(0); i < secretSize/16; i++ {
+ lo := readU64(key, 16*i) + seed
+ hi := readU64(key, 16*i+8) - seed
+ writeU64(secret, 16*i, lo)
+ writeU64(secret, 16*i+8, hi)
+ }
+}
+
+func xxh64AvalancheSmall(x u64) u64 {
+ // x ^= x >> 33 // x must be < 32 bits
+ // x ^= u64(key32_000 ^ key32_004) // caller must do this
+ x *= prime64_2
+ x ^= x >> 29
+ x *= prime64_3
+ x ^= x >> 32
+ return x
+}
+
+func xxhAvalancheSmall(x u64) u64 {
+ x ^= x >> 33
+ x *= prime64_2
+ x ^= x >> 29
+ x *= prime64_3
+ x ^= x >> 32
+ return x
+}
+
+func xxh64AvalancheFull(x u64) u64 {
+ x ^= x >> 33
+ x *= prime64_2
+ x ^= x >> 29
+ x *= prime64_3
+ x ^= x >> 32
+ return x
+}
+
+func xxh3Avalanche(x u64) u64 {
+ x ^= x >> 37
+ x *= 0x165667919e3779f9
+ x ^= x >> 32
+ return x
+}
+
+func rrmxmx(h64 u64, len u64) u64 {
+ h64 ^= bits.RotateLeft64(h64, 49) ^ bits.RotateLeft64(h64, 24)
+ h64 *= 0x9fb21c651e98df25
+ h64 ^= (h64 >> 35) + len
+ h64 *= 0x9fb21c651e98df25
+ h64 ^= (h64 >> 28)
+ return h64
+}
+
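+// mulFold64 is XXH3's folded multiply: the full 128-bit product of x and y,
+// XOR-folded down to 64 bits.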
+func mulFold64(x, y u64) u64 {
+ hi, lo := bits.Mul64(x, y)
+ return hi ^ lo
+}