Diffstat (limited to 'vendor/github.com')
18 files changed, 4185 insertions, 0 deletions
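The files below vendor github.com/zeebo/xxh3, a Go port of XXH3 with a portable scalar path (accum_generic.go) and SSE2/AVX2/AVX-512 assembly paths selected at runtime (accum_stubs_amd64.go). For readers skimming the vendored code, here is a minimal sketch of how the package is typically consumed and of the per-stripe accumulate step that the scalar and vector loops below unroll. The Hash/HashString calls are the package's one-shot helpers as exposed by the vendored API; accumulateStripe is an illustration of the inner loop only, not part of that API, and the input values are arbitrary dummy data.

```go
package main

import (
	"fmt"

	"github.com/zeebo/xxh3"
)

// accumulateStripe restates, in plain Go, the per-stripe pattern that
// accum_generic.go unrolls eight times per stripe: each 8-byte lane is
// XORed with the matching secret word, the two 32-bit halves of that
// value are multiplied together, and the raw data word is folded into
// the partner lane (i^1).
func accumulateStripe(accs *[8]uint64, stripe, secret *[8]uint64) {
	for i := 0; i < 8; i++ {
		dv := stripe[i]
		dk := dv ^ secret[i]
		accs[i^1] += dv
		accs[i] += (dk & 0xffffffff) * (dk >> 32)
	}
}

func main() {
	data := []byte("hello, xxh3")

	// One-shot 64-bit hash of a byte slice.
	fmt.Printf("Hash:       %#016x\n", xxh3.Hash(data))

	// String convenience variant (no []byte conversion needed).
	fmt.Printf("HashString: %#016x\n", xxh3.HashString("hello, xxh3"))

	// Feed the stripe illustration with dummy lane and secret values.
	var accs, stripe, secret [8]uint64
	for i := range stripe {
		stripe[i] = uint64(i + 1)
		secret[i] = uint64(i+1) * 0x0102030405060708
	}
	accumulateStripe(&accs, &stripe, &secret)
	fmt.Println("accs after one stripe:", accs)
}
```

On amd64 the package picks the widest available vector path at runtime (see the cpuid checks in accum_stubs_amd64.go); on other architectures it falls back to the scalar loops shown in accum_generic.go.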
diff --git a/vendor/github.com/zeebo/xxh3/.gitignore b/vendor/github.com/zeebo/xxh3/.gitignore new file mode 100644 index 000000000..928e12f53 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/.gitignore @@ -0,0 +1,6 @@ +upstream +*.pprof +xxh3.test +.vscode +*.txt +_compat diff --git a/vendor/github.com/zeebo/xxh3/LICENSE b/vendor/github.com/zeebo/xxh3/LICENSE new file mode 100644 index 000000000..477f8e5e1 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/LICENSE @@ -0,0 +1,25 @@ +xxHash Library +Copyright (c) 2012-2014, Yann Collet +Copyright (c) 2019, Jeff Wendling +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/zeebo/xxh3/Makefile b/vendor/github.com/zeebo/xxh3/Makefile new file mode 100644 index 000000000..8bd78c482 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/Makefile @@ -0,0 +1,27 @@ +.PHONY: all vet +all: genasm _compat + +genasm: avo/avx.go avo/sse.go + cd ./avo; go generate gen.go + +clean: + rm accum_vector_avx_amd64.s + rm accum_vector_sse_amd64.s + rm _compat + +upstream/xxhash.o: upstream/xxhash.h + ( cd upstream && make ) + +_compat: _compat.c upstream/xxhash.o + gcc -o _compat _compat.c ./upstream/xxhash.o + +vet: + GOOS=linux GOARCH=386 GO386=softfloat go vet ./... + GOOS=windows GOARCH=386 GO386=softfloat go vet ./... + GOOS=linux GOARCH=amd64 go vet ./... + GOOS=windows GOARCH=amd64 go vet ./... + GOOS=darwin GOARCH=amd64 go vet ./... + GOOS=linux GOARCH=arm go vet ./... + GOOS=linux GOARCH=arm64 go vet ./... + GOOS=windows GOARCH=arm64 go vet ./... + GOOS=darwin GOARCH=arm64 go vet ./...
\ No newline at end of file diff --git a/vendor/github.com/zeebo/xxh3/README.md b/vendor/github.com/zeebo/xxh3/README.md new file mode 100644 index 000000000..4633fc03a --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/README.md @@ -0,0 +1,38 @@ +# XXH3 +[](https://godoc.org/github.com/zeebo/xxh3) +[](https://sourcegraph.com/github.com/zeebo/xxh3?badge) +[](https://goreportcard.com/report/github.com/zeebo/xxh3) + +This package is a port of the [xxh3](https://github.com/Cyan4973/xxHash) library to Go. + +Upstream has fixed the output as of v0.8.0, and this package matches that. + +--- + +# Benchmarks + +Run on my `i7-8850H CPU @ 2.60GHz` + +## Small Sizes + +| Bytes | Rate | +|-----------|--------------------------------------| +|` 0 ` |` 0.74 ns/op ` | +|` 1-3 ` |` 4.19 ns/op (0.24 GB/s - 0.71 GB/s) `| +|` 4-8 ` |` 4.16 ns/op (0.97 GB/s - 1.98 GB/s) `| +|` 9-16 ` |` 4.46 ns/op (2.02 GB/s - 3.58 GB/s) `| +|` 17-32 ` |` 6.22 ns/op (2.76 GB/s - 5.15 GB/s) `| +|` 33-64 ` |` 8.00 ns/op (4.13 GB/s - 8.13 GB/s) `| +|` 65-96 ` |` 11.0 ns/op (5.91 GB/s - 8.84 GB/s) `| +|` 97-128 ` |` 12.8 ns/op (7.68 GB/s - 10.0 GB/s) `| + +## Large Sizes + +| Bytes | Rate | SSE2 Rate | AVX2 Rate | +|---------|--------------------------|--------------------------|--------------------------| +|` 129 ` |` 13.6 ns/op (9.45 GB/s) `| | | +|` 240 ` |` 23.8 ns/op (10.1 GB/s) `| | | +|` 241 ` |` 40.5 ns/op (5.97 GB/s) `|` 23.3 ns/op (10.4 GB/s) `|` 20.1 ns/op (12.0 GB/s) `| +|` 512 ` |` 69.8 ns/op (7.34 GB/s) `|` 30.4 ns/op (16.9 GB/s) `|` 24.7 ns/op (20.7 GB/s) `| +|` 1024 ` |` 132 ns/op (7.77 GB/s) `|` 48.9 ns/op (20.9 GB/s) `|` 37.7 ns/op (27.2 GB/s) `| +|` 100KB `|` 13.0 us/op (7.88 GB/s) `|` 4.05 us/op (25.3 GB/s) `|` 2.31 us/op (44.3 GB/s) `| diff --git a/vendor/github.com/zeebo/xxh3/_compat.c b/vendor/github.com/zeebo/xxh3/_compat.c new file mode 100644 index 000000000..fda9f36ff --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/_compat.c @@ -0,0 +1,39 @@ +#include "upstream/xxhash.h" +#include <stdio.h> + +int main() { + unsigned char buf[4096]; + for (int i = 0; i < 4096; i++) { + buf[i] = (unsigned char)((i+1)%251); + } + + printf("var testVecs64 = []uint64{\n"); + for (int i = 0; i < 4096; i++) { + if (i % 4 == 0) { + printf("\t"); + } + + uint64_t h = XXH3_64bits(buf, (size_t)i); + printf("0x%lx, ", h); + + if (i % 4 == 3) { + printf("\n\t"); + } + } + printf("}\n\n"); + + printf("var testVecs128 = [][2]uint64{\n"); + for (int i = 0; i < 4096; i++) { + if (i % 4 == 0) { + printf("\t"); + } + + XXH128_hash_t h = XXH3_128bits(buf, (size_t)i); + printf("{0x%lx, 0x%lx}, ", h.high64, h.low64); + + if (i % 4 == 3) { + printf("\n"); + } + } + printf("}\n\n"); +} diff --git a/vendor/github.com/zeebo/xxh3/accum_generic.go b/vendor/github.com/zeebo/xxh3/accum_generic.go new file mode 100644 index 000000000..b1be78507 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/accum_generic.go @@ -0,0 +1,542 @@ +package xxh3 + +// avx512Switch is the size at which the avx512 code is used. +// Bigger blocks benefit more. 
+const avx512Switch = 1 << 10 + +func accumScalar(accs *[8]u64, p, secret ptr, l u64) { + if secret != key { + accumScalarSeed(accs, p, secret, l) + return + } + for l > _block { + k := secret + + // accs + for i := 0; i < 16; i++ { + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(k, 8*0) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(k, 8*1) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(k, 8*2) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(k, 8*3) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(k, 8*4) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(k, 8*5) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(k, 8*6) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(k, 8*7) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + + l -= _stripe + if l > 0 { + p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8) + } + } + + // scramble accs + accs[0] ^= accs[0] >> 47 + accs[0] ^= key64_128 + accs[0] *= prime32_1 + + accs[1] ^= accs[1] >> 47 + accs[1] ^= key64_136 + accs[1] *= prime32_1 + + accs[2] ^= accs[2] >> 47 + accs[2] ^= key64_144 + accs[2] *= prime32_1 + + accs[3] ^= accs[3] >> 47 + accs[3] ^= key64_152 + accs[3] *= prime32_1 + + accs[4] ^= accs[4] >> 47 + accs[4] ^= key64_160 + accs[4] *= prime32_1 + + accs[5] ^= accs[5] >> 47 + accs[5] ^= key64_168 + accs[5] *= prime32_1 + + accs[6] ^= accs[6] >> 47 + accs[6] ^= key64_176 + accs[6] *= prime32_1 + + accs[7] ^= accs[7] >> 47 + accs[7] ^= key64_184 + accs[7] *= prime32_1 + } + + if l > 0 { + t, k := (l-1)/_stripe, secret + + for i := u64(0); i < t; i++ { + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(k, 8*0) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(k, 8*1) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(k, 8*2) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(k, 8*3) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(k, 8*4) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(k, 8*5) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(k, 8*6) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(k, 8*7) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + + l -= _stripe + if l > 0 { + p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8) + } + } + + if l > 0 { + p = ptr(ui(p) - uintptr(_stripe-l)) + + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ key64_121 + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ key64_129 + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ key64_137 + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ key64_145 + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := 
readU64(p, 8*4) + dk4 := dv4 ^ key64_153 + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ key64_161 + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ key64_169 + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ key64_177 + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + } + } +} + +func accumBlockScalar(accs *[8]u64, p, secret ptr) { + if secret != key { + accumBlockScalarSeed(accs, p, secret) + return + } + // accs + for i := 0; i < 16; i++ { + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(secret, 8*0) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(secret, 8*1) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(secret, 8*2) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(secret, 8*3) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(secret, 8*4) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(secret, 8*5) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(secret, 8*6) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(secret, 8*7) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + + p, secret = ptr(ui(p)+_stripe), ptr(ui(secret)+8) + } + + // scramble accs + accs[0] ^= accs[0] >> 47 + accs[0] ^= key64_128 + accs[0] *= prime32_1 + + accs[1] ^= accs[1] >> 47 + accs[1] ^= key64_136 + accs[1] *= prime32_1 + + accs[2] ^= accs[2] >> 47 + accs[2] ^= key64_144 + accs[2] *= prime32_1 + + accs[3] ^= accs[3] >> 47 + accs[3] ^= key64_152 + accs[3] *= prime32_1 + + accs[4] ^= accs[4] >> 47 + accs[4] ^= key64_160 + accs[4] *= prime32_1 + + accs[5] ^= accs[5] >> 47 + accs[5] ^= key64_168 + accs[5] *= prime32_1 + + accs[6] ^= accs[6] >> 47 + accs[6] ^= key64_176 + accs[6] *= prime32_1 + + accs[7] ^= accs[7] >> 47 + accs[7] ^= key64_184 + accs[7] *= prime32_1 +} + +// accumScalarSeed should be used with custom key. 
+func accumScalarSeed(accs *[8]u64, p, secret ptr, l u64) { + for l > _block { + k := secret + + // accs + for i := 0; i < 16; i++ { + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(k, 8*0) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(k, 8*1) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(k, 8*2) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(k, 8*3) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(k, 8*4) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(k, 8*5) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(k, 8*6) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(k, 8*7) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + + l -= _stripe + if l > 0 { + p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8) + } + } + + // scramble accs + accs[0] ^= accs[0] >> 47 + accs[0] ^= readU64(secret, 128) + accs[0] *= prime32_1 + + accs[1] ^= accs[1] >> 47 + accs[1] ^= readU64(secret, 136) + accs[1] *= prime32_1 + + accs[2] ^= accs[2] >> 47 + accs[2] ^= readU64(secret, 144) + accs[2] *= prime32_1 + + accs[3] ^= accs[3] >> 47 + accs[3] ^= readU64(secret, 152) + accs[3] *= prime32_1 + + accs[4] ^= accs[4] >> 47 + accs[4] ^= readU64(secret, 160) + accs[4] *= prime32_1 + + accs[5] ^= accs[5] >> 47 + accs[5] ^= readU64(secret, 168) + accs[5] *= prime32_1 + + accs[6] ^= accs[6] >> 47 + accs[6] ^= readU64(secret, 176) + accs[6] *= prime32_1 + + accs[7] ^= accs[7] >> 47 + accs[7] ^= readU64(secret, 184) + accs[7] *= prime32_1 + } + + if l > 0 { + t, k := (l-1)/_stripe, secret + + for i := u64(0); i < t; i++ { + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(k, 8*0) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(k, 8*1) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(k, 8*2) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(k, 8*3) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(k, 8*4) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(k, 8*5) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(k, 8*6) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(k, 8*7) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + + l -= _stripe + if l > 0 { + p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8) + } + } + + if l > 0 { + p = ptr(ui(p) - uintptr(_stripe-l)) + + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(secret, 121) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(secret, 129) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(secret, 137) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(secret, 145) + accs[2] += dv3 + accs[3] += (dk3 & 
0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(secret, 153) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(secret, 161) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(secret, 169) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(secret, 177) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + } + } +} + +// accumBlockScalarSeed should be used with custom key. +func accumBlockScalarSeed(accs *[8]u64, p, secret ptr) { + // accs + { + secret := secret + for i := 0; i < 16; i++ { + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(secret, 8*0) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(secret, 8*1) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(secret, 8*2) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(secret, 8*3) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(secret, 8*4) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(secret, 8*5) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(secret, 8*6) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(secret, 8*7) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + + p, secret = ptr(ui(p)+_stripe), ptr(ui(secret)+8) + } + } + + // scramble accs + accs[0] ^= accs[0] >> 47 + accs[0] ^= readU64(secret, 128) + accs[0] *= prime32_1 + + accs[1] ^= accs[1] >> 47 + accs[1] ^= readU64(secret, 136) + accs[1] *= prime32_1 + + accs[2] ^= accs[2] >> 47 + accs[2] ^= readU64(secret, 144) + accs[2] *= prime32_1 + + accs[3] ^= accs[3] >> 47 + accs[3] ^= readU64(secret, 152) + accs[3] *= prime32_1 + + accs[4] ^= accs[4] >> 47 + accs[4] ^= readU64(secret, 160) + accs[4] *= prime32_1 + + accs[5] ^= accs[5] >> 47 + accs[5] ^= readU64(secret, 168) + accs[5] *= prime32_1 + + accs[6] ^= accs[6] >> 47 + accs[6] ^= readU64(secret, 176) + accs[6] *= prime32_1 + + accs[7] ^= accs[7] >> 47 + accs[7] ^= readU64(secret, 184) + accs[7] *= prime32_1 +} diff --git a/vendor/github.com/zeebo/xxh3/accum_stubs_amd64.go b/vendor/github.com/zeebo/xxh3/accum_stubs_amd64.go new file mode 100644 index 000000000..9baff6c41 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/accum_stubs_amd64.go @@ -0,0 +1,40 @@ +package xxh3 + +import ( + "unsafe" + + "github.com/klauspost/cpuid/v2" +) + +var ( + hasAVX2 = cpuid.CPU.Has(cpuid.AVX2) + hasSSE2 = cpuid.CPU.Has(cpuid.SSE2) // Always true on amd64 + hasAVX512 = cpuid.CPU.Has(cpuid.AVX512F) +) + +//go:noescape +func accumAVX2(acc *[8]u64, data, key unsafe.Pointer, len u64) + +//go:noescape +func accumAVX512(acc *[8]u64, data, key unsafe.Pointer, len u64) + +//go:noescape +func accumSSE(acc *[8]u64, data, key unsafe.Pointer, len u64) + +//go:noescape +func accumBlockAVX2(acc *[8]u64, data, key unsafe.Pointer) + +//go:noescape +func accumBlockSSE(acc *[8]u64, data, key unsafe.Pointer) + +func withOverrides(avx512, avx2, sse2 bool, cb func()) { + avx512Orig, avx2Orig, sse2Orig := hasAVX512, hasAVX2, hasSSE2 + hasAVX512, hasAVX2, hasSSE2 = avx512, avx2, sse2 + 
defer func() { hasAVX512, hasAVX2, hasSSE2 = avx512Orig, avx2Orig, sse2Orig }() + cb() +} + +func withAVX512(cb func()) { withOverrides(hasAVX512, false, false, cb) } +func withAVX2(cb func()) { withOverrides(false, hasAVX2, false, cb) } +func withSSE2(cb func()) { withOverrides(false, false, hasSSE2, cb) } +func withGeneric(cb func()) { withOverrides(false, false, false, cb) } diff --git a/vendor/github.com/zeebo/xxh3/accum_stubs_other.go b/vendor/github.com/zeebo/xxh3/accum_stubs_other.go new file mode 100644 index 000000000..93bf6258a --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/accum_stubs_other.go @@ -0,0 +1,25 @@ +//go:build !amd64 +// +build !amd64 + +package xxh3 + +import ( + "unsafe" +) + +const ( + hasAVX2 = false + hasSSE2 = false + hasAVX512 = false +) + +func accumAVX2(acc *[8]u64, data, key unsafe.Pointer, len u64) { panic("unreachable") } +func accumSSE(acc *[8]u64, data, key unsafe.Pointer, len u64) { panic("unreachable") } +func accumBlockAVX2(acc *[8]u64, data, key unsafe.Pointer) { panic("unreachable") } +func accumBlockSSE(acc *[8]u64, data, key unsafe.Pointer) { panic("unreachable") } +func accumAVX512(acc *[8]u64, data, key unsafe.Pointer, len u64) { panic("unreachable") } + +func withAVX512(cb func()) { cb() } +func withAVX2(cb func()) { cb() } +func withSSE2(cb func()) { cb() } +func withGeneric(cb func()) { cb() } diff --git a/vendor/github.com/zeebo/xxh3/accum_vector_avx512_amd64.s b/vendor/github.com/zeebo/xxh3/accum_vector_avx512_amd64.s new file mode 100644 index 000000000..cfaf9f0a7 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/accum_vector_avx512_amd64.s @@ -0,0 +1,379 @@ +// Code generated by command: go run gen.go -avx512 -out ../accum_vector_avx512_amd64.s -pkg xxh3. DO NOT EDIT. + +#include "textflag.h" + +DATA prime_avx512<>+0(SB)/8, $0x000000009e3779b1 +DATA prime_avx512<>+8(SB)/8, $0x000000009e3779b1 +DATA prime_avx512<>+16(SB)/8, $0x000000009e3779b1 +DATA prime_avx512<>+24(SB)/8, $0x000000009e3779b1 +DATA prime_avx512<>+32(SB)/8, $0x000000009e3779b1 +DATA prime_avx512<>+40(SB)/8, $0x000000009e3779b1 +DATA prime_avx512<>+48(SB)/8, $0x000000009e3779b1 +DATA prime_avx512<>+56(SB)/8, $0x000000009e3779b1 +GLOBL prime_avx512<>(SB), RODATA|NOPTR, $64 + +// func accumAVX512(acc *[8]uint64, data *byte, key *byte, len uint64) +// Requires: AVX, AVX512F, MMX+ +TEXT ·accumAVX512(SB), NOSPLIT, $0-32 + MOVQ acc+0(FP), AX + MOVQ data+8(FP), CX + MOVQ key+16(FP), DX + MOVQ len+24(FP), BX + VMOVDQU64 (AX), Z1 + VMOVDQU64 prime_avx512<>+0(SB), Z0 + VMOVDQU64 (DX), Z2 + VMOVDQU64 8(DX), Z3 + VMOVDQU64 16(DX), Z4 + VMOVDQU64 24(DX), Z5 + VMOVDQU64 32(DX), Z6 + VMOVDQU64 40(DX), Z7 + VMOVDQU64 48(DX), Z8 + VMOVDQU64 56(DX), Z9 + VMOVDQU64 64(DX), Z10 + VMOVDQU64 72(DX), Z11 + VMOVDQU64 80(DX), Z12 + VMOVDQU64 88(DX), Z13 + VMOVDQU64 96(DX), Z14 + VMOVDQU64 104(DX), Z15 + VMOVDQU64 112(DX), Z16 + VMOVDQU64 120(DX), Z17 + VMOVDQU64 128(DX), Z18 + VMOVDQU64 121(DX), Z19 + +accum_large: + CMPQ BX, $0x00000400 + JLE accum + VMOVDQU64 (CX), Z20 + PREFETCHT0 1024(CX) + VPXORD Z2, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 64(CX), Z20 + PREFETCHT0 1088(CX) + VPXORD Z3, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 128(CX), Z20 + PREFETCHT0 1152(CX) + VPXORD Z4, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, 
Z1 + VMOVDQU64 192(CX), Z20 + PREFETCHT0 1216(CX) + VPXORD Z5, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 256(CX), Z20 + PREFETCHT0 1280(CX) + VPXORD Z6, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 320(CX), Z20 + PREFETCHT0 1344(CX) + VPXORD Z7, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 384(CX), Z20 + PREFETCHT0 1408(CX) + VPXORD Z8, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 448(CX), Z20 + PREFETCHT0 1472(CX) + VPXORD Z9, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 512(CX), Z20 + PREFETCHT0 1536(CX) + VPXORD Z10, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 576(CX), Z20 + PREFETCHT0 1600(CX) + VPXORD Z11, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 640(CX), Z20 + PREFETCHT0 1664(CX) + VPXORD Z12, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 704(CX), Z20 + PREFETCHT0 1728(CX) + VPXORD Z13, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 768(CX), Z20 + PREFETCHT0 1792(CX) + VPXORD Z14, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 832(CX), Z20 + PREFETCHT0 1856(CX) + VPXORD Z15, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 896(CX), Z20 + PREFETCHT0 1920(CX) + VPXORD Z16, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + VMOVDQU64 960(CX), Z20 + PREFETCHT0 1984(CX) + VPXORD Z17, Z20, Z21 + VPSHUFD $0x31, Z21, Z22 + VPMULUDQ Z21, Z22, Z21 + VPSHUFD $0x4e, Z20, Z20 + VPADDQ Z1, Z20, Z1 + VPADDQ Z1, Z21, Z1 + ADDQ $0x00000400, CX + SUBQ $0x00000400, BX + VPSRLQ $0x2f, Z1, Z20 + VPTERNLOGD $0x96, Z1, Z18, Z20 + VPMULUDQ Z0, Z20, Z1 + VPSHUFD $0xf5, Z20, Z20 + VPMULUDQ Z0, Z20, Z20 + VPSLLQ $0x20, Z20, Z20 + VPADDQ Z1, Z20, Z1 + JMP accum_large + +accum: + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z2, Z0, Z2 + VPSHUFD $0x31, Z2, Z18 + VPMULUDQ Z2, Z18, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z3, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z4, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z5, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, 
Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z6, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z7, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z8, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z9, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z10, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z11, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z12, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z13, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z14, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z15, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z16, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + CMPQ BX, $0x40 + JLE finalize + VMOVDQU64 (CX), Z0 + VPXORD Z17, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + ADDQ $0x00000040, CX + SUBQ $0x00000040, BX + +finalize: + CMPQ BX, $0x00 + JE return + SUBQ $0x40, CX + ADDQ BX, CX + VMOVDQU64 (CX), Z0 + VPXORD Z19, Z0, Z2 + VPSHUFD $0x31, Z2, Z3 + VPMULUDQ Z2, Z3, Z2 + VPSHUFD $0x4e, Z0, Z0 + VPADDQ Z1, Z0, Z1 + VPADDQ Z1, Z2, Z1 + +return: + VMOVDQU64 Z1, (AX) + VZEROUPPER + RET diff --git a/vendor/github.com/zeebo/xxh3/accum_vector_avx_amd64.s b/vendor/github.com/zeebo/xxh3/accum_vector_avx_amd64.s new file mode 100644 index 000000000..b53c1521f --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/accum_vector_avx_amd64.s @@ -0,0 +1,586 @@ +// Code generated by command: go run gen.go -avx -out ../accum_vector_avx_amd64.s -pkg xxh3. DO NOT EDIT. 
+ +#include "textflag.h" + +DATA prime_avx<>+0(SB)/8, $0x000000009e3779b1 +DATA prime_avx<>+8(SB)/8, $0x000000009e3779b1 +DATA prime_avx<>+16(SB)/8, $0x000000009e3779b1 +DATA prime_avx<>+24(SB)/8, $0x000000009e3779b1 +GLOBL prime_avx<>(SB), RODATA|NOPTR, $32 + +// func accumAVX2(acc *[8]uint64, data *byte, key *byte, len uint64) +// Requires: AVX, AVX2, MMX+ +TEXT ·accumAVX2(SB), NOSPLIT, $0-32 + MOVQ acc+0(FP), AX + MOVQ data+8(FP), CX + MOVQ key+16(FP), DX + MOVQ key+16(FP), BX + MOVQ len+24(FP), SI + VMOVDQU (AX), Y1 + VMOVDQU 32(AX), Y2 + VMOVDQU prime_avx<>+0(SB), Y0 + +accum_large: + CMPQ SI, $0x00000400 + JLE accum + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y6 + PREFETCHT0 512(CX) + VPXOR (DX), Y3, Y4 + VPXOR 32(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y6 + PREFETCHT0 576(CX) + VPXOR 8(DX), Y3, Y4 + VPXOR 40(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y6 + PREFETCHT0 640(CX) + VPXOR 16(DX), Y3, Y4 + VPXOR 48(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y6 + PREFETCHT0 704(CX) + VPXOR 24(DX), Y3, Y4 + VPXOR 56(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y6 + PREFETCHT0 768(CX) + VPXOR 32(DX), Y3, Y4 + VPXOR 64(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y6 + PREFETCHT0 832(CX) + VPXOR 40(DX), Y3, Y4 + VPXOR 72(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y6 + PREFETCHT0 896(CX) + VPXOR 48(DX), Y3, Y4 + VPXOR 80(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y6 + PREFETCHT0 960(CX) + VPXOR 56(DX), Y3, Y4 + VPXOR 88(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y6 + PREFETCHT0 1024(CX) + VPXOR 64(DX), Y3, Y4 + VPXOR 96(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, 
Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y6 + PREFETCHT0 1088(CX) + VPXOR 72(DX), Y3, Y4 + VPXOR 104(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y6 + PREFETCHT0 1152(CX) + VPXOR 80(DX), Y3, Y4 + VPXOR 112(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y6 + PREFETCHT0 1216(CX) + VPXOR 88(DX), Y3, Y4 + VPXOR 120(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 768(CX), Y3 + VMOVDQU 800(CX), Y6 + PREFETCHT0 1280(CX) + VPXOR 96(DX), Y3, Y4 + VPXOR 128(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 832(CX), Y3 + VMOVDQU 864(CX), Y6 + PREFETCHT0 1344(CX) + VPXOR 104(DX), Y3, Y4 + VPXOR 136(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 896(CX), Y3 + VMOVDQU 928(CX), Y6 + PREFETCHT0 1408(CX) + VPXOR 112(DX), Y3, Y4 + VPXOR 144(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 960(CX), Y3 + VMOVDQU 992(CX), Y6 + PREFETCHT0 1472(CX) + VPXOR 120(DX), Y3, Y4 + VPXOR 152(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + ADDQ $0x00000400, CX + SUBQ $0x00000400, SI + VPSRLQ $0x2f, Y1, Y3 + VPXOR Y1, Y3, Y3 + VPXOR 128(DX), Y3, Y3 + VPMULUDQ Y0, Y3, Y1 + VPSHUFD $0xf5, Y3, Y3 + VPMULUDQ Y0, Y3, Y3 + VPSLLQ $0x20, Y3, Y3 + VPADDQ Y1, Y3, Y1 + VPSRLQ $0x2f, Y2, Y3 + VPXOR Y2, Y3, Y3 + VPXOR 160(DX), Y3, Y3 + VPMULUDQ Y0, Y3, Y2 + VPSHUFD $0xf5, Y3, Y3 + VPMULUDQ Y0, Y3, Y3 + VPSLLQ $0x20, Y3, Y3 + VPADDQ Y2, Y3, Y2 + JMP accum_large + +accum: + CMPQ SI, $0x40 + JLE finalize + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y5 + VPXOR (BX), Y0, Y3 + VPXOR 32(BX), Y5, Y6 + VPSHUFD $0x31, Y3, Y4 + VPSHUFD $0x31, Y6, Y7 + VPMULUDQ Y3, Y4, Y3 + VPMULUDQ Y6, Y7, Y6 + VPSHUFD $0x4e, Y0, Y0 + VPSHUFD $0x4e, Y5, Y5 + VPADDQ Y1, Y0, Y1 + VPADDQ Y1, Y3, Y1 + VPADDQ Y2, Y5, Y2 + VPADDQ Y2, Y6, Y2 + ADDQ $0x00000040, CX + SUBQ $0x00000040, SI + ADDQ $0x00000008, BX + JMP accum + +finalize: + CMPQ SI, $0x00 + JE return + SUBQ $0x40, CX + ADDQ SI, CX + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y5 + VPXOR 121(DX), Y0, Y3 + VPXOR 153(DX), Y5, Y6 + VPSHUFD $0x31, Y3, Y4 + VPSHUFD $0x31, Y6, Y7 + VPMULUDQ Y3, Y4, Y3 + VPMULUDQ Y6, Y7, Y6 + VPSHUFD $0x4e, Y0, Y0 + VPSHUFD $0x4e, Y5, Y5 + VPADDQ Y1, Y0, Y1 + 
VPADDQ Y1, Y3, Y1 + VPADDQ Y2, Y5, Y2 + VPADDQ Y2, Y6, Y2 + +return: + VMOVDQU Y1, (AX) + VMOVDQU Y2, 32(AX) + VZEROUPPER + RET + +// func accumBlockAVX2(acc *[8]uint64, data *byte, key *byte) +// Requires: AVX, AVX2 +TEXT ·accumBlockAVX2(SB), NOSPLIT, $0-24 + MOVQ acc+0(FP), AX + MOVQ data+8(FP), CX + MOVQ key+16(FP), DX + VMOVDQU (AX), Y1 + VMOVDQU 32(AX), Y2 + VMOVDQU prime_avx<>+0(SB), Y0 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y6 + VPXOR (DX), Y3, Y4 + VPXOR 32(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y6 + VPXOR 8(DX), Y3, Y4 + VPXOR 40(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y6 + VPXOR 16(DX), Y3, Y4 + VPXOR 48(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y6 + VPXOR 24(DX), Y3, Y4 + VPXOR 56(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y6 + VPXOR 32(DX), Y3, Y4 + VPXOR 64(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y6 + VPXOR 40(DX), Y3, Y4 + VPXOR 72(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y6 + VPXOR 48(DX), Y3, Y4 + VPXOR 80(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y6 + VPXOR 56(DX), Y3, Y4 + VPXOR 88(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y6 + VPXOR 64(DX), Y3, Y4 + VPXOR 96(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y6 + VPXOR 72(DX), Y3, Y4 + VPXOR 104(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y6 + VPXOR 80(DX), Y3, Y4 
+ VPXOR 112(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y6 + VPXOR 88(DX), Y3, Y4 + VPXOR 120(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 768(CX), Y3 + VMOVDQU 800(CX), Y6 + VPXOR 96(DX), Y3, Y4 + VPXOR 128(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 832(CX), Y3 + VMOVDQU 864(CX), Y6 + VPXOR 104(DX), Y3, Y4 + VPXOR 136(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 896(CX), Y3 + VMOVDQU 928(CX), Y6 + VPXOR 112(DX), Y3, Y4 + VPXOR 144(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VMOVDQU 960(CX), Y3 + VMOVDQU 992(CX), Y6 + VPXOR 120(DX), Y3, Y4 + VPXOR 152(DX), Y6, Y7 + VPSHUFD $0x31, Y4, Y5 + VPSHUFD $0x31, Y7, Y8 + VPMULUDQ Y4, Y5, Y4 + VPMULUDQ Y7, Y8, Y7 + VPSHUFD $0x4e, Y3, Y3 + VPSHUFD $0x4e, Y6, Y6 + VPADDQ Y1, Y3, Y1 + VPADDQ Y1, Y4, Y1 + VPADDQ Y2, Y6, Y2 + VPADDQ Y2, Y7, Y2 + VPSRLQ $0x2f, Y1, Y3 + VPXOR Y1, Y3, Y3 + VPXOR 128(DX), Y3, Y3 + VPMULUDQ Y0, Y3, Y1 + VPSHUFD $0xf5, Y3, Y3 + VPMULUDQ Y0, Y3, Y3 + VPSLLQ $0x20, Y3, Y3 + VPADDQ Y1, Y3, Y1 + VPSRLQ $0x2f, Y2, Y3 + VPXOR Y2, Y3, Y3 + VPXOR 160(DX), Y3, Y3 + VPMULUDQ Y0, Y3, Y2 + VPSHUFD $0xf5, Y3, Y3 + VPMULUDQ Y0, Y3, Y3 + VPSLLQ $0x20, Y3, Y3 + VPADDQ Y2, Y3, Y2 + VMOVDQU Y1, (AX) + VMOVDQU Y2, 32(AX) + VZEROUPPER + RET diff --git a/vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s b/vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s new file mode 100644 index 000000000..ba670e560 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s @@ -0,0 +1,1236 @@ +// Code generated by command: go run gen.go -sse -out ../accum_vector_sse_amd64.s -pkg xxh3. DO NOT EDIT. 
+ +#include "textflag.h" + +DATA prime_sse<>+0(SB)/4, $0x9e3779b1 +DATA prime_sse<>+4(SB)/4, $0x9e3779b1 +DATA prime_sse<>+8(SB)/4, $0x9e3779b1 +DATA prime_sse<>+12(SB)/4, $0x9e3779b1 +GLOBL prime_sse<>(SB), RODATA|NOPTR, $16 + +// func accumSSE(acc *[8]uint64, data *byte, key *byte, len uint64) +// Requires: SSE2 +TEXT ·accumSSE(SB), NOSPLIT, $0-32 + MOVQ acc+0(FP), AX + MOVQ data+8(FP), CX + MOVQ key+16(FP), DX + MOVQ key+16(FP), BX + MOVQ len+24(FP), SI + MOVOU (AX), X1 + MOVOU 16(AX), X2 + MOVOU 32(AX), X3 + MOVOU 48(AX), X4 + MOVOU prime_sse<>+0(SB), X0 + +accum_large: + CMPQ SI, $0x00000400 + JLE accum + MOVOU (CX), X5 + MOVOU (DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 16(CX), X5 + MOVOU 16(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 32(CX), X5 + MOVOU 32(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 48(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 64(CX), X5 + MOVOU 8(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 80(CX), X5 + MOVOU 24(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 96(CX), X5 + MOVOU 40(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 112(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 128(CX), X5 + MOVOU 16(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 144(CX), X5 + MOVOU 32(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 160(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 176(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 192(CX), X5 + MOVOU 24(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 208(CX), X5 + MOVOU 40(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 224(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 240(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 256(CX), X5 + MOVOU 32(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 272(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 288(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 304(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 320(CX), X5 + MOVOU 
40(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 336(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 352(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 368(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 384(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 400(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 416(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 432(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 448(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 464(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 480(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 496(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 512(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 528(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 544(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 560(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 576(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 592(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 608(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 624(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 640(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 656(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 672(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 688(CX), X5 + MOVOU 128(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 704(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, 
X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 720(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 736(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 752(CX), X5 + MOVOU 136(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 768(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 784(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 800(CX), X5 + MOVOU 128(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 816(CX), X5 + MOVOU 144(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 832(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 848(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 864(CX), X5 + MOVOU 136(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 880(CX), X5 + MOVOU 152(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 896(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 912(CX), X5 + MOVOU 128(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 928(CX), X5 + MOVOU 144(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 944(CX), X5 + MOVOU 160(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 960(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 976(CX), X5 + MOVOU 136(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 992(CX), X5 + MOVOU 152(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 1008(CX), X5 + MOVOU 168(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + ADDQ $0x00000400, CX + SUBQ $0x00000400, SI + MOVOU X1, X5 + PSRLQ $0x2f, X5 + PXOR X5, X1 + MOVOU 128(DX), X5 + PXOR X5, X1 + PSHUFD $0xf5, X1, X5 + PMULULQ X0, X1 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X1 + MOVOU X2, X5 + PSRLQ $0x2f, X5 + PXOR X5, X2 + MOVOU 144(DX), X5 + PXOR X5, X2 + PSHUFD $0xf5, X2, X5 + PMULULQ X0, X2 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X2 + MOVOU X3, X5 + PSRLQ $0x2f, X5 + PXOR X5, X3 + MOVOU 160(DX), X5 + PXOR X5, X3 + PSHUFD $0xf5, X3, X5 + PMULULQ X0, X3 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X3 + MOVOU X4, X5 + PSRLQ $0x2f, X5 + PXOR X5, X4 + MOVOU 176(DX), X5 + PXOR X5, X4 + PSHUFD $0xf5, X4, X5 + PMULULQ X0, X4 + 
PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X4 + JMP accum_large + +accum: + CMPQ SI, $0x40 + JLE finalize + MOVOU (CX), X0 + MOVOU (BX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X1 + PADDQ X6, X1 + MOVOU 16(CX), X0 + MOVOU 16(BX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X2 + PADDQ X6, X2 + MOVOU 32(CX), X0 + MOVOU 32(BX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X3 + PADDQ X6, X3 + MOVOU 48(CX), X0 + MOVOU 48(BX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X4 + PADDQ X6, X4 + ADDQ $0x00000040, CX + SUBQ $0x00000040, SI + ADDQ $0x00000008, BX + JMP accum + +finalize: + CMPQ SI, $0x00 + JE return + SUBQ $0x40, CX + ADDQ SI, CX + MOVOU (CX), X0 + MOVOU 121(DX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X1 + PADDQ X6, X1 + MOVOU 16(CX), X0 + MOVOU 137(DX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X2 + PADDQ X6, X2 + MOVOU 32(CX), X0 + MOVOU 153(DX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X3 + PADDQ X6, X3 + MOVOU 48(CX), X0 + MOVOU 169(DX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X4 + PADDQ X6, X4 + +return: + MOVOU X1, (AX) + MOVOU X2, 16(AX) + MOVOU X3, 32(AX) + MOVOU X4, 48(AX) + RET + +// func accumBlockSSE(acc *[8]uint64, data *byte, key *byte) +// Requires: SSE2 +TEXT ·accumBlockSSE(SB), NOSPLIT, $0-24 + MOVQ acc+0(FP), AX + MOVQ data+8(FP), CX + MOVQ key+16(FP), DX + MOVOU (AX), X1 + MOVOU 16(AX), X2 + MOVOU 32(AX), X3 + MOVOU 48(AX), X4 + MOVOU prime_sse<>+0(SB), X0 + MOVOU (CX), X5 + MOVOU (DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 16(CX), X5 + MOVOU 16(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 32(CX), X5 + MOVOU 32(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 48(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 64(CX), X5 + MOVOU 8(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 80(CX), X5 + MOVOU 24(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 96(CX), X5 + MOVOU 40(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 112(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 128(CX), X5 + MOVOU 16(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 144(CX), X5 + MOVOU 32(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 160(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 176(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 
192(CX), X5 + MOVOU 24(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 208(CX), X5 + MOVOU 40(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 224(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 240(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 256(CX), X5 + MOVOU 32(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 272(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 288(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 304(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 320(CX), X5 + MOVOU 40(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 336(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 352(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 368(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 384(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 400(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 416(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 432(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 448(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 464(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 480(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 496(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 512(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 528(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 544(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 560(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 576(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + 
PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 592(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 608(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 624(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 640(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 656(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 672(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 688(CX), X5 + MOVOU 128(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 704(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 720(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 736(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 752(CX), X5 + MOVOU 136(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 768(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 784(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 800(CX), X5 + MOVOU 128(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 816(CX), X5 + MOVOU 144(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 832(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 848(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 864(CX), X5 + MOVOU 136(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 880(CX), X5 + MOVOU 152(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 896(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 912(CX), X5 + MOVOU 128(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 928(CX), X5 + MOVOU 144(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 944(CX), X5 + MOVOU 160(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 960(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ 
X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 976(CX), X5 + MOVOU 136(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 992(CX), X5 + MOVOU 152(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 1008(CX), X5 + MOVOU 168(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU X1, X5 + PSRLQ $0x2f, X5 + PXOR X5, X1 + MOVOU 128(DX), X5 + PXOR X5, X1 + PSHUFD $0xf5, X1, X5 + PMULULQ X0, X1 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X1 + MOVOU X2, X5 + PSRLQ $0x2f, X5 + PXOR X5, X2 + MOVOU 144(DX), X5 + PXOR X5, X2 + PSHUFD $0xf5, X2, X5 + PMULULQ X0, X2 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X2 + MOVOU X3, X5 + PSRLQ $0x2f, X5 + PXOR X5, X3 + MOVOU 160(DX), X5 + PXOR X5, X3 + PSHUFD $0xf5, X3, X5 + PMULULQ X0, X3 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X3 + MOVOU X4, X5 + PSRLQ $0x2f, X5 + PXOR X5, X4 + MOVOU 176(DX), X5 + PXOR X5, X4 + PSHUFD $0xf5, X4, X5 + PMULULQ X0, X4 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X4 + MOVOU X1, (AX) + MOVOU X2, 16(AX) + MOVOU X3, 32(AX) + MOVOU X4, 48(AX) + RET diff --git a/vendor/github.com/zeebo/xxh3/consts.go b/vendor/github.com/zeebo/xxh3/consts.go new file mode 100644 index 000000000..39ef6e179 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/consts.go @@ -0,0 +1,97 @@ +package xxh3 + +const ( + _stripe = 64 + _block = 1024 + + prime32_1 = 2654435761 + prime32_2 = 2246822519 + prime32_3 = 3266489917 + + prime64_1 = 11400714785074694791 + prime64_2 = 14029467366897019727 + prime64_3 = 1609587929392839161 + prime64_4 = 9650029242287828579 + prime64_5 = 2870177450012600261 +) + +var key = ptr(&[...]u8{ + 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe /* 8 */, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, /* 16 */ + 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb /* 24 */, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, /* 32 */ + 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78 /* 40 */, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, /* 48 */ + 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e /* 56 */, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, /* 64 */ + 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb /* 72 */, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, /* 80 */ + 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e /* 88 */, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, /* 96 */ + 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f /* 104 */, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, /* 112 */ + 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31 /* 120 */, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, /* 128 */ + 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3 /* 136 */, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, /* 144 */ + 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49 /* 152 */, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, /* 160 */ + 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc /* 168 */, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, /* 176 */ + 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28 /* 184 */, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, /* 192 */ +}) + +const ( + key64_000 u64 = 0xbe4ba423396cfeb8 + key64_008 u64 = 0x1cad21f72c81017c + key64_016 u64 = 0xdb979083e96dd4de + key64_024 u64 = 0x1f67b3b7a4a44072 + key64_032 u64 = 0x78e5c0cc4ee679cb + key64_040 u64 = 0x2172ffcc7dd05a82 + key64_048 u64 = 0x8e2443f7744608b8 + key64_056 u64 = 0x4c263a81e69035e0 + key64_064 u64 = 
0xcb00c391bb52283c + key64_072 u64 = 0xa32e531b8b65d088 + key64_080 u64 = 0x4ef90da297486471 + key64_088 u64 = 0xd8acdea946ef1938 + key64_096 u64 = 0x3f349ce33f76faa8 + key64_104 u64 = 0x1d4f0bc7c7bbdcf9 + key64_112 u64 = 0x3159b4cd4be0518a + key64_120 u64 = 0x647378d9c97e9fc8 + key64_128 u64 = 0xc3ebd33483acc5ea + key64_136 u64 = 0xeb6313faffa081c5 + key64_144 u64 = 0x49daf0b751dd0d17 + key64_152 u64 = 0x9e68d429265516d3 + key64_160 u64 = 0xfca1477d58be162b + key64_168 u64 = 0xce31d07ad1b8f88f + key64_176 u64 = 0x280416958f3acb45 + key64_184 u64 = 0x7e404bbbcafbd7af + + key64_103 u64 = 0x4f0bc7c7bbdcf93f + key64_111 u64 = 0x59b4cd4be0518a1d + key64_119 u64 = 0x7378d9c97e9fc831 + key64_127 u64 = 0xebd33483acc5ea64 + + key64_121 u64 = 0xea647378d9c97e9f + key64_129 u64 = 0xc5c3ebd33483acc5 + key64_137 u64 = 0x17eb6313faffa081 + key64_145 u64 = 0xd349daf0b751dd0d + key64_153 u64 = 0x2b9e68d429265516 + key64_161 u64 = 0x8ffca1477d58be16 + key64_169 u64 = 0x45ce31d07ad1b8f8 + key64_177 u64 = 0xaf280416958f3acb + + key64_011 = 0x6dd4de1cad21f72c + key64_019 = 0xa44072db979083e9 + key64_027 = 0xe679cb1f67b3b7a4 + key64_035 = 0xd05a8278e5c0cc4e + key64_043 = 0x4608b82172ffcc7d + key64_051 = 0x9035e08e2443f774 + key64_059 = 0x52283c4c263a81e6 + key64_067 = 0x65d088cb00c391bb + + key64_117 = 0xd9c97e9fc83159b4 + key64_125 = 0x3483acc5ea647378 + key64_133 = 0xfaffa081c5c3ebd3 + key64_141 = 0xb751dd0d17eb6313 + key64_149 = 0x29265516d349daf0 + key64_157 = 0x7d58be162b9e68d4 + key64_165 = 0x7ad1b8f88ffca147 + key64_173 = 0x958f3acb45ce31d0 +) + +const ( + key32_000 u32 = 0xbe4ba423 + key32_004 u32 = 0x396cfeb8 + key32_008 u32 = 0x1cad21f7 + key32_012 u32 = 0x2c81017c +) diff --git a/vendor/github.com/zeebo/xxh3/hash128.go b/vendor/github.com/zeebo/xxh3/hash128.go new file mode 100644 index 000000000..0040a21bb --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/hash128.go @@ -0,0 +1,253 @@ +package xxh3 + +import ( + "math/bits" +) + +// Hash128 returns the 128-bit hash of the byte slice. +func Hash128(b []byte) Uint128 { + return hashAny128(*(*str)(ptr(&b))) +} + +// HashString128 returns the 128-bit hash of the string slice. 
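+//
+// A minimal usage sketch (illustrative only; it uses only identifiers defined in this package):
+//
+//	sum := HashString128("hello, world")
+//	hi, lo := sum.Hi, sum.Lo
+//	_, _ = hi, lo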
+func HashString128(s string) Uint128 { + return hashAny128(*(*str)(ptr(&s))) +} + +func hashAny128(s str) (acc u128) { + p, l := s.p, s.l + + switch { + case l <= 16: + switch { + case l > 8: // 9-16 + const bitflipl = key64_032 ^ key64_040 + const bitfliph = key64_048 ^ key64_056 + + input_lo := readU64(p, 0) + input_hi := readU64(p, ui(l)-8) + + m128_h, m128_l := bits.Mul64(input_lo^input_hi^bitflipl, prime64_1) + + m128_l += uint64(l-1) << 54 + input_hi ^= bitfliph + + m128_h += input_hi + uint64(uint32(input_hi))*(prime32_2-1) + + m128_l ^= bits.ReverseBytes64(m128_h) + + acc.Hi, acc.Lo = bits.Mul64(m128_l, prime64_2) + acc.Hi += m128_h * prime64_2 + + acc.Lo = xxh3Avalanche(acc.Lo) + acc.Hi = xxh3Avalanche(acc.Hi) + + return acc + + case l > 3: // 4-8 + const bitflip = key64_016 ^ key64_024 + + input_lo := readU32(p, 0) + input_hi := readU32(p, ui(l)-4) + input_64 := u64(input_lo) + u64(input_hi)<<32 + keyed := input_64 ^ bitflip + + acc.Hi, acc.Lo = bits.Mul64(keyed, prime64_1+(uint64(l)<<2)) + + acc.Hi += acc.Lo << 1 + acc.Lo ^= acc.Hi >> 3 + + acc.Lo ^= acc.Lo >> 35 + acc.Lo *= 0x9fb21c651e98df25 + acc.Lo ^= acc.Lo >> 28 + acc.Hi = xxh3Avalanche(acc.Hi) + + return acc + + case l == 3: // 3 + c12 := u64(readU16(p, 0)) + c3 := u64(readU8(p, 2)) + acc.Lo = c12<<16 + c3 + 3<<8 + + case l > 1: // 2 + c12 := u64(readU16(p, 0)) + acc.Lo = c12*(1<<24+1)>>8 + 2<<8 + + case l == 1: // 1 + c1 := u64(readU8(p, 0)) + acc.Lo = c1*(1<<24+1<<16+1) + 1<<8 + + default: // 0 + return u128{0x99aa06d3014798d8, 0x6001c324468d497f} + } + + acc.Hi = uint64(bits.RotateLeft32(bits.ReverseBytes32(uint32(acc.Lo)), 13)) + acc.Lo ^= uint64(key32_000 ^ key32_004) + acc.Hi ^= uint64(key32_008 ^ key32_012) + + acc.Lo = xxh64AvalancheSmall(acc.Lo) + acc.Hi = xxh64AvalancheSmall(acc.Hi) + + return acc + + case l <= 128: + acc.Lo = u64(l) * prime64_1 + + if l > 32 { + if l > 64 { + if l > 96 { + in8, in7 := readU64(p, ui(l)-8*8), readU64(p, ui(l)-7*8) + i6, i7 := readU64(p, 6*8), readU64(p, 7*8) + + acc.Hi += mulFold64(in8^key64_112, in7^key64_120) + acc.Hi ^= i6 + i7 + acc.Lo += mulFold64(i6^key64_096, i7^key64_104) + acc.Lo ^= in8 + in7 + + } // 96 + + in6, in5 := readU64(p, ui(l)-6*8), readU64(p, ui(l)-5*8) + i4, i5 := readU64(p, 4*8), readU64(p, 5*8) + + acc.Hi += mulFold64(in6^key64_080, in5^key64_088) + acc.Hi ^= i4 + i5 + acc.Lo += mulFold64(i4^key64_064, i5^key64_072) + acc.Lo ^= in6 + in5 + + } // 64 + + in4, in3 := readU64(p, ui(l)-4*8), readU64(p, ui(l)-3*8) + i2, i3 := readU64(p, 2*8), readU64(p, 3*8) + + acc.Hi += mulFold64(in4^key64_048, in3^key64_056) + acc.Hi ^= i2 + i3 + acc.Lo += mulFold64(i2^key64_032, i3^key64_040) + acc.Lo ^= in4 + in3 + + } // 32 + + in2, in1 := readU64(p, ui(l)-2*8), readU64(p, ui(l)-1*8) + i0, i1 := readU64(p, 0*8), readU64(p, 1*8) + + acc.Hi += mulFold64(in2^key64_016, in1^key64_024) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^key64_000, i1^key64_008) + acc.Lo ^= in2 + in1 + + acc.Hi, acc.Lo = (acc.Lo*prime64_1)+(acc.Hi*prime64_4)+(u64(l)*prime64_2), acc.Hi+acc.Lo + + acc.Hi = -xxh3Avalanche(acc.Hi) + acc.Lo = xxh3Avalanche(acc.Lo) + + return acc + + case l <= 240: + acc.Lo = u64(l) * prime64_1 + + { + i0, i1, i2, i3 := readU64(p, 0*8), readU64(p, 1*8), readU64(p, 2*8), readU64(p, 3*8) + + acc.Hi += mulFold64(i2^key64_016, i3^key64_024) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^key64_000, i1^key64_008) + acc.Lo ^= i2 + i3 + } + + { + i0, i1, i2, i3 := readU64(p, 4*8), readU64(p, 5*8), readU64(p, 6*8), readU64(p, 7*8) + + acc.Hi += mulFold64(i2^key64_048, i3^key64_056) + 
acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^key64_032, i1^key64_040) + acc.Lo ^= i2 + i3 + } + + { + i0, i1, i2, i3 := readU64(p, 8*8), readU64(p, 9*8), readU64(p, 10*8), readU64(p, 11*8) + + acc.Hi += mulFold64(i2^key64_080, i3^key64_088) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^key64_064, i1^key64_072) + acc.Lo ^= i2 + i3 + } + + { + i0, i1, i2, i3 := readU64(p, 12*8), readU64(p, 13*8), readU64(p, 14*8), readU64(p, 15*8) + + acc.Hi += mulFold64(i2^key64_112, i3^key64_120) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^key64_096, i1^key64_104) + acc.Lo ^= i2 + i3 + } + + // avalanche + acc.Hi = xxh3Avalanche(acc.Hi) + acc.Lo = xxh3Avalanche(acc.Lo) + + // trailing groups after 128 + top := ui(l) &^ 31 + for i := ui(4 * 32); i < top; i += 32 { + i0, i1, i2, i3 := readU64(p, i+0), readU64(p, i+8), readU64(p, i+16), readU64(p, i+24) + k0, k1, k2, k3 := readU64(key, i-125), readU64(key, i-117), readU64(key, i-109), readU64(key, i-101) + + acc.Hi += mulFold64(i2^k2, i3^k3) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^k0, i1^k1) + acc.Lo ^= i2 + i3 + } + + // last 32 bytes + { + i0, i1, i2, i3 := readU64(p, ui(l)-32), readU64(p, ui(l)-24), readU64(p, ui(l)-16), readU64(p, ui(l)-8) + + acc.Hi += mulFold64(i0^key64_119, i1^key64_127) + acc.Hi ^= i2 + i3 + acc.Lo += mulFold64(i2^key64_103, i3^key64_111) + acc.Lo ^= i0 + i1 + } + + acc.Hi, acc.Lo = (acc.Lo*prime64_1)+(acc.Hi*prime64_4)+(u64(l)*prime64_2), acc.Hi+acc.Lo + + acc.Hi = -xxh3Avalanche(acc.Hi) + acc.Lo = xxh3Avalanche(acc.Lo) + + return acc + + default: + acc.Lo = u64(l) * prime64_1 + acc.Hi = ^(u64(l) * prime64_2) + + accs := [8]u64{ + prime32_3, prime64_1, prime64_2, prime64_3, + prime64_4, prime32_2, prime64_5, prime32_1, + } + + if hasAVX512 && l >= avx512Switch { + accumAVX512(&accs, p, key, u64(l)) + } else if hasAVX2 { + accumAVX2(&accs, p, key, u64(l)) + } else if hasSSE2 { + accumSSE(&accs, p, key, u64(l)) + } else { + accumScalar(&accs, p, key, u64(l)) + } + + // merge accs + acc.Lo += mulFold64(accs[0]^key64_011, accs[1]^key64_019) + acc.Hi += mulFold64(accs[0]^key64_117, accs[1]^key64_125) + + acc.Lo += mulFold64(accs[2]^key64_027, accs[3]^key64_035) + acc.Hi += mulFold64(accs[2]^key64_133, accs[3]^key64_141) + + acc.Lo += mulFold64(accs[4]^key64_043, accs[5]^key64_051) + acc.Hi += mulFold64(accs[4]^key64_149, accs[5]^key64_157) + + acc.Lo += mulFold64(accs[6]^key64_059, accs[7]^key64_067) + acc.Hi += mulFold64(accs[6]^key64_165, accs[7]^key64_173) + + acc.Lo = xxh3Avalanche(acc.Lo) + acc.Hi = xxh3Avalanche(acc.Hi) + + return acc + } +} diff --git a/vendor/github.com/zeebo/xxh3/hash128_seed.go b/vendor/github.com/zeebo/xxh3/hash128_seed.go new file mode 100644 index 000000000..358009be3 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/hash128_seed.go @@ -0,0 +1,264 @@ +package xxh3 + +import ( + "math/bits" +) + +// Hash128Seed returns the 128-bit hash of the byte slice. +func Hash128Seed(b []byte, seed uint64) Uint128 { + return hashAny128Seed(*(*str)(ptr(&b)), seed) +} + +// HashString128Seed returns the 128-bit hash of the string slice. 
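+//
+// A minimal usage sketch (the seed value below is arbitrary, chosen only for illustration):
+//
+//	sum := HashString128Seed("hello, world", 42)
+//	canonical := sum.Bytes() // big-endian Hi followed by big-endian Lo
+//	_ = canonical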
+func HashString128Seed(s string, seed uint64) Uint128 { + return hashAny128Seed(*(*str)(ptr(&s)), seed) +} + +func hashAny128Seed(s str, seed uint64) (acc u128) { + p, l := s.p, s.l + + switch { + case l <= 16: + switch { + case l > 8: // 9-16 + bitflipl := (key64_032 ^ key64_040) - seed + bitfliph := (key64_048 ^ key64_056) + seed + + input_lo := readU64(p, 0) + input_hi := readU64(p, ui(l)-8) + + m128_h, m128_l := bits.Mul64(input_lo^input_hi^bitflipl, prime64_1) + + m128_l += uint64(l-1) << 54 + input_hi ^= bitfliph + + m128_h += input_hi + uint64(uint32(input_hi))*(prime32_2-1) + + m128_l ^= bits.ReverseBytes64(m128_h) + + acc.Hi, acc.Lo = bits.Mul64(m128_l, prime64_2) + acc.Hi += m128_h * prime64_2 + + acc.Lo = xxh3Avalanche(acc.Lo) + acc.Hi = xxh3Avalanche(acc.Hi) + + return acc + + case l > 3: // 4-8 + seed ^= u64(bits.ReverseBytes32(u32(seed))) << 32 + bitflip := (key64_016 ^ key64_024) + seed + input_lo := readU32(p, 0) + input_hi := readU32(p, ui(l)-4) + input_64 := u64(input_lo) + u64(input_hi)<<32 + keyed := input_64 ^ bitflip + + acc.Hi, acc.Lo = bits.Mul64(keyed, prime64_1+(uint64(l)<<2)) + + acc.Hi += acc.Lo << 1 + acc.Lo ^= acc.Hi >> 3 + + acc.Lo ^= acc.Lo >> 35 + acc.Lo *= 0x9fb21c651e98df25 + acc.Lo ^= acc.Lo >> 28 + acc.Hi = xxh3Avalanche(acc.Hi) + + return acc + + case l == 3: // 3 + c12 := u64(readU16(p, 0)) + c3 := u64(readU8(p, 2)) + acc.Lo = c12<<16 + c3 + 3<<8 + + case l > 1: // 2 + c12 := u64(readU16(p, 0)) + acc.Lo = c12*(1<<24+1)>>8 + 2<<8 + + case l == 1: // 1 + c1 := u64(readU8(p, 0)) + acc.Lo = c1*(1<<24+1<<16+1) + 1<<8 + + default: // 0 + bitflipl := key64_064 ^ key64_072 ^ seed + bitfliph := key64_080 ^ key64_088 ^ seed + return u128{Lo: xxh64AvalancheFull(bitflipl), Hi: xxh64AvalancheFull(bitfliph)} + } + + acc.Hi = uint64(bits.RotateLeft32(bits.ReverseBytes32(uint32(acc.Lo)), 13)) + acc.Lo ^= uint64(key32_000^key32_004) + seed + acc.Hi ^= uint64(key32_008^key32_012) - seed + + acc.Lo = xxh64AvalancheFull(acc.Lo) + acc.Hi = xxh64AvalancheFull(acc.Hi) + + return acc + + case l <= 128: + acc.Lo = u64(l) * prime64_1 + + if l > 32 { + if l > 64 { + if l > 96 { + in8, in7 := readU64(p, ui(l)-8*8), readU64(p, ui(l)-7*8) + i6, i7 := readU64(p, 6*8), readU64(p, 7*8) + + acc.Hi += mulFold64(in8^(key64_112+seed), in7^(key64_120-seed)) + acc.Hi ^= i6 + i7 + acc.Lo += mulFold64(i6^(key64_096+seed), i7^(key64_104-seed)) + acc.Lo ^= in8 + in7 + + } // 96 + + in6, in5 := readU64(p, ui(l)-6*8), readU64(p, ui(l)-5*8) + i4, i5 := readU64(p, 4*8), readU64(p, 5*8) + + acc.Hi += mulFold64(in6^(key64_080+seed), in5^(key64_088-seed)) + acc.Hi ^= i4 + i5 + acc.Lo += mulFold64(i4^(key64_064+seed), i5^(key64_072-seed)) + acc.Lo ^= in6 + in5 + + } // 64 + + in4, in3 := readU64(p, ui(l)-4*8), readU64(p, ui(l)-3*8) + i2, i3 := readU64(p, 2*8), readU64(p, 3*8) + + acc.Hi += mulFold64(in4^(key64_048+seed), in3^(key64_056-seed)) + acc.Hi ^= i2 + i3 + acc.Lo += mulFold64(i2^(key64_032+seed), i3^(key64_040-seed)) + acc.Lo ^= in4 + in3 + + } // 32 + + in2, in1 := readU64(p, ui(l)-2*8), readU64(p, ui(l)-1*8) + i0, i1 := readU64(p, 0*8), readU64(p, 1*8) + + acc.Hi += mulFold64(in2^(key64_016+seed), in1^(key64_024-seed)) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^(key64_000+seed), i1^(key64_008-seed)) + acc.Lo ^= in2 + in1 + + acc.Hi, acc.Lo = (acc.Lo*prime64_1)+(acc.Hi*prime64_4)+((u64(l)-seed)*prime64_2), acc.Hi+acc.Lo + + acc.Hi = -xxh3Avalanche(acc.Hi) + acc.Lo = xxh3Avalanche(acc.Lo) + + return acc + + case l <= 240: + acc.Lo = u64(l) * prime64_1 + + { + i0, i1, i2, i3 := readU64(p, 
0*8), readU64(p, 1*8), readU64(p, 2*8), readU64(p, 3*8) + + acc.Hi += mulFold64(i2^(key64_016+seed), i3^(key64_024-seed)) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^(key64_000+seed), i1^(key64_008-seed)) + acc.Lo ^= i2 + i3 + } + + { + i0, i1, i2, i3 := readU64(p, 4*8), readU64(p, 5*8), readU64(p, 6*8), readU64(p, 7*8) + + acc.Hi += mulFold64(i2^(key64_048+seed), i3^(key64_056-seed)) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^(key64_032+seed), i1^(key64_040-seed)) + acc.Lo ^= i2 + i3 + } + + { + i0, i1, i2, i3 := readU64(p, 8*8), readU64(p, 9*8), readU64(p, 10*8), readU64(p, 11*8) + + acc.Hi += mulFold64(i2^(key64_080+seed), i3^(key64_088-seed)) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^(key64_064+seed), i1^(key64_072-seed)) + acc.Lo ^= i2 + i3 + } + + { + i0, i1, i2, i3 := readU64(p, 12*8), readU64(p, 13*8), readU64(p, 14*8), readU64(p, 15*8) + + acc.Hi += mulFold64(i2^(key64_112+seed), i3^(key64_120-seed)) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^(key64_096+seed), i1^(key64_104-seed)) + acc.Lo ^= i2 + i3 + } + + // avalanche + acc.Hi = xxh3Avalanche(acc.Hi) + acc.Lo = xxh3Avalanche(acc.Lo) + + // trailing groups after 128 + top := ui(l) &^ 31 + for i := ui(4 * 32); i < top; i += 32 { + i0, i1, i2, i3 := readU64(p, i+0), readU64(p, i+8), readU64(p, i+16), readU64(p, i+24) + k0, k1, k2, k3 := readU64(key, i-125)+seed, readU64(key, i-117)-seed, readU64(key, i-109)+seed, readU64(key, i-101)-seed + + acc.Hi += mulFold64(i2^k2, i3^k3) + acc.Hi ^= i0 + i1 + acc.Lo += mulFold64(i0^k0, i1^k1) + acc.Lo ^= i2 + i3 + } + + // last 32 bytes + { + i0, i1, i2, i3 := readU64(p, ui(l)-32), readU64(p, ui(l)-24), readU64(p, ui(l)-16), readU64(p, ui(l)-8) + + seed := 0 - seed + acc.Hi += mulFold64(i0^(key64_119+seed), i1^(key64_127-seed)) + acc.Hi ^= i2 + i3 + acc.Lo += mulFold64(i2^(key64_103+seed), i3^(key64_111-seed)) + acc.Lo ^= i0 + i1 + } + + acc.Hi, acc.Lo = (acc.Lo*prime64_1)+(acc.Hi*prime64_4)+((u64(l)-seed)*prime64_2), acc.Hi+acc.Lo + + acc.Hi = -xxh3Avalanche(acc.Hi) + acc.Lo = xxh3Avalanche(acc.Lo) + + return acc + + default: + acc.Lo = u64(l) * prime64_1 + acc.Hi = ^(u64(l) * prime64_2) + + secret := key + if seed != 0 { + secret = ptr(&[secretSize]byte{}) + initSecret(secret, seed) + } + + accs := [8]u64{ + prime32_3, prime64_1, prime64_2, prime64_3, + prime64_4, prime32_2, prime64_5, prime32_1, + } + + if hasAVX512 && l >= avx512Switch { + accumAVX512(&accs, p, secret, u64(l)) + } else if hasAVX2 { + accumAVX2(&accs, p, secret, u64(l)) + } else if hasSSE2 { + accumSSE(&accs, p, secret, u64(l)) + } else { + accumScalar(&accs, p, secret, u64(l)) + } + + // merge accs + const hi_off = 117 - 11 + + acc.Lo += mulFold64(accs[0]^readU64(secret, 11), accs[1]^readU64(secret, 19)) + acc.Hi += mulFold64(accs[0]^readU64(secret, 11+hi_off), accs[1]^readU64(secret, 19+hi_off)) + + acc.Lo += mulFold64(accs[2]^readU64(secret, 27), accs[3]^readU64(secret, 35)) + acc.Hi += mulFold64(accs[2]^readU64(secret, 27+hi_off), accs[3]^readU64(secret, 35+hi_off)) + + acc.Lo += mulFold64(accs[4]^readU64(secret, 43), accs[5]^readU64(secret, 51)) + acc.Hi += mulFold64(accs[4]^readU64(secret, 43+hi_off), accs[5]^readU64(secret, 51+hi_off)) + + acc.Lo += mulFold64(accs[6]^readU64(secret, 59), accs[7]^readU64(secret, 67)) + acc.Hi += mulFold64(accs[6]^readU64(secret, 59+hi_off), accs[7]^readU64(secret, 67+hi_off)) + + acc.Lo = xxh3Avalanche(acc.Lo) + acc.Hi = xxh3Avalanche(acc.Hi) + + return acc + } +} diff --git a/vendor/github.com/zeebo/xxh3/hash64.go b/vendor/github.com/zeebo/xxh3/hash64.go new file mode 100644 
index 000000000..13aab9585 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/hash64.go @@ -0,0 +1,126 @@ +package xxh3 + +import "math/bits" + +// Hash returns the hash of the byte slice. +func Hash(b []byte) uint64 { + return hashAny(*(*str)(ptr(&b))) +} + +// Hash returns the hash of the string slice. +func HashString(s string) uint64 { + return hashAny(*(*str)(ptr(&s))) +} + +func hashAny(s str) (acc u64) { + p, l := s.p, s.l + + switch { + case l <= 16: + switch { + case l > 8: // 9-16 + inputlo := readU64(p, 0) ^ (key64_024 ^ key64_032) + inputhi := readU64(p, ui(l)-8) ^ (key64_040 ^ key64_048) + folded := mulFold64(inputlo, inputhi) + return xxh3Avalanche(u64(l) + bits.ReverseBytes64(inputlo) + inputhi + folded) + + case l > 3: // 4-8 + input1 := readU32(p, 0) + input2 := readU32(p, ui(l)-4) + input64 := u64(input2) + u64(input1)<<32 + keyed := input64 ^ (key64_008 ^ key64_016) + return rrmxmx(keyed, u64(l)) + + case l == 3: // 3 + c12 := u64(readU16(p, 0)) + c3 := u64(readU8(p, 2)) + acc = c12<<16 + c3 + 3<<8 + + case l > 1: // 2 + c12 := u64(readU16(p, 0)) + acc = c12*(1<<24+1)>>8 + 2<<8 + + case l == 1: // 1 + c1 := u64(readU8(p, 0)) + acc = c1*(1<<24+1<<16+1) + 1<<8 + + default: // 0 + return 0x2d06800538d394c2 // xxh_avalanche(key64_056 ^ key64_064) + } + + acc ^= u64(key32_000 ^ key32_004) + return xxhAvalancheSmall(acc) + + case l <= 128: + acc = u64(l) * prime64_1 + + if l > 32 { + if l > 64 { + if l > 96 { + acc += mulFold64(readU64(p, 6*8)^key64_096, readU64(p, 7*8)^key64_104) + acc += mulFold64(readU64(p, ui(l)-8*8)^key64_112, readU64(p, ui(l)-7*8)^key64_120) + } // 96 + acc += mulFold64(readU64(p, 4*8)^key64_064, readU64(p, 5*8)^key64_072) + acc += mulFold64(readU64(p, ui(l)-6*8)^key64_080, readU64(p, ui(l)-5*8)^key64_088) + } // 64 + acc += mulFold64(readU64(p, 2*8)^key64_032, readU64(p, 3*8)^key64_040) + acc += mulFold64(readU64(p, ui(l)-4*8)^key64_048, readU64(p, ui(l)-3*8)^key64_056) + } // 32 + acc += mulFold64(readU64(p, 0*8)^key64_000, readU64(p, 1*8)^key64_008) + acc += mulFold64(readU64(p, ui(l)-2*8)^key64_016, readU64(p, ui(l)-1*8)^key64_024) + + return xxh3Avalanche(acc) + + case l <= 240: + acc = u64(l) * prime64_1 + + acc += mulFold64(readU64(p, 0*16+0)^key64_000, readU64(p, 0*16+8)^key64_008) + acc += mulFold64(readU64(p, 1*16+0)^key64_016, readU64(p, 1*16+8)^key64_024) + acc += mulFold64(readU64(p, 2*16+0)^key64_032, readU64(p, 2*16+8)^key64_040) + acc += mulFold64(readU64(p, 3*16+0)^key64_048, readU64(p, 3*16+8)^key64_056) + acc += mulFold64(readU64(p, 4*16+0)^key64_064, readU64(p, 4*16+8)^key64_072) + acc += mulFold64(readU64(p, 5*16+0)^key64_080, readU64(p, 5*16+8)^key64_088) + acc += mulFold64(readU64(p, 6*16+0)^key64_096, readU64(p, 6*16+8)^key64_104) + acc += mulFold64(readU64(p, 7*16+0)^key64_112, readU64(p, 7*16+8)^key64_120) + + // avalanche + acc = xxh3Avalanche(acc) + + // trailing groups after 128 + top := ui(l) &^ 15 + for i := ui(8 * 16); i < top; i += 16 { + acc += mulFold64(readU64(p, i+0)^readU64(key, i-125), readU64(p, i+8)^readU64(key, i-117)) + } + + // last 16 bytes + acc += mulFold64(readU64(p, ui(l)-16)^key64_119, readU64(p, ui(l)-8)^key64_127) + + return xxh3Avalanche(acc) + + default: + acc = u64(l) * prime64_1 + + accs := [8]u64{ + prime32_3, prime64_1, prime64_2, prime64_3, + prime64_4, prime32_2, prime64_5, prime32_1, + } + + if hasAVX512 && l >= avx512Switch { + accumAVX512(&accs, p, key, u64(l)) + } else if hasAVX2 { + accumAVX2(&accs, p, key, u64(l)) + } else if hasSSE2 { + accumSSE(&accs, p, key, u64(l)) + } else { + 
accumScalar(&accs, p, key, u64(l)) + } + + // merge accs + acc += mulFold64(accs[0]^key64_011, accs[1]^key64_019) + acc += mulFold64(accs[2]^key64_027, accs[3]^key64_035) + acc += mulFold64(accs[4]^key64_043, accs[5]^key64_051) + acc += mulFold64(accs[6]^key64_059, accs[7]^key64_067) + + return xxh3Avalanche(acc) + } +} diff --git a/vendor/github.com/zeebo/xxh3/hash64_seed.go b/vendor/github.com/zeebo/xxh3/hash64_seed.go new file mode 100644 index 000000000..429994c36 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/hash64_seed.go @@ -0,0 +1,134 @@ +package xxh3 + +import "math/bits" + +// HashSeed returns the hash of the byte slice with given seed. +func HashSeed(b []byte, seed uint64) uint64 { + return hashAnySeed(*(*str)(ptr(&b)), seed) + +} + +// HashStringSeed returns the hash of the string slice with given seed. +func HashStringSeed(s string, seed uint64) uint64 { + return hashAnySeed(*(*str)(ptr(&s)), seed) +} + +func hashAnySeed(s str, seed uint64) (acc u64) { + p, l := s.p, s.l + + switch { + case l <= 16: + switch { + case l > 8: + inputlo := readU64(p, 0) ^ (key64_024 ^ key64_032 + seed) + inputhi := readU64(p, ui(l)-8) ^ (key64_040 ^ key64_048 - seed) + folded := mulFold64(inputlo, inputhi) + return xxh3Avalanche(u64(l) + bits.ReverseBytes64(inputlo) + inputhi + folded) + + case l > 3: + seed ^= u64(bits.ReverseBytes32(u32(seed))) << 32 + input1 := readU32(p, 0) + input2 := readU32(p, ui(l)-4) + input64 := u64(input2) + u64(input1)<<32 + keyed := input64 ^ (key64_008 ^ key64_016 - seed) + return rrmxmx(keyed, u64(l)) + + case l == 3: // 3 + c12 := u64(readU16(p, 0)) + c3 := u64(readU8(p, 2)) + acc = c12<<16 + c3 + 3<<8 + + case l > 1: // 2 + c12 := u64(readU16(p, 0)) + acc = c12*(1<<24+1)>>8 + 2<<8 + + case l == 1: // 1 + c1 := u64(readU8(p, 0)) + acc = c1*(1<<24+1<<16+1) + 1<<8 + + default: + return xxhAvalancheSmall(seed ^ key64_056 ^ key64_064) + } + + acc ^= u64(key32_000^key32_004) + seed + return xxhAvalancheSmall(acc) + + case l <= 128: + acc = u64(l) * prime64_1 + + if l > 32 { + if l > 64 { + if l > 96 { + acc += mulFold64(readU64(p, 6*8)^(key64_096+seed), readU64(p, 7*8)^(key64_104-seed)) + acc += mulFold64(readU64(p, ui(l)-8*8)^(key64_112+seed), readU64(p, ui(l)-7*8)^(key64_120-seed)) + } // 96 + acc += mulFold64(readU64(p, 4*8)^(key64_064+seed), readU64(p, 5*8)^(key64_072-seed)) + acc += mulFold64(readU64(p, ui(l)-6*8)^(key64_080+seed), readU64(p, ui(l)-5*8)^(key64_088-seed)) + } // 64 + acc += mulFold64(readU64(p, 2*8)^(key64_032+seed), readU64(p, 3*8)^(key64_040-seed)) + acc += mulFold64(readU64(p, ui(l)-4*8)^(key64_048+seed), readU64(p, ui(l)-3*8)^(key64_056-seed)) + } // 32 + acc += mulFold64(readU64(p, 0*8)^(key64_000+seed), readU64(p, 1*8)^(key64_008-seed)) + acc += mulFold64(readU64(p, ui(l)-2*8)^(key64_016+seed), readU64(p, ui(l)-1*8)^(key64_024-seed)) + + return xxh3Avalanche(acc) + + case l <= 240: + acc = u64(l) * prime64_1 + + acc += mulFold64(readU64(p, 0*16+0)^(key64_000+seed), readU64(p, 0*16+8)^(key64_008-seed)) + acc += mulFold64(readU64(p, 1*16+0)^(key64_016+seed), readU64(p, 1*16+8)^(key64_024-seed)) + acc += mulFold64(readU64(p, 2*16+0)^(key64_032+seed), readU64(p, 2*16+8)^(key64_040-seed)) + acc += mulFold64(readU64(p, 3*16+0)^(key64_048+seed), readU64(p, 3*16+8)^(key64_056-seed)) + acc += mulFold64(readU64(p, 4*16+0)^(key64_064+seed), readU64(p, 4*16+8)^(key64_072-seed)) + acc += mulFold64(readU64(p, 5*16+0)^(key64_080+seed), readU64(p, 5*16+8)^(key64_088-seed)) + acc += mulFold64(readU64(p, 6*16+0)^(key64_096+seed), readU64(p, 
6*16+8)^(key64_104-seed)) + acc += mulFold64(readU64(p, 7*16+0)^(key64_112+seed), readU64(p, 7*16+8)^(key64_120-seed)) + + // avalanche + acc = xxh3Avalanche(acc) + + // trailing groups after 128 + top := ui(l) &^ 15 + for i := ui(8 * 16); i < top; i += 16 { + acc += mulFold64(readU64(p, i+0)^(readU64(key, i-125)+seed), readU64(p, i+8)^(readU64(key, i-117)-seed)) + } + + // last 16 bytes + acc += mulFold64(readU64(p, ui(l)-16)^(key64_119+seed), readU64(p, ui(l)-8)^(key64_127-seed)) + + return xxh3Avalanche(acc) + + default: + acc = u64(l) * prime64_1 + + secret := key + if seed != 0 { + secret = ptr(&[secretSize]byte{}) + initSecret(secret, seed) + } + + accs := [8]u64{ + prime32_3, prime64_1, prime64_2, prime64_3, + prime64_4, prime32_2, prime64_5, prime32_1, + } + + if hasAVX512 && l >= avx512Switch { + accumAVX512(&accs, p, secret, u64(l)) + } else if hasAVX2 { + accumAVX2(&accs, p, secret, u64(l)) + } else if hasSSE2 { + accumSSE(&accs, p, secret, u64(l)) + } else { + accumScalarSeed(&accs, p, secret, u64(l)) + } + + // merge accs + acc += mulFold64(accs[0]^readU64(secret, 11), accs[1]^readU64(secret, 19)) + acc += mulFold64(accs[2]^readU64(secret, 27), accs[3]^readU64(secret, 35)) + acc += mulFold64(accs[4]^readU64(secret, 43), accs[5]^readU64(secret, 51)) + acc += mulFold64(accs[6]^readU64(secret, 59), accs[7]^readU64(secret, 67)) + + return xxh3Avalanche(acc) + } +} diff --git a/vendor/github.com/zeebo/xxh3/hasher.go b/vendor/github.com/zeebo/xxh3/hasher.go new file mode 100644 index 000000000..d9789980a --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/hasher.go @@ -0,0 +1,239 @@ +package xxh3 + +import ( + "encoding/binary" + "hash" +) + +// Hasher implements the hash.Hash interface +type Hasher struct { + acc [8]u64 + blk u64 + len u64 + key ptr + buf [_block + _stripe]byte + seed u64 +} + +var ( + _ hash.Hash = (*Hasher)(nil) + _ hash.Hash64 = (*Hasher)(nil) +) + +// New returns a new Hasher that implements the hash.Hash interface. +func New() *Hasher { + return new(Hasher) +} + +// NewSeed returns a new Hasher that implements the hash.Hash interface. +func NewSeed(seed uint64) *Hasher { + var h Hasher + h.Reset() + h.seed = seed + h.key = key + + // Only initiate once, not on reset. + if seed != 0 { + h.key = ptr(&[secretSize]byte{}) + initSecret(h.key, seed) + } + return &h +} + +// Reset resets the Hash to its initial state. +func (h *Hasher) Reset() { + h.acc = [8]u64{ + prime32_3, prime64_1, prime64_2, prime64_3, + prime64_4, prime32_2, prime64_5, prime32_1, + } + h.blk = 0 + h.len = 0 +} + +// BlockSize returns the hash's underlying block size. +// The Write method will accept any amount of data, but +// it may operate more efficiently if all writes are a +// multiple of the block size. +func (h *Hasher) BlockSize() int { return _stripe } + +// Size returns the number of bytes Sum will return. +func (h *Hasher) Size() int { return 8 } + +// Sum appends the current hash to b and returns the resulting slice. +// It does not change the underlying hash state. +func (h *Hasher) Sum(b []byte) []byte { + var tmp [8]byte + binary.BigEndian.PutUint64(tmp[:], h.Sum64()) + return append(b, tmp[:]...) +} + +// Write adds more data to the running hash. +// It never returns an error. +func (h *Hasher) Write(buf []byte) (int, error) { + h.update(buf) + return len(buf), nil +} + +// WriteString adds more data to the running hash. +// It never returns an error. 
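+//
+// A minimal streaming sketch (Write and WriteString never return an error, so the
+// error results are ignored here; the input strings are arbitrary):
+//
+//	h := New()
+//	_, _ = h.WriteString("hello, ")
+//	_, _ = h.WriteString("world")
+//	sum64 := h.Sum64()
+//	sum128 := h.Sum128()
+//	_, _ = sum64, sum128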
+func (h *Hasher) WriteString(buf string) (int, error) { + h.updateString(buf) + return len(buf), nil +} + +func (h *Hasher) update(buf []byte) { + // relies on the data pointer being the first word in the string header + h.updateString(*(*string)(ptr(&buf))) +} + +func (h *Hasher) updateString(buf string) { + if h.key == nil { + h.key = key + h.Reset() + } + + // On first write, if more than 1 block, process without copy. + for h.len == 0 && len(buf) > len(h.buf) { + if hasAVX2 { + accumBlockAVX2(&h.acc, *(*ptr)(ptr(&buf)), h.key) + } else if hasSSE2 { + accumBlockSSE(&h.acc, *(*ptr)(ptr(&buf)), h.key) + } else { + accumBlockScalar(&h.acc, *(*ptr)(ptr(&buf)), h.key) + } + buf = buf[_block:] + h.blk++ + } + + for len(buf) > 0 { + if h.len < u64(len(h.buf)) { + n := copy(h.buf[h.len:], buf) + h.len += u64(n) + buf = buf[n:] + continue + } + + if hasAVX2 { + accumBlockAVX2(&h.acc, ptr(&h.buf), h.key) + } else if hasSSE2 { + accumBlockSSE(&h.acc, ptr(&h.buf), h.key) + } else { + accumBlockScalar(&h.acc, ptr(&h.buf), h.key) + } + + h.blk++ + h.len = _stripe + copy(h.buf[:_stripe], h.buf[_block:]) + } +} + +// Sum64 returns the 64-bit hash of the written data. +func (h *Hasher) Sum64() uint64 { + if h.key == nil { + h.key = key + h.Reset() + } + + if h.blk == 0 { + if h.seed == 0 { + return Hash(h.buf[:h.len]) + } + return HashSeed(h.buf[:h.len], h.seed) + } + + l := h.blk*_block + h.len + acc := l * prime64_1 + accs := h.acc + + if h.len > 0 { + // We are only ever doing 1 block here, so no avx512. + if hasAVX2 { + accumAVX2(&accs, ptr(&h.buf[0]), h.key, h.len) + } else if hasSSE2 { + accumSSE(&accs, ptr(&h.buf[0]), h.key, h.len) + } else { + accumScalar(&accs, ptr(&h.buf[0]), h.key, h.len) + } + } + + if h.seed == 0 { + acc += mulFold64(accs[0]^key64_011, accs[1]^key64_019) + acc += mulFold64(accs[2]^key64_027, accs[3]^key64_035) + acc += mulFold64(accs[4]^key64_043, accs[5]^key64_051) + acc += mulFold64(accs[6]^key64_059, accs[7]^key64_067) + } else { + secret := h.key + acc += mulFold64(accs[0]^readU64(secret, 11), accs[1]^readU64(secret, 19)) + acc += mulFold64(accs[2]^readU64(secret, 27), accs[3]^readU64(secret, 35)) + acc += mulFold64(accs[4]^readU64(secret, 43), accs[5]^readU64(secret, 51)) + acc += mulFold64(accs[6]^readU64(secret, 59), accs[7]^readU64(secret, 67)) + } + + acc = xxh3Avalanche(acc) + + return acc +} + +// Sum128 returns the 128-bit hash of the written data. +func (h *Hasher) Sum128() Uint128 { + if h.key == nil { + h.key = key + h.Reset() + } + + if h.blk == 0 { + if h.seed == 0 { + return Hash128(h.buf[:h.len]) + } + return Hash128Seed(h.buf[:h.len], h.seed) + } + + l := h.blk*_block + h.len + acc := Uint128{Lo: l * prime64_1, Hi: ^(l * prime64_2)} + accs := h.acc + + if h.len > 0 { + // We are only ever doing 1 block here, so no avx512. 
+ if hasAVX2 { + accumAVX2(&accs, ptr(&h.buf[0]), h.key, h.len) + } else if hasSSE2 { + accumSSE(&accs, ptr(&h.buf[0]), h.key, h.len) + } else { + accumScalar(&accs, ptr(&h.buf[0]), h.key, h.len) + } + } + + if h.seed == 0 { + acc.Lo += mulFold64(accs[0]^key64_011, accs[1]^key64_019) + acc.Hi += mulFold64(accs[0]^key64_117, accs[1]^key64_125) + + acc.Lo += mulFold64(accs[2]^key64_027, accs[3]^key64_035) + acc.Hi += mulFold64(accs[2]^key64_133, accs[3]^key64_141) + + acc.Lo += mulFold64(accs[4]^key64_043, accs[5]^key64_051) + acc.Hi += mulFold64(accs[4]^key64_149, accs[5]^key64_157) + + acc.Lo += mulFold64(accs[6]^key64_059, accs[7]^key64_067) + acc.Hi += mulFold64(accs[6]^key64_165, accs[7]^key64_173) + } else { + secret := h.key + const hi_off = 117 - 11 + + acc.Lo += mulFold64(accs[0]^readU64(secret, 11), accs[1]^readU64(secret, 19)) + acc.Hi += mulFold64(accs[0]^readU64(secret, 11+hi_off), accs[1]^readU64(secret, 19+hi_off)) + + acc.Lo += mulFold64(accs[2]^readU64(secret, 27), accs[3]^readU64(secret, 35)) + acc.Hi += mulFold64(accs[2]^readU64(secret, 27+hi_off), accs[3]^readU64(secret, 35+hi_off)) + + acc.Lo += mulFold64(accs[4]^readU64(secret, 43), accs[5]^readU64(secret, 51)) + acc.Hi += mulFold64(accs[4]^readU64(secret, 43+hi_off), accs[5]^readU64(secret, 51+hi_off)) + + acc.Lo += mulFold64(accs[6]^readU64(secret, 59), accs[7]^readU64(secret, 67)) + acc.Hi += mulFold64(accs[6]^readU64(secret, 59+hi_off), accs[7]^readU64(secret, 67+hi_off)) + } + + acc.Lo = xxh3Avalanche(acc.Lo) + acc.Hi = xxh3Avalanche(acc.Hi) + + return acc +} diff --git a/vendor/github.com/zeebo/xxh3/utils.go b/vendor/github.com/zeebo/xxh3/utils.go new file mode 100644 index 000000000..a837e68a6 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/utils.go @@ -0,0 +1,129 @@ +package xxh3 + +import ( + "math/bits" + "unsafe" +) + +// Uint128 is a 128 bit value. +// The actual value can be thought of as u.Hi<<64 | u.Lo. +type Uint128 struct { + Hi, Lo uint64 +} + +// Bytes returns the uint128 as an array of bytes in canonical form (big-endian encoded). 
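+//
+// For example (the value is arbitrary, shown only to illustrate the byte order):
+//
+//	u := Uint128{Hi: 0x0102030405060708, Lo: 0x090a0b0c0d0e0f10}
+//	b := u.Bytes() // b[0] == 0x01, b[15] == 0x10
+//	_ = b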
+func (u Uint128) Bytes() [16]byte { + return [16]byte{ + byte(u.Hi >> 0x38), byte(u.Hi >> 0x30), byte(u.Hi >> 0x28), byte(u.Hi >> 0x20), + byte(u.Hi >> 0x18), byte(u.Hi >> 0x10), byte(u.Hi >> 0x08), byte(u.Hi), + byte(u.Lo >> 0x38), byte(u.Lo >> 0x30), byte(u.Lo >> 0x28), byte(u.Lo >> 0x20), + byte(u.Lo >> 0x18), byte(u.Lo >> 0x10), byte(u.Lo >> 0x08), byte(u.Lo), + } +} + +type ( + ptr = unsafe.Pointer + ui = uintptr + + u8 = uint8 + u32 = uint32 + u64 = uint64 + u128 = Uint128 +) + +type str struct { + p ptr + l uint +} + +func readU8(p ptr, o ui) uint8 { + return *(*uint8)(ptr(ui(p) + o)) +} + +func readU16(p ptr, o ui) uint16 { + b := (*[2]byte)(ptr(ui(p) + o)) + return uint16(b[0]) | uint16(b[1])<<8 +} + +func readU32(p ptr, o ui) uint32 { + b := (*[4]byte)(ptr(ui(p) + o)) + return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 +} + +func readU64(p ptr, o ui) uint64 { + b := (*[8]byte)(ptr(ui(p) + o)) + return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | + uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 +} + +func writeU64(p ptr, o ui, v u64) { + b := (*[8]byte)(ptr(ui(p) + o)) + b[0] = byte(v) + b[1] = byte(v >> 8) + b[2] = byte(v >> 16) + b[3] = byte(v >> 24) + b[4] = byte(v >> 32) + b[5] = byte(v >> 40) + b[6] = byte(v >> 48) + b[7] = byte(v >> 56) +} + +const secretSize = 192 + +func initSecret(secret ptr, seed u64) { + for i := ui(0); i < secretSize/16; i++ { + lo := readU64(key, 16*i) + seed + hi := readU64(key, 16*i+8) - seed + writeU64(secret, 16*i, lo) + writeU64(secret, 16*i+8, hi) + } +} + +func xxh64AvalancheSmall(x u64) u64 { + // x ^= x >> 33 // x must be < 32 bits + // x ^= u64(key32_000 ^ key32_004) // caller must do this + x *= prime64_2 + x ^= x >> 29 + x *= prime64_3 + x ^= x >> 32 + return x +} + +func xxhAvalancheSmall(x u64) u64 { + x ^= x >> 33 + x *= prime64_2 + x ^= x >> 29 + x *= prime64_3 + x ^= x >> 32 + return x +} + +func xxh64AvalancheFull(x u64) u64 { + x ^= x >> 33 + x *= prime64_2 + x ^= x >> 29 + x *= prime64_3 + x ^= x >> 32 + return x +} + +func xxh3Avalanche(x u64) u64 { + x ^= x >> 37 + x *= 0x165667919e3779f9 + x ^= x >> 32 + return x +} + +func rrmxmx(h64 u64, len u64) u64 { + h64 ^= bits.RotateLeft64(h64, 49) ^ bits.RotateLeft64(h64, 24) + h64 *= 0x9fb21c651e98df25 + h64 ^= (h64 >> 35) + len + h64 *= 0x9fb21c651e98df25 + h64 ^= (h64 >> 28) + return h64 +} + +func mulFold64(x, y u64) u64 { + hi, lo := bits.Mul64(x, y) + return hi ^ lo +} |