summary | refs | log | tree | commit | diff
path: root/vendor/github.com/zeebo/xxh3/accum_generic.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/zeebo/xxh3/accum_generic.go')
-rw-r--r-- vendor/github.com/zeebo/xxh3/accum_generic.go 542
1 files changed, 542 insertions, 0 deletions
diff --git a/vendor/github.com/zeebo/xxh3/accum_generic.go b/vendor/github.com/zeebo/xxh3/accum_generic.go
new file mode 100644
index 000000000..b1be78507
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/accum_generic.go
@@ -0,0 +1,542 @@
+package xxh3
+
+// avx512Switch is the size (1 << 10, i.e. 1024) at which the avx512 code is used.
+// Bigger blocks benefit more. NOTE(review): presumably an input length in bytes; confirm at the call site.
+const avx512Switch = 1 << 10
+// accumScalar folds the input of length l at p into accs. It handles secret == key itself (via the key64_* constants) and passes any other secret to accumScalarSeed.
+func accumScalar(accs *[8]u64, p, secret ptr, l u64) {
+ if secret != key { // a secret other than key goes to the variant that reads every key word from secret
+ accumScalarSeed(accs, p, secret, l)
+ return
+ }
+ for l > _block { // while more than _block remains: 16 stripes followed by a scramble
+ k := secret
+ // Each pass reads eight 64-bit words from p and eight from k. Word i of the input is added
+ // to accs[i^1]; the low 32 bits times the high 32 bits of (input ^ key) is added to accs[i].
+ for i := 0; i < 16; i++ {
+ dv0 := readU64(p, 8*0)
+ dk0 := dv0 ^ readU64(k, 8*0)
+ accs[1] += dv0
+ accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
+
+ dv1 := readU64(p, 8*1)
+ dk1 := dv1 ^ readU64(k, 8*1)
+ accs[0] += dv1
+ accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
+
+ dv2 := readU64(p, 8*2)
+ dk2 := dv2 ^ readU64(k, 8*2)
+ accs[3] += dv2
+ accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
+
+ dv3 := readU64(p, 8*3)
+ dk3 := dv3 ^ readU64(k, 8*3)
+ accs[2] += dv3
+ accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
+
+ dv4 := readU64(p, 8*4)
+ dk4 := dv4 ^ readU64(k, 8*4)
+ accs[5] += dv4
+ accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
+
+ dv5 := readU64(p, 8*5)
+ dk5 := dv5 ^ readU64(k, 8*5)
+ accs[4] += dv5
+ accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
+
+ dv6 := readU64(p, 8*6)
+ dk6 := dv6 ^ readU64(k, 8*6)
+ accs[7] += dv6
+ accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
+
+ dv7 := readU64(p, 8*7)
+ dk7 := dv7 ^ readU64(k, 8*7)
+ accs[6] += dv7
+ accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
+
+ l -= _stripe
+ if l > 0 {
+ p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8) // next input stripe; the key window moves forward by 8
+ }
+ }
+ // Scramble: xor each accumulator with itself shifted right by 47, xor in its key word
+ // (key64_128 for accs[0] up to key64_184 for accs[7]), then multiply by prime32_1.
+ accs[0] ^= accs[0] >> 47
+ accs[0] ^= key64_128
+ accs[0] *= prime32_1
+
+ accs[1] ^= accs[1] >> 47
+ accs[1] ^= key64_136
+ accs[1] *= prime32_1
+
+ accs[2] ^= accs[2] >> 47
+ accs[2] ^= key64_144
+ accs[2] *= prime32_1
+
+ accs[3] ^= accs[3] >> 47
+ accs[3] ^= key64_152
+ accs[3] *= prime32_1
+
+ accs[4] ^= accs[4] >> 47
+ accs[4] ^= key64_160
+ accs[4] *= prime32_1
+
+ accs[5] ^= accs[5] >> 47
+ accs[5] ^= key64_168
+ accs[5] *= prime32_1
+
+ accs[6] ^= accs[6] >> 47
+ accs[6] ^= key64_176
+ accs[6] *= prime32_1
+
+ accs[7] ^= accs[7] >> 47
+ accs[7] ^= key64_184
+ accs[7] *= prime32_1
+ }
+ // Tail: the loop above exits once l <= _block; no scramble follows the stripes below.
+ if l > 0 {
+ t, k := (l-1)/_stripe, secret // t whole stripes come before the final, possibly partial, one
+
+ for i := u64(0); i < t; i++ {
+ dv0 := readU64(p, 8*0)
+ dk0 := dv0 ^ readU64(k, 8*0)
+ accs[1] += dv0
+ accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
+
+ dv1 := readU64(p, 8*1)
+ dk1 := dv1 ^ readU64(k, 8*1)
+ accs[0] += dv1
+ accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
+
+ dv2 := readU64(p, 8*2)
+ dk2 := dv2 ^ readU64(k, 8*2)
+ accs[3] += dv2
+ accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
+
+ dv3 := readU64(p, 8*3)
+ dk3 := dv3 ^ readU64(k, 8*3)
+ accs[2] += dv3
+ accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
+
+ dv4 := readU64(p, 8*4)
+ dk4 := dv4 ^ readU64(k, 8*4)
+ accs[5] += dv4
+ accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
+
+ dv5 := readU64(p, 8*5)
+ dk5 := dv5 ^ readU64(k, 8*5)
+ accs[4] += dv5
+ accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
+
+ dv6 := readU64(p, 8*6)
+ dk6 := dv6 ^ readU64(k, 8*6)
+ accs[7] += dv6
+ accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
+
+ dv7 := readU64(p, 8*7)
+ dk7 := dv7 ^ readU64(k, 8*7)
+ accs[6] += dv7
+ accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
+
+ l -= _stripe
+ if l > 0 {
+ p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
+ }
+ }
+ // After t stripes, l is between 1 and _stripe, so the final stripe below always runs.
+ if l > 0 {
+ p = ptr(ui(p) - uintptr(_stripe-l)) // step back by the shortfall so the last read ends where the input ends (assumes _stripe covers the 8 words read below; confirm)
+ // The final stripe uses the fixed key words key64_121..key64_177 rather than the moving window k.
+ dv0 := readU64(p, 8*0)
+ dk0 := dv0 ^ key64_121
+ accs[1] += dv0
+ accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
+
+ dv1 := readU64(p, 8*1)
+ dk1 := dv1 ^ key64_129
+ accs[0] += dv1
+ accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
+
+ dv2 := readU64(p, 8*2)
+ dk2 := dv2 ^ key64_137
+ accs[3] += dv2
+ accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
+
+ dv3 := readU64(p, 8*3)
+ dk3 := dv3 ^ key64_145
+ accs[2] += dv3
+ accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
+
+ dv4 := readU64(p, 8*4)
+ dk4 := dv4 ^ key64_153
+ accs[5] += dv4
+ accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
+
+ dv5 := readU64(p, 8*5)
+ dk5 := dv5 ^ key64_161
+ accs[4] += dv5
+ accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
+
+ dv6 := readU64(p, 8*6)
+ dk6 := dv6 ^ key64_169
+ accs[7] += dv6
+ accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
+
+ dv7 := readU64(p, 8*7)
+ dk7 := dv7 ^ key64_177
+ accs[6] += dv7
+ accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
+ }
+ }
+}
+// accumBlockScalar runs one round of 16 stripes from p into accs and then scrambles accs, for secret == key; any other secret is passed to accumBlockScalarSeed.
+func accumBlockScalar(accs *[8]u64, p, secret ptr) {
+ if secret != key { // a secret other than key goes to the variant that reads the scramble words from secret
+ accumBlockScalarSeed(accs, p, secret)
+ return
+ }
+ // Accumulate: per stripe, input word i is added to accs[i^1] and low32*high32 of (input ^ key) to accs[i].
+ for i := 0; i < 16; i++ {
+ dv0 := readU64(p, 8*0)
+ dk0 := dv0 ^ readU64(secret, 8*0)
+ accs[1] += dv0
+ accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
+
+ dv1 := readU64(p, 8*1)
+ dk1 := dv1 ^ readU64(secret, 8*1)
+ accs[0] += dv1
+ accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
+
+ dv2 := readU64(p, 8*2)
+ dk2 := dv2 ^ readU64(secret, 8*2)
+ accs[3] += dv2
+ accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
+
+ dv3 := readU64(p, 8*3)
+ dk3 := dv3 ^ readU64(secret, 8*3)
+ accs[2] += dv3
+ accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
+
+ dv4 := readU64(p, 8*4)
+ dk4 := dv4 ^ readU64(secret, 8*4)
+ accs[5] += dv4
+ accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
+
+ dv5 := readU64(p, 8*5)
+ dk5 := dv5 ^ readU64(secret, 8*5)
+ accs[4] += dv5
+ accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
+
+ dv6 := readU64(p, 8*6)
+ dk6 := dv6 ^ readU64(secret, 8*6)
+ accs[7] += dv6
+ accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
+
+ dv7 := readU64(p, 8*7)
+ dk7 := dv7 ^ readU64(secret, 8*7)
+ accs[6] += dv7
+ accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
+
+ p, secret = ptr(ui(p)+_stripe), ptr(ui(secret)+8) // the parameter itself is advanced; the scramble below uses constants, not secret
+ }
+ // Scramble: xor each accumulator with itself shifted right by 47, xor in its key word
+ // (key64_128 for accs[0] up to key64_184 for accs[7]), then multiply by prime32_1.
+ accs[0] ^= accs[0] >> 47
+ accs[0] ^= key64_128
+ accs[0] *= prime32_1
+
+ accs[1] ^= accs[1] >> 47
+ accs[1] ^= key64_136
+ accs[1] *= prime32_1
+
+ accs[2] ^= accs[2] >> 47
+ accs[2] ^= key64_144
+ accs[2] *= prime32_1
+
+ accs[3] ^= accs[3] >> 47
+ accs[3] ^= key64_152
+ accs[3] *= prime32_1
+
+ accs[4] ^= accs[4] >> 47
+ accs[4] ^= key64_160
+ accs[4] *= prime32_1
+
+ accs[5] ^= accs[5] >> 47
+ accs[5] ^= key64_168
+ accs[5] *= prime32_1
+
+ accs[6] ^= accs[6] >> 47
+ accs[6] ^= key64_176
+ accs[6] *= prime32_1
+
+ accs[7] ^= accs[7] >> 47
+ accs[7] ^= key64_184
+ accs[7] *= prime32_1
+}
+
+// accumScalarSeed should be used with custom key. It folds the input of length l at p into accs, reading every key word from secret.
+func accumScalarSeed(accs *[8]u64, p, secret ptr, l u64) {
+ for l > _block { // while more than _block remains: 16 stripes followed by a scramble
+ k := secret
+ // Each pass reads eight 64-bit words from p and eight from k. Word i of the input is added
+ // to accs[i^1]; the low 32 bits times the high 32 bits of (input ^ key) is added to accs[i].
+ for i := 0; i < 16; i++ {
+ dv0 := readU64(p, 8*0)
+ dk0 := dv0 ^ readU64(k, 8*0)
+ accs[1] += dv0
+ accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
+
+ dv1 := readU64(p, 8*1)
+ dk1 := dv1 ^ readU64(k, 8*1)
+ accs[0] += dv1
+ accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
+
+ dv2 := readU64(p, 8*2)
+ dk2 := dv2 ^ readU64(k, 8*2)
+ accs[3] += dv2
+ accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
+
+ dv3 := readU64(p, 8*3)
+ dk3 := dv3 ^ readU64(k, 8*3)
+ accs[2] += dv3
+ accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
+
+ dv4 := readU64(p, 8*4)
+ dk4 := dv4 ^ readU64(k, 8*4)
+ accs[5] += dv4
+ accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
+
+ dv5 := readU64(p, 8*5)
+ dk5 := dv5 ^ readU64(k, 8*5)
+ accs[4] += dv5
+ accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
+
+ dv6 := readU64(p, 8*6)
+ dk6 := dv6 ^ readU64(k, 8*6)
+ accs[7] += dv6
+ accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
+
+ dv7 := readU64(p, 8*7)
+ dk7 := dv7 ^ readU64(k, 8*7)
+ accs[6] += dv7
+ accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
+
+ l -= _stripe
+ if l > 0 {
+ p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8) // next input stripe; the key window moves forward by 8
+ }
+ }
+ // Scramble: xor each accumulator with itself shifted right by 47, xor in the word read from
+ // secret at offset 128 (accs[0]) up to 184 (accs[7]), then multiply by prime32_1.
+ accs[0] ^= accs[0] >> 47
+ accs[0] ^= readU64(secret, 128)
+ accs[0] *= prime32_1
+
+ accs[1] ^= accs[1] >> 47
+ accs[1] ^= readU64(secret, 136)
+ accs[1] *= prime32_1
+
+ accs[2] ^= accs[2] >> 47
+ accs[2] ^= readU64(secret, 144)
+ accs[2] *= prime32_1
+
+ accs[3] ^= accs[3] >> 47
+ accs[3] ^= readU64(secret, 152)
+ accs[3] *= prime32_1
+
+ accs[4] ^= accs[4] >> 47
+ accs[4] ^= readU64(secret, 160)
+ accs[4] *= prime32_1
+
+ accs[5] ^= accs[5] >> 47
+ accs[5] ^= readU64(secret, 168)
+ accs[5] *= prime32_1
+
+ accs[6] ^= accs[6] >> 47
+ accs[6] ^= readU64(secret, 176)
+ accs[6] *= prime32_1
+
+ accs[7] ^= accs[7] >> 47
+ accs[7] ^= readU64(secret, 184)
+ accs[7] *= prime32_1
+ }
+ // Tail: the loop above exits once l <= _block; no scramble follows the stripes below.
+ if l > 0 {
+ t, k := (l-1)/_stripe, secret // t whole stripes come before the final, possibly partial, one
+
+ for i := u64(0); i < t; i++ {
+ dv0 := readU64(p, 8*0)
+ dk0 := dv0 ^ readU64(k, 8*0)
+ accs[1] += dv0
+ accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
+
+ dv1 := readU64(p, 8*1)
+ dk1 := dv1 ^ readU64(k, 8*1)
+ accs[0] += dv1
+ accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
+
+ dv2 := readU64(p, 8*2)
+ dk2 := dv2 ^ readU64(k, 8*2)
+ accs[3] += dv2
+ accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
+
+ dv3 := readU64(p, 8*3)
+ dk3 := dv3 ^ readU64(k, 8*3)
+ accs[2] += dv3
+ accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
+
+ dv4 := readU64(p, 8*4)
+ dk4 := dv4 ^ readU64(k, 8*4)
+ accs[5] += dv4
+ accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
+
+ dv5 := readU64(p, 8*5)
+ dk5 := dv5 ^ readU64(k, 8*5)
+ accs[4] += dv5
+ accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
+
+ dv6 := readU64(p, 8*6)
+ dk6 := dv6 ^ readU64(k, 8*6)
+ accs[7] += dv6
+ accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
+
+ dv7 := readU64(p, 8*7)
+ dk7 := dv7 ^ readU64(k, 8*7)
+ accs[6] += dv7
+ accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
+
+ l -= _stripe
+ if l > 0 {
+ p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
+ }
+ }
+ // After t stripes, l is between 1 and _stripe, so the final stripe below always runs.
+ if l > 0 {
+ p = ptr(ui(p) - uintptr(_stripe-l)) // step back by the shortfall so the last read ends where the input ends (assumes _stripe covers the 8 words read below; confirm)
+ // The final stripe reads its key words from secret at fixed offsets 121..177 rather than from the moving window k.
+ dv0 := readU64(p, 8*0)
+ dk0 := dv0 ^ readU64(secret, 121)
+ accs[1] += dv0
+ accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
+
+ dv1 := readU64(p, 8*1)
+ dk1 := dv1 ^ readU64(secret, 129)
+ accs[0] += dv1
+ accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
+
+ dv2 := readU64(p, 8*2)
+ dk2 := dv2 ^ readU64(secret, 137)
+ accs[3] += dv2
+ accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
+
+ dv3 := readU64(p, 8*3)
+ dk3 := dv3 ^ readU64(secret, 145)
+ accs[2] += dv3
+ accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
+
+ dv4 := readU64(p, 8*4)
+ dk4 := dv4 ^ readU64(secret, 153)
+ accs[5] += dv4
+ accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
+
+ dv5 := readU64(p, 8*5)
+ dk5 := dv5 ^ readU64(secret, 161)
+ accs[4] += dv5
+ accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
+
+ dv6 := readU64(p, 8*6)
+ dk6 := dv6 ^ readU64(secret, 169)
+ accs[7] += dv6
+ accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
+
+ dv7 := readU64(p, 8*7)
+ dk7 := dv7 ^ readU64(secret, 177)
+ accs[6] += dv7
+ accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
+ }
+ }
+}
+
+// accumBlockScalarSeed should be used with custom key. It runs one round of 16 stripes from p into accs, then scrambles accs with words read from secret.
+func accumBlockScalarSeed(accs *[8]u64, p, secret ptr) {
+ // Accumulate: per stripe, input word i is added to accs[i^1] and low32*high32 of (input ^ key) to accs[i].
+ {
+ secret := secret // shadow copy: the loop advances it, while the scramble below still reads from the original secret
+ for i := 0; i < 16; i++ {
+ dv0 := readU64(p, 8*0)
+ dk0 := dv0 ^ readU64(secret, 8*0)
+ accs[1] += dv0
+ accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
+
+ dv1 := readU64(p, 8*1)
+ dk1 := dv1 ^ readU64(secret, 8*1)
+ accs[0] += dv1
+ accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
+
+ dv2 := readU64(p, 8*2)
+ dk2 := dv2 ^ readU64(secret, 8*2)
+ accs[3] += dv2
+ accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
+
+ dv3 := readU64(p, 8*3)
+ dk3 := dv3 ^ readU64(secret, 8*3)
+ accs[2] += dv3
+ accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
+
+ dv4 := readU64(p, 8*4)
+ dk4 := dv4 ^ readU64(secret, 8*4)
+ accs[5] += dv4
+ accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
+
+ dv5 := readU64(p, 8*5)
+ dk5 := dv5 ^ readU64(secret, 8*5)
+ accs[4] += dv5
+ accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
+
+ dv6 := readU64(p, 8*6)
+ dk6 := dv6 ^ readU64(secret, 8*6)
+ accs[7] += dv6
+ accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
+
+ dv7 := readU64(p, 8*7)
+ dk7 := dv7 ^ readU64(secret, 8*7)
+ accs[6] += dv7
+ accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
+
+ p, secret = ptr(ui(p)+_stripe), ptr(ui(secret)+8) // next input stripe; the key window moves forward by 8
+ }
+ }
+
+ // Scramble: xor each accumulator with itself shifted right by 47, xor in the word read from secret at offset 128 (accs[0]) up to 184 (accs[7]), then multiply by prime32_1.
+ accs[0] ^= accs[0] >> 47
+ accs[0] ^= readU64(secret, 128)
+ accs[0] *= prime32_1
+
+ accs[1] ^= accs[1] >> 47
+ accs[1] ^= readU64(secret, 136)
+ accs[1] *= prime32_1
+
+ accs[2] ^= accs[2] >> 47
+ accs[2] ^= readU64(secret, 144)
+ accs[2] *= prime32_1
+
+ accs[3] ^= accs[3] >> 47
+ accs[3] ^= readU64(secret, 152)
+ accs[3] *= prime32_1
+
+ accs[4] ^= accs[4] >> 47
+ accs[4] ^= readU64(secret, 160)
+ accs[4] *= prime32_1
+
+ accs[5] ^= accs[5] >> 47
+ accs[5] ^= readU64(secret, 168)
+ accs[5] *= prime32_1
+
+ accs[6] ^= accs[6] >> 47
+ accs[6] ^= readU64(secret, 176)
+ accs[6] *= prime32_1
+
+ accs[7] ^= accs[7] >> 47
+ accs[7] ^= readU64(secret, 184)
+ accs[7] *= prime32_1
+}