diff options
Diffstat (limited to 'vendor/github.com/zeebo/xxh3/accum_generic.go')
-rw-r--r-- | vendor/github.com/zeebo/xxh3/accum_generic.go | 542 |
1 files changed, 542 insertions, 0 deletions
diff --git a/vendor/github.com/zeebo/xxh3/accum_generic.go b/vendor/github.com/zeebo/xxh3/accum_generic.go new file mode 100644 index 000000000..b1be78507 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/accum_generic.go @@ -0,0 +1,542 @@ +package xxh3 + +// avx512Switch is the size at which the avx512 code is used. +// Bigger blocks benefit more. +const avx512Switch = 1 << 10 + +func accumScalar(accs *[8]u64, p, secret ptr, l u64) { + if secret != key { + accumScalarSeed(accs, p, secret, l) + return + } + for l > _block { + k := secret + + // accs + for i := 0; i < 16; i++ { + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(k, 8*0) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(k, 8*1) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(k, 8*2) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(k, 8*3) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(k, 8*4) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(k, 8*5) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(k, 8*6) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(k, 8*7) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + + l -= _stripe + if l > 0 { + p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8) + } + } + + // scramble accs + accs[0] ^= accs[0] >> 47 + accs[0] ^= key64_128 + accs[0] *= prime32_1 + + accs[1] ^= accs[1] >> 47 + accs[1] ^= key64_136 + accs[1] *= prime32_1 + + accs[2] ^= accs[2] >> 47 + accs[2] ^= key64_144 + accs[2] *= prime32_1 + + accs[3] ^= accs[3] >> 47 + accs[3] ^= key64_152 + accs[3] *= prime32_1 + + accs[4] ^= accs[4] >> 47 + accs[4] ^= key64_160 + accs[4] *= prime32_1 + + accs[5] ^= accs[5] >> 47 + accs[5] ^= key64_168 + accs[5] *= prime32_1 + + accs[6] ^= accs[6] >> 47 + accs[6] ^= key64_176 + accs[6] *= prime32_1 + + accs[7] ^= accs[7] >> 47 + accs[7] ^= key64_184 + accs[7] *= prime32_1 + } + + if l > 0 { + t, k := (l-1)/_stripe, secret + + for i := u64(0); i < t; i++ { + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(k, 8*0) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(k, 8*1) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(k, 8*2) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(k, 8*3) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(k, 8*4) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(k, 8*5) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(k, 8*6) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(k, 8*7) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + + l -= _stripe + if l > 0 { + p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8) + } + } + + if l > 0 { + p = ptr(ui(p) - uintptr(_stripe-l)) + + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ key64_121 + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ key64_129 + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ key64_137 + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ key64_145 + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ key64_153 + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ key64_161 + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ key64_169 + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ key64_177 + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + } + } +} + +func accumBlockScalar(accs *[8]u64, p, secret ptr) { + if secret != key { + accumBlockScalarSeed(accs, p, secret) + return + } + // accs + for i := 0; i < 16; i++ { + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(secret, 8*0) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(secret, 8*1) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(secret, 8*2) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(secret, 8*3) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(secret, 8*4) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(secret, 8*5) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(secret, 8*6) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(secret, 8*7) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + + p, secret = ptr(ui(p)+_stripe), ptr(ui(secret)+8) + } + + // scramble accs + accs[0] ^= accs[0] >> 47 + accs[0] ^= key64_128 + accs[0] *= prime32_1 + + accs[1] ^= accs[1] >> 47 + accs[1] ^= key64_136 + accs[1] *= prime32_1 + + accs[2] ^= accs[2] >> 47 + accs[2] ^= key64_144 + accs[2] *= prime32_1 + + accs[3] ^= accs[3] >> 47 + accs[3] ^= key64_152 + accs[3] *= prime32_1 + + accs[4] ^= accs[4] >> 47 + accs[4] ^= key64_160 + accs[4] *= prime32_1 + + accs[5] ^= accs[5] >> 47 + accs[5] ^= key64_168 + accs[5] *= prime32_1 + + accs[6] ^= accs[6] >> 47 + accs[6] ^= key64_176 + accs[6] *= prime32_1 + + accs[7] ^= accs[7] >> 47 + accs[7] ^= key64_184 + accs[7] *= prime32_1 +} + +// accumScalarSeed should be used with custom key. +func accumScalarSeed(accs *[8]u64, p, secret ptr, l u64) { + for l > _block { + k := secret + + // accs + for i := 0; i < 16; i++ { + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(k, 8*0) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(k, 8*1) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(k, 8*2) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(k, 8*3) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(k, 8*4) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(k, 8*5) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(k, 8*6) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(k, 8*7) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + + l -= _stripe + if l > 0 { + p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8) + } + } + + // scramble accs + accs[0] ^= accs[0] >> 47 + accs[0] ^= readU64(secret, 128) + accs[0] *= prime32_1 + + accs[1] ^= accs[1] >> 47 + accs[1] ^= readU64(secret, 136) + accs[1] *= prime32_1 + + accs[2] ^= accs[2] >> 47 + accs[2] ^= readU64(secret, 144) + accs[2] *= prime32_1 + + accs[3] ^= accs[3] >> 47 + accs[3] ^= readU64(secret, 152) + accs[3] *= prime32_1 + + accs[4] ^= accs[4] >> 47 + accs[4] ^= readU64(secret, 160) + accs[4] *= prime32_1 + + accs[5] ^= accs[5] >> 47 + accs[5] ^= readU64(secret, 168) + accs[5] *= prime32_1 + + accs[6] ^= accs[6] >> 47 + accs[6] ^= readU64(secret, 176) + accs[6] *= prime32_1 + + accs[7] ^= accs[7] >> 47 + accs[7] ^= readU64(secret, 184) + accs[7] *= prime32_1 + } + + if l > 0 { + t, k := (l-1)/_stripe, secret + + for i := u64(0); i < t; i++ { + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(k, 8*0) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(k, 8*1) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(k, 8*2) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(k, 8*3) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(k, 8*4) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(k, 8*5) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(k, 8*6) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(k, 8*7) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + + l -= _stripe + if l > 0 { + p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8) + } + } + + if l > 0 { + p = ptr(ui(p) - uintptr(_stripe-l)) + + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(secret, 121) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(secret, 129) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(secret, 137) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(secret, 145) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(secret, 153) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(secret, 161) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(secret, 169) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(secret, 177) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + } + } +} + +// accumBlockScalarSeed should be used with custom key. +func accumBlockScalarSeed(accs *[8]u64, p, secret ptr) { + // accs + { + secret := secret + for i := 0; i < 16; i++ { + dv0 := readU64(p, 8*0) + dk0 := dv0 ^ readU64(secret, 8*0) + accs[1] += dv0 + accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32) + + dv1 := readU64(p, 8*1) + dk1 := dv1 ^ readU64(secret, 8*1) + accs[0] += dv1 + accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32) + + dv2 := readU64(p, 8*2) + dk2 := dv2 ^ readU64(secret, 8*2) + accs[3] += dv2 + accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32) + + dv3 := readU64(p, 8*3) + dk3 := dv3 ^ readU64(secret, 8*3) + accs[2] += dv3 + accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32) + + dv4 := readU64(p, 8*4) + dk4 := dv4 ^ readU64(secret, 8*4) + accs[5] += dv4 + accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32) + + dv5 := readU64(p, 8*5) + dk5 := dv5 ^ readU64(secret, 8*5) + accs[4] += dv5 + accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32) + + dv6 := readU64(p, 8*6) + dk6 := dv6 ^ readU64(secret, 8*6) + accs[7] += dv6 + accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32) + + dv7 := readU64(p, 8*7) + dk7 := dv7 ^ readU64(secret, 8*7) + accs[6] += dv7 + accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32) + + p, secret = ptr(ui(p)+_stripe), ptr(ui(secret)+8) + } + } + + // scramble accs + accs[0] ^= accs[0] >> 47 + accs[0] ^= readU64(secret, 128) + accs[0] *= prime32_1 + + accs[1] ^= accs[1] >> 47 + accs[1] ^= readU64(secret, 136) + accs[1] *= prime32_1 + + accs[2] ^= accs[2] >> 47 + accs[2] ^= readU64(secret, 144) + accs[2] *= prime32_1 + + accs[3] ^= accs[3] >> 47 + accs[3] ^= readU64(secret, 152) + accs[3] *= prime32_1 + + accs[4] ^= accs[4] >> 47 + accs[4] ^= readU64(secret, 160) + accs[4] *= prime32_1 + + accs[5] ^= accs[5] >> 47 + accs[5] ^= readU64(secret, 168) + accs[5] *= prime32_1 + + accs[6] ^= accs[6] >> 47 + accs[6] ^= readU64(secret, 176) + accs[6] *= prime32_1 + + accs[7] ^= accs[7] >> 47 + accs[7] ^= readU64(secret, 184) + accs[7] *= prime32_1 +} |