diff options
Diffstat (limited to 'vendor/github.com/cespare/xxhash/v2/xxhash_amd64.s')
-rw-r--r-- | vendor/github.com/cespare/xxhash/v2/xxhash_amd64.s | 209 |
1 files changed, 209 insertions, 0 deletions
diff --git a/vendor/github.com/cespare/xxhash/v2/xxhash_amd64.s b/vendor/github.com/cespare/xxhash/v2/xxhash_amd64.s new file mode 100644 index 000000000..3e8b13257 --- /dev/null +++ b/vendor/github.com/cespare/xxhash/v2/xxhash_amd64.s @@ -0,0 +1,209 @@ +//go:build !appengine && gc && !purego +// +build !appengine +// +build gc +// +build !purego + +#include "textflag.h" + +// Registers: +#define h AX +#define d AX +#define p SI // pointer to advance through b +#define n DX +#define end BX // loop end +#define v1 R8 +#define v2 R9 +#define v3 R10 +#define v4 R11 +#define x R12 +#define prime1 R13 +#define prime2 R14 +#define prime4 DI + +#define round(acc, x) \ + IMULQ prime2, x \ + ADDQ x, acc \ + ROLQ $31, acc \ + IMULQ prime1, acc + +// round0 performs the operation x = round(0, x). +#define round0(x) \ + IMULQ prime2, x \ + ROLQ $31, x \ + IMULQ prime1, x + +// mergeRound applies a merge round on the two registers acc and x. +// It assumes that prime1, prime2, and prime4 have been loaded. +#define mergeRound(acc, x) \ + round0(x) \ + XORQ x, acc \ + IMULQ prime1, acc \ + ADDQ prime4, acc + +// blockLoop processes as many 32-byte blocks as possible, +// updating v1, v2, v3, and v4. It assumes that there is at least one block +// to process. +#define blockLoop() \ +loop: \ + MOVQ +0(p), x \ + round(v1, x) \ + MOVQ +8(p), x \ + round(v2, x) \ + MOVQ +16(p), x \ + round(v3, x) \ + MOVQ +24(p), x \ + round(v4, x) \ + ADDQ $32, p \ + CMPQ p, end \ + JLE loop + +// func Sum64(b []byte) uint64 +TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32 + // Load fixed primes. + MOVQ ·primes+0(SB), prime1 + MOVQ ·primes+8(SB), prime2 + MOVQ ·primes+24(SB), prime4 + + // Load slice. + MOVQ b_base+0(FP), p + MOVQ b_len+8(FP), n + LEAQ (p)(n*1), end + + // The first loop limit will be len(b)-32. + SUBQ $32, end + + // Check whether we have at least one block. + CMPQ n, $32 + JLT noBlocks + + // Set up initial state (v1, v2, v3, v4). + MOVQ prime1, v1 + ADDQ prime2, v1 + MOVQ prime2, v2 + XORQ v3, v3 + XORQ v4, v4 + SUBQ prime1, v4 + + blockLoop() + + MOVQ v1, h + ROLQ $1, h + MOVQ v2, x + ROLQ $7, x + ADDQ x, h + MOVQ v3, x + ROLQ $12, x + ADDQ x, h + MOVQ v4, x + ROLQ $18, x + ADDQ x, h + + mergeRound(h, v1) + mergeRound(h, v2) + mergeRound(h, v3) + mergeRound(h, v4) + + JMP afterBlocks + +noBlocks: + MOVQ ·primes+32(SB), h + +afterBlocks: + ADDQ n, h + + ADDQ $24, end + CMPQ p, end + JG try4 + +loop8: + MOVQ (p), x + ADDQ $8, p + round0(x) + XORQ x, h + ROLQ $27, h + IMULQ prime1, h + ADDQ prime4, h + + CMPQ p, end + JLE loop8 + +try4: + ADDQ $4, end + CMPQ p, end + JG try1 + + MOVL (p), x + ADDQ $4, p + IMULQ prime1, x + XORQ x, h + + ROLQ $23, h + IMULQ prime2, h + ADDQ ·primes+16(SB), h + +try1: + ADDQ $4, end + CMPQ p, end + JGE finalize + +loop1: + MOVBQZX (p), x + ADDQ $1, p + IMULQ ·primes+32(SB), x + XORQ x, h + ROLQ $11, h + IMULQ prime1, h + + CMPQ p, end + JL loop1 + +finalize: + MOVQ h, x + SHRQ $33, x + XORQ x, h + IMULQ prime2, h + MOVQ h, x + SHRQ $29, x + XORQ x, h + IMULQ ·primes+16(SB), h + MOVQ h, x + SHRQ $32, x + XORQ x, h + + MOVQ h, ret+24(FP) + RET + +// func writeBlocks(d *Digest, b []byte) int +TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40 + // Load fixed primes needed for round. + MOVQ ·primes+0(SB), prime1 + MOVQ ·primes+8(SB), prime2 + + // Load slice. + MOVQ b_base+8(FP), p + MOVQ b_len+16(FP), n + LEAQ (p)(n*1), end + SUBQ $32, end + + // Load vN from d. + MOVQ s+0(FP), d + MOVQ 0(d), v1 + MOVQ 8(d), v2 + MOVQ 16(d), v3 + MOVQ 24(d), v4 + + // We don't need to check the loop condition here; this function is + // always called with at least one block of data to process. + blockLoop() + + // Copy vN back to d. + MOVQ v1, 0(d) + MOVQ v2, 8(d) + MOVQ v3, 16(d) + MOVQ v4, 24(d) + + // The number of bytes written is p minus the old base pointer. + SUBQ b_base+8(FP), p + MOVQ p, ret+32(FP) + + RET |