diff options
Diffstat (limited to 'vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s')
-rw-r--r-- | vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s | 1236 |
1 files changed, 1236 insertions, 0 deletions
diff --git a/vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s b/vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s new file mode 100644 index 000000000..ba670e560 --- /dev/null +++ b/vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s @@ -0,0 +1,1236 @@ +// Code generated by command: go run gen.go -sse -out ../accum_vector_sse_amd64.s -pkg xxh3. DO NOT EDIT. + +#include "textflag.h" + +DATA prime_sse<>+0(SB)/4, $0x9e3779b1 +DATA prime_sse<>+4(SB)/4, $0x9e3779b1 +DATA prime_sse<>+8(SB)/4, $0x9e3779b1 +DATA prime_sse<>+12(SB)/4, $0x9e3779b1 +GLOBL prime_sse<>(SB), RODATA|NOPTR, $16 + +// func accumSSE(acc *[8]uint64, data *byte, key *byte, len uint64) +// Requires: SSE2 +TEXT ·accumSSE(SB), NOSPLIT, $0-32 + MOVQ acc+0(FP), AX + MOVQ data+8(FP), CX + MOVQ key+16(FP), DX + MOVQ key+16(FP), BX + MOVQ len+24(FP), SI + MOVOU (AX), X1 + MOVOU 16(AX), X2 + MOVOU 32(AX), X3 + MOVOU 48(AX), X4 + MOVOU prime_sse<>+0(SB), X0 + +accum_large: + CMPQ SI, $0x00000400 + JLE accum + MOVOU (CX), X5 + MOVOU (DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 16(CX), X5 + MOVOU 16(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 32(CX), X5 + MOVOU 32(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 48(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 64(CX), X5 + MOVOU 8(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 80(CX), X5 + MOVOU 24(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 96(CX), X5 + MOVOU 40(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 112(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 128(CX), X5 + MOVOU 16(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 144(CX), X5 + MOVOU 32(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 160(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 176(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 192(CX), X5 + MOVOU 24(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 208(CX), X5 + MOVOU 40(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 224(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 240(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 256(CX), X5 + MOVOU 32(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 272(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 288(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 304(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 320(CX), X5 + MOVOU 40(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 336(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 352(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 368(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 384(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 400(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 416(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 432(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 448(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 464(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 480(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 496(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 512(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 528(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 544(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 560(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 576(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 592(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 608(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 624(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 640(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 656(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 672(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 688(CX), X5 + MOVOU 128(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 704(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 720(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 736(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 752(CX), X5 + MOVOU 136(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 768(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 784(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 800(CX), X5 + MOVOU 128(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 816(CX), X5 + MOVOU 144(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 832(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 848(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 864(CX), X5 + MOVOU 136(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 880(CX), X5 + MOVOU 152(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 896(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 912(CX), X5 + MOVOU 128(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 928(CX), X5 + MOVOU 144(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 944(CX), X5 + MOVOU 160(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 960(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 976(CX), X5 + MOVOU 136(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 992(CX), X5 + MOVOU 152(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 1008(CX), X5 + MOVOU 168(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + ADDQ $0x00000400, CX + SUBQ $0x00000400, SI + MOVOU X1, X5 + PSRLQ $0x2f, X5 + PXOR X5, X1 + MOVOU 128(DX), X5 + PXOR X5, X1 + PSHUFD $0xf5, X1, X5 + PMULULQ X0, X1 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X1 + MOVOU X2, X5 + PSRLQ $0x2f, X5 + PXOR X5, X2 + MOVOU 144(DX), X5 + PXOR X5, X2 + PSHUFD $0xf5, X2, X5 + PMULULQ X0, X2 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X2 + MOVOU X3, X5 + PSRLQ $0x2f, X5 + PXOR X5, X3 + MOVOU 160(DX), X5 + PXOR X5, X3 + PSHUFD $0xf5, X3, X5 + PMULULQ X0, X3 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X3 + MOVOU X4, X5 + PSRLQ $0x2f, X5 + PXOR X5, X4 + MOVOU 176(DX), X5 + PXOR X5, X4 + PSHUFD $0xf5, X4, X5 + PMULULQ X0, X4 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X4 + JMP accum_large + +accum: + CMPQ SI, $0x40 + JLE finalize + MOVOU (CX), X0 + MOVOU (BX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X1 + PADDQ X6, X1 + MOVOU 16(CX), X0 + MOVOU 16(BX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X2 + PADDQ X6, X2 + MOVOU 32(CX), X0 + MOVOU 32(BX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X3 + PADDQ X6, X3 + MOVOU 48(CX), X0 + MOVOU 48(BX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X4 + PADDQ X6, X4 + ADDQ $0x00000040, CX + SUBQ $0x00000040, SI + ADDQ $0x00000008, BX + JMP accum + +finalize: + CMPQ SI, $0x00 + JE return + SUBQ $0x40, CX + ADDQ SI, CX + MOVOU (CX), X0 + MOVOU 121(DX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X1 + PADDQ X6, X1 + MOVOU 16(CX), X0 + MOVOU 137(DX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X2 + PADDQ X6, X2 + MOVOU 32(CX), X0 + MOVOU 153(DX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X3 + PADDQ X6, X3 + MOVOU 48(CX), X0 + MOVOU 169(DX), X5 + PXOR X0, X5 + PSHUFD $0x31, X5, X6 + PMULULQ X5, X6 + PSHUFD $0x4e, X0, X0 + PADDQ X0, X4 + PADDQ X6, X4 + +return: + MOVOU X1, (AX) + MOVOU X2, 16(AX) + MOVOU X3, 32(AX) + MOVOU X4, 48(AX) + RET + +// func accumBlockSSE(acc *[8]uint64, data *byte, key *byte) +// Requires: SSE2 +TEXT ·accumBlockSSE(SB), NOSPLIT, $0-24 + MOVQ acc+0(FP), AX + MOVQ data+8(FP), CX + MOVQ key+16(FP), DX + MOVOU (AX), X1 + MOVOU 16(AX), X2 + MOVOU 32(AX), X3 + MOVOU 48(AX), X4 + MOVOU prime_sse<>+0(SB), X0 + MOVOU (CX), X5 + MOVOU (DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 16(CX), X5 + MOVOU 16(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 32(CX), X5 + MOVOU 32(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 48(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 64(CX), X5 + MOVOU 8(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 80(CX), X5 + MOVOU 24(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 96(CX), X5 + MOVOU 40(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 112(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 128(CX), X5 + MOVOU 16(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 144(CX), X5 + MOVOU 32(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 160(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 176(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 192(CX), X5 + MOVOU 24(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 208(CX), X5 + MOVOU 40(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 224(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 240(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 256(CX), X5 + MOVOU 32(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 272(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 288(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 304(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 320(CX), X5 + MOVOU 40(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 336(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 352(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 368(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 384(CX), X5 + MOVOU 48(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 400(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 416(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 432(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 448(CX), X5 + MOVOU 56(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 464(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 480(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 496(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 512(CX), X5 + MOVOU 64(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 528(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 544(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 560(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 576(CX), X5 + MOVOU 72(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 592(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 608(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 624(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 640(CX), X5 + MOVOU 80(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 656(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 672(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 688(CX), X5 + MOVOU 128(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 704(CX), X5 + MOVOU 88(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 720(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 736(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 752(CX), X5 + MOVOU 136(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 768(CX), X5 + MOVOU 96(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 784(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 800(CX), X5 + MOVOU 128(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 816(CX), X5 + MOVOU 144(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 832(CX), X5 + MOVOU 104(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 848(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 864(CX), X5 + MOVOU 136(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 880(CX), X5 + MOVOU 152(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 896(CX), X5 + MOVOU 112(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 912(CX), X5 + MOVOU 128(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 928(CX), X5 + MOVOU 144(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 944(CX), X5 + MOVOU 160(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU 960(CX), X5 + MOVOU 120(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X1 + PADDQ X7, X1 + MOVOU 976(CX), X5 + MOVOU 136(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X2 + PADDQ X7, X2 + MOVOU 992(CX), X5 + MOVOU 152(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X3 + PADDQ X7, X3 + MOVOU 1008(CX), X5 + MOVOU 168(DX), X6 + PXOR X5, X6 + PSHUFD $0x31, X6, X7 + PMULULQ X6, X7 + PSHUFD $0x4e, X5, X5 + PADDQ X5, X4 + PADDQ X7, X4 + MOVOU X1, X5 + PSRLQ $0x2f, X5 + PXOR X5, X1 + MOVOU 128(DX), X5 + PXOR X5, X1 + PSHUFD $0xf5, X1, X5 + PMULULQ X0, X1 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X1 + MOVOU X2, X5 + PSRLQ $0x2f, X5 + PXOR X5, X2 + MOVOU 144(DX), X5 + PXOR X5, X2 + PSHUFD $0xf5, X2, X5 + PMULULQ X0, X2 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X2 + MOVOU X3, X5 + PSRLQ $0x2f, X5 + PXOR X5, X3 + MOVOU 160(DX), X5 + PXOR X5, X3 + PSHUFD $0xf5, X3, X5 + PMULULQ X0, X3 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X3 + MOVOU X4, X5 + PSRLQ $0x2f, X5 + PXOR X5, X4 + MOVOU 176(DX), X5 + PXOR X5, X4 + PSHUFD $0xf5, X4, X5 + PMULULQ X0, X4 + PMULULQ X0, X5 + PSLLQ $0x20, X5 + PADDQ X5, X4 + MOVOU X1, (AX) + MOVOU X2, 16(AX) + MOVOU X3, 32(AX) + MOVOU X4, 48(AX) + RET |