summaryrefslogtreecommitdiff
path: root/vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s')
-rw-r--r--vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s1236
1 files changed, 1236 insertions, 0 deletions
diff --git a/vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s b/vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s
new file mode 100644
index 000000000..ba670e560
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/accum_vector_sse_amd64.s
@@ -0,0 +1,1236 @@
+// Code generated by command: go run gen.go -sse -out ../accum_vector_sse_amd64.s -pkg xxh3. DO NOT EDIT.
+
+#include "textflag.h"
+
+DATA prime_sse<>+0(SB)/4, $0x9e3779b1
+DATA prime_sse<>+4(SB)/4, $0x9e3779b1
+DATA prime_sse<>+8(SB)/4, $0x9e3779b1
+DATA prime_sse<>+12(SB)/4, $0x9e3779b1
+GLOBL prime_sse<>(SB), RODATA|NOPTR, $16
+
+// func accumSSE(acc *[8]uint64, data *byte, key *byte, len uint64)
+// Requires: SSE2
+TEXT ·accumSSE(SB), NOSPLIT, $0-32
+ MOVQ acc+0(FP), AX
+ MOVQ data+8(FP), CX
+ MOVQ key+16(FP), DX
+ MOVQ key+16(FP), BX
+ MOVQ len+24(FP), SI
+ MOVOU (AX), X1
+ MOVOU 16(AX), X2
+ MOVOU 32(AX), X3
+ MOVOU 48(AX), X4
+ MOVOU prime_sse<>+0(SB), X0
+
+accum_large:
+ CMPQ SI, $0x00000400
+ JLE accum
+ MOVOU (CX), X5
+ MOVOU (DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 16(CX), X5
+ MOVOU 16(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 32(CX), X5
+ MOVOU 32(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 48(CX), X5
+ MOVOU 48(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 64(CX), X5
+ MOVOU 8(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 80(CX), X5
+ MOVOU 24(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 96(CX), X5
+ MOVOU 40(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 112(CX), X5
+ MOVOU 56(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 128(CX), X5
+ MOVOU 16(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 144(CX), X5
+ MOVOU 32(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 160(CX), X5
+ MOVOU 48(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 176(CX), X5
+ MOVOU 64(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 192(CX), X5
+ MOVOU 24(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 208(CX), X5
+ MOVOU 40(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 224(CX), X5
+ MOVOU 56(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 240(CX), X5
+ MOVOU 72(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 256(CX), X5
+ MOVOU 32(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 272(CX), X5
+ MOVOU 48(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 288(CX), X5
+ MOVOU 64(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 304(CX), X5
+ MOVOU 80(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 320(CX), X5
+ MOVOU 40(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 336(CX), X5
+ MOVOU 56(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 352(CX), X5
+ MOVOU 72(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 368(CX), X5
+ MOVOU 88(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 384(CX), X5
+ MOVOU 48(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 400(CX), X5
+ MOVOU 64(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 416(CX), X5
+ MOVOU 80(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 432(CX), X5
+ MOVOU 96(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 448(CX), X5
+ MOVOU 56(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 464(CX), X5
+ MOVOU 72(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 480(CX), X5
+ MOVOU 88(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 496(CX), X5
+ MOVOU 104(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 512(CX), X5
+ MOVOU 64(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 528(CX), X5
+ MOVOU 80(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 544(CX), X5
+ MOVOU 96(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 560(CX), X5
+ MOVOU 112(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 576(CX), X5
+ MOVOU 72(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 592(CX), X5
+ MOVOU 88(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 608(CX), X5
+ MOVOU 104(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 624(CX), X5
+ MOVOU 120(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 640(CX), X5
+ MOVOU 80(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 656(CX), X5
+ MOVOU 96(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 672(CX), X5
+ MOVOU 112(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 688(CX), X5
+ MOVOU 128(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 704(CX), X5
+ MOVOU 88(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 720(CX), X5
+ MOVOU 104(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 736(CX), X5
+ MOVOU 120(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 752(CX), X5
+ MOVOU 136(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 768(CX), X5
+ MOVOU 96(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 784(CX), X5
+ MOVOU 112(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 800(CX), X5
+ MOVOU 128(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 816(CX), X5
+ MOVOU 144(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 832(CX), X5
+ MOVOU 104(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 848(CX), X5
+ MOVOU 120(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 864(CX), X5
+ MOVOU 136(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 880(CX), X5
+ MOVOU 152(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 896(CX), X5
+ MOVOU 112(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 912(CX), X5
+ MOVOU 128(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 928(CX), X5
+ MOVOU 144(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 944(CX), X5
+ MOVOU 160(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 960(CX), X5
+ MOVOU 120(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 976(CX), X5
+ MOVOU 136(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 992(CX), X5
+ MOVOU 152(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 1008(CX), X5
+ MOVOU 168(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ ADDQ $0x00000400, CX
+ SUBQ $0x00000400, SI
+ MOVOU X1, X5
+ PSRLQ $0x2f, X5
+ PXOR X5, X1
+ MOVOU 128(DX), X5
+ PXOR X5, X1
+ PSHUFD $0xf5, X1, X5
+ PMULULQ X0, X1
+ PMULULQ X0, X5
+ PSLLQ $0x20, X5
+ PADDQ X5, X1
+ MOVOU X2, X5
+ PSRLQ $0x2f, X5
+ PXOR X5, X2
+ MOVOU 144(DX), X5
+ PXOR X5, X2
+ PSHUFD $0xf5, X2, X5
+ PMULULQ X0, X2
+ PMULULQ X0, X5
+ PSLLQ $0x20, X5
+ PADDQ X5, X2
+ MOVOU X3, X5
+ PSRLQ $0x2f, X5
+ PXOR X5, X3
+ MOVOU 160(DX), X5
+ PXOR X5, X3
+ PSHUFD $0xf5, X3, X5
+ PMULULQ X0, X3
+ PMULULQ X0, X5
+ PSLLQ $0x20, X5
+ PADDQ X5, X3
+ MOVOU X4, X5
+ PSRLQ $0x2f, X5
+ PXOR X5, X4
+ MOVOU 176(DX), X5
+ PXOR X5, X4
+ PSHUFD $0xf5, X4, X5
+ PMULULQ X0, X4
+ PMULULQ X0, X5
+ PSLLQ $0x20, X5
+ PADDQ X5, X4
+ JMP accum_large
+
+accum:
+ CMPQ SI, $0x40
+ JLE finalize
+ MOVOU (CX), X0
+ MOVOU (BX), X5
+ PXOR X0, X5
+ PSHUFD $0x31, X5, X6
+ PMULULQ X5, X6
+ PSHUFD $0x4e, X0, X0
+ PADDQ X0, X1
+ PADDQ X6, X1
+ MOVOU 16(CX), X0
+ MOVOU 16(BX), X5
+ PXOR X0, X5
+ PSHUFD $0x31, X5, X6
+ PMULULQ X5, X6
+ PSHUFD $0x4e, X0, X0
+ PADDQ X0, X2
+ PADDQ X6, X2
+ MOVOU 32(CX), X0
+ MOVOU 32(BX), X5
+ PXOR X0, X5
+ PSHUFD $0x31, X5, X6
+ PMULULQ X5, X6
+ PSHUFD $0x4e, X0, X0
+ PADDQ X0, X3
+ PADDQ X6, X3
+ MOVOU 48(CX), X0
+ MOVOU 48(BX), X5
+ PXOR X0, X5
+ PSHUFD $0x31, X5, X6
+ PMULULQ X5, X6
+ PSHUFD $0x4e, X0, X0
+ PADDQ X0, X4
+ PADDQ X6, X4
+ ADDQ $0x00000040, CX
+ SUBQ $0x00000040, SI
+ ADDQ $0x00000008, BX
+ JMP accum
+
+finalize:
+ CMPQ SI, $0x00
+ JE return
+ SUBQ $0x40, CX
+ ADDQ SI, CX
+ MOVOU (CX), X0
+ MOVOU 121(DX), X5
+ PXOR X0, X5
+ PSHUFD $0x31, X5, X6
+ PMULULQ X5, X6
+ PSHUFD $0x4e, X0, X0
+ PADDQ X0, X1
+ PADDQ X6, X1
+ MOVOU 16(CX), X0
+ MOVOU 137(DX), X5
+ PXOR X0, X5
+ PSHUFD $0x31, X5, X6
+ PMULULQ X5, X6
+ PSHUFD $0x4e, X0, X0
+ PADDQ X0, X2
+ PADDQ X6, X2
+ MOVOU 32(CX), X0
+ MOVOU 153(DX), X5
+ PXOR X0, X5
+ PSHUFD $0x31, X5, X6
+ PMULULQ X5, X6
+ PSHUFD $0x4e, X0, X0
+ PADDQ X0, X3
+ PADDQ X6, X3
+ MOVOU 48(CX), X0
+ MOVOU 169(DX), X5
+ PXOR X0, X5
+ PSHUFD $0x31, X5, X6
+ PMULULQ X5, X6
+ PSHUFD $0x4e, X0, X0
+ PADDQ X0, X4
+ PADDQ X6, X4
+
+return:
+ MOVOU X1, (AX)
+ MOVOU X2, 16(AX)
+ MOVOU X3, 32(AX)
+ MOVOU X4, 48(AX)
+ RET
+
+// func accumBlockSSE(acc *[8]uint64, data *byte, key *byte)
+// Requires: SSE2
+TEXT ·accumBlockSSE(SB), NOSPLIT, $0-24
+ MOVQ acc+0(FP), AX
+ MOVQ data+8(FP), CX
+ MOVQ key+16(FP), DX
+ MOVOU (AX), X1
+ MOVOU 16(AX), X2
+ MOVOU 32(AX), X3
+ MOVOU 48(AX), X4
+ MOVOU prime_sse<>+0(SB), X0
+ MOVOU (CX), X5
+ MOVOU (DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 16(CX), X5
+ MOVOU 16(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 32(CX), X5
+ MOVOU 32(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 48(CX), X5
+ MOVOU 48(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 64(CX), X5
+ MOVOU 8(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 80(CX), X5
+ MOVOU 24(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 96(CX), X5
+ MOVOU 40(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 112(CX), X5
+ MOVOU 56(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 128(CX), X5
+ MOVOU 16(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 144(CX), X5
+ MOVOU 32(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 160(CX), X5
+ MOVOU 48(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 176(CX), X5
+ MOVOU 64(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 192(CX), X5
+ MOVOU 24(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 208(CX), X5
+ MOVOU 40(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 224(CX), X5
+ MOVOU 56(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 240(CX), X5
+ MOVOU 72(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 256(CX), X5
+ MOVOU 32(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 272(CX), X5
+ MOVOU 48(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 288(CX), X5
+ MOVOU 64(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 304(CX), X5
+ MOVOU 80(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 320(CX), X5
+ MOVOU 40(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 336(CX), X5
+ MOVOU 56(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 352(CX), X5
+ MOVOU 72(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 368(CX), X5
+ MOVOU 88(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 384(CX), X5
+ MOVOU 48(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 400(CX), X5
+ MOVOU 64(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 416(CX), X5
+ MOVOU 80(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 432(CX), X5
+ MOVOU 96(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 448(CX), X5
+ MOVOU 56(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 464(CX), X5
+ MOVOU 72(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 480(CX), X5
+ MOVOU 88(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 496(CX), X5
+ MOVOU 104(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 512(CX), X5
+ MOVOU 64(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 528(CX), X5
+ MOVOU 80(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 544(CX), X5
+ MOVOU 96(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 560(CX), X5
+ MOVOU 112(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 576(CX), X5
+ MOVOU 72(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 592(CX), X5
+ MOVOU 88(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 608(CX), X5
+ MOVOU 104(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 624(CX), X5
+ MOVOU 120(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 640(CX), X5
+ MOVOU 80(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 656(CX), X5
+ MOVOU 96(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 672(CX), X5
+ MOVOU 112(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 688(CX), X5
+ MOVOU 128(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 704(CX), X5
+ MOVOU 88(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 720(CX), X5
+ MOVOU 104(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 736(CX), X5
+ MOVOU 120(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 752(CX), X5
+ MOVOU 136(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 768(CX), X5
+ MOVOU 96(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 784(CX), X5
+ MOVOU 112(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 800(CX), X5
+ MOVOU 128(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 816(CX), X5
+ MOVOU 144(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 832(CX), X5
+ MOVOU 104(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 848(CX), X5
+ MOVOU 120(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 864(CX), X5
+ MOVOU 136(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 880(CX), X5
+ MOVOU 152(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 896(CX), X5
+ MOVOU 112(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 912(CX), X5
+ MOVOU 128(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 928(CX), X5
+ MOVOU 144(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 944(CX), X5
+ MOVOU 160(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU 960(CX), X5
+ MOVOU 120(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X1
+ PADDQ X7, X1
+ MOVOU 976(CX), X5
+ MOVOU 136(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X2
+ PADDQ X7, X2
+ MOVOU 992(CX), X5
+ MOVOU 152(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X3
+ PADDQ X7, X3
+ MOVOU 1008(CX), X5
+ MOVOU 168(DX), X6
+ PXOR X5, X6
+ PSHUFD $0x31, X6, X7
+ PMULULQ X6, X7
+ PSHUFD $0x4e, X5, X5
+ PADDQ X5, X4
+ PADDQ X7, X4
+ MOVOU X1, X5
+ PSRLQ $0x2f, X5
+ PXOR X5, X1
+ MOVOU 128(DX), X5
+ PXOR X5, X1
+ PSHUFD $0xf5, X1, X5
+ PMULULQ X0, X1
+ PMULULQ X0, X5
+ PSLLQ $0x20, X5
+ PADDQ X5, X1
+ MOVOU X2, X5
+ PSRLQ $0x2f, X5
+ PXOR X5, X2
+ MOVOU 144(DX), X5
+ PXOR X5, X2
+ PSHUFD $0xf5, X2, X5
+ PMULULQ X0, X2
+ PMULULQ X0, X5
+ PSLLQ $0x20, X5
+ PADDQ X5, X2
+ MOVOU X3, X5
+ PSRLQ $0x2f, X5
+ PXOR X5, X3
+ MOVOU 160(DX), X5
+ PXOR X5, X3
+ PSHUFD $0xf5, X3, X5
+ PMULULQ X0, X3
+ PMULULQ X0, X5
+ PSLLQ $0x20, X5
+ PADDQ X5, X3
+ MOVOU X4, X5
+ PSRLQ $0x2f, X5
+ PXOR X5, X4
+ MOVOU 176(DX), X5
+ PXOR X5, X4
+ PSHUFD $0xf5, X4, X5
+ PMULULQ X0, X4
+ PMULULQ X0, X5
+ PSLLQ $0x20, X5
+ PADDQ X5, X4
+ MOVOU X1, (AX)
+ MOVOU X2, 16(AX)
+ MOVOU X3, 32(AX)
+ MOVOU X4, 48(AX)
+ RET