Diffstat (limited to 'vendor/github.com/zeebo/xxh3/accum_vector_avx_amd64.s')
-rw-r--r--  vendor/github.com/zeebo/xxh3/accum_vector_avx_amd64.s  586
1 file changed, 586 insertions, 0 deletions
diff --git a/vendor/github.com/zeebo/xxh3/accum_vector_avx_amd64.s b/vendor/github.com/zeebo/xxh3/accum_vector_avx_amd64.s
new file mode 100644
index 000000000..b53c1521f
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/accum_vector_avx_amd64.s
@@ -0,0 +1,586 @@
+// Code generated by command: go run gen.go -avx -out ../accum_vector_avx_amd64.s -pkg xxh3. DO NOT EDIT.
+
+#include "textflag.h"
+
+DATA prime_avx<>+0(SB)/8, $0x000000009e3779b1
+DATA prime_avx<>+8(SB)/8, $0x000000009e3779b1
+DATA prime_avx<>+16(SB)/8, $0x000000009e3779b1
+DATA prime_avx<>+24(SB)/8, $0x000000009e3779b1
+GLOBL prime_avx<>(SB), RODATA|NOPTR, $32
+
+// func accumAVX2(acc *[8]uint64, data *byte, key *byte, len uint64)
+// Requires: AVX, AVX2, MMX+
+TEXT ·accumAVX2(SB), NOSPLIT, $0-32
+ MOVQ acc+0(FP), AX
+ MOVQ data+8(FP), CX
+ MOVQ key+16(FP), DX
+ MOVQ key+16(FP), BX
+ MOVQ len+24(FP), SI
+ VMOVDQU (AX), Y1
+ VMOVDQU 32(AX), Y2
+ VMOVDQU prime_avx<>+0(SB), Y0
+
+accum_large:
+ CMPQ SI, $0x00000400
+ JLE accum
+ VMOVDQU (CX), Y3
+ VMOVDQU 32(CX), Y6
+ PREFETCHT0 512(CX)
+ VPXOR (DX), Y3, Y4
+ VPXOR 32(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 64(CX), Y3
+ VMOVDQU 96(CX), Y6
+ PREFETCHT0 576(CX)
+ VPXOR 8(DX), Y3, Y4
+ VPXOR 40(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 128(CX), Y3
+ VMOVDQU 160(CX), Y6
+ PREFETCHT0 640(CX)
+ VPXOR 16(DX), Y3, Y4
+ VPXOR 48(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 192(CX), Y3
+ VMOVDQU 224(CX), Y6
+ PREFETCHT0 704(CX)
+ VPXOR 24(DX), Y3, Y4
+ VPXOR 56(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 256(CX), Y3
+ VMOVDQU 288(CX), Y6
+ PREFETCHT0 768(CX)
+ VPXOR 32(DX), Y3, Y4
+ VPXOR 64(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 320(CX), Y3
+ VMOVDQU 352(CX), Y6
+ PREFETCHT0 832(CX)
+ VPXOR 40(DX), Y3, Y4
+ VPXOR 72(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 384(CX), Y3
+ VMOVDQU 416(CX), Y6
+ PREFETCHT0 896(CX)
+ VPXOR 48(DX), Y3, Y4
+ VPXOR 80(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 448(CX), Y3
+ VMOVDQU 480(CX), Y6
+ PREFETCHT0 960(CX)
+ VPXOR 56(DX), Y3, Y4
+ VPXOR 88(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 512(CX), Y3
+ VMOVDQU 544(CX), Y6
+ PREFETCHT0 1024(CX)
+ VPXOR 64(DX), Y3, Y4
+ VPXOR 96(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 576(CX), Y3
+ VMOVDQU 608(CX), Y6
+ PREFETCHT0 1088(CX)
+ VPXOR 72(DX), Y3, Y4
+ VPXOR 104(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 640(CX), Y3
+ VMOVDQU 672(CX), Y6
+ PREFETCHT0 1152(CX)
+ VPXOR 80(DX), Y3, Y4
+ VPXOR 112(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 704(CX), Y3
+ VMOVDQU 736(CX), Y6
+ PREFETCHT0 1216(CX)
+ VPXOR 88(DX), Y3, Y4
+ VPXOR 120(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 768(CX), Y3
+ VMOVDQU 800(CX), Y6
+ PREFETCHT0 1280(CX)
+ VPXOR 96(DX), Y3, Y4
+ VPXOR 128(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 832(CX), Y3
+ VMOVDQU 864(CX), Y6
+ PREFETCHT0 1344(CX)
+ VPXOR 104(DX), Y3, Y4
+ VPXOR 136(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 896(CX), Y3
+ VMOVDQU 928(CX), Y6
+ PREFETCHT0 1408(CX)
+ VPXOR 112(DX), Y3, Y4
+ VPXOR 144(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 960(CX), Y3
+ VMOVDQU 992(CX), Y6
+ PREFETCHT0 1472(CX)
+ VPXOR 120(DX), Y3, Y4
+ VPXOR 152(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ ADDQ $0x00000400, CX
+ SUBQ $0x00000400, SI
+ VPSRLQ $0x2f, Y1, Y3
+ VPXOR Y1, Y3, Y3
+ VPXOR 128(DX), Y3, Y3
+ VPMULUDQ Y0, Y3, Y1
+ VPSHUFD $0xf5, Y3, Y3
+ VPMULUDQ Y0, Y3, Y3
+ VPSLLQ $0x20, Y3, Y3
+ VPADDQ Y1, Y3, Y1
+ VPSRLQ $0x2f, Y2, Y3
+ VPXOR Y2, Y3, Y3
+ VPXOR 160(DX), Y3, Y3
+ VPMULUDQ Y0, Y3, Y2
+ VPSHUFD $0xf5, Y3, Y3
+ VPMULUDQ Y0, Y3, Y3
+ VPSLLQ $0x20, Y3, Y3
+ VPADDQ Y2, Y3, Y2
+ JMP accum_large
+
+accum:
+ CMPQ SI, $0x40
+ JLE finalize
+ VMOVDQU (CX), Y0
+ VMOVDQU 32(CX), Y5
+ VPXOR (BX), Y0, Y3
+ VPXOR 32(BX), Y5, Y6
+ VPSHUFD $0x31, Y3, Y4
+ VPSHUFD $0x31, Y6, Y7
+ VPMULUDQ Y3, Y4, Y3
+ VPMULUDQ Y6, Y7, Y6
+ VPSHUFD $0x4e, Y0, Y0
+ VPSHUFD $0x4e, Y5, Y5
+ VPADDQ Y1, Y0, Y1
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y2, Y5, Y2
+ VPADDQ Y2, Y6, Y2
+ ADDQ $0x00000040, CX
+ SUBQ $0x00000040, SI
+ ADDQ $0x00000008, BX
+ JMP accum
+
+finalize:
+ CMPQ SI, $0x00
+ JE return
+ SUBQ $0x40, CX
+ ADDQ SI, CX
+ VMOVDQU (CX), Y0
+ VMOVDQU 32(CX), Y5
+ VPXOR 121(DX), Y0, Y3
+ VPXOR 153(DX), Y5, Y6
+ VPSHUFD $0x31, Y3, Y4
+ VPSHUFD $0x31, Y6, Y7
+ VPMULUDQ Y3, Y4, Y3
+ VPMULUDQ Y6, Y7, Y6
+ VPSHUFD $0x4e, Y0, Y0
+ VPSHUFD $0x4e, Y5, Y5
+ VPADDQ Y1, Y0, Y1
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y2, Y5, Y2
+ VPADDQ Y2, Y6, Y2
+
+return:
+ VMOVDQU Y1, (AX)
+ VMOVDQU Y2, 32(AX)
+ VZEROUPPER
+ RET
+
+// func accumBlockAVX2(acc *[8]uint64, data *byte, key *byte)
+// Requires: AVX, AVX2
+TEXT ·accumBlockAVX2(SB), NOSPLIT, $0-24
+ MOVQ acc+0(FP), AX
+ MOVQ data+8(FP), CX
+ MOVQ key+16(FP), DX
+ VMOVDQU (AX), Y1
+ VMOVDQU 32(AX), Y2
+ VMOVDQU prime_avx<>+0(SB), Y0
+ VMOVDQU (CX), Y3
+ VMOVDQU 32(CX), Y6
+ VPXOR (DX), Y3, Y4
+ VPXOR 32(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 64(CX), Y3
+ VMOVDQU 96(CX), Y6
+ VPXOR 8(DX), Y3, Y4
+ VPXOR 40(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 128(CX), Y3
+ VMOVDQU 160(CX), Y6
+ VPXOR 16(DX), Y3, Y4
+ VPXOR 48(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 192(CX), Y3
+ VMOVDQU 224(CX), Y6
+ VPXOR 24(DX), Y3, Y4
+ VPXOR 56(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 256(CX), Y3
+ VMOVDQU 288(CX), Y6
+ VPXOR 32(DX), Y3, Y4
+ VPXOR 64(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 320(CX), Y3
+ VMOVDQU 352(CX), Y6
+ VPXOR 40(DX), Y3, Y4
+ VPXOR 72(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 384(CX), Y3
+ VMOVDQU 416(CX), Y6
+ VPXOR 48(DX), Y3, Y4
+ VPXOR 80(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 448(CX), Y3
+ VMOVDQU 480(CX), Y6
+ VPXOR 56(DX), Y3, Y4
+ VPXOR 88(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 512(CX), Y3
+ VMOVDQU 544(CX), Y6
+ VPXOR 64(DX), Y3, Y4
+ VPXOR 96(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 576(CX), Y3
+ VMOVDQU 608(CX), Y6
+ VPXOR 72(DX), Y3, Y4
+ VPXOR 104(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 640(CX), Y3
+ VMOVDQU 672(CX), Y6
+ VPXOR 80(DX), Y3, Y4
+ VPXOR 112(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 704(CX), Y3
+ VMOVDQU 736(CX), Y6
+ VPXOR 88(DX), Y3, Y4
+ VPXOR 120(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 768(CX), Y3
+ VMOVDQU 800(CX), Y6
+ VPXOR 96(DX), Y3, Y4
+ VPXOR 128(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 832(CX), Y3
+ VMOVDQU 864(CX), Y6
+ VPXOR 104(DX), Y3, Y4
+ VPXOR 136(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 896(CX), Y3
+ VMOVDQU 928(CX), Y6
+ VPXOR 112(DX), Y3, Y4
+ VPXOR 144(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VMOVDQU 960(CX), Y3
+ VMOVDQU 992(CX), Y6
+ VPXOR 120(DX), Y3, Y4
+ VPXOR 152(DX), Y6, Y7
+ VPSHUFD $0x31, Y4, Y5
+ VPSHUFD $0x31, Y7, Y8
+ VPMULUDQ Y4, Y5, Y4
+ VPMULUDQ Y7, Y8, Y7
+ VPSHUFD $0x4e, Y3, Y3
+ VPSHUFD $0x4e, Y6, Y6
+ VPADDQ Y1, Y3, Y1
+ VPADDQ Y1, Y4, Y1
+ VPADDQ Y2, Y6, Y2
+ VPADDQ Y2, Y7, Y2
+ VPSRLQ $0x2f, Y1, Y3
+ VPXOR Y1, Y3, Y3
+ VPXOR 128(DX), Y3, Y3
+ VPMULUDQ Y0, Y3, Y1
+ VPSHUFD $0xf5, Y3, Y3
+ VPMULUDQ Y0, Y3, Y3
+ VPSLLQ $0x20, Y3, Y3
+ VPADDQ Y1, Y3, Y1
+ VPSRLQ $0x2f, Y2, Y3
+ VPXOR Y2, Y3, Y3
+ VPXOR 160(DX), Y3, Y3
+ VPMULUDQ Y0, Y3, Y2
+ VPSHUFD $0xf5, Y3, Y3
+ VPMULUDQ Y0, Y3, Y3
+ VPSLLQ $0x20, Y3, Y3
+ VPADDQ Y2, Y3, Y2
+ VMOVDQU Y1, (AX)
+ VMOVDQU Y2, 32(AX)
+ VZEROUPPER
+ RET
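The TEXT directives above define the package-local symbols ·accumAVX2 and ·accumBlockAVX2, so the Go side of package xxh3 needs matching stub declarations with the signatures given in the generated comments. Below is a minimal sketch of how those declarations and a caller could look. The file name, the //go:noescape directives, the golang.org/x/sys/cpu feature gate, and the hasAVX2/accumulate names are illustrative assumptions, not the package's actual wiring; only the two function signatures come from the assembly itself.

// accum_stubs_sketch_amd64.go — a hedged sketch of the Go-side glue, not the
// package's real stub file.
package xxh3

import "golang.org/x/sys/cpu" // assumption: CPU feature detection via x/sys/cpu

// Declarations matching the TEXT symbols in accum_vector_avx_amd64.s.
// The signatures are copied from the generated "// func ..." comments above.
//
//go:noescape
func accumAVX2(acc *[8]uint64, data *byte, key *byte, len uint64)

//go:noescape
func accumBlockAVX2(acc *[8]uint64, data *byte, key *byte)

// hasAVX2 gates the vectorized path (illustrative name; the real package may
// detect features differently).
var hasAVX2 = cpu.X86.HasAVX2

// accumulate folds data into the eight 64-bit accumulator lanes using the
// AVX2 kernel when it is available. accumAVX2 handles arbitrary lengths and
// key striding internally; accumBlockAVX2 (not called here) consumes one
// fixed-size block, reading data offsets 0 through 1023 in the assembly above.
// The secret/key layout and the scalar fallback live elsewhere in the package
// and are outside the scope of this sketch.
func accumulate(acc *[8]uint64, data, key []byte) {
	if !hasAVX2 || len(data) == 0 {
		// Fall back to the package's non-AVX2 paths (not shown here).
		return
	}
	accumAVX2(acc, &data[0], &key[0], uint64(len(data)))
}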