Diffstat (limited to 'vendor/github.com/zeebo/xxh3/accum_vector_avx_amd64.s')
-rw-r--r-- | vendor/github.com/zeebo/xxh3/accum_vector_avx_amd64.s | 586 |
1 file changed, 586 insertions, 0 deletions
diff --git a/vendor/github.com/zeebo/xxh3/accum_vector_avx_amd64.s b/vendor/github.com/zeebo/xxh3/accum_vector_avx_amd64.s
new file mode 100644
index 000000000..b53c1521f
--- /dev/null
+++ b/vendor/github.com/zeebo/xxh3/accum_vector_avx_amd64.s
@@ -0,0 +1,586 @@
+// Code generated by command: go run gen.go -avx -out ../accum_vector_avx_amd64.s -pkg xxh3. DO NOT EDIT.
+
+#include "textflag.h"
+
+DATA prime_avx<>+0(SB)/8, $0x000000009e3779b1
+DATA prime_avx<>+8(SB)/8, $0x000000009e3779b1
+DATA prime_avx<>+16(SB)/8, $0x000000009e3779b1
+DATA prime_avx<>+24(SB)/8, $0x000000009e3779b1
+GLOBL prime_avx<>(SB), RODATA|NOPTR, $32
+
+// func accumAVX2(acc *[8]uint64, data *byte, key *byte, len uint64)
+// Requires: AVX, AVX2, MMX+
+TEXT ·accumAVX2(SB), NOSPLIT, $0-32
+	MOVQ acc+0(FP), AX
+	MOVQ data+8(FP), CX
+	MOVQ key+16(FP), DX
+	MOVQ key+16(FP), BX
+	MOVQ len+24(FP), SI
+	VMOVDQU (AX), Y1
+	VMOVDQU 32(AX), Y2
+	VMOVDQU prime_avx<>+0(SB), Y0
+
+accum_large:
+	CMPQ SI, $0x00000400
+	JLE accum
+	VMOVDQU (CX), Y3
+	VMOVDQU 32(CX), Y6
+	PREFETCHT0 512(CX)
+	VPXOR (DX), Y3, Y4
+	VPXOR 32(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 64(CX), Y3
+	VMOVDQU 96(CX), Y6
+	PREFETCHT0 576(CX)
+	VPXOR 8(DX), Y3, Y4
+	VPXOR 40(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 128(CX), Y3
+	VMOVDQU 160(CX), Y6
+	PREFETCHT0 640(CX)
+	VPXOR 16(DX), Y3, Y4
+	VPXOR 48(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 192(CX), Y3
+	VMOVDQU 224(CX), Y6
+	PREFETCHT0 704(CX)
+	VPXOR 24(DX), Y3, Y4
+	VPXOR 56(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 256(CX), Y3
+	VMOVDQU 288(CX), Y6
+	PREFETCHT0 768(CX)
+	VPXOR 32(DX), Y3, Y4
+	VPXOR 64(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 320(CX), Y3
+	VMOVDQU 352(CX), Y6
+	PREFETCHT0 832(CX)
+	VPXOR 40(DX), Y3, Y4
+	VPXOR 72(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 384(CX), Y3
+	VMOVDQU 416(CX), Y6
+	PREFETCHT0 896(CX)
+	VPXOR 48(DX), Y3, Y4
+	VPXOR 80(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 448(CX), Y3
+	VMOVDQU 480(CX), Y6
+	PREFETCHT0 960(CX)
+	VPXOR 56(DX), Y3, Y4
+	VPXOR 88(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 512(CX), Y3
+	VMOVDQU 544(CX), Y6
+	PREFETCHT0 1024(CX)
+	VPXOR 64(DX), Y3, Y4
+	VPXOR 96(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 576(CX), Y3
+	VMOVDQU 608(CX), Y6
+	PREFETCHT0 1088(CX)
+	VPXOR 72(DX), Y3, Y4
+	VPXOR 104(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 640(CX), Y3
+	VMOVDQU 672(CX), Y6
+	PREFETCHT0 1152(CX)
+	VPXOR 80(DX), Y3, Y4
+	VPXOR 112(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 704(CX), Y3
+	VMOVDQU 736(CX), Y6
+	PREFETCHT0 1216(CX)
+	VPXOR 88(DX), Y3, Y4
+	VPXOR 120(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 768(CX), Y3
+	VMOVDQU 800(CX), Y6
+	PREFETCHT0 1280(CX)
+	VPXOR 96(DX), Y3, Y4
+	VPXOR 128(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 832(CX), Y3
+	VMOVDQU 864(CX), Y6
+	PREFETCHT0 1344(CX)
+	VPXOR 104(DX), Y3, Y4
+	VPXOR 136(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 896(CX), Y3
+	VMOVDQU 928(CX), Y6
+	PREFETCHT0 1408(CX)
+	VPXOR 112(DX), Y3, Y4
+	VPXOR 144(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 960(CX), Y3
+	VMOVDQU 992(CX), Y6
+	PREFETCHT0 1472(CX)
+	VPXOR 120(DX), Y3, Y4
+	VPXOR 152(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	ADDQ $0x00000400, CX
+	SUBQ $0x00000400, SI
+	VPSRLQ $0x2f, Y1, Y3
+	VPXOR Y1, Y3, Y3
+	VPXOR 128(DX), Y3, Y3
+	VPMULUDQ Y0, Y3, Y1
+	VPSHUFD $0xf5, Y3, Y3
+	VPMULUDQ Y0, Y3, Y3
+	VPSLLQ $0x20, Y3, Y3
+	VPADDQ Y1, Y3, Y1
+	VPSRLQ $0x2f, Y2, Y3
+	VPXOR Y2, Y3, Y3
+	VPXOR 160(DX), Y3, Y3
+	VPMULUDQ Y0, Y3, Y2
+	VPSHUFD $0xf5, Y3, Y3
+	VPMULUDQ Y0, Y3, Y3
+	VPSLLQ $0x20, Y3, Y3
+	VPADDQ Y2, Y3, Y2
+	JMP accum_large
+
+accum:
+	CMPQ SI, $0x40
+	JLE finalize
+	VMOVDQU (CX), Y0
+	VMOVDQU 32(CX), Y5
+	VPXOR (BX), Y0, Y3
+	VPXOR 32(BX), Y5, Y6
+	VPSHUFD $0x31, Y3, Y4
+	VPSHUFD $0x31, Y6, Y7
+	VPMULUDQ Y3, Y4, Y3
+	VPMULUDQ Y6, Y7, Y6
+	VPSHUFD $0x4e, Y0, Y0
+	VPSHUFD $0x4e, Y5, Y5
+	VPADDQ Y1, Y0, Y1
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y2, Y5, Y2
+	VPADDQ Y2, Y6, Y2
+	ADDQ $0x00000040, CX
+	SUBQ $0x00000040, SI
+	ADDQ $0x00000008, BX
+	JMP accum
+
+finalize:
+	CMPQ SI, $0x00
+	JE return
+	SUBQ $0x40, CX
+	ADDQ SI, CX
+	VMOVDQU (CX), Y0
+	VMOVDQU 32(CX), Y5
+	VPXOR 121(DX), Y0, Y3
+	VPXOR 153(DX), Y5, Y6
+	VPSHUFD $0x31, Y3, Y4
+	VPSHUFD $0x31, Y6, Y7
+	VPMULUDQ Y3, Y4, Y3
+	VPMULUDQ Y6, Y7, Y6
+	VPSHUFD $0x4e, Y0, Y0
+	VPSHUFD $0x4e, Y5, Y5
+	VPADDQ Y1, Y0, Y1
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y2, Y5, Y2
+	VPADDQ Y2, Y6, Y2
+
+return:
+	VMOVDQU Y1, (AX)
+	VMOVDQU Y2, 32(AX)
+	VZEROUPPER
+	RET
+
+// func accumBlockAVX2(acc *[8]uint64, data *byte, key *byte)
+// Requires: AVX, AVX2
+TEXT ·accumBlockAVX2(SB), NOSPLIT, $0-24
+	MOVQ acc+0(FP), AX
+	MOVQ data+8(FP), CX
+	MOVQ key+16(FP), DX
+	VMOVDQU (AX), Y1
+	VMOVDQU 32(AX), Y2
+	VMOVDQU prime_avx<>+0(SB), Y0
+	VMOVDQU (CX), Y3
+	VMOVDQU 32(CX), Y6
+	VPXOR (DX), Y3, Y4
+	VPXOR 32(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 64(CX), Y3
+	VMOVDQU 96(CX), Y6
+	VPXOR 8(DX), Y3, Y4
+	VPXOR 40(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 128(CX), Y3
+	VMOVDQU 160(CX), Y6
+	VPXOR 16(DX), Y3, Y4
+	VPXOR 48(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 192(CX), Y3
+	VMOVDQU 224(CX), Y6
+	VPXOR 24(DX), Y3, Y4
+	VPXOR 56(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 256(CX), Y3
+	VMOVDQU 288(CX), Y6
+	VPXOR 32(DX), Y3, Y4
+	VPXOR 64(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 320(CX), Y3
+	VMOVDQU 352(CX), Y6
+	VPXOR 40(DX), Y3, Y4
+	VPXOR 72(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 384(CX), Y3
+	VMOVDQU 416(CX), Y6
+	VPXOR 48(DX), Y3, Y4
+	VPXOR 80(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 448(CX), Y3
+	VMOVDQU 480(CX), Y6
+	VPXOR 56(DX), Y3, Y4
+	VPXOR 88(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 512(CX), Y3
+	VMOVDQU 544(CX), Y6
+	VPXOR 64(DX), Y3, Y4
+	VPXOR 96(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 576(CX), Y3
+	VMOVDQU 608(CX), Y6
+	VPXOR 72(DX), Y3, Y4
+	VPXOR 104(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 640(CX), Y3
+	VMOVDQU 672(CX), Y6
+	VPXOR 80(DX), Y3, Y4
+	VPXOR 112(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 704(CX), Y3
+	VMOVDQU 736(CX), Y6
+	VPXOR 88(DX), Y3, Y4
+	VPXOR 120(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 768(CX), Y3
+	VMOVDQU 800(CX), Y6
+	VPXOR 96(DX), Y3, Y4
+	VPXOR 128(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 832(CX), Y3
+	VMOVDQU 864(CX), Y6
+	VPXOR 104(DX), Y3, Y4
+	VPXOR 136(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 896(CX), Y3
+	VMOVDQU 928(CX), Y6
+	VPXOR 112(DX), Y3, Y4
+	VPXOR 144(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VMOVDQU 960(CX), Y3
+	VMOVDQU 992(CX), Y6
+	VPXOR 120(DX), Y3, Y4
+	VPXOR 152(DX), Y6, Y7
+	VPSHUFD $0x31, Y4, Y5
+	VPSHUFD $0x31, Y7, Y8
+	VPMULUDQ Y4, Y5, Y4
+	VPMULUDQ Y7, Y8, Y7
+	VPSHUFD $0x4e, Y3, Y3
+	VPSHUFD $0x4e, Y6, Y6
+	VPADDQ Y1, Y3, Y1
+	VPADDQ Y1, Y4, Y1
+	VPADDQ Y2, Y6, Y2
+	VPADDQ Y2, Y7, Y2
+	VPSRLQ $0x2f, Y1, Y3
+	VPXOR Y1, Y3, Y3
+	VPXOR 128(DX), Y3, Y3
+	VPMULUDQ Y0, Y3, Y1
+	VPSHUFD $0xf5, Y3, Y3
+	VPMULUDQ Y0, Y3, Y3
+	VPSLLQ $0x20, Y3, Y3
+	VPADDQ Y1, Y3, Y1
+	VPSRLQ $0x2f, Y2, Y3
+	VPXOR Y2, Y3, Y3
+	VPXOR 160(DX), Y3, Y3
+	VPMULUDQ Y0, Y3, Y2
+	VPSHUFD $0xf5, Y3, Y3
+	VPMULUDQ Y0, Y3, Y3
+	VPSLLQ $0x20, Y3, Y3
+	VPADDQ Y2, Y3, Y2
+	VMOVDQU Y1, (AX)
+	VMOVDQU Y2, 32(AX)
+	VZEROUPPER
+	RET
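For readers comparing this against the scalar XXH3 algorithm: each repeated load/xor/multiply/add stanza above processes one 64-byte stripe (two YMM registers per accumulator half), and the VPSRLQ/VPXOR/VPMULUDQ block that runs after every 1024 input bytes is the scramble step. The following is a minimal scalar Go sketch of what those two pieces compute per 64-bit lane, based on the standard XXH3 round; the package and function names (xxh3sketch, accumScalar, scrambleScalar) are illustrative and not identifiers from this vendored package.

package xxh3sketch

import "encoding/binary"

// prime32_1 mirrors the 32-bit constant stored four times in prime_avx<>.
const prime32_1 = 0x9e3779b1

// accumScalar sketches one 64-byte stanza: data and key must be at least
// 64 bytes long.
func accumScalar(acc *[8]uint64, data, key []byte) {
	for i := 0; i < 8; i++ {
		dataVal := binary.LittleEndian.Uint64(data[8*i:])
		dataKey := dataVal ^ binary.LittleEndian.Uint64(key[8*i:])
		// VPSHUFD $0x4e + VPADDQ: the neighbouring lane accumulates the raw data.
		acc[i^1] += dataVal
		// VPSHUFD $0x31 + VPMULUDQ + VPADDQ: 32x32->64 multiply of the two
		// halves of dataKey.
		acc[i] += uint64(uint32(dataKey)) * (dataKey >> 32)
	}
}

// scrambleScalar sketches the block run after every 1024 input bytes; in the
// assembly the key operand is the secret at byte offset 128 (and 160 for the
// upper four accumulators).
func scrambleScalar(acc *[8]uint64, key []byte) {
	for i := 0; i < 8; i++ {
		a := acc[i] ^ (acc[i] >> 47) ^ binary.LittleEndian.Uint64(key[8*i:])
		// Two VPMULUDQs plus VPSLLQ/VPADDQ build this 64x32-bit multiply.
		acc[i] = a * prime32_1
	}
}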
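On the Go side, avo-generated assembly like this is paired with declarations matching the "// func" comment headers. A sketch of what such declarations look like is below; the signatures are copied from the headers, while the build constraint and go:noescape directives reflect common practice rather than this package verbatim.

//go:build amd64

package xxh3

// Bodies are provided by accum_vector_avx_amd64.s.

//go:noescape
func accumAVX2(acc *[8]uint64, data *byte, key *byte, len uint64)

//go:noescape
func accumBlockAVX2(acc *[8]uint64, data *byte, key *byte)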