diff options
Diffstat (limited to 'vendor/github.com/minio/md5-simd/block16_amd64.s')
-rw-r--r-- | vendor/github.com/minio/md5-simd/block16_amd64.s | 107 |
1 files changed, 54 insertions, 53 deletions
diff --git a/vendor/github.com/minio/md5-simd/block16_amd64.s b/vendor/github.com/minio/md5-simd/block16_amd64.s index d32c12200..be0a43a3b 100644 --- a/vendor/github.com/minio/md5-simd/block16_amd64.s +++ b/vendor/github.com/minio/md5-simd/block16_amd64.s @@ -2,70 +2,72 @@ // Use of this source code is governed by a license that can be // found in the LICENSE file. +//+build !noasm,!appengine,gc + // This is the AVX512 implementation of the MD5 block function (16-way parallel) #define prep(index) \ - KMOVQ kmask, ktmp \ + KMOVQ kmask, ktmp \ VPGATHERDD index*4(base)(ptrs*1), ktmp, mem #define ROUND1(a, b, c, d, index, const, shift) \ - VXORPS c, tmp, tmp \ - VPADDD 64*const(consts), a, a \ - VPADDD mem, a, a \ - VPTERNLOGD $0x6C, b, d, tmp \ - prep(index) \ - VPADDD tmp, a, a \ - VPROLD $shift, a, a \ - VMOVAPD c, tmp \ - VPADDD b, a, a + VPXORQ c, tmp, tmp \ + VPADDD 64*const(consts), a, a \ + VPADDD mem, a, a \ + VPTERNLOGD $0x6C, b, d, tmp \ + prep(index) \ + VPADDD tmp, a, a \ + VPROLD $shift, a, a \ + VMOVAPD c, tmp \ + VPADDD b, a, a #define ROUND1noload(a, b, c, d, const, shift) \ - VXORPS c, tmp, tmp \ - VPADDD 64*const(consts), a, a \ - VPADDD mem, a, a \ - VPTERNLOGD $0x6C, b, d, tmp \ - VPADDD tmp, a, a \ - VPROLD $shift, a, a \ - VMOVAPD c, tmp \ - VPADDD b, a, a + VPXORQ c, tmp, tmp \ + VPADDD 64*const(consts), a, a \ + VPADDD mem, a, a \ + VPTERNLOGD $0x6C, b, d, tmp \ + VPADDD tmp, a, a \ + VPROLD $shift, a, a \ + VMOVAPD c, tmp \ + VPADDD b, a, a #define ROUND2(a, b, c, d, zreg, const, shift) \ - VPADDD 64*const(consts), a, a \ - VPADDD zreg, a, a \ - VANDNPS c, tmp, tmp \ - VPTERNLOGD $0xEC, b, tmp, tmp2 \ - VMOVAPD c, tmp \ - VPADDD tmp2, a, a \ - VMOVAPD c, tmp2 \ - VPROLD $shift, a, a \ - VPADDD b, a, a + VPADDD 64*const(consts), a, a \ + VPADDD zreg, a, a \ + VANDNPD c, tmp, tmp \ + VPTERNLOGD $0xEC, b, tmp, tmp2 \ + VMOVAPD c, tmp \ + VPADDD tmp2, a, a \ + VMOVAPD c, tmp2 \ + VPROLD $shift, a, a \ + VPADDD b, a, a #define ROUND3(a, b, c, d, zreg, const, shift) \ - VPADDD 64*const(consts), a, a \ - VPADDD zreg, a, a \ - VPTERNLOGD $0x96, b, d, tmp \ - VPADDD tmp, a, a \ - VPROLD $shift, a, a \ - VMOVAPD b, tmp \ - VPADDD b, a, a + VPADDD 64*const(consts), a, a \ + VPADDD zreg, a, a \ + VPTERNLOGD $0x96, b, d, tmp \ + VPADDD tmp, a, a \ + VPROLD $shift, a, a \ + VMOVAPD b, tmp \ + VPADDD b, a, a #define ROUND4(a, b, c, d, zreg, const, shift) \ - VPADDD 64*const(consts), a, a \ - VPADDD zreg, a, a \ - VPTERNLOGD $0x36, b, c, tmp \ - VPADDD tmp, a, a \ - VPROLD $shift, a, a \ - VXORPS c, ones, tmp \ - VPADDD b, a, a - -TEXT ·block16(SB),4,$0-40 - - MOVQ state+0(FP), BX - MOVQ base+8(FP), SI - MOVQ ptrs+16(FP), AX - KMOVQ mask+24(FP), K1 - MOVQ n+32(FP), DX - MOVQ ·avx512md5consts+0(SB), DI + VPADDD 64*const(consts), a, a \ + VPADDD zreg, a, a \ + VPTERNLOGD $0x36, b, c, tmp \ + VPADDD tmp, a, a \ + VPROLD $shift, a, a \ + VPXORQ c, ones, tmp \ + VPADDD b, a, a + +TEXT ·block16(SB), 4, $0-40 + + MOVQ state+0(FP), BX + MOVQ base+8(FP), SI + MOVQ ptrs+16(FP), AX + KMOVQ mask+24(FP), K1 + MOVQ n+32(FP), DX + MOVQ ·avx512md5consts+0(SB), DI #define a Z0 #define b Z1 @@ -90,7 +92,6 @@ TEXT ·block16(SB),4,$0-40 // Registers Z16 through to Z31 are used for caching purposes // ---------------------------------------------------------- - #define dig BX #define count DX #define base SI @@ -105,7 +106,7 @@ TEXT ·block16(SB),4,$0-40 // load source pointers VMOVUPD 0x00(AX), ptrs - MOVQ $-1, AX + MOVQ $-1, AX VPBROADCASTQ AX, ones loop: @@ -190,7 +191,7 @@ loop: ROUND3(c,d,a,b, Z31,0x2e,16) ROUND3(b,c,d,a, Z18,0x2f,23) - VXORPS d, ones, tmp + VPXORQ d, ones, tmp ROUND4(a,b,c,d, Z16,0x30, 6) ROUND4(d,a,b,c, Z23,0x31,10) |