diff options
author | 2022-09-28 18:30:40 +0100 | |
---|---|---|
committer | 2022-09-28 18:30:40 +0100 | |
commit | a156188b3eb5cb3da44aa1b7452265f5fa38a607 (patch) | |
tree | 7097fa48d56fbabc7c2c8750b1f3bc9321d71c0f /vendor/github.com/minio/md5-simd | |
parent | [bugfix] Fix emphasis being added to emoji shortcodes with markdown parsing (... (diff) | |
download | gotosocial-a156188b3eb5cb3da44aa1b7452265f5fa38a607.tar.xz |
[chore] update dependencies, bump to Go 1.19.1 (#826)
* update dependencies, bump Go version to 1.19
* bump test image Go version
* update golangci-lint
* update gotosocial-drone-build
* sign
* linting, go fmt
* update swagger docs
* update swagger docs
* whitespace
* update contributing.md
* fuckin whoopsie doopsie
* linterino, linteroni
* fix followrequest test not starting processor
* fix other api/client tests not starting processor
* fix remaining tests where processor not started
* bump go-runners version
* don't check last-webfingered-at, processor may have updated this
* update swagger command
* update bun to latest version
* fix embed to work the same as before with new bun
Signed-off-by: kim <grufwub@gmail.com>
Co-authored-by: tsmethurst <tobi.smethurst@protonmail.com>
Diffstat (limited to 'vendor/github.com/minio/md5-simd')
-rw-r--r-- | vendor/github.com/minio/md5-simd/LICENSE.Golang | 27 | ||||
-rw-r--r-- | vendor/github.com/minio/md5-simd/README.md | 2 | ||||
-rw-r--r-- | vendor/github.com/minio/md5-simd/block-generic.go | 132 | ||||
-rw-r--r-- | vendor/github.com/minio/md5-simd/block16_amd64.s | 107 | ||||
-rw-r--r-- | vendor/github.com/minio/md5-simd/block8_amd64.s | 36 | ||||
-rw-r--r-- | vendor/github.com/minio/md5-simd/block_amd64.go | 77 | ||||
-rw-r--r-- | vendor/github.com/minio/md5-simd/md5-digest_amd64.go | 12 | ||||
-rw-r--r-- | vendor/github.com/minio/md5-simd/md5-server_amd64.go | 96 | ||||
-rw-r--r-- | vendor/github.com/minio/md5-simd/md5-util_amd64.go | 37 | ||||
-rw-r--r-- | vendor/github.com/minio/md5-simd/md5.go | 6 | ||||
-rw-r--r-- | vendor/github.com/minio/md5-simd/md5block_amd64.go | 11 | ||||
-rw-r--r-- | vendor/github.com/minio/md5-simd/md5block_amd64.s | 714 |
12 files changed, 1007 insertions, 250 deletions
diff --git a/vendor/github.com/minio/md5-simd/LICENSE.Golang b/vendor/github.com/minio/md5-simd/LICENSE.Golang new file mode 100644 index 000000000..6a66aea5e --- /dev/null +++ b/vendor/github.com/minio/md5-simd/LICENSE.Golang @@ -0,0 +1,27 @@ +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/minio/md5-simd/README.md b/vendor/github.com/minio/md5-simd/README.md index 374214d1a..fa6fce1a4 100644 --- a/vendor/github.com/minio/md5-simd/README.md +++ b/vendor/github.com/minio/md5-simd/README.md @@ -116,6 +116,8 @@ BenchmarkParallel/8MB-4 2182.48 17252.88 7.91x These measurements were performed on AWS EC2 instance of type `c5.xlarge` equipped with a Xeon Platinum 8124M CPU at 3.0 GHz. +If only one or two inputs are available the scalar calculation method will be used for the +optimal speed in these cases. ## Operation diff --git a/vendor/github.com/minio/md5-simd/block-generic.go b/vendor/github.com/minio/md5-simd/block-generic.go deleted file mode 100644 index eb333b93f..000000000 --- a/vendor/github.com/minio/md5-simd/block-generic.go +++ /dev/null @@ -1,132 +0,0 @@ -// Copyright 2013 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Code generated by go run gen.go -output md5block.go; DO NOT EDIT. - -package md5simd - -import ( - "encoding/binary" - "math/bits" -) - -type digest struct { - s [4]uint32 - x [BlockSize]byte - nx int - len uint64 -} - -func blockGeneric(dig *digest, p []byte) { - // load state - a, b, c, d := dig.s[0], dig.s[1], dig.s[2], dig.s[3] - - for i := 0; i <= len(p)-BlockSize; i += BlockSize { - // eliminate bounds checks on p - q := p[i:] - q = q[:BlockSize:BlockSize] - - // save current state - aa, bb, cc, dd := a, b, c, d - - // load input block - x0 := binary.LittleEndian.Uint32(q[4*0x0:]) - x1 := binary.LittleEndian.Uint32(q[4*0x1:]) - x2 := binary.LittleEndian.Uint32(q[4*0x2:]) - x3 := binary.LittleEndian.Uint32(q[4*0x3:]) - x4 := binary.LittleEndian.Uint32(q[4*0x4:]) - x5 := binary.LittleEndian.Uint32(q[4*0x5:]) - x6 := binary.LittleEndian.Uint32(q[4*0x6:]) - x7 := binary.LittleEndian.Uint32(q[4*0x7:]) - x8 := binary.LittleEndian.Uint32(q[4*0x8:]) - x9 := binary.LittleEndian.Uint32(q[4*0x9:]) - xa := binary.LittleEndian.Uint32(q[4*0xa:]) - xb := binary.LittleEndian.Uint32(q[4*0xb:]) - xc := binary.LittleEndian.Uint32(q[4*0xc:]) - xd := binary.LittleEndian.Uint32(q[4*0xd:]) - xe := binary.LittleEndian.Uint32(q[4*0xe:]) - xf := binary.LittleEndian.Uint32(q[4*0xf:]) - - // round 1 - a = b + bits.RotateLeft32((((c^d)&b)^d)+a+x0+0xd76aa478, 7) - d = a + bits.RotateLeft32((((b^c)&a)^c)+d+x1+0xe8c7b756, 12) - c = d + bits.RotateLeft32((((a^b)&d)^b)+c+x2+0x242070db, 17) - b = c + bits.RotateLeft32((((d^a)&c)^a)+b+x3+0xc1bdceee, 22) - a = b + bits.RotateLeft32((((c^d)&b)^d)+a+x4+0xf57c0faf, 7) - d = a + bits.RotateLeft32((((b^c)&a)^c)+d+x5+0x4787c62a, 12) - c = d + bits.RotateLeft32((((a^b)&d)^b)+c+x6+0xa8304613, 17) - b = c + bits.RotateLeft32((((d^a)&c)^a)+b+x7+0xfd469501, 22) - a = b + bits.RotateLeft32((((c^d)&b)^d)+a+x8+0x698098d8, 7) - d = a + bits.RotateLeft32((((b^c)&a)^c)+d+x9+0x8b44f7af, 12) - c = d + bits.RotateLeft32((((a^b)&d)^b)+c+xa+0xffff5bb1, 17) - b = c + bits.RotateLeft32((((d^a)&c)^a)+b+xb+0x895cd7be, 22) - a = b + bits.RotateLeft32((((c^d)&b)^d)+a+xc+0x6b901122, 7) - d = a + bits.RotateLeft32((((b^c)&a)^c)+d+xd+0xfd987193, 12) - c = d + bits.RotateLeft32((((a^b)&d)^b)+c+xe+0xa679438e, 17) - b = c + bits.RotateLeft32((((d^a)&c)^a)+b+xf+0x49b40821, 22) - - // round 2 - a = b + bits.RotateLeft32((((b^c)&d)^c)+a+x1+0xf61e2562, 5) - d = a + bits.RotateLeft32((((a^b)&c)^b)+d+x6+0xc040b340, 9) - c = d + bits.RotateLeft32((((d^a)&b)^a)+c+xb+0x265e5a51, 14) - b = c + bits.RotateLeft32((((c^d)&a)^d)+b+x0+0xe9b6c7aa, 20) - a = b + bits.RotateLeft32((((b^c)&d)^c)+a+x5+0xd62f105d, 5) - d = a + bits.RotateLeft32((((a^b)&c)^b)+d+xa+0x02441453, 9) - c = d + bits.RotateLeft32((((d^a)&b)^a)+c+xf+0xd8a1e681, 14) - b = c + bits.RotateLeft32((((c^d)&a)^d)+b+x4+0xe7d3fbc8, 20) - a = b + bits.RotateLeft32((((b^c)&d)^c)+a+x9+0x21e1cde6, 5) - d = a + bits.RotateLeft32((((a^b)&c)^b)+d+xe+0xc33707d6, 9) - c = d + bits.RotateLeft32((((d^a)&b)^a)+c+x3+0xf4d50d87, 14) - b = c + bits.RotateLeft32((((c^d)&a)^d)+b+x8+0x455a14ed, 20) - a = b + bits.RotateLeft32((((b^c)&d)^c)+a+xd+0xa9e3e905, 5) - d = a + bits.RotateLeft32((((a^b)&c)^b)+d+x2+0xfcefa3f8, 9) - c = d + bits.RotateLeft32((((d^a)&b)^a)+c+x7+0x676f02d9, 14) - b = c + bits.RotateLeft32((((c^d)&a)^d)+b+xc+0x8d2a4c8a, 20) - - // round 3 - a = b + bits.RotateLeft32((b^c^d)+a+x5+0xfffa3942, 4) - d = a + bits.RotateLeft32((a^b^c)+d+x8+0x8771f681, 11) - c = d + bits.RotateLeft32((d^a^b)+c+xb+0x6d9d6122, 16) - b = c + bits.RotateLeft32((c^d^a)+b+xe+0xfde5380c, 23) - a = b + bits.RotateLeft32((b^c^d)+a+x1+0xa4beea44, 4) - d = a + bits.RotateLeft32((a^b^c)+d+x4+0x4bdecfa9, 11) - c = d + bits.RotateLeft32((d^a^b)+c+x7+0xf6bb4b60, 16) - b = c + bits.RotateLeft32((c^d^a)+b+xa+0xbebfbc70, 23) - a = b + bits.RotateLeft32((b^c^d)+a+xd+0x289b7ec6, 4) - d = a + bits.RotateLeft32((a^b^c)+d+x0+0xeaa127fa, 11) - c = d + bits.RotateLeft32((d^a^b)+c+x3+0xd4ef3085, 16) - b = c + bits.RotateLeft32((c^d^a)+b+x6+0x04881d05, 23) - a = b + bits.RotateLeft32((b^c^d)+a+x9+0xd9d4d039, 4) - d = a + bits.RotateLeft32((a^b^c)+d+xc+0xe6db99e5, 11) - c = d + bits.RotateLeft32((d^a^b)+c+xf+0x1fa27cf8, 16) - b = c + bits.RotateLeft32((c^d^a)+b+x2+0xc4ac5665, 23) - - // round 4 - a = b + bits.RotateLeft32((c^(b|^d))+a+x0+0xf4292244, 6) - d = a + bits.RotateLeft32((b^(a|^c))+d+x7+0x432aff97, 10) - c = d + bits.RotateLeft32((a^(d|^b))+c+xe+0xab9423a7, 15) - b = c + bits.RotateLeft32((d^(c|^a))+b+x5+0xfc93a039, 21) - a = b + bits.RotateLeft32((c^(b|^d))+a+xc+0x655b59c3, 6) - d = a + bits.RotateLeft32((b^(a|^c))+d+x3+0x8f0ccc92, 10) - c = d + bits.RotateLeft32((a^(d|^b))+c+xa+0xffeff47d, 15) - b = c + bits.RotateLeft32((d^(c|^a))+b+x1+0x85845dd1, 21) - a = b + bits.RotateLeft32((c^(b|^d))+a+x8+0x6fa87e4f, 6) - d = a + bits.RotateLeft32((b^(a|^c))+d+xf+0xfe2ce6e0, 10) - c = d + bits.RotateLeft32((a^(d|^b))+c+x6+0xa3014314, 15) - b = c + bits.RotateLeft32((d^(c|^a))+b+xd+0x4e0811a1, 21) - a = b + bits.RotateLeft32((c^(b|^d))+a+x4+0xf7537e82, 6) - d = a + bits.RotateLeft32((b^(a|^c))+d+xb+0xbd3af235, 10) - c = d + bits.RotateLeft32((a^(d|^b))+c+x2+0x2ad7d2bb, 15) - b = c + bits.RotateLeft32((d^(c|^a))+b+x9+0xeb86d391, 21) - - // add saved state - a += aa - b += bb - c += cc - d += dd - } - - // save state - dig.s[0], dig.s[1], dig.s[2], dig.s[3] = a, b, c, d -} diff --git a/vendor/github.com/minio/md5-simd/block16_amd64.s b/vendor/github.com/minio/md5-simd/block16_amd64.s index d32c12200..be0a43a3b 100644 --- a/vendor/github.com/minio/md5-simd/block16_amd64.s +++ b/vendor/github.com/minio/md5-simd/block16_amd64.s @@ -2,70 +2,72 @@ // Use of this source code is governed by a license that can be // found in the LICENSE file. +//+build !noasm,!appengine,gc + // This is the AVX512 implementation of the MD5 block function (16-way parallel) #define prep(index) \ - KMOVQ kmask, ktmp \ + KMOVQ kmask, ktmp \ VPGATHERDD index*4(base)(ptrs*1), ktmp, mem #define ROUND1(a, b, c, d, index, const, shift) \ - VXORPS c, tmp, tmp \ - VPADDD 64*const(consts), a, a \ - VPADDD mem, a, a \ - VPTERNLOGD $0x6C, b, d, tmp \ - prep(index) \ - VPADDD tmp, a, a \ - VPROLD $shift, a, a \ - VMOVAPD c, tmp \ - VPADDD b, a, a + VPXORQ c, tmp, tmp \ + VPADDD 64*const(consts), a, a \ + VPADDD mem, a, a \ + VPTERNLOGD $0x6C, b, d, tmp \ + prep(index) \ + VPADDD tmp, a, a \ + VPROLD $shift, a, a \ + VMOVAPD c, tmp \ + VPADDD b, a, a #define ROUND1noload(a, b, c, d, const, shift) \ - VXORPS c, tmp, tmp \ - VPADDD 64*const(consts), a, a \ - VPADDD mem, a, a \ - VPTERNLOGD $0x6C, b, d, tmp \ - VPADDD tmp, a, a \ - VPROLD $shift, a, a \ - VMOVAPD c, tmp \ - VPADDD b, a, a + VPXORQ c, tmp, tmp \ + VPADDD 64*const(consts), a, a \ + VPADDD mem, a, a \ + VPTERNLOGD $0x6C, b, d, tmp \ + VPADDD tmp, a, a \ + VPROLD $shift, a, a \ + VMOVAPD c, tmp \ + VPADDD b, a, a #define ROUND2(a, b, c, d, zreg, const, shift) \ - VPADDD 64*const(consts), a, a \ - VPADDD zreg, a, a \ - VANDNPS c, tmp, tmp \ - VPTERNLOGD $0xEC, b, tmp, tmp2 \ - VMOVAPD c, tmp \ - VPADDD tmp2, a, a \ - VMOVAPD c, tmp2 \ - VPROLD $shift, a, a \ - VPADDD b, a, a + VPADDD 64*const(consts), a, a \ + VPADDD zreg, a, a \ + VANDNPD c, tmp, tmp \ + VPTERNLOGD $0xEC, b, tmp, tmp2 \ + VMOVAPD c, tmp \ + VPADDD tmp2, a, a \ + VMOVAPD c, tmp2 \ + VPROLD $shift, a, a \ + VPADDD b, a, a #define ROUND3(a, b, c, d, zreg, const, shift) \ - VPADDD 64*const(consts), a, a \ - VPADDD zreg, a, a \ - VPTERNLOGD $0x96, b, d, tmp \ - VPADDD tmp, a, a \ - VPROLD $shift, a, a \ - VMOVAPD b, tmp \ - VPADDD b, a, a + VPADDD 64*const(consts), a, a \ + VPADDD zreg, a, a \ + VPTERNLOGD $0x96, b, d, tmp \ + VPADDD tmp, a, a \ + VPROLD $shift, a, a \ + VMOVAPD b, tmp \ + VPADDD b, a, a #define ROUND4(a, b, c, d, zreg, const, shift) \ - VPADDD 64*const(consts), a, a \ - VPADDD zreg, a, a \ - VPTERNLOGD $0x36, b, c, tmp \ - VPADDD tmp, a, a \ - VPROLD $shift, a, a \ - VXORPS c, ones, tmp \ - VPADDD b, a, a - -TEXT ·block16(SB),4,$0-40 - - MOVQ state+0(FP), BX - MOVQ base+8(FP), SI - MOVQ ptrs+16(FP), AX - KMOVQ mask+24(FP), K1 - MOVQ n+32(FP), DX - MOVQ ·avx512md5consts+0(SB), DI + VPADDD 64*const(consts), a, a \ + VPADDD zreg, a, a \ + VPTERNLOGD $0x36, b, c, tmp \ + VPADDD tmp, a, a \ + VPROLD $shift, a, a \ + VPXORQ c, ones, tmp \ + VPADDD b, a, a + +TEXT ·block16(SB), 4, $0-40 + + MOVQ state+0(FP), BX + MOVQ base+8(FP), SI + MOVQ ptrs+16(FP), AX + KMOVQ mask+24(FP), K1 + MOVQ n+32(FP), DX + MOVQ ·avx512md5consts+0(SB), DI #define a Z0 #define b Z1 @@ -90,7 +92,6 @@ TEXT ·block16(SB),4,$0-40 // Registers Z16 through to Z31 are used for caching purposes // ---------------------------------------------------------- - #define dig BX #define count DX #define base SI @@ -105,7 +106,7 @@ TEXT ·block16(SB),4,$0-40 // load source pointers VMOVUPD 0x00(AX), ptrs - MOVQ $-1, AX + MOVQ $-1, AX VPBROADCASTQ AX, ones loop: @@ -190,7 +191,7 @@ loop: ROUND3(c,d,a,b, Z31,0x2e,16) ROUND3(b,c,d,a, Z18,0x2f,23) - VXORPS d, ones, tmp + VPXORQ d, ones, tmp ROUND4(a,b,c,d, Z16,0x30, 6) ROUND4(d,a,b,c, Z23,0x31,10) diff --git a/vendor/github.com/minio/md5-simd/block8_amd64.s b/vendor/github.com/minio/md5-simd/block8_amd64.s index f5f1d9cab..f57db17aa 100644 --- a/vendor/github.com/minio/md5-simd/block8_amd64.s +++ b/vendor/github.com/minio/md5-simd/block8_amd64.s @@ -1,3 +1,5 @@ +//+build !noasm,!appengine,gc + // Copyright (c) 2018 Igneous Systems // MIT License // @@ -70,7 +72,7 @@ TEXT ·block8(SB), 4, $0-40 #define consts DI #define prepmask \ - VXORPS mask, mask, mask \ + VPXOR mask, mask, mask \ VPCMPGTD mask, off, mask #define prep(index) \ @@ -86,14 +88,14 @@ TEXT ·block8(SB), 4, $0-40 #define roll(shift, a) \ VPSLLD $shift, a, rtmp1 \ VPSRLD $32-shift, a, a \ - VORPS rtmp1, a, a + VPOR rtmp1, a, a #define ROUND1(a, b, c, d, index, const, shift) \ - VXORPS c, tmp, tmp \ + VPXOR c, tmp, tmp \ VPADDD 32*const(consts), a, a \ VPADDD mem, a, a \ - VANDPS b, tmp, tmp \ - VXORPS d, tmp, tmp \ + VPAND b, tmp, tmp \ + VPXOR d, tmp, tmp \ prep(index) \ VPADDD tmp, a, a \ roll(shift,a) \ @@ -101,11 +103,11 @@ TEXT ·block8(SB), 4, $0-40 VPADDD b, a, a #define ROUND1load(a, b, c, d, index, const, shift) \ - VXORPS c, tmp, tmp \ + VXORPD c, tmp, tmp \ VPADDD 32*const(consts), a, a \ VPADDD mem, a, a \ - VANDPS b, tmp, tmp \ - VXORPS d, tmp, tmp \ + VPAND b, tmp, tmp \ + VPXOR d, tmp, tmp \ load(index) \ VPADDD tmp, a, a \ roll(shift,a) \ @@ -115,10 +117,10 @@ TEXT ·block8(SB), 4, $0-40 #define ROUND2(a, b, c, d, index, const, shift) \ VPADDD 32*const(consts), a, a \ VPADDD mem, a, a \ - VANDPS b, tmp2, tmp2 \ - VANDNPS c, tmp, tmp \ + VPAND b, tmp2, tmp2 \ + VANDNPD c, tmp, tmp \ load(index) \ - VORPS tmp, tmp2, tmp2 \ + VPOR tmp, tmp2, tmp2 \ VMOVAPD c, tmp \ VPADDD tmp2, a, a \ VMOVAPD c, tmp2 \ @@ -129,8 +131,8 @@ TEXT ·block8(SB), 4, $0-40 VPADDD 32*const(consts), a, a \ VPADDD mem, a, a \ load(index) \ - VXORPS d, tmp, tmp \ - VXORPS b, tmp, tmp \ + VPXOR d, tmp, tmp \ + VPXOR b, tmp, tmp \ VPADDD tmp, a, a \ roll(shift,a) \ VMOVAPD b, tmp \ @@ -139,12 +141,12 @@ TEXT ·block8(SB), 4, $0-40 #define ROUND4(a, b, c, d, index, const, shift) \ VPADDD 32*const(consts), a, a \ VPADDD mem, a, a \ - VORPS b, tmp, tmp \ - VXORPS c, tmp, tmp \ + VPOR b, tmp, tmp \ + VPXOR c, tmp, tmp \ VPADDD tmp, a, a \ load(index) \ roll(shift,a) \ - VXORPS c, ones, tmp \ + VPXOR c, ones, tmp \ VPADDD b, a, a // load digest into state registers @@ -242,7 +244,7 @@ loop: ROUND3(b,c,d,a, 0,0x2f,23) load(0) - VXORPS d, ones, tmp + VPXOR d, ones, tmp ROUND4(a,b,c,d, 7,0x30, 6) ROUND4(d,a,b,c,14,0x31,10) diff --git a/vendor/github.com/minio/md5-simd/block_amd64.go b/vendor/github.com/minio/md5-simd/block_amd64.go index 27d6ce00e..16edda268 100644 --- a/vendor/github.com/minio/md5-simd/block_amd64.go +++ b/vendor/github.com/minio/md5-simd/block_amd64.go @@ -9,14 +9,18 @@ package md5simd import ( "fmt" "math" - "sync" "unsafe" - "github.com/klauspost/cpuid" + "github.com/klauspost/cpuid/v2" ) var hasAVX512 bool +func init() { + // VANDNPD requires AVX512DQ. Technically it could be VPTERNLOGQ which is AVX512F. + hasAVX512 = cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512DQ) +} + //go:noescape func block8(state *uint32, base uintptr, bufs *int32, cache *byte, n int) @@ -82,45 +86,52 @@ var avx512md5consts = func(c []uint32) []uint32 { return inf }(md5consts[:]) -func init() { - hasAVX512 = cpuid.CPU.AVX512F() -} - // Interface function to assembly code func (s *md5Server) blockMd5_x16(d *digest16, input [16][]byte, half bool) { if hasAVX512 { blockMd5_avx512(d, input, s.allBufs, &s.maskRounds16) - } else { - d8a, d8b := digest8{}, digest8{} - for i := range d8a.v0 { - j := i + 8 - d8a.v0[i], d8a.v1[i], d8a.v2[i], d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i] - if !half { - d8b.v0[i], d8b.v1[i], d8b.v2[i], d8b.v3[i] = d.v0[j], d.v1[j], d.v2[j], d.v3[j] - } - } + return + } + + // Preparing data using copy is slower since copies aren't inlined. - i8 := [2][8][]byte{} - for i := range i8[0] { - i8[0][i], i8[1][i] = input[i], input[8+i] + // Calculate on this goroutine + if half { + for i := range s.i8[0][:] { + s.i8[0][i] = input[i] } - if half { - blockMd5_avx2(&d8a, i8[0], s.allBufs, &s.maskRounds8a) - } else { - wg := sync.WaitGroup{} - wg.Add(2) - go func() { blockMd5_avx2(&d8a, i8[0], s.allBufs, &s.maskRounds8a); wg.Done() }() - go func() { blockMd5_avx2(&d8b, i8[1], s.allBufs, &s.maskRounds8b); wg.Done() }() - wg.Wait() + for i := range s.d8a.v0[:] { + s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i] } - - for i := range d8a.v0 { - j := i + 8 - d.v0[i], d.v1[i], d.v2[i], d.v3[i] = d8a.v0[i], d8a.v1[i], d8a.v2[i], d8a.v3[i] - if !half { - d.v0[j], d.v1[j], d.v2[j], d.v3[j] = d8b.v0[i], d8b.v1[i], d8b.v2[i], d8b.v3[i] - } + blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a) + for i := range s.d8a.v0[:] { + d.v0[i], d.v1[i], d.v2[i], d.v3[i] = s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] } + return + } + + for i := range s.i8[0][:] { + s.i8[0][i], s.i8[1][i] = input[i], input[8+i] + } + + for i := range s.d8a.v0[:] { + j := (i + 8) & 15 + s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i] + s.d8b.v0[i], s.d8b.v1[i], s.d8b.v2[i], s.d8b.v3[i] = d.v0[j], d.v1[j], d.v2[j], d.v3[j] + } + + // Benchmarks appears to be slightly faster when spinning up 2 goroutines instead + // of using the current for one of the blocks. + s.wg.Add(2) + go func() { blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a); s.wg.Done() }() + go func() { blockMd5_avx2(&s.d8b, s.i8[1], s.allBufs, &s.maskRounds8b); s.wg.Done() }() + s.wg.Wait() + for i := range s.d8a.v0[:] { + d.v0[i], d.v1[i], d.v2[i], d.v3[i] = s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] + } + for i := range s.d8b.v0[:] { + j := (i + 8) & 15 + d.v0[j], d.v1[j], d.v2[j], d.v3[j] = s.d8b.v0[i], s.d8b.v1[i], s.d8b.v2[i], s.d8b.v3[i] } } diff --git a/vendor/github.com/minio/md5-simd/md5-digest_amd64.go b/vendor/github.com/minio/md5-simd/md5-digest_amd64.go index fe10c7531..5ea23a499 100644 --- a/vendor/github.com/minio/md5-simd/md5-digest_amd64.go +++ b/vendor/github.com/minio/md5-simd/md5-digest_amd64.go @@ -10,6 +10,7 @@ import ( "encoding/binary" "errors" "fmt" + "sync" "sync/atomic" ) @@ -121,6 +122,14 @@ func (d *md5Digest) Close() { } } +var sumChPool sync.Pool + +func init() { + sumChPool.New = func() interface{} { + return make(chan sumResult, 1) + } +} + // Sum - Return MD5 sum in bytes func (d *md5Digest) Sum(in []byte) (result []byte) { if d.blocksCh == nil { @@ -148,10 +157,11 @@ func (d *md5Digest) Sum(in []byte) (result []byte) { if len(trail)%BlockSize != 0 { panic(fmt.Errorf("internal error: sum block was not aligned. len=%d, nx=%d", len(trail), d.nx)) } - sumCh := make(chan sumResult, 1) + sumCh := sumChPool.Get().(chan sumResult) d.sendBlock(blockInput{uid: d.uid, msg: trail, sumCh: sumCh}, true) sum := <-sumCh + sumChPool.Put(sumCh) return append(in, sum.digest[:]...) } diff --git a/vendor/github.com/minio/md5-simd/md5-server_amd64.go b/vendor/github.com/minio/md5-simd/md5-server_amd64.go index 461059537..94f741c54 100644 --- a/vendor/github.com/minio/md5-simd/md5-server_amd64.go +++ b/vendor/github.com/minio/md5-simd/md5-server_amd64.go @@ -10,8 +10,9 @@ import ( "encoding/binary" "fmt" "runtime" + "sync" - "github.com/klauspost/cpuid" + "github.com/klauspost/cpuid/v2" ) // MD5 initialization constants @@ -23,6 +24,9 @@ const ( init1 = 0xefcdab89 init2 = 0x98badcfe init3 = 0x10325476 + + // Use scalar routine when below this many lanes + useScalarBelow = 3 ) // md5ServerUID - Does not start at 0 but next multiple of 16 so as to be able to @@ -56,11 +60,15 @@ type md5Server struct { maskRounds8b [8]maskRounds // Pre-allocated static array for max 8 rounds (2nd AVX2 core) allBufs []byte // Preallocated buffer. buffers chan []byte // Preallocated buffers, sliced from allBufs. + + i8 [2][8][]byte // avx2 temporary vars + d8a, d8b digest8 + wg sync.WaitGroup } // NewServer - Create new object for parallel processing handling func NewServer() Server { - if !cpuid.CPU.AVX2() { + if !cpuid.CPU.Supports(cpuid.AVX2) { return &fallbackServer{} } md5srv := &md5Server{} @@ -152,7 +160,7 @@ func (s *md5Server) process(newClients chan newClient) { sum := sumResult{} // Add end block to current digest. - blockGeneric(&dig, block.msg) + blockScalar(&dig.s, block.msg) binary.LittleEndian.PutUint32(sum.digest[0:], dig.s[0]) binary.LittleEndian.PutUint32(sum.digest[4:], dig.s[1]) @@ -262,6 +270,88 @@ func (s *md5Server) Close() { // Invoke assembly and send results back func (s *md5Server) blocks(lanes []blockInput) { + if len(lanes) < useScalarBelow { + // Use scalar routine when below this many lanes + switch len(lanes) { + case 0: + case 1: + lane := lanes[0] + var d digest + a, ok := s.digests[lane.uid] + if ok { + d.s[0] = binary.LittleEndian.Uint32(a[0:4]) + d.s[1] = binary.LittleEndian.Uint32(a[4:8]) + d.s[2] = binary.LittleEndian.Uint32(a[8:12]) + d.s[3] = binary.LittleEndian.Uint32(a[12:16]) + } else { + d.s[0] = init0 + d.s[1] = init1 + d.s[2] = init2 + d.s[3] = init3 + } + if len(lane.msg) > 0 { + // Update... + blockScalar(&d.s, lane.msg) + } + dig := [Size]byte{} + binary.LittleEndian.PutUint32(dig[0:], d.s[0]) + binary.LittleEndian.PutUint32(dig[4:], d.s[1]) + binary.LittleEndian.PutUint32(dig[8:], d.s[2]) + binary.LittleEndian.PutUint32(dig[12:], d.s[3]) + s.digests[lane.uid] = dig + + if lane.msg != nil { + s.buffers <- lane.msg + } + lanes[0] = blockInput{} + + default: + s.wg.Add(len(lanes)) + var results [useScalarBelow]digest + for i := range lanes { + lane := lanes[i] + go func(i int) { + var d digest + defer s.wg.Done() + a, ok := s.digests[lane.uid] + if ok { + d.s[0] = binary.LittleEndian.Uint32(a[0:4]) + d.s[1] = binary.LittleEndian.Uint32(a[4:8]) + d.s[2] = binary.LittleEndian.Uint32(a[8:12]) + d.s[3] = binary.LittleEndian.Uint32(a[12:16]) + } else { + d.s[0] = init0 + d.s[1] = init1 + d.s[2] = init2 + d.s[3] = init3 + } + if len(lane.msg) == 0 { + results[i] = d + return + } + // Update... + blockScalar(&d.s, lane.msg) + results[i] = d + }(i) + } + s.wg.Wait() + for i, lane := range lanes { + dig := [Size]byte{} + binary.LittleEndian.PutUint32(dig[0:], results[i].s[0]) + binary.LittleEndian.PutUint32(dig[4:], results[i].s[1]) + binary.LittleEndian.PutUint32(dig[8:], results[i].s[2]) + binary.LittleEndian.PutUint32(dig[12:], results[i].s[3]) + s.digests[lane.uid] = dig + + if lane.msg != nil { + s.buffers <- lane.msg + } + lanes[i] = blockInput{} + } + } + return + } + inputs := [16][]byte{} for i := range lanes { inputs[i] = lanes[i].msg diff --git a/vendor/github.com/minio/md5-simd/md5-util_amd64.go b/vendor/github.com/minio/md5-simd/md5-util_amd64.go index 32bbae4a0..73981b0eb 100644 --- a/vendor/github.com/minio/md5-simd/md5-util_amd64.go +++ b/vendor/github.com/minio/md5-simd/md5-util_amd64.go @@ -1,19 +1,21 @@ +//+build !noasm,!appengine,gc + // Copyright (c) 2020 MinIO Inc. All rights reserved. // Use of this source code is governed by a license that can be // found in the LICENSE file. package md5simd -import ( - "sort" -) - // Helper struct for sorting blocks based on length type lane struct { len uint pos uint } +type digest struct { + s [4]uint32 +} + // Helper struct for generating number of rounds in combination with mask for valid lanes type maskRounds struct { mask uint64 @@ -23,15 +25,22 @@ type maskRounds struct { func generateMaskAndRounds8(input [8][]byte, mr *[8]maskRounds) (rounds int) { // Sort on blocks length small to large var sorted [8]lane - for c, inpt := range input { + for c, inpt := range input[:] { sorted[c] = lane{uint(len(inpt)), uint(c)} + for i := c - 1; i >= 0; i-- { + // swap so largest is at the end... + if sorted[i].len > sorted[i+1].len { + sorted[i], sorted[i+1] = sorted[i+1], sorted[i] + continue + } + break + } } - sort.Slice(sorted[:], func(i, j int) bool { return sorted[i].len < sorted[j].len }) // Create mask array including 'rounds' (of processing blocks of 64 bytes) between masks m, round := uint64(0xff), uint64(0) - for _, s := range sorted { + for _, s := range sorted[:] { if s.len > 0 { if uint64(s.len)>>6 > round { mr[rounds] = maskRounds{m, (uint64(s.len) >> 6) - round} @@ -45,18 +54,24 @@ func generateMaskAndRounds8(input [8][]byte, mr *[8]maskRounds) (rounds int) { } func generateMaskAndRounds16(input [16][]byte, mr *[16]maskRounds) (rounds int) { - // Sort on blocks length small to large var sorted [16]lane - for c, inpt := range input { + for c, inpt := range input[:] { sorted[c] = lane{uint(len(inpt)), uint(c)} + for i := c - 1; i >= 0; i-- { + // swap so largest is at the end... + if sorted[i].len > sorted[i+1].len { + sorted[i], sorted[i+1] = sorted[i+1], sorted[i] + continue + } + break + } } - sort.Slice(sorted[:], func(i, j int) bool { return sorted[i].len < sorted[j].len }) // Create mask array including 'rounds' (of processing blocks of 64 bytes) between masks m, round := uint64(0xffff), uint64(0) - for _, s := range sorted { + for _, s := range sorted[:] { if s.len > 0 { if uint64(s.len)>>6 > round { mr[rounds] = maskRounds{m, (uint64(s.len) >> 6) - round} diff --git a/vendor/github.com/minio/md5-simd/md5.go b/vendor/github.com/minio/md5-simd/md5.go index 4f56b79d0..11b0cb962 100644 --- a/vendor/github.com/minio/md5-simd/md5.go +++ b/vendor/github.com/minio/md5-simd/md5.go @@ -27,6 +27,12 @@ type Hasher interface { Close() } +// StdlibHasher returns a Hasher that uses the stdlib for hashing. +// Used hashers are stored in a pool for fast reuse. +func StdlibHasher() Hasher { + return &md5Wrapper{Hash: md5Pool.New().(hash.Hash)} +} + // md5Wrapper is a wrapper around the builtin hasher. type md5Wrapper struct { hash.Hash diff --git a/vendor/github.com/minio/md5-simd/md5block_amd64.go b/vendor/github.com/minio/md5-simd/md5block_amd64.go new file mode 100644 index 000000000..4c2793662 --- /dev/null +++ b/vendor/github.com/minio/md5-simd/md5block_amd64.go @@ -0,0 +1,11 @@ +// Code generated by command: go run gen.go -out ../md5block_amd64.s -stubs ../md5block_amd64.go -pkg=md5simd. DO NOT EDIT. + +// +build !appengine +// +build !noasm +// +build gc + +package md5simd + +// Encode p to digest +//go:noescape +func blockScalar(dig *[4]uint32, p []byte) diff --git a/vendor/github.com/minio/md5-simd/md5block_amd64.s b/vendor/github.com/minio/md5-simd/md5block_amd64.s new file mode 100644 index 000000000..fbc4a21f2 --- /dev/null +++ b/vendor/github.com/minio/md5-simd/md5block_amd64.s @@ -0,0 +1,714 @@ +// Code generated by command: go run gen.go -out ../md5block_amd64.s -stubs ../md5block_amd64.go -pkg=md5simd. DO NOT EDIT. + +// +build !appengine +// +build !noasm +// +build gc + +// func blockScalar(dig *[4]uint32, p []byte) +TEXT ·blockScalar(SB), $0-32 + MOVQ p_len+16(FP), AX + MOVQ dig+0(FP), CX + MOVQ p_base+8(FP), DX + SHRQ $0x06, AX + SHLQ $0x06, AX + LEAQ (DX)(AX*1), AX + CMPQ DX, AX + JEQ end + MOVL (CX), BX + MOVL 4(CX), BP + MOVL 8(CX), SI + MOVL 12(CX), CX + MOVL $0xffffffff, DI + +loop: + MOVL (DX), R8 + MOVL CX, R9 + MOVL BX, R10 + MOVL BP, R11 + MOVL SI, R12 + MOVL CX, R13 + + // ROUND1 + XORL SI, R9 + ADDL $0xd76aa478, BX + ADDL R8, BX + ANDL BP, R9 + XORL CX, R9 + MOVL 4(DX), R8 + ADDL R9, BX + ROLL $0x07, BX + MOVL SI, R9 + ADDL BP, BX + XORL BP, R9 + ADDL $0xe8c7b756, CX + ADDL R8, CX + ANDL BX, R9 + XORL SI, R9 + MOVL 8(DX), R8 + ADDL R9, CX + ROLL $0x0c, CX + MOVL BP, R9 + ADDL BX, CX + XORL BX, R9 + ADDL $0x242070db, SI + ADDL R8, SI + ANDL CX, R9 + XORL BP, R9 + MOVL 12(DX), R8 + ADDL R9, SI + ROLL $0x11, SI + MOVL BX, R9 + ADDL CX, SI + XORL CX, R9 + ADDL $0xc1bdceee, BP + ADDL R8, BP + ANDL SI, R9 + XORL BX, R9 + MOVL 16(DX), R8 + ADDL R9, BP + ROLL $0x16, BP + MOVL CX, R9 + ADDL SI, BP + XORL SI, R9 + ADDL $0xf57c0faf, BX + ADDL R8, BX + ANDL BP, R9 + XORL CX, R9 + MOVL 20(DX), R8 + ADDL R9, BX + ROLL $0x07, BX + MOVL SI, R9 + ADDL BP, BX + XORL BP, R9 + ADDL $0x4787c62a, CX + ADDL R8, CX + ANDL BX, R9 + XORL SI, R9 + MOVL 24(DX), R8 + ADDL R9, CX + ROLL $0x0c, CX + MOVL BP, R9 + ADDL BX, CX + XORL BX, R9 + ADDL $0xa8304613, SI + ADDL R8, SI + ANDL CX, R9 + XORL BP, R9 + MOVL 28(DX), R8 + ADDL R9, SI + ROLL $0x11, SI + MOVL BX, R9 + ADDL CX, SI + XORL CX, R9 + ADDL $0xfd469501, BP + ADDL R8, BP + ANDL SI, R9 + XORL BX, R9 + MOVL 32(DX), R8 + ADDL R9, BP + ROLL $0x16, BP + MOVL CX, R9 + ADDL SI, BP + XORL SI, R9 + ADDL $0x698098d8, BX + ADDL R8, BX + ANDL BP, R9 + XORL CX, R9 + MOVL 36(DX), R8 + ADDL R9, BX + ROLL $0x07, BX + MOVL SI, R9 + ADDL BP, BX + XORL BP, R9 + ADDL $0x8b44f7af, CX + ADDL R8, CX + ANDL BX, R9 + XORL SI, R9 + MOVL 40(DX), R8 + ADDL R9, CX + ROLL $0x0c, CX + MOVL BP, R9 + ADDL BX, CX + XORL BX, R9 + ADDL $0xffff5bb1, SI + ADDL R8, SI + ANDL CX, R9 + XORL BP, R9 + MOVL 44(DX), R8 + ADDL R9, SI + ROLL $0x11, SI + MOVL BX, R9 + ADDL CX, SI + XORL CX, R9 + ADDL $0x895cd7be, BP + ADDL R8, BP + ANDL SI, R9 + XORL BX, R9 + MOVL 48(DX), R8 + ADDL R9, BP + ROLL $0x16, BP + MOVL CX, R9 + ADDL SI, BP + XORL SI, R9 + ADDL $0x6b901122, BX + ADDL R8, BX + ANDL BP, R9 + XORL CX, R9 + MOVL 52(DX), R8 + ADDL R9, BX + ROLL $0x07, BX + MOVL SI, R9 + ADDL BP, BX + XORL BP, R9 + ADDL $0xfd987193, CX + ADDL R8, CX + ANDL BX, R9 + XORL SI, R9 + MOVL 56(DX), R8 + ADDL R9, CX + ROLL $0x0c, CX + MOVL BP, R9 + ADDL BX, CX + XORL BX, R9 + ADDL $0xa679438e, SI + ADDL R8, SI + ANDL CX, R9 + XORL BP, R9 + MOVL 60(DX), R8 + ADDL R9, SI + ROLL $0x11, SI + MOVL BX, R9 + ADDL CX, SI + XORL CX, R9 + ADDL $0x49b40821, BP + ADDL R8, BP + ANDL SI, R9 + XORL BX, R9 + MOVL 4(DX), R8 + ADDL R9, BP + ROLL $0x16, BP + MOVL CX, R9 + ADDL SI, BP + + // ROUND2 + MOVL CX, R9 + MOVL CX, R14 + XORL DI, R9 + ADDL $0xf61e2562, BX + ADDL R8, BX + ANDL BP, R14 + ANDL SI, R9 + MOVL 24(DX), R8 + ORL R9, R14 + MOVL SI, R9 + ADDL R14, BX + MOVL SI, R14 + ROLL $0x05, BX + ADDL BP, BX + XORL DI, R9 + ADDL $0xc040b340, CX + ADDL R8, CX + ANDL BX, R14 + ANDL BP, R9 + MOVL 44(DX), R8 + ORL R9, R14 + MOVL BP, R9 + ADDL R14, CX + MOVL BP, R14 + ROLL $0x09, CX + ADDL BX, CX + XORL DI, R9 + ADDL $0x265e5a51, SI + ADDL R8, SI + ANDL CX, R14 + ANDL BX, R9 + MOVL (DX), R8 + ORL R9, R14 + MOVL BX, R9 + ADDL R14, SI + MOVL BX, R14 + ROLL $0x0e, SI + ADDL CX, SI + XORL DI, R9 + ADDL $0xe9b6c7aa, BP + ADDL R8, BP + ANDL SI, R14 + ANDL CX, R9 + MOVL 20(DX), R8 + ORL R9, R14 + MOVL CX, R9 + ADDL R14, BP + MOVL CX, R14 + ROLL $0x14, BP + ADDL SI, BP + XORL DI, R9 + ADDL $0xd62f105d, BX + ADDL R8, BX + ANDL BP, R14 + ANDL SI, R9 + MOVL 40(DX), R8 + ORL R9, R14 + MOVL SI, R9 + ADDL R14, BX + MOVL SI, R14 + ROLL $0x05, BX + ADDL BP, BX + XORL DI, R9 + ADDL $0x02441453, CX + ADDL R8, CX + ANDL BX, R14 + ANDL BP, R9 + MOVL 60(DX), R8 + ORL R9, R14 + MOVL BP, R9 + ADDL R14, CX + MOVL BP, R14 + ROLL $0x09, CX + ADDL BX, CX + XORL DI, R9 + ADDL $0xd8a1e681, SI + ADDL R8, SI + ANDL CX, R14 + ANDL BX, R9 + MOVL 16(DX), R8 + ORL R9, R14 + MOVL BX, R9 + ADDL R14, SI + MOVL BX, R14 + ROLL $0x0e, SI + ADDL CX, SI + XORL DI, R9 + ADDL $0xe7d3fbc8, BP + ADDL R8, BP + ANDL SI, R14 + ANDL CX, R9 + MOVL 36(DX), R8 + ORL R9, R14 + MOVL CX, R9 + ADDL R14, BP + MOVL CX, R14 + ROLL $0x14, BP + ADDL SI, BP + XORL DI, R9 + ADDL $0x21e1cde6, BX + ADDL R8, BX + ANDL BP, R14 + ANDL SI, R9 + MOVL 56(DX), R8 + ORL R9, R14 + MOVL SI, R9 + ADDL R14, BX + MOVL SI, R14 + ROLL $0x05, BX + ADDL BP, BX + XORL DI, R9 + ADDL $0xc33707d6, CX + ADDL R8, CX + ANDL BX, R14 + ANDL BP, R9 + MOVL 12(DX), R8 + ORL R9, R14 + MOVL BP, R9 + ADDL R14, CX + MOVL BP, R14 + ROLL $0x09, CX + ADDL BX, CX + XORL DI, R9 + ADDL $0xf4d50d87, SI + ADDL R8, SI + ANDL CX, R14 + ANDL BX, R9 + MOVL 32(DX), R8 + ORL R9, R14 + MOVL BX, R9 + ADDL R14, SI + MOVL BX, R14 + ROLL $0x0e, SI + ADDL CX, SI + XORL DI, R9 + ADDL $0x455a14ed, BP + ADDL R8, BP + ANDL SI, R14 + ANDL CX, R9 + MOVL 52(DX), R8 + ORL R9, R14 + MOVL CX, R9 + ADDL R14, BP + MOVL CX, R14 + ROLL $0x14, BP + ADDL SI, BP + XORL DI, R9 + ADDL $0xa9e3e905, BX + ADDL R8, BX + ANDL BP, R14 + ANDL SI, R9 + MOVL 8(DX), R8 + ORL R9, R14 + MOVL SI, R9 + ADDL R14, BX + MOVL SI, R14 + ROLL $0x05, BX + ADDL BP, BX + XORL DI, R9 + ADDL $0xfcefa3f8, CX + ADDL R8, CX + ANDL BX, R14 + ANDL BP, R9 + MOVL 28(DX), R8 + ORL R9, R14 + MOVL BP, R9 + ADDL R14, CX + MOVL BP, R14 + ROLL $0x09, CX + ADDL BX, CX + XORL DI, R9 + ADDL $0x676f02d9, SI + ADDL R8, SI + ANDL CX, R14 + ANDL BX, R9 + MOVL 48(DX), R8 + ORL R9, R14 + MOVL BX, R9 + ADDL R14, SI + MOVL BX, R14 + ROLL $0x0e, SI + ADDL CX, SI + XORL DI, R9 + ADDL $0x8d2a4c8a, BP + ADDL R8, BP + ANDL SI, R14 + ANDL CX, R9 + MOVL 20(DX), R8 + ORL R9, R14 + MOVL CX, R9 + ADDL R14, BP + MOVL CX, R14 + ROLL $0x14, BP + ADDL SI, BP + + // ROUND3 + MOVL SI, R9 + ADDL $0xfffa3942, BX + ADDL R8, BX + MOVL 32(DX), R8 + XORL CX, R9 + XORL BP, R9 + ADDL R9, BX + ROLL $0x04, BX + MOVL BP, R9 + ADDL BP, BX + ADDL $0x8771f681, CX + ADDL R8, CX + MOVL 44(DX), R8 + XORL SI, R9 + XORL BX, R9 + ADDL R9, CX + ROLL $0x0b, CX + MOVL BX, R9 + ADDL BX, CX + ADDL $0x6d9d6122, SI + ADDL R8, SI + MOVL 56(DX), R8 + XORL BP, R9 + XORL CX, R9 + ADDL R9, SI + ROLL $0x10, SI + MOVL CX, R9 + ADDL CX, SI + ADDL $0xfde5380c, BP + ADDL R8, BP + MOVL 4(DX), R8 + XORL BX, R9 + XORL SI, R9 + ADDL R9, BP + ROLL $0x17, BP + MOVL SI, R9 + ADDL SI, BP + ADDL $0xa4beea44, BX + ADDL R8, BX + MOVL 16(DX), R8 + XORL CX, R9 + XORL BP, R9 + ADDL R9, BX + ROLL $0x04, BX + MOVL BP, R9 + ADDL BP, BX + ADDL $0x4bdecfa9, CX + ADDL R8, CX + MOVL 28(DX), R8 + XORL SI, R9 + XORL BX, R9 + ADDL R9, CX + ROLL $0x0b, CX + MOVL BX, R9 + ADDL BX, CX + ADDL $0xf6bb4b60, SI + ADDL R8, SI + MOVL 40(DX), R8 + XORL BP, R9 + XORL CX, R9 + ADDL R9, SI + ROLL $0x10, SI + MOVL CX, R9 + ADDL CX, SI + ADDL $0xbebfbc70, BP + ADDL R8, BP + MOVL 52(DX), R8 + XORL BX, R9 + XORL SI, R9 + ADDL R9, BP + ROLL $0x17, BP + MOVL SI, R9 + ADDL SI, BP + ADDL $0x289b7ec6, BX + ADDL R8, BX + MOVL (DX), R8 + XORL CX, R9 + XORL BP, R9 + ADDL R9, BX + ROLL $0x04, BX + MOVL BP, R9 + ADDL BP, BX + ADDL $0xeaa127fa, CX + ADDL R8, CX + MOVL 12(DX), R8 + XORL SI, R9 + XORL BX, R9 + ADDL R9, CX + ROLL $0x0b, CX + MOVL BX, R9 + ADDL BX, CX + ADDL $0xd4ef3085, SI + ADDL R8, SI + MOVL 24(DX), R8 + XORL BP, R9 + XORL CX, R9 + ADDL R9, SI + ROLL $0x10, SI + MOVL CX, R9 + ADDL CX, SI + ADDL $0x04881d05, BP + ADDL R8, BP + MOVL 36(DX), R8 + XORL BX, R9 + XORL SI, R9 + ADDL R9, BP + ROLL $0x17, BP + MOVL SI, R9 + ADDL SI, BP + ADDL $0xd9d4d039, BX + ADDL R8, BX + MOVL 48(DX), R8 + XORL CX, R9 + XORL BP, R9 + ADDL R9, BX + ROLL $0x04, BX + MOVL BP, R9 + ADDL BP, BX + ADDL $0xe6db99e5, CX + ADDL R8, CX + MOVL 60(DX), R8 + XORL SI, R9 + XORL BX, R9 + ADDL R9, CX + ROLL $0x0b, CX + MOVL BX, R9 + ADDL BX, CX + ADDL $0x1fa27cf8, SI + ADDL R8, SI + MOVL 8(DX), R8 + XORL BP, R9 + XORL CX, R9 + ADDL R9, SI + ROLL $0x10, SI + MOVL CX, R9 + ADDL CX, SI + ADDL $0xc4ac5665, BP + ADDL R8, BP + MOVL (DX), R8 + XORL BX, R9 + XORL SI, R9 + ADDL R9, BP + ROLL $0x17, BP + MOVL SI, R9 + ADDL SI, BP + + // ROUND4 + MOVL DI, R9 + XORL CX, R9 + ADDL $0xf4292244, BX + ADDL R8, BX + ORL BP, R9 + XORL SI, R9 + ADDL R9, BX + MOVL 28(DX), R8 + MOVL DI, R9 + ROLL $0x06, BX + XORL SI, R9 + ADDL BP, BX + ADDL $0x432aff97, CX + ADDL R8, CX + ORL BX, R9 + XORL BP, R9 + ADDL R9, CX + MOVL 56(DX), R8 + MOVL DI, R9 + ROLL $0x0a, CX + XORL BP, R9 + ADDL BX, CX + ADDL $0xab9423a7, SI + ADDL R8, SI + ORL CX, R9 + XORL BX, R9 + ADDL R9, SI + MOVL 20(DX), R8 + MOVL DI, R9 + ROLL $0x0f, SI + XORL BX, R9 + ADDL CX, SI + ADDL $0xfc93a039, BP + ADDL R8, BP + ORL SI, R9 + XORL CX, R9 + ADDL R9, BP + MOVL 48(DX), R8 + MOVL DI, R9 + ROLL $0x15, BP + XORL CX, R9 + ADDL SI, BP + ADDL $0x655b59c3, BX + ADDL R8, BX + ORL BP, R9 + XORL SI, R9 + ADDL R9, BX + MOVL 12(DX), R8 + MOVL DI, R9 + ROLL $0x06, BX + XORL SI, R9 + ADDL BP, BX + ADDL $0x8f0ccc92, CX + ADDL R8, CX + ORL BX, R9 + XORL BP, R9 + ADDL R9, CX + MOVL 40(DX), R8 + MOVL DI, R9 + ROLL $0x0a, CX + XORL BP, R9 + ADDL BX, CX + ADDL $0xffeff47d, SI + ADDL R8, SI + ORL CX, R9 + XORL BX, R9 + ADDL R9, SI + MOVL 4(DX), R8 + MOVL DI, R9 + ROLL $0x0f, SI + XORL BX, R9 + ADDL CX, SI + ADDL $0x85845dd1, BP + ADDL R8, BP + ORL SI, R9 + XORL CX, R9 + ADDL R9, BP + MOVL 32(DX), R8 + MOVL DI, R9 + ROLL $0x15, BP + XORL CX, R9 + ADDL SI, BP + ADDL $0x6fa87e4f, BX + ADDL R8, BX + ORL BP, R9 + XORL SI, R9 + ADDL R9, BX + MOVL 60(DX), R8 + MOVL DI, R9 + ROLL $0x06, BX + XORL SI, R9 + ADDL BP, BX + ADDL $0xfe2ce6e0, CX + ADDL R8, CX + ORL BX, R9 + XORL BP, R9 + ADDL R9, CX + MOVL 24(DX), R8 + MOVL DI, R9 + ROLL $0x0a, CX + XORL BP, R9 + ADDL BX, CX + ADDL $0xa3014314, SI + ADDL R8, SI + ORL CX, R9 + XORL BX, R9 + ADDL R9, SI + MOVL 52(DX), R8 + MOVL DI, R9 + ROLL $0x0f, SI + XORL BX, R9 + ADDL CX, SI + ADDL $0x4e0811a1, BP + ADDL R8, BP + ORL SI, R9 + XORL CX, R9 + ADDL R9, BP + MOVL 16(DX), R8 + MOVL DI, R9 + ROLL $0x15, BP + XORL CX, R9 + ADDL SI, BP + ADDL $0xf7537e82, BX + ADDL R8, BX + ORL BP, R9 + XORL SI, R9 + ADDL R9, BX + MOVL 44(DX), R8 + MOVL DI, R9 + ROLL $0x06, BX + XORL SI, R9 + ADDL BP, BX + ADDL $0xbd3af235, CX + ADDL R8, CX + ORL BX, R9 + XORL BP, R9 + ADDL R9, CX + MOVL 8(DX), R8 + MOVL DI, R9 + ROLL $0x0a, CX + XORL BP, R9 + ADDL BX, CX + ADDL $0x2ad7d2bb, SI + ADDL R8, SI + ORL CX, R9 + XORL BX, R9 + ADDL R9, SI + MOVL 36(DX), R8 + MOVL DI, R9 + ROLL $0x0f, SI + XORL BX, R9 + ADDL CX, SI + ADDL $0xeb86d391, BP + ADDL R8, BP + ORL SI, R9 + XORL CX, R9 + ADDL R9, BP + ROLL $0x15, BP + ADDL SI, BP + ADDL R10, BX + ADDL R11, BP + ADDL R12, SI + ADDL R13, CX + + // Prepare next loop + ADDQ $0x40, DX + CMPQ DX, AX + JB loop + + // Write output + MOVQ dig+0(FP), AX + MOVL BX, (AX) + MOVL BP, 4(AX) + MOVL SI, 8(AX) + MOVL CX, 12(AX) + +end: + RET |