summaryrefslogtreecommitdiff
path: root/vendor/github.com/minio/md5-simd
diff options
context:
space:
mode:
authorLibravatar kim <89579420+NyaaaWhatsUpDoc@users.noreply.github.com>2022-09-28 18:30:40 +0100
committerLibravatar GitHub <noreply@github.com>2022-09-28 18:30:40 +0100
commita156188b3eb5cb3da44aa1b7452265f5fa38a607 (patch)
tree7097fa48d56fbabc7c2c8750b1f3bc9321d71c0f /vendor/github.com/minio/md5-simd
parent[bugfix] Fix emphasis being added to emoji shortcodes with markdown parsing (... (diff)
downloadgotosocial-a156188b3eb5cb3da44aa1b7452265f5fa38a607.tar.xz
[chore] update dependencies, bump to Go 1.19.1 (#826)
* update dependencies, bump Go version to 1.19 * bump test image Go version * update golangci-lint * update gotosocial-drone-build * sign * linting, go fmt * update swagger docs * update swagger docs * whitespace * update contributing.md * fuckin whoopsie doopsie * linterino, linteroni * fix followrequest test not starting processor * fix other api/client tests not starting processor * fix remaining tests where processor not started * bump go-runners version * don't check last-webfingered-at, processor may have updated this * update swagger command * update bun to latest version * fix embed to work the same as before with new bun Signed-off-by: kim <grufwub@gmail.com> Co-authored-by: tsmethurst <tobi.smethurst@protonmail.com>
Diffstat (limited to 'vendor/github.com/minio/md5-simd')
-rw-r--r--vendor/github.com/minio/md5-simd/LICENSE.Golang27
-rw-r--r--vendor/github.com/minio/md5-simd/README.md2
-rw-r--r--vendor/github.com/minio/md5-simd/block-generic.go132
-rw-r--r--vendor/github.com/minio/md5-simd/block16_amd64.s107
-rw-r--r--vendor/github.com/minio/md5-simd/block8_amd64.s36
-rw-r--r--vendor/github.com/minio/md5-simd/block_amd64.go77
-rw-r--r--vendor/github.com/minio/md5-simd/md5-digest_amd64.go12
-rw-r--r--vendor/github.com/minio/md5-simd/md5-server_amd64.go96
-rw-r--r--vendor/github.com/minio/md5-simd/md5-util_amd64.go37
-rw-r--r--vendor/github.com/minio/md5-simd/md5.go6
-rw-r--r--vendor/github.com/minio/md5-simd/md5block_amd64.go11
-rw-r--r--vendor/github.com/minio/md5-simd/md5block_amd64.s714
12 files changed, 1007 insertions, 250 deletions
diff --git a/vendor/github.com/minio/md5-simd/LICENSE.Golang b/vendor/github.com/minio/md5-simd/LICENSE.Golang
new file mode 100644
index 000000000..6a66aea5e
--- /dev/null
+++ b/vendor/github.com/minio/md5-simd/LICENSE.Golang
@@ -0,0 +1,27 @@
+Copyright (c) 2009 The Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/github.com/minio/md5-simd/README.md b/vendor/github.com/minio/md5-simd/README.md
index 374214d1a..fa6fce1a4 100644
--- a/vendor/github.com/minio/md5-simd/README.md
+++ b/vendor/github.com/minio/md5-simd/README.md
@@ -116,6 +116,8 @@ BenchmarkParallel/8MB-4 2182.48 17252.88 7.91x
These measurements were performed on AWS EC2 instance of type `c5.xlarge` equipped with a Xeon Platinum 8124M CPU at 3.0 GHz.
+If only one or two inputs are available, the scalar calculation method will be used, as it
+gives the best speed in these cases.
## Operation
diff --git a/vendor/github.com/minio/md5-simd/block-generic.go b/vendor/github.com/minio/md5-simd/block-generic.go
deleted file mode 100644
index eb333b93f..000000000
--- a/vendor/github.com/minio/md5-simd/block-generic.go
+++ /dev/null
@@ -1,132 +0,0 @@
-// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Code generated by go run gen.go -output md5block.go; DO NOT EDIT.
-
-package md5simd
-
-import (
- "encoding/binary"
- "math/bits"
-)
-
-type digest struct {
- s [4]uint32
- x [BlockSize]byte
- nx int
- len uint64
-}
-
-func blockGeneric(dig *digest, p []byte) {
- // load state
- a, b, c, d := dig.s[0], dig.s[1], dig.s[2], dig.s[3]
-
- for i := 0; i <= len(p)-BlockSize; i += BlockSize {
- // eliminate bounds checks on p
- q := p[i:]
- q = q[:BlockSize:BlockSize]
-
- // save current state
- aa, bb, cc, dd := a, b, c, d
-
- // load input block
- x0 := binary.LittleEndian.Uint32(q[4*0x0:])
- x1 := binary.LittleEndian.Uint32(q[4*0x1:])
- x2 := binary.LittleEndian.Uint32(q[4*0x2:])
- x3 := binary.LittleEndian.Uint32(q[4*0x3:])
- x4 := binary.LittleEndian.Uint32(q[4*0x4:])
- x5 := binary.LittleEndian.Uint32(q[4*0x5:])
- x6 := binary.LittleEndian.Uint32(q[4*0x6:])
- x7 := binary.LittleEndian.Uint32(q[4*0x7:])
- x8 := binary.LittleEndian.Uint32(q[4*0x8:])
- x9 := binary.LittleEndian.Uint32(q[4*0x9:])
- xa := binary.LittleEndian.Uint32(q[4*0xa:])
- xb := binary.LittleEndian.Uint32(q[4*0xb:])
- xc := binary.LittleEndian.Uint32(q[4*0xc:])
- xd := binary.LittleEndian.Uint32(q[4*0xd:])
- xe := binary.LittleEndian.Uint32(q[4*0xe:])
- xf := binary.LittleEndian.Uint32(q[4*0xf:])
-
- // round 1
- a = b + bits.RotateLeft32((((c^d)&b)^d)+a+x0+0xd76aa478, 7)
- d = a + bits.RotateLeft32((((b^c)&a)^c)+d+x1+0xe8c7b756, 12)
- c = d + bits.RotateLeft32((((a^b)&d)^b)+c+x2+0x242070db, 17)
- b = c + bits.RotateLeft32((((d^a)&c)^a)+b+x3+0xc1bdceee, 22)
- a = b + bits.RotateLeft32((((c^d)&b)^d)+a+x4+0xf57c0faf, 7)
- d = a + bits.RotateLeft32((((b^c)&a)^c)+d+x5+0x4787c62a, 12)
- c = d + bits.RotateLeft32((((a^b)&d)^b)+c+x6+0xa8304613, 17)
- b = c + bits.RotateLeft32((((d^a)&c)^a)+b+x7+0xfd469501, 22)
- a = b + bits.RotateLeft32((((c^d)&b)^d)+a+x8+0x698098d8, 7)
- d = a + bits.RotateLeft32((((b^c)&a)^c)+d+x9+0x8b44f7af, 12)
- c = d + bits.RotateLeft32((((a^b)&d)^b)+c+xa+0xffff5bb1, 17)
- b = c + bits.RotateLeft32((((d^a)&c)^a)+b+xb+0x895cd7be, 22)
- a = b + bits.RotateLeft32((((c^d)&b)^d)+a+xc+0x6b901122, 7)
- d = a + bits.RotateLeft32((((b^c)&a)^c)+d+xd+0xfd987193, 12)
- c = d + bits.RotateLeft32((((a^b)&d)^b)+c+xe+0xa679438e, 17)
- b = c + bits.RotateLeft32((((d^a)&c)^a)+b+xf+0x49b40821, 22)
-
- // round 2
- a = b + bits.RotateLeft32((((b^c)&d)^c)+a+x1+0xf61e2562, 5)
- d = a + bits.RotateLeft32((((a^b)&c)^b)+d+x6+0xc040b340, 9)
- c = d + bits.RotateLeft32((((d^a)&b)^a)+c+xb+0x265e5a51, 14)
- b = c + bits.RotateLeft32((((c^d)&a)^d)+b+x0+0xe9b6c7aa, 20)
- a = b + bits.RotateLeft32((((b^c)&d)^c)+a+x5+0xd62f105d, 5)
- d = a + bits.RotateLeft32((((a^b)&c)^b)+d+xa+0x02441453, 9)
- c = d + bits.RotateLeft32((((d^a)&b)^a)+c+xf+0xd8a1e681, 14)
- b = c + bits.RotateLeft32((((c^d)&a)^d)+b+x4+0xe7d3fbc8, 20)
- a = b + bits.RotateLeft32((((b^c)&d)^c)+a+x9+0x21e1cde6, 5)
- d = a + bits.RotateLeft32((((a^b)&c)^b)+d+xe+0xc33707d6, 9)
- c = d + bits.RotateLeft32((((d^a)&b)^a)+c+x3+0xf4d50d87, 14)
- b = c + bits.RotateLeft32((((c^d)&a)^d)+b+x8+0x455a14ed, 20)
- a = b + bits.RotateLeft32((((b^c)&d)^c)+a+xd+0xa9e3e905, 5)
- d = a + bits.RotateLeft32((((a^b)&c)^b)+d+x2+0xfcefa3f8, 9)
- c = d + bits.RotateLeft32((((d^a)&b)^a)+c+x7+0x676f02d9, 14)
- b = c + bits.RotateLeft32((((c^d)&a)^d)+b+xc+0x8d2a4c8a, 20)
-
- // round 3
- a = b + bits.RotateLeft32((b^c^d)+a+x5+0xfffa3942, 4)
- d = a + bits.RotateLeft32((a^b^c)+d+x8+0x8771f681, 11)
- c = d + bits.RotateLeft32((d^a^b)+c+xb+0x6d9d6122, 16)
- b = c + bits.RotateLeft32((c^d^a)+b+xe+0xfde5380c, 23)
- a = b + bits.RotateLeft32((b^c^d)+a+x1+0xa4beea44, 4)
- d = a + bits.RotateLeft32((a^b^c)+d+x4+0x4bdecfa9, 11)
- c = d + bits.RotateLeft32((d^a^b)+c+x7+0xf6bb4b60, 16)
- b = c + bits.RotateLeft32((c^d^a)+b+xa+0xbebfbc70, 23)
- a = b + bits.RotateLeft32((b^c^d)+a+xd+0x289b7ec6, 4)
- d = a + bits.RotateLeft32((a^b^c)+d+x0+0xeaa127fa, 11)
- c = d + bits.RotateLeft32((d^a^b)+c+x3+0xd4ef3085, 16)
- b = c + bits.RotateLeft32((c^d^a)+b+x6+0x04881d05, 23)
- a = b + bits.RotateLeft32((b^c^d)+a+x9+0xd9d4d039, 4)
- d = a + bits.RotateLeft32((a^b^c)+d+xc+0xe6db99e5, 11)
- c = d + bits.RotateLeft32((d^a^b)+c+xf+0x1fa27cf8, 16)
- b = c + bits.RotateLeft32((c^d^a)+b+x2+0xc4ac5665, 23)
-
- // round 4
- a = b + bits.RotateLeft32((c^(b|^d))+a+x0+0xf4292244, 6)
- d = a + bits.RotateLeft32((b^(a|^c))+d+x7+0x432aff97, 10)
- c = d + bits.RotateLeft32((a^(d|^b))+c+xe+0xab9423a7, 15)
- b = c + bits.RotateLeft32((d^(c|^a))+b+x5+0xfc93a039, 21)
- a = b + bits.RotateLeft32((c^(b|^d))+a+xc+0x655b59c3, 6)
- d = a + bits.RotateLeft32((b^(a|^c))+d+x3+0x8f0ccc92, 10)
- c = d + bits.RotateLeft32((a^(d|^b))+c+xa+0xffeff47d, 15)
- b = c + bits.RotateLeft32((d^(c|^a))+b+x1+0x85845dd1, 21)
- a = b + bits.RotateLeft32((c^(b|^d))+a+x8+0x6fa87e4f, 6)
- d = a + bits.RotateLeft32((b^(a|^c))+d+xf+0xfe2ce6e0, 10)
- c = d + bits.RotateLeft32((a^(d|^b))+c+x6+0xa3014314, 15)
- b = c + bits.RotateLeft32((d^(c|^a))+b+xd+0x4e0811a1, 21)
- a = b + bits.RotateLeft32((c^(b|^d))+a+x4+0xf7537e82, 6)
- d = a + bits.RotateLeft32((b^(a|^c))+d+xb+0xbd3af235, 10)
- c = d + bits.RotateLeft32((a^(d|^b))+c+x2+0x2ad7d2bb, 15)
- b = c + bits.RotateLeft32((d^(c|^a))+b+x9+0xeb86d391, 21)
-
- // add saved state
- a += aa
- b += bb
- c += cc
- d += dd
- }
-
- // save state
- dig.s[0], dig.s[1], dig.s[2], dig.s[3] = a, b, c, d
-}
diff --git a/vendor/github.com/minio/md5-simd/block16_amd64.s b/vendor/github.com/minio/md5-simd/block16_amd64.s
index d32c12200..be0a43a3b 100644
--- a/vendor/github.com/minio/md5-simd/block16_amd64.s
+++ b/vendor/github.com/minio/md5-simd/block16_amd64.s
@@ -2,70 +2,72 @@
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
+//+build !noasm,!appengine,gc
+
// This is the AVX512 implementation of the MD5 block function (16-way parallel)
#define prep(index) \
- KMOVQ kmask, ktmp \
+ KMOVQ kmask, ktmp \
VPGATHERDD index*4(base)(ptrs*1), ktmp, mem
#define ROUND1(a, b, c, d, index, const, shift) \
- VXORPS c, tmp, tmp \
- VPADDD 64*const(consts), a, a \
- VPADDD mem, a, a \
- VPTERNLOGD $0x6C, b, d, tmp \
- prep(index) \
- VPADDD tmp, a, a \
- VPROLD $shift, a, a \
- VMOVAPD c, tmp \
- VPADDD b, a, a
+ VPXORQ c, tmp, tmp \
+ VPADDD 64*const(consts), a, a \
+ VPADDD mem, a, a \
+ VPTERNLOGD $0x6C, b, d, tmp \
+ prep(index) \
+ VPADDD tmp, a, a \
+ VPROLD $shift, a, a \
+ VMOVAPD c, tmp \
+ VPADDD b, a, a
#define ROUND1noload(a, b, c, d, const, shift) \
- VXORPS c, tmp, tmp \
- VPADDD 64*const(consts), a, a \
- VPADDD mem, a, a \
- VPTERNLOGD $0x6C, b, d, tmp \
- VPADDD tmp, a, a \
- VPROLD $shift, a, a \
- VMOVAPD c, tmp \
- VPADDD b, a, a
+ VPXORQ c, tmp, tmp \
+ VPADDD 64*const(consts), a, a \
+ VPADDD mem, a, a \
+ VPTERNLOGD $0x6C, b, d, tmp \
+ VPADDD tmp, a, a \
+ VPROLD $shift, a, a \
+ VMOVAPD c, tmp \
+ VPADDD b, a, a
#define ROUND2(a, b, c, d, zreg, const, shift) \
- VPADDD 64*const(consts), a, a \
- VPADDD zreg, a, a \
- VANDNPS c, tmp, tmp \
- VPTERNLOGD $0xEC, b, tmp, tmp2 \
- VMOVAPD c, tmp \
- VPADDD tmp2, a, a \
- VMOVAPD c, tmp2 \
- VPROLD $shift, a, a \
- VPADDD b, a, a
+ VPADDD 64*const(consts), a, a \
+ VPADDD zreg, a, a \
+ VANDNPD c, tmp, tmp \
+ VPTERNLOGD $0xEC, b, tmp, tmp2 \
+ VMOVAPD c, tmp \
+ VPADDD tmp2, a, a \
+ VMOVAPD c, tmp2 \
+ VPROLD $shift, a, a \
+ VPADDD b, a, a
#define ROUND3(a, b, c, d, zreg, const, shift) \
- VPADDD 64*const(consts), a, a \
- VPADDD zreg, a, a \
- VPTERNLOGD $0x96, b, d, tmp \
- VPADDD tmp, a, a \
- VPROLD $shift, a, a \
- VMOVAPD b, tmp \
- VPADDD b, a, a
+ VPADDD 64*const(consts), a, a \
+ VPADDD zreg, a, a \
+ VPTERNLOGD $0x96, b, d, tmp \
+ VPADDD tmp, a, a \
+ VPROLD $shift, a, a \
+ VMOVAPD b, tmp \
+ VPADDD b, a, a
#define ROUND4(a, b, c, d, zreg, const, shift) \
- VPADDD 64*const(consts), a, a \
- VPADDD zreg, a, a \
- VPTERNLOGD $0x36, b, c, tmp \
- VPADDD tmp, a, a \
- VPROLD $shift, a, a \
- VXORPS c, ones, tmp \
- VPADDD b, a, a
-
-TEXT ·block16(SB),4,$0-40
-
- MOVQ state+0(FP), BX
- MOVQ base+8(FP), SI
- MOVQ ptrs+16(FP), AX
- KMOVQ mask+24(FP), K1
- MOVQ n+32(FP), DX
- MOVQ ·avx512md5consts+0(SB), DI
+ VPADDD 64*const(consts), a, a \
+ VPADDD zreg, a, a \
+ VPTERNLOGD $0x36, b, c, tmp \
+ VPADDD tmp, a, a \
+ VPROLD $shift, a, a \
+ VPXORQ c, ones, tmp \
+ VPADDD b, a, a
+
+TEXT ·block16(SB), 4, $0-40
+
+ MOVQ state+0(FP), BX
+ MOVQ base+8(FP), SI
+ MOVQ ptrs+16(FP), AX
+ KMOVQ mask+24(FP), K1
+ MOVQ n+32(FP), DX
+ MOVQ ·avx512md5consts+0(SB), DI
#define a Z0
#define b Z1
@@ -90,7 +92,6 @@ TEXT ·block16(SB),4,$0-40
// Registers Z16 through to Z31 are used for caching purposes
// ----------------------------------------------------------
-
#define dig BX
#define count DX
#define base SI
@@ -105,7 +106,7 @@ TEXT ·block16(SB),4,$0-40
// load source pointers
VMOVUPD 0x00(AX), ptrs
- MOVQ $-1, AX
+ MOVQ $-1, AX
VPBROADCASTQ AX, ones
loop:
@@ -190,7 +191,7 @@ loop:
ROUND3(c,d,a,b, Z31,0x2e,16)
ROUND3(b,c,d,a, Z18,0x2f,23)
- VXORPS d, ones, tmp
+ VPXORQ d, ones, tmp
ROUND4(a,b,c,d, Z16,0x30, 6)
ROUND4(d,a,b,c, Z23,0x31,10)
diff --git a/vendor/github.com/minio/md5-simd/block8_amd64.s b/vendor/github.com/minio/md5-simd/block8_amd64.s
index f5f1d9cab..f57db17aa 100644
--- a/vendor/github.com/minio/md5-simd/block8_amd64.s
+++ b/vendor/github.com/minio/md5-simd/block8_amd64.s
@@ -1,3 +1,5 @@
+//+build !noasm,!appengine,gc
+
// Copyright (c) 2018 Igneous Systems
// MIT License
//
@@ -70,7 +72,7 @@ TEXT ·block8(SB), 4, $0-40
#define consts DI
#define prepmask \
- VXORPS mask, mask, mask \
+ VPXOR mask, mask, mask \
VPCMPGTD mask, off, mask
#define prep(index) \
@@ -86,14 +88,14 @@ TEXT ·block8(SB), 4, $0-40
#define roll(shift, a) \
VPSLLD $shift, a, rtmp1 \
VPSRLD $32-shift, a, a \
- VORPS rtmp1, a, a
+ VPOR rtmp1, a, a
#define ROUND1(a, b, c, d, index, const, shift) \
- VXORPS c, tmp, tmp \
+ VPXOR c, tmp, tmp \
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
- VANDPS b, tmp, tmp \
- VXORPS d, tmp, tmp \
+ VPAND b, tmp, tmp \
+ VPXOR d, tmp, tmp \
prep(index) \
VPADDD tmp, a, a \
roll(shift,a) \
@@ -101,11 +103,11 @@ TEXT ·block8(SB), 4, $0-40
VPADDD b, a, a
#define ROUND1load(a, b, c, d, index, const, shift) \
- VXORPS c, tmp, tmp \
+ VXORPD c, tmp, tmp \
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
- VANDPS b, tmp, tmp \
- VXORPS d, tmp, tmp \
+ VPAND b, tmp, tmp \
+ VPXOR d, tmp, tmp \
load(index) \
VPADDD tmp, a, a \
roll(shift,a) \
@@ -115,10 +117,10 @@ TEXT ·block8(SB), 4, $0-40
#define ROUND2(a, b, c, d, index, const, shift) \
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
- VANDPS b, tmp2, tmp2 \
- VANDNPS c, tmp, tmp \
+ VPAND b, tmp2, tmp2 \
+ VANDNPD c, tmp, tmp \
load(index) \
- VORPS tmp, tmp2, tmp2 \
+ VPOR tmp, tmp2, tmp2 \
VMOVAPD c, tmp \
VPADDD tmp2, a, a \
VMOVAPD c, tmp2 \
@@ -129,8 +131,8 @@ TEXT ·block8(SB), 4, $0-40
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
load(index) \
- VXORPS d, tmp, tmp \
- VXORPS b, tmp, tmp \
+ VPXOR d, tmp, tmp \
+ VPXOR b, tmp, tmp \
VPADDD tmp, a, a \
roll(shift,a) \
VMOVAPD b, tmp \
@@ -139,12 +141,12 @@ TEXT ·block8(SB), 4, $0-40
#define ROUND4(a, b, c, d, index, const, shift) \
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
- VORPS b, tmp, tmp \
- VXORPS c, tmp, tmp \
+ VPOR b, tmp, tmp \
+ VPXOR c, tmp, tmp \
VPADDD tmp, a, a \
load(index) \
roll(shift,a) \
- VXORPS c, ones, tmp \
+ VPXOR c, ones, tmp \
VPADDD b, a, a
// load digest into state registers
@@ -242,7 +244,7 @@ loop:
ROUND3(b,c,d,a, 0,0x2f,23)
load(0)
- VXORPS d, ones, tmp
+ VPXOR d, ones, tmp
ROUND4(a,b,c,d, 7,0x30, 6)
ROUND4(d,a,b,c,14,0x31,10)
diff --git a/vendor/github.com/minio/md5-simd/block_amd64.go b/vendor/github.com/minio/md5-simd/block_amd64.go
index 27d6ce00e..16edda268 100644
--- a/vendor/github.com/minio/md5-simd/block_amd64.go
+++ b/vendor/github.com/minio/md5-simd/block_amd64.go
@@ -9,14 +9,18 @@ package md5simd
import (
"fmt"
"math"
- "sync"
"unsafe"
- "github.com/klauspost/cpuid"
+ "github.com/klauspost/cpuid/v2"
)
var hasAVX512 bool
+// init records whether the AVX512 code path may be used by blockMd5_x16.
+func init() {
+ // The 16-way kernel uses VANDNPD, which requires AVX512DQ in addition to AVX512F.
+ // Technically it could be VPTERNLOGQ, which is AVX512F, removing that requirement.
+ hasAVX512 = cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512DQ)
+}
+
//go:noescape
func block8(state *uint32, base uintptr, bufs *int32, cache *byte, n int)
@@ -82,45 +86,52 @@ var avx512md5consts = func(c []uint32) []uint32 {
return inf
}(md5consts[:])
-func init() {
- hasAVX512 = cpuid.CPU.AVX512F()
-}
-
// Interface function to assembly code
func (s *md5Server) blockMd5_x16(d *digest16, input [16][]byte, half bool) {
if hasAVX512 {
blockMd5_avx512(d, input, s.allBufs, &s.maskRounds16)
- } else {
- d8a, d8b := digest8{}, digest8{}
- for i := range d8a.v0 {
- j := i + 8
- d8a.v0[i], d8a.v1[i], d8a.v2[i], d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
- if !half {
- d8b.v0[i], d8b.v1[i], d8b.v2[i], d8b.v3[i] = d.v0[j], d.v1[j], d.v2[j], d.v3[j]
- }
- }
+ return
+ }
+
+ // Preparing data using copy is slower since copies aren't inlined.
- i8 := [2][8][]byte{}
- for i := range i8[0] {
- i8[0][i], i8[1][i] = input[i], input[8+i]
+ // Calculate on this goroutine
+ if half {
+ for i := range s.i8[0][:] {
+ s.i8[0][i] = input[i]
}
- if half {
- blockMd5_avx2(&d8a, i8[0], s.allBufs, &s.maskRounds8a)
- } else {
- wg := sync.WaitGroup{}
- wg.Add(2)
- go func() { blockMd5_avx2(&d8a, i8[0], s.allBufs, &s.maskRounds8a); wg.Done() }()
- go func() { blockMd5_avx2(&d8b, i8[1], s.allBufs, &s.maskRounds8b); wg.Done() }()
- wg.Wait()
+ for i := range s.d8a.v0[:] {
+ s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
}
-
- for i := range d8a.v0 {
- j := i + 8
- d.v0[i], d.v1[i], d.v2[i], d.v3[i] = d8a.v0[i], d8a.v1[i], d8a.v2[i], d8a.v3[i]
- if !half {
- d.v0[j], d.v1[j], d.v2[j], d.v3[j] = d8b.v0[i], d8b.v1[i], d8b.v2[i], d8b.v3[i]
- }
+ blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a)
+ for i := range s.d8a.v0[:] {
+ d.v0[i], d.v1[i], d.v2[i], d.v3[i] = s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i]
}
+ return
+ }
+
+ for i := range s.i8[0][:] {
+ s.i8[0][i], s.i8[1][i] = input[i], input[8+i]
+ }
+
+ for i := range s.d8a.v0[:] {
+ j := (i + 8) & 15
+ s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
+ s.d8b.v0[i], s.d8b.v1[i], s.d8b.v2[i], s.d8b.v3[i] = d.v0[j], d.v1[j], d.v2[j], d.v3[j]
+ }
+
+ // Benchmarks appear to be slightly faster when spinning up 2 goroutines instead
+ // of using the current goroutine for one of the blocks.
+ s.wg.Add(2)
+ go func() { blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a); s.wg.Done() }()
+ go func() { blockMd5_avx2(&s.d8b, s.i8[1], s.allBufs, &s.maskRounds8b); s.wg.Done() }()
+ s.wg.Wait()
+ for i := range s.d8a.v0[:] {
+ d.v0[i], d.v1[i], d.v2[i], d.v3[i] = s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i]
+ }
+ for i := range s.d8b.v0[:] {
+ j := (i + 8) & 15
+ d.v0[j], d.v1[j], d.v2[j], d.v3[j] = s.d8b.v0[i], s.d8b.v1[i], s.d8b.v2[i], s.d8b.v3[i]
}
}
diff --git a/vendor/github.com/minio/md5-simd/md5-digest_amd64.go b/vendor/github.com/minio/md5-simd/md5-digest_amd64.go
index fe10c7531..5ea23a499 100644
--- a/vendor/github.com/minio/md5-simd/md5-digest_amd64.go
+++ b/vendor/github.com/minio/md5-simd/md5-digest_amd64.go
@@ -10,6 +10,7 @@ import (
"encoding/binary"
"errors"
"fmt"
+ "sync"
"sync/atomic"
)
@@ -121,6 +122,14 @@ func (d *md5Digest) Close() {
}
}
+// sumChPool recycles the one-slot result channels that Sum waits on,
+// so a fresh channel does not have to be allocated for every call.
+var sumChPool = sync.Pool{
+	New: func() interface{} {
+		return make(chan sumResult, 1)
+	},
+}
+
// Sum - Return MD5 sum in bytes
func (d *md5Digest) Sum(in []byte) (result []byte) {
if d.blocksCh == nil {
@@ -148,10 +157,11 @@ func (d *md5Digest) Sum(in []byte) (result []byte) {
if len(trail)%BlockSize != 0 {
panic(fmt.Errorf("internal error: sum block was not aligned. len=%d, nx=%d", len(trail), d.nx))
}
- sumCh := make(chan sumResult, 1)
+ sumCh := sumChPool.Get().(chan sumResult)
d.sendBlock(blockInput{uid: d.uid, msg: trail, sumCh: sumCh}, true)
sum := <-sumCh
+ sumChPool.Put(sumCh)
return append(in, sum.digest[:]...)
}
diff --git a/vendor/github.com/minio/md5-simd/md5-server_amd64.go b/vendor/github.com/minio/md5-simd/md5-server_amd64.go
index 461059537..94f741c54 100644
--- a/vendor/github.com/minio/md5-simd/md5-server_amd64.go
+++ b/vendor/github.com/minio/md5-simd/md5-server_amd64.go
@@ -10,8 +10,9 @@ import (
"encoding/binary"
"fmt"
"runtime"
+ "sync"
- "github.com/klauspost/cpuid"
+ "github.com/klauspost/cpuid/v2"
)
// MD5 initialization constants
@@ -23,6 +24,9 @@ const (
init1 = 0xefcdab89
init2 = 0x98badcfe
init3 = 0x10325476
+
+ // Use scalar routine when below this many lanes
+ useScalarBelow = 3
)
// md5ServerUID - Does not start at 0 but next multiple of 16 so as to be able to
@@ -56,11 +60,15 @@ type md5Server struct {
maskRounds8b [8]maskRounds // Pre-allocated static array for max 8 rounds (2nd AVX2 core)
allBufs []byte // Preallocated buffer.
buffers chan []byte // Preallocated buffers, sliced from allBufs.
+
+ i8 [2][8][]byte // avx2 temporary vars
+ d8a, d8b digest8
+ wg sync.WaitGroup
}
// NewServer - Create new object for parallel processing handling
func NewServer() Server {
- if !cpuid.CPU.AVX2() {
+ if !cpuid.CPU.Supports(cpuid.AVX2) {
return &fallbackServer{}
}
md5srv := &md5Server{}
@@ -152,7 +160,7 @@ func (s *md5Server) process(newClients chan newClient) {
sum := sumResult{}
// Add end block to current digest.
- blockGeneric(&dig, block.msg)
+ blockScalar(&dig.s, block.msg)
binary.LittleEndian.PutUint32(sum.digest[0:], dig.s[0])
binary.LittleEndian.PutUint32(sum.digest[4:], dig.s[1])
@@ -262,6 +270,88 @@ func (s *md5Server) Close() {
// Invoke assembly and send results back
func (s *md5Server) blocks(lanes []blockInput) {
+ if len(lanes) < useScalarBelow {
+ // Use scalar routine when below this many lanes
+ switch len(lanes) {
+ case 0:
+ case 1:
+ lane := lanes[0]
+ var d digest
+ a, ok := s.digests[lane.uid]
+ if ok {
+ d.s[0] = binary.LittleEndian.Uint32(a[0:4])
+ d.s[1] = binary.LittleEndian.Uint32(a[4:8])
+ d.s[2] = binary.LittleEndian.Uint32(a[8:12])
+ d.s[3] = binary.LittleEndian.Uint32(a[12:16])
+ } else {
+ d.s[0] = init0
+ d.s[1] = init1
+ d.s[2] = init2
+ d.s[3] = init3
+ }
+ if len(lane.msg) > 0 {
+ // Update...
+ blockScalar(&d.s, lane.msg)
+ }
+ dig := [Size]byte{}
+ binary.LittleEndian.PutUint32(dig[0:], d.s[0])
+ binary.LittleEndian.PutUint32(dig[4:], d.s[1])
+ binary.LittleEndian.PutUint32(dig[8:], d.s[2])
+ binary.LittleEndian.PutUint32(dig[12:], d.s[3])
+ s.digests[lane.uid] = dig
+
+ if lane.msg != nil {
+ s.buffers <- lane.msg
+ }
+ lanes[0] = blockInput{}
+
+ default:
+ s.wg.Add(len(lanes))
+ var results [useScalarBelow]digest
+ for i := range lanes {
+ lane := lanes[i]
+ go func(i int) {
+ var d digest
+ defer s.wg.Done()
+ a, ok := s.digests[lane.uid]
+ if ok {
+ d.s[0] = binary.LittleEndian.Uint32(a[0:4])
+ d.s[1] = binary.LittleEndian.Uint32(a[4:8])
+ d.s[2] = binary.LittleEndian.Uint32(a[8:12])
+ d.s[3] = binary.LittleEndian.Uint32(a[12:16])
+ } else {
+ d.s[0] = init0
+ d.s[1] = init1
+ d.s[2] = init2
+ d.s[3] = init3
+ }
+ if len(lane.msg) == 0 {
+ results[i] = d
+ return
+ }
+ // Update...
+ blockScalar(&d.s, lane.msg)
+ results[i] = d
+ }(i)
+ }
+ s.wg.Wait()
+ for i, lane := range lanes {
+ dig := [Size]byte{}
+ binary.LittleEndian.PutUint32(dig[0:], results[i].s[0])
+ binary.LittleEndian.PutUint32(dig[4:], results[i].s[1])
+ binary.LittleEndian.PutUint32(dig[8:], results[i].s[2])
+ binary.LittleEndian.PutUint32(dig[12:], results[i].s[3])
+ s.digests[lane.uid] = dig
+
+ if lane.msg != nil {
+ s.buffers <- lane.msg
+ }
+ lanes[i] = blockInput{}
+ }
+ }
+ return
+ }
+
inputs := [16][]byte{}
for i := range lanes {
inputs[i] = lanes[i].msg
diff --git a/vendor/github.com/minio/md5-simd/md5-util_amd64.go b/vendor/github.com/minio/md5-simd/md5-util_amd64.go
index 32bbae4a0..73981b0eb 100644
--- a/vendor/github.com/minio/md5-simd/md5-util_amd64.go
+++ b/vendor/github.com/minio/md5-simd/md5-util_amd64.go
@@ -1,19 +1,21 @@
+//+build !noasm,!appengine,gc
+
// Copyright (c) 2020 MinIO Inc. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
package md5simd
-import (
- "sort"
-)
-
// Helper struct for sorting blocks based on length
type lane struct {
len uint
pos uint
}
+// digest holds the four 32-bit MD5 state words of a single lane,
+// in the form that blockScalar reads and updates.
+type digest struct {
+ s [4]uint32
+}
+
// Helper struct for generating number of rounds in combination with mask for valid lanes
type maskRounds struct {
mask uint64
@@ -23,15 +25,22 @@ type maskRounds struct {
func generateMaskAndRounds8(input [8][]byte, mr *[8]maskRounds) (rounds int) {
// Sort on blocks length small to large
var sorted [8]lane
- for c, inpt := range input {
+ for c, inpt := range input[:] {
sorted[c] = lane{uint(len(inpt)), uint(c)}
+ for i := c - 1; i >= 0; i-- {
+ // swap so largest is at the end...
+ if sorted[i].len > sorted[i+1].len {
+ sorted[i], sorted[i+1] = sorted[i+1], sorted[i]
+ continue
+ }
+ break
+ }
}
- sort.Slice(sorted[:], func(i, j int) bool { return sorted[i].len < sorted[j].len })
// Create mask array including 'rounds' (of processing blocks of 64 bytes) between masks
m, round := uint64(0xff), uint64(0)
- for _, s := range sorted {
+ for _, s := range sorted[:] {
if s.len > 0 {
if uint64(s.len)>>6 > round {
mr[rounds] = maskRounds{m, (uint64(s.len) >> 6) - round}
@@ -45,18 +54,24 @@ func generateMaskAndRounds8(input [8][]byte, mr *[8]maskRounds) (rounds int) {
}
func generateMaskAndRounds16(input [16][]byte, mr *[16]maskRounds) (rounds int) {
-
// Sort on blocks length small to large
var sorted [16]lane
- for c, inpt := range input {
+ for c, inpt := range input[:] {
sorted[c] = lane{uint(len(inpt)), uint(c)}
+ for i := c - 1; i >= 0; i-- {
+ // swap so largest is at the end...
+ if sorted[i].len > sorted[i+1].len {
+ sorted[i], sorted[i+1] = sorted[i+1], sorted[i]
+ continue
+ }
+ break
+ }
}
- sort.Slice(sorted[:], func(i, j int) bool { return sorted[i].len < sorted[j].len })
// Create mask array including 'rounds' (of processing blocks of 64 bytes) between masks
m, round := uint64(0xffff), uint64(0)
- for _, s := range sorted {
+ for _, s := range sorted[:] {
if s.len > 0 {
if uint64(s.len)>>6 > round {
mr[rounds] = maskRounds{m, (uint64(s.len) >> 6) - round}
diff --git a/vendor/github.com/minio/md5-simd/md5.go b/vendor/github.com/minio/md5-simd/md5.go
index 4f56b79d0..11b0cb962 100644
--- a/vendor/github.com/minio/md5-simd/md5.go
+++ b/vendor/github.com/minio/md5-simd/md5.go
@@ -27,6 +27,12 @@ type Hasher interface {
Close()
}
+// StdlibHasher returns a Hasher that uses the stdlib for hashing.
+// The underlying hash is taken from md5Pool when one is available, so a
+// hash that was previously handed back to the pool is reused instead of
+// allocating a new one on every call.
+func StdlibHasher() Hasher {
+	// Use Get rather than calling the pool's New field directly: New always
+	// constructs a fresh hash and would bypass anything stored in the pool.
+	// Get still falls back to New when the pool is empty.
+	h := md5Pool.Get().(hash.Hash)
+	// Reset defensively; a pooled hash may still carry state from its last use.
+	h.Reset()
+	return &md5Wrapper{Hash: h}
+}
// md5Wrapper is a wrapper around the builtin hasher.
type md5Wrapper struct {
hash.Hash
diff --git a/vendor/github.com/minio/md5-simd/md5block_amd64.go b/vendor/github.com/minio/md5-simd/md5block_amd64.go
new file mode 100644
index 000000000..4c2793662
--- /dev/null
+++ b/vendor/github.com/minio/md5-simd/md5block_amd64.go
@@ -0,0 +1,11 @@
+// Code generated by command: go run gen.go -out ../md5block_amd64.s -stubs ../md5block_amd64.go -pkg=md5simd. DO NOT EDIT.
+
+// +build !appengine
+// +build !noasm
+// +build gc
+
+package md5simd
+
+// Encode p to digest
+//go:noescape
+func blockScalar(dig *[4]uint32, p []byte)
diff --git a/vendor/github.com/minio/md5-simd/md5block_amd64.s b/vendor/github.com/minio/md5-simd/md5block_amd64.s
new file mode 100644
index 000000000..fbc4a21f2
--- /dev/null
+++ b/vendor/github.com/minio/md5-simd/md5block_amd64.s
@@ -0,0 +1,714 @@
+// Code generated by command: go run gen.go -out ../md5block_amd64.s -stubs ../md5block_amd64.go -pkg=md5simd. DO NOT EDIT.
+
+// +build !appengine
+// +build !noasm
+// +build gc
+
+// func blockScalar(dig *[4]uint32, p []byte)
+TEXT ·blockScalar(SB), $0-32 // folds each complete 64-byte block of p into the MD5 state at dig
+ MOVQ p_len+16(FP), AX // AX = len(p)
+ MOVQ dig+0(FP), CX // CX = dig
+ MOVQ p_base+8(FP), DX // DX = start of p; advanced by 64 bytes per loop iteration
+ SHRQ $0x06, AX
+ SHLQ $0x06, AX // AX = len(p) rounded down to a multiple of 64
+ LEAQ (DX)(AX*1), AX // AX = address just past the last complete block
+ CMPQ DX, AX
+ JEQ end // no complete block: return without touching dig
+ MOVL (CX), BX // BX = dig[0]
+ MOVL 4(CX), BP // BP = dig[1]
+ MOVL 8(CX), SI // SI = dig[2]
+ MOVL 12(CX), CX // CX = dig[3]; the dig pointer is reloaded before the final store
+ MOVL $0xffffffff, DI // DI = all ones; XOR with DI gives the bitwise NOT needed in ROUND2 and ROUND4
+
+loop:
+ MOVL (DX), R8 // R8 = message word 0; R8 always carries the word for the upcoming step
+ MOVL CX, R9 // seed R9 for the first ROUND1 step
+ MOVL BX, R10 // R10..R13 keep the state on entry to this block for the final add
+ MOVL BP, R11
+ MOVL SI, R12
+ MOVL CX, R13
+
+ // ROUND1: 16 steps using F(x,y,z) = ((y ^ z) & x) ^ z, built in R9
+ XORL SI, R9 // R9 = y ^ z
+ ADDL $0xd76aa478, BX // add the step constant
+ ADDL R8, BX // add the message word
+ ANDL BP, R9 // R9 = (y ^ z) & x
+ XORL CX, R9 // R9 = F(x,y,z)
+ MOVL 4(DX), R8 // preload the next step's message word
+ ADDL R9, BX // add F(x,y,z)
+ ROLL $0x07, BX // rotate left by the step's shift amount
+ MOVL SI, R9 // seed R9 for the next step; the four state registers rotate roles each step
+ ADDL BP, BX // add x, completing the step
+ XORL BP, R9
+ ADDL $0xe8c7b756, CX
+ ADDL R8, CX
+ ANDL BX, R9
+ XORL SI, R9
+ MOVL 8(DX), R8
+ ADDL R9, CX
+ ROLL $0x0c, CX
+ MOVL BP, R9
+ ADDL BX, CX
+ XORL BX, R9
+ ADDL $0x242070db, SI
+ ADDL R8, SI
+ ANDL CX, R9
+ XORL BP, R9
+ MOVL 12(DX), R8
+ ADDL R9, SI
+ ROLL $0x11, SI
+ MOVL BX, R9
+ ADDL CX, SI
+ XORL CX, R9
+ ADDL $0xc1bdceee, BP
+ ADDL R8, BP
+ ANDL SI, R9
+ XORL BX, R9
+ MOVL 16(DX), R8
+ ADDL R9, BP
+ ROLL $0x16, BP
+ MOVL CX, R9
+ ADDL SI, BP
+ XORL SI, R9
+ ADDL $0xf57c0faf, BX
+ ADDL R8, BX
+ ANDL BP, R9
+ XORL CX, R9
+ MOVL 20(DX), R8
+ ADDL R9, BX
+ ROLL $0x07, BX
+ MOVL SI, R9
+ ADDL BP, BX
+ XORL BP, R9
+ ADDL $0x4787c62a, CX
+ ADDL R8, CX
+ ANDL BX, R9
+ XORL SI, R9
+ MOVL 24(DX), R8
+ ADDL R9, CX
+ ROLL $0x0c, CX
+ MOVL BP, R9
+ ADDL BX, CX
+ XORL BX, R9
+ ADDL $0xa8304613, SI
+ ADDL R8, SI
+ ANDL CX, R9
+ XORL BP, R9
+ MOVL 28(DX), R8
+ ADDL R9, SI
+ ROLL $0x11, SI
+ MOVL BX, R9
+ ADDL CX, SI
+ XORL CX, R9
+ ADDL $0xfd469501, BP
+ ADDL R8, BP
+ ANDL SI, R9
+ XORL BX, R9
+ MOVL 32(DX), R8
+ ADDL R9, BP
+ ROLL $0x16, BP
+ MOVL CX, R9
+ ADDL SI, BP
+ XORL SI, R9
+ ADDL $0x698098d8, BX
+ ADDL R8, BX
+ ANDL BP, R9
+ XORL CX, R9
+ MOVL 36(DX), R8
+ ADDL R9, BX
+ ROLL $0x07, BX
+ MOVL SI, R9
+ ADDL BP, BX
+ XORL BP, R9
+ ADDL $0x8b44f7af, CX
+ ADDL R8, CX
+ ANDL BX, R9
+ XORL SI, R9
+ MOVL 40(DX), R8
+ ADDL R9, CX
+ ROLL $0x0c, CX
+ MOVL BP, R9
+ ADDL BX, CX
+ XORL BX, R9
+ ADDL $0xffff5bb1, SI
+ ADDL R8, SI
+ ANDL CX, R9
+ XORL BP, R9
+ MOVL 44(DX), R8
+ ADDL R9, SI
+ ROLL $0x11, SI
+ MOVL BX, R9
+ ADDL CX, SI
+ XORL CX, R9
+ ADDL $0x895cd7be, BP
+ ADDL R8, BP
+ ANDL SI, R9
+ XORL BX, R9
+ MOVL 48(DX), R8
+ ADDL R9, BP
+ ROLL $0x16, BP
+ MOVL CX, R9
+ ADDL SI, BP
+ XORL SI, R9
+ ADDL $0x6b901122, BX
+ ADDL R8, BX
+ ANDL BP, R9
+ XORL CX, R9
+ MOVL 52(DX), R8
+ ADDL R9, BX
+ ROLL $0x07, BX
+ MOVL SI, R9
+ ADDL BP, BX
+ XORL BP, R9
+ ADDL $0xfd987193, CX
+ ADDL R8, CX
+ ANDL BX, R9
+ XORL SI, R9
+ MOVL 56(DX), R8
+ ADDL R9, CX
+ ROLL $0x0c, CX
+ MOVL BP, R9
+ ADDL BX, CX
+ XORL BX, R9
+ ADDL $0xa679438e, SI
+ ADDL R8, SI
+ ANDL CX, R9
+ XORL BP, R9
+ MOVL 60(DX), R8
+ ADDL R9, SI
+ ROLL $0x11, SI
+ MOVL BX, R9
+ ADDL CX, SI
+ XORL CX, R9
+ ADDL $0x49b40821, BP
+ ADDL R8, BP
+ ANDL SI, R9
+ XORL BX, R9
+ MOVL 4(DX), R8 // message word 1: the first word consumed by ROUND2
+ ADDL R9, BP
+ ROLL $0x16, BP
+ MOVL CX, R9 // redundant: the first instruction of ROUND2 performs the same move
+ ADDL SI, BP
+
+ // ROUND2: 16 steps using G(x,y,z) = (x & z) | (y & ^z), built in R14 and R9
+ MOVL CX, R9
+ MOVL CX, R14
+ XORL DI, R9 // R9 = ^z
+ ADDL $0xf61e2562, BX
+ ADDL R8, BX
+ ANDL BP, R14 // R14 = x & z
+ ANDL SI, R9 // R9 = y & ^z
+ MOVL 24(DX), R8
+ ORL R9, R14 // R14 = G(x,y,z)
+ MOVL SI, R9
+ ADDL R14, BX
+ MOVL SI, R14
+ ROLL $0x05, BX
+ ADDL BP, BX
+ XORL DI, R9
+ ADDL $0xc040b340, CX
+ ADDL R8, CX
+ ANDL BX, R14
+ ANDL BP, R9
+ MOVL 44(DX), R8
+ ORL R9, R14
+ MOVL BP, R9
+ ADDL R14, CX
+ MOVL BP, R14
+ ROLL $0x09, CX
+ ADDL BX, CX
+ XORL DI, R9
+ ADDL $0x265e5a51, SI
+ ADDL R8, SI
+ ANDL CX, R14
+ ANDL BX, R9
+ MOVL (DX), R8
+ ORL R9, R14
+ MOVL BX, R9
+ ADDL R14, SI
+ MOVL BX, R14
+ ROLL $0x0e, SI
+ ADDL CX, SI
+ XORL DI, R9
+ ADDL $0xe9b6c7aa, BP
+ ADDL R8, BP
+ ANDL SI, R14
+ ANDL CX, R9
+ MOVL 20(DX), R8
+ ORL R9, R14
+ MOVL CX, R9
+ ADDL R14, BP
+ MOVL CX, R14
+ ROLL $0x14, BP
+ ADDL SI, BP
+ XORL DI, R9
+ ADDL $0xd62f105d, BX
+ ADDL R8, BX
+ ANDL BP, R14
+ ANDL SI, R9
+ MOVL 40(DX), R8
+ ORL R9, R14
+ MOVL SI, R9
+ ADDL R14, BX
+ MOVL SI, R14
+ ROLL $0x05, BX
+ ADDL BP, BX
+ XORL DI, R9
+ ADDL $0x02441453, CX
+ ADDL R8, CX
+ ANDL BX, R14
+ ANDL BP, R9
+ MOVL 60(DX), R8
+ ORL R9, R14
+ MOVL BP, R9
+ ADDL R14, CX
+ MOVL BP, R14
+ ROLL $0x09, CX
+ ADDL BX, CX
+ XORL DI, R9
+ ADDL $0xd8a1e681, SI
+ ADDL R8, SI
+ ANDL CX, R14
+ ANDL BX, R9
+ MOVL 16(DX), R8
+ ORL R9, R14
+ MOVL BX, R9
+ ADDL R14, SI
+ MOVL BX, R14
+ ROLL $0x0e, SI
+ ADDL CX, SI
+ XORL DI, R9
+ ADDL $0xe7d3fbc8, BP
+ ADDL R8, BP
+ ANDL SI, R14
+ ANDL CX, R9
+ MOVL 36(DX), R8
+ ORL R9, R14
+ MOVL CX, R9
+ ADDL R14, BP
+ MOVL CX, R14
+ ROLL $0x14, BP
+ ADDL SI, BP
+ XORL DI, R9
+ ADDL $0x21e1cde6, BX
+ ADDL R8, BX
+ ANDL BP, R14
+ ANDL SI, R9
+ MOVL 56(DX), R8
+ ORL R9, R14
+ MOVL SI, R9
+ ADDL R14, BX
+ MOVL SI, R14
+ ROLL $0x05, BX
+ ADDL BP, BX
+ XORL DI, R9
+ ADDL $0xc33707d6, CX
+ ADDL R8, CX
+ ANDL BX, R14
+ ANDL BP, R9
+ MOVL 12(DX), R8
+ ORL R9, R14
+ MOVL BP, R9
+ ADDL R14, CX
+ MOVL BP, R14
+ ROLL $0x09, CX
+ ADDL BX, CX
+ XORL DI, R9
+ ADDL $0xf4d50d87, SI
+ ADDL R8, SI
+ ANDL CX, R14
+ ANDL BX, R9
+ MOVL 32(DX), R8
+ ORL R9, R14
+ MOVL BX, R9
+ ADDL R14, SI
+ MOVL BX, R14
+ ROLL $0x0e, SI
+ ADDL CX, SI
+ XORL DI, R9
+ ADDL $0x455a14ed, BP
+ ADDL R8, BP
+ ANDL SI, R14
+ ANDL CX, R9
+ MOVL 52(DX), R8
+ ORL R9, R14
+ MOVL CX, R9
+ ADDL R14, BP
+ MOVL CX, R14
+ ROLL $0x14, BP
+ ADDL SI, BP
+ XORL DI, R9
+ ADDL $0xa9e3e905, BX
+ ADDL R8, BX
+ ANDL BP, R14
+ ANDL SI, R9
+ MOVL 8(DX), R8
+ ORL R9, R14
+ MOVL SI, R9
+ ADDL R14, BX
+ MOVL SI, R14
+ ROLL $0x05, BX
+ ADDL BP, BX
+ XORL DI, R9
+ ADDL $0xfcefa3f8, CX
+ ADDL R8, CX
+ ANDL BX, R14
+ ANDL BP, R9
+ MOVL 28(DX), R8
+ ORL R9, R14
+ MOVL BP, R9
+ ADDL R14, CX
+ MOVL BP, R14
+ ROLL $0x09, CX
+ ADDL BX, CX
+ XORL DI, R9
+ ADDL $0x676f02d9, SI
+ ADDL R8, SI
+ ANDL CX, R14
+ ANDL BX, R9
+ MOVL 48(DX), R8
+ ORL R9, R14
+ MOVL BX, R9
+ ADDL R14, SI
+ MOVL BX, R14
+ ROLL $0x0e, SI
+ ADDL CX, SI
+ XORL DI, R9
+ ADDL $0x8d2a4c8a, BP
+ ADDL R8, BP
+ ANDL SI, R14
+ ANDL CX, R9
+ MOVL 20(DX), R8 // message word 5: the first word consumed by ROUND3
+ ORL R9, R14
+ MOVL CX, R9
+ ADDL R14, BP
+ MOVL CX, R14
+ ROLL $0x14, BP
+ ADDL SI, BP
+
+ // ROUND3: 16 steps using H(x,y,z) = x ^ y ^ z, built in R9
+ MOVL SI, R9 // R9 = y
+ ADDL $0xfffa3942, BX
+ ADDL R8, BX
+ MOVL 32(DX), R8
+ XORL CX, R9 // R9 = y ^ z
+ XORL BP, R9 // R9 = H(x,y,z)
+ ADDL R9, BX
+ ROLL $0x04, BX
+ MOVL BP, R9
+ ADDL BP, BX
+ ADDL $0x8771f681, CX
+ ADDL R8, CX
+ MOVL 44(DX), R8
+ XORL SI, R9
+ XORL BX, R9
+ ADDL R9, CX
+ ROLL $0x0b, CX
+ MOVL BX, R9
+ ADDL BX, CX
+ ADDL $0x6d9d6122, SI
+ ADDL R8, SI
+ MOVL 56(DX), R8
+ XORL BP, R9
+ XORL CX, R9
+ ADDL R9, SI
+ ROLL $0x10, SI
+ MOVL CX, R9
+ ADDL CX, SI
+ ADDL $0xfde5380c, BP
+ ADDL R8, BP
+ MOVL 4(DX), R8
+ XORL BX, R9
+ XORL SI, R9
+ ADDL R9, BP
+ ROLL $0x17, BP
+ MOVL SI, R9
+ ADDL SI, BP
+ ADDL $0xa4beea44, BX
+ ADDL R8, BX
+ MOVL 16(DX), R8
+ XORL CX, R9
+ XORL BP, R9
+ ADDL R9, BX
+ ROLL $0x04, BX
+ MOVL BP, R9
+ ADDL BP, BX
+ ADDL $0x4bdecfa9, CX
+ ADDL R8, CX
+ MOVL 28(DX), R8
+ XORL SI, R9
+ XORL BX, R9
+ ADDL R9, CX
+ ROLL $0x0b, CX
+ MOVL BX, R9
+ ADDL BX, CX
+ ADDL $0xf6bb4b60, SI
+ ADDL R8, SI
+ MOVL 40(DX), R8
+ XORL BP, R9
+ XORL CX, R9
+ ADDL R9, SI
+ ROLL $0x10, SI
+ MOVL CX, R9
+ ADDL CX, SI
+ ADDL $0xbebfbc70, BP
+ ADDL R8, BP
+ MOVL 52(DX), R8
+ XORL BX, R9
+ XORL SI, R9
+ ADDL R9, BP
+ ROLL $0x17, BP
+ MOVL SI, R9
+ ADDL SI, BP
+ ADDL $0x289b7ec6, BX
+ ADDL R8, BX
+ MOVL (DX), R8
+ XORL CX, R9
+ XORL BP, R9
+ ADDL R9, BX
+ ROLL $0x04, BX
+ MOVL BP, R9
+ ADDL BP, BX
+ ADDL $0xeaa127fa, CX
+ ADDL R8, CX
+ MOVL 12(DX), R8
+ XORL SI, R9
+ XORL BX, R9
+ ADDL R9, CX
+ ROLL $0x0b, CX
+ MOVL BX, R9
+ ADDL BX, CX
+ ADDL $0xd4ef3085, SI
+ ADDL R8, SI
+ MOVL 24(DX), R8
+ XORL BP, R9
+ XORL CX, R9
+ ADDL R9, SI
+ ROLL $0x10, SI
+ MOVL CX, R9
+ ADDL CX, SI
+ ADDL $0x04881d05, BP
+ ADDL R8, BP
+ MOVL 36(DX), R8
+ XORL BX, R9
+ XORL SI, R9
+ ADDL R9, BP
+ ROLL $0x17, BP
+ MOVL SI, R9
+ ADDL SI, BP
+ ADDL $0xd9d4d039, BX
+ ADDL R8, BX
+ MOVL 48(DX), R8
+ XORL CX, R9
+ XORL BP, R9
+ ADDL R9, BX
+ ROLL $0x04, BX
+ MOVL BP, R9
+ ADDL BP, BX
+ ADDL $0xe6db99e5, CX
+ ADDL R8, CX
+ MOVL 60(DX), R8
+ XORL SI, R9
+ XORL BX, R9
+ ADDL R9, CX
+ ROLL $0x0b, CX
+ MOVL BX, R9
+ ADDL BX, CX
+ ADDL $0x1fa27cf8, SI
+ ADDL R8, SI
+ MOVL 8(DX), R8
+ XORL BP, R9
+ XORL CX, R9
+ ADDL R9, SI
+ ROLL $0x10, SI
+ MOVL CX, R9
+ ADDL CX, SI
+ ADDL $0xc4ac5665, BP
+ ADDL R8, BP
+ MOVL (DX), R8 // message word 0: the first word consumed by ROUND4
+ XORL BX, R9
+ XORL SI, R9
+ ADDL R9, BP
+ ROLL $0x17, BP
+ MOVL SI, R9
+ ADDL SI, BP
+
+ // ROUND4: 16 steps using I(x,y,z) = y ^ (x | ^z), built in R9
+ MOVL DI, R9
+ XORL CX, R9 // R9 = ^z
+ ADDL $0xf4292244, BX
+ ADDL R8, BX
+ ORL BP, R9 // R9 = x | ^z
+ XORL SI, R9 // R9 = I(x,y,z)
+ ADDL R9, BX
+ MOVL 28(DX), R8
+ MOVL DI, R9
+ ROLL $0x06, BX
+ XORL SI, R9
+ ADDL BP, BX
+ ADDL $0x432aff97, CX
+ ADDL R8, CX
+ ORL BX, R9
+ XORL BP, R9
+ ADDL R9, CX
+ MOVL 56(DX), R8
+ MOVL DI, R9
+ ROLL $0x0a, CX
+ XORL BP, R9
+ ADDL BX, CX
+ ADDL $0xab9423a7, SI
+ ADDL R8, SI
+ ORL CX, R9
+ XORL BX, R9
+ ADDL R9, SI
+ MOVL 20(DX), R8
+ MOVL DI, R9
+ ROLL $0x0f, SI
+ XORL BX, R9
+ ADDL CX, SI
+ ADDL $0xfc93a039, BP
+ ADDL R8, BP
+ ORL SI, R9
+ XORL CX, R9
+ ADDL R9, BP
+ MOVL 48(DX), R8
+ MOVL DI, R9
+ ROLL $0x15, BP
+ XORL CX, R9
+ ADDL SI, BP
+ ADDL $0x655b59c3, BX
+ ADDL R8, BX
+ ORL BP, R9
+ XORL SI, R9
+ ADDL R9, BX
+ MOVL 12(DX), R8
+ MOVL DI, R9
+ ROLL $0x06, BX
+ XORL SI, R9
+ ADDL BP, BX
+ ADDL $0x8f0ccc92, CX
+ ADDL R8, CX
+ ORL BX, R9
+ XORL BP, R9
+ ADDL R9, CX
+ MOVL 40(DX), R8
+ MOVL DI, R9
+ ROLL $0x0a, CX
+ XORL BP, R9
+ ADDL BX, CX
+ ADDL $0xffeff47d, SI
+ ADDL R8, SI
+ ORL CX, R9
+ XORL BX, R9
+ ADDL R9, SI
+ MOVL 4(DX), R8
+ MOVL DI, R9
+ ROLL $0x0f, SI
+ XORL BX, R9
+ ADDL CX, SI
+ ADDL $0x85845dd1, BP
+ ADDL R8, BP
+ ORL SI, R9
+ XORL CX, R9
+ ADDL R9, BP
+ MOVL 32(DX), R8
+ MOVL DI, R9
+ ROLL $0x15, BP
+ XORL CX, R9
+ ADDL SI, BP
+ ADDL $0x6fa87e4f, BX
+ ADDL R8, BX
+ ORL BP, R9
+ XORL SI, R9
+ ADDL R9, BX
+ MOVL 60(DX), R8
+ MOVL DI, R9
+ ROLL $0x06, BX
+ XORL SI, R9
+ ADDL BP, BX
+ ADDL $0xfe2ce6e0, CX
+ ADDL R8, CX
+ ORL BX, R9
+ XORL BP, R9
+ ADDL R9, CX
+ MOVL 24(DX), R8
+ MOVL DI, R9
+ ROLL $0x0a, CX
+ XORL BP, R9
+ ADDL BX, CX
+ ADDL $0xa3014314, SI
+ ADDL R8, SI
+ ORL CX, R9
+ XORL BX, R9
+ ADDL R9, SI
+ MOVL 52(DX), R8
+ MOVL DI, R9
+ ROLL $0x0f, SI
+ XORL BX, R9
+ ADDL CX, SI
+ ADDL $0x4e0811a1, BP
+ ADDL R8, BP
+ ORL SI, R9
+ XORL CX, R9
+ ADDL R9, BP
+ MOVL 16(DX), R8
+ MOVL DI, R9
+ ROLL $0x15, BP
+ XORL CX, R9
+ ADDL SI, BP
+ ADDL $0xf7537e82, BX
+ ADDL R8, BX
+ ORL BP, R9
+ XORL SI, R9
+ ADDL R9, BX
+ MOVL 44(DX), R8
+ MOVL DI, R9
+ ROLL $0x06, BX
+ XORL SI, R9
+ ADDL BP, BX
+ ADDL $0xbd3af235, CX
+ ADDL R8, CX
+ ORL BX, R9
+ XORL BP, R9
+ ADDL R9, CX
+ MOVL 8(DX), R8
+ MOVL DI, R9
+ ROLL $0x0a, CX
+ XORL BP, R9
+ ADDL BX, CX
+ ADDL $0x2ad7d2bb, SI
+ ADDL R8, SI
+ ORL CX, R9
+ XORL BX, R9
+ ADDL R9, SI
+ MOVL 36(DX), R8
+ MOVL DI, R9
+ ROLL $0x0f, SI
+ XORL BX, R9
+ ADDL CX, SI
+ ADDL $0xeb86d391, BP
+ ADDL R8, BP
+ ORL SI, R9
+ XORL CX, R9
+ ADDL R9, BP
+ ROLL $0x15, BP
+ ADDL SI, BP
+ ADDL R10, BX // add the state saved in R10..R13 at the top of this iteration
+ ADDL R11, BP
+ ADDL R12, SI
+ ADDL R13, CX
+
+ // Prepare next loop: advance DX by one 64-byte block and continue while DX < AX
+ ADDQ $0x40, DX
+ CMPQ DX, AX
+ JB loop
+
+ // Write output: store the four state words back to dig[0..3]
+ MOVQ dig+0(FP), AX // reload the dig pointer; CX has been holding a state word
+ MOVL BX, (AX)
+ MOVL BP, 4(AX)
+ MOVL SI, 8(AX)
+ MOVL CX, 12(AX)
+
+end:
+ RET