summaryrefslogtreecommitdiff
path: root/vendor/github.com/minio/md5-simd/block_amd64.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/minio/md5-simd/block_amd64.go')
-rw-r--r--vendor/github.com/minio/md5-simd/block_amd64.go77
1 files changed, 44 insertions, 33 deletions
diff --git a/vendor/github.com/minio/md5-simd/block_amd64.go b/vendor/github.com/minio/md5-simd/block_amd64.go
index 27d6ce00e..16edda268 100644
--- a/vendor/github.com/minio/md5-simd/block_amd64.go
+++ b/vendor/github.com/minio/md5-simd/block_amd64.go
@@ -9,14 +9,18 @@ package md5simd
import (
"fmt"
"math"
- "sync"
"unsafe"
- "github.com/klauspost/cpuid"
+ "github.com/klauspost/cpuid/v2"
)
var hasAVX512 bool
+func init() {
+ // VANDNPD requires AVX512DQ. Technically it could be VPTERNLOGQ which is AVX512F.
+ hasAVX512 = cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512DQ)
+}
+
//go:noescape
func block8(state *uint32, base uintptr, bufs *int32, cache *byte, n int)
@@ -82,45 +86,52 @@ var avx512md5consts = func(c []uint32) []uint32 {
return inf
}(md5consts[:])
-func init() {
- hasAVX512 = cpuid.CPU.AVX512F()
-}
-
// Interface function to assembly code
func (s *md5Server) blockMd5_x16(d *digest16, input [16][]byte, half bool) {
if hasAVX512 {
blockMd5_avx512(d, input, s.allBufs, &s.maskRounds16)
- } else {
- d8a, d8b := digest8{}, digest8{}
- for i := range d8a.v0 {
- j := i + 8
- d8a.v0[i], d8a.v1[i], d8a.v2[i], d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
- if !half {
- d8b.v0[i], d8b.v1[i], d8b.v2[i], d8b.v3[i] = d.v0[j], d.v1[j], d.v2[j], d.v3[j]
- }
- }
+ return
+ }
+
+ // Preparing data using copy is slower since copies aren't inlined.
- i8 := [2][8][]byte{}
- for i := range i8[0] {
- i8[0][i], i8[1][i] = input[i], input[8+i]
+ // Calculate on this goroutine
+ if half {
+ for i := range s.i8[0][:] {
+ s.i8[0][i] = input[i]
}
- if half {
- blockMd5_avx2(&d8a, i8[0], s.allBufs, &s.maskRounds8a)
- } else {
- wg := sync.WaitGroup{}
- wg.Add(2)
- go func() { blockMd5_avx2(&d8a, i8[0], s.allBufs, &s.maskRounds8a); wg.Done() }()
- go func() { blockMd5_avx2(&d8b, i8[1], s.allBufs, &s.maskRounds8b); wg.Done() }()
- wg.Wait()
+ for i := range s.d8a.v0[:] {
+ s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
}
-
- for i := range d8a.v0 {
- j := i + 8
- d.v0[i], d.v1[i], d.v2[i], d.v3[i] = d8a.v0[i], d8a.v1[i], d8a.v2[i], d8a.v3[i]
- if !half {
- d.v0[j], d.v1[j], d.v2[j], d.v3[j] = d8b.v0[i], d8b.v1[i], d8b.v2[i], d8b.v3[i]
- }
+ blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a)
+ for i := range s.d8a.v0[:] {
+ d.v0[i], d.v1[i], d.v2[i], d.v3[i] = s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i]
}
+ return
+ }
+
+ for i := range s.i8[0][:] {
+ s.i8[0][i], s.i8[1][i] = input[i], input[8+i]
+ }
+
+ for i := range s.d8a.v0[:] {
+ j := (i + 8) & 15
+ s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
+ s.d8b.v0[i], s.d8b.v1[i], s.d8b.v2[i], s.d8b.v3[i] = d.v0[j], d.v1[j], d.v2[j], d.v3[j]
+ }
+
+ // Benchmarks appears to be slightly faster when spinning up 2 goroutines instead
+ // of using the current for one of the blocks.
+ s.wg.Add(2)
+ go func() { blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a); s.wg.Done() }()
+ go func() { blockMd5_avx2(&s.d8b, s.i8[1], s.allBufs, &s.maskRounds8b); s.wg.Done() }()
+ s.wg.Wait()
+ for i := range s.d8a.v0[:] {
+ d.v0[i], d.v1[i], d.v2[i], d.v3[i] = s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i]
+ }
+ for i := range s.d8b.v0[:] {
+ j := (i + 8) & 15
+ d.v0[j], d.v1[j], d.v2[j], d.v3[j] = s.d8b.v0[i], s.d8b.v1[i], s.d8b.v2[i], s.d8b.v3[i]
}
}