diff options
author | 2022-09-28 18:30:40 +0100 | |
---|---|---|
committer | 2022-09-28 18:30:40 +0100 | |
commit | a156188b3eb5cb3da44aa1b7452265f5fa38a607 (patch) | |
tree | 7097fa48d56fbabc7c2c8750b1f3bc9321d71c0f /vendor/github.com/minio/md5-simd/block_amd64.go | |
parent | [bugfix] Fix emphasis being added to emoji shortcodes with markdown parsing (... (diff) | |
download | gotosocial-a156188b3eb5cb3da44aa1b7452265f5fa38a607.tar.xz |
[chore] update dependencies, bump to Go 1.19.1 (#826)
* update dependencies, bump Go version to 1.19
* bump test image Go version
* update golangci-lint
* update gotosocial-drone-build
* sign
* linting, go fmt
* update swagger docs
* update swagger docs
* whitespace
* update contributing.md
* fuckin whoopsie doopsie
* linterino, linteroni
* fix followrequest test not starting processor
* fix other api/client tests not starting processor
* fix remaining tests where processor not started
* bump go-runners version
* don't check last-webfingered-at, processor may have updated this
* update swagger command
* update bun to latest version
* fix embed to work the same as before with new bun
Signed-off-by: kim <grufwub@gmail.com>
Co-authored-by: tsmethurst <tobi.smethurst@protonmail.com>
Diffstat (limited to 'vendor/github.com/minio/md5-simd/block_amd64.go')
-rw-r--r-- | vendor/github.com/minio/md5-simd/block_amd64.go | 77 |
1 files changed, 44 insertions, 33 deletions
```diff
diff --git a/vendor/github.com/minio/md5-simd/block_amd64.go b/vendor/github.com/minio/md5-simd/block_amd64.go
index 27d6ce00e..16edda268 100644
--- a/vendor/github.com/minio/md5-simd/block_amd64.go
+++ b/vendor/github.com/minio/md5-simd/block_amd64.go
@@ -9,14 +9,18 @@ package md5simd
 import (
 	"fmt"
 	"math"
-	"sync"
 	"unsafe"
 
-	"github.com/klauspost/cpuid"
+	"github.com/klauspost/cpuid/v2"
 )
 
 var hasAVX512 bool
 
+func init() {
+	// VANDNPD requires AVX512DQ. Technically it could be VPTERNLOGQ which is AVX512F.
+	hasAVX512 = cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512DQ)
+}
+
 //go:noescape
 func block8(state *uint32, base uintptr, bufs *int32, cache *byte, n int)
 
@@ -82,45 +86,52 @@ var avx512md5consts = func(c []uint32) []uint32 {
 	return inf
 }(md5consts[:])
 
-func init() {
-	hasAVX512 = cpuid.CPU.AVX512F()
-}
-
 // Interface function to assembly code
 func (s *md5Server) blockMd5_x16(d *digest16, input [16][]byte, half bool) {
 	if hasAVX512 {
 		blockMd5_avx512(d, input, s.allBufs, &s.maskRounds16)
-	} else {
-		d8a, d8b := digest8{}, digest8{}
-		for i := range d8a.v0 {
-			j := i + 8
-			d8a.v0[i], d8a.v1[i], d8a.v2[i], d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
-			if !half {
-				d8b.v0[i], d8b.v1[i], d8b.v2[i], d8b.v3[i] = d.v0[j], d.v1[j], d.v2[j], d.v3[j]
-			}
-		}
+		return
+	}
+
+	// Preparing data using copy is slower since copies aren't inlined.
 
-		i8 := [2][8][]byte{}
-		for i := range i8[0] {
-			i8[0][i], i8[1][i] = input[i], input[8+i]
+	// Calculate on this goroutine
+	if half {
+		for i := range s.i8[0][:] {
+			s.i8[0][i] = input[i]
 		}
-		if half {
-			blockMd5_avx2(&d8a, i8[0], s.allBufs, &s.maskRounds8a)
-		} else {
-			wg := sync.WaitGroup{}
-			wg.Add(2)
-			go func() { blockMd5_avx2(&d8a, i8[0], s.allBufs, &s.maskRounds8a); wg.Done() }()
-			go func() { blockMd5_avx2(&d8b, i8[1], s.allBufs, &s.maskRounds8b); wg.Done() }()
-			wg.Wait()
+		for i := range s.d8a.v0[:] {
+			s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
 		}
-
-		for i := range d8a.v0 {
-			j := i + 8
-			d.v0[i], d.v1[i], d.v2[i], d.v3[i] = d8a.v0[i], d8a.v1[i], d8a.v2[i], d8a.v3[i]
-			if !half {
-				d.v0[j], d.v1[j], d.v2[j], d.v3[j] = d8b.v0[i], d8b.v1[i], d8b.v2[i], d8b.v3[i]
-			}
+		blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a)
+		for i := range s.d8a.v0[:] {
+			d.v0[i], d.v1[i], d.v2[i], d.v3[i] = s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i]
 		}
+		return
+	}
+
+	for i := range s.i8[0][:] {
+		s.i8[0][i], s.i8[1][i] = input[i], input[8+i]
+	}
+
+	for i := range s.d8a.v0[:] {
+		j := (i + 8) & 15
+		s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
+		s.d8b.v0[i], s.d8b.v1[i], s.d8b.v2[i], s.d8b.v3[i] = d.v0[j], d.v1[j], d.v2[j], d.v3[j]
+	}
+
+	// Benchmarks appears to be slightly faster when spinning up 2 goroutines instead
+	// of using the current for one of the blocks.
+	s.wg.Add(2)
+	go func() { blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a); s.wg.Done() }()
+	go func() { blockMd5_avx2(&s.d8b, s.i8[1], s.allBufs, &s.maskRounds8b); s.wg.Done() }()
+	s.wg.Wait()
+	for i := range s.d8a.v0[:] {
+		d.v0[i], d.v1[i], d.v2[i], d.v3[i] = s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i]
+	}
+	for i := range s.d8b.v0[:] {
+		j := (i + 8) & 15
+		d.v0[j], d.v1[j], d.v2[j], d.v3[j] = s.d8b.v0[i], s.d8b.v1[i], s.d8b.v2[i], s.d8b.v3[i]
 	}
 }
 
```