10 files changed, 0 insertions, 4022 deletions
diff --git a/vendor/github.com/klauspost/compress/huff0/.gitignore b/vendor/github.com/klauspost/compress/huff0/.gitignore
deleted file mode 100644
index b3d262958..000000000
--- a/vendor/github.com/klauspost/compress/huff0/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-/huff0-fuzz.zip
diff --git a/vendor/github.com/klauspost/compress/huff0/README.md b/vendor/github.com/klauspost/compress/huff0/README.md
deleted file mode 100644
index 8b6e5c663..000000000
--- a/vendor/github.com/klauspost/compress/huff0/README.md
+++ /dev/null
@@ -1,89 +0,0 @@
-# Huff0 entropy compression
-
-This package provides Huff0 encoding and decoding as used in zstd.
-            
-[Huff0](https://github.com/Cyan4973/FiniteStateEntropy#new-generation-entropy-coders), 
-a Huffman codec designed for modern CPU, featuring OoO (Out of Order) operations on multiple ALU 
-(Arithmetic Logic Unit), achieving extremely fast compression and decompression speeds.
-
-This can be used for compressing input with a lot of similar input values to the smallest number of bytes.
-This does not perform any multi-byte [dictionary coding](https://en.wikipedia.org/wiki/Dictionary_coder) as LZ coders do,
-but it can be used as a secondary step to compressors (like Snappy) that do not do entropy encoding.
-
-* [Godoc documentation](https://godoc.org/github.com/klauspost/compress/huff0)
-
-## News
-
-This is used as part of the [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression and decompression package.
-
-This ensures that most functionality is well tested.
-
-# Usage
-
-This package provides a low level interface that allows compressing single independent blocks.
-
-Each block is separate, and there are no built-in integrity checks.
-This means that the caller should keep track of block sizes and also do checksums if needed.  
-
-Compressing a block is done via the [`Compress1X`](https://godoc.org/github.com/klauspost/compress/huff0#Compress1X) and 
-[`Compress4X`](https://godoc.org/github.com/klauspost/compress/huff0#Compress4X) functions.
-You must provide input and will receive the output and maybe an error.
-
-These error values can be returned:
-
-| Error               | Description                                                                 |
-|---------------------|-----------------------------------------------------------------------------|
-| `<nil>`             | Everything ok, output is returned                                           |
-| `ErrIncompressible` | Returned when input is judged to be too hard to compress                    |
-| `ErrUseRLE`         | Returned from the compressor when the input is a single byte value repeated |
-| `ErrTooBig`         | Returned if the input block exceeds the maximum allowed size (128 Kib)      |
-| `(error)`           | An internal error occurred.                                                 |
-
-
-As can be seen above, some of these errors will be returned even under normal operation, so it is important to handle them.
-
-To reduce allocations you can provide a [`Scratch`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch) object 
-that can be re-used for successive calls. Both compression and decompression accept a `Scratch` object, and the same
-object can be used for both.   
-
-Be aware that when re-using a `Scratch` object, the *output* buffer is also re-used, so if you are still using this
-you must set the `Out` field in the scratch to nil. The same buffer is used for compression and decompression output.
-
-The `Scratch` object will retain state that allows re-using previous tables for encoding and decoding.
-
-## Tables and re-use
-
-Huff0 allows for reusing tables from the previous block to save space if that is expected to give better/faster results. 
-
-The Scratch object allows you to set a [`ReusePolicy`](https://godoc.org/github.com/klauspost/compress/huff0#ReusePolicy) 
-that controls this behaviour. See the documentation for details. This can be altered between each block.
-
-Do however note that this information is *not* stored in the output block and it is up to the users of the package to
-record whether [`ReadTable`](https://godoc.org/github.com/klauspost/compress/huff0#ReadTable) should be called,
-based on the boolean reported back from the CompressXX call. 
-
-If you want to store the table separate from the data, you can access them as `OutData` and `OutTable` on the 
-[`Scratch`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch) object.
-
-## Decompressing
-
-The first part of decoding is to initialize the decoding table through [`ReadTable`](https://godoc.org/github.com/klauspost/compress/huff0#ReadTable).
-This will initialize the decoding tables. 
-You can supply the complete block to `ReadTable` and it will return the data part of the block 
-which can be given to the decompressor. 
-
-Decompressing is done by calling the [`Decompress1X`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch.Decompress1X) 
-or [`Decompress4X`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch.Decompress4X) function.
-
-For concurrently decompressing content with a fixed table a stateless [`Decoder`](https://godoc.org/github.com/klauspost/compress/huff0#Decoder) can be requested which will remain correct as long as the scratch is unchanged. The capacity of the provided slice indicates the expected output size.
-
-You must provide the output from the compression stage, at exactly the size you got back. If you receive an error back
-your input was likely corrupted. 
-
-It is important to note that a successful decoding does *not* mean your output matches your original input. 
-There are no integrity checks, so relying on errors from the decompressor does not assure your data is valid.
-
-# Contributing
-
-Contributions are always welcome. Be aware that adding public functions will require good justification and breaking 
-changes will likely not be accepted. If in doubt open an issue before writing the PR.
diff --git a/vendor/github.com/klauspost/compress/huff0/bitreader.go b/vendor/github.com/klauspost/compress/huff0/bitreader.go
deleted file mode 100644
index e36d9742f..000000000
--- a/vendor/github.com/klauspost/compress/huff0/bitreader.go
+++ /dev/null
@@ -1,229 +0,0 @@
-// Copyright 2018 Klaus Post. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-// Based on work Copyright (c) 2013, Yann Collet, released under BSD License.
-
-package huff0
-
-import (
-	"encoding/binary"
-	"errors"
-	"fmt"
-	"io"
-)
-
-// bitReaderBytes reads a bitstream in reverse.
-// The last set bit indicates the start of the stream and is used
-// for aligning the input.
-type bitReaderBytes struct {
-	in       []byte
-	off      uint // next byte to read is at in[off - 1]
-	value    uint64
-	bitsRead uint8
-}
-
-// init initializes and resets the bit reader.
-func (b *bitReaderBytes) init(in []byte) error {
-	if len(in) < 1 {
-		return errors.New("corrupt stream: too short")
-	}
-	b.in = in
-	b.off = uint(len(in))
-	// The highest bit of the last byte indicates where to start
-	v := in[len(in)-1]
-	if v == 0 {
-		return errors.New("corrupt stream, did not find end of stream")
-	}
-	b.bitsRead = 64
-	b.value = 0
-	if len(in) >= 8 {
-		b.fillFastStart()
-	} else {
-		b.fill()
-		b.fill()
-	}
-	b.advance(8 - uint8(highBit32(uint32(v))))
-	return nil
-}
-
-// peekByteFast returns the top byte of the bit container without consuming it.
-// There are no checks if the buffer is filled.
-func (b *bitReaderBytes) peekByteFast() uint8 {
-	got := uint8(b.value >> 56)
-	return got
-}
-
-func (b *bitReaderBytes) advance(n uint8) {
-	b.bitsRead += n
-	b.value <<= n & 63
-}
-
-// fillFast() will make sure at least 32 bits are available.
-// There must be at least 4 bytes available.
-func (b *bitReaderBytes) fillFast() {
-	if b.bitsRead < 32 {
-		return
-	}
-
-	// 2 bounds checks.
-	v := b.in[b.off-4 : b.off]
-	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	b.value |= uint64(low) << (b.bitsRead - 32)
-	b.bitsRead -= 32
-	b.off -= 4
-}
-
-// fillFastStart() assumes the bitReaderBytes is empty and there are at least 8 bytes to read.
-func (b *bitReaderBytes) fillFastStart() {
-	// Do single re-slice to avoid bounds checks.
-	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
-	b.bitsRead = 0
-	b.off -= 8
-}
-
-// fill() will make sure at least 32 bits are available.
-func (b *bitReaderBytes) fill() {
-	if b.bitsRead < 32 {
-		return
-	}
-	if b.off > 4 {
-		v := b.in[b.off-4 : b.off]
-		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-		b.value |= uint64(low) << (b.bitsRead - 32)
-		b.bitsRead -= 32
-		b.off -= 4
-		return
-	}
-	for b.off > 0 {
-		b.value |= uint64(b.in[b.off-1]) << (b.bitsRead - 8)
-		b.bitsRead -= 8
-		b.off--
-	}
-}
-
-// finished returns true if all bits have been read from the bit stream.
-func (b *bitReaderBytes) finished() bool {
-	return b.off == 0 && b.bitsRead >= 64
-}
-
-func (b *bitReaderBytes) remaining() uint {
-	return b.off*8 + uint(64-b.bitsRead)
-}
-
-// close closes the bitstream and returns an error if bits remain unread or out-of-buffer reads occurred.
-func (b *bitReaderBytes) close() error {
-	// Release reference.
-	b.in = nil
-	if b.remaining() > 0 {
-		return fmt.Errorf("corrupt input: %d bits remain on stream", b.remaining())
-	}
-	if b.bitsRead > 64 {
-		return io.ErrUnexpectedEOF
-	}
-	return nil
-}
-
-// bitReaderShifted reads a bitstream in reverse.
-// The last set bit indicates the start of the stream and is used
-// for aligning the input.
-type bitReaderShifted struct {
-	in       []byte
-	off      uint // next byte to read is at in[off - 1]
-	value    uint64
-	bitsRead uint8
-}
-
-// init initializes and resets the bit reader.
-func (b *bitReaderShifted) init(in []byte) error {
-	if len(in) < 1 {
-		return errors.New("corrupt stream: too short")
-	}
-	b.in = in
-	b.off = uint(len(in))
-	// The highest bit of the last byte indicates where to start
-	v := in[len(in)-1]
-	if v == 0 {
-		return errors.New("corrupt stream, did not find end of stream")
-	}
-	b.bitsRead = 64
-	b.value = 0
-	if len(in) >= 8 {
-		b.fillFastStart()
-	} else {
-		b.fill()
-		b.fill()
-	}
-	b.advance(8 - uint8(highBit32(uint32(v))))
-	return nil
-}
-
-// peekBitsFast requires that at least one bit is requested every time.
-// There are no checks if the buffer is filled.
-func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 {
-	return uint16(b.value >> ((64 - n) & 63))
-}
-
-func (b *bitReaderShifted) advance(n uint8) {
-	b.bitsRead += n
-	b.value <<= n & 63
-}
-
-// fillFast() will make sure at least 32 bits are available.
-// There must be at least 4 bytes available.
-func (b *bitReaderShifted) fillFast() {
-	if b.bitsRead < 32 {
-		return
-	}
-
-	// 2 bounds checks.
-	v := b.in[b.off-4 : b.off]
-	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	b.value |= uint64(low) << ((b.bitsRead - 32) & 63)
-	b.bitsRead -= 32
-	b.off -= 4
-}
-
-// fillFastStart() assumes the bitReaderShifted is empty and there are at least 8 bytes to read.
-func (b *bitReaderShifted) fillFastStart() {
-	// Do single re-slice to avoid bounds checks.
-	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
-	b.bitsRead = 0
-	b.off -= 8
-}
-
-// fill() will make sure at least 32 bits are available.
-func (b *bitReaderShifted) fill() {
-	if b.bitsRead < 32 {
-		return
-	}
-	if b.off > 4 {
-		v := b.in[b.off-4 : b.off]
-		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-		b.value |= uint64(low) << ((b.bitsRead - 32) & 63)
-		b.bitsRead -= 32
-		b.off -= 4
-		return
-	}
-	for b.off > 0 {
-		b.value |= uint64(b.in[b.off-1]) << ((b.bitsRead - 8) & 63)
-		b.bitsRead -= 8
-		b.off--
-	}
-}
-
-func (b *bitReaderShifted) remaining() uint {
-	return b.off*8 + uint(64-b.bitsRead)
-}
-
-// close closes the bitstream and returns an error if bits remain unread or out-of-buffer reads occurred.
-func (b *bitReaderShifted) close() error {
-	// Release reference.
-	b.in = nil
-	if b.remaining() > 0 {
-		return fmt.Errorf("corrupt input: %d bits remain on stream", b.remaining())
-	}
-	if b.bitsRead > 64 {
-		return io.ErrUnexpectedEOF
-	}
-	return nil
-}
diff --git a/vendor/github.com/klauspost/compress/huff0/bitwriter.go b/vendor/github.com/klauspost/compress/huff0/bitwriter.go
deleted file mode 100644
index 0ebc9aaac..000000000
--- a/vendor/github.com/klauspost/compress/huff0/bitwriter.go
+++ /dev/null
@@ -1,102 +0,0 @@
-// Copyright 2018 Klaus Post. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-// Based on work Copyright (c) 2013, Yann Collet, released under BSD License.
-
-package huff0
-
-// bitWriter will write bits.
-// First bit will be LSB of the first byte of output.
-type bitWriter struct {
-	bitContainer uint64
-	nBits        uint8
-	out          []byte
-}
-
-// addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
-// It will not check if there is space for them, so the caller must ensure that it has flushed recently.
-func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
-	b.bitContainer |= uint64(value) << (b.nBits & 63)
-	b.nBits += bits
-}
-
-// encSymbol will add up to 16 bits. value may not contain more set bits than indicated.
-// It will not check if there is space for them, so the caller must ensure that it has flushed recently.
-func (b *bitWriter) encSymbol(ct cTable, symbol byte) {
-	enc := ct[symbol]
-	b.bitContainer |= uint64(enc.val) << (b.nBits & 63)
-	if false {
-		if enc.nBits == 0 {
-			panic("nbits 0")
-		}
-	}
-	b.nBits += enc.nBits
-}
-
-// encTwoSymbols will add up to 32 bits. value may not contain more set bits than indicated.
-// It will not check if there is space for them, so the caller must ensure that it has flushed recently.
-func (b *bitWriter) encTwoSymbols(ct cTable, av, bv byte) {
-	encA := ct[av]
-	encB := ct[bv]
-	sh := b.nBits & 63
-	combined := uint64(encA.val) | (uint64(encB.val) << (encA.nBits & 63))
-	b.bitContainer |= combined << sh
-	if false {
-		if encA.nBits == 0 {
-			panic("nbitsA 0")
-		}
-		if encB.nBits == 0 {
-			panic("nbitsB 0")
-		}
-	}
-	b.nBits += encA.nBits + encB.nBits
-}
-
-// encFourSymbols adds up to 32 bits from four symbols.
-// It will not check if there is space for them,
-// so the caller must ensure that b has been flushed recently.
-func (b *bitWriter) encFourSymbols(encA, encB, encC, encD cTableEntry) {
-	bitsA := encA.nBits
-	bitsB := bitsA + encB.nBits
-	bitsC := bitsB + encC.nBits
-	bitsD := bitsC + encD.nBits
-	combined := uint64(encA.val) |
-		(uint64(encB.val) << (bitsA & 63)) |
-		(uint64(encC.val) << (bitsB & 63)) |
-		(uint64(encD.val) << (bitsC & 63))
-	b.bitContainer |= combined << (b.nBits & 63)
-	b.nBits += bitsD
-}
-
-// flush32 will flush out, so there are at least 32 bits available for writing.
-func (b *bitWriter) flush32() {
-	if b.nBits < 32 {
-		return
-	}
-	b.out = append(b.out,
-		byte(b.bitContainer),
-		byte(b.bitContainer>>8),
-		byte(b.bitContainer>>16),
-		byte(b.bitContainer>>24))
-	b.nBits -= 32
-	b.bitContainer >>= 32
-}
-
-// flushAlign will flush remaining full bytes and align to next byte boundary.
-func (b *bitWriter) flushAlign() {
-	nbBytes := (b.nBits + 7) >> 3
-	for i := uint8(0); i < nbBytes; i++ {
-		b.out = append(b.out, byte(b.bitContainer>>(i*8)))
-	}
-	b.nBits = 0
-	b.bitContainer = 0
-}
-
-// close will write the alignment bit and write the final byte(s)
-// to the output.
-func (b *bitWriter) close() {
-	// End mark
-	b.addBits16Clean(1, 1)
-	// flush until next byte.
-	b.flushAlign()
-}
diff --git a/vendor/github.com/klauspost/compress/huff0/compress.go b/vendor/github.com/klauspost/compress/huff0/compress.go
deleted file mode 100644
index 84aa3d12f..000000000
--- a/vendor/github.com/klauspost/compress/huff0/compress.go
+++ /dev/null
@@ -1,742 +0,0 @@
-package huff0
-
-import (
-	"fmt"
-	"math"
-	"runtime"
-	"sync"
-)
-
-// Compress1X will compress the input.
-// The output can be decoded using Decompress1X.
-// Supply a Scratch object. The scratch object contains state about re-use,
-// so when sharing across independent encodes, be sure to set the re-use policy.
-func Compress1X(in []byte, s *Scratch) (out []byte, reUsed bool, err error) {
-	s, err = s.prepare(in)
-	if err != nil {
-		return nil, false, err
-	}
-	return compress(in, s, s.compress1X)
-}
-
-// Compress4X will compress the input. The input is split into 4 independent blocks
-// and compressed similar to Compress1X.
-// The output can be decoded using Decompress4X.
-// Supply a Scratch object. The scratch object contains state about re-use,
-// so when sharing across independent encodes, be sure to set the re-use policy.
-func Compress4X(in []byte, s *Scratch) (out []byte, reUsed bool, err error) {
-	s, err = s.prepare(in)
-	if err != nil {
-		return nil, false, err
-	}
-	if false {
-		// TODO: compress4Xp only slightly faster.
-		const parallelThreshold = 8 << 10
-		if len(in) < parallelThreshold || runtime.GOMAXPROCS(0) == 1 {
-			return compress(in, s, s.compress4X)
-		}
-		return compress(in, s, s.compress4Xp)
-	}
-	return compress(in, s, s.compress4X)
-}
-
-func compress(in []byte, s *Scratch, compressor func(src []byte) ([]byte, error)) (out []byte, reUsed bool, err error) {
-	// Nuke previous table if we cannot reuse anyway.
-	if s.Reuse == ReusePolicyNone {
-		s.prevTable = s.prevTable[:0]
-	}
-
-	// Create histogram, if none was provided.
-	maxCount := s.maxCount
-	var canReuse = false
-	if maxCount == 0 {
-		maxCount, canReuse = s.countSimple(in)
-	} else {
-		canReuse = s.canUseTable(s.prevTable)
-	}
-
-	// We want the output size to be less than this:
-	wantSize := len(in)
-	if s.WantLogLess > 0 {
-		wantSize -= wantSize >> s.WantLogLess
-	}
-
-	// Reset for next run.
-	s.clearCount = true
-	s.maxCount = 0
-	if maxCount >= len(in) {
-		if maxCount > len(in) {
-			return nil, false, fmt.Errorf("maxCount (%d) > length (%d)", maxCount, len(in))
-		}
-		if len(in) == 1 {
-			return nil, false, ErrIncompressible
-		}
-		// One symbol, use RLE
-		return nil, false, ErrUseRLE
-	}
-	if maxCount == 1 || maxCount < (len(in)>>7) {
-		// Each symbol present maximum once or too well distributed.
-		return nil, false, ErrIncompressible
-	}
-	if s.Reuse == ReusePolicyMust && !canReuse {
-		// We must reuse, but we can't.
-		return nil, false, ErrIncompressible
-	}
-	if (s.Reuse == ReusePolicyPrefer || s.Reuse == ReusePolicyMust) && canReuse {
-		keepTable := s.cTable
-		keepTL := s.actualTableLog
-		s.cTable = s.prevTable
-		s.actualTableLog = s.prevTableLog
-		s.Out, err = compressor(in)
-		s.cTable = keepTable
-		s.actualTableLog = keepTL
-		if err == nil && len(s.Out) < wantSize {
-			s.OutData = s.Out
-			return s.Out, true, nil
-		}
-		if s.Reuse == ReusePolicyMust {
-			return nil, false, ErrIncompressible
-		}
-		// Do not attempt to re-use later.
-		s.prevTable = s.prevTable[:0]
-	}
-
-	// Calculate new table.
-	err = s.buildCTable()
-	if err != nil {
-		return nil, false, err
-	}
-
-	if false && !s.canUseTable(s.cTable) {
-		panic("invalid table generated")
-	}
-
-	if s.Reuse == ReusePolicyAllow && canReuse {
-		hSize := len(s.Out)
-		oldSize := s.prevTable.estimateSize(s.count[:s.symbolLen])
-		newSize := s.cTable.estimateSize(s.count[:s.symbolLen])
-		if oldSize <= hSize+newSize || hSize+12 >= wantSize {
-			// Retain cTable even if we re-use.
-			keepTable := s.cTable
-			keepTL := s.actualTableLog
-
-			s.cTable = s.prevTable
-			s.actualTableLog = s.prevTableLog
-			s.Out, err = compressor(in)
-
-			// Restore ctable.
-			s.cTable = keepTable
-			s.actualTableLog = keepTL
-			if err != nil {
-				return nil, false, err
-			}
-			if len(s.Out) >= wantSize {
-				return nil, false, ErrIncompressible
-			}
-			s.OutData = s.Out
-			return s.Out, true, nil
-		}
-	}
-
-	// Use new table
-	err = s.cTable.write(s)
-	if err != nil {
-		s.OutTable = nil
-		return nil, false, err
-	}
-	s.OutTable = s.Out
-
-	// Compress using new table
-	s.Out, err = compressor(in)
-	if err != nil {
-		s.OutTable = nil
-		return nil, false, err
-	}
-	if len(s.Out) >= wantSize {
-		s.OutTable = nil
-		return nil, false, ErrIncompressible
-	}
-	// Move current table into previous.
-	s.prevTable, s.prevTableLog, s.cTable = s.cTable, s.actualTableLog, s.prevTable[:0]
-	s.OutData = s.Out[len(s.OutTable):]
-	return s.Out, false, nil
-}
-
-// EstimateSizes will estimate the data sizes
-func EstimateSizes(in []byte, s *Scratch) (tableSz, dataSz, reuseSz int, err error) {
-	s, err = s.prepare(in)
-	if err != nil {
-		return 0, 0, 0, err
-	}
-
-	// Create histogram, if none was provided.
-	tableSz, dataSz, reuseSz = -1, -1, -1
-	maxCount := s.maxCount
-	var canReuse = false
-	if maxCount == 0 {
-		maxCount, canReuse = s.countSimple(in)
-	} else {
-		canReuse = s.canUseTable(s.prevTable)
-	}
-
-	// We want the output size to be less than this:
-	wantSize := len(in)
-	if s.WantLogLess > 0 {
-		wantSize -= wantSize >> s.WantLogLess
-	}
-
-	// Reset for next run.
-	s.clearCount = true
-	s.maxCount = 0
-	if maxCount >= len(in) {
-		if maxCount > len(in) {
-			return 0, 0, 0, fmt.Errorf("maxCount (%d) > length (%d)", maxCount, len(in))
-		}
-		if len(in) == 1 {
-			return 0, 0, 0, ErrIncompressible
-		}
-		// One symbol, use RLE
-		return 0, 0, 0, ErrUseRLE
-	}
-	if maxCount == 1 || maxCount < (len(in)>>7) {
-		// Each symbol present maximum once or too well distributed.
-		return 0, 0, 0, ErrIncompressible
-	}
-
-	// Calculate new table.
-	err = s.buildCTable()
-	if err != nil {
-		return 0, 0, 0, err
-	}
-
-	if false && !s.canUseTable(s.cTable) {
-		panic("invalid table generated")
-	}
-
-	tableSz, err = s.cTable.estTableSize(s)
-	if err != nil {
-		return 0, 0, 0, err
-	}
-	if canReuse {
-		reuseSz = s.prevTable.estimateSize(s.count[:s.symbolLen])
-	}
-	dataSz = s.cTable.estimateSize(s.count[:s.symbolLen])
-
-	// Restore
-	return tableSz, dataSz, reuseSz, nil
-}
-
-func (s *Scratch) compress1X(src []byte) ([]byte, error) {
-	return s.compress1xDo(s.Out, src), nil
-}
-
-func (s *Scratch) compress1xDo(dst, src []byte) []byte {
-	var bw = bitWriter{out: dst}
-
-	// n is the length rounded down to a multiple of 4.
-	n := len(src)
-	n -= n & 3
-	cTable := s.cTable[:256]
-
-	// Encode last bytes.
-	for i := len(src) & 3; i > 0; i-- {
-		bw.encSymbol(cTable, src[n+i-1])
-	}
-	n -= 4
-	if s.actualTableLog <= 8 {
-		for ; n >= 0; n -= 4 {
-			tmp := src[n : n+4]
-			// tmp should be len 4
-			bw.flush32()
-			bw.encFourSymbols(cTable[tmp[3]], cTable[tmp[2]], cTable[tmp[1]], cTable[tmp[0]])
-		}
-	} else {
-		for ; n >= 0; n -= 4 {
-			tmp := src[n : n+4]
-			// tmp should be len 4
-			bw.flush32()
-			bw.encTwoSymbols(cTable, tmp[3], tmp[2])
-			bw.flush32()
-			bw.encTwoSymbols(cTable, tmp[1], tmp[0])
-		}
-	}
-	bw.close()
-	return bw.out
-}
-
-var sixZeros [6]byte
-
-func (s *Scratch) compress4X(src []byte) ([]byte, error) {
-	if len(src) < 12 {
-		return nil, ErrIncompressible
-	}
-	segmentSize := (len(src) + 3) / 4
-
-	// Add placeholder for output length
-	offsetIdx := len(s.Out)
-	s.Out = append(s.Out, sixZeros[:]...)
-
-	for i := 0; i < 4; i++ {
-		toDo := src
-		if len(toDo) > segmentSize {
-			toDo = toDo[:segmentSize]
-		}
-		src = src[len(toDo):]
-
-		idx := len(s.Out)
-		s.Out = s.compress1xDo(s.Out, toDo)
-		if len(s.Out)-idx > math.MaxUint16 {
-			// We cannot store the size in the jump table
-			return nil, ErrIncompressible
-		}
-		// Write compressed length as little endian before block.
-		if i < 3 {
-			// Last length is not written.
-			length := len(s.Out) - idx
-			s.Out[i*2+offsetIdx] = byte(length)
-			s.Out[i*2+offsetIdx+1] = byte(length >> 8)
-		}
-	}
-
-	return s.Out, nil
-}
-
-// compress4Xp will compress 4 streams using separate goroutines.
-func (s *Scratch) compress4Xp(src []byte) ([]byte, error) {
-	if len(src) < 12 {
-		return nil, ErrIncompressible
-	}
-	// Add placeholder for output length
-	s.Out = s.Out[:6]
-
-	segmentSize := (len(src) + 3) / 4
-	var wg sync.WaitGroup
-	wg.Add(4)
-	for i := 0; i < 4; i++ {
-		toDo := src
-		if len(toDo) > segmentSize {
-			toDo = toDo[:segmentSize]
-		}
-		src = src[len(toDo):]
-
-		// Separate goroutine for each block.
-		go func(i int) {
-			s.tmpOut[i] = s.compress1xDo(s.tmpOut[i][:0], toDo)
-			wg.Done()
-		}(i)
-	}
-	wg.Wait()
-	for i := 0; i < 4; i++ {
-		o := s.tmpOut[i]
-		if len(o) > math.MaxUint16 {
-			// We cannot store the size in the jump table
-			return nil, ErrIncompressible
-		}
-		// Write compressed length as little endian before block.
-		if i < 3 {
-			// Last length is not written.
-			s.Out[i*2] = byte(len(o))
-			s.Out[i*2+1] = byte(len(o) >> 8)
-		}
-
-		// Write output.
-		s.Out = append(s.Out, o...)
-	}
-	return s.Out, nil
-}
-
-// countSimple will create a simple histogram in s.count.
-// Returns the biggest count and whether the previous table has a code for every symbol seen.
-// Does not update s.clearCount.
-func (s *Scratch) countSimple(in []byte) (max int, reuse bool) {
-	reuse = true
-	_ = s.count // Assert that s != nil to speed up the following loop.
-	for _, v := range in {
-		s.count[v]++
-	}
-	m := uint32(0)
-	if len(s.prevTable) > 0 {
-		for i, v := range s.count[:] {
-			if v == 0 {
-				continue
-			}
-			if v > m {
-				m = v
-			}
-			s.symbolLen = uint16(i) + 1
-			if i >= len(s.prevTable) {
-				reuse = false
-			} else if s.prevTable[i].nBits == 0 {
-				reuse = false
-			}
-		}
-		return int(m), reuse
-	}
-	for i, v := range s.count[:] {
-		if v == 0 {
-			continue
-		}
-		if v > m {
-			m = v
-		}
-		s.symbolLen = uint16(i) + 1
-	}
-	return int(m), false
-}
-
-func (s *Scratch) canUseTable(c cTable) bool {
-	if len(c) < int(s.symbolLen) {
-		return false
-	}
-	for i, v := range s.count[:s.symbolLen] {
-		if v != 0 && c[i].nBits == 0 {
-			return false
-		}
-	}
-	return true
-}
-
-//lint:ignore U1000 used for debugging
-func (s *Scratch) validateTable(c cTable) bool {
-	if len(c) < int(s.symbolLen) {
-		return false
-	}
-	for i, v := range s.count[:s.symbolLen] {
-		if v != 0 {
-			if c[i].nBits == 0 {
-				return false
-			}
-			if c[i].nBits > s.actualTableLog {
-				return false
-			}
-		}
-	}
-	return true
-}
-
-// minTableLog provides the minimum logSize to safely represent a distribution.
-func (s *Scratch) minTableLog() uint8 {
-	minBitsSrc := highBit32(uint32(s.srcLen)) + 1
-	minBitsSymbols := highBit32(uint32(s.symbolLen-1)) + 2
-	if minBitsSrc < minBitsSymbols {
-		return uint8(minBitsSrc)
-	}
-	return uint8(minBitsSymbols)
-}
-
-// optimalTableLog calculates and sets the optimal tableLog in s.actualTableLog
-func (s *Scratch) optimalTableLog() {
-	tableLog := s.TableLog
-	minBits := s.minTableLog()
-	maxBitsSrc := uint8(highBit32(uint32(s.srcLen-1))) - 1
-	if maxBitsSrc < tableLog {
-		// Accuracy can be reduced
-		tableLog = maxBitsSrc
-	}
-	if minBits > tableLog {
-		tableLog = minBits
-	}
-	// Need a minimum to safely represent all symbol values
-	if tableLog < minTablelog {
-		tableLog = minTablelog
-	}
-	if tableLog > tableLogMax {
-		tableLog = tableLogMax
-	}
-	s.actualTableLog = tableLog
-}
-
-type cTableEntry struct {
-	val   uint16
-	nBits uint8
-	// We have 8 bits extra
-}
-
-const huffNodesMask = huffNodesLen - 1
-
-func (s *Scratch) buildCTable() error {
-	s.optimalTableLog()
-	s.huffSort()
-	if cap(s.cTable) < maxSymbolValue+1 {
-		s.cTable = make([]cTableEntry, s.symbolLen, maxSymbolValue+1)
-	} else {
-		s.cTable = s.cTable[:s.symbolLen]
-		for i := range s.cTable {
-			s.cTable[i] = cTableEntry{}
-		}
-	}
-
-	var startNode = int16(s.symbolLen)
-	nonNullRank := s.symbolLen - 1
-
-	nodeNb := startNode
-	huffNode := s.nodes[1 : huffNodesLen+1]
-
-	// This overlays the slice above, but allows "-1" index lookups.
-	// Different from reference implementation.
-	huffNode0 := s.nodes[0 : huffNodesLen+1]
-
-	for huffNode[nonNullRank].count() == 0 {
-		nonNullRank--
-	}
-
-	lowS := int16(nonNullRank)
-	nodeRoot := nodeNb + lowS - 1
-	lowN := nodeNb
-	huffNode[nodeNb].setCount(huffNode[lowS].count() + huffNode[lowS-1].count())
-	huffNode[lowS].setParent(nodeNb)
-	huffNode[lowS-1].setParent(nodeNb)
-	nodeNb++
-	lowS -= 2
-	for n := nodeNb; n <= nodeRoot; n++ {
-		huffNode[n].setCount(1 << 30)
-	}
-	// fake entry, strong barrier
-	huffNode0[0].setCount(1 << 31)
-
-	// create parents
-	for nodeNb <= nodeRoot {
-		var n1, n2 int16
-		if huffNode0[lowS+1].count() < huffNode0[lowN+1].count() {
-			n1 = lowS
-			lowS--
-		} else {
-			n1 = lowN
-			lowN++
-		}
-		if huffNode0[lowS+1].count() < huffNode0[lowN+1].count() {
-			n2 = lowS
-			lowS--
-		} else {
-			n2 = lowN
-			lowN++
-		}
-
-		huffNode[nodeNb].setCount(huffNode0[n1+1].count() + huffNode0[n2+1].count())
-		huffNode0[n1+1].setParent(nodeNb)
-		huffNode0[n2+1].setParent(nodeNb)
-		nodeNb++
-	}
-
-	// distribute weights (unlimited tree height)
-	huffNode[nodeRoot].setNbBits(0)
-	for n := nodeRoot - 1; n >= startNode; n-- {
-		huffNode[n].setNbBits(huffNode[huffNode[n].parent()].nbBits() + 1)
-	}
-	for n := uint16(0); n <= nonNullRank; n++ {
-		huffNode[n].setNbBits(huffNode[huffNode[n].parent()].nbBits() + 1)
-	}
-	s.actualTableLog = s.setMaxHeight(int(nonNullRank))
-	maxNbBits := s.actualTableLog
-
-	// fill result into tree (val, nbBits)
-	if maxNbBits > tableLogMax {
-		return fmt.Errorf("internal error: maxNbBits (%d) > tableLogMax (%d)", maxNbBits, tableLogMax)
-	}
-	var nbPerRank [tableLogMax + 1]uint16
-	var valPerRank [16]uint16
-	for _, v := range huffNode[:nonNullRank+1] {
-		nbPerRank[v.nbBits()]++
-	}
-	// determine starting value per rank
-	{
-		min := uint16(0)
-		for n := maxNbBits; n > 0; n-- {
-			// get starting value within each rank
-			valPerRank[n] = min
-			min += nbPerRank[n]
-			min >>= 1
-		}
-	}
-
-	// push nbBits per symbol, symbol order
-	for _, v := range huffNode[:nonNullRank+1] {
-		s.cTable[v.symbol()].nBits = v.nbBits()
-	}
-
-	// assign value within rank, symbol order
-	t := s.cTable[:s.symbolLen]
-	for n, val := range t {
-		nbits := val.nBits & 15
-		v := valPerRank[nbits]
-		t[n].val = v
-		valPerRank[nbits] = v + 1
-	}
-
-	return nil
-}
-
-// huffSort will sort symbols, decreasing order.
-func (s *Scratch) huffSort() {
-	type rankPos struct {
-		base    uint32
-		current uint32
-	}
-
-	// Clear nodes
-	nodes := s.nodes[:huffNodesLen+1]
-	s.nodes = nodes
-	nodes = nodes[1 : huffNodesLen+1]
-
-	// Sort into buckets based on length of symbol count.
-	var rank [32]rankPos
-	for _, v := range s.count[:s.symbolLen] {
-		r := highBit32(v+1) & 31
-		rank[r].base++
-	}
-	// maxBitLength is log2(BlockSizeMax) + 1
-	const maxBitLength = 18 + 1
-	for n := maxBitLength; n > 0; n-- {
-		rank[n-1].base += rank[n].base
-	}
-	for n := range rank[:maxBitLength] {
-		rank[n].current = rank[n].base
-	}
-	for n, c := range s.count[:s.symbolLen] {
-		r := (highBit32(c+1) + 1) & 31
-		pos := rank[r].current
-		rank[r].current++
-		prev := nodes[(pos-1)&huffNodesMask]
-		for pos > rank[r].base && c > prev.count() {
-			nodes[pos&huffNodesMask] = prev
-			pos--
-			prev = nodes[(pos-1)&huffNodesMask]
-		}
-		nodes[pos&huffNodesMask] = makeNodeElt(c, byte(n))
-	}
-}
-
-func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
-	maxNbBits := s.actualTableLog
-	huffNode := s.nodes[1 : huffNodesLen+1]
-	//huffNode = huffNode[: huffNodesLen]
-
-	largestBits := huffNode[lastNonNull].nbBits()
-
-	// early exit : no elt > maxNbBits
-	if largestBits <= maxNbBits {
-		return largestBits
-	}
-	totalCost := int(0)
-	baseCost := int(1) << (largestBits - maxNbBits)
-	n := uint32(lastNonNull)
-
-	for huffNode[n].nbBits() > maxNbBits {
-		totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits()))
-		huffNode[n].setNbBits(maxNbBits)
-		n--
-	}
-	// n stops at huffNode[n].nbBits <= maxNbBits
-
-	for huffNode[n].nbBits() == maxNbBits {
-		n--
-	}
-	// n ends at index of smallest symbol using < maxNbBits
-
-	// renorm totalCost
-	totalCost >>= largestBits - maxNbBits /* note : totalCost is necessarily a multiple of baseCost */
-
-	// repay normalized cost
-	{
-		const noSymbol = 0xF0F0F0F0
-		var rankLast [tableLogMax + 2]uint32
-
-		for i := range rankLast[:] {
-			rankLast[i] = noSymbol
-		}
-
-		// Get pos of last (smallest) symbol per rank
-		{
-			currentNbBits := maxNbBits
-			for pos := int(n); pos >= 0; pos-- {
-				if huffNode[pos].nbBits() >= currentNbBits {
-					continue
-				}
-				currentNbBits = huffNode[pos].nbBits() // < maxNbBits
-				rankLast[maxNbBits-currentNbBits] = uint32(pos)
-			}
-		}
-
-		for totalCost > 0 {
-			nBitsToDecrease := uint8(highBit32(uint32(totalCost))) + 1
-
-			for ; nBitsToDecrease > 1; nBitsToDecrease-- {
-				highPos := rankLast[nBitsToDecrease]
-				lowPos := rankLast[nBitsToDecrease-1]
-				if highPos == noSymbol {
-					continue
-				}
-				if lowPos == noSymbol {
-					break
-				}
-				highTotal := huffNode[highPos].count()
-				lowTotal := 2 * huffNode[lowPos].count()
-				if highTotal <= lowTotal {
-					break
-				}
-			}
-			// only triggered when no more rank 1 symbol left => find closest one (note : there is necessarily at least one !)
-			// HUF_MAX_TABLELOG test just to please gcc 5+; but it should not be necessary
-			// FIXME: try to remove
-			for (nBitsToDecrease <= tableLogMax) && (rankLast[nBitsToDecrease] == noSymbol) {
-				nBitsToDecrease++
-			}
-			totalCost -= 1 << (nBitsToDecrease - 1)
-			if rankLast[nBitsToDecrease-1] == noSymbol {
-				// this rank is no longer empty
-				rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease]
-			}
-			huffNode[rankLast[nBitsToDecrease]].setNbBits(1 +
-				huffNode[rankLast[nBitsToDecrease]].nbBits())
-			if rankLast[nBitsToDecrease] == 0 {
-				/* special case, reached largest symbol */
-				rankLast[nBitsToDecrease] = noSymbol
-			} else {
-				rankLast[nBitsToDecrease]--
-				if huffNode[rankLast[nBitsToDecrease]].nbBits() != maxNbBits-nBitsToDecrease {
-					rankLast[nBitsToDecrease] = noSymbol /* this rank is now empty */
-				}
-			}
-		}
-
-		for totalCost < 0 { /* Sometimes, cost correction overshoot */
-			if rankLast[1] == noSymbol { /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 (using maxNbBits) */
-				for huffNode[n].nbBits() == maxNbBits {
-					n--
-				}
-				huffNode[n+1].setNbBits(huffNode[n+1].nbBits() - 1)
-				rankLast[1] = n + 1
-				totalCost++
-				continue
-			}
-			huffNode[rankLast[1]+1].setNbBits(huffNode[rankLast[1]+1].nbBits() - 1)
-			rankLast[1]++
-			totalCost++
-		}
-	}
-	return maxNbBits
-}
-
-// A nodeElt is the fields
-//
-//	count  uint32
-//	parent uint16
-//	symbol byte
-//	nbBits uint8
-//
-// in some order, all squashed into an integer so that the compiler
-// always loads and stores entire nodeElts instead of separate fields.
-type nodeElt uint64
-
-func makeNodeElt(count uint32, symbol byte) nodeElt {
-	return nodeElt(count) | nodeElt(symbol)<<48
-}
-
-func (e *nodeElt) count() uint32  { return uint32(*e) }
-func (e *nodeElt) parent() uint16 { return uint16(*e >> 32) }
-func (e *nodeElt) symbol() byte   { return byte(*e >> 48) }
-func (e *nodeElt) nbBits() uint8  { return uint8(*e >> 56) }
-
-func (e *nodeElt) setCount(c uint32) { *e = (*e)&0xffffffff00000000 | nodeElt(c) }
-func (e *nodeElt) setParent(p int16) { *e = (*e)&0xffff0000ffffffff | nodeElt(uint16(p))<<32 }
-func (e *nodeElt) setNbBits(n uint8) { *e = (*e)&0x00ffffffffffffff | nodeElt(n)<<56 }
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go
deleted file mode 100644
index 0f56b02d7..000000000
--- a/vendor/github.com/klauspost/compress/huff0/decompress.go
+++ /dev/null
@@ -1,1167 +0,0 @@
-package huff0
-
-import (
-	"errors"
-	"fmt"
-	"io"
-	"sync"
-
-	"github.com/klauspost/compress/fse"
-)
-
-type dTable struct {
-	single []dEntrySingle
-}
-
-// single-symbols decoding
-type dEntrySingle struct {
-	entry uint16
-}
-
-// Uses special code for all tables that are < 8 bits.
-const use8BitTables = true
-
-// ReadTable will read a table from the input.
-// The size of the input may be larger than the table definition.
-// Any content remaining after the table definition will be returned.
-// If no Scratch is provided a new one is allocated.
-// The returned Scratch can be used for encoding or decoding input using this table.
-func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
-	s, err = s.prepare(nil)
-	if err != nil {
-		return s, nil, err
-	}
-	if len(in) <= 1 {
-		return s, nil, errors.New("input too small for table")
-	}
-	iSize := in[0]
-	in = in[1:]
-	if iSize >= 128 {
-		// Uncompressed
-		oSize := iSize - 127
-		iSize = (oSize + 1) / 2
-		if int(iSize) > len(in) {
-			return s, nil, errors.New("input too small for table")
-		}
-		for n := uint8(0); n < oSize; n += 2 {
-			v := in[n/2]
-			s.huffWeight[n] = v >> 4
-			s.huffWeight[n+1] = v & 15
-		}
-		s.symbolLen = uint16(oSize)
-		in = in[iSize:]
-	} else {
-		if len(in) < int(iSize) {
-			return s, nil, fmt.Errorf("input too small for table, want %d bytes, have %d", iSize, len(in))
-		}
-		// FSE compressed weights
-		s.fse.DecompressLimit = 255
-		hw := s.huffWeight[:]
-		s.fse.Out = hw
-		b, err := fse.Decompress(in[:iSize], s.fse)
-		s.fse.Out = nil
-		if err != nil {
-			return s, nil, fmt.Errorf("fse decompress returned: %w", err)
-		}
-		if len(b) > 255 {
-			return s, nil, errors.New("corrupt input: output table too large")
-		}
-		s.symbolLen = uint16(len(b))
-		in = in[iSize:]
-	}
-
-	// collect weight stats
-	var rankStats [16]uint32
-	weightTotal := uint32(0)
-	for _, v := range s.huffWeight[:s.symbolLen] {
-		if v > tableLogMax {
-			return s, nil, errors.New("corrupt input: weight too large")
-		}
-		v2 := v & 15
-		rankStats[v2]++
-		// (1 << (v2-1)) is slower since the compiler cannot prove that v2 isn't 0.
-		weightTotal += (1 << v2) >> 1
-	}
-	if weightTotal == 0 {
-		return s, nil, errors.New("corrupt input: weights zero")
-	}
-
-	// get last non-null symbol weight (implied, total must be 2^n)
-	{
-		tableLog := highBit32(weightTotal) + 1
-		if tableLog > tableLogMax {
-			return s, nil, errors.New("corrupt input: tableLog too big")
-		}
-		s.actualTableLog = uint8(tableLog)
-		// determine last weight
-		{
-			total := uint32(1) << tableLog
-			rest := total - weightTotal
-			verif := uint32(1) << highBit32(rest)
-			lastWeight := highBit32(rest) + 1
-			if verif != rest {
-				// last value must be a clean power of 2
-				return s, nil, errors.New("corrupt input: last value not power of two")
-			}
-			s.huffWeight[s.symbolLen] = uint8(lastWeight)
-			s.symbolLen++
-			rankStats[lastWeight]++
-		}
-	}
-
-	if (rankStats[1] < 2) || (rankStats[1]&1 != 0) {
-		// by construction : at least 2 elts of rank 1, must be even
-		return s, nil, errors.New("corrupt input: min elt size, even check failed ")
-	}
-
-	// TODO: Choose between single/double symbol decoding
-
-	// Calculate starting value for each rank
-	{
-		var nextRankStart uint32
-		for n := uint8(1); n < s.actualTableLog+1; n++ {
-			current := nextRankStart
-			nextRankStart += rankStats[n] << (n - 1)
-			rankStats[n] = current
-		}
-	}
-
-	// fill DTable (always full size)
-	tSize := 1 << tableLogMax
-	if len(s.dt.single) != tSize {
-		s.dt.single = make([]dEntrySingle, tSize)
-	}
-	cTable := s.prevTable
-	if cap(cTable) < maxSymbolValue+1 {
-		cTable = make([]cTableEntry, 0, maxSymbolValue+1)
-	}
-	cTable = cTable[:maxSymbolValue+1]
-	s.prevTable = cTable[:s.symbolLen]
-	s.prevTableLog = s.actualTableLog
-
-	for n, w := range s.huffWeight[:s.symbolLen] {
-		if w == 0 {
-			cTable[n] = cTableEntry{
-				val:   0,
-				nBits: 0,
-			}
-			continue
-		}
-		length := (uint32(1) << w) >> 1
-		d := dEntrySingle{
-			entry: uint16(s.actualTableLog+1-w) | (uint16(n) << 8),
-		}
-
-		rank := &rankStats[w]
-		cTable[n] = cTableEntry{
-			val:   uint16(*rank >> (w - 1)),
-			nBits: uint8(d.entry),
-		}
-
-		single := s.dt.single[*rank : *rank+length]
-		for i := range single {
-			single[i] = d
-		}
-		*rank += length
-	}
-
-	return s, in, nil
-}
-
-// Decompress1X will decompress a 1X encoded stream.
-// The length of the supplied input must match the end of a block exactly.
-// Before this is called, the table must be initialized with ReadTable unless
-// the encoder re-used the table.
-// deprecated: Use the stateless Decoder() to get a concurrent version.
-func (s *Scratch) Decompress1X(in []byte) (out []byte, err error) {
-	if cap(s.Out) < s.MaxDecodedSize {
-		s.Out = make([]byte, s.MaxDecodedSize)
-	}
-	s.Out = s.Out[:0:s.MaxDecodedSize]
-	s.Out, err = s.Decoder().Decompress1X(s.Out, in)
-	return s.Out, err
-}
-
-// Decompress4X will decompress a 4X encoded stream.
-// Before this is called, the table must be initialized with ReadTable unless
-// the encoder re-used the table.
-// The length of the supplied input must match the end of a block exactly.
-// The destination size of the uncompressed data must be known and provided.
-// deprecated: Use the stateless Decoder() to get a concurrent version.
-func (s *Scratch) Decompress4X(in []byte, dstSize int) (out []byte, err error) {
-	if dstSize > s.MaxDecodedSize {
-		return nil, ErrMaxDecodedSizeExceeded
-	}
-	if cap(s.Out) < dstSize {
-		s.Out = make([]byte, s.MaxDecodedSize)
-	}
-	s.Out = s.Out[:0:dstSize]
-	s.Out, err = s.Decoder().Decompress4X(s.Out, in)
-	return s.Out, err
-}
-
-// Decoder will return a stateless decoder that can be used by multiple
-// decompressors concurrently.
-// Before this is called, the table must be initialized with ReadTable.
-// The Decoder is still linked to the scratch buffer so that cannot be reused.
-// However, it is safe to discard the scratch.
-func (s *Scratch) Decoder() *Decoder {
-	return &Decoder{
-		dt:             s.dt,
-		actualTableLog: s.actualTableLog,
-		bufs:           &s.decPool,
-	}
-}
-
-// Decoder provides stateless decoding.
-type Decoder struct {
-	dt             dTable
-	actualTableLog uint8
-	bufs           *sync.Pool
-}
-
-func (d *Decoder) buffer() *[4][256]byte {
-	buf, ok := d.bufs.Get().(*[4][256]byte)
-	if ok {
-		return buf
-	}
-	return &[4][256]byte{}
-}
-
-// decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8.
-// The cap of the output buffer will be the maximum decompressed size.
-// The length of the supplied input must match the end of a block exactly.
-func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
-	if d.actualTableLog == 8 {
-		return d.decompress1X8BitExactly(dst, src)
-	}
-	var br bitReaderBytes
-	err := br.init(src)
-	if err != nil {
-		return dst, err
-	}
-	maxDecodedSize := cap(dst)
-	dst = dst[:0]
-
-	// Avoid bounds check by always having full sized table.
-	dt := d.dt.single[:256]
-
-	// Use temp table to avoid bound checks/append penalty.
-	bufs := d.buffer()
-	buf := &bufs[0]
-	var off uint8
-
-	switch d.actualTableLog {
-	case 8:
-		const shift = 0
-		for br.off >= 4 {
-			br.fillFast()
-			v := dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+0] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+1] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+2] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+3] = uint8(v.entry >> 8)
-
-			off += 4
-			if off == 0 {
-				if len(dst)+256 > maxDecodedSize {
-					br.close()
-					d.bufs.Put(bufs)
-					return nil, ErrMaxDecodedSizeExceeded
-				}
-				dst = append(dst, buf[:]...)
-			}
-		}
-	case 7:
-		const shift = 8 - 7
-		for br.off >= 4 {
-			br.fillFast()
-			v := dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+0] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+1] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+2] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+3] = uint8(v.entry >> 8)
-
-			off += 4
-			if off == 0 {
-				if len(dst)+256 > maxDecodedSize {
-					br.close()
-					d.bufs.Put(bufs)
-					return nil, ErrMaxDecodedSizeExceeded
-				}
-				dst = append(dst, buf[:]...)
-			}
-		}
-	case 6:
-		const shift = 8 - 6
-		for br.off >= 4 {
-			br.fillFast()
-			v := dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+0] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+1] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+2] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+3] = uint8(v.entry >> 8)
-
-			off += 4
-			if off == 0 {
-				if len(dst)+256 > maxDecodedSize {
-					d.bufs.Put(bufs)
-					br.close()
-					return nil, ErrMaxDecodedSizeExceeded
-				}
-				dst = append(dst, buf[:]...)
-			}
-		}
-	case 5:
-		const shift = 8 - 5
-		for br.off >= 4 {
-			br.fillFast()
-			v := dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+0] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+1] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+2] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+3] = uint8(v.entry >> 8)
-
-			off += 4
-			if off == 0 {
-				if len(dst)+256 > maxDecodedSize {
-					d.bufs.Put(bufs)
-					br.close()
-					return nil, ErrMaxDecodedSizeExceeded
-				}
-				dst = append(dst, buf[:]...)
-			}
-		}
-	case 4:
-		const shift = 8 - 4
-		for br.off >= 4 {
-			br.fillFast()
-			v := dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+0] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+1] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+2] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+3] = uint8(v.entry >> 8)
-
-			off += 4
-			if off == 0 {
-				if len(dst)+256 > maxDecodedSize {
-					d.bufs.Put(bufs)
-					br.close()
-					return nil, ErrMaxDecodedSizeExceeded
-				}
-				dst = append(dst, buf[:]...)
-			}
-		}
-	case 3:
-		const shift = 8 - 3
-		for br.off >= 4 {
-			br.fillFast()
-			v := dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+0] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+1] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+2] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+3] = uint8(v.entry >> 8)
-
-			off += 4
-			if off == 0 {
-				if len(dst)+256 > maxDecodedSize {
-					d.bufs.Put(bufs)
-					br.close()
-					return nil, ErrMaxDecodedSizeExceeded
-				}
-				dst = append(dst, buf[:]...)
-			}
-		}
-	case 2:
-		const shift = 8 - 2
-		for br.off >= 4 {
-			br.fillFast()
-			v := dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+0] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+1] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+2] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+3] = uint8(v.entry >> 8)
-
-			off += 4
-			if off == 0 {
-				if len(dst)+256 > maxDecodedSize {
-					d.bufs.Put(bufs)
-					br.close()
-					return nil, ErrMaxDecodedSizeExceeded
-				}
-				dst = append(dst, buf[:]...)
-			}
-		}
-	case 1:
-		const shift = 8 - 1
-		for br.off >= 4 {
-			br.fillFast()
-			v := dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+0] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+1] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+2] = uint8(v.entry >> 8)
-
-			v = dt[uint8(br.value>>(56+shift))]
-			br.advance(uint8(v.entry))
-			buf[off+3] = uint8(v.entry >> 8)
-
-			off += 4
-			if off == 0 {
-				if len(dst)+256 > maxDecodedSize {
-					d.bufs.Put(bufs)
-					br.close()
-					return nil, ErrMaxDecodedSizeExceeded
-				}
-				dst = append(dst, buf[:]...)
-			}
-		}
-	default:
-		d.bufs.Put(bufs)
-		return nil, fmt.Errorf("invalid tablelog: %d", d.actualTableLog)
-	}
-
-	if len(dst)+int(off) > maxDecodedSize {
-		d.bufs.Put(bufs)
-		br.close()
-		return nil, ErrMaxDecodedSizeExceeded
-	}
-	dst = append(dst, buf[:off]...)
-
-	// br < 4, so uint8 is fine
-	bitsLeft := int8(uint8(br.off)*8 + (64 - br.bitsRead))
-	shift := (8 - d.actualTableLog) & 7
-
-	for bitsLeft > 0 {
-		if br.bitsRead >= 64-8 {
-			for br.off > 0 {
-				br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
-				br.bitsRead -= 8
-				br.off--
-			}
-		}
-		if len(dst) >= maxDecodedSize {
-			br.close()
-			d.bufs.Put(bufs)
-			return nil, ErrMaxDecodedSizeExceeded
-		}
-		v := dt[br.peekByteFast()>>shift]
-		nBits := uint8(v.entry)
-		br.advance(nBits)
-		bitsLeft -= int8(nBits)
-		dst = append(dst, uint8(v.entry>>8))
-	}
-	d.bufs.Put(bufs)
-	return dst, br.close()
-}
-
-// decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8.
-// The cap of the output buffer will be the maximum decompressed size.
-// The length of the supplied input must match the end of a block exactly.
-func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
-	var br bitReaderBytes
-	err := br.init(src)
-	if err != nil {
-		return dst, err
-	}
-	maxDecodedSize := cap(dst)
-	dst = dst[:0]
-
-	// Avoid bounds check by always having full sized table.
-	dt := d.dt.single[:256]
-
-	// Use temp table to avoid bound checks/append penalty.
-	bufs := d.buffer()
-	buf := &bufs[0]
-	var off uint8
-
-	const shift = 56
-
-	//fmt.Printf("mask: %b, tl:%d\n", mask, d.actualTableLog)
-	for br.off >= 4 {
-		br.fillFast()
-		v := dt[uint8(br.value>>shift)]
-		br.advance(uint8(v.entry))
-		buf[off+0] = uint8(v.entry >> 8)
-
-		v = dt[uint8(br.value>>shift)]
-		br.advance(uint8(v.entry))
-		buf[off+1] = uint8(v.entry >> 8)
-
-		v = dt[uint8(br.value>>shift)]
-		br.advance(uint8(v.entry))
-		buf[off+2] = uint8(v.entry >> 8)
-
-		v = dt[uint8(br.value>>shift)]
-		br.advance(uint8(v.entry))
-		buf[off+3] = uint8(v.entry >> 8)
-
-		off += 4
-		if off == 0 {
-			if len(dst)+256 > maxDecodedSize {
-				d.bufs.Put(bufs)
-				br.close()
-				return nil, ErrMaxDecodedSizeExceeded
-			}
-			dst = append(dst, buf[:]...)
-		}
-	}
-
-	if len(dst)+int(off) > maxDecodedSize {
-		d.bufs.Put(bufs)
-		br.close()
-		return nil, ErrMaxDecodedSizeExceeded
-	}
-	dst = append(dst, buf[:off]...)
-
-	// br < 4, so uint8 is fine
-	bitsLeft := int8(uint8(br.off)*8 + (64 - br.bitsRead))
-	for bitsLeft > 0 {
-		if br.bitsRead >= 64-8 {
-			for br.off > 0 {
-				br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
-				br.bitsRead -= 8
-				br.off--
-			}
-		}
-		if len(dst) >= maxDecodedSize {
-			d.bufs.Put(bufs)
-			br.close()
-			return nil, ErrMaxDecodedSizeExceeded
-		}
-		v := dt[br.peekByteFast()]
-		nBits := uint8(v.entry)
-		br.advance(nBits)
-		bitsLeft -= int8(nBits)
-		dst = append(dst, uint8(v.entry>>8))
-	}
-	d.bufs.Put(bufs)
-	return dst, br.close()
-}
-
-// Decompress4X will decompress a 4X encoded stream.
-// The length of the supplied input must match the end of a block exactly.
-// The *capacity* of the dst slice must match the destination size of
-// the uncompressed data exactly.
-func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
-	if d.actualTableLog == 8 {
-		return d.decompress4X8bitExactly(dst, src)
-	}
-
-	var br [4]bitReaderBytes
-	start := 6
-	for i := 0; i < 3; i++ {
-		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
-		if start+length >= len(src) {
-			return nil, errors.New("truncated input (or invalid offset)")
-		}
-		err := br[i].init(src[start : start+length])
-		if err != nil {
-			return nil, err
-		}
-		start += length
-	}
-	err := br[3].init(src[start:])
-	if err != nil {
-		return nil, err
-	}
-
-	// destination, offset to match first output
-	dstSize := cap(dst)
-	dst = dst[:dstSize]
-	out := dst
-	dstEvery := (dstSize + 3) / 4
-
-	shift := (56 + (8 - d.actualTableLog)) & 63
-
-	const tlSize = 1 << 8
-	single := d.dt.single[:tlSize]
-
-	// Use temp table to avoid bound checks/append penalty.
-	buf := d.buffer()
-	var off uint8
-	var decoded int
-
-	// Decode 4 values from each decoder/loop.
-	const bufoff = 256
-	for {
-		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
-			break
-		}
-
-		{
-			// Interleave 2 decodes.
-			const stream = 0
-			const stream2 = 1
-			br1 := &br[stream]
-			br2 := &br[stream2]
-			br1.fillFast()
-			br2.fillFast()
-
-			v := single[uint8(br1.value>>shift)].entry
-			v2 := single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off] = uint8(v >> 8)
-			buf[stream2][off] = uint8(v2 >> 8)
-
-			v = single[uint8(br1.value>>shift)].entry
-			v2 = single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off+1] = uint8(v >> 8)
-			buf[stream2][off+1] = uint8(v2 >> 8)
-
-			v = single[uint8(br1.value>>shift)].entry
-			v2 = single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off+2] = uint8(v >> 8)
-			buf[stream2][off+2] = uint8(v2 >> 8)
-
-			v = single[uint8(br1.value>>shift)].entry
-			v2 = single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off+3] = uint8(v >> 8)
-			buf[stream2][off+3] = uint8(v2 >> 8)
-		}
-
-		{
-			const stream = 2
-			const stream2 = 3
-			br1 := &br[stream]
-			br2 := &br[stream2]
-			br1.fillFast()
-			br2.fillFast()
-
-			v := single[uint8(br1.value>>shift)].entry
-			v2 := single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off] = uint8(v >> 8)
-			buf[stream2][off] = uint8(v2 >> 8)
-
-			v = single[uint8(br1.value>>shift)].entry
-			v2 = single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off+1] = uint8(v >> 8)
-			buf[stream2][off+1] = uint8(v2 >> 8)
-
-			v = single[uint8(br1.value>>shift)].entry
-			v2 = single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off+2] = uint8(v >> 8)
-			buf[stream2][off+2] = uint8(v2 >> 8)
-
-			v = single[uint8(br1.value>>shift)].entry
-			v2 = single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off+3] = uint8(v >> 8)
-			buf[stream2][off+3] = uint8(v2 >> 8)
-		}
-
-		off += 4
-
-		if off == 0 {
-			if bufoff > dstEvery {
-				d.bufs.Put(buf)
-				return nil, errors.New("corruption detected: stream overrun 1")
-			}
-			// There must at least be 3 buffers left.
-			if len(out)-bufoff < dstEvery*3 {
-				d.bufs.Put(buf)
-				return nil, errors.New("corruption detected: stream overrun 2")
-			}
-			//copy(out, buf[0][:])
-			//copy(out[dstEvery:], buf[1][:])
-			//copy(out[dstEvery*2:], buf[2][:])
-			*(*[bufoff]byte)(out) = buf[0]
-			*(*[bufoff]byte)(out[dstEvery:]) = buf[1]
-			*(*[bufoff]byte)(out[dstEvery*2:]) = buf[2]
-			*(*[bufoff]byte)(out[dstEvery*3:]) = buf[3]
-			out = out[bufoff:]
-			decoded += bufoff * 4
-		}
-	}
-	if off > 0 {
-		ioff := int(off)
-		if len(out) < dstEvery*3+ioff {
-			d.bufs.Put(buf)
-			return nil, errors.New("corruption detected: stream overrun 3")
-		}
-		copy(out, buf[0][:off])
-		copy(out[dstEvery:], buf[1][:off])
-		copy(out[dstEvery*2:], buf[2][:off])
-		copy(out[dstEvery*3:], buf[3][:off])
-		decoded += int(off) * 4
-		out = out[off:]
-	}
-
-	// Decode remaining.
-	// Decode remaining.
-	remainBytes := dstEvery - (decoded / 4)
-	for i := range br {
-		offset := dstEvery * i
-		endsAt := offset + remainBytes
-		if endsAt > len(out) {
-			endsAt = len(out)
-		}
-		br := &br[i]
-		bitsLeft := br.remaining()
-		for bitsLeft > 0 {
-			if br.finished() {
-				d.bufs.Put(buf)
-				return nil, io.ErrUnexpectedEOF
-			}
-			if br.bitsRead >= 56 {
-				if br.off >= 4 {
-					v := br.in[br.off-4:]
-					v = v[:4]
-					low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-					br.value |= uint64(low) << (br.bitsRead - 32)
-					br.bitsRead -= 32
-					br.off -= 4
-				} else {
-					for br.off > 0 {
-						br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
-						br.bitsRead -= 8
-						br.off--
-					}
-				}
-			}
-			// end inline...
-			if offset >= endsAt {
-				d.bufs.Put(buf)
-				return nil, errors.New("corruption detected: stream overrun 4")
-			}
-
-			// Read value and increment offset.
-			v := single[uint8(br.value>>shift)].entry
-			nBits := uint8(v)
-			br.advance(nBits)
-			bitsLeft -= uint(nBits)
-			out[offset] = uint8(v >> 8)
-			offset++
-		}
-		if offset != endsAt {
-			d.bufs.Put(buf)
-			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
-		}
-		decoded += offset - dstEvery*i
-		err = br.close()
-		if err != nil {
-			d.bufs.Put(buf)
-			return nil, err
-		}
-	}
-	d.bufs.Put(buf)
-	if dstSize != decoded {
-		return nil, errors.New("corruption detected: short output block")
-	}
-	return dst, nil
-}
-
-// Decompress4X will decompress a 4X encoded stream.
-// The length of the supplied input must match the end of a block exactly.
-// The *capacity* of the dst slice must match the destination size of
-// the uncompressed data exactly.
-func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
-	var br [4]bitReaderBytes
-	start := 6
-	for i := 0; i < 3; i++ {
-		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
-		if start+length >= len(src) {
-			return nil, errors.New("truncated input (or invalid offset)")
-		}
-		err := br[i].init(src[start : start+length])
-		if err != nil {
-			return nil, err
-		}
-		start += length
-	}
-	err := br[3].init(src[start:])
-	if err != nil {
-		return nil, err
-	}
-
-	// destination, offset to match first output
-	dstSize := cap(dst)
-	dst = dst[:dstSize]
-	out := dst
-	dstEvery := (dstSize + 3) / 4
-
-	const shift = 56
-	const tlSize = 1 << 8
-	single := d.dt.single[:tlSize]
-
-	// Use temp table to avoid bound checks/append penalty.
-	buf := d.buffer()
-	var off uint8
-	var decoded int
-
-	// Decode 4 values from each decoder/loop.
-	const bufoff = 256
-	for {
-		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
-			break
-		}
-
-		{
-			// Interleave 2 decodes.
-			const stream = 0
-			const stream2 = 1
-			br1 := &br[stream]
-			br2 := &br[stream2]
-			br1.fillFast()
-			br2.fillFast()
-
-			v := single[uint8(br1.value>>shift)].entry
-			v2 := single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off] = uint8(v >> 8)
-			buf[stream2][off] = uint8(v2 >> 8)
-
-			v = single[uint8(br1.value>>shift)].entry
-			v2 = single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off+1] = uint8(v >> 8)
-			buf[stream2][off+1] = uint8(v2 >> 8)
-
-			v = single[uint8(br1.value>>shift)].entry
-			v2 = single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off+2] = uint8(v >> 8)
-			buf[stream2][off+2] = uint8(v2 >> 8)
-
-			v = single[uint8(br1.value>>shift)].entry
-			v2 = single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off+3] = uint8(v >> 8)
-			buf[stream2][off+3] = uint8(v2 >> 8)
-		}
-
-		{
-			const stream = 2
-			const stream2 = 3
-			br1 := &br[stream]
-			br2 := &br[stream2]
-			br1.fillFast()
-			br2.fillFast()
-
-			v := single[uint8(br1.value>>shift)].entry
-			v2 := single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off] = uint8(v >> 8)
-			buf[stream2][off] = uint8(v2 >> 8)
-
-			v = single[uint8(br1.value>>shift)].entry
-			v2 = single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off+1] = uint8(v >> 8)
-			buf[stream2][off+1] = uint8(v2 >> 8)
-
-			v = single[uint8(br1.value>>shift)].entry
-			v2 = single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off+2] = uint8(v >> 8)
-			buf[stream2][off+2] = uint8(v2 >> 8)
-
-			v = single[uint8(br1.value>>shift)].entry
-			v2 = single[uint8(br2.value>>shift)].entry
-			br1.bitsRead += uint8(v)
-			br1.value <<= v & 63
-			br2.bitsRead += uint8(v2)
-			br2.value <<= v2 & 63
-			buf[stream][off+3] = uint8(v >> 8)
-			buf[stream2][off+3] = uint8(v2 >> 8)
-		}
-
-		off += 4
-
-		if off == 0 {
-			if bufoff > dstEvery {
-				d.bufs.Put(buf)
-				return nil, errors.New("corruption detected: stream overrun 1")
-			}
-			// There must at least be 3 buffers left.
-			if len(out)-bufoff < dstEvery*3 {
-				d.bufs.Put(buf)
-				return nil, errors.New("corruption detected: stream overrun 2")
-			}
-
-			//copy(out, buf[0][:])
-			//copy(out[dstEvery:], buf[1][:])
-			//copy(out[dstEvery*2:], buf[2][:])
-			// copy(out[dstEvery*3:], buf[3][:])
-			*(*[bufoff]byte)(out) = buf[0]
-			*(*[bufoff]byte)(out[dstEvery:]) = buf[1]
-			*(*[bufoff]byte)(out[dstEvery*2:]) = buf[2]
-			*(*[bufoff]byte)(out[dstEvery*3:]) = buf[3]
-			out = out[bufoff:]
-			decoded += bufoff * 4
-		}
-	}
-	if off > 0 {
-		ioff := int(off)
-		if len(out) < dstEvery*3+ioff {
-			return nil, errors.New("corruption detected: stream overrun 3")
-		}
-		copy(out, buf[0][:off])
-		copy(out[dstEvery:], buf[1][:off])
-		copy(out[dstEvery*2:], buf[2][:off])
-		copy(out[dstEvery*3:], buf[3][:off])
-		decoded += int(off) * 4
-		out = out[off:]
-	}
-
-	// Decode remaining.
-	remainBytes := dstEvery - (decoded / 4)
-	for i := range br {
-		offset := dstEvery * i
-		endsAt := offset + remainBytes
-		if endsAt > len(out) {
-			endsAt = len(out)
-		}
-		br := &br[i]
-		bitsLeft := br.remaining()
-		for bitsLeft > 0 {
-			if br.finished() {
-				d.bufs.Put(buf)
-				return nil, io.ErrUnexpectedEOF
-			}
-			if br.bitsRead >= 56 {
-				if br.off >= 4 {
-					v := br.in[br.off-4:]
-					v = v[:4]
-					low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-					br.value |= uint64(low) << (br.bitsRead - 32)
-					br.bitsRead -= 32
-					br.off -= 4
-				} else {
-					for br.off > 0 {
-						br.value |= uint64(br.in[br.off-1]) << (br.bitsRead - 8)
-						br.bitsRead -= 8
-						br.off--
-					}
-				}
-			}
-			// end inline...
-			if offset >= endsAt {
-				d.bufs.Put(buf)
-				return nil, errors.New("corruption detected: stream overrun 4")
-			}
-
-			// Read value and increment offset.
-			v := single[br.peekByteFast()].entry
-			nBits := uint8(v)
-			br.advance(nBits)
-			bitsLeft -= uint(nBits)
-			out[offset] = uint8(v >> 8)
-			offset++
-		}
-		if offset != endsAt {
-			d.bufs.Put(buf)
-			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
-		}
-
-		decoded += offset - dstEvery*i
-		err = br.close()
-		if err != nil {
-			d.bufs.Put(buf)
-			return nil, err
-		}
-	}
-	d.bufs.Put(buf)
-	if dstSize != decoded {
-		return nil, errors.New("corruption detected: short output block")
-	}
-	return dst, nil
-}
-
-// matches will compare a decoding table to a coding table.
-// Errors are written to the writer.
-// Nothing will be written if table is ok.
-func (s *Scratch) matches(ct cTable, w io.Writer) {
-	if s == nil || len(s.dt.single) == 0 {
-		return
-	}
-	dt := s.dt.single[:1<<s.actualTableLog]
-	tablelog := s.actualTableLog
-	ok := 0
-	broken := 0
-	for sym, enc := range ct {
-		errs := 0
-		broken++
-		if enc.nBits == 0 {
-			for _, dec := range dt {
-				if uint8(dec.entry>>8) == byte(sym) {
-					fmt.Fprintf(w, "symbol %x has decoder, but no encoder\n", sym)
-					errs++
-					break
-				}
-			}
-			if errs == 0 {
-				broken--
-			}
-			continue
-		}
-		// Unused bits in input
-		ub := tablelog - enc.nBits
-		top := enc.val << ub
-		// decoder looks at top bits.
-		dec := dt[top]
-		if uint8(dec.entry) != enc.nBits {
-			fmt.Fprintf(w, "symbol 0x%x bit size mismatch (enc: %d, dec:%d).\n", sym, enc.nBits, uint8(dec.entry))
-			errs++
-		}
-		if uint8(dec.entry>>8) != uint8(sym) {
-			fmt.Fprintf(w, "symbol 0x%x decoder output mismatch (enc: %d, dec:%d).\n", sym, sym, uint8(dec.entry>>8))
-			errs++
-		}
-		if errs > 0 {
-			fmt.Fprintf(w, "%d errors in base, stopping\n", errs)
-			continue
-		}
-		// Ensure that all combinations are covered.
-		for i := uint16(0); i < (1 << ub); i++ {
-			vval := top | i
-			dec := dt[vval]
-			if uint8(dec.entry) != enc.nBits {
-				fmt.Fprintf(w, "symbol 0x%x bit size mismatch (enc: %d, dec:%d).\n", vval, enc.nBits, uint8(dec.entry))
-				errs++
-			}
-			if uint8(dec.entry>>8) != uint8(sym) {
-				fmt.Fprintf(w, "symbol 0x%x decoder output mismatch (enc: %d, dec:%d).\n", vval, sym, uint8(dec.entry>>8))
-				errs++
-			}
-			if errs > 20 {
-				fmt.Fprintf(w, "%d errors, stopping\n", errs)
-				break
-			}
-		}
-		if errs == 0 {
-			ok++
-			broken--
-		}
-	}
-	if broken > 0 {
-		fmt.Fprintf(w, "%d broken, %d ok\n", broken, ok)
-	}
-}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
deleted file mode 100644
index ba7e8e6b0..000000000
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
+++ /dev/null
@@ -1,226 +0,0 @@
-//go:build amd64 && !appengine && !noasm && gc
-// +build amd64,!appengine,!noasm,gc
-
-// This file contains the specialisation of Decoder.Decompress4X
-// and Decoder.Decompress1X that use an asm implementation of thir main loops.
-package huff0
-
-import (
-	"errors"
-	"fmt"
-
-	"github.com/klauspost/compress/internal/cpuinfo"
-)
-
-// decompress4x_main_loop_x86 is an x86 assembler implementation
-// of Decompress4X when tablelog > 8.
-//
-//go:noescape
-func decompress4x_main_loop_amd64(ctx *decompress4xContext)
-
-// decompress4x_8b_loop_x86 is an x86 assembler implementation
-// of Decompress4X when tablelog <= 8 which decodes 4 entries
-// per loop.
-//
-//go:noescape
-func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
-
-// fallback8BitSize is the size where using Go version is faster.
-const fallback8BitSize = 800
-
-type decompress4xContext struct {
-	pbr      *[4]bitReaderShifted
-	peekBits uint8
-	out      *byte
-	dstEvery int
-	tbl      *dEntrySingle
-	decoded  int
-	limit    *byte
-}
-
-// Decompress4X will decompress a 4X encoded stream.
-// The length of the supplied input must match the end of a block exactly.
-// The *capacity* of the dst slice must match the destination size of
-// the uncompressed data exactly.
-func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
-	if len(d.dt.single) == 0 {
-		return nil, errors.New("no table loaded")
-	}
-	if len(src) < 6+(4*1) {
-		return nil, errors.New("input too small")
-	}
-
-	use8BitTables := d.actualTableLog <= 8
-	if cap(dst) < fallback8BitSize && use8BitTables {
-		return d.decompress4X8bit(dst, src)
-	}
-
-	var br [4]bitReaderShifted
-	// Decode "jump table"
-	start := 6
-	for i := 0; i < 3; i++ {
-		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
-		if start+length >= len(src) {
-			return nil, errors.New("truncated input (or invalid offset)")
-		}
-		err := br[i].init(src[start : start+length])
-		if err != nil {
-			return nil, err
-		}
-		start += length
-	}
-	err := br[3].init(src[start:])
-	if err != nil {
-		return nil, err
-	}
-
-	// destination, offset to match first output
-	dstSize := cap(dst)
-	dst = dst[:dstSize]
-	out := dst
-	dstEvery := (dstSize + 3) / 4
-
-	const tlSize = 1 << tableLogMax
-	const tlMask = tlSize - 1
-	single := d.dt.single[:tlSize]
-
-	var decoded int
-
-	if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
-		ctx := decompress4xContext{
-			pbr:      &br,
-			peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
-			out:      &out[0],
-			dstEvery: dstEvery,
-			tbl:      &single[0],
-			limit:    &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last.
-		}
-		if use8BitTables {
-			decompress4x_8b_main_loop_amd64(&ctx)
-		} else {
-			decompress4x_main_loop_amd64(&ctx)
-		}
-
-		decoded = ctx.decoded
-		out = out[decoded/4:]
-	}
-
-	// Decode remaining.
-	remainBytes := dstEvery - (decoded / 4)
-	for i := range br {
-		offset := dstEvery * i
-		endsAt := offset + remainBytes
-		if endsAt > len(out) {
-			endsAt = len(out)
-		}
-		br := &br[i]
-		bitsLeft := br.remaining()
-		for bitsLeft > 0 {
-			br.fill()
-			if offset >= endsAt {
-				return nil, errors.New("corruption detected: stream overrun 4")
-			}
-
-			// Read value and increment offset.
-			val := br.peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask].entry
-			nBits := uint8(v)
-			br.advance(nBits)
-			bitsLeft -= uint(nBits)
-			out[offset] = uint8(v >> 8)
-			offset++
-		}
-		if offset != endsAt {
-			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
-		}
-		decoded += offset - dstEvery*i
-		err = br.close()
-		if err != nil {
-			return nil, err
-		}
-	}
-	if dstSize != decoded {
-		return nil, errors.New("corruption detected: short output block")
-	}
-	return dst, nil
-}
-
-// decompress4x_main_loop_x86 is an x86 assembler implementation
-// of Decompress1X when tablelog > 8.
-//
-//go:noescape
-func decompress1x_main_loop_amd64(ctx *decompress1xContext)
-
-// decompress4x_main_loop_x86 is an x86 with BMI2 assembler implementation
-// of Decompress1X when tablelog > 8.
-//
-//go:noescape
-func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
-
-type decompress1xContext struct {
-	pbr      *bitReaderShifted
-	peekBits uint8
-	out      *byte
-	outCap   int
-	tbl      *dEntrySingle
-	decoded  int
-}
-
-// Error reported by asm implementations
-const error_max_decoded_size_exeeded = -1
-
-// Decompress1X will decompress a 1X encoded stream.
-// The cap of the output buffer will be the maximum decompressed size.
-// The length of the supplied input must match the end of a block exactly.
-func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
-	if len(d.dt.single) == 0 {
-		return nil, errors.New("no table loaded")
-	}
-	var br bitReaderShifted
-	err := br.init(src)
-	if err != nil {
-		return dst, err
-	}
-	maxDecodedSize := cap(dst)
-	dst = dst[:maxDecodedSize]
-
-	const tlSize = 1 << tableLogMax
-	const tlMask = tlSize - 1
-
-	if maxDecodedSize >= 4 {
-		ctx := decompress1xContext{
-			pbr:      &br,
-			out:      &dst[0],
-			outCap:   maxDecodedSize,
-			peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
-			tbl:      &d.dt.single[0],
-		}
-
-		if cpuinfo.HasBMI2() {
-			decompress1x_main_loop_bmi2(&ctx)
-		} else {
-			decompress1x_main_loop_amd64(&ctx)
-		}
-		if ctx.decoded == error_max_decoded_size_exeeded {
-			return nil, ErrMaxDecodedSizeExceeded
-		}
-
-		dst = dst[:ctx.decoded]
-	}
-
-	// br < 8, so uint8 is fine
-	bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
-	for bitsLeft > 0 {
-		br.fill()
-		if len(dst) >= maxDecodedSize {
-			br.close()
-			return nil, ErrMaxDecodedSizeExceeded
-		}
-		v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
-		nBits := uint8(v.entry)
-		br.advance(nBits)
-		bitsLeft -= nBits
-		dst = append(dst, uint8(v.entry>>8))
-	}
-	return dst, br.close()
-}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
deleted file mode 100644
index c4c7ab2d1..000000000
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
+++ /dev/null
@@ -1,830 +0,0 @@
-// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
-
-//go:build amd64 && !appengine && !noasm && gc
-
-// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
-TEXT ·decompress4x_main_loop_amd64(SB), $0-8
-	// Preload values
-	MOVQ    ctx+0(FP), AX
-	MOVBQZX 8(AX), DI
-	MOVQ    16(AX), BX
-	MOVQ    48(AX), SI
-	MOVQ    24(AX), R8
-	MOVQ    32(AX), R9
-	MOVQ    (AX), R10
-
-	// Main loop
-main_loop:
-	XORL  DX, DX
-	CMPQ  BX, SI
-	SETGE DL
-
-	// br0.fillFast32()
-	MOVQ    32(R10), R11
-	MOVBQZX 40(R10), R12
-	CMPQ    R12, $0x20
-	JBE     skip_fill0
-	MOVQ    24(R10), AX
-	SUBQ    $0x20, R12
-	SUBQ    $0x04, AX
-	MOVQ    (R10), R13
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVL (AX)(R13*1), R13
-	MOVQ R12, CX
-	SHLQ CL, R13
-	MOVQ AX, 24(R10)
-	ORQ  R13, R11
-
-	// exhausted += (br0.off < 4)
-	CMPQ AX, $0x04
-	ADCB $+0, DL
-
-skip_fill0:
-	// val0 := br0.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v0 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br0.advance(uint8(v0.entry)
-	MOVB CH, AL
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// val1 := br0.peekTopBits(peekBits)
-	MOVQ DI, CX
-	MOVQ R11, R13
-	SHRQ CL, R13
-
-	// v1 := table[val1&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br0.advance(uint8(v1.entry))
-	MOVB CH, AH
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// these two writes get coalesced
-	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
-	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
-	MOVW AX, (BX)
-
-	// update the bitreader structure
-	MOVQ R11, 32(R10)
-	MOVB R12, 40(R10)
-
-	// br1.fillFast32()
-	MOVQ    80(R10), R11
-	MOVBQZX 88(R10), R12
-	CMPQ    R12, $0x20
-	JBE     skip_fill1
-	MOVQ    72(R10), AX
-	SUBQ    $0x20, R12
-	SUBQ    $0x04, AX
-	MOVQ    48(R10), R13
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVL (AX)(R13*1), R13
-	MOVQ R12, CX
-	SHLQ CL, R13
-	MOVQ AX, 72(R10)
-	ORQ  R13, R11
-
-	// exhausted += (br1.off < 4)
-	CMPQ AX, $0x04
-	ADCB $+0, DL
-
-skip_fill1:
-	// val0 := br1.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v0 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br1.advance(uint8(v0.entry)
-	MOVB CH, AL
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// val1 := br1.peekTopBits(peekBits)
-	MOVQ DI, CX
-	MOVQ R11, R13
-	SHRQ CL, R13
-
-	// v1 := table[val1&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br1.advance(uint8(v1.entry))
-	MOVB CH, AH
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// these two writes get coalesced
-	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
-	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
-	MOVW AX, (BX)(R8*1)
-
-	// update the bitreader structure
-	MOVQ R11, 80(R10)
-	MOVB R12, 88(R10)
-
-	// br2.fillFast32()
-	MOVQ    128(R10), R11
-	MOVBQZX 136(R10), R12
-	CMPQ    R12, $0x20
-	JBE     skip_fill2
-	MOVQ    120(R10), AX
-	SUBQ    $0x20, R12
-	SUBQ    $0x04, AX
-	MOVQ    96(R10), R13
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVL (AX)(R13*1), R13
-	MOVQ R12, CX
-	SHLQ CL, R13
-	MOVQ AX, 120(R10)
-	ORQ  R13, R11
-
-	// exhausted += (br2.off < 4)
-	CMPQ AX, $0x04
-	ADCB $+0, DL
-
-skip_fill2:
-	// val0 := br2.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v0 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br2.advance(uint8(v0.entry)
-	MOVB CH, AL
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// val1 := br2.peekTopBits(peekBits)
-	MOVQ DI, CX
-	MOVQ R11, R13
-	SHRQ CL, R13
-
-	// v1 := table[val1&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br2.advance(uint8(v1.entry))
-	MOVB CH, AH
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// these two writes get coalesced
-	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
-	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
-	MOVW AX, (BX)(R8*2)
-
-	// update the bitreader structure
-	MOVQ R11, 128(R10)
-	MOVB R12, 136(R10)
-
-	// br3.fillFast32()
-	MOVQ    176(R10), R11
-	MOVBQZX 184(R10), R12
-	CMPQ    R12, $0x20
-	JBE     skip_fill3
-	MOVQ    168(R10), AX
-	SUBQ    $0x20, R12
-	SUBQ    $0x04, AX
-	MOVQ    144(R10), R13
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVL (AX)(R13*1), R13
-	MOVQ R12, CX
-	SHLQ CL, R13
-	MOVQ AX, 168(R10)
-	ORQ  R13, R11
-
-	// exhausted += (br3.off < 4)
-	CMPQ AX, $0x04
-	ADCB $+0, DL
-
-skip_fill3:
-	// val0 := br3.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v0 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br3.advance(uint8(v0.entry)
-	MOVB CH, AL
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// val1 := br3.peekTopBits(peekBits)
-	MOVQ DI, CX
-	MOVQ R11, R13
-	SHRQ CL, R13
-
-	// v1 := table[val1&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br3.advance(uint8(v1.entry))
-	MOVB CH, AH
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// these two writes get coalesced
-	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
-	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
-	LEAQ (R8)(R8*2), CX
-	MOVW AX, (BX)(CX*1)
-
-	// update the bitreader structure
-	MOVQ  R11, 176(R10)
-	MOVB  R12, 184(R10)
-	ADDQ  $0x02, BX
-	TESTB DL, DL
-	JZ    main_loop
-	MOVQ  ctx+0(FP), AX
-	SUBQ  16(AX), BX
-	SHLQ  $0x02, BX
-	MOVQ  BX, 40(AX)
-	RET
-
-// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
-TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
-	// Preload values
-	MOVQ    ctx+0(FP), CX
-	MOVBQZX 8(CX), DI
-	MOVQ    16(CX), BX
-	MOVQ    48(CX), SI
-	MOVQ    24(CX), R8
-	MOVQ    32(CX), R9
-	MOVQ    (CX), R10
-
-	// Main loop
-main_loop:
-	XORL  DX, DX
-	CMPQ  BX, SI
-	SETGE DL
-
-	// br0.fillFast32()
-	MOVQ    32(R10), R11
-	MOVBQZX 40(R10), R12
-	CMPQ    R12, $0x20
-	JBE     skip_fill0
-	MOVQ    24(R10), R13
-	SUBQ    $0x20, R12
-	SUBQ    $0x04, R13
-	MOVQ    (R10), R14
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVL (R13)(R14*1), R14
-	MOVQ R12, CX
-	SHLQ CL, R14
-	MOVQ R13, 24(R10)
-	ORQ  R14, R11
-
-	// exhausted += (br0.off < 4)
-	CMPQ R13, $0x04
-	ADCB $+0, DL
-
-skip_fill0:
-	// val0 := br0.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v0 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br0.advance(uint8(v0.entry)
-	MOVB CH, AL
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// val1 := br0.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v1 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br0.advance(uint8(v1.entry)
-	MOVB   CH, AH
-	SHLQ   CL, R11
-	ADDB   CL, R12
-	BSWAPL AX
-
-	// val2 := br0.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v2 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br0.advance(uint8(v2.entry)
-	MOVB CH, AH
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// val3 := br0.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v3 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br0.advance(uint8(v3.entry)
-	MOVB   CH, AL
-	SHLQ   CL, R11
-	ADDB   CL, R12
-	BSWAPL AX
-
-	// these four writes get coalesced
-	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
-	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
-	// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
-	// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
-	MOVL AX, (BX)
-
-	// update the bitreader structure
-	MOVQ R11, 32(R10)
-	MOVB R12, 40(R10)
-
-	// br1.fillFast32()
-	MOVQ    80(R10), R11
-	MOVBQZX 88(R10), R12
-	CMPQ    R12, $0x20
-	JBE     skip_fill1
-	MOVQ    72(R10), R13
-	SUBQ    $0x20, R12
-	SUBQ    $0x04, R13
-	MOVQ    48(R10), R14
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVL (R13)(R14*1), R14
-	MOVQ R12, CX
-	SHLQ CL, R14
-	MOVQ R13, 72(R10)
-	ORQ  R14, R11
-
-	// exhausted += (br1.off < 4)
-	CMPQ R13, $0x04
-	ADCB $+0, DL
-
-skip_fill1:
-	// val0 := br1.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v0 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br1.advance(uint8(v0.entry)
-	MOVB CH, AL
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// val1 := br1.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v1 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br1.advance(uint8(v1.entry)
-	MOVB   CH, AH
-	SHLQ   CL, R11
-	ADDB   CL, R12
-	BSWAPL AX
-
-	// val2 := br1.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v2 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br1.advance(uint8(v2.entry)
-	MOVB CH, AH
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// val3 := br1.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v3 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br1.advance(uint8(v3.entry)
-	MOVB   CH, AL
-	SHLQ   CL, R11
-	ADDB   CL, R12
-	BSWAPL AX
-
-	// these four writes get coalesced
-	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
-	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
-	// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
-	// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
-	MOVL AX, (BX)(R8*1)
-
-	// update the bitreader structure
-	MOVQ R11, 80(R10)
-	MOVB R12, 88(R10)
-
-	// br2.fillFast32()
-	MOVQ    128(R10), R11
-	MOVBQZX 136(R10), R12
-	CMPQ    R12, $0x20
-	JBE     skip_fill2
-	MOVQ    120(R10), R13
-	SUBQ    $0x20, R12
-	SUBQ    $0x04, R13
-	MOVQ    96(R10), R14
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVL (R13)(R14*1), R14
-	MOVQ R12, CX
-	SHLQ CL, R14
-	MOVQ R13, 120(R10)
-	ORQ  R14, R11
-
-	// exhausted += (br2.off < 4)
-	CMPQ R13, $0x04
-	ADCB $+0, DL
-
-skip_fill2:
-	// val0 := br2.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v0 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br2.advance(uint8(v0.entry)
-	MOVB CH, AL
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// val1 := br2.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v1 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br2.advance(uint8(v1.entry)
-	MOVB   CH, AH
-	SHLQ   CL, R11
-	ADDB   CL, R12
-	BSWAPL AX
-
-	// val2 := br2.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v2 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br2.advance(uint8(v2.entry)
-	MOVB CH, AH
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// val3 := br2.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v3 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br2.advance(uint8(v3.entry)
-	MOVB   CH, AL
-	SHLQ   CL, R11
-	ADDB   CL, R12
-	BSWAPL AX
-
-	// these four writes get coalesced
-	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
-	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
-	// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
-	// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
-	MOVL AX, (BX)(R8*2)
-
-	// update the bitreader structure
-	MOVQ R11, 128(R10)
-	MOVB R12, 136(R10)
-
-	// br3.fillFast32()
-	MOVQ    176(R10), R11
-	MOVBQZX 184(R10), R12
-	CMPQ    R12, $0x20
-	JBE     skip_fill3
-	MOVQ    168(R10), R13
-	SUBQ    $0x20, R12
-	SUBQ    $0x04, R13
-	MOVQ    144(R10), R14
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVL (R13)(R14*1), R14
-	MOVQ R12, CX
-	SHLQ CL, R14
-	MOVQ R13, 168(R10)
-	ORQ  R14, R11
-
-	// exhausted += (br3.off < 4)
-	CMPQ R13, $0x04
-	ADCB $+0, DL
-
-skip_fill3:
-	// val0 := br3.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v0 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br3.advance(uint8(v0.entry)
-	MOVB CH, AL
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// val1 := br3.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v1 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br3.advance(uint8(v1.entry)
-	MOVB   CH, AH
-	SHLQ   CL, R11
-	ADDB   CL, R12
-	BSWAPL AX
-
-	// val2 := br3.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v2 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br3.advance(uint8(v2.entry)
-	MOVB CH, AH
-	SHLQ CL, R11
-	ADDB CL, R12
-
-	// val3 := br3.peekTopBits(peekBits)
-	MOVQ R11, R13
-	MOVQ DI, CX
-	SHRQ CL, R13
-
-	// v3 := table[val0&mask]
-	MOVW (R9)(R13*2), CX
-
-	// br3.advance(uint8(v3.entry)
-	MOVB   CH, AL
-	SHLQ   CL, R11
-	ADDB   CL, R12
-	BSWAPL AX
-
-	// these four writes get coalesced
-	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
-	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
-	// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
-	// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
-	LEAQ (R8)(R8*2), CX
-	MOVL AX, (BX)(CX*1)
-
-	// update the bitreader structure
-	MOVQ  R11, 176(R10)
-	MOVB  R12, 184(R10)
-	ADDQ  $0x04, BX
-	TESTB DL, DL
-	JZ    main_loop
-	MOVQ  ctx+0(FP), AX
-	SUBQ  16(AX), BX
-	SHLQ  $0x02, BX
-	MOVQ  BX, 40(AX)
-	RET
-
-// func decompress1x_main_loop_amd64(ctx *decompress1xContext)
-TEXT ·decompress1x_main_loop_amd64(SB), $0-8
-	MOVQ    ctx+0(FP), CX
-	MOVQ    16(CX), DX
-	MOVQ    24(CX), BX
-	CMPQ    BX, $0x04
-	JB      error_max_decoded_size_exceeded
-	LEAQ    (DX)(BX*1), BX
-	MOVQ    (CX), SI
-	MOVQ    (SI), R8
-	MOVQ    24(SI), R9
-	MOVQ    32(SI), R10
-	MOVBQZX 40(SI), R11
-	MOVQ    32(CX), SI
-	MOVBQZX 8(CX), DI
-	JMP     loop_condition
-
-main_loop:
-	// Check if we have room for 4 bytes in the output buffer
-	LEAQ 4(DX), CX
-	CMPQ CX, BX
-	JGE  error_max_decoded_size_exceeded
-
-	// Decode 4 values
-	CMPQ R11, $0x20
-	JL   bitReader_fillFast_1_end
-	SUBQ $0x20, R11
-	SUBQ $0x04, R9
-	MOVL (R8)(R9*1), R12
-	MOVQ R11, CX
-	SHLQ CL, R12
-	ORQ  R12, R10
-
-bitReader_fillFast_1_end:
-	MOVQ    DI, CX
-	MOVQ    R10, R12
-	SHRQ    CL, R12
-	MOVW    (SI)(R12*2), CX
-	MOVB    CH, AL
-	MOVBQZX CL, CX
-	ADDQ    CX, R11
-	SHLQ    CL, R10
-	MOVQ    DI, CX
-	MOVQ    R10, R12
-	SHRQ    CL, R12
-	MOVW    (SI)(R12*2), CX
-	MOVB    CH, AH
-	MOVBQZX CL, CX
-	ADDQ    CX, R11
-	SHLQ    CL, R10
-	BSWAPL  AX
-	CMPQ    R11, $0x20
-	JL      bitReader_fillFast_2_end
-	SUBQ    $0x20, R11
-	SUBQ    $0x04, R9
-	MOVL    (R8)(R9*1), R12
-	MOVQ    R11, CX
-	SHLQ    CL, R12
-	ORQ     R12, R10
-
-bitReader_fillFast_2_end:
-	MOVQ    DI, CX
-	MOVQ    R10, R12
-	SHRQ    CL, R12
-	MOVW    (SI)(R12*2), CX
-	MOVB    CH, AH
-	MOVBQZX CL, CX
-	ADDQ    CX, R11
-	SHLQ    CL, R10
-	MOVQ    DI, CX
-	MOVQ    R10, R12
-	SHRQ    CL, R12
-	MOVW    (SI)(R12*2), CX
-	MOVB    CH, AL
-	MOVBQZX CL, CX
-	ADDQ    CX, R11
-	SHLQ    CL, R10
-	BSWAPL  AX
-
-	// Store the decoded values
-	MOVL AX, (DX)
-	ADDQ $0x04, DX
-
-loop_condition:
-	CMPQ R9, $0x08
-	JGE  main_loop
-
-	// Update ctx structure
-	MOVQ ctx+0(FP), AX
-	SUBQ 16(AX), DX
-	MOVQ DX, 40(AX)
-	MOVQ (AX), AX
-	MOVQ R9, 24(AX)
-	MOVQ R10, 32(AX)
-	MOVB R11, 40(AX)
-	RET
-
-	// Report error
-error_max_decoded_size_exceeded:
-	MOVQ ctx+0(FP), AX
-	MOVQ $-1, CX
-	MOVQ CX, 40(AX)
-	RET
-
-// func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
-// Requires: BMI2
-TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
-	MOVQ    ctx+0(FP), CX
-	MOVQ    16(CX), DX
-	MOVQ    24(CX), BX
-	CMPQ    BX, $0x04
-	JB      error_max_decoded_size_exceeded
-	LEAQ    (DX)(BX*1), BX
-	MOVQ    (CX), SI
-	MOVQ    (SI), R8
-	MOVQ    24(SI), R9
-	MOVQ    32(SI), R10
-	MOVBQZX 40(SI), R11
-	MOVQ    32(CX), SI
-	MOVBQZX 8(CX), DI
-	JMP     loop_condition
-
-main_loop:
-	// Check if we have room for 4 bytes in the output buffer
-	LEAQ 4(DX), CX
-	CMPQ CX, BX
-	JGE  error_max_decoded_size_exceeded
-
-	// Decode 4 values
-	CMPQ  R11, $0x20
-	JL    bitReader_fillFast_1_end
-	SUBQ  $0x20, R11
-	SUBQ  $0x04, R9
-	MOVL  (R8)(R9*1), CX
-	SHLXQ R11, CX, CX
-	ORQ   CX, R10
-
-bitReader_fillFast_1_end:
-	SHRXQ   DI, R10, CX
-	MOVW    (SI)(CX*2), CX
-	MOVB    CH, AL
-	MOVBQZX CL, CX
-	ADDQ    CX, R11
-	SHLXQ   CX, R10, R10
-	SHRXQ   DI, R10, CX
-	MOVW    (SI)(CX*2), CX
-	MOVB    CH, AH
-	MOVBQZX CL, CX
-	ADDQ    CX, R11
-	SHLXQ   CX, R10, R10
-	BSWAPL  AX
-	CMPQ    R11, $0x20
-	JL      bitReader_fillFast_2_end
-	SUBQ    $0x20, R11
-	SUBQ    $0x04, R9
-	MOVL    (R8)(R9*1), CX
-	SHLXQ   R11, CX, CX
-	ORQ     CX, R10
-
-bitReader_fillFast_2_end:
-	SHRXQ   DI, R10, CX
-	MOVW    (SI)(CX*2), CX
-	MOVB    CH, AH
-	MOVBQZX CL, CX
-	ADDQ    CX, R11
-	SHLXQ   CX, R10, R10
-	SHRXQ   DI, R10, CX
-	MOVW    (SI)(CX*2), CX
-	MOVB    CH, AL
-	MOVBQZX CL, CX
-	ADDQ    CX, R11
-	SHLXQ   CX, R10, R10
-	BSWAPL  AX
-
-	// Store the decoded values
-	MOVL AX, (DX)
-	ADDQ $0x04, DX
-
-loop_condition:
-	CMPQ R9, $0x08
-	JGE  main_loop
-
-	// Update ctx structure
-	MOVQ ctx+0(FP), AX
-	SUBQ 16(AX), DX
-	MOVQ DX, 40(AX)
-	MOVQ (AX), AX
-	MOVQ R9, 24(AX)
-	MOVQ R10, 32(AX)
-	MOVB R11, 40(AX)
-	RET
-
-	// Report error
-error_max_decoded_size_exceeded:
-	MOVQ ctx+0(FP), AX
-	MOVQ $-1, CX
-	MOVQ CX, 40(AX)
-	RET
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_generic.go b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
deleted file mode 100644
index 908c17de6..000000000
--- a/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
+++ /dev/null
@@ -1,299 +0,0 @@
-//go:build !amd64 || appengine || !gc || noasm
-// +build !amd64 appengine !gc noasm
-
-// This file contains a generic implementation of Decoder.Decompress4X.
-package huff0
-
-import (
-	"errors"
-	"fmt"
-)
-
-// Decompress4X will decompress a 4X encoded stream.
-// The length of the supplied input must match the end of a block exactly.
-// The *capacity* of the dst slice must match the destination size of
-// the uncompressed data exactly.
-func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
-	if len(d.dt.single) == 0 {
-		return nil, errors.New("no table loaded")
-	}
-	if len(src) < 6+(4*1) {
-		return nil, errors.New("input too small")
-	}
-	if use8BitTables && d.actualTableLog <= 8 {
-		return d.decompress4X8bit(dst, src)
-	}
-
-	var br [4]bitReaderShifted
-	// Decode "jump table"
-	start := 6
-	for i := 0; i < 3; i++ {
-		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
-		if start+length >= len(src) {
-			return nil, errors.New("truncated input (or invalid offset)")
-		}
-		err := br[i].init(src[start : start+length])
-		if err != nil {
-			return nil, err
-		}
-		start += length
-	}
-	err := br[3].init(src[start:])
-	if err != nil {
-		return nil, err
-	}
-
-	// destination, offset to match first output
-	dstSize := cap(dst)
-	dst = dst[:dstSize]
-	out := dst
-	dstEvery := (dstSize + 3) / 4
-
-	const tlSize = 1 << tableLogMax
-	const tlMask = tlSize - 1
-	single := d.dt.single[:tlSize]
-
-	// Use temp table to avoid bound checks/append penalty.
-	buf := d.buffer()
-	var off uint8
-	var decoded int
-
-	// Decode 2 values from each decoder/loop.
-	const bufoff = 256
-	for {
-		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
-			break
-		}
-
-		{
-			const stream = 0
-			const stream2 = 1
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			val := br[stream].peekBitsFast(d.actualTableLog)
-			val2 := br[stream2].peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask]
-			v2 := single[val2&tlMask]
-			br[stream].advance(uint8(v.entry))
-			br[stream2].advance(uint8(v2.entry))
-			buf[stream][off] = uint8(v.entry >> 8)
-			buf[stream2][off] = uint8(v2.entry >> 8)
-
-			val = br[stream].peekBitsFast(d.actualTableLog)
-			val2 = br[stream2].peekBitsFast(d.actualTableLog)
-			v = single[val&tlMask]
-			v2 = single[val2&tlMask]
-			br[stream].advance(uint8(v.entry))
-			br[stream2].advance(uint8(v2.entry))
-			buf[stream][off+1] = uint8(v.entry >> 8)
-			buf[stream2][off+1] = uint8(v2.entry >> 8)
-		}
-
-		{
-			const stream = 2
-			const stream2 = 3
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			val := br[stream].peekBitsFast(d.actualTableLog)
-			val2 := br[stream2].peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask]
-			v2 := single[val2&tlMask]
-			br[stream].advance(uint8(v.entry))
-			br[stream2].advance(uint8(v2.entry))
-			buf[stream][off] = uint8(v.entry >> 8)
-			buf[stream2][off] = uint8(v2.entry >> 8)
-
-			val = br[stream].peekBitsFast(d.actualTableLog)
-			val2 = br[stream2].peekBitsFast(d.actualTableLog)
-			v = single[val&tlMask]
-			v2 = single[val2&tlMask]
-			br[stream].advance(uint8(v.entry))
-			br[stream2].advance(uint8(v2.entry))
-			buf[stream][off+1] = uint8(v.entry >> 8)
-			buf[stream2][off+1] = uint8(v2.entry >> 8)
-		}
-
-		off += 2
-
-		if off == 0 {
-			if bufoff > dstEvery {
-				d.bufs.Put(buf)
-				return nil, errors.New("corruption detected: stream overrun 1")
-			}
-			// There must at least be 3 buffers left.
-			if len(out)-bufoff < dstEvery*3 {
-				d.bufs.Put(buf)
-				return nil, errors.New("corruption detected: stream overrun 2")
-			}
-			//copy(out, buf[0][:])
-			//copy(out[dstEvery:], buf[1][:])
-			//copy(out[dstEvery*2:], buf[2][:])
-			//copy(out[dstEvery*3:], buf[3][:])
-			*(*[bufoff]byte)(out) = buf[0]
-			*(*[bufoff]byte)(out[dstEvery:]) = buf[1]
-			*(*[bufoff]byte)(out[dstEvery*2:]) = buf[2]
-			*(*[bufoff]byte)(out[dstEvery*3:]) = buf[3]
-			out = out[bufoff:]
-			decoded += bufoff * 4
-		}
-	}
-	if off > 0 {
-		ioff := int(off)
-		if len(out) < dstEvery*3+ioff {
-			d.bufs.Put(buf)
-			return nil, errors.New("corruption detected: stream overrun 3")
-		}
-		copy(out, buf[0][:off])
-		copy(out[dstEvery:], buf[1][:off])
-		copy(out[dstEvery*2:], buf[2][:off])
-		copy(out[dstEvery*3:], buf[3][:off])
-		decoded += int(off) * 4
-		out = out[off:]
-	}
-
-	// Decode remaining.
-	remainBytes := dstEvery - (decoded / 4)
-	for i := range br {
-		offset := dstEvery * i
-		endsAt := offset + remainBytes
-		if endsAt > len(out) {
-			endsAt = len(out)
-		}
-		br := &br[i]
-		bitsLeft := br.remaining()
-		for bitsLeft > 0 {
-			br.fill()
-			if offset >= endsAt {
-				d.bufs.Put(buf)
-				return nil, errors.New("corruption detected: stream overrun 4")
-			}
-
-			// Read value and increment offset.
-			val := br.peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask].entry
-			nBits := uint8(v)
-			br.advance(nBits)
-			bitsLeft -= uint(nBits)
-			out[offset] = uint8(v >> 8)
-			offset++
-		}
-		if offset != endsAt {
-			d.bufs.Put(buf)
-			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
-		}
-		decoded += offset - dstEvery*i
-		err = br.close()
-		if err != nil {
-			return nil, err
-		}
-	}
-	d.bufs.Put(buf)
-	if dstSize != decoded {
-		return nil, errors.New("corruption detected: short output block")
-	}
-	return dst, nil
-}
-
-// Decompress1X will decompress a 1X encoded stream.
-// The cap of the output buffer will be the maximum decompressed size.
-// The length of the supplied input must match the end of a block exactly.
-func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
-	if len(d.dt.single) == 0 {
-		return nil, errors.New("no table loaded")
-	}
-	if use8BitTables && d.actualTableLog <= 8 {
-		return d.decompress1X8Bit(dst, src)
-	}
-	var br bitReaderShifted
-	err := br.init(src)
-	if err != nil {
-		return dst, err
-	}
-	maxDecodedSize := cap(dst)
-	dst = dst[:0]
-
-	// Avoid bounds check by always having full sized table.
-	const tlSize = 1 << tableLogMax
-	const tlMask = tlSize - 1
-	dt := d.dt.single[:tlSize]
-
-	// Use temp table to avoid bound checks/append penalty.
-	bufs := d.buffer()
-	buf := &bufs[0]
-	var off uint8
-
-	for br.off >= 8 {
-		br.fillFast()
-		v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
-		br.advance(uint8(v.entry))
-		buf[off+0] = uint8(v.entry >> 8)
-
-		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
-		br.advance(uint8(v.entry))
-		buf[off+1] = uint8(v.entry >> 8)
-
-		// Refill
-		br.fillFast()
-
-		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
-		br.advance(uint8(v.entry))
-		buf[off+2] = uint8(v.entry >> 8)
-
-		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
-		br.advance(uint8(v.entry))
-		buf[off+3] = uint8(v.entry >> 8)
-
-		off += 4
-		if off == 0 {
-			if len(dst)+256 > maxDecodedSize {
-				br.close()
-				d.bufs.Put(bufs)
-				return nil, ErrMaxDecodedSizeExceeded
-			}
-			dst = append(dst, buf[:]...)
-		}
-	}
-
-	if len(dst)+int(off) > maxDecodedSize {
-		d.bufs.Put(bufs)
-		br.close()
-		return nil, ErrMaxDecodedSizeExceeded
-	}
-	dst = append(dst, buf[:off]...)
-
-	// br < 8, so uint8 is fine
-	bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
-	for bitsLeft > 0 {
-		br.fill()
-		if false && br.bitsRead >= 32 {
-			if br.off >= 4 {
-				v := br.in[br.off-4:]
-				v = v[:4]
-				low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-				br.value = (br.value << 32) | uint64(low)
-				br.bitsRead -= 32
-				br.off -= 4
-			} else {
-				for br.off > 0 {
-					br.value = (br.value << 8) | uint64(br.in[br.off-1])
-					br.bitsRead -= 8
-					br.off--
-				}
-			}
-		}
-		if len(dst) >= maxDecodedSize {
-			d.bufs.Put(bufs)
-			br.close()
-			return nil, ErrMaxDecodedSizeExceeded
-		}
-		v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
-		nBits := uint8(v.entry)
-		br.advance(nBits)
-		bitsLeft -= nBits
-		dst = append(dst, uint8(v.entry>>8))
-	}
-	d.bufs.Put(bufs)
-	return dst, br.close()
-}
diff --git a/vendor/github.com/klauspost/compress/huff0/huff0.go b/vendor/github.com/klauspost/compress/huff0/huff0.go
deleted file mode 100644
index 77ecd68e0..000000000
--- a/vendor/github.com/klauspost/compress/huff0/huff0.go
+++ /dev/null
@@ -1,337 +0,0 @@
-// Package huff0 provides fast huffman encoding as used in zstd.
-//
-// See README.md at https://github.com/klauspost/compress/tree/master/huff0 for details.
-package huff0
-
-import (
-	"errors"
-	"fmt"
-	"math"
-	"math/bits"
-	"sync"
-
-	"github.com/klauspost/compress/fse"
-)
-
-const (
-	maxSymbolValue = 255
-
-	// zstandard limits tablelog to 11, see:
-	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#huffman-tree-description
-	tableLogMax     = 11
-	tableLogDefault = 11
-	minTablelog     = 5
-	huffNodesLen    = 512
-
-	// BlockSizeMax is maximum input size for a single block uncompressed.
-	BlockSizeMax = 1<<18 - 1
-)
-
-var (
-	// ErrIncompressible is returned when input is judged to be too hard to compress.
-	ErrIncompressible = errors.New("input is not compressible")
-
-	// ErrUseRLE is returned from the compressor when the input is a single byte value repeated.
-	ErrUseRLE = errors.New("input is single value repeated")
-
-	// ErrTooBig is return if input is too large for a single block.
-	ErrTooBig = errors.New("input too big")
-
-	// ErrMaxDecodedSizeExceeded is return if input is too large for a single block.
-	ErrMaxDecodedSizeExceeded = errors.New("maximum output size exceeded")
-)
-
-type ReusePolicy uint8
-
-const (
-	// ReusePolicyAllow will allow reuse if it produces smaller output.
-	ReusePolicyAllow ReusePolicy = iota
-
-	// ReusePolicyPrefer will re-use aggressively if possible.
-	// This will not check if a new table will produce smaller output,
-	// except if the current table is impossible to use or
-	// compressed output is bigger than input.
-	ReusePolicyPrefer
-
-	// ReusePolicyNone will disable re-use of tables.
-	// This is slightly faster than ReusePolicyAllow but may produce larger output.
-	ReusePolicyNone
-
-	// ReusePolicyMust must allow reuse and produce smaller output.
-	ReusePolicyMust
-)
-
-type Scratch struct {
-	count [maxSymbolValue + 1]uint32
-
-	// Per block parameters.
-	// These can be used to override compression parameters of the block.
-	// Do not touch, unless you know what you are doing.
-
-	// Out is output buffer.
-	// If the scratch is re-used before the caller is done processing the output,
-	// set this field to nil.
-	// Otherwise the output buffer will be re-used for next Compression/Decompression step
-	// and allocation will be avoided.
-	Out []byte
-
-	// OutTable will contain the table data only, if a new table has been generated.
-	// Slice of the returned data.
-	OutTable []byte
-
-	// OutData will contain the compressed data.
-	// Slice of the returned data.
-	OutData []byte
-
-	// MaxDecodedSize will set the maximum allowed output size.
-	// This value will automatically be set to BlockSizeMax if not set.
-	// Decoders will return ErrMaxDecodedSizeExceeded is this limit is exceeded.
-	MaxDecodedSize int
-
-	srcLen int
-
-	// MaxSymbolValue will override the maximum symbol value of the next block.
-	MaxSymbolValue uint8
-
-	// TableLog will attempt to override the tablelog for the next block.
-	// Must be <= 11 and >= 5.
-	TableLog uint8
-
-	// Reuse will specify the reuse policy
-	Reuse ReusePolicy
-
-	// WantLogLess allows to specify a log 2 reduction that should at least be achieved,
-	// otherwise the block will be returned as incompressible.
-	// The reduction should then at least be (input size >> WantLogLess)
-	// If WantLogLess == 0 any improvement will do.
-	WantLogLess uint8
-
-	symbolLen      uint16 // Length of active part of the symbol table.
-	maxCount       int    // count of the most probable symbol
-	clearCount     bool   // clear count
-	actualTableLog uint8  // Selected tablelog.
-	prevTableLog   uint8  // Tablelog for previous table
-	prevTable      cTable // Table used for previous compression.
-	cTable         cTable // compression table
-	dt             dTable // decompression table
-	nodes          []nodeElt
-	tmpOut         [4][]byte
-	fse            *fse.Scratch
-	decPool        sync.Pool // *[4][256]byte buffers.
-	huffWeight     [maxSymbolValue + 1]byte
-}
-
-// TransferCTable will transfer the previously used compression table.
-func (s *Scratch) TransferCTable(src *Scratch) {
-	if cap(s.prevTable) < len(src.prevTable) {
-		s.prevTable = make(cTable, 0, maxSymbolValue+1)
-	}
-	s.prevTable = s.prevTable[:len(src.prevTable)]
-	copy(s.prevTable, src.prevTable)
-	s.prevTableLog = src.prevTableLog
-}
-
-func (s *Scratch) prepare(in []byte) (*Scratch, error) {
-	if len(in) > BlockSizeMax {
-		return nil, ErrTooBig
-	}
-	if s == nil {
-		s = &Scratch{}
-	}
-	if s.MaxSymbolValue == 0 {
-		s.MaxSymbolValue = maxSymbolValue
-	}
-	if s.TableLog == 0 {
-		s.TableLog = tableLogDefault
-	}
-	if s.TableLog > tableLogMax || s.TableLog < minTablelog {
-		return nil, fmt.Errorf(" invalid tableLog %d (%d -> %d)", s.TableLog, minTablelog, tableLogMax)
-	}
-	if s.MaxDecodedSize <= 0 || s.MaxDecodedSize > BlockSizeMax {
-		s.MaxDecodedSize = BlockSizeMax
-	}
-	if s.clearCount && s.maxCount == 0 {
-		for i := range s.count {
-			s.count[i] = 0
-		}
-		s.clearCount = false
-	}
-	if cap(s.Out) == 0 {
-		s.Out = make([]byte, 0, len(in))
-	}
-	s.Out = s.Out[:0]
-
-	s.OutTable = nil
-	s.OutData = nil
-	if cap(s.nodes) < huffNodesLen+1 {
-		s.nodes = make([]nodeElt, 0, huffNodesLen+1)
-	}
-	s.nodes = s.nodes[:0]
-	if s.fse == nil {
-		s.fse = &fse.Scratch{}
-	}
-	s.srcLen = len(in)
-
-	return s, nil
-}
-
-type cTable []cTableEntry
-
-func (c cTable) write(s *Scratch) error {
-	var (
-		// precomputed conversion table
-		bitsToWeight [tableLogMax + 1]byte
-		huffLog      = s.actualTableLog
-		// last weight is not saved.
-		maxSymbolValue = uint8(s.symbolLen - 1)
-		huffWeight     = s.huffWeight[:256]
-	)
-	const (
-		maxFSETableLog = 6
-	)
-	// convert to weight
-	bitsToWeight[0] = 0
-	for n := uint8(1); n < huffLog+1; n++ {
-		bitsToWeight[n] = huffLog + 1 - n
-	}
-
-	// Acquire histogram for FSE.
-	hist := s.fse.Histogram()
-	hist = hist[:256]
-	for i := range hist[:16] {
-		hist[i] = 0
-	}
-	for n := uint8(0); n < maxSymbolValue; n++ {
-		v := bitsToWeight[c[n].nBits] & 15
-		huffWeight[n] = v
-		hist[v]++
-	}
-
-	// FSE compress if feasible.
-	if maxSymbolValue >= 2 {
-		huffMaxCnt := uint32(0)
-		huffMax := uint8(0)
-		for i, v := range hist[:16] {
-			if v == 0 {
-				continue
-			}
-			huffMax = byte(i)
-			if v > huffMaxCnt {
-				huffMaxCnt = v
-			}
-		}
-		s.fse.HistogramFinished(huffMax, int(huffMaxCnt))
-		s.fse.TableLog = maxFSETableLog
-		b, err := fse.Compress(huffWeight[:maxSymbolValue], s.fse)
-		if err == nil && len(b) < int(s.symbolLen>>1) {
-			s.Out = append(s.Out, uint8(len(b)))
-			s.Out = append(s.Out, b...)
-			return nil
-		}
-		// Unable to compress (RLE/uncompressible)
-	}
-	// write raw values as 4-bits (max : 15)
-	if maxSymbolValue > (256 - 128) {
-		// should not happen : likely means source cannot be compressed
-		return ErrIncompressible
-	}
-	op := s.Out
-	// special case, pack weights 4 bits/weight.
-	op = append(op, 128|(maxSymbolValue-1))
-	// be sure it doesn't cause msan issue in final combination
-	huffWeight[maxSymbolValue] = 0
-	for n := uint16(0); n < uint16(maxSymbolValue); n += 2 {
-		op = append(op, (huffWeight[n]<<4)|huffWeight[n+1])
-	}
-	s.Out = op
-	return nil
-}
-
-func (c cTable) estTableSize(s *Scratch) (sz int, err error) {
-	var (
-		// precomputed conversion table
-		bitsToWeight [tableLogMax + 1]byte
-		huffLog      = s.actualTableLog
-		// last weight is not saved.
-		maxSymbolValue = uint8(s.symbolLen - 1)
-		huffWeight     = s.huffWeight[:256]
-	)
-	const (
-		maxFSETableLog = 6
-	)
-	// convert to weight
-	bitsToWeight[0] = 0
-	for n := uint8(1); n < huffLog+1; n++ {
-		bitsToWeight[n] = huffLog + 1 - n
-	}
-
-	// Acquire histogram for FSE.
-	hist := s.fse.Histogram()
-	hist = hist[:256]
-	for i := range hist[:16] {
-		hist[i] = 0
-	}
-	for n := uint8(0); n < maxSymbolValue; n++ {
-		v := bitsToWeight[c[n].nBits] & 15
-		huffWeight[n] = v
-		hist[v]++
-	}
-
-	// FSE compress if feasible.
-	if maxSymbolValue >= 2 {
-		huffMaxCnt := uint32(0)
-		huffMax := uint8(0)
-		for i, v := range hist[:16] {
-			if v == 0 {
-				continue
-			}
-			huffMax = byte(i)
-			if v > huffMaxCnt {
-				huffMaxCnt = v
-			}
-		}
-		s.fse.HistogramFinished(huffMax, int(huffMaxCnt))
-		s.fse.TableLog = maxFSETableLog
-		b, err := fse.Compress(huffWeight[:maxSymbolValue], s.fse)
-		if err == nil && len(b) < int(s.symbolLen>>1) {
-			sz += 1 + len(b)
-			return sz, nil
-		}
-		// Unable to compress (RLE/uncompressible)
-	}
-	// write raw values as 4-bits (max : 15)
-	if maxSymbolValue > (256 - 128) {
-		// should not happen : likely means source cannot be compressed
-		return 0, ErrIncompressible
-	}
-	// special case, pack weights 4 bits/weight.
-	sz += 1 + int(maxSymbolValue/2)
-	return sz, nil
-}
-
-// estimateSize returns the estimated size in bytes of the input represented in the
-// histogram supplied.
-func (c cTable) estimateSize(hist []uint32) int {
-	nbBits := uint32(7)
-	for i, v := range c[:len(hist)] {
-		nbBits += uint32(v.nBits) * hist[i]
-	}
-	return int(nbBits >> 3)
-}
-
-// minSize returns the minimum possible size considering the shannon limit.
-func (s *Scratch) minSize(total int) int {
-	nbBits := float64(7)
-	fTotal := float64(total)
-	for _, v := range s.count[:s.symbolLen] {
-		n := float64(v)
-		if n > 0 {
-			nbBits += math.Log2(fTotal/n) * n
-		}
-	}
-	return int(nbBits) >> 3
-}
-
-func highBit32(val uint32) (n uint32) {
-	return uint32(bits.Len32(val) - 1)
-}