diff options
Diffstat (limited to 'vendor/github.com/klauspost/compress/zstd/dict.go')
-rw-r--r-- | vendor/github.com/klauspost/compress/zstd/dict.go | 565 |
1 files changed, 0 insertions, 565 deletions
diff --git a/vendor/github.com/klauspost/compress/zstd/dict.go b/vendor/github.com/klauspost/compress/zstd/dict.go deleted file mode 100644 index b7b83164b..000000000 --- a/vendor/github.com/klauspost/compress/zstd/dict.go +++ /dev/null @@ -1,565 +0,0 @@ -package zstd - -import ( - "bytes" - "encoding/binary" - "errors" - "fmt" - "io" - "math" - "sort" - - "github.com/klauspost/compress/huff0" -) - -type dict struct { - id uint32 - - litEnc *huff0.Scratch - llDec, ofDec, mlDec sequenceDec - offsets [3]int - content []byte -} - -const dictMagic = "\x37\xa4\x30\xec" - -// Maximum dictionary size for the reference implementation (1.5.3) is 2 GiB. -const dictMaxLength = 1 << 31 - -// ID returns the dictionary id or 0 if d is nil. -func (d *dict) ID() uint32 { - if d == nil { - return 0 - } - return d.id -} - -// ContentSize returns the dictionary content size or 0 if d is nil. -func (d *dict) ContentSize() int { - if d == nil { - return 0 - } - return len(d.content) -} - -// Content returns the dictionary content. -func (d *dict) Content() []byte { - if d == nil { - return nil - } - return d.content -} - -// Offsets returns the initial offsets. -func (d *dict) Offsets() [3]int { - if d == nil { - return [3]int{} - } - return d.offsets -} - -// LitEncoder returns the literal encoder. -func (d *dict) LitEncoder() *huff0.Scratch { - if d == nil { - return nil - } - return d.litEnc -} - -// Load a dictionary as described in -// https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format -func loadDict(b []byte) (*dict, error) { - // Check static field size. - if len(b) <= 8+(3*4) { - return nil, io.ErrUnexpectedEOF - } - d := dict{ - llDec: sequenceDec{fse: &fseDecoder{}}, - ofDec: sequenceDec{fse: &fseDecoder{}}, - mlDec: sequenceDec{fse: &fseDecoder{}}, - } - if string(b[:4]) != dictMagic { - return nil, ErrMagicMismatch - } - d.id = binary.LittleEndian.Uint32(b[4:8]) - if d.id == 0 { - return nil, errors.New("dictionaries cannot have ID 0") - } - - // Read literal table - var err error - d.litEnc, b, err = huff0.ReadTable(b[8:], nil) - if err != nil { - return nil, fmt.Errorf("loading literal table: %w", err) - } - d.litEnc.Reuse = huff0.ReusePolicyMust - - br := byteReader{ - b: b, - off: 0, - } - readDec := func(i tableIndex, dec *fseDecoder) error { - if err := dec.readNCount(&br, uint16(maxTableSymbol[i])); err != nil { - return err - } - if br.overread() { - return io.ErrUnexpectedEOF - } - err = dec.transform(symbolTableX[i]) - if err != nil { - println("Transform table error:", err) - return err - } - if debugDecoder || debugEncoder { - println("Read table ok", "symbolLen:", dec.symbolLen) - } - // Set decoders as predefined so they aren't reused. - dec.preDefined = true - return nil - } - - if err := readDec(tableOffsets, d.ofDec.fse); err != nil { - return nil, err - } - if err := readDec(tableMatchLengths, d.mlDec.fse); err != nil { - return nil, err - } - if err := readDec(tableLiteralLengths, d.llDec.fse); err != nil { - return nil, err - } - if br.remain() < 12 { - return nil, io.ErrUnexpectedEOF - } - - d.offsets[0] = int(br.Uint32()) - br.advance(4) - d.offsets[1] = int(br.Uint32()) - br.advance(4) - d.offsets[2] = int(br.Uint32()) - br.advance(4) - if d.offsets[0] <= 0 || d.offsets[1] <= 0 || d.offsets[2] <= 0 { - return nil, errors.New("invalid offset in dictionary") - } - d.content = make([]byte, br.remain()) - copy(d.content, br.unread()) - if d.offsets[0] > len(d.content) || d.offsets[1] > len(d.content) || d.offsets[2] > len(d.content) { - return nil, fmt.Errorf("initial offset bigger than dictionary content size %d, offsets: %v", len(d.content), d.offsets) - } - - return &d, nil -} - -// InspectDictionary loads a zstd dictionary and provides functions to inspect the content. -func InspectDictionary(b []byte) (interface { - ID() uint32 - ContentSize() int - Content() []byte - Offsets() [3]int - LitEncoder() *huff0.Scratch -}, error) { - initPredefined() - d, err := loadDict(b) - return d, err -} - -type BuildDictOptions struct { - // Dictionary ID. - ID uint32 - - // Content to use to create dictionary tables. - Contents [][]byte - - // History to use for all blocks. - History []byte - - // Offsets to use. - Offsets [3]int - - // CompatV155 will make the dictionary compatible with Zstd v1.5.5 and earlier. - // See https://github.com/facebook/zstd/issues/3724 - CompatV155 bool - - // Use the specified encoder level. - // The dictionary will be built using the specified encoder level, - // which will reflect speed and make the dictionary tailored for that level. - // If not set SpeedBestCompression will be used. - Level EncoderLevel - - // DebugOut will write stats and other details here if set. - DebugOut io.Writer -} - -func BuildDict(o BuildDictOptions) ([]byte, error) { - initPredefined() - hist := o.History - contents := o.Contents - debug := o.DebugOut != nil - println := func(args ...interface{}) { - if o.DebugOut != nil { - fmt.Fprintln(o.DebugOut, args...) - } - } - printf := func(s string, args ...interface{}) { - if o.DebugOut != nil { - fmt.Fprintf(o.DebugOut, s, args...) - } - } - print := func(args ...interface{}) { - if o.DebugOut != nil { - fmt.Fprint(o.DebugOut, args...) - } - } - - if int64(len(hist)) > dictMaxLength { - return nil, fmt.Errorf("dictionary of size %d > %d", len(hist), int64(dictMaxLength)) - } - if len(hist) < 8 { - return nil, fmt.Errorf("dictionary of size %d < %d", len(hist), 8) - } - if len(contents) == 0 { - return nil, errors.New("no content provided") - } - d := dict{ - id: o.ID, - litEnc: nil, - llDec: sequenceDec{}, - ofDec: sequenceDec{}, - mlDec: sequenceDec{}, - offsets: o.Offsets, - content: hist, - } - block := blockEnc{lowMem: false} - block.init() - enc := encoder(&bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(maxMatchLen), bufferReset: math.MaxInt32 - int32(maxMatchLen*2), lowMem: false}}) - if o.Level != 0 { - eOpts := encoderOptions{ - level: o.Level, - blockSize: maxMatchLen, - windowSize: maxMatchLen, - dict: &d, - lowMem: false, - } - enc = eOpts.encoder() - } else { - o.Level = SpeedBestCompression - } - var ( - remain [256]int - ll [256]int - ml [256]int - of [256]int - ) - addValues := func(dst *[256]int, src []byte) { - for _, v := range src { - dst[v]++ - } - } - addHist := func(dst *[256]int, src *[256]uint32) { - for i, v := range src { - dst[i] += int(v) - } - } - seqs := 0 - nUsed := 0 - litTotal := 0 - newOffsets := make(map[uint32]int, 1000) - for _, b := range contents { - block.reset(nil) - if len(b) < 8 { - continue - } - nUsed++ - enc.Reset(&d, true) - enc.Encode(&block, b) - addValues(&remain, block.literals) - litTotal += len(block.literals) - if len(block.sequences) == 0 { - continue - } - seqs += len(block.sequences) - block.genCodes() - addHist(&ll, block.coders.llEnc.Histogram()) - addHist(&ml, block.coders.mlEnc.Histogram()) - addHist(&of, block.coders.ofEnc.Histogram()) - for i, seq := range block.sequences { - if i > 3 { - break - } - offset := seq.offset - if offset == 0 { - continue - } - if int(offset) >= len(o.History) { - continue - } - if offset > 3 { - newOffsets[offset-3]++ - } else { - newOffsets[uint32(o.Offsets[offset-1])]++ - } - } - } - // Find most used offsets. - var sortedOffsets []uint32 - for k := range newOffsets { - sortedOffsets = append(sortedOffsets, k) - } - sort.Slice(sortedOffsets, func(i, j int) bool { - a, b := sortedOffsets[i], sortedOffsets[j] - if a == b { - // Prefer the longer offset - return sortedOffsets[i] > sortedOffsets[j] - } - return newOffsets[sortedOffsets[i]] > newOffsets[sortedOffsets[j]] - }) - if len(sortedOffsets) > 3 { - if debug { - print("Offsets:") - for i, v := range sortedOffsets { - if i > 20 { - break - } - printf("[%d: %d],", v, newOffsets[v]) - } - println("") - } - - sortedOffsets = sortedOffsets[:3] - } - for i, v := range sortedOffsets { - o.Offsets[i] = int(v) - } - if debug { - println("New repeat offsets", o.Offsets) - } - - if nUsed == 0 || seqs == 0 { - return nil, fmt.Errorf("%d blocks, %d sequences found", nUsed, seqs) - } - if debug { - println("Sequences:", seqs, "Blocks:", nUsed, "Literals:", litTotal) - } - if seqs/nUsed < 512 { - // Use 512 as minimum. - nUsed = seqs / 512 - if nUsed == 0 { - nUsed = 1 - } - } - copyHist := func(dst *fseEncoder, src *[256]int) ([]byte, error) { - hist := dst.Histogram() - var maxSym uint8 - var maxCount int - var fakeLength int - for i, v := range src { - if v > 0 { - v = v / nUsed - if v == 0 { - v = 1 - } - } - if v > maxCount { - maxCount = v - } - if v != 0 { - maxSym = uint8(i) - } - fakeLength += v - hist[i] = uint32(v) - } - - // Ensure we aren't trying to represent RLE. - if maxCount == fakeLength { - for i := range hist { - if uint8(i) == maxSym { - fakeLength++ - maxSym++ - hist[i+1] = 1 - if maxSym > 1 { - break - } - } - if hist[0] == 0 { - fakeLength++ - hist[i] = 1 - if maxSym > 1 { - break - } - } - } - } - - dst.HistogramFinished(maxSym, maxCount) - dst.reUsed = false - dst.useRLE = false - err := dst.normalizeCount(fakeLength) - if err != nil { - return nil, err - } - if debug { - println("RAW:", dst.count[:maxSym+1], "NORM:", dst.norm[:maxSym+1], "LEN:", fakeLength) - } - return dst.writeCount(nil) - } - if debug { - print("Literal lengths: ") - } - llTable, err := copyHist(block.coders.llEnc, &ll) - if err != nil { - return nil, err - } - if debug { - print("Match lengths: ") - } - mlTable, err := copyHist(block.coders.mlEnc, &ml) - if err != nil { - return nil, err - } - if debug { - print("Offsets: ") - } - ofTable, err := copyHist(block.coders.ofEnc, &of) - if err != nil { - return nil, err - } - - // Literal table - avgSize := litTotal - if avgSize > huff0.BlockSizeMax/2 { - avgSize = huff0.BlockSizeMax / 2 - } - huffBuff := make([]byte, 0, avgSize) - // Target size - div := litTotal / avgSize - if div < 1 { - div = 1 - } - if debug { - println("Huffman weights:") - } - for i, n := range remain[:] { - if n > 0 { - n = n / div - // Allow all entries to be represented. - if n == 0 { - n = 1 - } - huffBuff = append(huffBuff, bytes.Repeat([]byte{byte(i)}, n)...) - if debug { - printf("[%d: %d], ", i, n) - } - } - } - if o.CompatV155 && remain[255]/div == 0 { - huffBuff = append(huffBuff, 255) - } - scratch := &huff0.Scratch{TableLog: 11} - for tries := 0; tries < 255; tries++ { - scratch = &huff0.Scratch{TableLog: 11} - _, _, err = huff0.Compress1X(huffBuff, scratch) - if err == nil { - break - } - if debug { - printf("Try %d: Huffman error: %v\n", tries+1, err) - } - huffBuff = huffBuff[:0] - if tries == 250 { - if debug { - println("Huffman: Bailing out with predefined table") - } - - // Bail out.... Just generate something - huffBuff = append(huffBuff, bytes.Repeat([]byte{255}, 10000)...) - for i := 0; i < 128; i++ { - huffBuff = append(huffBuff, byte(i)) - } - continue - } - if errors.Is(err, huff0.ErrIncompressible) { - // Try truncating least common. - for i, n := range remain[:] { - if n > 0 { - n = n / (div * (i + 1)) - if n > 0 { - huffBuff = append(huffBuff, bytes.Repeat([]byte{byte(i)}, n)...) - } - } - } - if o.CompatV155 && len(huffBuff) > 0 && huffBuff[len(huffBuff)-1] != 255 { - huffBuff = append(huffBuff, 255) - } - if len(huffBuff) == 0 { - huffBuff = append(huffBuff, 0, 255) - } - } - if errors.Is(err, huff0.ErrUseRLE) { - for i, n := range remain[:] { - n = n / (div * (i + 1)) - // Allow all entries to be represented. - if n == 0 { - n = 1 - } - huffBuff = append(huffBuff, bytes.Repeat([]byte{byte(i)}, n)...) - } - } - } - - var out bytes.Buffer - out.Write([]byte(dictMagic)) - out.Write(binary.LittleEndian.AppendUint32(nil, o.ID)) - out.Write(scratch.OutTable) - if debug { - println("huff table:", len(scratch.OutTable), "bytes") - println("of table:", len(ofTable), "bytes") - println("ml table:", len(mlTable), "bytes") - println("ll table:", len(llTable), "bytes") - } - out.Write(ofTable) - out.Write(mlTable) - out.Write(llTable) - out.Write(binary.LittleEndian.AppendUint32(nil, uint32(o.Offsets[0]))) - out.Write(binary.LittleEndian.AppendUint32(nil, uint32(o.Offsets[1]))) - out.Write(binary.LittleEndian.AppendUint32(nil, uint32(o.Offsets[2]))) - out.Write(hist) - if debug { - _, err := loadDict(out.Bytes()) - if err != nil { - panic(err) - } - i, err := InspectDictionary(out.Bytes()) - if err != nil { - panic(err) - } - println("ID:", i.ID()) - println("Content size:", i.ContentSize()) - println("Encoder:", i.LitEncoder() != nil) - println("Offsets:", i.Offsets()) - var totalSize int - for _, b := range contents { - totalSize += len(b) - } - - encWith := func(opts ...EOption) int { - enc, err := NewWriter(nil, opts...) - if err != nil { - panic(err) - } - defer enc.Close() - var dst []byte - var totalSize int - for _, b := range contents { - dst = enc.EncodeAll(b, dst[:0]) - totalSize += len(dst) - } - return totalSize - } - plain := encWith(WithEncoderLevel(o.Level)) - withDict := encWith(WithEncoderLevel(o.Level), WithEncoderDict(out.Bytes())) - println("Input size:", totalSize) - println("Plain Compressed:", plain) - println("Dict Compressed:", withDict) - println("Saved:", plain-withDict, (plain-withDict)/len(contents), "bytes per input (rounded down)") - } - return out.Bytes(), nil -} |