summaryrefslogtreecommitdiff
path: root/vendor/github.com/klauspost/compress/s2
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/klauspost/compress/s2')
-rw-r--r--vendor/github.com/klauspost/compress/s2/README.md283
-rw-r--r--vendor/github.com/klauspost/compress/s2/decode.go6
-rw-r--r--vendor/github.com/klauspost/compress/s2/decode_other.go34
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode_all.go3
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode_amd64.go12
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode_best.go35
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode_better.go105
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode_go.go9
-rw-r--r--vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go23
-rw-r--r--vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s919
10 files changed, 784 insertions, 645 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/README.md b/vendor/github.com/klauspost/compress/s2/README.md
index 73c0c462d..1d80c42a5 100644
--- a/vendor/github.com/klauspost/compress/s2/README.md
+++ b/vendor/github.com/klauspost/compress/s2/README.md
@@ -325,35 +325,35 @@ The content compressed in this mode is fully compatible with the standard decode
Snappy vs S2 **compression** speed on 16 core (32 thread) computer, using all threads and a single thread (1 CPU):
-| File | S2 speed | S2 Throughput | S2 % smaller | S2 "better" | "better" throughput | "better" % smaller |
-|-----------------------------------------------------------------------------------------------------|----------|---------------|--------------|-------------|---------------------|--------------------|
-| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z) | 12.70x | 10556 MB/s | 7.35% | 4.15x | 3455 MB/s | 12.79% |
-| (1 CPU) | 1.14x | 948 MB/s | - | 0.42x | 349 MB/s | - |
-| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 17.13x | 14484 MB/s | 31.60% | 10.09x | 8533 MB/s | 37.71% |
-| (1 CPU) | 1.33x | 1127 MB/s | - | 0.70x | 589 MB/s | - |
-| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) | 15.14x | 12000 MB/s | -5.79% | 6.59x | 5223 MB/s | 5.80% |
-| (1 CPU) | 1.11x | 877 MB/s | - | 0.47x | 370 MB/s | - |
-| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst) | 14.62x | 12116 MB/s | 15.90% | 5.35x | 4430 MB/s | 16.08% |
-| (1 CPU) | 1.38x | 1146 MB/s | - | 0.38x | 312 MB/s | - |
-| [adresser.json](https://files.klauspost.com/compress/adresser.json.zst) | 8.83x | 17579 MB/s | 43.86% | 6.54x | 13011 MB/s | 47.23% |
-| (1 CPU) | 1.14x | 2259 MB/s | - | 0.74x | 1475 MB/s | - |
-| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z) | 16.72x | 14019 MB/s | 24.02% | 10.11x | 8477 MB/s | 30.48% |
-| (1 CPU) | 1.24x | 1043 MB/s | - | 0.70x | 586 MB/s | - |
-| [10gb.tar](http://mattmahoney.net/dc/10gb.html) | 13.33x | 9254 MB/s | 1.84% | 6.75x | 4686 MB/s | 6.72% |
-| (1 CPU) | 0.97x | 672 MB/s | - | 0.53x | 366 MB/s | - |
-| sharnd.out.2gb | 2.11x | 12639 MB/s | 0.01% | 1.98x | 11833 MB/s | 0.01% |
-| (1 CPU) | 0.93x | 5594 MB/s | - | 1.34x | 8030 MB/s | - |
-| [enwik9](http://mattmahoney.net/dc/textdata.html) | 19.34x | 8220 MB/s | 3.98% | 7.87x | 3345 MB/s | 15.82% |
-| (1 CPU) | 1.06x | 452 MB/s | - | 0.50x | 213 MB/s | - |
-| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip) | 10.48x | 6124 MB/s | 5.67% | 3.76x | 2197 MB/s | 12.60% |
-| (1 CPU) | 0.97x | 568 MB/s | - | 0.46x | 271 MB/s | - |
-| [enwik10](https://encode.su/threads/3315-enwik10-benchmark-results) | 21.07x | 9020 MB/s | 6.36% | 6.91x | 2959 MB/s | 16.95% |
-| (1 CPU) | 1.07x | 460 MB/s | - | 0.51x | 220 MB/s | - |
+| File | S2 Speed | S2 Throughput | S2 % smaller | S2 "better" | "better" throughput | "better" % smaller |
+|---------------------------------------------------------------------------------------------------------|----------|---------------|--------------|-------------|---------------------|--------------------|
+| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z) | 16.33x | 10556 MB/s | 8.0% | 6.04x | 5252 MB/s | 14.7% |
+| (1 CPU) | 1.08x | 940 MB/s | - | 0.46x | 400 MB/s | - |
+| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 16.51x | 15224 MB/s | 31.70% | 9.47x | 8734 MB/s | 37.71% |
+| (1 CPU) | 1.26x | 1157 MB/s | - | 0.60x | 556 MB/s | - |
+| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) | 15.14x | 12598 MB/s | -5.76% | 6.23x | 5675 MB/s | 3.62% |
+| (1 CPU) | 1.02x | 932 MB/s | - | 0.47x | 432 MB/s | - |
+| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst) | 11.21x | 12116 MB/s | 15.95% | 3.24x | 3500 MB/s | 18.00% |
+| (1 CPU) | 1.05x | 1135 MB/s | - | 0.27x | 292 MB/s | - |
+| [apache.log](https://files.klauspost.com/compress/apache.log.zst) | 8.55x | 16673 MB/s | 20.54% | 5.85x | 11420 MB/s | 24.97% |
+| (1 CPU) | 1.91x | 1771 MB/s | - | 0.53x | 1041 MB/s | - |
+| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z) | 15.76x | 14357 MB/s | 24.01% | 8.67x | 7891 MB/s | 33.68% |
+| (1 CPU) | 1.17x | 1064 MB/s | - | 0.65x | 595 MB/s | - |
+| [10gb.tar](http://mattmahoney.net/dc/10gb.html) | 13.33x | 9835 MB/s | 2.34% | 6.85x | 4863 MB/s | 9.96% |
+| (1 CPU) | 0.97x | 689 MB/s | - | 0.55x | 387 MB/s | - |
+| sharnd.out.2gb | 9.11x | 13213 MB/s | 0.01% | 1.49x | 9184 MB/s | 0.01% |
+| (1 CPU) | 0.88x | 5418 MB/s | - | 0.77x | 5417 MB/s | - |
+| [sofia-air-quality-dataset csv](https://files.klauspost.com/compress/sofia-air-quality-dataset.tar.zst) | 22.00x | 11477 MB/s | 18.73% | 11.15x | 5817 MB/s | 27.88% |
+| (1 CPU) | 1.23x | 642 MB/s | - | 0.71x | 642 MB/s | - |
+| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip) | 11.23x | 6520 MB/s | 5.9% | 5.35x | 3109 MB/s | 15.88% |
+| (1 CPU) | 1.05x | 607 MB/s | - | 0.52x | 304 MB/s | - |
+| [enwik9](https://files.klauspost.com/compress/enwik9.zst) | 19.28x | 8440 MB/s | 4.04% | 9.31x | 4076 MB/s | 18.04% |
+| (1 CPU) | 1.12x | 488 MB/s | - | 0.57x | 250 MB/s | - |
### Legend
-* `S2 speed`: Speed of S2 compared to Snappy, using 16 cores and 1 core.
-* `S2 throughput`: Throughput of S2 in MB/s.
+* `S2 Speed`: Speed of S2 compared to Snappy, using 16 cores and 1 core.
+* `S2 Throughput`: Throughput of S2 in MB/s.
* `S2 % smaller`: How many percent of the Snappy output size is S2 better.
* `S2 "better"`: Speed when enabling "better" compression mode in S2 compared to Snappy.
* `"better" throughput`: Speed when enabling "better" compression mode in S2 compared to Snappy.
@@ -361,7 +361,7 @@ Snappy vs S2 **compression** speed on 16 core (32 thread) computer, using all th
There is a good speedup across the board when using a single thread and a significant speedup when using multiple threads.
-Machine generated data gets by far the biggest compression boost, with size being being reduced by up to 45% of Snappy size.
+Machine generated data gets by far the biggest compression boost, with size being reduced by up to 35% of Snappy size.
The "better" compression mode sees a good improvement in all cases, but usually at a performance cost.
@@ -404,15 +404,15 @@ The "better" compression mode will actively look for shorter matches, which is w
Without assembly decompression is also very fast; single goroutine decompression speed. No assembly:
| File | S2 Throughput | S2 throughput |
-|--------------------------------|--------------|---------------|
-| consensus.db.10gb.s2 | 1.84x | 2289.8 MB/s |
-| 10gb.tar.s2 | 1.30x | 867.07 MB/s |
-| rawstudio-mint14.tar.s2 | 1.66x | 1329.65 MB/s |
-| github-june-2days-2019.json.s2 | 2.36x | 1831.59 MB/s |
-| github-ranks-backup.bin.s2 | 1.73x | 1390.7 MB/s |
-| enwik9.s2 | 1.67x | 681.53 MB/s |
-| adresser.json.s2 | 3.41x | 4230.53 MB/s |
-| silesia.tar.s2 | 1.52x | 811.58 |
+|--------------------------------|---------------|---------------|
+| consensus.db.10gb.s2 | 1.84x | 2289.8 MB/s |
+| 10gb.tar.s2 | 1.30x | 867.07 MB/s |
+| rawstudio-mint14.tar.s2 | 1.66x | 1329.65 MB/s |
+| github-june-2days-2019.json.s2 | 2.36x | 1831.59 MB/s |
+| github-ranks-backup.bin.s2 | 1.73x | 1390.7 MB/s |
+| enwik9.s2 | 1.67x | 681.53 MB/s |
+| adresser.json.s2 | 3.41x | 4230.53 MB/s |
+| silesia.tar.s2 | 1.52x | 811.58 |
Even though S2 typically compresses better than Snappy, decompression speed is always better.
@@ -450,14 +450,14 @@ The most reliable is a wide dataset.
For this we use [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z),
53927 files, total input size: 4,014,735,833 bytes. Single goroutine used.
-| * | Input | Output | Reduction | MB/s |
-|-------------------|------------|------------|-----------|--------|
-| S2 | 4014735833 | 1059723369 | 73.60% | **934.34** |
-| S2 Better | 4014735833 | 969670507 | 75.85% | 532.70 |
-| S2 Best | 4014735833 | 906625668 | **77.85%** | 46.84 |
-| Snappy | 4014735833 | 1128706759 | 71.89% | 762.59 |
-| S2, Snappy Output | 4014735833 | 1093821420 | 72.75% | 908.60 |
-| LZ4 | 4014735833 | 1079259294 | 73.12% | 526.94 |
+| * | Input | Output | Reduction | MB/s |
+|-------------------|------------|------------|------------|------------|
+| S2 | 4014735833 | 1059723369 | 73.60% | **936.73** |
+| S2 Better | 4014735833 | 961580539 | 76.05% | 451.10 |
+| S2 Best | 4014735833 | 899182886 | **77.60%** | 46.84 |
+| Snappy | 4014735833 | 1128706759 | 71.89% | 790.15 |
+| S2, Snappy Output | 4014735833 | 1093823291 | 72.75% | 936.60 |
+| LZ4 | 4014735833 | 1063768713 | 73.50% | 452.02 |
S2 delivers both the best single threaded throughput with regular mode and the best compression rate with "best".
"Better" mode provides the same compression speed as LZ4 with better compression ratio.
@@ -489,42 +489,23 @@ AMD64 assembly is use for both S2 and Snappy.
| Absolute Perf | Snappy size | S2 Size | Snappy Speed | S2 Speed | Snappy dec | S2 dec |
|-----------------------|-------------|---------|--------------|-------------|-------------|-------------|
-| html | 22843 | 21111 | 16246 MB/s | 17438 MB/s | 40972 MB/s | 49263 MB/s |
-| urls.10K | 335492 | 287326 | 7943 MB/s | 9693 MB/s | 22523 MB/s | 26484 MB/s |
-| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 273889 MB/s | 718321 MB/s | 827552 MB/s |
-| fireworks.jpeg (200B) | 146 | 155 | 8869 MB/s | 17773 MB/s | 33691 MB/s | 52421 MB/s |
-| paper-100k.pdf | 85304 | 84459 | 167546 MB/s | 101263 MB/s | 326905 MB/s | 291944 MB/s |
-| html_x_4 | 92234 | 21113 | 15194 MB/s | 50670 MB/s | 30843 MB/s | 32217 MB/s |
-| alice29.txt | 88034 | 85975 | 5936 MB/s | 6139 MB/s | 12882 MB/s | 20044 MB/s |
-| asyoulik.txt | 77503 | 79650 | 5517 MB/s | 6366 MB/s | 12735 MB/s | 22806 MB/s |
-| lcet10.txt | 234661 | 220670 | 6235 MB/s | 6067 MB/s | 14519 MB/s | 18697 MB/s |
-| plrabn12.txt | 319267 | 317985 | 5159 MB/s | 5726 MB/s | 11923 MB/s | 19901 MB/s |
-| geo.protodata | 23335 | 18690 | 21220 MB/s | 26529 MB/s | 56271 MB/s | 62540 MB/s |
-| kppkn.gtb | 69526 | 65312 | 9732 MB/s | 8559 MB/s | 18491 MB/s | 18969 MB/s |
-| alice29.txt (128B) | 80 | 82 | 6691 MB/s | 15489 MB/s | 31883 MB/s | 38874 MB/s |
-| alice29.txt (1000B) | 774 | 774 | 12204 MB/s | 13000 MB/s | 48056 MB/s | 52341 MB/s |
-| alice29.txt (10000B) | 6648 | 6933 | 10044 MB/s | 12806 MB/s | 32378 MB/s | 46322 MB/s |
-| alice29.txt (20000B) | 12686 | 13574 | 7733 MB/s | 11210 MB/s | 30566 MB/s | 58969 MB/s |
-
-
-| Relative Perf | Snappy size | S2 size improved | S2 Speed | S2 Dec Speed |
-|-----------------------|-------------|------------------|----------|--------------|
-| html | 22.31% | 7.58% | 1.07x | 1.20x |
-| urls.10K | 47.78% | 14.36% | 1.22x | 1.18x |
-| fireworks.jpeg | 99.95% | -0.05% | 0.78x | 1.15x |
-| fireworks.jpeg (200B) | 73.00% | -6.16% | 2.00x | 1.56x |
-| paper-100k.pdf | 83.30% | 0.99% | 0.60x | 0.89x |
-| html_x_4 | 22.52% | 77.11% | 3.33x | 1.04x |
-| alice29.txt | 57.88% | 2.34% | 1.03x | 1.56x |
-| asyoulik.txt | 61.91% | -2.77% | 1.15x | 1.79x |
-| lcet10.txt | 54.99% | 5.96% | 0.97x | 1.29x |
-| plrabn12.txt | 66.26% | 0.40% | 1.11x | 1.67x |
-| geo.protodata | 19.68% | 19.91% | 1.25x | 1.11x |
-| kppkn.gtb | 37.72% | 6.06% | 0.88x | 1.03x |
-| alice29.txt (128B) | 62.50% | -2.50% | 2.31x | 1.22x |
-| alice29.txt (1000B) | 77.40% | 0.00% | 1.07x | 1.09x |
-| alice29.txt (10000B) | 66.48% | -4.29% | 1.27x | 1.43x |
-| alice29.txt (20000B) | 63.43% | -7.00% | 1.45x | 1.93x |
+| html | 22843 | 20868 | 16246 MB/s | 18617 MB/s | 40972 MB/s | 49263 MB/s |
+| urls.10K | 335492 | 286541 | 7943 MB/s | 10201 MB/s | 22523 MB/s | 26484 MB/s |
+| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 303228 MB/s | 718321 MB/s | 827552 MB/s |
+| fireworks.jpeg (200B) | 146 | 155 | 8869 MB/s | 20180 MB/s | 33691 MB/s | 52421 MB/s |
+| paper-100k.pdf | 85304 | 84202 | 167546 MB/s | 112988 MB/s | 326905 MB/s | 291944 MB/s |
+| html_x_4 | 92234 | 20870 | 15194 MB/s | 54457 MB/s | 30843 MB/s | 32217 MB/s |
+| alice29.txt | 88034 | 85934 | 5936 MB/s | 6540 MB/s | 12882 MB/s | 20044 MB/s |
+| asyoulik.txt | 77503 | 79575 | 5517 MB/s | 6657 MB/s | 12735 MB/s | 22806 MB/s |
+| lcet10.txt | 234661 | 220383 | 6235 MB/s | 6303 MB/s | 14519 MB/s | 18697 MB/s |
+| plrabn12.txt | 319267 | 318196 | 5159 MB/s | 6074 MB/s | 11923 MB/s | 19901 MB/s |
+| geo.protodata | 23335 | 18606 | 21220 MB/s | 25432 MB/s | 56271 MB/s | 62540 MB/s |
+| kppkn.gtb | 69526 | 65019 | 9732 MB/s | 8905 MB/s | 18491 MB/s | 18969 MB/s |
+| alice29.txt (128B) | 80 | 82 | 6691 MB/s | 17179 MB/s | 31883 MB/s | 38874 MB/s |
+| alice29.txt (1000B) | 774 | 774 | 12204 MB/s | 13273 MB/s | 48056 MB/s | 52341 MB/s |
+| alice29.txt (10000B) | 6648 | 6933 | 10044 MB/s | 12824 MB/s | 32378 MB/s | 46322 MB/s |
+| alice29.txt (20000B) | 12686 | 13516 | 7733 MB/s | 12160 MB/s | 30566 MB/s | 58969 MB/s |
+
Speed is generally at or above Snappy. Small blocks gets a significant speedup, although at the expense of size.
@@ -543,42 +524,23 @@ So individual benchmarks should only be seen as a guideline and the overall pict
| Absolute Perf | Snappy size | Better Size | Snappy Speed | Better Speed | Snappy dec | Better dec |
|-----------------------|-------------|-------------|--------------|--------------|-------------|-------------|
-| html | 22843 | 19833 | 16246 MB/s | 7731 MB/s | 40972 MB/s | 40292 MB/s |
-| urls.10K | 335492 | 253529 | 7943 MB/s | 3980 MB/s | 22523 MB/s | 20981 MB/s |
-| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 9760 MB/s | 718321 MB/s | 823698 MB/s |
-| fireworks.jpeg (200B) | 146 | 142 | 8869 MB/s | 594 MB/s | 33691 MB/s | 30101 MB/s |
-| paper-100k.pdf | 85304 | 82915 | 167546 MB/s | 7470 MB/s | 326905 MB/s | 198869 MB/s |
-| html_x_4 | 92234 | 19841 | 15194 MB/s | 23403 MB/s | 30843 MB/s | 30937 MB/s |
-| alice29.txt | 88034 | 73218 | 5936 MB/s | 2945 MB/s | 12882 MB/s | 16611 MB/s |
-| asyoulik.txt | 77503 | 66844 | 5517 MB/s | 2739 MB/s | 12735 MB/s | 14975 MB/s |
-| lcet10.txt | 234661 | 190589 | 6235 MB/s | 3099 MB/s | 14519 MB/s | 16634 MB/s |
-| plrabn12.txt | 319267 | 270828 | 5159 MB/s | 2600 MB/s | 11923 MB/s | 13382 MB/s |
-| geo.protodata | 23335 | 18278 | 21220 MB/s | 11208 MB/s | 56271 MB/s | 57961 MB/s |
-| kppkn.gtb | 69526 | 61851 | 9732 MB/s | 4556 MB/s | 18491 MB/s | 16524 MB/s |
-| alice29.txt (128B) | 80 | 81 | 6691 MB/s | 529 MB/s | 31883 MB/s | 34225 MB/s |
-| alice29.txt (1000B) | 774 | 748 | 12204 MB/s | 1943 MB/s | 48056 MB/s | 42068 MB/s |
-| alice29.txt (10000B) | 6648 | 6234 | 10044 MB/s | 2949 MB/s | 32378 MB/s | 28813 MB/s |
-| alice29.txt (20000B) | 12686 | 11584 | 7733 MB/s | 2822 MB/s | 30566 MB/s | 27315 MB/s |
-
-
-| Relative Perf | Snappy size | Better size | Better Speed | Better dec |
-|-----------------------|-------------|-------------|--------------|------------|
-| html | 22.31% | 13.18% | 0.48x | 0.98x |
-| urls.10K | 47.78% | 24.43% | 0.50x | 0.93x |
-| fireworks.jpeg | 99.95% | -0.05% | 0.03x | 1.15x |
-| fireworks.jpeg (200B) | 73.00% | 2.74% | 0.07x | 0.89x |
-| paper-100k.pdf | 83.30% | 2.80% | 0.07x | 0.61x |
-| html_x_4 | 22.52% | 78.49% | 0.04x | 1.00x |
-| alice29.txt | 57.88% | 16.83% | 1.54x | 1.29x |
-| asyoulik.txt | 61.91% | 13.75% | 0.50x | 1.18x |
-| lcet10.txt | 54.99% | 18.78% | 0.50x | 1.15x |
-| plrabn12.txt | 66.26% | 15.17% | 0.50x | 1.12x |
-| geo.protodata | 19.68% | 21.67% | 0.50x | 1.03x |
-| kppkn.gtb | 37.72% | 11.04% | 0.53x | 0.89x |
-| alice29.txt (128B) | 62.50% | -1.25% | 0.47x | 1.07x |
-| alice29.txt (1000B) | 77.40% | 3.36% | 0.08x | 0.88x |
-| alice29.txt (10000B) | 66.48% | 6.23% | 0.16x | 0.89x |
-| alice29.txt (20000B) | 63.43% | 8.69% | 0.29x | 0.89x |
+| html | 22843 | 18972 | 16246 MB/s | 8621 MB/s | 40972 MB/s | 40292 MB/s |
+| urls.10K | 335492 | 248079 | 7943 MB/s | 5104 MB/s | 22523 MB/s | 20981 MB/s |
+| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 84429 MB/s | 718321 MB/s | 823698 MB/s |
+| fireworks.jpeg (200B) | 146 | 149 | 8869 MB/s | 7125 MB/s | 33691 MB/s | 30101 MB/s |
+| paper-100k.pdf | 85304 | 82887 | 167546 MB/s | 11087 MB/s | 326905 MB/s | 198869 MB/s |
+| html_x_4 | 92234 | 18982 | 15194 MB/s | 29316 MB/s | 30843 MB/s | 30937 MB/s |
+| alice29.txt | 88034 | 71611 | 5936 MB/s | 3709 MB/s | 12882 MB/s | 16611 MB/s |
+| asyoulik.txt | 77503 | 65941 | 5517 MB/s | 3380 MB/s | 12735 MB/s | 14975 MB/s |
+| lcet10.txt | 234661 | 184939 | 6235 MB/s | 3537 MB/s | 14519 MB/s | 16634 MB/s |
+| plrabn12.txt | 319267 | 264990 | 5159 MB/s | 2960 MB/s | 11923 MB/s | 13382 MB/s |
+| geo.protodata | 23335 | 17689 | 21220 MB/s | 10859 MB/s | 56271 MB/s | 57961 MB/s |
+| kppkn.gtb | 69526 | 55398 | 9732 MB/s | 5206 MB/s | 18491 MB/s | 16524 MB/s |
+| alice29.txt (128B) | 80 | 78 | 6691 MB/s | 7422 MB/s | 31883 MB/s | 34225 MB/s |
+| alice29.txt (1000B) | 774 | 746 | 12204 MB/s | 5734 MB/s | 48056 MB/s | 42068 MB/s |
+| alice29.txt (10000B) | 6648 | 6218 | 10044 MB/s | 6055 MB/s | 32378 MB/s | 28813 MB/s |
+| alice29.txt (20000B) | 12686 | 11492 | 7733 MB/s | 3143 MB/s | 30566 MB/s | 27315 MB/s |
+
Except for the mostly incompressible JPEG image compression is better and usually in the
double digits in terms of percentage reduction over Snappy.
@@ -605,29 +567,29 @@ Some examples compared on 16 core CPU, amd64 assembly used:
```
* enwik10
-Default... 10000000000 -> 4761467548 [47.61%]; 1.098s, 8685.6MB/s
-Better... 10000000000 -> 4219438251 [42.19%]; 1.925s, 4954.2MB/s
-Best... 10000000000 -> 3627364337 [36.27%]; 43.051s, 221.5MB/s
+Default... 10000000000 -> 4759950115 [47.60%]; 1.03s, 9263.0MB/s
+Better... 10000000000 -> 4084706676 [40.85%]; 2.16s, 4415.4MB/s
+Best... 10000000000 -> 3615520079 [36.16%]; 42.259s, 225.7MB/s
* github-june-2days-2019.json
-Default... 6273951764 -> 1043196283 [16.63%]; 431ms, 13882.3MB/s
-Better... 6273951764 -> 949146808 [15.13%]; 547ms, 10938.4MB/s
-Best... 6273951764 -> 832855506 [13.27%]; 9.455s, 632.8MB/s
+Default... 6273951764 -> 1041700255 [16.60%]; 431ms, 13882.3MB/s
+Better... 6273951764 -> 945841238 [15.08%]; 547ms, 10938.4MB/s
+Best... 6273951764 -> 826392576 [13.17%]; 9.455s, 632.8MB/s
* nyc-taxi-data-10M.csv
-Default... 3325605752 -> 1095998837 [32.96%]; 324ms, 9788.7MB/s
-Better... 3325605752 -> 954776589 [28.71%]; 491ms, 6459.4MB/s
-Best... 3325605752 -> 779098746 [23.43%]; 8.29s, 382.6MB/s
+Default... 3325605752 -> 1093516949 [32.88%]; 324ms, 9788.7MB/s
+Better... 3325605752 -> 885394158 [26.62%]; 491ms, 6459.4MB/s
+Best... 3325605752 -> 773681257 [23.26%]; 8.29s, 412.0MB/s
* 10gb.tar
-Default... 10065157632 -> 5916578242 [58.78%]; 1.028s, 9337.4MB/s
-Better... 10065157632 -> 5649207485 [56.13%]; 1.597s, 6010.6MB/s
-Best... 10065157632 -> 5208719802 [51.75%]; 32.78s, 292.8MB/
+Default... 10065157632 -> 5915541066 [58.77%]; 1.028s, 9337.4MB/s
+Better... 10065157632 -> 5453844650 [54.19%]; 1.597s, 4862.7MB/s
+Best... 10065157632 -> 5192495021 [51.59%]; 32.78s, 308.2MB/
* consensus.db.10gb
-Default... 10737418240 -> 4562648848 [42.49%]; 882ms, 11610.0MB/s
-Better... 10737418240 -> 4542428129 [42.30%]; 1.533s, 6679.7MB/s
-Best... 10737418240 -> 4244773384 [39.53%]; 42.96s, 238.4MB/s
+Default... 10737418240 -> 4549762344 [42.37%]; 882ms, 12118.4MB/s
+Better... 10737418240 -> 4438535064 [41.34%]; 1.533s, 3500.9MB/s
+Best... 10737418240 -> 4210602774 [39.21%]; 42.96s, 254.4MB/s
```
Decompression speed should be around the same as using the 'better' compression mode.
@@ -648,10 +610,10 @@ If you would like more control, you can use the s2 package as described below:
Snappy compatible blocks can be generated with the S2 encoder.
Compression and speed is typically a bit better `MaxEncodedLen` is also smaller for smaller memory usage. Replace
-| Snappy | S2 replacement |
-|----------------------------|-------------------------|
-| snappy.Encode(...) | s2.EncodeSnappy(...) |
-| snappy.MaxEncodedLen(...) | s2.MaxEncodedLen(...) |
+| Snappy | S2 replacement |
+|---------------------------|-----------------------|
+| snappy.Encode(...) | s2.EncodeSnappy(...) |
+| snappy.MaxEncodedLen(...) | s2.MaxEncodedLen(...) |
`s2.EncodeSnappy` can be replaced with `s2.EncodeSnappyBetter` or `s2.EncodeSnappyBest` to get more efficiently compressed snappy compatible output.
@@ -660,12 +622,12 @@ Compression and speed is typically a bit better `MaxEncodedLen` is also smaller
Comparison of [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z),
53927 files, total input size: 4,014,735,833 bytes. amd64, single goroutine used:
-| Encoder | Size | MB/s | Reduction |
-|-----------------------|------------|------------|------------
-| snappy.Encode | 1128706759 | 725.59 | 71.89% |
-| s2.EncodeSnappy | 1093823291 | **899.16** | 72.75% |
-| s2.EncodeSnappyBetter | 1001158548 | 578.49 | 75.06% |
-| s2.EncodeSnappyBest | 944507998 | 66.00 | **76.47%**|
+| Encoder | Size | MB/s | Reduction |
+|-----------------------|------------|------------|------------|
+| snappy.Encode | 1128706759 | 725.59 | 71.89% |
+| s2.EncodeSnappy | 1093823291 | **899.16** | 72.75% |
+| s2.EncodeSnappyBetter | 1001158548 | 578.49 | 75.06% |
+| s2.EncodeSnappyBest | 944507998 | 66.00 | **76.47%** |
## Streams
@@ -835,6 +797,13 @@ This is done using the regular "Skip" function:
This will ensure that we are at exactly the offset we want, and reading from `dec` will start at the requested offset.
+# Compact storage
+
+For compact storage [RemoveIndexHeaders](https://pkg.go.dev/github.com/klauspost/compress/s2#RemoveIndexHeaders) can be used to remove any redundant info from
+a serialized index. If you remove the header it must be restored before [Loading](https://pkg.go.dev/github.com/klauspost/compress/s2#Index.Load).
+
+This is expected to save 20 bytes. These can be restored using [RestoreIndexHeaders](https://pkg.go.dev/github.com/klauspost/compress/s2#RestoreIndexHeaders). This removes a layer of security, but is the most compact representation. Returns nil if headers contains errors.
+
## Index Format:
Each block is structured as a snappy skippable block, with the chunk ID 0x99.
@@ -844,20 +813,20 @@ The block can be read from the front, but contains information so it can be read
Numbers are stored as fixed size little endian values or [zigzag encoded](https://developers.google.com/protocol-buffers/docs/encoding#signed_integers) [base 128 varints](https://developers.google.com/protocol-buffers/docs/encoding),
with un-encoded value length of 64 bits, unless other limits are specified.
-| Content | Format |
-|---------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------|
-| ID, `[1]byte` | Always 0x99. |
-| Data Length, `[3]byte` | 3 byte little-endian length of the chunk in bytes, following this. |
-| Header `[6]byte` | Header, must be `[115, 50, 105, 100, 120, 0]` or in text: "s2idx\x00". |
-| UncompressedSize, Varint | Total Uncompressed size. |
-| CompressedSize, Varint | Total Compressed size if known. Should be -1 if unknown. |
-| EstBlockSize, Varint | Block Size, used for guessing uncompressed offsets. Must be >= 0. |
-| Entries, Varint | Number of Entries in index, must be < 65536 and >=0. |
-| HasUncompressedOffsets `byte` | 0 if no uncompressed offsets are present, 1 if present. Other values are invalid. |
-| UncompressedOffsets, [Entries]VarInt | Uncompressed offsets. See below how to decode. |
-| CompressedOffsets, [Entries]VarInt | Compressed offsets. See below how to decode. |
-| Block Size, `[4]byte` | Little Endian total encoded size (including header and trailer). Can be used for searching backwards to start of block. |
-| Trailer `[6]byte` | Trailer, must be `[0, 120, 100, 105, 50, 115]` or in text: "\x00xdi2s". Can be used for identifying block from end of stream. |
+| Content | Format |
+|--------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|
+| ID, `[1]byte` | Always 0x99. |
+| Data Length, `[3]byte` | 3 byte little-endian length of the chunk in bytes, following this. |
+| Header `[6]byte` | Header, must be `[115, 50, 105, 100, 120, 0]` or in text: "s2idx\x00". |
+| UncompressedSize, Varint | Total Uncompressed size. |
+| CompressedSize, Varint | Total Compressed size if known. Should be -1 if unknown. |
+| EstBlockSize, Varint | Block Size, used for guessing uncompressed offsets. Must be >= 0. |
+| Entries, Varint | Number of Entries in index, must be < 65536 and >=0. |
+| HasUncompressedOffsets `byte` | 0 if no uncompressed offsets are present, 1 if present. Other values are invalid. |
+| UncompressedOffsets, [Entries]VarInt | Uncompressed offsets. See below how to decode. |
+| CompressedOffsets, [Entries]VarInt | Compressed offsets. See below how to decode. |
+| Block Size, `[4]byte` | Little Endian total encoded size (including header and trailer). Can be used for searching backwards to start of block. |
+| Trailer `[6]byte` | Trailer, must be `[0, 120, 100, 105, 50, 115]` or in text: "\x00xdi2s". Can be used for identifying block from end of stream. |
For regular streams the uncompressed offsets are fully predictable,
so `HasUncompressedOffsets` allows to specify that compressed blocks all have
@@ -929,6 +898,7 @@ To decode from any given uncompressed offset `(wantOffset)`:
See [using indexes](https://github.com/klauspost/compress/tree/master/s2#using-indexes) for functions that perform the operations with a simpler interface.
+
# Format Extensions
* Frame [Stream identifier](https://github.com/google/snappy/blob/master/framing_format.txt#L68) changed from `sNaPpY` to `S2sTwO`.
@@ -951,10 +921,11 @@ The length is specified by reading the 3-bit length specified in the tag and dec
| 7 | 65540 + read 3 bytes |
This allows any repeat offset + length to be represented by 2 to 5 bytes.
+It also allows to emit matches longer than 64 bytes with one copy + one repeat instead of several 64 byte copies.
Lengths are stored as little endian values.
-The first copy of a block cannot be a repeat offset and the offset is not carried across blocks in streams.
+The first copy of a block cannot be a repeat offset and the offset is reset on every block in streams.
Default streaming block size is 1MB.
diff --git a/vendor/github.com/klauspost/compress/s2/decode.go b/vendor/github.com/klauspost/compress/s2/decode.go
index 27c0f3c2c..00c5cc72c 100644
--- a/vendor/github.com/klauspost/compress/s2/decode.go
+++ b/vendor/github.com/klauspost/compress/s2/decode.go
@@ -952,7 +952,11 @@ func (r *Reader) ReadSeeker(random bool, index []byte) (*ReadSeeker, error) {
// Seek allows seeking in compressed data.
func (r *ReadSeeker) Seek(offset int64, whence int) (int64, error) {
if r.err != nil {
- return 0, r.err
+ if !errors.Is(r.err, io.EOF) {
+ return 0, r.err
+ }
+ // Reset on EOF
+ r.err = nil
}
if offset == 0 && whence == io.SeekCurrent {
return r.blockStart + int64(r.i), nil
diff --git a/vendor/github.com/klauspost/compress/s2/decode_other.go b/vendor/github.com/klauspost/compress/s2/decode_other.go
index 1074ebd21..11300c3a8 100644
--- a/vendor/github.com/klauspost/compress/s2/decode_other.go
+++ b/vendor/github.com/klauspost/compress/s2/decode_other.go
@@ -28,6 +28,9 @@ func s2Decode(dst, src []byte) int {
// As long as we can read at least 5 bytes...
for s < len(src)-5 {
+ // Removing bounds checks is SLOWER, when if doing
+ // in := src[s:s+5]
+ // Checked on Go 1.18
switch src[s] & 0x03 {
case tagLiteral:
x := uint32(src[s] >> 2)
@@ -38,14 +41,19 @@ func s2Decode(dst, src []byte) int {
s += 2
x = uint32(src[s-1])
case x == 61:
+ in := src[s : s+3]
+ x = uint32(in[1]) | uint32(in[2])<<8
s += 3
- x = uint32(src[s-2]) | uint32(src[s-1])<<8
case x == 62:
+ in := src[s : s+4]
+ // Load as 32 bit and shift down.
+ x = uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24
+ x >>= 8
s += 4
- x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
case x == 63:
+ in := src[s : s+5]
+ x = uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24
s += 5
- x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
}
length = int(x) + 1
if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) {
@@ -62,8 +70,8 @@ func s2Decode(dst, src []byte) int {
case tagCopy1:
s += 2
- length = int(src[s-2]) >> 2 & 0x7
toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
+ length = int(src[s-2]) >> 2 & 0x7
if toffset == 0 {
if debug {
fmt.Print("(repeat) ")
@@ -71,14 +79,16 @@ func s2Decode(dst, src []byte) int {
// keep last offset
switch length {
case 5:
+ length = int(src[s]) + 4
s += 1
- length = int(uint32(src[s-1])) + 4
case 6:
+ in := src[s : s+2]
+ length = int(uint32(in[0])|(uint32(in[1])<<8)) + (1 << 8)
s += 2
- length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8)
case 7:
+ in := src[s : s+3]
+ length = int((uint32(in[2])<<16)|(uint32(in[1])<<8)|uint32(in[0])) + (1 << 16)
s += 3
- length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16)
default: // 0-> 4
}
} else {
@@ -86,14 +96,16 @@ func s2Decode(dst, src []byte) int {
}
length += 4
case tagCopy2:
+ in := src[s : s+3]
+ offset = int(uint32(in[1]) | uint32(in[2])<<8)
+ length = 1 + int(in[0])>>2
s += 3
- length = 1 + int(src[s-3])>>2
- offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
case tagCopy4:
+ in := src[s : s+5]
+ offset = int(uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24)
+ length = 1 + int(in[0])>>2
s += 5
- length = 1 + int(src[s-5])>>2
- offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
}
if offset <= 0 || d < offset || length > len(dst)-d {
diff --git a/vendor/github.com/klauspost/compress/s2/encode_all.go b/vendor/github.com/klauspost/compress/s2/encode_all.go
index 8b16c38a6..54c71d3b5 100644
--- a/vendor/github.com/klauspost/compress/s2/encode_all.go
+++ b/vendor/github.com/klauspost/compress/s2/encode_all.go
@@ -58,8 +58,9 @@ func encodeGo(dst, src []byte) []byte {
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src)) &&
-// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockGo(dst, src []byte) (d int) {
// Initialize the hash table.
const (
diff --git a/vendor/github.com/klauspost/compress/s2/encode_amd64.go b/vendor/github.com/klauspost/compress/s2/encode_amd64.go
index e612225f4..6b93daa5a 100644
--- a/vendor/github.com/klauspost/compress/s2/encode_amd64.go
+++ b/vendor/github.com/klauspost/compress/s2/encode_amd64.go
@@ -8,8 +8,9 @@ package s2
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src)) &&
-// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlock(dst, src []byte) (d int) {
const (
// Use 12 bit table when less than...
@@ -43,8 +44,9 @@ func encodeBlock(dst, src []byte) (d int) {
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src)) &&
-// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBetter(dst, src []byte) (d int) {
const (
// Use 12 bit table when less than...
@@ -78,8 +80,9 @@ func encodeBlockBetter(dst, src []byte) (d int) {
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src)) &&
-// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockSnappy(dst, src []byte) (d int) {
const (
// Use 12 bit table when less than...
@@ -112,8 +115,9 @@ func encodeBlockSnappy(dst, src []byte) (d int) {
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src)) &&
-// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBetterSnappy(dst, src []byte) (d int) {
const (
// Use 12 bit table when less than...
diff --git a/vendor/github.com/klauspost/compress/s2/encode_best.go b/vendor/github.com/klauspost/compress/s2/encode_best.go
index 4bc80bc6a..1b7ea394f 100644
--- a/vendor/github.com/klauspost/compress/s2/encode_best.go
+++ b/vendor/github.com/klauspost/compress/s2/encode_best.go
@@ -15,8 +15,9 @@ import (
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src)) &&
-// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBest(dst, src []byte) (d int) {
// Initialize the hash tables.
const (
@@ -176,14 +177,21 @@ func encodeBlockBest(dst, src []byte) (d int) {
best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false))
}
// Search for a match at best match end, see if that is better.
- if sAt := best.s + best.length; sAt < sLimit {
- sBack := best.s
- backL := best.length
+ // Allow some bytes at the beginning to mismatch.
+ // Sweet spot is around 1-2 bytes, but depends on input.
+ // The skipped bytes are tested in Extend backwards,
+ // and still picked up as part of the match if they do.
+ const skipBeginning = 2
+ const skipEnd = 1
+ if sAt := best.s + best.length - skipEnd; sAt < sLimit {
+
+ sBack := best.s + skipBeginning - skipEnd
+ backL := best.length - skipBeginning
// Load initial values
cv = load64(src, sBack)
- // Search for mismatch
+
+ // Grab candidates...
next := lTable[hash8(load64(src, sAt), lTableBits)]
- //next := sTable[hash4(load64(src, sAt), sTableBits)]
if checkAt := getCur(next) - backL; checkAt > 0 {
best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
@@ -191,6 +199,16 @@ func encodeBlockBest(dst, src []byte) (d int) {
if checkAt := getPrev(next) - backL; checkAt > 0 {
best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
}
+ // Disabled: Extremely small gain
+ if false {
+ next = sTable[hash4(load64(src, sAt), sTableBits)]
+ if checkAt := getCur(next) - backL; checkAt > 0 {
+ best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
+ }
+ if checkAt := getPrev(next) - backL; checkAt > 0 {
+ best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
+ }
+ }
}
}
}
@@ -288,8 +306,9 @@ emitRemainder:
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src)) &&
-// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBestSnappy(dst, src []byte) (d int) {
// Initialize the hash tables.
const (
@@ -546,6 +565,7 @@ emitRemainder:
// emitCopySize returns the size to encode the offset+length
//
// It assumes that:
+//
// 1 <= offset && offset <= math.MaxUint32
// 4 <= length && length <= 1 << 24
func emitCopySize(offset, length int) int {
@@ -584,6 +604,7 @@ func emitCopySize(offset, length int) int {
// emitCopyNoRepeatSize returns the size to encode the offset+length
//
// It assumes that:
+//
// 1 <= offset && offset <= math.MaxUint32
// 4 <= length && length <= 1 << 24
func emitCopyNoRepeatSize(offset, length int) int {
diff --git a/vendor/github.com/klauspost/compress/s2/encode_better.go b/vendor/github.com/klauspost/compress/s2/encode_better.go
index 943215b8a..3b66ba42b 100644
--- a/vendor/github.com/klauspost/compress/s2/encode_better.go
+++ b/vendor/github.com/klauspost/compress/s2/encode_better.go
@@ -42,8 +42,9 @@ func hash8(u uint64, h uint8) uint32 {
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src)) &&
-// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBetterGo(dst, src []byte) (d int) {
// sLimit is when to stop looking for offset/length copies. The inputMargin
// lets us use a fast path for emitLiteral in the main loop, while we are
@@ -56,7 +57,7 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
// Initialize the hash tables.
const (
// Long hash matches.
- lTableBits = 16
+ lTableBits = 17
maxLTableSize = 1 << lTableBits
// Short hash matches.
@@ -97,9 +98,26 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
lTable[hashL] = uint32(s)
sTable[hashS] = uint32(s)
+ valLong := load64(src, candidateL)
+ valShort := load64(src, candidateS)
+
+ // If long matches at least 8 bytes, use that.
+ if cv == valLong {
+ break
+ }
+ if cv == valShort {
+ candidateL = candidateS
+ break
+ }
+
// Check repeat at offset checkRep.
const checkRep = 1
- if false && uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
+ // Minimum length of a repeat. Tested with various values.
+ // While 4-5 offers improvements in some, 6 reduces
+ // regressions significantly.
+ const wantRepeatBytes = 6
+ const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
+ if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask {
base := s + checkRep
// Extend back
for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
@@ -109,8 +127,8 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
d += emitLiteral(dst[d:], src[nextEmit:base])
// Extend forward
- candidate := s - repeat + 4 + checkRep
- s += 4 + checkRep
+ candidate := s - repeat + wantRepeatBytes + checkRep
+ s += wantRepeatBytes + checkRep
for s < len(src) {
if len(src)-s < 8 {
if src[s] == src[candidate] {
@@ -127,28 +145,40 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
s += 8
candidate += 8
}
- if nextEmit > 0 {
- // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
- d += emitRepeat(dst[d:], repeat, s-base)
- } else {
- // First match, cannot be repeat.
- d += emitCopy(dst[d:], repeat, s-base)
- }
+ // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
+ d += emitRepeat(dst[d:], repeat, s-base)
nextEmit = s
if s >= sLimit {
goto emitRemainder
}
+ // Index in-between
+ index0 := base + 1
+ index1 := s - 2
+
+ cv = load64(src, s)
+ for index0 < index1 {
+ cv0 := load64(src, index0)
+ cv1 := load64(src, index1)
+ lTable[hash7(cv0, lTableBits)] = uint32(index0)
+ sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
+
+ lTable[hash7(cv1, lTableBits)] = uint32(index1)
+ sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
+ index0 += 2
+ index1 -= 2
+ }
cv = load64(src, s)
continue
}
- if uint32(cv) == load32(src, candidateL) {
+ // Long likely matches 7, so take that.
+ if uint32(cv) == uint32(valLong) {
break
}
// Check our short candidate
- if uint32(cv) == load32(src, candidateS) {
+ if uint32(cv) == uint32(valShort) {
// Try a long candidate at s+1
hashL = hash7(cv>>8, lTableBits)
candidateL = int(lTable[hashL])
@@ -227,21 +257,29 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
// Do we have space for more, if not bail.
return 0
}
- // Index match start+1 (long) and start+2 (short)
+
+ // Index short & long
index0 := base + 1
- // Index match end-2 (long) and end-1 (short)
index1 := s - 2
cv0 := load64(src, index0)
cv1 := load64(src, index1)
- cv = load64(src, s)
lTable[hash7(cv0, lTableBits)] = uint32(index0)
- lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1)
- lTable[hash7(cv1, lTableBits)] = uint32(index1)
- lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1)
sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
- sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2)
+
+ lTable[hash7(cv1, lTableBits)] = uint32(index1)
sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
+ index0 += 1
+ index1 -= 1
+ cv = load64(src, s)
+
+ // index every second long in between.
+ for index0 < index1 {
+ lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
+ lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
+ index0 += 2
+ index1 -= 2
+ }
}
emitRemainder:
@@ -260,8 +298,9 @@ emitRemainder:
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src)) &&
-// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockBetterSnappyGo(dst, src []byte) (d int) {
// sLimit is when to stop looking for offset/length copies. The inputMargin
// lets us use a fast path for emitLiteral in the main loop, while we are
@@ -402,21 +441,29 @@ func encodeBlockBetterSnappyGo(dst, src []byte) (d int) {
// Do we have space for more, if not bail.
return 0
}
- // Index match start+1 (long) and start+2 (short)
+
+ // Index short & long
index0 := base + 1
- // Index match end-2 (long) and end-1 (short)
index1 := s - 2
cv0 := load64(src, index0)
cv1 := load64(src, index1)
- cv = load64(src, s)
lTable[hash7(cv0, lTableBits)] = uint32(index0)
- lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1)
- lTable[hash7(cv1, lTableBits)] = uint32(index1)
- lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1)
sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
- sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2)
+
+ lTable[hash7(cv1, lTableBits)] = uint32(index1)
sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
+ index0 += 1
+ index1 -= 1
+ cv = load64(src, s)
+
+ // index every second long in between.
+ for index0 < index1 {
+ lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
+ lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
+ index0 += 2
+ index1 -= 2
+ }
}
emitRemainder:
diff --git a/vendor/github.com/klauspost/compress/s2/encode_go.go b/vendor/github.com/klauspost/compress/s2/encode_go.go
index 94784b82a..db08fc355 100644
--- a/vendor/github.com/klauspost/compress/s2/encode_go.go
+++ b/vendor/github.com/klauspost/compress/s2/encode_go.go
@@ -12,6 +12,7 @@ import (
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src))
func encodeBlock(dst, src []byte) (d int) {
if len(src) < minNonLiteralBlockSize {
@@ -25,6 +26,7 @@ func encodeBlock(dst, src []byte) (d int) {
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src))
func encodeBlockBetter(dst, src []byte) (d int) {
return encodeBlockBetterGo(dst, src)
@@ -35,6 +37,7 @@ func encodeBlockBetter(dst, src []byte) (d int) {
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src))
func encodeBlockBetterSnappy(dst, src []byte) (d int) {
return encodeBlockBetterSnappyGo(dst, src)
@@ -45,6 +48,7 @@ func encodeBlockBetterSnappy(dst, src []byte) (d int) {
// been written.
//
// It also assumes that:
+//
// len(dst) >= MaxEncodedLen(len(src))
func encodeBlockSnappy(dst, src []byte) (d int) {
if len(src) < minNonLiteralBlockSize {
@@ -56,6 +60,7 @@ func encodeBlockSnappy(dst, src []byte) (d int) {
// emitLiteral writes a literal chunk and returns the number of bytes written.
//
// It assumes that:
+//
// dst is long enough to hold the encoded bytes
// 0 <= len(lit) && len(lit) <= math.MaxUint32
func emitLiteral(dst, lit []byte) int {
@@ -146,6 +151,7 @@ func emitRepeat(dst []byte, offset, length int) int {
// emitCopy writes a copy chunk and returns the number of bytes written.
//
// It assumes that:
+//
// dst is long enough to hold the encoded bytes
// 1 <= offset && offset <= math.MaxUint32
// 4 <= length && length <= 1 << 24
@@ -214,6 +220,7 @@ func emitCopy(dst []byte, offset, length int) int {
// emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.
//
// It assumes that:
+//
// dst is long enough to hold the encoded bytes
// 1 <= offset && offset <= math.MaxUint32
// 4 <= length && length <= 1 << 24
@@ -273,8 +280,8 @@ func emitCopyNoRepeat(dst []byte, offset, length int) int {
// matchLen returns how many bytes match in a and b
//
// It assumes that:
-// len(a) <= len(b)
//
+// len(a) <= len(b)
func matchLen(a []byte, b []byte) int {
b = b[:len(a)]
var checked int
diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go
index 88f27c099..7e00bac3e 100644
--- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go
+++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go
@@ -1,7 +1,6 @@
// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.
//go:build !appengine && !noasm && gc && !noasm
-// +build !appengine,!noasm,gc,!noasm
package s2
@@ -150,8 +149,9 @@ func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int
// emitLiteral writes a literal chunk and returns the number of bytes written.
//
// It assumes that:
-// dst is long enough to hold the encoded bytes with margin of 0 bytes
-// 0 <= len(lit) && len(lit) <= math.MaxUint32
+//
+// dst is long enough to hold the encoded bytes with margin of 0 bytes
+// 0 <= len(lit) && len(lit) <= math.MaxUint32
//
//go:noescape
func emitLiteral(dst []byte, lit []byte) int
@@ -165,9 +165,10 @@ func emitRepeat(dst []byte, offset int, length int) int
// emitCopy writes a copy chunk and returns the number of bytes written.
//
// It assumes that:
-// dst is long enough to hold the encoded bytes
-// 1 <= offset && offset <= math.MaxUint32
-// 4 <= length && length <= 1 << 24
+//
+// dst is long enough to hold the encoded bytes
+// 1 <= offset && offset <= math.MaxUint32
+// 4 <= length && length <= 1 << 24
//
//go:noescape
func emitCopy(dst []byte, offset int, length int) int
@@ -175,9 +176,10 @@ func emitCopy(dst []byte, offset int, length int) int
// emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.
//
// It assumes that:
-// dst is long enough to hold the encoded bytes
-// 1 <= offset && offset <= math.MaxUint32
-// 4 <= length && length <= 1 << 24
+//
+// dst is long enough to hold the encoded bytes
+// 1 <= offset && offset <= math.MaxUint32
+// 4 <= length && length <= 1 << 24
//
//go:noescape
func emitCopyNoRepeat(dst []byte, offset int, length int) int
@@ -185,7 +187,8 @@ func emitCopyNoRepeat(dst []byte, offset int, length int) int
// matchLen returns how many bytes match in a and b
//
// It assumes that:
-// len(a) <= len(b)
+//
+// len(a) <= len(b)
//
//go:noescape
func matchLen(a []byte, b []byte) int
diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
index 36915d949..81a487d6d 100644
--- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
+++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
@@ -1,7 +1,6 @@
// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.
//go:build !appengine && !noasm && gc && !noasm
-// +build !appengine,!noasm,gc,!noasm
#include "textflag.h"
@@ -5743,9 +5742,9 @@ emit_literal_done_emit_remainder_encodeBlockAsm8B:
// func encodeBetterBlockAsm(dst []byte, src []byte) int
// Requires: BMI, SSE2
-TEXT ·encodeBetterBlockAsm(SB), $327704-56
+TEXT ·encodeBetterBlockAsm(SB), $589848-56
MOVQ dst_base+0(FP), AX
- MOVQ $0x00000a00, CX
+ MOVQ $0x00001200, CX
LEAQ 24(SP), DX
PXOR X0, X0
@@ -5797,27 +5796,37 @@ check_maxskip_cont_encodeBetterBlockAsm:
MOVQ DI, R11
SHLQ $0x08, R10
IMULQ R9, R10
- SHRQ $0x30, R10
+ SHRQ $0x2f, R10
SHLQ $0x20, R11
IMULQ SI, R11
SHRQ $0x32, R11
MOVL 24(SP)(R10*4), SI
- MOVL 262168(SP)(R11*4), R8
+ MOVL 524312(SP)(R11*4), R8
MOVL CX, 24(SP)(R10*4)
- MOVL CX, 262168(SP)(R11*4)
- CMPL (DX)(SI*1), DI
+ MOVL CX, 524312(SP)(R11*4)
+ MOVQ (DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ CMPQ R10, DI
JEQ candidate_match_encodeBetterBlockAsm
- CMPL (DX)(R8*1), DI
- JEQ candidateS_match_encodeBetterBlockAsm
- MOVL 20(SP), CX
- JMP search_loop_encodeBetterBlockAsm
+ CMPQ R11, DI
+ JNE no_short_found_encodeBetterBlockAsm
+ MOVL R8, SI
+ JMP candidate_match_encodeBetterBlockAsm
+
+no_short_found_encodeBetterBlockAsm:
+ CMPL R10, DI
+ JEQ candidate_match_encodeBetterBlockAsm
+ CMPL R11, DI
+ JEQ candidateS_match_encodeBetterBlockAsm
+ MOVL 20(SP), CX
+ JMP search_loop_encodeBetterBlockAsm
candidateS_match_encodeBetterBlockAsm:
SHRQ $0x08, DI
MOVQ DI, R10
SHLQ $0x08, R10
IMULQ R9, R10
- SHRQ $0x30, R10
+ SHRQ $0x2f, R10
MOVL 24(SP)(R10*4), SI
INCL CX
MOVL CX, 24(SP)(R10*4)
@@ -6590,52 +6599,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm:
match_nolit_dst_ok_encodeBetterBlockAsm:
MOVQ $0x00cf1bbcdcbfa563, SI
MOVQ $0x9e3779b1, R8
- INCL DI
- MOVQ (DX)(DI*1), R9
- MOVQ R9, R10
- MOVQ R9, R11
- MOVQ R9, R12
- SHRQ $0x08, R11
- MOVQ R11, R13
- SHRQ $0x10, R12
- LEAL 1(DI), R14
- LEAL 2(DI), R15
- MOVQ -2(DX)(CX*1), R9
+ LEAQ 1(DI), DI
+ LEAQ -2(CX), R9
+ MOVQ (DX)(DI*1), R10
+ MOVQ 1(DX)(DI*1), R11
+ MOVQ (DX)(R9*1), R12
+ MOVQ 1(DX)(R9*1), R13
SHLQ $0x08, R10
IMULQ SI, R10
- SHRQ $0x30, R10
- SHLQ $0x08, R13
- IMULQ SI, R13
- SHRQ $0x30, R13
+ SHRQ $0x2f, R10
SHLQ $0x20, R11
IMULQ R8, R11
SHRQ $0x32, R11
- SHLQ $0x20, R12
- IMULQ R8, R12
- SHRQ $0x32, R12
+ SHLQ $0x08, R12
+ IMULQ SI, R12
+ SHRQ $0x2f, R12
+ SHLQ $0x20, R13
+ IMULQ R8, R13
+ SHRQ $0x32, R13
+ LEAQ 1(DI), R8
+ LEAQ 1(R9), R14
MOVL DI, 24(SP)(R10*4)
- MOVL R14, 24(SP)(R13*4)
- MOVL R14, 262168(SP)(R11*4)
- MOVL R15, 262168(SP)(R12*4)
- MOVQ R9, R10
- MOVQ R9, R11
- SHRQ $0x08, R11
- MOVQ R11, R13
- LEAL -2(CX), R9
- LEAL -1(CX), DI
+ MOVL R9, 24(SP)(R12*4)
+ MOVL R8, 524312(SP)(R11*4)
+ MOVL R14, 524312(SP)(R13*4)
+ ADDQ $0x01, DI
+ SUBQ $0x01, R9
+
+index_loop_encodeBetterBlockAsm:
+ CMPQ DI, R9
+ JAE search_loop_encodeBetterBlockAsm
+ MOVQ (DX)(DI*1), R8
+ MOVQ (DX)(R9*1), R10
+ SHLQ $0x08, R8
+ IMULQ SI, R8
+ SHRQ $0x2f, R8
SHLQ $0x08, R10
IMULQ SI, R10
- SHRQ $0x30, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x32, R11
- SHLQ $0x08, R13
- IMULQ SI, R13
- SHRQ $0x30, R13
+ SHRQ $0x2f, R10
+ MOVL DI, 24(SP)(R8*4)
MOVL R9, 24(SP)(R10*4)
- MOVL DI, 262168(SP)(R11*4)
- MOVL DI, 24(SP)(R13*4)
- JMP search_loop_encodeBetterBlockAsm
+ ADDQ $0x02, DI
+ SUBQ $0x02, R9
+ JMP index_loop_encodeBetterBlockAsm
emit_remainder_encodeBetterBlockAsm:
MOVQ src_len+32(FP), CX
@@ -6815,9 +6821,9 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm:
// func encodeBetterBlockAsm4MB(dst []byte, src []byte) int
// Requires: BMI, SSE2
-TEXT ·encodeBetterBlockAsm4MB(SB), $327704-56
+TEXT ·encodeBetterBlockAsm4MB(SB), $589848-56
MOVQ dst_base+0(FP), AX
- MOVQ $0x00000a00, CX
+ MOVQ $0x00001200, CX
LEAQ 24(SP), DX
PXOR X0, X0
@@ -6869,27 +6875,37 @@ check_maxskip_cont_encodeBetterBlockAsm4MB:
MOVQ DI, R11
SHLQ $0x08, R10
IMULQ R9, R10
- SHRQ $0x30, R10
+ SHRQ $0x2f, R10
SHLQ $0x20, R11
IMULQ SI, R11
SHRQ $0x32, R11
MOVL 24(SP)(R10*4), SI
- MOVL 262168(SP)(R11*4), R8
+ MOVL 524312(SP)(R11*4), R8
MOVL CX, 24(SP)(R10*4)
- MOVL CX, 262168(SP)(R11*4)
- CMPL (DX)(SI*1), DI
+ MOVL CX, 524312(SP)(R11*4)
+ MOVQ (DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ CMPQ R10, DI
JEQ candidate_match_encodeBetterBlockAsm4MB
- CMPL (DX)(R8*1), DI
- JEQ candidateS_match_encodeBetterBlockAsm4MB
- MOVL 20(SP), CX
- JMP search_loop_encodeBetterBlockAsm4MB
+ CMPQ R11, DI
+ JNE no_short_found_encodeBetterBlockAsm4MB
+ MOVL R8, SI
+ JMP candidate_match_encodeBetterBlockAsm4MB
+
+no_short_found_encodeBetterBlockAsm4MB:
+ CMPL R10, DI
+ JEQ candidate_match_encodeBetterBlockAsm4MB
+ CMPL R11, DI
+ JEQ candidateS_match_encodeBetterBlockAsm4MB
+ MOVL 20(SP), CX
+ JMP search_loop_encodeBetterBlockAsm4MB
candidateS_match_encodeBetterBlockAsm4MB:
SHRQ $0x08, DI
MOVQ DI, R10
SHLQ $0x08, R10
IMULQ R9, R10
- SHRQ $0x30, R10
+ SHRQ $0x2f, R10
MOVL 24(SP)(R10*4), SI
INCL CX
MOVL CX, 24(SP)(R10*4)
@@ -7600,52 +7616,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB:
match_nolit_dst_ok_encodeBetterBlockAsm4MB:
MOVQ $0x00cf1bbcdcbfa563, SI
MOVQ $0x9e3779b1, R8
- INCL DI
- MOVQ (DX)(DI*1), R9
- MOVQ R9, R10
- MOVQ R9, R11
- MOVQ R9, R12
- SHRQ $0x08, R11
- MOVQ R11, R13
- SHRQ $0x10, R12
- LEAL 1(DI), R14
- LEAL 2(DI), R15
- MOVQ -2(DX)(CX*1), R9
+ LEAQ 1(DI), DI
+ LEAQ -2(CX), R9
+ MOVQ (DX)(DI*1), R10
+ MOVQ 1(DX)(DI*1), R11
+ MOVQ (DX)(R9*1), R12
+ MOVQ 1(DX)(R9*1), R13
SHLQ $0x08, R10
IMULQ SI, R10
- SHRQ $0x30, R10
- SHLQ $0x08, R13
- IMULQ SI, R13
- SHRQ $0x30, R13
+ SHRQ $0x2f, R10
SHLQ $0x20, R11
IMULQ R8, R11
SHRQ $0x32, R11
- SHLQ $0x20, R12
- IMULQ R8, R12
- SHRQ $0x32, R12
+ SHLQ $0x08, R12
+ IMULQ SI, R12
+ SHRQ $0x2f, R12
+ SHLQ $0x20, R13
+ IMULQ R8, R13
+ SHRQ $0x32, R13
+ LEAQ 1(DI), R8
+ LEAQ 1(R9), R14
MOVL DI, 24(SP)(R10*4)
- MOVL R14, 24(SP)(R13*4)
- MOVL R14, 262168(SP)(R11*4)
- MOVL R15, 262168(SP)(R12*4)
- MOVQ R9, R10
- MOVQ R9, R11
- SHRQ $0x08, R11
- MOVQ R11, R13
- LEAL -2(CX), R9
- LEAL -1(CX), DI
+ MOVL R9, 24(SP)(R12*4)
+ MOVL R8, 524312(SP)(R11*4)
+ MOVL R14, 524312(SP)(R13*4)
+ ADDQ $0x01, DI
+ SUBQ $0x01, R9
+
+index_loop_encodeBetterBlockAsm4MB:
+ CMPQ DI, R9
+ JAE search_loop_encodeBetterBlockAsm4MB
+ MOVQ (DX)(DI*1), R8
+ MOVQ (DX)(R9*1), R10
+ SHLQ $0x08, R8
+ IMULQ SI, R8
+ SHRQ $0x2f, R8
SHLQ $0x08, R10
IMULQ SI, R10
- SHRQ $0x30, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x32, R11
- SHLQ $0x08, R13
- IMULQ SI, R13
- SHRQ $0x30, R13
+ SHRQ $0x2f, R10
+ MOVL DI, 24(SP)(R8*4)
MOVL R9, 24(SP)(R10*4)
- MOVL DI, 262168(SP)(R11*4)
- MOVL DI, 24(SP)(R13*4)
- JMP search_loop_encodeBetterBlockAsm4MB
+ ADDQ $0x02, DI
+ SUBQ $0x02, R9
+ JMP index_loop_encodeBetterBlockAsm4MB
emit_remainder_encodeBetterBlockAsm4MB:
MOVQ src_len+32(FP), CX
@@ -7871,12 +7884,22 @@ search_loop_encodeBetterBlockAsm12B:
MOVL 65560(SP)(R11*4), R8
MOVL CX, 24(SP)(R10*4)
MOVL CX, 65560(SP)(R11*4)
- CMPL (DX)(SI*1), DI
+ MOVQ (DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ CMPQ R10, DI
JEQ candidate_match_encodeBetterBlockAsm12B
- CMPL (DX)(R8*1), DI
- JEQ candidateS_match_encodeBetterBlockAsm12B
- MOVL 20(SP), CX
- JMP search_loop_encodeBetterBlockAsm12B
+ CMPQ R11, DI
+ JNE no_short_found_encodeBetterBlockAsm12B
+ MOVL R8, SI
+ JMP candidate_match_encodeBetterBlockAsm12B
+
+no_short_found_encodeBetterBlockAsm12B:
+ CMPL R10, DI
+ JEQ candidate_match_encodeBetterBlockAsm12B
+ CMPL R11, DI
+ JEQ candidateS_match_encodeBetterBlockAsm12B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeBetterBlockAsm12B
candidateS_match_encodeBetterBlockAsm12B:
SHRQ $0x08, DI
@@ -8447,52 +8470,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm12B:
match_nolit_dst_ok_encodeBetterBlockAsm12B:
MOVQ $0x0000cf1bbcdcbf9b, SI
MOVQ $0x9e3779b1, R8
- INCL DI
- MOVQ (DX)(DI*1), R9
- MOVQ R9, R10
- MOVQ R9, R11
- MOVQ R9, R12
- SHRQ $0x08, R11
- MOVQ R11, R13
- SHRQ $0x10, R12
- LEAL 1(DI), R14
- LEAL 2(DI), R15
- MOVQ -2(DX)(CX*1), R9
+ LEAQ 1(DI), DI
+ LEAQ -2(CX), R9
+ MOVQ (DX)(DI*1), R10
+ MOVQ 1(DX)(DI*1), R11
+ MOVQ (DX)(R9*1), R12
+ MOVQ 1(DX)(R9*1), R13
SHLQ $0x10, R10
IMULQ SI, R10
SHRQ $0x32, R10
- SHLQ $0x10, R13
- IMULQ SI, R13
- SHRQ $0x32, R13
SHLQ $0x20, R11
IMULQ R8, R11
SHRQ $0x34, R11
- SHLQ $0x20, R12
- IMULQ R8, R12
- SHRQ $0x34, R12
+ SHLQ $0x10, R12
+ IMULQ SI, R12
+ SHRQ $0x32, R12
+ SHLQ $0x20, R13
+ IMULQ R8, R13
+ SHRQ $0x34, R13
+ LEAQ 1(DI), R8
+ LEAQ 1(R9), R14
MOVL DI, 24(SP)(R10*4)
- MOVL R14, 24(SP)(R13*4)
- MOVL R14, 65560(SP)(R11*4)
- MOVL R15, 65560(SP)(R12*4)
- MOVQ R9, R10
- MOVQ R9, R11
- SHRQ $0x08, R11
- MOVQ R11, R13
- LEAL -2(CX), R9
- LEAL -1(CX), DI
+ MOVL R9, 24(SP)(R12*4)
+ MOVL R8, 65560(SP)(R11*4)
+ MOVL R14, 65560(SP)(R13*4)
+ ADDQ $0x01, DI
+ SUBQ $0x01, R9
+
+index_loop_encodeBetterBlockAsm12B:
+ CMPQ DI, R9
+ JAE search_loop_encodeBetterBlockAsm12B
+ MOVQ (DX)(DI*1), R8
+ MOVQ (DX)(R9*1), R10
+ SHLQ $0x10, R8
+ IMULQ SI, R8
+ SHRQ $0x32, R8
SHLQ $0x10, R10
IMULQ SI, R10
SHRQ $0x32, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x34, R11
- SHLQ $0x10, R13
- IMULQ SI, R13
- SHRQ $0x32, R13
+ MOVL DI, 24(SP)(R8*4)
MOVL R9, 24(SP)(R10*4)
- MOVL DI, 65560(SP)(R11*4)
- MOVL DI, 24(SP)(R13*4)
- JMP search_loop_encodeBetterBlockAsm12B
+ ADDQ $0x02, DI
+ SUBQ $0x02, R9
+ JMP index_loop_encodeBetterBlockAsm12B
emit_remainder_encodeBetterBlockAsm12B:
MOVQ src_len+32(FP), CX
@@ -8707,12 +8727,22 @@ search_loop_encodeBetterBlockAsm10B:
MOVL 16408(SP)(R11*4), R8
MOVL CX, 24(SP)(R10*4)
MOVL CX, 16408(SP)(R11*4)
- CMPL (DX)(SI*1), DI
+ MOVQ (DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ CMPQ R10, DI
JEQ candidate_match_encodeBetterBlockAsm10B
- CMPL (DX)(R8*1), DI
- JEQ candidateS_match_encodeBetterBlockAsm10B
- MOVL 20(SP), CX
- JMP search_loop_encodeBetterBlockAsm10B
+ CMPQ R11, DI
+ JNE no_short_found_encodeBetterBlockAsm10B
+ MOVL R8, SI
+ JMP candidate_match_encodeBetterBlockAsm10B
+
+no_short_found_encodeBetterBlockAsm10B:
+ CMPL R10, DI
+ JEQ candidate_match_encodeBetterBlockAsm10B
+ CMPL R11, DI
+ JEQ candidateS_match_encodeBetterBlockAsm10B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeBetterBlockAsm10B
candidateS_match_encodeBetterBlockAsm10B:
SHRQ $0x08, DI
@@ -9283,52 +9313,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm10B:
match_nolit_dst_ok_encodeBetterBlockAsm10B:
MOVQ $0x0000cf1bbcdcbf9b, SI
MOVQ $0x9e3779b1, R8
- INCL DI
- MOVQ (DX)(DI*1), R9
- MOVQ R9, R10
- MOVQ R9, R11
- MOVQ R9, R12
- SHRQ $0x08, R11
- MOVQ R11, R13
- SHRQ $0x10, R12
- LEAL 1(DI), R14
- LEAL 2(DI), R15
- MOVQ -2(DX)(CX*1), R9
+ LEAQ 1(DI), DI
+ LEAQ -2(CX), R9
+ MOVQ (DX)(DI*1), R10
+ MOVQ 1(DX)(DI*1), R11
+ MOVQ (DX)(R9*1), R12
+ MOVQ 1(DX)(R9*1), R13
SHLQ $0x10, R10
IMULQ SI, R10
SHRQ $0x34, R10
- SHLQ $0x10, R13
- IMULQ SI, R13
- SHRQ $0x34, R13
SHLQ $0x20, R11
IMULQ R8, R11
SHRQ $0x36, R11
- SHLQ $0x20, R12
- IMULQ R8, R12
- SHRQ $0x36, R12
+ SHLQ $0x10, R12
+ IMULQ SI, R12
+ SHRQ $0x34, R12
+ SHLQ $0x20, R13
+ IMULQ R8, R13
+ SHRQ $0x36, R13
+ LEAQ 1(DI), R8
+ LEAQ 1(R9), R14
MOVL DI, 24(SP)(R10*4)
- MOVL R14, 24(SP)(R13*4)
- MOVL R14, 16408(SP)(R11*4)
- MOVL R15, 16408(SP)(R12*4)
- MOVQ R9, R10
- MOVQ R9, R11
- SHRQ $0x08, R11
- MOVQ R11, R13
- LEAL -2(CX), R9
- LEAL -1(CX), DI
+ MOVL R9, 24(SP)(R12*4)
+ MOVL R8, 16408(SP)(R11*4)
+ MOVL R14, 16408(SP)(R13*4)
+ ADDQ $0x01, DI
+ SUBQ $0x01, R9
+
+index_loop_encodeBetterBlockAsm10B:
+ CMPQ DI, R9
+ JAE search_loop_encodeBetterBlockAsm10B
+ MOVQ (DX)(DI*1), R8
+ MOVQ (DX)(R9*1), R10
+ SHLQ $0x10, R8
+ IMULQ SI, R8
+ SHRQ $0x34, R8
SHLQ $0x10, R10
IMULQ SI, R10
SHRQ $0x34, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x36, R11
- SHLQ $0x10, R13
- IMULQ SI, R13
- SHRQ $0x34, R13
+ MOVL DI, 24(SP)(R8*4)
MOVL R9, 24(SP)(R10*4)
- MOVL DI, 16408(SP)(R11*4)
- MOVL DI, 24(SP)(R13*4)
- JMP search_loop_encodeBetterBlockAsm10B
+ ADDQ $0x02, DI
+ SUBQ $0x02, R9
+ JMP index_loop_encodeBetterBlockAsm10B
emit_remainder_encodeBetterBlockAsm10B:
MOVQ src_len+32(FP), CX
@@ -9543,12 +9570,22 @@ search_loop_encodeBetterBlockAsm8B:
MOVL 4120(SP)(R11*4), R8
MOVL CX, 24(SP)(R10*4)
MOVL CX, 4120(SP)(R11*4)
- CMPL (DX)(SI*1), DI
+ MOVQ (DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ CMPQ R10, DI
JEQ candidate_match_encodeBetterBlockAsm8B
- CMPL (DX)(R8*1), DI
- JEQ candidateS_match_encodeBetterBlockAsm8B
- MOVL 20(SP), CX
- JMP search_loop_encodeBetterBlockAsm8B
+ CMPQ R11, DI
+ JNE no_short_found_encodeBetterBlockAsm8B
+ MOVL R8, SI
+ JMP candidate_match_encodeBetterBlockAsm8B
+
+no_short_found_encodeBetterBlockAsm8B:
+ CMPL R10, DI
+ JEQ candidate_match_encodeBetterBlockAsm8B
+ CMPL R11, DI
+ JEQ candidateS_match_encodeBetterBlockAsm8B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeBetterBlockAsm8B
candidateS_match_encodeBetterBlockAsm8B:
SHRQ $0x08, DI
@@ -10105,52 +10142,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm8B:
match_nolit_dst_ok_encodeBetterBlockAsm8B:
MOVQ $0x0000cf1bbcdcbf9b, SI
MOVQ $0x9e3779b1, R8
- INCL DI
- MOVQ (DX)(DI*1), R9
- MOVQ R9, R10
- MOVQ R9, R11
- MOVQ R9, R12
- SHRQ $0x08, R11
- MOVQ R11, R13
- SHRQ $0x10, R12
- LEAL 1(DI), R14
- LEAL 2(DI), R15
- MOVQ -2(DX)(CX*1), R9
+ LEAQ 1(DI), DI
+ LEAQ -2(CX), R9
+ MOVQ (DX)(DI*1), R10
+ MOVQ 1(DX)(DI*1), R11
+ MOVQ (DX)(R9*1), R12
+ MOVQ 1(DX)(R9*1), R13
SHLQ $0x10, R10
IMULQ SI, R10
SHRQ $0x36, R10
- SHLQ $0x10, R13
- IMULQ SI, R13
- SHRQ $0x36, R13
SHLQ $0x20, R11
IMULQ R8, R11
SHRQ $0x38, R11
- SHLQ $0x20, R12
- IMULQ R8, R12
- SHRQ $0x38, R12
+ SHLQ $0x10, R12
+ IMULQ SI, R12
+ SHRQ $0x36, R12
+ SHLQ $0x20, R13
+ IMULQ R8, R13
+ SHRQ $0x38, R13
+ LEAQ 1(DI), R8
+ LEAQ 1(R9), R14
MOVL DI, 24(SP)(R10*4)
- MOVL R14, 24(SP)(R13*4)
- MOVL R14, 4120(SP)(R11*4)
- MOVL R15, 4120(SP)(R12*4)
- MOVQ R9, R10
- MOVQ R9, R11
- SHRQ $0x08, R11
- MOVQ R11, R13
- LEAL -2(CX), R9
- LEAL -1(CX), DI
+ MOVL R9, 24(SP)(R12*4)
+ MOVL R8, 4120(SP)(R11*4)
+ MOVL R14, 4120(SP)(R13*4)
+ ADDQ $0x01, DI
+ SUBQ $0x01, R9
+
+index_loop_encodeBetterBlockAsm8B:
+ CMPQ DI, R9
+ JAE search_loop_encodeBetterBlockAsm8B
+ MOVQ (DX)(DI*1), R8
+ MOVQ (DX)(R9*1), R10
+ SHLQ $0x10, R8
+ IMULQ SI, R8
+ SHRQ $0x36, R8
SHLQ $0x10, R10
IMULQ SI, R10
SHRQ $0x36, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x38, R11
- SHLQ $0x10, R13
- IMULQ SI, R13
- SHRQ $0x36, R13
+ MOVL DI, 24(SP)(R8*4)
MOVL R9, 24(SP)(R10*4)
- MOVL DI, 4120(SP)(R11*4)
- MOVL DI, 24(SP)(R13*4)
- JMP search_loop_encodeBetterBlockAsm8B
+ ADDQ $0x02, DI
+ SUBQ $0x02, R9
+ JMP index_loop_encodeBetterBlockAsm8B
emit_remainder_encodeBetterBlockAsm8B:
MOVQ src_len+32(FP), CX
@@ -14287,9 +14321,9 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B:
// func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int
// Requires: BMI, SSE2
-TEXT ·encodeSnappyBetterBlockAsm(SB), $327704-56
+TEXT ·encodeSnappyBetterBlockAsm(SB), $589848-56
MOVQ dst_base+0(FP), AX
- MOVQ $0x00000a00, CX
+ MOVQ $0x00001200, CX
LEAQ 24(SP), DX
PXOR X0, X0
@@ -14341,27 +14375,37 @@ check_maxskip_cont_encodeSnappyBetterBlockAsm:
MOVQ DI, R11
SHLQ $0x08, R10
IMULQ R9, R10
- SHRQ $0x30, R10
+ SHRQ $0x2f, R10
SHLQ $0x20, R11
IMULQ SI, R11
SHRQ $0x32, R11
MOVL 24(SP)(R10*4), SI
- MOVL 262168(SP)(R11*4), R8
+ MOVL 524312(SP)(R11*4), R8
MOVL CX, 24(SP)(R10*4)
- MOVL CX, 262168(SP)(R11*4)
- CMPL (DX)(SI*1), DI
+ MOVL CX, 524312(SP)(R11*4)
+ MOVQ (DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ CMPQ R10, DI
JEQ candidate_match_encodeSnappyBetterBlockAsm
- CMPL (DX)(R8*1), DI
- JEQ candidateS_match_encodeSnappyBetterBlockAsm
- MOVL 20(SP), CX
- JMP search_loop_encodeSnappyBetterBlockAsm
+ CMPQ R11, DI
+ JNE no_short_found_encodeSnappyBetterBlockAsm
+ MOVL R8, SI
+ JMP candidate_match_encodeSnappyBetterBlockAsm
+
+no_short_found_encodeSnappyBetterBlockAsm:
+ CMPL R10, DI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm
+ CMPL R11, DI
+ JEQ candidateS_match_encodeSnappyBetterBlockAsm
+ MOVL 20(SP), CX
+ JMP search_loop_encodeSnappyBetterBlockAsm
candidateS_match_encodeSnappyBetterBlockAsm:
SHRQ $0x08, DI
MOVQ DI, R10
SHLQ $0x08, R10
IMULQ R9, R10
- SHRQ $0x30, R10
+ SHRQ $0x2f, R10
MOVL 24(SP)(R10*4), SI
INCL CX
MOVL CX, 24(SP)(R10*4)
@@ -14685,52 +14729,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm:
match_nolit_dst_ok_encodeSnappyBetterBlockAsm:
MOVQ $0x00cf1bbcdcbfa563, SI
MOVQ $0x9e3779b1, R8
- INCL DI
- MOVQ (DX)(DI*1), R9
- MOVQ R9, R10
- MOVQ R9, R11
- MOVQ R9, R12
- SHRQ $0x08, R11
- MOVQ R11, R13
- SHRQ $0x10, R12
- LEAL 1(DI), R14
- LEAL 2(DI), R15
- MOVQ -2(DX)(CX*1), R9
+ LEAQ 1(DI), DI
+ LEAQ -2(CX), R9
+ MOVQ (DX)(DI*1), R10
+ MOVQ 1(DX)(DI*1), R11
+ MOVQ (DX)(R9*1), R12
+ MOVQ 1(DX)(R9*1), R13
SHLQ $0x08, R10
IMULQ SI, R10
- SHRQ $0x30, R10
- SHLQ $0x08, R13
- IMULQ SI, R13
- SHRQ $0x30, R13
+ SHRQ $0x2f, R10
SHLQ $0x20, R11
IMULQ R8, R11
SHRQ $0x32, R11
- SHLQ $0x20, R12
- IMULQ R8, R12
- SHRQ $0x32, R12
+ SHLQ $0x08, R12
+ IMULQ SI, R12
+ SHRQ $0x2f, R12
+ SHLQ $0x20, R13
+ IMULQ R8, R13
+ SHRQ $0x32, R13
+ LEAQ 1(DI), R8
+ LEAQ 1(R9), R14
MOVL DI, 24(SP)(R10*4)
- MOVL R14, 24(SP)(R13*4)
- MOVL R14, 262168(SP)(R11*4)
- MOVL R15, 262168(SP)(R12*4)
- MOVQ R9, R10
- MOVQ R9, R11
- SHRQ $0x08, R11
- MOVQ R11, R13
- LEAL -2(CX), R9
- LEAL -1(CX), DI
+ MOVL R9, 24(SP)(R12*4)
+ MOVL R8, 524312(SP)(R11*4)
+ MOVL R14, 524312(SP)(R13*4)
+ ADDQ $0x01, DI
+ SUBQ $0x01, R9
+
+index_loop_encodeSnappyBetterBlockAsm:
+ CMPQ DI, R9
+ JAE search_loop_encodeSnappyBetterBlockAsm
+ MOVQ (DX)(DI*1), R8
+ MOVQ (DX)(R9*1), R10
+ SHLQ $0x08, R8
+ IMULQ SI, R8
+ SHRQ $0x2f, R8
SHLQ $0x08, R10
IMULQ SI, R10
- SHRQ $0x30, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x32, R11
- SHLQ $0x08, R13
- IMULQ SI, R13
- SHRQ $0x30, R13
+ SHRQ $0x2f, R10
+ MOVL DI, 24(SP)(R8*4)
MOVL R9, 24(SP)(R10*4)
- MOVL DI, 262168(SP)(R11*4)
- MOVL DI, 24(SP)(R13*4)
- JMP search_loop_encodeSnappyBetterBlockAsm
+ ADDQ $0x02, DI
+ SUBQ $0x02, R9
+ JMP index_loop_encodeSnappyBetterBlockAsm
emit_remainder_encodeSnappyBetterBlockAsm:
MOVQ src_len+32(FP), CX
@@ -14964,12 +15005,22 @@ search_loop_encodeSnappyBetterBlockAsm64K:
MOVL 262168(SP)(R11*4), R8
MOVL CX, 24(SP)(R10*4)
MOVL CX, 262168(SP)(R11*4)
- CMPL (DX)(SI*1), DI
+ MOVQ (DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ CMPQ R10, DI
JEQ candidate_match_encodeSnappyBetterBlockAsm64K
- CMPL (DX)(R8*1), DI
- JEQ candidateS_match_encodeSnappyBetterBlockAsm64K
- MOVL 20(SP), CX
- JMP search_loop_encodeSnappyBetterBlockAsm64K
+ CMPQ R11, DI
+ JNE no_short_found_encodeSnappyBetterBlockAsm64K
+ MOVL R8, SI
+ JMP candidate_match_encodeSnappyBetterBlockAsm64K
+
+no_short_found_encodeSnappyBetterBlockAsm64K:
+ CMPL R10, DI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm64K
+ CMPL R11, DI
+ JEQ candidateS_match_encodeSnappyBetterBlockAsm64K
+ MOVL 20(SP), CX
+ JMP search_loop_encodeSnappyBetterBlockAsm64K
candidateS_match_encodeSnappyBetterBlockAsm64K:
SHRQ $0x08, DI
@@ -15248,52 +15299,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K:
match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K:
MOVQ $0x00cf1bbcdcbfa563, SI
MOVQ $0x9e3779b1, R8
- INCL DI
- MOVQ (DX)(DI*1), R9
- MOVQ R9, R10
- MOVQ R9, R11
- MOVQ R9, R12
- SHRQ $0x08, R11
- MOVQ R11, R13
- SHRQ $0x10, R12
- LEAL 1(DI), R14
- LEAL 2(DI), R15
- MOVQ -2(DX)(CX*1), R9
+ LEAQ 1(DI), DI
+ LEAQ -2(CX), R9
+ MOVQ (DX)(DI*1), R10
+ MOVQ 1(DX)(DI*1), R11
+ MOVQ (DX)(R9*1), R12
+ MOVQ 1(DX)(R9*1), R13
SHLQ $0x08, R10
IMULQ SI, R10
SHRQ $0x30, R10
- SHLQ $0x08, R13
- IMULQ SI, R13
- SHRQ $0x30, R13
SHLQ $0x20, R11
IMULQ R8, R11
SHRQ $0x32, R11
- SHLQ $0x20, R12
- IMULQ R8, R12
- SHRQ $0x32, R12
+ SHLQ $0x08, R12
+ IMULQ SI, R12
+ SHRQ $0x30, R12
+ SHLQ $0x20, R13
+ IMULQ R8, R13
+ SHRQ $0x32, R13
+ LEAQ 1(DI), R8
+ LEAQ 1(R9), R14
MOVL DI, 24(SP)(R10*4)
- MOVL R14, 24(SP)(R13*4)
- MOVL R14, 262168(SP)(R11*4)
- MOVL R15, 262168(SP)(R12*4)
- MOVQ R9, R10
- MOVQ R9, R11
- SHRQ $0x08, R11
- MOVQ R11, R13
- LEAL -2(CX), R9
- LEAL -1(CX), DI
+ MOVL R9, 24(SP)(R12*4)
+ MOVL R8, 262168(SP)(R11*4)
+ MOVL R14, 262168(SP)(R13*4)
+ ADDQ $0x01, DI
+ SUBQ $0x01, R9
+
+index_loop_encodeSnappyBetterBlockAsm64K:
+ CMPQ DI, R9
+ JAE search_loop_encodeSnappyBetterBlockAsm64K
+ MOVQ (DX)(DI*1), R8
+ MOVQ (DX)(R9*1), R10
+ SHLQ $0x08, R8
+ IMULQ SI, R8
+ SHRQ $0x30, R8
SHLQ $0x08, R10
IMULQ SI, R10
SHRQ $0x30, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x32, R11
- SHLQ $0x08, R13
- IMULQ SI, R13
- SHRQ $0x30, R13
+ MOVL DI, 24(SP)(R8*4)
MOVL R9, 24(SP)(R10*4)
- MOVL DI, 262168(SP)(R11*4)
- MOVL DI, 24(SP)(R13*4)
- JMP search_loop_encodeSnappyBetterBlockAsm64K
+ ADDQ $0x02, DI
+ SUBQ $0x02, R9
+ JMP index_loop_encodeSnappyBetterBlockAsm64K
emit_remainder_encodeSnappyBetterBlockAsm64K:
MOVQ src_len+32(FP), CX
@@ -15508,12 +15556,22 @@ search_loop_encodeSnappyBetterBlockAsm12B:
MOVL 65560(SP)(R11*4), R8
MOVL CX, 24(SP)(R10*4)
MOVL CX, 65560(SP)(R11*4)
- CMPL (DX)(SI*1), DI
+ MOVQ (DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ CMPQ R10, DI
JEQ candidate_match_encodeSnappyBetterBlockAsm12B
- CMPL (DX)(R8*1), DI
- JEQ candidateS_match_encodeSnappyBetterBlockAsm12B
- MOVL 20(SP), CX
- JMP search_loop_encodeSnappyBetterBlockAsm12B
+ CMPQ R11, DI
+ JNE no_short_found_encodeSnappyBetterBlockAsm12B
+ MOVL R8, SI
+ JMP candidate_match_encodeSnappyBetterBlockAsm12B
+
+no_short_found_encodeSnappyBetterBlockAsm12B:
+ CMPL R10, DI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm12B
+ CMPL R11, DI
+ JEQ candidateS_match_encodeSnappyBetterBlockAsm12B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeSnappyBetterBlockAsm12B
candidateS_match_encodeSnappyBetterBlockAsm12B:
SHRQ $0x08, DI
@@ -15792,52 +15850,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B:
match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B:
MOVQ $0x0000cf1bbcdcbf9b, SI
MOVQ $0x9e3779b1, R8
- INCL DI
- MOVQ (DX)(DI*1), R9
- MOVQ R9, R10
- MOVQ R9, R11
- MOVQ R9, R12
- SHRQ $0x08, R11
- MOVQ R11, R13
- SHRQ $0x10, R12
- LEAL 1(DI), R14
- LEAL 2(DI), R15
- MOVQ -2(DX)(CX*1), R9
+ LEAQ 1(DI), DI
+ LEAQ -2(CX), R9
+ MOVQ (DX)(DI*1), R10
+ MOVQ 1(DX)(DI*1), R11
+ MOVQ (DX)(R9*1), R12
+ MOVQ 1(DX)(R9*1), R13
SHLQ $0x10, R10
IMULQ SI, R10
SHRQ $0x32, R10
- SHLQ $0x10, R13
- IMULQ SI, R13
- SHRQ $0x32, R13
SHLQ $0x20, R11
IMULQ R8, R11
SHRQ $0x34, R11
- SHLQ $0x20, R12
- IMULQ R8, R12
- SHRQ $0x34, R12
+ SHLQ $0x10, R12
+ IMULQ SI, R12
+ SHRQ $0x32, R12
+ SHLQ $0x20, R13
+ IMULQ R8, R13
+ SHRQ $0x34, R13
+ LEAQ 1(DI), R8
+ LEAQ 1(R9), R14
MOVL DI, 24(SP)(R10*4)
- MOVL R14, 24(SP)(R13*4)
- MOVL R14, 65560(SP)(R11*4)
- MOVL R15, 65560(SP)(R12*4)
- MOVQ R9, R10
- MOVQ R9, R11
- SHRQ $0x08, R11
- MOVQ R11, R13
- LEAL -2(CX), R9
- LEAL -1(CX), DI
+ MOVL R9, 24(SP)(R12*4)
+ MOVL R8, 65560(SP)(R11*4)
+ MOVL R14, 65560(SP)(R13*4)
+ ADDQ $0x01, DI
+ SUBQ $0x01, R9
+
+index_loop_encodeSnappyBetterBlockAsm12B:
+ CMPQ DI, R9
+ JAE search_loop_encodeSnappyBetterBlockAsm12B
+ MOVQ (DX)(DI*1), R8
+ MOVQ (DX)(R9*1), R10
+ SHLQ $0x10, R8
+ IMULQ SI, R8
+ SHRQ $0x32, R8
SHLQ $0x10, R10
IMULQ SI, R10
SHRQ $0x32, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x34, R11
- SHLQ $0x10, R13
- IMULQ SI, R13
- SHRQ $0x32, R13
+ MOVL DI, 24(SP)(R8*4)
MOVL R9, 24(SP)(R10*4)
- MOVL DI, 65560(SP)(R11*4)
- MOVL DI, 24(SP)(R13*4)
- JMP search_loop_encodeSnappyBetterBlockAsm12B
+ ADDQ $0x02, DI
+ SUBQ $0x02, R9
+ JMP index_loop_encodeSnappyBetterBlockAsm12B
emit_remainder_encodeSnappyBetterBlockAsm12B:
MOVQ src_len+32(FP), CX
@@ -16052,12 +16107,22 @@ search_loop_encodeSnappyBetterBlockAsm10B:
MOVL 16408(SP)(R11*4), R8
MOVL CX, 24(SP)(R10*4)
MOVL CX, 16408(SP)(R11*4)
- CMPL (DX)(SI*1), DI
+ MOVQ (DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ CMPQ R10, DI
JEQ candidate_match_encodeSnappyBetterBlockAsm10B
- CMPL (DX)(R8*1), DI
- JEQ candidateS_match_encodeSnappyBetterBlockAsm10B
- MOVL 20(SP), CX
- JMP search_loop_encodeSnappyBetterBlockAsm10B
+ CMPQ R11, DI
+ JNE no_short_found_encodeSnappyBetterBlockAsm10B
+ MOVL R8, SI
+ JMP candidate_match_encodeSnappyBetterBlockAsm10B
+
+no_short_found_encodeSnappyBetterBlockAsm10B:
+ CMPL R10, DI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm10B
+ CMPL R11, DI
+ JEQ candidateS_match_encodeSnappyBetterBlockAsm10B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeSnappyBetterBlockAsm10B
candidateS_match_encodeSnappyBetterBlockAsm10B:
SHRQ $0x08, DI
@@ -16336,52 +16401,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B:
match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B:
MOVQ $0x0000cf1bbcdcbf9b, SI
MOVQ $0x9e3779b1, R8
- INCL DI
- MOVQ (DX)(DI*1), R9
- MOVQ R9, R10
- MOVQ R9, R11
- MOVQ R9, R12
- SHRQ $0x08, R11
- MOVQ R11, R13
- SHRQ $0x10, R12
- LEAL 1(DI), R14
- LEAL 2(DI), R15
- MOVQ -2(DX)(CX*1), R9
+ LEAQ 1(DI), DI
+ LEAQ -2(CX), R9
+ MOVQ (DX)(DI*1), R10
+ MOVQ 1(DX)(DI*1), R11
+ MOVQ (DX)(R9*1), R12
+ MOVQ 1(DX)(R9*1), R13
SHLQ $0x10, R10
IMULQ SI, R10
SHRQ $0x34, R10
- SHLQ $0x10, R13
- IMULQ SI, R13
- SHRQ $0x34, R13
SHLQ $0x20, R11
IMULQ R8, R11
SHRQ $0x36, R11
- SHLQ $0x20, R12
- IMULQ R8, R12
- SHRQ $0x36, R12
+ SHLQ $0x10, R12
+ IMULQ SI, R12
+ SHRQ $0x34, R12
+ SHLQ $0x20, R13
+ IMULQ R8, R13
+ SHRQ $0x36, R13
+ LEAQ 1(DI), R8
+ LEAQ 1(R9), R14
MOVL DI, 24(SP)(R10*4)
- MOVL R14, 24(SP)(R13*4)
- MOVL R14, 16408(SP)(R11*4)
- MOVL R15, 16408(SP)(R12*4)
- MOVQ R9, R10
- MOVQ R9, R11
- SHRQ $0x08, R11
- MOVQ R11, R13
- LEAL -2(CX), R9
- LEAL -1(CX), DI
+ MOVL R9, 24(SP)(R12*4)
+ MOVL R8, 16408(SP)(R11*4)
+ MOVL R14, 16408(SP)(R13*4)
+ ADDQ $0x01, DI
+ SUBQ $0x01, R9
+
+index_loop_encodeSnappyBetterBlockAsm10B:
+ CMPQ DI, R9
+ JAE search_loop_encodeSnappyBetterBlockAsm10B
+ MOVQ (DX)(DI*1), R8
+ MOVQ (DX)(R9*1), R10
+ SHLQ $0x10, R8
+ IMULQ SI, R8
+ SHRQ $0x34, R8
SHLQ $0x10, R10
IMULQ SI, R10
SHRQ $0x34, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x36, R11
- SHLQ $0x10, R13
- IMULQ SI, R13
- SHRQ $0x34, R13
+ MOVL DI, 24(SP)(R8*4)
MOVL R9, 24(SP)(R10*4)
- MOVL DI, 16408(SP)(R11*4)
- MOVL DI, 24(SP)(R13*4)
- JMP search_loop_encodeSnappyBetterBlockAsm10B
+ ADDQ $0x02, DI
+ SUBQ $0x02, R9
+ JMP index_loop_encodeSnappyBetterBlockAsm10B
emit_remainder_encodeSnappyBetterBlockAsm10B:
MOVQ src_len+32(FP), CX
@@ -16596,12 +16658,22 @@ search_loop_encodeSnappyBetterBlockAsm8B:
MOVL 4120(SP)(R11*4), R8
MOVL CX, 24(SP)(R10*4)
MOVL CX, 4120(SP)(R11*4)
- CMPL (DX)(SI*1), DI
+ MOVQ (DX)(SI*1), R10
+ MOVQ (DX)(R8*1), R11
+ CMPQ R10, DI
JEQ candidate_match_encodeSnappyBetterBlockAsm8B
- CMPL (DX)(R8*1), DI
- JEQ candidateS_match_encodeSnappyBetterBlockAsm8B
- MOVL 20(SP), CX
- JMP search_loop_encodeSnappyBetterBlockAsm8B
+ CMPQ R11, DI
+ JNE no_short_found_encodeSnappyBetterBlockAsm8B
+ MOVL R8, SI
+ JMP candidate_match_encodeSnappyBetterBlockAsm8B
+
+no_short_found_encodeSnappyBetterBlockAsm8B:
+ CMPL R10, DI
+ JEQ candidate_match_encodeSnappyBetterBlockAsm8B
+ CMPL R11, DI
+ JEQ candidateS_match_encodeSnappyBetterBlockAsm8B
+ MOVL 20(SP), CX
+ JMP search_loop_encodeSnappyBetterBlockAsm8B
candidateS_match_encodeSnappyBetterBlockAsm8B:
SHRQ $0x08, DI
@@ -16878,52 +16950,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B:
match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B:
MOVQ $0x0000cf1bbcdcbf9b, SI
MOVQ $0x9e3779b1, R8
- INCL DI
- MOVQ (DX)(DI*1), R9
- MOVQ R9, R10
- MOVQ R9, R11
- MOVQ R9, R12
- SHRQ $0x08, R11
- MOVQ R11, R13
- SHRQ $0x10, R12
- LEAL 1(DI), R14
- LEAL 2(DI), R15
- MOVQ -2(DX)(CX*1), R9
+ LEAQ 1(DI), DI
+ LEAQ -2(CX), R9
+ MOVQ (DX)(DI*1), R10
+ MOVQ 1(DX)(DI*1), R11
+ MOVQ (DX)(R9*1), R12
+ MOVQ 1(DX)(R9*1), R13
SHLQ $0x10, R10
IMULQ SI, R10
SHRQ $0x36, R10
- SHLQ $0x10, R13
- IMULQ SI, R13
- SHRQ $0x36, R13
SHLQ $0x20, R11
IMULQ R8, R11
SHRQ $0x38, R11
- SHLQ $0x20, R12
- IMULQ R8, R12
- SHRQ $0x38, R12
+ SHLQ $0x10, R12
+ IMULQ SI, R12
+ SHRQ $0x36, R12
+ SHLQ $0x20, R13
+ IMULQ R8, R13
+ SHRQ $0x38, R13
+ LEAQ 1(DI), R8
+ LEAQ 1(R9), R14
MOVL DI, 24(SP)(R10*4)
- MOVL R14, 24(SP)(R13*4)
- MOVL R14, 4120(SP)(R11*4)
- MOVL R15, 4120(SP)(R12*4)
- MOVQ R9, R10
- MOVQ R9, R11
- SHRQ $0x08, R11
- MOVQ R11, R13
- LEAL -2(CX), R9
- LEAL -1(CX), DI
+ MOVL R9, 24(SP)(R12*4)
+ MOVL R8, 4120(SP)(R11*4)
+ MOVL R14, 4120(SP)(R13*4)
+ ADDQ $0x01, DI
+ SUBQ $0x01, R9
+
+index_loop_encodeSnappyBetterBlockAsm8B:
+ CMPQ DI, R9
+ JAE search_loop_encodeSnappyBetterBlockAsm8B
+ MOVQ (DX)(DI*1), R8
+ MOVQ (DX)(R9*1), R10
+ SHLQ $0x10, R8
+ IMULQ SI, R8
+ SHRQ $0x36, R8
SHLQ $0x10, R10
IMULQ SI, R10
SHRQ $0x36, R10
- SHLQ $0x20, R11
- IMULQ R8, R11
- SHRQ $0x38, R11
- SHLQ $0x10, R13
- IMULQ SI, R13
- SHRQ $0x36, R13
+ MOVL DI, 24(SP)(R8*4)
MOVL R9, 24(SP)(R10*4)
- MOVL DI, 4120(SP)(R11*4)
- MOVL DI, 24(SP)(R13*4)
- JMP search_loop_encodeSnappyBetterBlockAsm8B
+ ADDQ $0x02, DI
+ SUBQ $0x02, R9
+ JMP index_loop_encodeSnappyBetterBlockAsm8B
emit_remainder_encodeSnappyBetterBlockAsm8B:
MOVQ src_len+32(FP), CX