10 files changed, 784 insertions, 645 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/README.md b/vendor/github.com/klauspost/compress/s2/README.md
index 73c0c462d..1d80c42a5 100644
--- a/vendor/github.com/klauspost/compress/s2/README.md
+++ b/vendor/github.com/klauspost/compress/s2/README.md
@@ -325,35 +325,35 @@ The content compressed in this mode is fully compatible with the standard decode
 
 Snappy vs S2 **compression** speed on a 16 core (32 thread) computer, using all threads and a single thread (1 CPU):
 
-| File                                                                                                | S2 speed | S2 Throughput | S2 % smaller | S2 "better" | "better" throughput | "better" % smaller |
-|-----------------------------------------------------------------------------------------------------|----------|---------------|--------------|-------------|---------------------|--------------------|
-| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z)                    | 12.70x   | 10556 MB/s    | 7.35%        | 4.15x       | 3455 MB/s           | 12.79%             |
-| (1 CPU)                                                                                             | 1.14x    | 948 MB/s      | -            | 0.42x       | 349 MB/s            | -                  |
-| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 17.13x   | 14484 MB/s    | 31.60%       | 10.09x      | 8533 MB/s           | 37.71%             |
-| (1 CPU)                                                                                             | 1.33x    | 1127 MB/s     | -            | 0.70x       | 589 MB/s            | -                  |
-| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst)         | 15.14x   | 12000 MB/s    | -5.79%       | 6.59x       | 5223 MB/s           | 5.80%              |
-| (1 CPU)                                                                                             | 1.11x    | 877 MB/s      | -            | 0.47x       | 370 MB/s            | -                  |
-| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst)                     | 14.62x   | 12116 MB/s    | 15.90%       | 5.35x       | 4430 MB/s           | 16.08%             |
-| (1 CPU)                                                                                             | 1.38x    | 1146 MB/s     | -            | 0.38x       | 312 MB/s            | -                  |
-| [adresser.json](https://files.klauspost.com/compress/adresser.json.zst)                             | 8.83x    | 17579 MB/s    | 43.86%       | 6.54x       | 13011 MB/s          | 47.23%             |
-| (1 CPU)                                                                                             | 1.14x    | 2259 MB/s     | -            | 0.74x       | 1475 MB/s           | -                  |
-| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z)                                    | 16.72x   | 14019 MB/s    | 24.02%       | 10.11x      | 8477 MB/s           | 30.48%             |
-| (1 CPU)                                                                                             | 1.24x    | 1043 MB/s     | -            | 0.70x       | 586 MB/s            | -                  |
-| [10gb.tar](http://mattmahoney.net/dc/10gb.html)                                                     | 13.33x   | 9254 MB/s     | 1.84%        | 6.75x       | 4686 MB/s           | 6.72%              |
-| (1 CPU)                                                                                             | 0.97x    | 672 MB/s      | -            | 0.53x       | 366 MB/s            | -                  |
-| sharnd.out.2gb                                                                                      | 2.11x    | 12639 MB/s    | 0.01%        | 1.98x       | 11833 MB/s          | 0.01%              |
-| (1 CPU)                                                                                             | 0.93x    | 5594 MB/s     | -            | 1.34x       | 8030 MB/s           | -                  |
-| [enwik9](http://mattmahoney.net/dc/textdata.html)                                                   | 19.34x   | 8220 MB/s     | 3.98%        | 7.87x       | 3345 MB/s           | 15.82%             |
-| (1 CPU)                                                                                             | 1.06x    | 452 MB/s      | -            | 0.50x       | 213 MB/s            | -                  |
-| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip)                                    | 10.48x   | 6124 MB/s     | 5.67%        | 3.76x       | 2197 MB/s           | 12.60%             |
-| (1 CPU)                                                                                             | 0.97x    | 568 MB/s      | -            | 0.46x       | 271 MB/s            | -                  |
-| [enwik10](https://encode.su/threads/3315-enwik10-benchmark-results)                                 | 21.07x   | 9020 MB/s     | 6.36%        | 6.91x       | 2959 MB/s           | 16.95%             |
-| (1 CPU)                                                                                             | 1.07x    | 460 MB/s      | -            | 0.51x       | 220 MB/s            | -                  |
+| File                                                                                                    | S2 Speed | S2 Throughput | S2 % smaller | S2 "better" | "better" throughput | "better" % smaller |
+|---------------------------------------------------------------------------------------------------------|----------|---------------|--------------|-------------|---------------------|--------------------|
+| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z)                        | 16.33x   | 10556 MB/s    | 8.0%         | 6.04x       | 5252 MB/s           | 14.7%              |
+| (1 CPU)                                                                                                 | 1.08x    | 940 MB/s      | -            | 0.46x       | 400 MB/s            | -                  |
+| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst)     | 16.51x   | 15224 MB/s    | 31.70%       | 9.47x       | 8734 MB/s           | 37.71%             |
+| (1 CPU)                                                                                                 | 1.26x    | 1157 MB/s     | -            | 0.60x       | 556 MB/s            | -                  |
+| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst)             | 15.14x   | 12598 MB/s    | -5.76%       | 6.23x       | 5675 MB/s           | 3.62%              |
+| (1 CPU)                                                                                                 | 1.02x    | 932 MB/s      | -            | 0.47x       | 432 MB/s            | -                  |
+| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst)                         | 11.21x   | 12116 MB/s    | 15.95%       | 3.24x       | 3500 MB/s           | 18.00%             |
+| (1 CPU)                                                                                                 | 1.05x    | 1135 MB/s     | -            | 0.27x       | 292 MB/s            | -                  |
+| [apache.log](https://files.klauspost.com/compress/apache.log.zst)                                       | 8.55x    | 16673 MB/s    | 20.54%       | 5.85x       | 11420 MB/s          | 24.97%             |
+| (1 CPU)                                                                                                 | 1.91x    | 1771 MB/s     | -            | 0.53x       | 1041 MB/s           | -                  |
+| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z)                                        | 15.76x   | 14357 MB/s    | 24.01%       | 8.67x       | 7891 MB/s           | 33.68%             |
+| (1 CPU)                                                                                                 | 1.17x    | 1064 MB/s     | -            | 0.65x       | 595 MB/s            | -                  |
+| [10gb.tar](http://mattmahoney.net/dc/10gb.html)                                                         | 13.33x   | 9835 MB/s     | 2.34%        | 6.85x       | 4863 MB/s           | 9.96%              |
+| (1 CPU)                                                                                                 | 0.97x    | 689 MB/s      | -            | 0.55x       | 387 MB/s            | -                  |
+| sharnd.out.2gb                                                                                          | 9.11x    | 13213 MB/s    | 0.01%        | 1.49x       | 9184 MB/s           | 0.01%              |
+| (1 CPU)                                                                                                 | 0.88x    | 5418 MB/s     | -            | 0.77x       | 5417 MB/s           | -                  |
+| [sofia-air-quality-dataset csv](https://files.klauspost.com/compress/sofia-air-quality-dataset.tar.zst) | 22.00x   | 11477 MB/s    | 18.73%       | 11.15x      | 5817 MB/s           | 27.88%             |
+| (1 CPU)                                                                                                 | 1.23x    | 642 MB/s      | -            | 0.71x       | 642 MB/s            | -                  |
+| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip)                                        | 11.23x   | 6520 MB/s     | 5.9%         | 5.35x       | 3109 MB/s           | 15.88%             |
+| (1 CPU)                                                                                                 | 1.05x    | 607 MB/s      | -            | 0.52x       | 304 MB/s            | -                  |
+| [enwik9](https://files.klauspost.com/compress/enwik9.zst)                                               | 19.28x   | 8440 MB/s     | 4.04%        | 9.31x       | 4076 MB/s           | 18.04%             |
+| (1 CPU)                                                                                                 | 1.12x    | 488 MB/s      | -            | 0.57x       | 250 MB/s            | -                  |
 
 ### Legend
 
-* `S2 speed`: Speed of S2 compared to Snappy, using 16 cores and 1 core.
-* `S2 throughput`: Throughput of S2 in MB/s. 
+* `S2 Speed`: Speed of S2 compared to Snappy, using 16 cores and 1 core.
+* `S2 Throughput`: Throughput of S2 in MB/s. 
 * `S2 % smaller`: How many percent smaller the S2 output is compared to the Snappy output size.
 * `S2 "better"`: Speed when enabling "better" compression mode in S2 compared to Snappy. 
 * `"better" throughput`: Throughput of S2 in MB/s when enabling "better" compression mode. 
@@ -361,7 +361,7 @@ Snappy vs S2 **compression** speed on 16 core (32 thread) computer, using all th
 
 There is a good speedup across the board when using a single thread and a significant speedup when using multiple threads.
 
-Machine generated data gets by far the biggest compression boost, with size being being reduced by up to 45% of Snappy size.
+Machine generated data gets by far the biggest compression boost, with size being reduced by up to 35% of Snappy size.
 
 The "better" compression mode sees a good improvement in all cases, but usually at a performance cost.
 
@@ -404,15 +404,15 @@ The "better" compression mode will actively look for shorter matches, which is w
 Decompression is also very fast without assembly. Single goroutine decompression speed, no assembly:
 
 | File                           | S2 Speed      | S2 Throughput |
-|--------------------------------|--------------|---------------|
-| consensus.db.10gb.s2           | 1.84x        | 2289.8 MB/s   |
-| 10gb.tar.s2                    | 1.30x        | 867.07 MB/s   |
-| rawstudio-mint14.tar.s2        | 1.66x        | 1329.65 MB/s  |
-| github-june-2days-2019.json.s2 | 2.36x        | 1831.59 MB/s  |
-| github-ranks-backup.bin.s2     | 1.73x        | 1390.7 MB/s   |
-| enwik9.s2                      | 1.67x        | 681.53 MB/s   |
-| adresser.json.s2               | 3.41x        | 4230.53 MB/s  |
-| silesia.tar.s2                 | 1.52x        | 811.58        |
+|--------------------------------|---------------|---------------|
+| consensus.db.10gb.s2           | 1.84x         | 2289.8 MB/s   |
+| 10gb.tar.s2                    | 1.30x         | 867.07 MB/s   |
+| rawstudio-mint14.tar.s2        | 1.66x         | 1329.65 MB/s  |
+| github-june-2days-2019.json.s2 | 2.36x         | 1831.59 MB/s  |
+| github-ranks-backup.bin.s2     | 1.73x         | 1390.7 MB/s   |
+| enwik9.s2                      | 1.67x         | 681.53 MB/s   |
+| adresser.json.s2               | 3.41x         | 4230.53 MB/s  |
+| silesia.tar.s2                 | 1.52x         | 811.58 MB/s   |
 
 Even though S2 typically compresses better than Snappy, decompression speed is always better. 
 
@@ -450,14 +450,14 @@ The most reliable is a wide dataset.
 For this we use [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z),
 53927 files, total input size: 4,014,735,833 bytes. Single goroutine used.
 
-| *                 | Input      | Output     | Reduction | MB/s   |
-|-------------------|------------|------------|-----------|--------|
-| S2                | 4014735833 | 1059723369 | 73.60%    | **934.34** |
-| S2 Better         | 4014735833 | 969670507  | 75.85%    | 532.70 |
-| S2 Best           | 4014735833 | 906625668  | **77.85%** | 46.84 |
-| Snappy            | 4014735833 | 1128706759 | 71.89%    | 762.59 |
-| S2, Snappy Output | 4014735833 | 1093821420 | 72.75%    | 908.60 |
-| LZ4               | 4014735833 | 1079259294 | 73.12%    | 526.94 |
+| *                 | Input      | Output     | Reduction  | MB/s       |
+|-------------------|------------|------------|------------|------------|
+| S2                | 4014735833 | 1059723369 | 73.60%     | **936.73** |
+| S2 Better         | 4014735833 | 961580539  | 76.05%     | 451.10     |
+| S2 Best           | 4014735833 | 899182886  | **77.60%** | 46.84      |
+| Snappy            | 4014735833 | 1128706759 | 71.89%     | 790.15     |
+| S2, Snappy Output | 4014735833 | 1093823291 | 72.75%     | 936.60     |
+| LZ4               | 4014735833 | 1063768713 | 73.50%     | 452.02     |
 
 S2 delivers both the best single threaded throughput with regular mode and the best compression rate with "best".
 "Better" mode provides the same compression speed as LZ4 with better compression ratio. 
@@ -489,42 +489,23 @@ AMD64 assembly is use for both S2 and Snappy.
 
 | Absolute Perf         | Snappy size | S2 Size | Snappy Speed | S2 Speed    | Snappy dec  | S2 dec      |
 |-----------------------|-------------|---------|--------------|-------------|-------------|-------------|
-| html                  | 22843       | 21111   | 16246 MB/s   | 17438 MB/s  | 40972 MB/s  | 49263 MB/s  |
-| urls.10K              | 335492      | 287326  | 7943 MB/s    | 9693 MB/s   | 22523 MB/s  | 26484 MB/s  |
-| fireworks.jpeg        | 123034      | 123100  | 349544 MB/s  | 273889 MB/s | 718321 MB/s | 827552 MB/s |
-| fireworks.jpeg (200B) | 146         | 155     | 8869 MB/s    | 17773 MB/s  | 33691 MB/s  | 52421 MB/s  |
-| paper-100k.pdf        | 85304       | 84459   | 167546 MB/s  | 101263 MB/s | 326905 MB/s | 291944 MB/s |
-| html_x_4              | 92234       | 21113   | 15194 MB/s   | 50670 MB/s  | 30843 MB/s  | 32217 MB/s  |
-| alice29.txt           | 88034       | 85975   | 5936 MB/s    | 6139 MB/s   | 12882 MB/s  | 20044 MB/s  |
-| asyoulik.txt          | 77503       | 79650   | 5517 MB/s    | 6366 MB/s   | 12735 MB/s  | 22806 MB/s  |
-| lcet10.txt            | 234661      | 220670  | 6235 MB/s    | 6067 MB/s   | 14519 MB/s  | 18697 MB/s  |
-| plrabn12.txt          | 319267      | 317985  | 5159 MB/s    | 5726 MB/s   | 11923 MB/s  | 19901 MB/s  |
-| geo.protodata         | 23335       | 18690   | 21220 MB/s   | 26529 MB/s  | 56271 MB/s  | 62540 MB/s  |
-| kppkn.gtb             | 69526       | 65312   | 9732 MB/s    | 8559 MB/s   | 18491 MB/s  | 18969 MB/s  |
-| alice29.txt (128B)    | 80          | 82      | 6691 MB/s    | 15489 MB/s  | 31883 MB/s  | 38874 MB/s  |
-| alice29.txt (1000B)   | 774         | 774     | 12204 MB/s   | 13000 MB/s  | 48056 MB/s  | 52341 MB/s  |
-| alice29.txt (10000B)  | 6648        | 6933    | 10044 MB/s   | 12806 MB/s  | 32378 MB/s  | 46322 MB/s  |
-| alice29.txt (20000B)  | 12686       | 13574   | 7733 MB/s    | 11210 MB/s  | 30566 MB/s  | 58969 MB/s  |
-
-
-| Relative Perf         | Snappy size | S2 size improved | S2 Speed | S2 Dec Speed |
-|-----------------------|-------------|------------------|----------|--------------|
-| html                  | 22.31%      | 7.58%            | 1.07x    | 1.20x        |
-| urls.10K              | 47.78%      | 14.36%           | 1.22x    | 1.18x        |
-| fireworks.jpeg        | 99.95%      | -0.05%           | 0.78x    | 1.15x        |
-| fireworks.jpeg (200B) | 73.00%      | -6.16%           | 2.00x    | 1.56x        |
-| paper-100k.pdf        | 83.30%      | 0.99%            | 0.60x    | 0.89x        |
-| html_x_4              | 22.52%      | 77.11%           | 3.33x    | 1.04x        |
-| alice29.txt           | 57.88%      | 2.34%            | 1.03x    | 1.56x        |
-| asyoulik.txt          | 61.91%      | -2.77%           | 1.15x    | 1.79x        |
-| lcet10.txt            | 54.99%      | 5.96%            | 0.97x    | 1.29x        |
-| plrabn12.txt          | 66.26%      | 0.40%            | 1.11x    | 1.67x        |
-| geo.protodata         | 19.68%      | 19.91%           | 1.25x    | 1.11x        |
-| kppkn.gtb             | 37.72%      | 6.06%            | 0.88x    | 1.03x        |
-| alice29.txt (128B)    | 62.50%      | -2.50%           | 2.31x    | 1.22x        |
-| alice29.txt (1000B)   | 77.40%      | 0.00%            | 1.07x    | 1.09x        |
-| alice29.txt (10000B)  | 66.48%      | -4.29%           | 1.27x    | 1.43x        |
-| alice29.txt (20000B)  | 63.43%      | -7.00%           | 1.45x    | 1.93x        |
+| html                  | 22843       | 20868   | 16246 MB/s   | 18617 MB/s  | 40972 MB/s  | 49263 MB/s  |
+| urls.10K              | 335492      | 286541  | 7943 MB/s    | 10201 MB/s  | 22523 MB/s  | 26484 MB/s  |
+| fireworks.jpeg        | 123034      | 123100  | 349544 MB/s  | 303228 MB/s | 718321 MB/s | 827552 MB/s |
+| fireworks.jpeg (200B) | 146         | 155     | 8869 MB/s    | 20180 MB/s  | 33691 MB/s  | 52421 MB/s  |
+| paper-100k.pdf        | 85304       | 84202   | 167546 MB/s  | 112988 MB/s | 326905 MB/s | 291944 MB/s |
+| html_x_4              | 92234       | 20870   | 15194 MB/s   | 54457 MB/s  | 30843 MB/s  | 32217 MB/s  |
+| alice29.txt           | 88034       | 85934   | 5936 MB/s    | 6540 MB/s   | 12882 MB/s  | 20044 MB/s  |
+| asyoulik.txt          | 77503       | 79575   | 5517 MB/s    | 6657 MB/s   | 12735 MB/s  | 22806 MB/s  |
+| lcet10.txt            | 234661      | 220383  | 6235 MB/s    | 6303 MB/s   | 14519 MB/s  | 18697 MB/s  |
+| plrabn12.txt          | 319267      | 318196  | 5159 MB/s    | 6074 MB/s   | 11923 MB/s  | 19901 MB/s  |
+| geo.protodata         | 23335       | 18606   | 21220 MB/s   | 25432 MB/s  | 56271 MB/s  | 62540 MB/s  |
+| kppkn.gtb             | 69526       | 65019   | 9732 MB/s    | 8905 MB/s   | 18491 MB/s  | 18969 MB/s  |
+| alice29.txt (128B)    | 80          | 82      | 6691 MB/s    | 17179 MB/s  | 31883 MB/s  | 38874 MB/s  |
+| alice29.txt (1000B)   | 774         | 774     | 12204 MB/s   | 13273 MB/s  | 48056 MB/s  | 52341 MB/s  |
+| alice29.txt (10000B)  | 6648        | 6933    | 10044 MB/s   | 12824 MB/s  | 32378 MB/s  | 46322 MB/s  |
+| alice29.txt (20000B)  | 12686       | 13516   | 7733 MB/s    | 12160 MB/s  | 30566 MB/s  | 58969 MB/s  |
+
 
 Speed is generally at or above Snappy. Small blocks get a significant speedup, although at the expense of size. 
 
@@ -543,42 +524,23 @@ So individual benchmarks should only be seen as a guideline and the overall pict
 
 | Absolute Perf         | Snappy size | Better Size | Snappy Speed | Better Speed | Snappy dec  | Better dec  |
 |-----------------------|-------------|-------------|--------------|--------------|-------------|-------------|
-| html                  | 22843       | 19833       | 16246 MB/s   | 7731 MB/s    | 40972 MB/s  | 40292 MB/s  |
-| urls.10K              | 335492      | 253529      | 7943 MB/s    | 3980 MB/s    | 22523 MB/s  | 20981 MB/s  |
-| fireworks.jpeg        | 123034      | 123100      | 349544 MB/s  | 9760 MB/s    | 718321 MB/s | 823698 MB/s |
-| fireworks.jpeg (200B) | 146         | 142         | 8869 MB/s    | 594 MB/s     | 33691 MB/s  | 30101 MB/s  |
-| paper-100k.pdf        | 85304       | 82915       | 167546 MB/s  | 7470 MB/s    | 326905 MB/s | 198869 MB/s |
-| html_x_4              | 92234       | 19841       | 15194 MB/s   | 23403 MB/s   | 30843 MB/s  | 30937 MB/s  |
-| alice29.txt           | 88034       | 73218       | 5936 MB/s    | 2945 MB/s    | 12882 MB/s  | 16611 MB/s  |
-| asyoulik.txt          | 77503       | 66844       | 5517 MB/s    | 2739 MB/s    | 12735 MB/s  | 14975 MB/s  |
-| lcet10.txt            | 234661      | 190589      | 6235 MB/s    | 3099 MB/s    | 14519 MB/s  | 16634 MB/s  |
-| plrabn12.txt          | 319267      | 270828      | 5159 MB/s    | 2600 MB/s    | 11923 MB/s  | 13382 MB/s  |
-| geo.protodata         | 23335       | 18278       | 21220 MB/s   | 11208 MB/s   | 56271 MB/s  | 57961 MB/s  |
-| kppkn.gtb             | 69526       | 61851       | 9732 MB/s    | 4556 MB/s    | 18491 MB/s  | 16524 MB/s  |
-| alice29.txt (128B)    | 80          | 81          | 6691 MB/s    | 529 MB/s     | 31883 MB/s  | 34225 MB/s  |
-| alice29.txt (1000B)   | 774         | 748         | 12204 MB/s   | 1943 MB/s    | 48056 MB/s  | 42068 MB/s  |
-| alice29.txt (10000B)  | 6648        | 6234        | 10044 MB/s   | 2949 MB/s    | 32378 MB/s  | 28813 MB/s  |
-| alice29.txt (20000B)  | 12686       | 11584       | 7733 MB/s    | 2822 MB/s    | 30566 MB/s  | 27315 MB/s  |
-
-
-| Relative Perf         | Snappy size | Better size | Better Speed | Better dec |
-|-----------------------|-------------|-------------|--------------|------------|
-| html                  | 22.31%      | 13.18%      | 0.48x        | 0.98x      |
-| urls.10K              | 47.78%      | 24.43%      | 0.50x        | 0.93x      |
-| fireworks.jpeg        | 99.95%      | -0.05%      | 0.03x        | 1.15x      |
-| fireworks.jpeg (200B) | 73.00%      | 2.74%       | 0.07x        | 0.89x      |
-| paper-100k.pdf        | 83.30%      | 2.80%       | 0.07x        | 0.61x      |
-| html_x_4              | 22.52%      | 78.49%      | 0.04x        | 1.00x      |
-| alice29.txt           | 57.88%      | 16.83%      | 1.54x        | 1.29x      |
-| asyoulik.txt          | 61.91%      | 13.75%      | 0.50x        | 1.18x      |
-| lcet10.txt            | 54.99%      | 18.78%      | 0.50x        | 1.15x      |
-| plrabn12.txt          | 66.26%      | 15.17%      | 0.50x        | 1.12x      |
-| geo.protodata         | 19.68%      | 21.67%      | 0.50x        | 1.03x      |
-| kppkn.gtb             | 37.72%      | 11.04%      | 0.53x        | 0.89x      |
-| alice29.txt (128B)    | 62.50%      | -1.25%      | 0.47x        | 1.07x      |
-| alice29.txt (1000B)   | 77.40%      | 3.36%       | 0.08x        | 0.88x      |
-| alice29.txt (10000B)  | 66.48%      | 6.23%       | 0.16x        | 0.89x      |
-| alice29.txt (20000B)  | 63.43%      | 8.69%       | 0.29x        | 0.89x      |
+| html                  | 22843       | 18972       | 16246 MB/s   | 8621 MB/s    | 40972 MB/s  | 40292 MB/s  |
+| urls.10K              | 335492      | 248079      | 7943 MB/s    | 5104 MB/s    | 22523 MB/s  | 20981 MB/s  |
+| fireworks.jpeg        | 123034      | 123100      | 349544 MB/s  | 84429 MB/s   | 718321 MB/s | 823698 MB/s |
+| fireworks.jpeg (200B) | 146         | 149         | 8869 MB/s    | 7125 MB/s    | 33691 MB/s  | 30101 MB/s  |
+| paper-100k.pdf        | 85304       | 82887       | 167546 MB/s  | 11087 MB/s   | 326905 MB/s | 198869 MB/s |
+| html_x_4              | 92234       | 18982       | 15194 MB/s   | 29316 MB/s   | 30843 MB/s  | 30937 MB/s  |
+| alice29.txt           | 88034       | 71611       | 5936 MB/s    | 3709 MB/s    | 12882 MB/s  | 16611 MB/s  |
+| asyoulik.txt          | 77503       | 65941       | 5517 MB/s    | 3380 MB/s    | 12735 MB/s  | 14975 MB/s  |
+| lcet10.txt            | 234661      | 184939      | 6235 MB/s    | 3537 MB/s    | 14519 MB/s  | 16634 MB/s  |
+| plrabn12.txt          | 319267      | 264990      | 5159 MB/s    | 2960 MB/s    | 11923 MB/s  | 13382 MB/s  |
+| geo.protodata         | 23335       | 17689       | 21220 MB/s   | 10859 MB/s   | 56271 MB/s  | 57961 MB/s  |
+| kppkn.gtb             | 69526       | 55398       | 9732 MB/s    | 5206 MB/s    | 18491 MB/s  | 16524 MB/s  |
+| alice29.txt (128B)    | 80          | 78          | 6691 MB/s    | 7422 MB/s    | 31883 MB/s  | 34225 MB/s  |
+| alice29.txt (1000B)   | 774         | 746         | 12204 MB/s   | 5734 MB/s    | 48056 MB/s  | 42068 MB/s  |
+| alice29.txt (10000B)  | 6648        | 6218        | 10044 MB/s   | 6055 MB/s    | 32378 MB/s  | 28813 MB/s  |
+| alice29.txt (20000B)  | 12686       | 11492       | 7733 MB/s    | 3143 MB/s    | 30566 MB/s  | 27315 MB/s  |
+
 
 Except for the mostly incompressible JPEG image, compression is better and usually in the 
 double digits in terms of percentage reduction over Snappy.
@@ -605,29 +567,29 @@ Some examples compared on 16 core CPU, amd64 assembly used:
 
 ```
 * enwik10
-Default... 10000000000 -> 4761467548 [47.61%]; 1.098s, 8685.6MB/s
-Better...  10000000000 -> 4219438251 [42.19%]; 1.925s, 4954.2MB/s
-Best...    10000000000 -> 3627364337 [36.27%]; 43.051s, 221.5MB/s
+Default... 10000000000 -> 4759950115 [47.60%]; 1.03s, 9263.0MB/s
+Better...  10000000000 -> 4084706676 [40.85%]; 2.16s, 4415.4MB/s
+Best...    10000000000 -> 3615520079 [36.16%]; 42.259s, 225.7MB/s
 
 * github-june-2days-2019.json
-Default... 6273951764 -> 1043196283 [16.63%]; 431ms, 13882.3MB/s
-Better...  6273951764 -> 949146808 [15.13%]; 547ms, 10938.4MB/s
-Best...    6273951764 -> 832855506 [13.27%]; 9.455s, 632.8MB/s
+Default... 6273951764 -> 1041700255 [16.60%]; 431ms, 13882.3MB/s
+Better...  6273951764 -> 945841238 [15.08%]; 547ms, 10938.4MB/s
+Best...    6273951764 -> 826392576 [13.17%]; 9.455s, 632.8MB/s
 
 * nyc-taxi-data-10M.csv
-Default... 3325605752 -> 1095998837 [32.96%]; 324ms, 9788.7MB/s
-Better...  3325605752 -> 954776589 [28.71%]; 491ms, 6459.4MB/s
-Best...    3325605752 -> 779098746 [23.43%]; 8.29s, 382.6MB/s
+Default... 3325605752 -> 1093516949 [32.88%]; 324ms, 9788.7MB/s
+Better...  3325605752 -> 885394158 [26.62%]; 491ms, 6459.4MB/s
+Best...    3325605752 -> 773681257 [23.26%]; 8.29s, 412.0MB/s
 
 * 10gb.tar
-Default... 10065157632 -> 5916578242 [58.78%]; 1.028s, 9337.4MB/s
-Better...  10065157632 -> 5649207485 [56.13%]; 1.597s, 6010.6MB/s
-Best...    10065157632 -> 5208719802 [51.75%]; 32.78s, 292.8MB/
+Default... 10065157632 -> 5915541066 [58.77%]; 1.028s, 9337.4MB/s
+Better...  10065157632 -> 5453844650 [54.19%]; 1.597s, 4862.7MB/s
+Best...    10065157632 -> 5192495021 [51.59%]; 32.78s, 308.2MB/s
 
 * consensus.db.10gb
-Default... 10737418240 -> 4562648848 [42.49%]; 882ms, 11610.0MB/s
-Better...  10737418240 -> 4542428129 [42.30%]; 1.533s, 6679.7MB/s
-Best...    10737418240 -> 4244773384 [39.53%]; 42.96s, 238.4MB/s
+Default... 10737418240 -> 4549762344 [42.37%]; 882ms, 12118.4MB/s
+Better...  10737418240 -> 4438535064 [41.34%]; 1.533s, 3500.9MB/s
+Best...    10737418240 -> 4210602774 [39.21%]; 42.96s, 254.4MB/s
 ```
 
 Decompression speed should be around the same as using the 'better' compression mode. 
@@ -648,10 +610,10 @@ If you would like more control, you can use the s2 package as described below:
 Snappy compatible blocks can be generated with the S2 encoder. 
 Compression and speed are typically a bit better, and `MaxEncodedLen` is also smaller for smaller memory usage. Replace 
 
-| Snappy                     | S2 replacement          |
-|----------------------------|-------------------------|
-| snappy.Encode(...)         | s2.EncodeSnappy(...)   |
-| snappy.MaxEncodedLen(...)  | s2.MaxEncodedLen(...)   |
+| Snappy                    | S2 replacement        |
+|---------------------------|-----------------------|
+| snappy.Encode(...)        | s2.EncodeSnappy(...)  |
+| snappy.MaxEncodedLen(...) | s2.MaxEncodedLen(...) |
 
 `s2.EncodeSnappy` can be replaced with `s2.EncodeSnappyBetter` or `s2.EncodeSnappyBest` to get more efficiently compressed snappy compatible output. 
 
@@ -660,12 +622,12 @@ Compression and speed is typically a bit better `MaxEncodedLen` is also smaller
 Comparison of [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z),
 53927 files, total input size: 4,014,735,833 bytes. amd64, single goroutine used:
 
-| Encoder               | Size       | MB/s       | Reduction |
-|-----------------------|------------|------------|------------
-| snappy.Encode         | 1128706759 | 725.59     | 71.89%    |
-| s2.EncodeSnappy       | 1093823291 | **899.16** | 72.75%    |
-| s2.EncodeSnappyBetter | 1001158548 | 578.49     | 75.06%    |
-| s2.EncodeSnappyBest   | 944507998  | 66.00      | **76.47%**|
+| Encoder               | Size       | MB/s       | Reduction  |
+|-----------------------|------------|------------|------------|
+| snappy.Encode         | 1128706759 | 725.59     | 71.89%     |
+| s2.EncodeSnappy       | 1093823291 | **899.16** | 72.75%     |
+| s2.EncodeSnappyBetter | 1001158548 | 578.49     | 75.06%     |
+| s2.EncodeSnappyBest   | 944507998  | 66.00      | **76.47%** |
 
 ## Streams
 
@@ -835,6 +797,13 @@ This is done using the regular "Skip" function:
 
 This will ensure that we are at exactly the offset we want, and reading from `dec` will start at the requested offset.
 
+# Compact storage
+
+For compact storage [RemoveIndexHeaders](https://pkg.go.dev/github.com/klauspost/compress/s2#RemoveIndexHeaders) can be used to remove any redundant info from 
+a serialized index. If you remove the header it must be restored before [Loading](https://pkg.go.dev/github.com/klauspost/compress/s2#Index.Load).
+
+This is expected to save 20 bytes. These can be restored using [RestoreIndexHeaders](https://pkg.go.dev/github.com/klauspost/compress/s2#RestoreIndexHeaders). This removes a layer of security, but is the most compact representation. Returns nil if the headers contain errors.
+
 ## Index Format:
 
 Each block is structured as a snappy skippable block, with the chunk ID 0x99.
@@ -844,20 +813,20 @@ The block can be read from the front, but contains information so it can be read
 Numbers are stored as fixed size little endian values or [zigzag encoded](https://developers.google.com/protocol-buffers/docs/encoding#signed_integers) [base 128 varints](https://developers.google.com/protocol-buffers/docs/encoding), 
 with un-encoded value length of 64 bits, unless other limits are specified. 
 
-| Content                                                                   | Format                                                                                                                      |
-|---------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------|
-| ID, `[1]byte`                                                           | Always 0x99.                                                                                                                  |
-| Data Length, `[3]byte`                                                  | 3 byte little-endian length of the chunk in bytes, following this.                                                            |
-| Header `[6]byte`                                                        | Header, must be `[115, 50, 105, 100, 120, 0]` or in text: "s2idx\x00".                                                        |
-| UncompressedSize, Varint                                                | Total Uncompressed size.                                                                                                      |
-| CompressedSize, Varint                                                  | Total Compressed size if known. Should be -1 if unknown.                                                                      |
-| EstBlockSize, Varint                                                    | Block Size, used for guessing uncompressed offsets. Must be >= 0.                                                             |
-| Entries, Varint                                                         | Number of Entries in index, must be < 65536 and >=0.                                                                          |
-| HasUncompressedOffsets `byte`                                           | 0 if no uncompressed offsets are present, 1 if present. Other values are invalid.                                             |
-| UncompressedOffsets, [Entries]VarInt                                    | Uncompressed offsets. See below how to decode.                                                                                |
-| CompressedOffsets, [Entries]VarInt                                      | Compressed offsets. See below how to decode.                                                                                  |
-| Block Size, `[4]byte`                                                   | Little Endian total encoded size (including header and trailer). Can be used for searching backwards to start of block.       |
-| Trailer `[6]byte`                                                       | Trailer, must be `[0, 120, 100, 105, 50, 115]` or in text: "\x00xdi2s". Can be used for identifying block from end of stream. |
+| Content                              | Format                                                                                                                        |
+|--------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|
+| ID, `[1]byte`                        | Always 0x99.                                                                                                                  |
+| Data Length, `[3]byte`               | 3 byte little-endian length of the chunk in bytes, following this.                                                            |
+| Header `[6]byte`                     | Header, must be `[115, 50, 105, 100, 120, 0]` or in text: "s2idx\x00".                                                        |
+| UncompressedSize, Varint             | Total Uncompressed size.                                                                                                      |
+| CompressedSize, Varint               | Total Compressed size if known. Should be -1 if unknown.                                                                      |
+| EstBlockSize, Varint                 | Block Size, used for guessing uncompressed offsets. Must be >= 0.                                                             |
+| Entries, Varint                      | Number of Entries in index, must be < 65536 and >=0.                                                                          |
+| HasUncompressedOffsets `byte`        | 0 if no uncompressed offsets are present, 1 if present. Other values are invalid.                                             |
+| UncompressedOffsets, [Entries]VarInt | Uncompressed offsets. See below how to decode.                                                                                |
+| CompressedOffsets, [Entries]VarInt   | Compressed offsets. See below how to decode.                                                                                  |
+| Block Size, `[4]byte`                | Little Endian total encoded size (including header and trailer). Can be used for searching backwards to start of block.       |
+| Trailer `[6]byte`                    | Trailer, must be `[0, 120, 100, 105, 50, 115]` or in text: "\x00xdi2s". Can be used for identifying block from end of stream. |
 
 For regular streams the uncompressed offsets are fully predictable,
 so `HasUncompressedOffsets` allows to specify that compressed blocks all have 
@@ -929,6 +898,7 @@ To decode from any given uncompressed offset `(wantOffset)`:
 
 See [using indexes](https://github.com/klauspost/compress/tree/master/s2#using-indexes) for functions that perform the operations with a simpler interface.
 
+
 # Format Extensions
 
 * Frame [Stream identifier](https://github.com/google/snappy/blob/master/framing_format.txt#L68) changed from `sNaPpY` to `S2sTwO`.
@@ -951,10 +921,11 @@ The length is specified by reading the 3-bit length specified in the tag and dec
 | 7      | 65540 + read 3 bytes |
 
 This allows any repeat offset + length to be represented by 2 to 5 bytes.
+It also allows matches longer than 64 bytes to be emitted with one copy + one repeat instead of several 64 byte copies.
 
 Lengths are stored as little endian values.
 
-The first copy of a block cannot be a repeat offset and the offset is not carried across blocks in streams.
+The first copy of a block cannot be a repeat offset and the offset is reset on every block in streams.
 
 Default streaming block size is 1MB.
 
diff --git a/vendor/github.com/klauspost/compress/s2/decode.go b/vendor/github.com/klauspost/compress/s2/decode.go
index 27c0f3c2c..00c5cc72c 100644
--- a/vendor/github.com/klauspost/compress/s2/decode.go
+++ b/vendor/github.com/klauspost/compress/s2/decode.go
@@ -952,7 +952,11 @@ func (r *Reader) ReadSeeker(random bool, index []byte) (*ReadSeeker, error) {
 // Seek allows seeking in compressed data.
 func (r *ReadSeeker) Seek(offset int64, whence int) (int64, error) {
 	if r.err != nil {
-		return 0, r.err
+		if !errors.Is(r.err, io.EOF) {
+			return 0, r.err
+		}
+		// Reset on EOF
+		r.err = nil
 	}
 	if offset == 0 && whence == io.SeekCurrent {
 		return r.blockStart + int64(r.i), nil
diff --git a/vendor/github.com/klauspost/compress/s2/decode_other.go b/vendor/github.com/klauspost/compress/s2/decode_other.go
index 1074ebd21..11300c3a8 100644
--- a/vendor/github.com/klauspost/compress/s2/decode_other.go
+++ b/vendor/github.com/klauspost/compress/s2/decode_other.go
@@ -28,6 +28,9 @@ func s2Decode(dst, src []byte) int {
 
 	// As long as we can read at least 5 bytes...
 	for s < len(src)-5 {
+		// Eliminating bounds checks here is SLOWER when done by slicing up front:
+		// in := src[s:s+5]
+		// Checked on Go 1.18
 		switch src[s] & 0x03 {
 		case tagLiteral:
 			x := uint32(src[s] >> 2)
@@ -38,14 +41,19 @@ func s2Decode(dst, src []byte) int {
 				s += 2
 				x = uint32(src[s-1])
 			case x == 61:
+				in := src[s : s+3]
+				x = uint32(in[1]) | uint32(in[2])<<8
 				s += 3
-				x = uint32(src[s-2]) | uint32(src[s-1])<<8
 			case x == 62:
+				in := src[s : s+4]
+				// Load as 32 bit and shift down.
+				x = uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24
+				x >>= 8
 				s += 4
-				x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
 			case x == 63:
+				in := src[s : s+5]
+				x = uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24
 				s += 5
-				x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
 			}
 			length = int(x) + 1
 			if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) {
@@ -62,8 +70,8 @@ func s2Decode(dst, src []byte) int {
 
 		case tagCopy1:
 			s += 2
-			length = int(src[s-2]) >> 2 & 0x7
 			toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
+			length = int(src[s-2]) >> 2 & 0x7
 			if toffset == 0 {
 				if debug {
 					fmt.Print("(repeat) ")
@@ -71,14 +79,16 @@ func s2Decode(dst, src []byte) int {
 				// keep last offset
 				switch length {
 				case 5:
+					length = int(src[s]) + 4
 					s += 1
-					length = int(uint32(src[s-1])) + 4
 				case 6:
+					in := src[s : s+2]
+					length = int(uint32(in[0])|(uint32(in[1])<<8)) + (1 << 8)
 					s += 2
-					length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8)
 				case 7:
+					in := src[s : s+3]
+					length = int((uint32(in[2])<<16)|(uint32(in[1])<<8)|uint32(in[0])) + (1 << 16)
 					s += 3
-					length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16)
 				default: // 0-> 4
 				}
 			} else {
@@ -86,14 +96,16 @@ func s2Decode(dst, src []byte) int {
 			}
 			length += 4
 		case tagCopy2:
+			in := src[s : s+3]
+			offset = int(uint32(in[1]) | uint32(in[2])<<8)
+			length = 1 + int(in[0])>>2
 			s += 3
-			length = 1 + int(src[s-3])>>2
-			offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
 
 		case tagCopy4:
+			in := src[s : s+5]
+			offset = int(uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24)
+			length = 1 + int(in[0])>>2
 			s += 5
-			length = 1 + int(src[s-5])>>2
-			offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
 		}
 
 		if offset <= 0 || d < offset || length > len(dst)-d {
diff --git a/vendor/github.com/klauspost/compress/s2/encode_all.go b/vendor/github.com/klauspost/compress/s2/encode_all.go
index 8b16c38a6..54c71d3b5 100644
--- a/vendor/github.com/klauspost/compress/s2/encode_all.go
+++ b/vendor/github.com/klauspost/compress/s2/encode_all.go
@@ -58,8 +58,9 @@ func encodeGo(dst, src []byte) []byte {
 // been written.
 //
 // It also assumes that:
+//
 //	len(dst) >= MaxEncodedLen(len(src)) &&
-// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
 func encodeBlockGo(dst, src []byte) (d int) {
 	// Initialize the hash table.
 	const (
diff --git a/vendor/github.com/klauspost/compress/s2/encode_amd64.go b/vendor/github.com/klauspost/compress/s2/encode_amd64.go
index e612225f4..6b93daa5a 100644
--- a/vendor/github.com/klauspost/compress/s2/encode_amd64.go
+++ b/vendor/github.com/klauspost/compress/s2/encode_amd64.go
@@ -8,8 +8,9 @@ package s2
 // been written.
 //
 // It also assumes that:
+//
 //	len(dst) >= MaxEncodedLen(len(src)) &&
-// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
 func encodeBlock(dst, src []byte) (d int) {
 	const (
 		// Use 12 bit table when less than...
@@ -43,8 +44,9 @@ func encodeBlock(dst, src []byte) (d int) {
 // been written.
 //
 // It also assumes that:
+//
 //	len(dst) >= MaxEncodedLen(len(src)) &&
-// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
 func encodeBlockBetter(dst, src []byte) (d int) {
 	const (
 		// Use 12 bit table when less than...
@@ -78,8 +80,9 @@ func encodeBlockBetter(dst, src []byte) (d int) {
 // been written.
 //
 // It also assumes that:
+//
 //	len(dst) >= MaxEncodedLen(len(src)) &&
-// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
 func encodeBlockSnappy(dst, src []byte) (d int) {
 	const (
 		// Use 12 bit table when less than...
@@ -112,8 +115,9 @@ func encodeBlockSnappy(dst, src []byte) (d int) {
 // been written.
 //
 // It also assumes that:
+//
 //	len(dst) >= MaxEncodedLen(len(src)) &&
-// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
 func encodeBlockBetterSnappy(dst, src []byte) (d int) {
 	const (
 		// Use 12 bit table when less than...
diff --git a/vendor/github.com/klauspost/compress/s2/encode_best.go b/vendor/github.com/klauspost/compress/s2/encode_best.go
index 4bc80bc6a..1b7ea394f 100644
--- a/vendor/github.com/klauspost/compress/s2/encode_best.go
+++ b/vendor/github.com/klauspost/compress/s2/encode_best.go
@@ -15,8 +15,9 @@ import (
 // been written.
 //
 // It also assumes that:
+//
 //	len(dst) >= MaxEncodedLen(len(src)) &&
-// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
 func encodeBlockBest(dst, src []byte) (d int) {
 	// Initialize the hash tables.
 	const (
@@ -176,14 +177,21 @@ func encodeBlockBest(dst, src []byte) (d int) {
 						best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false))
 					}
 					// Search for a match at best match end, see if that is better.
-					if sAt := best.s + best.length; sAt < sLimit {
-						sBack := best.s
-						backL := best.length
+					// Allow some bytes at the beginning to mismatch.
+					// Sweet spot is around 1-2 bytes, but depends on input.
+					// The skipped bytes are tested when extending backwards,
+					// and are still picked up as part of the match if they match.
+					const skipBeginning = 2
+					const skipEnd = 1
+					if sAt := best.s + best.length - skipEnd; sAt < sLimit {
+
+						sBack := best.s + skipBeginning - skipEnd
+						backL := best.length - skipBeginning
 						// Load initial values
 						cv = load64(src, sBack)
-						// Search for mismatch
+
+						// Grab candidates...
 						next := lTable[hash8(load64(src, sAt), lTableBits)]
-						//next := sTable[hash4(load64(src, sAt), sTableBits)]
 
 						if checkAt := getCur(next) - backL; checkAt > 0 {
 							best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
@@ -191,6 +199,16 @@ func encodeBlockBest(dst, src []byte) (d int) {
 						if checkAt := getPrev(next) - backL; checkAt > 0 {
 							best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
 						}
+						// Disabled: Extremely small gain
+						if false {
+							next = sTable[hash4(load64(src, sAt), sTableBits)]
+							if checkAt := getCur(next) - backL; checkAt > 0 {
+								best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
+							}
+							if checkAt := getPrev(next) - backL; checkAt > 0 {
+								best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
+							}
+						}
 					}
 				}
 			}
@@ -288,8 +306,9 @@ emitRemainder:
 // been written.
 //
 // It also assumes that:
+//
 //	len(dst) >= MaxEncodedLen(len(src)) &&
-// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
 func encodeBlockBestSnappy(dst, src []byte) (d int) {
 	// Initialize the hash tables.
 	const (
@@ -546,6 +565,7 @@ emitRemainder:
 // emitCopySize returns the size to encode the offset+length
 //
 // It assumes that:
+//
 //	1 <= offset && offset <= math.MaxUint32
 //	4 <= length && length <= 1 << 24
 func emitCopySize(offset, length int) int {
@@ -584,6 +604,7 @@ func emitCopySize(offset, length int) int {
 // emitCopyNoRepeatSize returns the size to encode the offset+length
 //
 // It assumes that:
+//
 //	1 <= offset && offset <= math.MaxUint32
 //	4 <= length && length <= 1 << 24
 func emitCopyNoRepeatSize(offset, length int) int {
diff --git a/vendor/github.com/klauspost/compress/s2/encode_better.go b/vendor/github.com/klauspost/compress/s2/encode_better.go
index 943215b8a..3b66ba42b 100644
--- a/vendor/github.com/klauspost/compress/s2/encode_better.go
+++ b/vendor/github.com/klauspost/compress/s2/encode_better.go
@@ -42,8 +42,9 @@ func hash8(u uint64, h uint8) uint32 {
 // been written.
 //
 // It also assumes that:
+//
 //	len(dst) >= MaxEncodedLen(len(src)) &&
-// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
 func encodeBlockBetterGo(dst, src []byte) (d int) {
 	// sLimit is when to stop looking for offset/length copies. The inputMargin
 	// lets us use a fast path for emitLiteral in the main loop, while we are
@@ -56,7 +57,7 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
 	// Initialize the hash tables.
 	const (
 		// Long hash matches.
-		lTableBits    = 16
+		lTableBits    = 17
 		maxLTableSize = 1 << lTableBits
 
 		// Short hash matches.
@@ -97,9 +98,26 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
 			lTable[hashL] = uint32(s)
 			sTable[hashS] = uint32(s)
 
+			valLong := load64(src, candidateL)
+			valShort := load64(src, candidateS)
+
+			// If long matches at least 8 bytes, use that.
+			if cv == valLong {
+				break
+			}
+			if cv == valShort {
+				candidateL = candidateS
+				break
+			}
+
 			// Check repeat at offset checkRep.
 			const checkRep = 1
-			if false && uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
+			// Minimum length of a repeat. Tested with various values.
+			// While 4-5 offers improvements in some, 6 reduces
+			// regressions significantly.
+			const wantRepeatBytes = 6
+			const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
+			if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask {
 				base := s + checkRep
 				// Extend back
 				for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
@@ -109,8 +127,8 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
 				d += emitLiteral(dst[d:], src[nextEmit:base])
 
 				// Extend forward
-				candidate := s - repeat + 4 + checkRep
-				s += 4 + checkRep
+				candidate := s - repeat + wantRepeatBytes + checkRep
+				s += wantRepeatBytes + checkRep
 				for s < len(src) {
 					if len(src)-s < 8 {
 						if src[s] == src[candidate] {
@@ -127,28 +145,40 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
 					s += 8
 					candidate += 8
 				}
-				if nextEmit > 0 {
-					// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
-					d += emitRepeat(dst[d:], repeat, s-base)
-				} else {
-					// First match, cannot be repeat.
-					d += emitCopy(dst[d:], repeat, s-base)
-				}
+				// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
+				d += emitRepeat(dst[d:], repeat, s-base)
 				nextEmit = s
 				if s >= sLimit {
 					goto emitRemainder
 				}
+				// Index in-between
+				index0 := base + 1
+				index1 := s - 2
+
+				cv = load64(src, s)
+				for index0 < index1 {
+					cv0 := load64(src, index0)
+					cv1 := load64(src, index1)
+					lTable[hash7(cv0, lTableBits)] = uint32(index0)
+					sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
+
+					lTable[hash7(cv1, lTableBits)] = uint32(index1)
+					sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
+					index0 += 2
+					index1 -= 2
+				}
 
 				cv = load64(src, s)
 				continue
 			}
 
-			if uint32(cv) == load32(src, candidateL) {
+			// Long likely matches 7, so take that.
+			if uint32(cv) == uint32(valLong) {
 				break
 			}
 
 			// Check our short candidate
-			if uint32(cv) == load32(src, candidateS) {
+			if uint32(cv) == uint32(valShort) {
 				// Try a long candidate at s+1
 				hashL = hash7(cv>>8, lTableBits)
 				candidateL = int(lTable[hashL])
@@ -227,21 +257,29 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
 			// Do we have space for more, if not bail.
 			return 0
 		}
-		// Index match start+1 (long) and start+2 (short)
+
+		// Index short & long
 		index0 := base + 1
-		// Index match end-2 (long) and end-1 (short)
 		index1 := s - 2
 
 		cv0 := load64(src, index0)
 		cv1 := load64(src, index1)
-		cv = load64(src, s)
 		lTable[hash7(cv0, lTableBits)] = uint32(index0)
-		lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1)
-		lTable[hash7(cv1, lTableBits)] = uint32(index1)
-		lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1)
 		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
-		sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2)
+
+		lTable[hash7(cv1, lTableBits)] = uint32(index1)
 		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
+		index0 += 1
+		index1 -= 1
+		cv = load64(src, s)
+
+		// index every second long in between.
+		for index0 < index1 {
+			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
+			lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
+			index0 += 2
+			index1 -= 2
+		}
 	}
 
 emitRemainder:
@@ -260,8 +298,9 @@ emitRemainder:
 // been written.
 //
 // It also assumes that:
+//
 //	len(dst) >= MaxEncodedLen(len(src)) &&
-// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
 func encodeBlockBetterSnappyGo(dst, src []byte) (d int) {
 	// sLimit is when to stop looking for offset/length copies. The inputMargin
 	// lets us use a fast path for emitLiteral in the main loop, while we are
@@ -402,21 +441,29 @@ func encodeBlockBetterSnappyGo(dst, src []byte) (d int) {
 			// Do we have space for more, if not bail.
 			return 0
 		}
-		// Index match start+1 (long) and start+2 (short)
+
+		// Index short & long
 		index0 := base + 1
-		// Index match end-2 (long) and end-1 (short)
 		index1 := s - 2
 
 		cv0 := load64(src, index0)
 		cv1 := load64(src, index1)
-		cv = load64(src, s)
 		lTable[hash7(cv0, lTableBits)] = uint32(index0)
-		lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1)
-		lTable[hash7(cv1, lTableBits)] = uint32(index1)
-		lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1)
 		sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
-		sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2)
+
+		lTable[hash7(cv1, lTableBits)] = uint32(index1)
 		sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
+		index0 += 1
+		index1 -= 1
+		cv = load64(src, s)
+
+		// index every second long in between.
+		for index0 < index1 {
+			lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
+			lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
+			index0 += 2
+			index1 -= 2
+		}
 	}
 
 emitRemainder:
diff --git a/vendor/github.com/klauspost/compress/s2/encode_go.go b/vendor/github.com/klauspost/compress/s2/encode_go.go
index 94784b82a..db08fc355 100644
--- a/vendor/github.com/klauspost/compress/s2/encode_go.go
+++ b/vendor/github.com/klauspost/compress/s2/encode_go.go
@@ -12,6 +12,7 @@ import (
 // been written.
 //
 // It also assumes that:
+//
 //	len(dst) >= MaxEncodedLen(len(src))
 func encodeBlock(dst, src []byte) (d int) {
 	if len(src) < minNonLiteralBlockSize {
@@ -25,6 +26,7 @@ func encodeBlock(dst, src []byte) (d int) {
 // been written.
 //
 // It also assumes that:
+//
 //	len(dst) >= MaxEncodedLen(len(src))
 func encodeBlockBetter(dst, src []byte) (d int) {
 	return encodeBlockBetterGo(dst, src)
@@ -35,6 +37,7 @@ func encodeBlockBetter(dst, src []byte) (d int) {
 // been written.
 //
 // It also assumes that:
+//
 //	len(dst) >= MaxEncodedLen(len(src))
 func encodeBlockBetterSnappy(dst, src []byte) (d int) {
 	return encodeBlockBetterSnappyGo(dst, src)
@@ -45,6 +48,7 @@ func encodeBlockBetterSnappy(dst, src []byte) (d int) {
 // been written.
 //
 // It also assumes that:
+//
 //	len(dst) >= MaxEncodedLen(len(src))
 func encodeBlockSnappy(dst, src []byte) (d int) {
 	if len(src) < minNonLiteralBlockSize {
@@ -56,6 +60,7 @@ func encodeBlockSnappy(dst, src []byte) (d int) {
 // emitLiteral writes a literal chunk and returns the number of bytes written.
 //
 // It assumes that:
+//
 //	dst is long enough to hold the encoded bytes
 //	0 <= len(lit) && len(lit) <= math.MaxUint32
 func emitLiteral(dst, lit []byte) int {
@@ -146,6 +151,7 @@ func emitRepeat(dst []byte, offset, length int) int {
 // emitCopy writes a copy chunk and returns the number of bytes written.
 //
 // It assumes that:
+//
 //	dst is long enough to hold the encoded bytes
 //	1 <= offset && offset <= math.MaxUint32
 //	4 <= length && length <= 1 << 24
@@ -214,6 +220,7 @@ func emitCopy(dst []byte, offset, length int) int {
 // emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.
 //
 // It assumes that:
+//
 //	dst is long enough to hold the encoded bytes
 //	1 <= offset && offset <= math.MaxUint32
 //	4 <= length && length <= 1 << 24
@@ -273,8 +280,8 @@ func emitCopyNoRepeat(dst []byte, offset, length int) int {
 // matchLen returns how many bytes match in a and b
 //
 // It assumes that:
-//   len(a) <= len(b)
 //
+//	len(a) <= len(b)
 func matchLen(a []byte, b []byte) int {
 	b = b[:len(a)]
 	var checked int
diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go
index 88f27c099..7e00bac3e 100644
--- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go
+++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go
@@ -1,7 +1,6 @@
 // Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.
 
 //go:build !appengine && !noasm && gc && !noasm
-// +build !appengine,!noasm,gc,!noasm
 
 package s2
 
@@ -150,8 +149,9 @@ func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int
 // emitLiteral writes a literal chunk and returns the number of bytes written.
 //
 // It assumes that:
-//   dst is long enough to hold the encoded bytes with margin of 0 bytes
-//   0 <= len(lit) && len(lit) <= math.MaxUint32
+//
+//	dst is long enough to hold the encoded bytes with margin of 0 bytes
+//	0 <= len(lit) && len(lit) <= math.MaxUint32
 //
 //go:noescape
 func emitLiteral(dst []byte, lit []byte) int
@@ -165,9 +165,10 @@ func emitRepeat(dst []byte, offset int, length int) int
 // emitCopy writes a copy chunk and returns the number of bytes written.
 //
 // It assumes that:
-//   dst is long enough to hold the encoded bytes
-//   1 <= offset && offset <= math.MaxUint32
-//   4 <= length && length <= 1 << 24
+//
+//	dst is long enough to hold the encoded bytes
+//	1 <= offset && offset <= math.MaxUint32
+//	4 <= length && length <= 1 << 24
 //
 //go:noescape
 func emitCopy(dst []byte, offset int, length int) int
@@ -175,9 +176,10 @@ func emitCopy(dst []byte, offset int, length int) int
 // emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.
 //
 // It assumes that:
-//   dst is long enough to hold the encoded bytes
-//   1 <= offset && offset <= math.MaxUint32
-//   4 <= length && length <= 1 << 24
+//
+//	dst is long enough to hold the encoded bytes
+//	1 <= offset && offset <= math.MaxUint32
+//	4 <= length && length <= 1 << 24
 //
 //go:noescape
 func emitCopyNoRepeat(dst []byte, offset int, length int) int
@@ -185,7 +187,8 @@ func emitCopyNoRepeat(dst []byte, offset int, length int) int
 // matchLen returns how many bytes match in a and b
 //
 // It assumes that:
-//   len(a) <= len(b)
+//
+//	len(a) <= len(b)
 //
 //go:noescape
 func matchLen(a []byte, b []byte) int
diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
index 36915d949..81a487d6d 100644
--- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
+++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
@@ -1,7 +1,6 @@
 // Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.
 
 //go:build !appengine && !noasm && gc && !noasm
-// +build !appengine,!noasm,gc,!noasm
 
 #include "textflag.h"
 
@@ -5743,9 +5742,9 @@ emit_literal_done_emit_remainder_encodeBlockAsm8B:
 
 // func encodeBetterBlockAsm(dst []byte, src []byte) int
 // Requires: BMI, SSE2
-TEXT ·encodeBetterBlockAsm(SB), $327704-56
+TEXT ·encodeBetterBlockAsm(SB), $589848-56
 	MOVQ dst_base+0(FP), AX
-	MOVQ $0x00000a00, CX
+	MOVQ $0x00001200, CX
 	LEAQ 24(SP), DX
 	PXOR X0, X0
 
@@ -5797,27 +5796,37 @@ check_maxskip_cont_encodeBetterBlockAsm:
 	MOVQ  DI, R11
 	SHLQ  $0x08, R10
 	IMULQ R9, R10
-	SHRQ  $0x30, R10
+	SHRQ  $0x2f, R10
 	SHLQ  $0x20, R11
 	IMULQ SI, R11
 	SHRQ  $0x32, R11
 	MOVL  24(SP)(R10*4), SI
-	MOVL  262168(SP)(R11*4), R8
+	MOVL  524312(SP)(R11*4), R8
 	MOVL  CX, 24(SP)(R10*4)
-	MOVL  CX, 262168(SP)(R11*4)
-	CMPL  (DX)(SI*1), DI
+	MOVL  CX, 524312(SP)(R11*4)
+	MOVQ  (DX)(SI*1), R10
+	MOVQ  (DX)(R8*1), R11
+	CMPQ  R10, DI
 	JEQ   candidate_match_encodeBetterBlockAsm
-	CMPL  (DX)(R8*1), DI
-	JEQ   candidateS_match_encodeBetterBlockAsm
-	MOVL  20(SP), CX
-	JMP   search_loop_encodeBetterBlockAsm
+	CMPQ  R11, DI
+	JNE   no_short_found_encodeBetterBlockAsm
+	MOVL  R8, SI
+	JMP   candidate_match_encodeBetterBlockAsm
+
+no_short_found_encodeBetterBlockAsm:
+	CMPL R10, DI
+	JEQ  candidate_match_encodeBetterBlockAsm
+	CMPL R11, DI
+	JEQ  candidateS_match_encodeBetterBlockAsm
+	MOVL 20(SP), CX
+	JMP  search_loop_encodeBetterBlockAsm
 
 candidateS_match_encodeBetterBlockAsm:
 	SHRQ  $0x08, DI
 	MOVQ  DI, R10
 	SHLQ  $0x08, R10
 	IMULQ R9, R10
-	SHRQ  $0x30, R10
+	SHRQ  $0x2f, R10
 	MOVL  24(SP)(R10*4), SI
 	INCL  CX
 	MOVL  CX, 24(SP)(R10*4)
@@ -6590,52 +6599,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm:
 match_nolit_dst_ok_encodeBetterBlockAsm:
 	MOVQ  $0x00cf1bbcdcbfa563, SI
 	MOVQ  $0x9e3779b1, R8
-	INCL  DI
-	MOVQ  (DX)(DI*1), R9
-	MOVQ  R9, R10
-	MOVQ  R9, R11
-	MOVQ  R9, R12
-	SHRQ  $0x08, R11
-	MOVQ  R11, R13
-	SHRQ  $0x10, R12
-	LEAL  1(DI), R14
-	LEAL  2(DI), R15
-	MOVQ  -2(DX)(CX*1), R9
+	LEAQ  1(DI), DI
+	LEAQ  -2(CX), R9
+	MOVQ  (DX)(DI*1), R10
+	MOVQ  1(DX)(DI*1), R11
+	MOVQ  (DX)(R9*1), R12
+	MOVQ  1(DX)(R9*1), R13
 	SHLQ  $0x08, R10
 	IMULQ SI, R10
-	SHRQ  $0x30, R10
-	SHLQ  $0x08, R13
-	IMULQ SI, R13
-	SHRQ  $0x30, R13
+	SHRQ  $0x2f, R10
 	SHLQ  $0x20, R11
 	IMULQ R8, R11
 	SHRQ  $0x32, R11
-	SHLQ  $0x20, R12
-	IMULQ R8, R12
-	SHRQ  $0x32, R12
+	SHLQ  $0x08, R12
+	IMULQ SI, R12
+	SHRQ  $0x2f, R12
+	SHLQ  $0x20, R13
+	IMULQ R8, R13
+	SHRQ  $0x32, R13
+	LEAQ  1(DI), R8
+	LEAQ  1(R9), R14
 	MOVL  DI, 24(SP)(R10*4)
-	MOVL  R14, 24(SP)(R13*4)
-	MOVL  R14, 262168(SP)(R11*4)
-	MOVL  R15, 262168(SP)(R12*4)
-	MOVQ  R9, R10
-	MOVQ  R9, R11
-	SHRQ  $0x08, R11
-	MOVQ  R11, R13
-	LEAL  -2(CX), R9
-	LEAL  -1(CX), DI
+	MOVL  R9, 24(SP)(R12*4)
+	MOVL  R8, 524312(SP)(R11*4)
+	MOVL  R14, 524312(SP)(R13*4)
+	ADDQ  $0x01, DI
+	SUBQ  $0x01, R9
+
+index_loop_encodeBetterBlockAsm:
+	CMPQ  DI, R9
+	JAE   search_loop_encodeBetterBlockAsm
+	MOVQ  (DX)(DI*1), R8
+	MOVQ  (DX)(R9*1), R10
+	SHLQ  $0x08, R8
+	IMULQ SI, R8
+	SHRQ  $0x2f, R8
 	SHLQ  $0x08, R10
 	IMULQ SI, R10
-	SHRQ  $0x30, R10
-	SHLQ  $0x20, R11
-	IMULQ R8, R11
-	SHRQ  $0x32, R11
-	SHLQ  $0x08, R13
-	IMULQ SI, R13
-	SHRQ  $0x30, R13
+	SHRQ  $0x2f, R10
+	MOVL  DI, 24(SP)(R8*4)
 	MOVL  R9, 24(SP)(R10*4)
-	MOVL  DI, 262168(SP)(R11*4)
-	MOVL  DI, 24(SP)(R13*4)
-	JMP   search_loop_encodeBetterBlockAsm
+	ADDQ  $0x02, DI
+	SUBQ  $0x02, R9
+	JMP   index_loop_encodeBetterBlockAsm
 
 emit_remainder_encodeBetterBlockAsm:
 	MOVQ src_len+32(FP), CX
@@ -6815,9 +6821,9 @@ emit_literal_done_emit_remainder_encodeBetterBlockAsm:
 
 // func encodeBetterBlockAsm4MB(dst []byte, src []byte) int
 // Requires: BMI, SSE2
-TEXT ·encodeBetterBlockAsm4MB(SB), $327704-56
+TEXT ·encodeBetterBlockAsm4MB(SB), $589848-56
 	MOVQ dst_base+0(FP), AX
-	MOVQ $0x00000a00, CX
+	MOVQ $0x00001200, CX
 	LEAQ 24(SP), DX
 	PXOR X0, X0
 
@@ -6869,27 +6875,37 @@ check_maxskip_cont_encodeBetterBlockAsm4MB:
 	MOVQ  DI, R11
 	SHLQ  $0x08, R10
 	IMULQ R9, R10
-	SHRQ  $0x30, R10
+	SHRQ  $0x2f, R10
 	SHLQ  $0x20, R11
 	IMULQ SI, R11
 	SHRQ  $0x32, R11
 	MOVL  24(SP)(R10*4), SI
-	MOVL  262168(SP)(R11*4), R8
+	MOVL  524312(SP)(R11*4), R8
 	MOVL  CX, 24(SP)(R10*4)
-	MOVL  CX, 262168(SP)(R11*4)
-	CMPL  (DX)(SI*1), DI
+	MOVL  CX, 524312(SP)(R11*4)
+	MOVQ  (DX)(SI*1), R10
+	MOVQ  (DX)(R8*1), R11
+	CMPQ  R10, DI
 	JEQ   candidate_match_encodeBetterBlockAsm4MB
-	CMPL  (DX)(R8*1), DI
-	JEQ   candidateS_match_encodeBetterBlockAsm4MB
-	MOVL  20(SP), CX
-	JMP   search_loop_encodeBetterBlockAsm4MB
+	CMPQ  R11, DI
+	JNE   no_short_found_encodeBetterBlockAsm4MB
+	MOVL  R8, SI
+	JMP   candidate_match_encodeBetterBlockAsm4MB
+
+no_short_found_encodeBetterBlockAsm4MB:
+	CMPL R10, DI
+	JEQ  candidate_match_encodeBetterBlockAsm4MB
+	CMPL R11, DI
+	JEQ  candidateS_match_encodeBetterBlockAsm4MB
+	MOVL 20(SP), CX
+	JMP  search_loop_encodeBetterBlockAsm4MB
 
 candidateS_match_encodeBetterBlockAsm4MB:
 	SHRQ  $0x08, DI
 	MOVQ  DI, R10
 	SHLQ  $0x08, R10
 	IMULQ R9, R10
-	SHRQ  $0x30, R10
+	SHRQ  $0x2f, R10
 	MOVL  24(SP)(R10*4), SI
 	INCL  CX
 	MOVL  CX, 24(SP)(R10*4)
@@ -7600,52 +7616,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB:
 match_nolit_dst_ok_encodeBetterBlockAsm4MB:
 	MOVQ  $0x00cf1bbcdcbfa563, SI
 	MOVQ  $0x9e3779b1, R8
-	INCL  DI
-	MOVQ  (DX)(DI*1), R9
-	MOVQ  R9, R10
-	MOVQ  R9, R11
-	MOVQ  R9, R12
-	SHRQ  $0x08, R11
-	MOVQ  R11, R13
-	SHRQ  $0x10, R12
-	LEAL  1(DI), R14
-	LEAL  2(DI), R15
-	MOVQ  -2(DX)(CX*1), R9
+	LEAQ  1(DI), DI
+	LEAQ  -2(CX), R9
+	MOVQ  (DX)(DI*1), R10
+	MOVQ  1(DX)(DI*1), R11
+	MOVQ  (DX)(R9*1), R12
+	MOVQ  1(DX)(R9*1), R13
 	SHLQ  $0x08, R10
 	IMULQ SI, R10
-	SHRQ  $0x30, R10
-	SHLQ  $0x08, R13
-	IMULQ SI, R13
-	SHRQ  $0x30, R13
+	SHRQ  $0x2f, R10
 	SHLQ  $0x20, R11
 	IMULQ R8, R11
 	SHRQ  $0x32, R11
-	SHLQ  $0x20, R12
-	IMULQ R8, R12
-	SHRQ  $0x32, R12
+	SHLQ  $0x08, R12
+	IMULQ SI, R12
+	SHRQ  $0x2f, R12
+	SHLQ  $0x20, R13
+	IMULQ R8, R13
+	SHRQ  $0x32, R13
+	LEAQ  1(DI), R8
+	LEAQ  1(R9), R14
 	MOVL  DI, 24(SP)(R10*4)
-	MOVL  R14, 24(SP)(R13*4)
-	MOVL  R14, 262168(SP)(R11*4)
-	MOVL  R15, 262168(SP)(R12*4)
-	MOVQ  R9, R10
-	MOVQ  R9, R11
-	SHRQ  $0x08, R11
-	MOVQ  R11, R13
-	LEAL  -2(CX), R9
-	LEAL  -1(CX), DI
+	MOVL  R9, 24(SP)(R12*4)
+	MOVL  R8, 524312(SP)(R11*4)
+	MOVL  R14, 524312(SP)(R13*4)
+	ADDQ  $0x01, DI
+	SUBQ  $0x01, R9
+
+index_loop_encodeBetterBlockAsm4MB:
+	CMPQ  DI, R9
+	JAE   search_loop_encodeBetterBlockAsm4MB
+	MOVQ  (DX)(DI*1), R8
+	MOVQ  (DX)(R9*1), R10
+	SHLQ  $0x08, R8
+	IMULQ SI, R8
+	SHRQ  $0x2f, R8
 	SHLQ  $0x08, R10
 	IMULQ SI, R10
-	SHRQ  $0x30, R10
-	SHLQ  $0x20, R11
-	IMULQ R8, R11
-	SHRQ  $0x32, R11
-	SHLQ  $0x08, R13
-	IMULQ SI, R13
-	SHRQ  $0x30, R13
+	SHRQ  $0x2f, R10
+	MOVL  DI, 24(SP)(R8*4)
 	MOVL  R9, 24(SP)(R10*4)
-	MOVL  DI, 262168(SP)(R11*4)
-	MOVL  DI, 24(SP)(R13*4)
-	JMP   search_loop_encodeBetterBlockAsm4MB
+	ADDQ  $0x02, DI
+	SUBQ  $0x02, R9
+	JMP   index_loop_encodeBetterBlockAsm4MB
 
 emit_remainder_encodeBetterBlockAsm4MB:
 	MOVQ src_len+32(FP), CX
@@ -7871,12 +7884,22 @@ search_loop_encodeBetterBlockAsm12B:
 	MOVL  65560(SP)(R11*4), R8
 	MOVL  CX, 24(SP)(R10*4)
 	MOVL  CX, 65560(SP)(R11*4)
-	CMPL  (DX)(SI*1), DI
+	MOVQ  (DX)(SI*1), R10
+	MOVQ  (DX)(R8*1), R11
+	CMPQ  R10, DI
 	JEQ   candidate_match_encodeBetterBlockAsm12B
-	CMPL  (DX)(R8*1), DI
-	JEQ   candidateS_match_encodeBetterBlockAsm12B
-	MOVL  20(SP), CX
-	JMP   search_loop_encodeBetterBlockAsm12B
+	CMPQ  R11, DI
+	JNE   no_short_found_encodeBetterBlockAsm12B
+	MOVL  R8, SI
+	JMP   candidate_match_encodeBetterBlockAsm12B
+
+no_short_found_encodeBetterBlockAsm12B:
+	CMPL R10, DI
+	JEQ  candidate_match_encodeBetterBlockAsm12B
+	CMPL R11, DI
+	JEQ  candidateS_match_encodeBetterBlockAsm12B
+	MOVL 20(SP), CX
+	JMP  search_loop_encodeBetterBlockAsm12B
 
 candidateS_match_encodeBetterBlockAsm12B:
 	SHRQ  $0x08, DI
@@ -8447,52 +8470,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm12B:
 match_nolit_dst_ok_encodeBetterBlockAsm12B:
 	MOVQ  $0x0000cf1bbcdcbf9b, SI
 	MOVQ  $0x9e3779b1, R8
-	INCL  DI
-	MOVQ  (DX)(DI*1), R9
-	MOVQ  R9, R10
-	MOVQ  R9, R11
-	MOVQ  R9, R12
-	SHRQ  $0x08, R11
-	MOVQ  R11, R13
-	SHRQ  $0x10, R12
-	LEAL  1(DI), R14
-	LEAL  2(DI), R15
-	MOVQ  -2(DX)(CX*1), R9
+	LEAQ  1(DI), DI
+	LEAQ  -2(CX), R9
+	MOVQ  (DX)(DI*1), R10
+	MOVQ  1(DX)(DI*1), R11
+	MOVQ  (DX)(R9*1), R12
+	MOVQ  1(DX)(R9*1), R13
 	SHLQ  $0x10, R10
 	IMULQ SI, R10
 	SHRQ  $0x32, R10
-	SHLQ  $0x10, R13
-	IMULQ SI, R13
-	SHRQ  $0x32, R13
 	SHLQ  $0x20, R11
 	IMULQ R8, R11
 	SHRQ  $0x34, R11
-	SHLQ  $0x20, R12
-	IMULQ R8, R12
-	SHRQ  $0x34, R12
+	SHLQ  $0x10, R12
+	IMULQ SI, R12
+	SHRQ  $0x32, R12
+	SHLQ  $0x20, R13
+	IMULQ R8, R13
+	SHRQ  $0x34, R13
+	LEAQ  1(DI), R8
+	LEAQ  1(R9), R14
 	MOVL  DI, 24(SP)(R10*4)
-	MOVL  R14, 24(SP)(R13*4)
-	MOVL  R14, 65560(SP)(R11*4)
-	MOVL  R15, 65560(SP)(R12*4)
-	MOVQ  R9, R10
-	MOVQ  R9, R11
-	SHRQ  $0x08, R11
-	MOVQ  R11, R13
-	LEAL  -2(CX), R9
-	LEAL  -1(CX), DI
+	MOVL  R9, 24(SP)(R12*4)
+	MOVL  R8, 65560(SP)(R11*4)
+	MOVL  R14, 65560(SP)(R13*4)
+	ADDQ  $0x01, DI
+	SUBQ  $0x01, R9
+
+index_loop_encodeBetterBlockAsm12B:
+	CMPQ  DI, R9
+	JAE   search_loop_encodeBetterBlockAsm12B
+	MOVQ  (DX)(DI*1), R8
+	MOVQ  (DX)(R9*1), R10
+	SHLQ  $0x10, R8
+	IMULQ SI, R8
+	SHRQ  $0x32, R8
 	SHLQ  $0x10, R10
 	IMULQ SI, R10
 	SHRQ  $0x32, R10
-	SHLQ  $0x20, R11
-	IMULQ R8, R11
-	SHRQ  $0x34, R11
-	SHLQ  $0x10, R13
-	IMULQ SI, R13
-	SHRQ  $0x32, R13
+	MOVL  DI, 24(SP)(R8*4)
 	MOVL  R9, 24(SP)(R10*4)
-	MOVL  DI, 65560(SP)(R11*4)
-	MOVL  DI, 24(SP)(R13*4)
-	JMP   search_loop_encodeBetterBlockAsm12B
+	ADDQ  $0x02, DI
+	SUBQ  $0x02, R9
+	JMP   index_loop_encodeBetterBlockAsm12B
 
 emit_remainder_encodeBetterBlockAsm12B:
 	MOVQ src_len+32(FP), CX
@@ -8707,12 +8727,22 @@ search_loop_encodeBetterBlockAsm10B:
 	MOVL  16408(SP)(R11*4), R8
 	MOVL  CX, 24(SP)(R10*4)
 	MOVL  CX, 16408(SP)(R11*4)
-	CMPL  (DX)(SI*1), DI
+	MOVQ  (DX)(SI*1), R10
+	MOVQ  (DX)(R8*1), R11
+	CMPQ  R10, DI
 	JEQ   candidate_match_encodeBetterBlockAsm10B
-	CMPL  (DX)(R8*1), DI
-	JEQ   candidateS_match_encodeBetterBlockAsm10B
-	MOVL  20(SP), CX
-	JMP   search_loop_encodeBetterBlockAsm10B
+	CMPQ  R11, DI
+	JNE   no_short_found_encodeBetterBlockAsm10B
+	MOVL  R8, SI
+	JMP   candidate_match_encodeBetterBlockAsm10B
+
+no_short_found_encodeBetterBlockAsm10B:
+	CMPL R10, DI
+	JEQ  candidate_match_encodeBetterBlockAsm10B
+	CMPL R11, DI
+	JEQ  candidateS_match_encodeBetterBlockAsm10B
+	MOVL 20(SP), CX
+	JMP  search_loop_encodeBetterBlockAsm10B
 
 candidateS_match_encodeBetterBlockAsm10B:
 	SHRQ  $0x08, DI
@@ -9283,52 +9313,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm10B:
 match_nolit_dst_ok_encodeBetterBlockAsm10B:
 	MOVQ  $0x0000cf1bbcdcbf9b, SI
 	MOVQ  $0x9e3779b1, R8
-	INCL  DI
-	MOVQ  (DX)(DI*1), R9
-	MOVQ  R9, R10
-	MOVQ  R9, R11
-	MOVQ  R9, R12
-	SHRQ  $0x08, R11
-	MOVQ  R11, R13
-	SHRQ  $0x10, R12
-	LEAL  1(DI), R14
-	LEAL  2(DI), R15
-	MOVQ  -2(DX)(CX*1), R9
+	LEAQ  1(DI), DI
+	LEAQ  -2(CX), R9
+	MOVQ  (DX)(DI*1), R10
+	MOVQ  1(DX)(DI*1), R11
+	MOVQ  (DX)(R9*1), R12
+	MOVQ  1(DX)(R9*1), R13
 	SHLQ  $0x10, R10
 	IMULQ SI, R10
 	SHRQ  $0x34, R10
-	SHLQ  $0x10, R13
-	IMULQ SI, R13
-	SHRQ  $0x34, R13
 	SHLQ  $0x20, R11
 	IMULQ R8, R11
 	SHRQ  $0x36, R11
-	SHLQ  $0x20, R12
-	IMULQ R8, R12
-	SHRQ  $0x36, R12
+	SHLQ  $0x10, R12
+	IMULQ SI, R12
+	SHRQ  $0x34, R12
+	SHLQ  $0x20, R13
+	IMULQ R8, R13
+	SHRQ  $0x36, R13
+	LEAQ  1(DI), R8
+	LEAQ  1(R9), R14
 	MOVL  DI, 24(SP)(R10*4)
-	MOVL  R14, 24(SP)(R13*4)
-	MOVL  R14, 16408(SP)(R11*4)
-	MOVL  R15, 16408(SP)(R12*4)
-	MOVQ  R9, R10
-	MOVQ  R9, R11
-	SHRQ  $0x08, R11
-	MOVQ  R11, R13
-	LEAL  -2(CX), R9
-	LEAL  -1(CX), DI
+	MOVL  R9, 24(SP)(R12*4)
+	MOVL  R8, 16408(SP)(R11*4)
+	MOVL  R14, 16408(SP)(R13*4)
+	ADDQ  $0x01, DI
+	SUBQ  $0x01, R9
+
+index_loop_encodeBetterBlockAsm10B:
+	CMPQ  DI, R9
+	JAE   search_loop_encodeBetterBlockAsm10B
+	MOVQ  (DX)(DI*1), R8
+	MOVQ  (DX)(R9*1), R10
+	SHLQ  $0x10, R8
+	IMULQ SI, R8
+	SHRQ  $0x34, R8
 	SHLQ  $0x10, R10
 	IMULQ SI, R10
 	SHRQ  $0x34, R10
-	SHLQ  $0x20, R11
-	IMULQ R8, R11
-	SHRQ  $0x36, R11
-	SHLQ  $0x10, R13
-	IMULQ SI, R13
-	SHRQ  $0x34, R13
+	MOVL  DI, 24(SP)(R8*4)
 	MOVL  R9, 24(SP)(R10*4)
-	MOVL  DI, 16408(SP)(R11*4)
-	MOVL  DI, 24(SP)(R13*4)
-	JMP   search_loop_encodeBetterBlockAsm10B
+	ADDQ  $0x02, DI
+	SUBQ  $0x02, R9
+	JMP   index_loop_encodeBetterBlockAsm10B
 
 emit_remainder_encodeBetterBlockAsm10B:
 	MOVQ src_len+32(FP), CX
@@ -9543,12 +9570,22 @@ search_loop_encodeBetterBlockAsm8B:
 	MOVL  4120(SP)(R11*4), R8
 	MOVL  CX, 24(SP)(R10*4)
 	MOVL  CX, 4120(SP)(R11*4)
-	CMPL  (DX)(SI*1), DI
+	MOVQ  (DX)(SI*1), R10
+	MOVQ  (DX)(R8*1), R11
+	CMPQ  R10, DI
 	JEQ   candidate_match_encodeBetterBlockAsm8B
-	CMPL  (DX)(R8*1), DI
-	JEQ   candidateS_match_encodeBetterBlockAsm8B
-	MOVL  20(SP), CX
-	JMP   search_loop_encodeBetterBlockAsm8B
+	CMPQ  R11, DI
+	JNE   no_short_found_encodeBetterBlockAsm8B
+	MOVL  R8, SI
+	JMP   candidate_match_encodeBetterBlockAsm8B
+
+no_short_found_encodeBetterBlockAsm8B:
+	CMPL R10, DI
+	JEQ  candidate_match_encodeBetterBlockAsm8B
+	CMPL R11, DI
+	JEQ  candidateS_match_encodeBetterBlockAsm8B
+	MOVL 20(SP), CX
+	JMP  search_loop_encodeBetterBlockAsm8B
 
 candidateS_match_encodeBetterBlockAsm8B:
 	SHRQ  $0x08, DI
@@ -10105,52 +10142,49 @@ match_nolit_emitcopy_end_encodeBetterBlockAsm8B:
 match_nolit_dst_ok_encodeBetterBlockAsm8B:
 	MOVQ  $0x0000cf1bbcdcbf9b, SI
 	MOVQ  $0x9e3779b1, R8
-	INCL  DI
-	MOVQ  (DX)(DI*1), R9
-	MOVQ  R9, R10
-	MOVQ  R9, R11
-	MOVQ  R9, R12
-	SHRQ  $0x08, R11
-	MOVQ  R11, R13
-	SHRQ  $0x10, R12
-	LEAL  1(DI), R14
-	LEAL  2(DI), R15
-	MOVQ  -2(DX)(CX*1), R9
+	LEAQ  1(DI), DI
+	LEAQ  -2(CX), R9
+	MOVQ  (DX)(DI*1), R10
+	MOVQ  1(DX)(DI*1), R11
+	MOVQ  (DX)(R9*1), R12
+	MOVQ  1(DX)(R9*1), R13
 	SHLQ  $0x10, R10
 	IMULQ SI, R10
 	SHRQ  $0x36, R10
-	SHLQ  $0x10, R13
-	IMULQ SI, R13
-	SHRQ  $0x36, R13
 	SHLQ  $0x20, R11
 	IMULQ R8, R11
 	SHRQ  $0x38, R11
-	SHLQ  $0x20, R12
-	IMULQ R8, R12
-	SHRQ  $0x38, R12
+	SHLQ  $0x10, R12
+	IMULQ SI, R12
+	SHRQ  $0x36, R12
+	SHLQ  $0x20, R13
+	IMULQ R8, R13
+	SHRQ  $0x38, R13
+	LEAQ  1(DI), R8
+	LEAQ  1(R9), R14
 	MOVL  DI, 24(SP)(R10*4)
-	MOVL  R14, 24(SP)(R13*4)
-	MOVL  R14, 4120(SP)(R11*4)
-	MOVL  R15, 4120(SP)(R12*4)
-	MOVQ  R9, R10
-	MOVQ  R9, R11
-	SHRQ  $0x08, R11
-	MOVQ  R11, R13
-	LEAL  -2(CX), R9
-	LEAL  -1(CX), DI
+	MOVL  R9, 24(SP)(R12*4)
+	MOVL  R8, 4120(SP)(R11*4)
+	MOVL  R14, 4120(SP)(R13*4)
+	ADDQ  $0x01, DI
+	SUBQ  $0x01, R9
+
+index_loop_encodeBetterBlockAsm8B:
+	CMPQ  DI, R9
+	JAE   search_loop_encodeBetterBlockAsm8B
+	MOVQ  (DX)(DI*1), R8
+	MOVQ  (DX)(R9*1), R10
+	SHLQ  $0x10, R8
+	IMULQ SI, R8
+	SHRQ  $0x36, R8
 	SHLQ  $0x10, R10
 	IMULQ SI, R10
 	SHRQ  $0x36, R10
-	SHLQ  $0x20, R11
-	IMULQ R8, R11
-	SHRQ  $0x38, R11
-	SHLQ  $0x10, R13
-	IMULQ SI, R13
-	SHRQ  $0x36, R13
+	MOVL  DI, 24(SP)(R8*4)
 	MOVL  R9, 24(SP)(R10*4)
-	MOVL  DI, 4120(SP)(R11*4)
-	MOVL  DI, 24(SP)(R13*4)
-	JMP   search_loop_encodeBetterBlockAsm8B
+	ADDQ  $0x02, DI
+	SUBQ  $0x02, R9
+	JMP   index_loop_encodeBetterBlockAsm8B
 
 emit_remainder_encodeBetterBlockAsm8B:
 	MOVQ src_len+32(FP), CX
@@ -14287,9 +14321,9 @@ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B:
 
 // func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int
 // Requires: BMI, SSE2
-TEXT ·encodeSnappyBetterBlockAsm(SB), $327704-56
+TEXT ·encodeSnappyBetterBlockAsm(SB), $589848-56
 	MOVQ dst_base+0(FP), AX
-	MOVQ $0x00000a00, CX
+	MOVQ $0x00001200, CX
 	LEAQ 24(SP), DX
 	PXOR X0, X0
 
@@ -14341,27 +14375,37 @@ check_maxskip_cont_encodeSnappyBetterBlockAsm:
 	MOVQ  DI, R11
 	SHLQ  $0x08, R10
 	IMULQ R9, R10
-	SHRQ  $0x30, R10
+	SHRQ  $0x2f, R10
 	SHLQ  $0x20, R11
 	IMULQ SI, R11
 	SHRQ  $0x32, R11
 	MOVL  24(SP)(R10*4), SI
-	MOVL  262168(SP)(R11*4), R8
+	MOVL  524312(SP)(R11*4), R8
 	MOVL  CX, 24(SP)(R10*4)
-	MOVL  CX, 262168(SP)(R11*4)
-	CMPL  (DX)(SI*1), DI
+	MOVL  CX, 524312(SP)(R11*4)
+	MOVQ  (DX)(SI*1), R10
+	MOVQ  (DX)(R8*1), R11
+	CMPQ  R10, DI
 	JEQ   candidate_match_encodeSnappyBetterBlockAsm
-	CMPL  (DX)(R8*1), DI
-	JEQ   candidateS_match_encodeSnappyBetterBlockAsm
-	MOVL  20(SP), CX
-	JMP   search_loop_encodeSnappyBetterBlockAsm
+	CMPQ  R11, DI
+	JNE   no_short_found_encodeSnappyBetterBlockAsm
+	MOVL  R8, SI
+	JMP   candidate_match_encodeSnappyBetterBlockAsm
+
+no_short_found_encodeSnappyBetterBlockAsm:
+	CMPL R10, DI
+	JEQ  candidate_match_encodeSnappyBetterBlockAsm
+	CMPL R11, DI
+	JEQ  candidateS_match_encodeSnappyBetterBlockAsm
+	MOVL 20(SP), CX
+	JMP  search_loop_encodeSnappyBetterBlockAsm
 
 candidateS_match_encodeSnappyBetterBlockAsm:
 	SHRQ  $0x08, DI
 	MOVQ  DI, R10
 	SHLQ  $0x08, R10
 	IMULQ R9, R10
-	SHRQ  $0x30, R10
+	SHRQ  $0x2f, R10
 	MOVL  24(SP)(R10*4), SI
 	INCL  CX
 	MOVL  CX, 24(SP)(R10*4)
@@ -14685,52 +14729,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm:
 match_nolit_dst_ok_encodeSnappyBetterBlockAsm:
 	MOVQ  $0x00cf1bbcdcbfa563, SI
 	MOVQ  $0x9e3779b1, R8
-	INCL  DI
-	MOVQ  (DX)(DI*1), R9
-	MOVQ  R9, R10
-	MOVQ  R9, R11
-	MOVQ  R9, R12
-	SHRQ  $0x08, R11
-	MOVQ  R11, R13
-	SHRQ  $0x10, R12
-	LEAL  1(DI), R14
-	LEAL  2(DI), R15
-	MOVQ  -2(DX)(CX*1), R9
+	LEAQ  1(DI), DI
+	LEAQ  -2(CX), R9
+	MOVQ  (DX)(DI*1), R10
+	MOVQ  1(DX)(DI*1), R11
+	MOVQ  (DX)(R9*1), R12
+	MOVQ  1(DX)(R9*1), R13
 	SHLQ  $0x08, R10
 	IMULQ SI, R10
-	SHRQ  $0x30, R10
-	SHLQ  $0x08, R13
-	IMULQ SI, R13
-	SHRQ  $0x30, R13
+	SHRQ  $0x2f, R10
 	SHLQ  $0x20, R11
 	IMULQ R8, R11
 	SHRQ  $0x32, R11
-	SHLQ  $0x20, R12
-	IMULQ R8, R12
-	SHRQ  $0x32, R12
+	SHLQ  $0x08, R12
+	IMULQ SI, R12
+	SHRQ  $0x2f, R12
+	SHLQ  $0x20, R13
+	IMULQ R8, R13
+	SHRQ  $0x32, R13
+	LEAQ  1(DI), R8
+	LEAQ  1(R9), R14
 	MOVL  DI, 24(SP)(R10*4)
-	MOVL  R14, 24(SP)(R13*4)
-	MOVL  R14, 262168(SP)(R11*4)
-	MOVL  R15, 262168(SP)(R12*4)
-	MOVQ  R9, R10
-	MOVQ  R9, R11
-	SHRQ  $0x08, R11
-	MOVQ  R11, R13
-	LEAL  -2(CX), R9
-	LEAL  -1(CX), DI
+	MOVL  R9, 24(SP)(R12*4)
+	MOVL  R8, 524312(SP)(R11*4)
+	MOVL  R14, 524312(SP)(R13*4)
+	ADDQ  $0x01, DI
+	SUBQ  $0x01, R9
+
+index_loop_encodeSnappyBetterBlockAsm:
+	CMPQ  DI, R9
+	JAE   search_loop_encodeSnappyBetterBlockAsm
+	MOVQ  (DX)(DI*1), R8
+	MOVQ  (DX)(R9*1), R10
+	SHLQ  $0x08, R8
+	IMULQ SI, R8
+	SHRQ  $0x2f, R8
 	SHLQ  $0x08, R10
 	IMULQ SI, R10
-	SHRQ  $0x30, R10
-	SHLQ  $0x20, R11
-	IMULQ R8, R11
-	SHRQ  $0x32, R11
-	SHLQ  $0x08, R13
-	IMULQ SI, R13
-	SHRQ  $0x30, R13
+	SHRQ  $0x2f, R10
+	MOVL  DI, 24(SP)(R8*4)
 	MOVL  R9, 24(SP)(R10*4)
-	MOVL  DI, 262168(SP)(R11*4)
-	MOVL  DI, 24(SP)(R13*4)
-	JMP   search_loop_encodeSnappyBetterBlockAsm
+	ADDQ  $0x02, DI
+	SUBQ  $0x02, R9
+	JMP   index_loop_encodeSnappyBetterBlockAsm
 
 emit_remainder_encodeSnappyBetterBlockAsm:
 	MOVQ src_len+32(FP), CX
@@ -14964,12 +15005,22 @@ search_loop_encodeSnappyBetterBlockAsm64K:
 	MOVL  262168(SP)(R11*4), R8
 	MOVL  CX, 24(SP)(R10*4)
 	MOVL  CX, 262168(SP)(R11*4)
-	CMPL  (DX)(SI*1), DI
+	MOVQ  (DX)(SI*1), R10
+	MOVQ  (DX)(R8*1), R11
+	CMPQ  R10, DI
 	JEQ   candidate_match_encodeSnappyBetterBlockAsm64K
-	CMPL  (DX)(R8*1), DI
-	JEQ   candidateS_match_encodeSnappyBetterBlockAsm64K
-	MOVL  20(SP), CX
-	JMP   search_loop_encodeSnappyBetterBlockAsm64K
+	CMPQ  R11, DI
+	JNE   no_short_found_encodeSnappyBetterBlockAsm64K
+	MOVL  R8, SI
+	JMP   candidate_match_encodeSnappyBetterBlockAsm64K
+
+no_short_found_encodeSnappyBetterBlockAsm64K:
+	CMPL R10, DI
+	JEQ  candidate_match_encodeSnappyBetterBlockAsm64K
+	CMPL R11, DI
+	JEQ  candidateS_match_encodeSnappyBetterBlockAsm64K
+	MOVL 20(SP), CX
+	JMP  search_loop_encodeSnappyBetterBlockAsm64K
 
 candidateS_match_encodeSnappyBetterBlockAsm64K:
 	SHRQ  $0x08, DI
@@ -15248,52 +15299,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K:
 match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K:
 	MOVQ  $0x00cf1bbcdcbfa563, SI
 	MOVQ  $0x9e3779b1, R8
-	INCL  DI
-	MOVQ  (DX)(DI*1), R9
-	MOVQ  R9, R10
-	MOVQ  R9, R11
-	MOVQ  R9, R12
-	SHRQ  $0x08, R11
-	MOVQ  R11, R13
-	SHRQ  $0x10, R12
-	LEAL  1(DI), R14
-	LEAL  2(DI), R15
-	MOVQ  -2(DX)(CX*1), R9
+	LEAQ  1(DI), DI
+	LEAQ  -2(CX), R9
+	MOVQ  (DX)(DI*1), R10
+	MOVQ  1(DX)(DI*1), R11
+	MOVQ  (DX)(R9*1), R12
+	MOVQ  1(DX)(R9*1), R13
 	SHLQ  $0x08, R10
 	IMULQ SI, R10
 	SHRQ  $0x30, R10
-	SHLQ  $0x08, R13
-	IMULQ SI, R13
-	SHRQ  $0x30, R13
 	SHLQ  $0x20, R11
 	IMULQ R8, R11
 	SHRQ  $0x32, R11
-	SHLQ  $0x20, R12
-	IMULQ R8, R12
-	SHRQ  $0x32, R12
+	SHLQ  $0x08, R12
+	IMULQ SI, R12
+	SHRQ  $0x30, R12
+	SHLQ  $0x20, R13
+	IMULQ R8, R13
+	SHRQ  $0x32, R13
+	LEAQ  1(DI), R8
+	LEAQ  1(R9), R14
 	MOVL  DI, 24(SP)(R10*4)
-	MOVL  R14, 24(SP)(R13*4)
-	MOVL  R14, 262168(SP)(R11*4)
-	MOVL  R15, 262168(SP)(R12*4)
-	MOVQ  R9, R10
-	MOVQ  R9, R11
-	SHRQ  $0x08, R11
-	MOVQ  R11, R13
-	LEAL  -2(CX), R9
-	LEAL  -1(CX), DI
+	MOVL  R9, 24(SP)(R12*4)
+	MOVL  R8, 262168(SP)(R11*4)
+	MOVL  R14, 262168(SP)(R13*4)
+	ADDQ  $0x01, DI
+	SUBQ  $0x01, R9
+
+index_loop_encodeSnappyBetterBlockAsm64K:
+	CMPQ  DI, R9
+	JAE   search_loop_encodeSnappyBetterBlockAsm64K
+	MOVQ  (DX)(DI*1), R8
+	MOVQ  (DX)(R9*1), R10
+	SHLQ  $0x08, R8
+	IMULQ SI, R8
+	SHRQ  $0x30, R8
 	SHLQ  $0x08, R10
 	IMULQ SI, R10
 	SHRQ  $0x30, R10
-	SHLQ  $0x20, R11
-	IMULQ R8, R11
-	SHRQ  $0x32, R11
-	SHLQ  $0x08, R13
-	IMULQ SI, R13
-	SHRQ  $0x30, R13
+	MOVL  DI, 24(SP)(R8*4)
 	MOVL  R9, 24(SP)(R10*4)
-	MOVL  DI, 262168(SP)(R11*4)
-	MOVL  DI, 24(SP)(R13*4)
-	JMP   search_loop_encodeSnappyBetterBlockAsm64K
+	ADDQ  $0x02, DI
+	SUBQ  $0x02, R9
+	JMP   index_loop_encodeSnappyBetterBlockAsm64K
 
 emit_remainder_encodeSnappyBetterBlockAsm64K:
 	MOVQ src_len+32(FP), CX
@@ -15508,12 +15556,22 @@ search_loop_encodeSnappyBetterBlockAsm12B:
 	MOVL  65560(SP)(R11*4), R8
 	MOVL  CX, 24(SP)(R10*4)
 	MOVL  CX, 65560(SP)(R11*4)
-	CMPL  (DX)(SI*1), DI
+	MOVQ  (DX)(SI*1), R10
+	MOVQ  (DX)(R8*1), R11
+	CMPQ  R10, DI
 	JEQ   candidate_match_encodeSnappyBetterBlockAsm12B
-	CMPL  (DX)(R8*1), DI
-	JEQ   candidateS_match_encodeSnappyBetterBlockAsm12B
-	MOVL  20(SP), CX
-	JMP   search_loop_encodeSnappyBetterBlockAsm12B
+	CMPQ  R11, DI
+	JNE   no_short_found_encodeSnappyBetterBlockAsm12B
+	MOVL  R8, SI
+	JMP   candidate_match_encodeSnappyBetterBlockAsm12B
+
+no_short_found_encodeSnappyBetterBlockAsm12B:
+	CMPL R10, DI
+	JEQ  candidate_match_encodeSnappyBetterBlockAsm12B
+	CMPL R11, DI
+	JEQ  candidateS_match_encodeSnappyBetterBlockAsm12B
+	MOVL 20(SP), CX
+	JMP  search_loop_encodeSnappyBetterBlockAsm12B
 
 candidateS_match_encodeSnappyBetterBlockAsm12B:
 	SHRQ  $0x08, DI
@@ -15792,52 +15850,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B:
 match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B:
 	MOVQ  $0x0000cf1bbcdcbf9b, SI
 	MOVQ  $0x9e3779b1, R8
-	INCL  DI
-	MOVQ  (DX)(DI*1), R9
-	MOVQ  R9, R10
-	MOVQ  R9, R11
-	MOVQ  R9, R12
-	SHRQ  $0x08, R11
-	MOVQ  R11, R13
-	SHRQ  $0x10, R12
-	LEAL  1(DI), R14
-	LEAL  2(DI), R15
-	MOVQ  -2(DX)(CX*1), R9
+	LEAQ  1(DI), DI
+	LEAQ  -2(CX), R9
+	MOVQ  (DX)(DI*1), R10
+	MOVQ  1(DX)(DI*1), R11
+	MOVQ  (DX)(R9*1), R12
+	MOVQ  1(DX)(R9*1), R13
 	SHLQ  $0x10, R10
 	IMULQ SI, R10
 	SHRQ  $0x32, R10
-	SHLQ  $0x10, R13
-	IMULQ SI, R13
-	SHRQ  $0x32, R13
 	SHLQ  $0x20, R11
 	IMULQ R8, R11
 	SHRQ  $0x34, R11
-	SHLQ  $0x20, R12
-	IMULQ R8, R12
-	SHRQ  $0x34, R12
+	SHLQ  $0x10, R12
+	IMULQ SI, R12
+	SHRQ  $0x32, R12
+	SHLQ  $0x20, R13
+	IMULQ R8, R13
+	SHRQ  $0x34, R13
+	LEAQ  1(DI), R8
+	LEAQ  1(R9), R14
 	MOVL  DI, 24(SP)(R10*4)
-	MOVL  R14, 24(SP)(R13*4)
-	MOVL  R14, 65560(SP)(R11*4)
-	MOVL  R15, 65560(SP)(R12*4)
-	MOVQ  R9, R10
-	MOVQ  R9, R11
-	SHRQ  $0x08, R11
-	MOVQ  R11, R13
-	LEAL  -2(CX), R9
-	LEAL  -1(CX), DI
+	MOVL  R9, 24(SP)(R12*4)
+	MOVL  R8, 65560(SP)(R11*4)
+	MOVL  R14, 65560(SP)(R13*4)
+	ADDQ  $0x01, DI
+	SUBQ  $0x01, R9
+
+index_loop_encodeSnappyBetterBlockAsm12B:
+	CMPQ  DI, R9
+	JAE   search_loop_encodeSnappyBetterBlockAsm12B
+	MOVQ  (DX)(DI*1), R8
+	MOVQ  (DX)(R9*1), R10
+	SHLQ  $0x10, R8
+	IMULQ SI, R8
+	SHRQ  $0x32, R8
 	SHLQ  $0x10, R10
 	IMULQ SI, R10
 	SHRQ  $0x32, R10
-	SHLQ  $0x20, R11
-	IMULQ R8, R11
-	SHRQ  $0x34, R11
-	SHLQ  $0x10, R13
-	IMULQ SI, R13
-	SHRQ  $0x32, R13
+	MOVL  DI, 24(SP)(R8*4)
 	MOVL  R9, 24(SP)(R10*4)
-	MOVL  DI, 65560(SP)(R11*4)
-	MOVL  DI, 24(SP)(R13*4)
-	JMP   search_loop_encodeSnappyBetterBlockAsm12B
+	ADDQ  $0x02, DI
+	SUBQ  $0x02, R9
+	JMP   index_loop_encodeSnappyBetterBlockAsm12B
 
 emit_remainder_encodeSnappyBetterBlockAsm12B:
 	MOVQ src_len+32(FP), CX
@@ -16052,12 +16107,22 @@ search_loop_encodeSnappyBetterBlockAsm10B:
 	MOVL  16408(SP)(R11*4), R8
 	MOVL  CX, 24(SP)(R10*4)
 	MOVL  CX, 16408(SP)(R11*4)
-	CMPL  (DX)(SI*1), DI
+	MOVQ  (DX)(SI*1), R10
+	MOVQ  (DX)(R8*1), R11
+	CMPQ  R10, DI
 	JEQ   candidate_match_encodeSnappyBetterBlockAsm10B
-	CMPL  (DX)(R8*1), DI
-	JEQ   candidateS_match_encodeSnappyBetterBlockAsm10B
-	MOVL  20(SP), CX
-	JMP   search_loop_encodeSnappyBetterBlockAsm10B
+	CMPQ  R11, DI
+	JNE   no_short_found_encodeSnappyBetterBlockAsm10B
+	MOVL  R8, SI
+	JMP   candidate_match_encodeSnappyBetterBlockAsm10B
+
+no_short_found_encodeSnappyBetterBlockAsm10B:
+	CMPL R10, DI
+	JEQ  candidate_match_encodeSnappyBetterBlockAsm10B
+	CMPL R11, DI
+	JEQ  candidateS_match_encodeSnappyBetterBlockAsm10B
+	MOVL 20(SP), CX
+	JMP  search_loop_encodeSnappyBetterBlockAsm10B
 
 candidateS_match_encodeSnappyBetterBlockAsm10B:
 	SHRQ  $0x08, DI
@@ -16336,52 +16401,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B:
 match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B:
 	MOVQ  $0x0000cf1bbcdcbf9b, SI
 	MOVQ  $0x9e3779b1, R8
-	INCL  DI
-	MOVQ  (DX)(DI*1), R9
-	MOVQ  R9, R10
-	MOVQ  R9, R11
-	MOVQ  R9, R12
-	SHRQ  $0x08, R11
-	MOVQ  R11, R13
-	SHRQ  $0x10, R12
-	LEAL  1(DI), R14
-	LEAL  2(DI), R15
-	MOVQ  -2(DX)(CX*1), R9
+	LEAQ  1(DI), DI
+	LEAQ  -2(CX), R9
+	MOVQ  (DX)(DI*1), R10
+	MOVQ  1(DX)(DI*1), R11
+	MOVQ  (DX)(R9*1), R12
+	MOVQ  1(DX)(R9*1), R13
 	SHLQ  $0x10, R10
 	IMULQ SI, R10
 	SHRQ  $0x34, R10
-	SHLQ  $0x10, R13
-	IMULQ SI, R13
-	SHRQ  $0x34, R13
 	SHLQ  $0x20, R11
 	IMULQ R8, R11
 	SHRQ  $0x36, R11
-	SHLQ  $0x20, R12
-	IMULQ R8, R12
-	SHRQ  $0x36, R12
+	SHLQ  $0x10, R12
+	IMULQ SI, R12
+	SHRQ  $0x34, R12
+	SHLQ  $0x20, R13
+	IMULQ R8, R13
+	SHRQ  $0x36, R13
+	LEAQ  1(DI), R8
+	LEAQ  1(R9), R14
 	MOVL  DI, 24(SP)(R10*4)
-	MOVL  R14, 24(SP)(R13*4)
-	MOVL  R14, 16408(SP)(R11*4)
-	MOVL  R15, 16408(SP)(R12*4)
-	MOVQ  R9, R10
-	MOVQ  R9, R11
-	SHRQ  $0x08, R11
-	MOVQ  R11, R13
-	LEAL  -2(CX), R9
-	LEAL  -1(CX), DI
+	MOVL  R9, 24(SP)(R12*4)
+	MOVL  R8, 16408(SP)(R11*4)
+	MOVL  R14, 16408(SP)(R13*4)
+	ADDQ  $0x01, DI
+	SUBQ  $0x01, R9
+
+index_loop_encodeSnappyBetterBlockAsm10B:
+	CMPQ  DI, R9
+	JAE   search_loop_encodeSnappyBetterBlockAsm10B
+	MOVQ  (DX)(DI*1), R8
+	MOVQ  (DX)(R9*1), R10
+	SHLQ  $0x10, R8
+	IMULQ SI, R8
+	SHRQ  $0x34, R8
 	SHLQ  $0x10, R10
 	IMULQ SI, R10
 	SHRQ  $0x34, R10
-	SHLQ  $0x20, R11
-	IMULQ R8, R11
-	SHRQ  $0x36, R11
-	SHLQ  $0x10, R13
-	IMULQ SI, R13
-	SHRQ  $0x34, R13
+	MOVL  DI, 24(SP)(R8*4)
 	MOVL  R9, 24(SP)(R10*4)
-	MOVL  DI, 16408(SP)(R11*4)
-	MOVL  DI, 24(SP)(R13*4)
-	JMP   search_loop_encodeSnappyBetterBlockAsm10B
+	ADDQ  $0x02, DI
+	SUBQ  $0x02, R9
+	JMP   index_loop_encodeSnappyBetterBlockAsm10B
 
 emit_remainder_encodeSnappyBetterBlockAsm10B:
 	MOVQ src_len+32(FP), CX
@@ -16596,12 +16658,22 @@ search_loop_encodeSnappyBetterBlockAsm8B:
 	MOVL  4120(SP)(R11*4), R8
 	MOVL  CX, 24(SP)(R10*4)
 	MOVL  CX, 4120(SP)(R11*4)
-	CMPL  (DX)(SI*1), DI
+	MOVQ  (DX)(SI*1), R10
+	MOVQ  (DX)(R8*1), R11
+	CMPQ  R10, DI
 	JEQ   candidate_match_encodeSnappyBetterBlockAsm8B
-	CMPL  (DX)(R8*1), DI
-	JEQ   candidateS_match_encodeSnappyBetterBlockAsm8B
-	MOVL  20(SP), CX
-	JMP   search_loop_encodeSnappyBetterBlockAsm8B
+	CMPQ  R11, DI
+	JNE   no_short_found_encodeSnappyBetterBlockAsm8B
+	MOVL  R8, SI
+	JMP   candidate_match_encodeSnappyBetterBlockAsm8B
+
+no_short_found_encodeSnappyBetterBlockAsm8B:
+	CMPL R10, DI
+	JEQ  candidate_match_encodeSnappyBetterBlockAsm8B
+	CMPL R11, DI
+	JEQ  candidateS_match_encodeSnappyBetterBlockAsm8B
+	MOVL 20(SP), CX
+	JMP  search_loop_encodeSnappyBetterBlockAsm8B
 
 candidateS_match_encodeSnappyBetterBlockAsm8B:
 	SHRQ  $0x08, DI
@@ -16878,52 +16950,49 @@ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B:
 match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B:
 	MOVQ  $0x0000cf1bbcdcbf9b, SI
 	MOVQ  $0x9e3779b1, R8
-	INCL  DI
-	MOVQ  (DX)(DI*1), R9
-	MOVQ  R9, R10
-	MOVQ  R9, R11
-	MOVQ  R9, R12
-	SHRQ  $0x08, R11
-	MOVQ  R11, R13
-	SHRQ  $0x10, R12
-	LEAL  1(DI), R14
-	LEAL  2(DI), R15
-	MOVQ  -2(DX)(CX*1), R9
+	LEAQ  1(DI), DI
+	LEAQ  -2(CX), R9
+	MOVQ  (DX)(DI*1), R10
+	MOVQ  1(DX)(DI*1), R11
+	MOVQ  (DX)(R9*1), R12
+	MOVQ  1(DX)(R9*1), R13
 	SHLQ  $0x10, R10
 	IMULQ SI, R10
 	SHRQ  $0x36, R10
-	SHLQ  $0x10, R13
-	IMULQ SI, R13
-	SHRQ  $0x36, R13
 	SHLQ  $0x20, R11
 	IMULQ R8, R11
 	SHRQ  $0x38, R11
-	SHLQ  $0x20, R12
-	IMULQ R8, R12
-	SHRQ  $0x38, R12
+	SHLQ  $0x10, R12
+	IMULQ SI, R12
+	SHRQ  $0x36, R12
+	SHLQ  $0x20, R13
+	IMULQ R8, R13
+	SHRQ  $0x38, R13
+	LEAQ  1(DI), R8
+	LEAQ  1(R9), R14
 	MOVL  DI, 24(SP)(R10*4)
-	MOVL  R14, 24(SP)(R13*4)
-	MOVL  R14, 4120(SP)(R11*4)
-	MOVL  R15, 4120(SP)(R12*4)
-	MOVQ  R9, R10
-	MOVQ  R9, R11
-	SHRQ  $0x08, R11
-	MOVQ  R11, R13
-	LEAL  -2(CX), R9
-	LEAL  -1(CX), DI
+	MOVL  R9, 24(SP)(R12*4)
+	MOVL  R8, 4120(SP)(R11*4)
+	MOVL  R14, 4120(SP)(R13*4)
+	ADDQ  $0x01, DI
+	SUBQ  $0x01, R9
+
+index_loop_encodeSnappyBetterBlockAsm8B:
+	CMPQ  DI, R9
+	JAE   search_loop_encodeSnappyBetterBlockAsm8B
+	MOVQ  (DX)(DI*1), R8
+	MOVQ  (DX)(R9*1), R10
+	SHLQ  $0x10, R8
+	IMULQ SI, R8
+	SHRQ  $0x36, R8
 	SHLQ  $0x10, R10
 	IMULQ SI, R10
 	SHRQ  $0x36, R10
-	SHLQ  $0x20, R11
-	IMULQ R8, R11
-	SHRQ  $0x38, R11
-	SHLQ  $0x10, R13
-	IMULQ SI, R13
-	SHRQ  $0x36, R13
+	MOVL  DI, 24(SP)(R8*4)
 	MOVL  R9, 24(SP)(R10*4)
-	MOVL  DI, 4120(SP)(R11*4)
-	MOVL  DI, 24(SP)(R13*4)
-	JMP   search_loop_encodeSnappyBetterBlockAsm8B
+	ADDQ  $0x02, DI
+	SUBQ  $0x02, R9
+	JMP   index_loop_encodeSnappyBetterBlockAsm8B
 
 emit_remainder_encodeSnappyBetterBlockAsm8B:
 	MOVQ src_len+32(FP), CX