Merge pull request from thaJeztah/23.0_backport_bump_compress

[23.0 backport] vendor: github.com/klauspost/compress v1.17.2
This commit is contained in:
Sebastiaan van Stijn 2023-10-24 16:55:07 +02:00 committed by GitHub
commit 0360dbe11d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
49 changed files with 1951 additions and 1328 deletions

View file

@ -47,7 +47,7 @@ require (
github.com/hashicorp/serf v0.8.5
github.com/imdario/mergo v0.3.12
github.com/ishidawataru/sctp v0.0.0-20230406120618-7ff4192f6ff2
github.com/klauspost/compress v1.15.12
github.com/klauspost/compress v1.17.2
github.com/miekg/dns v1.1.43
github.com/mistifyio/go-zfs v2.1.2-0.20190413222219-f784269be439+incompatible
github.com/moby/buildkit v0.10.7-0.20230412161310-d52b2d584242

View file

@ -676,8 +676,8 @@ github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+o
github.com/klauspost/compress v1.11.3/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
github.com/klauspost/compress v1.11.13/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
github.com/klauspost/compress v1.15.1/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk=
github.com/klauspost/compress v1.15.12 h1:YClS/PImqYbn+UILDnqxQCZ3RehC9N318SU3kElDUEM=
github.com/klauspost/compress v1.15.12/go.mod h1:QPwzmACJjUTFsnSHH934V6woptycfrDDJnH7hvFVbGM=
github.com/klauspost/compress v1.17.2 h1:RlWWUY/Dr4fL8qk9YG7DTZ7PDgME2V4csBXA8L/ixi4=
github.com/klauspost/compress v1.17.2/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=

View file

@ -3,7 +3,7 @@
before:
hooks:
- ./gen.sh
- go install mvdan.cc/garble@latest
- go install mvdan.cc/garble@v0.10.1
builds:
-
@ -92,16 +92,7 @@ builds:
archives:
-
id: s2-binaries
name_template: "s2-{{ .Os }}_{{ .Arch }}_{{ .Version }}"
replacements:
aix: AIX
darwin: OSX
linux: Linux
windows: Windows
386: i386
amd64: x86_64
freebsd: FreeBSD
netbsd: NetBSD
name_template: "s2-{{ .Os }}_{{ .Arch }}{{ if .Arm }}v{{ .Arm }}{{ end }}"
format_overrides:
- goos: windows
format: zip
@ -125,7 +116,7 @@ changelog:
nfpms:
-
file_name_template: "s2_package_{{ .Version }}_{{ .Os }}_{{ .Arch }}"
file_name_template: "s2_package__{{ .Os }}_{{ .Arch }}{{ if .Arm }}v{{ .Arm }}{{ end }}"
vendor: Klaus Post
homepage: https://github.com/klauspost/compress
maintainer: Klaus Post <klauspost@gmail.com>
@ -134,8 +125,3 @@ nfpms:
formats:
- deb
- rpm
replacements:
darwin: Darwin
linux: Linux
freebsd: FreeBSD
amd64: x86_64

View file

@ -9,7 +9,6 @@ This package provides various compression algorithms.
* [huff0](https://github.com/klauspost/compress/tree/master/huff0) and [FSE](https://github.com/klauspost/compress/tree/master/fse) implementations for raw entropy encoding.
* [gzhttp](https://github.com/klauspost/compress/tree/master/gzhttp) Provides client and server wrappers for handling gzipped requests efficiently.
* [pgzip](https://github.com/klauspost/pgzip) is a separate package that provides a very fast parallel gzip implementation.
* [fuzz package](https://github.com/klauspost/compress-fuzz) for fuzz testing all compressors/decompressors here.
[![Go Reference](https://pkg.go.dev/badge/klauspost/compress.svg)](https://pkg.go.dev/github.com/klauspost/compress?tab=subdirectories)
[![Go](https://github.com/klauspost/compress/actions/workflows/go.yml/badge.svg)](https://github.com/klauspost/compress/actions/workflows/go.yml)
@ -17,6 +16,77 @@ This package provides various compression algorithms.
# changelog
* Sept 19th, 2023 - [v1.17.0](https://github.com/klauspost/compress/releases/tag/v1.17.0)
* Add experimental dictionary builder https://github.com/klauspost/compress/pull/853
* Add xerial snappy read/writer https://github.com/klauspost/compress/pull/838
* flate: Add limited window compression https://github.com/klauspost/compress/pull/843
* s2: Do 2 overlapping match checks https://github.com/klauspost/compress/pull/839
* flate: Add amd64 assembly matchlen https://github.com/klauspost/compress/pull/837
* gzip: Copy bufio.Reader on Reset by @thatguystone in https://github.com/klauspost/compress/pull/860
* July 1st, 2023 - [v1.16.7](https://github.com/klauspost/compress/releases/tag/v1.16.7)
* zstd: Fix default level first dictionary encode https://github.com/klauspost/compress/pull/829
* s2: add GetBufferCapacity() method by @GiedriusS in https://github.com/klauspost/compress/pull/832
* June 13, 2023 - [v1.16.6](https://github.com/klauspost/compress/releases/tag/v1.16.6)
* zstd: correctly ignore WithEncoderPadding(1) by @ianlancetaylor in https://github.com/klauspost/compress/pull/806
* zstd: Add amd64 match length assembly https://github.com/klauspost/compress/pull/824
* gzhttp: Handle informational headers by @rtribotte in https://github.com/klauspost/compress/pull/815
* s2: Improve Better compression slightly https://github.com/klauspost/compress/pull/663
* Apr 16, 2023 - [v1.16.5](https://github.com/klauspost/compress/releases/tag/v1.16.5)
* zstd: readByte needs to use io.ReadFull by @jnoxon in https://github.com/klauspost/compress/pull/802
* gzip: Fix WriterTo after initial read https://github.com/klauspost/compress/pull/804
* Apr 5, 2023 - [v1.16.4](https://github.com/klauspost/compress/releases/tag/v1.16.4)
* zstd: Improve zstd best efficiency by @greatroar and @klauspost in https://github.com/klauspost/compress/pull/784
* zstd: Respect WithAllLitEntropyCompression https://github.com/klauspost/compress/pull/792
* zstd: Fix amd64 not always detecting corrupt data https://github.com/klauspost/compress/pull/785
* zstd: Various minor improvements by @greatroar in https://github.com/klauspost/compress/pull/788 https://github.com/klauspost/compress/pull/794 https://github.com/klauspost/compress/pull/795
* s2: Fix huge block overflow https://github.com/klauspost/compress/pull/779
* s2: Allow CustomEncoder fallback https://github.com/klauspost/compress/pull/780
* gzhttp: Suppport ResponseWriter Unwrap() in gzhttp handler by @jgimenez in https://github.com/klauspost/compress/pull/799
* Mar 13, 2023 - [v1.16.1](https://github.com/klauspost/compress/releases/tag/v1.16.1)
* zstd: Speed up + improve best encoder by @greatroar in https://github.com/klauspost/compress/pull/776
* gzhttp: Add optional [BREACH mitigation](https://github.com/klauspost/compress/tree/master/gzhttp#breach-mitigation). https://github.com/klauspost/compress/pull/762 https://github.com/klauspost/compress/pull/768 https://github.com/klauspost/compress/pull/769 https://github.com/klauspost/compress/pull/770 https://github.com/klauspost/compress/pull/767
* s2: Add Intel LZ4s converter https://github.com/klauspost/compress/pull/766
* zstd: Minor bug fixes https://github.com/klauspost/compress/pull/771 https://github.com/klauspost/compress/pull/772 https://github.com/klauspost/compress/pull/773
* huff0: Speed up compress1xDo by @greatroar in https://github.com/klauspost/compress/pull/774
* Feb 26, 2023 - [v1.16.0](https://github.com/klauspost/compress/releases/tag/v1.16.0)
* s2: Add [Dictionary](https://github.com/klauspost/compress/tree/master/s2#dictionaries) support. https://github.com/klauspost/compress/pull/685
* s2: Add Compression Size Estimate. https://github.com/klauspost/compress/pull/752
* s2: Add support for custom stream encoder. https://github.com/klauspost/compress/pull/755
* s2: Add LZ4 block converter. https://github.com/klauspost/compress/pull/748
* s2: Support io.ReaderAt in ReadSeeker. https://github.com/klauspost/compress/pull/747
* s2c/s2sx: Use concurrent decoding. https://github.com/klauspost/compress/pull/746
<details>
<summary>See changes to v1.15.x</summary>
* Jan 21st, 2023 (v1.15.15)
* deflate: Improve level 7-9 by @klauspost in https://github.com/klauspost/compress/pull/739
* zstd: Add delta encoding support by @greatroar in https://github.com/klauspost/compress/pull/728
* zstd: Various speed improvements by @greatroar https://github.com/klauspost/compress/pull/741 https://github.com/klauspost/compress/pull/734 https://github.com/klauspost/compress/pull/736 https://github.com/klauspost/compress/pull/744 https://github.com/klauspost/compress/pull/743 https://github.com/klauspost/compress/pull/745
* gzhttp: Add SuffixETag() and DropETag() options to prevent ETag collisions on compressed responses by @willbicks in https://github.com/klauspost/compress/pull/740
* Jan 3rd, 2023 (v1.15.14)
* flate: Improve speed in big stateless blocks https://github.com/klauspost/compress/pull/718
* zstd: Minor speed tweaks by @greatroar in https://github.com/klauspost/compress/pull/716 https://github.com/klauspost/compress/pull/720
* export NoGzipResponseWriter for custom ResponseWriter wrappers by @harshavardhana in https://github.com/klauspost/compress/pull/722
* s2: Add example for indexing and existing stream https://github.com/klauspost/compress/pull/723
* Dec 11, 2022 (v1.15.13)
* zstd: Add [MaxEncodedSize](https://pkg.go.dev/github.com/klauspost/compress@v1.15.13/zstd#Encoder.MaxEncodedSize) to encoder https://github.com/klauspost/compress/pull/691
* zstd: Various tweaks and improvements https://github.com/klauspost/compress/pull/693 https://github.com/klauspost/compress/pull/695 https://github.com/klauspost/compress/pull/696 https://github.com/klauspost/compress/pull/701 https://github.com/klauspost/compress/pull/702 https://github.com/klauspost/compress/pull/703 https://github.com/klauspost/compress/pull/704 https://github.com/klauspost/compress/pull/705 https://github.com/klauspost/compress/pull/706 https://github.com/klauspost/compress/pull/707 https://github.com/klauspost/compress/pull/708
* Oct 26, 2022 (v1.15.12)
* zstd: Tweak decoder allocs. https://github.com/klauspost/compress/pull/680
* gzhttp: Always delete `HeaderNoCompression` https://github.com/klauspost/compress/pull/683
* Sept 26, 2022 (v1.15.11)
* flate: Improve level 1-3 compression https://github.com/klauspost/compress/pull/678
@ -121,6 +191,8 @@ Stream decompression is now faster on asynchronous, since the goroutine allocati
While the release has been extensively tested, it is recommended to testing when upgrading.
</details>
<details>
<summary>See changes to v1.14.x</summary>
@ -579,6 +651,10 @@ Here are other packages of good quality and pure Go (no cgo wrappers or autoconv
* [github.com/pierrec/lz4](https://github.com/pierrec/lz4) - strong multithreaded LZ4 compression.
* [github.com/cosnicolaou/pbzip2](https://github.com/cosnicolaou/pbzip2) - multithreaded bzip2 decompression.
* [github.com/dsnet/compress](https://github.com/dsnet/compress) - brotli decompression, bzip2 writer.
* [github.com/ronanh/intcomp](https://github.com/ronanh/intcomp) - Integer compression.
* [github.com/spenczar/fpc](https://github.com/spenczar/fpc) - Float compression.
* [github.com/minio/zipindex](https://github.com/minio/zipindex) - External ZIP directory index.
* [github.com/ybirader/pzip](https://github.com/ybirader/pzip) - Fast concurrent zip archiver and extractor.
# license

25
vendor/github.com/klauspost/compress/SECURITY.md generated vendored Normal file
View file

@ -0,0 +1,25 @@
# Security Policy
## Supported Versions
Security updates are applied only to the latest release.
## Vulnerability Definition
A security vulnerability is a bug that with certain input triggers a crash or an infinite loop. Most calls will have varying execution time and only in rare cases will slow operation be considered a security vulnerability.
Corrupted output generally is not considered a security vulnerability, unless independent operations are able to affect each other. Note that not all functionality is re-entrant and safe to use concurrently.
Out-of-memory crashes only applies if the en/decoder uses an abnormal amount of memory, with appropriate options applied, to limit maximum window size, concurrency, etc. However, if you are in doubt you are welcome to file a security issue.
It is assumed that all callers are trusted, meaning internal data exposed through reflection or inspection of returned data structures is not considered a vulnerability.
Vulnerabilities resulting from compiler/assembler errors should be reported upstream. Depending on the severity this package may or may not implement a workaround.
## Reporting a Vulnerability
If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
Please disclose it at [security advisory](https://github.com/klauspost/compress/security/advisories/new). If possible please provide a minimal reproducer. If the issue only applies to a single platform, it would be helpful to provide access to that.
This project is maintained by a team of volunteers on a reasonable-effort basis. As such, vulnerabilities will be disclosed in a best effort base.

View file

@ -152,12 +152,11 @@ func (b *bitWriter) flushAlign() {
// close will write the alignment bit and write the final byte(s)
// to the output.
func (b *bitWriter) close() error {
func (b *bitWriter) close() {
// End mark
b.addBits16Clean(1, 1)
// flush until next byte.
b.flushAlign()
return nil
}
// reset and continue writing by appending to out.

View file

@ -146,54 +146,51 @@ func (s *Scratch) compress(src []byte) error {
c1.encodeZero(tt[src[ip-2]])
ip -= 2
}
src = src[:ip]
// Main compression loop.
switch {
case !s.zeroBits && s.actualTableLog <= 8:
// We can encode 4 symbols without requiring a flush.
// We do not need to check if any output is 0 bits.
for ip >= 4 {
for ; len(src) >= 4; src = src[:len(src)-4] {
s.bw.flush32()
v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
c2.encode(tt[v0])
c1.encode(tt[v1])
c2.encode(tt[v2])
c1.encode(tt[v3])
ip -= 4
}
case !s.zeroBits:
// We do not need to check if any output is 0 bits.
for ip >= 4 {
for ; len(src) >= 4; src = src[:len(src)-4] {
s.bw.flush32()
v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
c2.encode(tt[v0])
c1.encode(tt[v1])
s.bw.flush32()
c2.encode(tt[v2])
c1.encode(tt[v3])
ip -= 4
}
case s.actualTableLog <= 8:
// We can encode 4 symbols without requiring a flush
for ip >= 4 {
for ; len(src) >= 4; src = src[:len(src)-4] {
s.bw.flush32()
v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
c2.encodeZero(tt[v0])
c1.encodeZero(tt[v1])
c2.encodeZero(tt[v2])
c1.encodeZero(tt[v3])
ip -= 4
}
default:
for ip >= 4 {
for ; len(src) >= 4; src = src[:len(src)-4] {
s.bw.flush32()
v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
c2.encodeZero(tt[v0])
c1.encodeZero(tt[v1])
s.bw.flush32()
c2.encodeZero(tt[v2])
c1.encodeZero(tt[v3])
ip -= 4
}
}
@ -202,7 +199,8 @@ func (s *Scratch) compress(src []byte) error {
c2.flush(s.actualTableLog)
c1.flush(s.actualTableLog)
return s.bw.close()
s.bw.close()
return nil
}
// writeCount will write the normalized histogram count to header.
@ -459,15 +457,17 @@ func (s *Scratch) countSimple(in []byte) (max int) {
for _, v := range in {
s.count[v]++
}
m := uint32(0)
m, symlen := uint32(0), s.symbolLen
for i, v := range s.count[:] {
if v == 0 {
continue
}
if v > m {
m = v
}
if v > 0 {
s.symbolLen = uint16(i) + 1
}
symlen = uint16(i) + 1
}
s.symbolLen = symlen
return int(m)
}

View file

@ -260,7 +260,9 @@ func (s *Scratch) buildDtable() error {
// If the buffer is over-read an error is returned.
func (s *Scratch) decompress() error {
br := &s.bits
br.init(s.br.unread())
if err := br.init(s.br.unread()); err != nil {
return err
}
var s1, s2 decoder
// Initialize and decode first state and symbol.

View file

@ -67,7 +67,6 @@ func (b *bitReaderBytes) fillFast() {
// 2 bounds checks.
v := b.in[b.off-4 : b.off]
v = v[:4]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value |= uint64(low) << (b.bitsRead - 32)
b.bitsRead -= 32
@ -88,8 +87,7 @@ func (b *bitReaderBytes) fill() {
return
}
if b.off > 4 {
v := b.in[b.off-4:]
v = v[:4]
v := b.in[b.off-4 : b.off]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value |= uint64(low) << (b.bitsRead - 32)
b.bitsRead -= 32
@ -179,7 +177,6 @@ func (b *bitReaderShifted) fillFast() {
// 2 bounds checks.
v := b.in[b.off-4 : b.off]
v = v[:4]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value |= uint64(low) << ((b.bitsRead - 32) & 63)
b.bitsRead -= 32
@ -200,8 +197,7 @@ func (b *bitReaderShifted) fill() {
return
}
if b.off > 4 {
v := b.in[b.off-4:]
v = v[:4]
v := b.in[b.off-4 : b.off]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value |= uint64(low) << ((b.bitsRead - 32) & 63)
b.bitsRead -= 32

View file

@ -13,14 +13,6 @@ type bitWriter struct {
out []byte
}
// bitMask16 is bitmasks. Has extra to avoid bounds check.
var bitMask16 = [32]uint16{
0, 1, 3, 7, 0xF, 0x1F,
0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF,
0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0xFFFF,
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0xFFFF, 0xFFFF} /* up to 16 bits */
// addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
// It will not check if there is space for them, so the caller must ensure that it has flushed recently.
func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
@ -60,6 +52,22 @@ func (b *bitWriter) encTwoSymbols(ct cTable, av, bv byte) {
b.nBits += encA.nBits + encB.nBits
}
// encFourSymbols adds up to 32 bits from four symbols.
// It will not check if there is space for them,
// so the caller must ensure that b has been flushed recently.
func (b *bitWriter) encFourSymbols(encA, encB, encC, encD cTableEntry) {
bitsA := encA.nBits
bitsB := bitsA + encB.nBits
bitsC := bitsB + encC.nBits
bitsD := bitsC + encD.nBits
combined := uint64(encA.val) |
(uint64(encB.val) << (bitsA & 63)) |
(uint64(encC.val) << (bitsB & 63)) |
(uint64(encD.val) << (bitsC & 63))
b.bitContainer |= combined << (b.nBits & 63)
b.nBits += bitsD
}
// flush32 will flush out, so there are at least 32 bits available for writing.
func (b *bitWriter) flush32() {
if b.nBits < 32 {
@ -86,10 +94,9 @@ func (b *bitWriter) flushAlign() {
// close will write the alignment bit and write the final byte(s)
// to the output.
func (b *bitWriter) close() error {
func (b *bitWriter) close() {
// End mark
b.addBits16Clean(1, 1)
// flush until next byte.
b.flushAlign()
return nil
}

View file

@ -227,10 +227,10 @@ func EstimateSizes(in []byte, s *Scratch) (tableSz, dataSz, reuseSz int, err err
}
func (s *Scratch) compress1X(src []byte) ([]byte, error) {
return s.compress1xDo(s.Out, src)
return s.compress1xDo(s.Out, src), nil
}
func (s *Scratch) compress1xDo(dst, src []byte) ([]byte, error) {
func (s *Scratch) compress1xDo(dst, src []byte) []byte {
var bw = bitWriter{out: dst}
// N is length divisible by 4.
@ -248,8 +248,7 @@ func (s *Scratch) compress1xDo(dst, src []byte) ([]byte, error) {
tmp := src[n : n+4]
// tmp should be len 4
bw.flush32()
bw.encTwoSymbols(cTable, tmp[3], tmp[2])
bw.encTwoSymbols(cTable, tmp[1], tmp[0])
bw.encFourSymbols(cTable[tmp[3]], cTable[tmp[2]], cTable[tmp[1]], cTable[tmp[0]])
}
} else {
for ; n >= 0; n -= 4 {
@ -261,8 +260,8 @@ func (s *Scratch) compress1xDo(dst, src []byte) ([]byte, error) {
bw.encTwoSymbols(cTable, tmp[1], tmp[0])
}
}
err := bw.close()
return bw.out, err
bw.close()
return bw.out
}
var sixZeros [6]byte
@ -284,12 +283,8 @@ func (s *Scratch) compress4X(src []byte) ([]byte, error) {
}
src = src[len(toDo):]
var err error
idx := len(s.Out)
s.Out, err = s.compress1xDo(s.Out, toDo)
if err != nil {
return nil, err
}
s.Out = s.compress1xDo(s.Out, toDo)
if len(s.Out)-idx > math.MaxUint16 {
// We cannot store the size in the jump table
return nil, ErrIncompressible
@ -316,7 +311,6 @@ func (s *Scratch) compress4Xp(src []byte) ([]byte, error) {
segmentSize := (len(src) + 3) / 4
var wg sync.WaitGroup
var errs [4]error
wg.Add(4)
for i := 0; i < 4; i++ {
toDo := src
@ -327,15 +321,12 @@ func (s *Scratch) compress4Xp(src []byte) ([]byte, error) {
// Separate goroutine for each block.
go func(i int) {
s.tmpOut[i], errs[i] = s.compress1xDo(s.tmpOut[i][:0], toDo)
s.tmpOut[i] = s.compress1xDo(s.tmpOut[i][:0], toDo)
wg.Done()
}(i)
}
wg.Wait()
for i := 0; i < 4; i++ {
if errs[i] != nil {
return nil, errs[i]
}
o := s.tmpOut[i]
if len(o) > math.MaxUint16 {
// We cannot store the size in the jump table
@ -365,29 +356,29 @@ func (s *Scratch) countSimple(in []byte) (max int, reuse bool) {
m := uint32(0)
if len(s.prevTable) > 0 {
for i, v := range s.count[:] {
if v == 0 {
continue
}
if v > m {
m = v
}
if v > 0 {
s.symbolLen = uint16(i) + 1
if i >= len(s.prevTable) {
reuse = false
} else {
if s.prevTable[i].nBits == 0 {
reuse = false
}
}
s.symbolLen = uint16(i) + 1
if i >= len(s.prevTable) {
reuse = false
} else if s.prevTable[i].nBits == 0 {
reuse = false
}
}
return int(m), reuse
}
for i, v := range s.count[:] {
if v == 0 {
continue
}
if v > m {
m = v
}
if v > 0 {
s.symbolLen = uint16(i) + 1
}
s.symbolLen = uint16(i) + 1
}
return int(m), false
}
@ -484,34 +475,35 @@ func (s *Scratch) buildCTable() error {
// Different from reference implementation.
huffNode0 := s.nodes[0 : huffNodesLen+1]
for huffNode[nonNullRank].count == 0 {
for huffNode[nonNullRank].count() == 0 {
nonNullRank--
}
lowS := int16(nonNullRank)
nodeRoot := nodeNb + lowS - 1
lowN := nodeNb
huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS-1].count
huffNode[lowS].parent, huffNode[lowS-1].parent = uint16(nodeNb), uint16(nodeNb)
huffNode[nodeNb].setCount(huffNode[lowS].count() + huffNode[lowS-1].count())
huffNode[lowS].setParent(nodeNb)
huffNode[lowS-1].setParent(nodeNb)
nodeNb++
lowS -= 2
for n := nodeNb; n <= nodeRoot; n++ {
huffNode[n].count = 1 << 30
huffNode[n].setCount(1 << 30)
}
// fake entry, strong barrier
huffNode0[0].count = 1 << 31
huffNode0[0].setCount(1 << 31)
// create parents
for nodeNb <= nodeRoot {
var n1, n2 int16
if huffNode0[lowS+1].count < huffNode0[lowN+1].count {
if huffNode0[lowS+1].count() < huffNode0[lowN+1].count() {
n1 = lowS
lowS--
} else {
n1 = lowN
lowN++
}
if huffNode0[lowS+1].count < huffNode0[lowN+1].count {
if huffNode0[lowS+1].count() < huffNode0[lowN+1].count() {
n2 = lowS
lowS--
} else {
@ -519,18 +511,19 @@ func (s *Scratch) buildCTable() error {
lowN++
}
huffNode[nodeNb].count = huffNode0[n1+1].count + huffNode0[n2+1].count
huffNode0[n1+1].parent, huffNode0[n2+1].parent = uint16(nodeNb), uint16(nodeNb)
huffNode[nodeNb].setCount(huffNode0[n1+1].count() + huffNode0[n2+1].count())
huffNode0[n1+1].setParent(nodeNb)
huffNode0[n2+1].setParent(nodeNb)
nodeNb++
}
// distribute weights (unlimited tree height)
huffNode[nodeRoot].nbBits = 0
huffNode[nodeRoot].setNbBits(0)
for n := nodeRoot - 1; n >= startNode; n-- {
huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1
huffNode[n].setNbBits(huffNode[huffNode[n].parent()].nbBits() + 1)
}
for n := uint16(0); n <= nonNullRank; n++ {
huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1
huffNode[n].setNbBits(huffNode[huffNode[n].parent()].nbBits() + 1)
}
s.actualTableLog = s.setMaxHeight(int(nonNullRank))
maxNbBits := s.actualTableLog
@ -542,7 +535,7 @@ func (s *Scratch) buildCTable() error {
var nbPerRank [tableLogMax + 1]uint16
var valPerRank [16]uint16
for _, v := range huffNode[:nonNullRank+1] {
nbPerRank[v.nbBits]++
nbPerRank[v.nbBits()]++
}
// determine stating value per rank
{
@ -557,7 +550,7 @@ func (s *Scratch) buildCTable() error {
// push nbBits per symbol, symbol order
for _, v := range huffNode[:nonNullRank+1] {
s.cTable[v.symbol].nBits = v.nbBits
s.cTable[v.symbol()].nBits = v.nbBits()
}
// assign value within rank, symbol order
@ -603,12 +596,12 @@ func (s *Scratch) huffSort() {
pos := rank[r].current
rank[r].current++
prev := nodes[(pos-1)&huffNodesMask]
for pos > rank[r].base && c > prev.count {
for pos > rank[r].base && c > prev.count() {
nodes[pos&huffNodesMask] = prev
pos--
prev = nodes[(pos-1)&huffNodesMask]
}
nodes[pos&huffNodesMask] = nodeElt{count: c, symbol: byte(n)}
nodes[pos&huffNodesMask] = makeNodeElt(c, byte(n))
}
}
@ -617,7 +610,7 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
huffNode := s.nodes[1 : huffNodesLen+1]
//huffNode = huffNode[: huffNodesLen]
largestBits := huffNode[lastNonNull].nbBits
largestBits := huffNode[lastNonNull].nbBits()
// early exit : no elt > maxNbBits
if largestBits <= maxNbBits {
@ -627,14 +620,14 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
baseCost := int(1) << (largestBits - maxNbBits)
n := uint32(lastNonNull)
for huffNode[n].nbBits > maxNbBits {
totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits))
huffNode[n].nbBits = maxNbBits
for huffNode[n].nbBits() > maxNbBits {
totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits()))
huffNode[n].setNbBits(maxNbBits)
n--
}
// n stops at huffNode[n].nbBits <= maxNbBits
for huffNode[n].nbBits == maxNbBits {
for huffNode[n].nbBits() == maxNbBits {
n--
}
// n end at index of smallest symbol using < maxNbBits
@ -655,10 +648,10 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
{
currentNbBits := maxNbBits
for pos := int(n); pos >= 0; pos-- {
if huffNode[pos].nbBits >= currentNbBits {
if huffNode[pos].nbBits() >= currentNbBits {
continue
}
currentNbBits = huffNode[pos].nbBits // < maxNbBits
currentNbBits = huffNode[pos].nbBits() // < maxNbBits
rankLast[maxNbBits-currentNbBits] = uint32(pos)
}
}
@ -675,8 +668,8 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
if lowPos == noSymbol {
break
}
highTotal := huffNode[highPos].count
lowTotal := 2 * huffNode[lowPos].count
highTotal := huffNode[highPos].count()
lowTotal := 2 * huffNode[lowPos].count()
if highTotal <= lowTotal {
break
}
@ -692,13 +685,14 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
// this rank is no longer empty
rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease]
}
huffNode[rankLast[nBitsToDecrease]].nbBits++
huffNode[rankLast[nBitsToDecrease]].setNbBits(1 +
huffNode[rankLast[nBitsToDecrease]].nbBits())
if rankLast[nBitsToDecrease] == 0 {
/* special case, reached largest symbol */
rankLast[nBitsToDecrease] = noSymbol
} else {
rankLast[nBitsToDecrease]--
if huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease {
if huffNode[rankLast[nBitsToDecrease]].nbBits() != maxNbBits-nBitsToDecrease {
rankLast[nBitsToDecrease] = noSymbol /* this rank is now empty */
}
}
@ -706,15 +700,15 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
for totalCost < 0 { /* Sometimes, cost correction overshoot */
if rankLast[1] == noSymbol { /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 (using maxNbBits) */
for huffNode[n].nbBits == maxNbBits {
for huffNode[n].nbBits() == maxNbBits {
n--
}
huffNode[n+1].nbBits--
huffNode[n+1].setNbBits(huffNode[n+1].nbBits() - 1)
rankLast[1] = n + 1
totalCost++
continue
}
huffNode[rankLast[1]+1].nbBits--
huffNode[rankLast[1]+1].setNbBits(huffNode[rankLast[1]+1].nbBits() - 1)
rankLast[1]++
totalCost++
}
@ -722,9 +716,26 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
return maxNbBits
}
type nodeElt struct {
count uint32
parent uint16
symbol byte
nbBits uint8
// A nodeElt is the fields
//
// count uint32
// parent uint16
// symbol byte
// nbBits uint8
//
// in some order, all squashed into an integer so that the compiler
// always loads and stores entire nodeElts instead of separate fields.
type nodeElt uint64
func makeNodeElt(count uint32, symbol byte) nodeElt {
return nodeElt(count) | nodeElt(symbol)<<48
}
func (e *nodeElt) count() uint32 { return uint32(*e) }
func (e *nodeElt) parent() uint16 { return uint16(*e >> 32) }
func (e *nodeElt) symbol() byte { return byte(*e >> 48) }
func (e *nodeElt) nbBits() uint8 { return uint8(*e >> 56) }
func (e *nodeElt) setCount(c uint32) { *e = (*e)&0xffffffff00000000 | nodeElt(c) }
func (e *nodeElt) setParent(p int16) { *e = (*e)&0xffff0000ffffffff | nodeElt(uint16(p))<<32 }
func (e *nodeElt) setNbBits(n uint8) { *e = (*e)&0x00ffffffffffffff | nodeElt(n)<<56 }

View file

@ -61,7 +61,7 @@ func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
b, err := fse.Decompress(in[:iSize], s.fse)
s.fse.Out = nil
if err != nil {
return s, nil, err
return s, nil, fmt.Errorf("fse decompress returned: %w", err)
}
if len(b) > 255 {
return s, nil, errors.New("corrupt input: output table too large")
@ -253,7 +253,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
switch d.actualTableLog {
case 8:
const shift = 8 - 8
const shift = 0
for br.off >= 4 {
br.fillFast()
v := dt[uint8(br.value>>(56+shift))]

View file

@ -4,360 +4,349 @@
// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
TEXT ·decompress4x_main_loop_amd64(SB), $0-8
XORQ DX, DX
// Preload values
MOVQ ctx+0(FP), AX
MOVBQZX 8(AX), DI
MOVQ 16(AX), SI
MOVQ 48(AX), BX
MOVQ 24(AX), R9
MOVQ 32(AX), R10
MOVQ (AX), R11
MOVQ 16(AX), BX
MOVQ 48(AX), SI
MOVQ 24(AX), R8
MOVQ 32(AX), R9
MOVQ (AX), R10
// Main loop
main_loop:
MOVQ SI, R8
CMPQ R8, BX
XORL DX, DX
CMPQ BX, SI
SETGE DL
// br0.fillFast32()
MOVQ 32(R11), R12
MOVBQZX 40(R11), R13
CMPQ R13, $0x20
MOVQ 32(R10), R11
MOVBQZX 40(R10), R12
CMPQ R12, $0x20
JBE skip_fill0
MOVQ 24(R11), AX
SUBQ $0x20, R13
MOVQ 24(R10), AX
SUBQ $0x20, R12
SUBQ $0x04, AX
MOVQ (R11), R14
MOVQ (R10), R13
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (AX)(R14*1), R14
MOVQ R13, CX
SHLQ CL, R14
MOVQ AX, 24(R11)
ORQ R14, R12
MOVL (AX)(R13*1), R13
MOVQ R12, CX
SHLQ CL, R13
MOVQ AX, 24(R10)
ORQ R13, R11
// exhausted = exhausted || (br0.off < 4)
CMPQ AX, $0x04
SETLT AL
ORB AL, DL
// exhausted += (br0.off < 4)
CMPQ AX, $0x04
ADCB $+0, DL
skip_fill0:
// val0 := br0.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v0 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br0.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// val1 := br0.peekTopBits(peekBits)
MOVQ DI, CX
MOVQ R12, R14
SHRQ CL, R14
MOVQ R11, R13
SHRQ CL, R13
// v1 := table[val1&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br0.advance(uint8(v1.entry))
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
MOVW AX, (R8)
MOVW AX, (BX)
// update the bitreader structure
MOVQ R12, 32(R11)
MOVB R13, 40(R11)
ADDQ R9, R8
MOVQ R11, 32(R10)
MOVB R12, 40(R10)
// br1.fillFast32()
MOVQ 80(R11), R12
MOVBQZX 88(R11), R13
CMPQ R13, $0x20
MOVQ 80(R10), R11
MOVBQZX 88(R10), R12
CMPQ R12, $0x20
JBE skip_fill1
MOVQ 72(R11), AX
SUBQ $0x20, R13
MOVQ 72(R10), AX
SUBQ $0x20, R12
SUBQ $0x04, AX
MOVQ 48(R11), R14
MOVQ 48(R10), R13
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (AX)(R14*1), R14
MOVQ R13, CX
SHLQ CL, R14
MOVQ AX, 72(R11)
ORQ R14, R12
MOVL (AX)(R13*1), R13
MOVQ R12, CX
SHLQ CL, R13
MOVQ AX, 72(R10)
ORQ R13, R11
// exhausted = exhausted || (br1.off < 4)
CMPQ AX, $0x04
SETLT AL
ORB AL, DL
// exhausted += (br1.off < 4)
CMPQ AX, $0x04
ADCB $+0, DL
skip_fill1:
// val0 := br1.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v0 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br1.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// val1 := br1.peekTopBits(peekBits)
MOVQ DI, CX
MOVQ R12, R14
SHRQ CL, R14
MOVQ R11, R13
SHRQ CL, R13
// v1 := table[val1&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br1.advance(uint8(v1.entry))
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
MOVW AX, (R8)
MOVW AX, (BX)(R8*1)
// update the bitreader structure
MOVQ R12, 80(R11)
MOVB R13, 88(R11)
ADDQ R9, R8
MOVQ R11, 80(R10)
MOVB R12, 88(R10)
// br2.fillFast32()
MOVQ 128(R11), R12
MOVBQZX 136(R11), R13
CMPQ R13, $0x20
MOVQ 128(R10), R11
MOVBQZX 136(R10), R12
CMPQ R12, $0x20
JBE skip_fill2
MOVQ 120(R11), AX
SUBQ $0x20, R13
MOVQ 120(R10), AX
SUBQ $0x20, R12
SUBQ $0x04, AX
MOVQ 96(R11), R14
MOVQ 96(R10), R13
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (AX)(R14*1), R14
MOVQ R13, CX
SHLQ CL, R14
MOVQ AX, 120(R11)
ORQ R14, R12
MOVL (AX)(R13*1), R13
MOVQ R12, CX
SHLQ CL, R13
MOVQ AX, 120(R10)
ORQ R13, R11
// exhausted = exhausted || (br2.off < 4)
CMPQ AX, $0x04
SETLT AL
ORB AL, DL
// exhausted += (br2.off < 4)
CMPQ AX, $0x04
ADCB $+0, DL
skip_fill2:
// val0 := br2.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v0 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br2.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// val1 := br2.peekTopBits(peekBits)
MOVQ DI, CX
MOVQ R12, R14
SHRQ CL, R14
MOVQ R11, R13
SHRQ CL, R13
// v1 := table[val1&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br2.advance(uint8(v1.entry))
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
MOVW AX, (R8)
MOVW AX, (BX)(R8*2)
// update the bitreader structure
MOVQ R12, 128(R11)
MOVB R13, 136(R11)
ADDQ R9, R8
MOVQ R11, 128(R10)
MOVB R12, 136(R10)
// br3.fillFast32()
MOVQ 176(R11), R12
MOVBQZX 184(R11), R13
CMPQ R13, $0x20
MOVQ 176(R10), R11
MOVBQZX 184(R10), R12
CMPQ R12, $0x20
JBE skip_fill3
MOVQ 168(R11), AX
SUBQ $0x20, R13
MOVQ 168(R10), AX
SUBQ $0x20, R12
SUBQ $0x04, AX
MOVQ 144(R11), R14
MOVQ 144(R10), R13
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (AX)(R14*1), R14
MOVQ R13, CX
SHLQ CL, R14
MOVQ AX, 168(R11)
ORQ R14, R12
MOVL (AX)(R13*1), R13
MOVQ R12, CX
SHLQ CL, R13
MOVQ AX, 168(R10)
ORQ R13, R11
// exhausted = exhausted || (br3.off < 4)
CMPQ AX, $0x04
SETLT AL
ORB AL, DL
// exhausted += (br3.off < 4)
CMPQ AX, $0x04
ADCB $+0, DL
skip_fill3:
// val0 := br3.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v0 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br3.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// val1 := br3.peekTopBits(peekBits)
MOVQ DI, CX
MOVQ R12, R14
SHRQ CL, R14
MOVQ R11, R13
SHRQ CL, R13
// v1 := table[val1&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br3.advance(uint8(v1.entry))
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// these two writes get coalesced
// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
MOVW AX, (R8)
LEAQ (R8)(R8*2), CX
MOVW AX, (BX)(CX*1)
// update the bitreader structure
MOVQ R12, 176(R11)
MOVB R13, 184(R11)
ADDQ $0x02, SI
MOVQ R11, 176(R10)
MOVB R12, 184(R10)
ADDQ $0x02, BX
TESTB DL, DL
JZ main_loop
MOVQ ctx+0(FP), AX
SUBQ 16(AX), SI
SHLQ $0x02, SI
MOVQ SI, 40(AX)
SUBQ 16(AX), BX
SHLQ $0x02, BX
MOVQ BX, 40(AX)
RET
// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
XORQ DX, DX
// Preload values
MOVQ ctx+0(FP), CX
MOVBQZX 8(CX), DI
MOVQ 16(CX), BX
MOVQ 48(CX), SI
MOVQ 24(CX), R9
MOVQ 32(CX), R10
MOVQ (CX), R11
MOVQ 24(CX), R8
MOVQ 32(CX), R9
MOVQ (CX), R10
// Main loop
main_loop:
MOVQ BX, R8
CMPQ R8, SI
XORL DX, DX
CMPQ BX, SI
SETGE DL
// br0.fillFast32()
MOVQ 32(R11), R12
MOVBQZX 40(R11), R13
CMPQ R13, $0x20
MOVQ 32(R10), R11
MOVBQZX 40(R10), R12
CMPQ R12, $0x20
JBE skip_fill0
MOVQ 24(R11), R14
SUBQ $0x20, R13
SUBQ $0x04, R14
MOVQ (R11), R15
MOVQ 24(R10), R13
SUBQ $0x20, R12
SUBQ $0x04, R13
MOVQ (R10), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (R14)(R15*1), R15
MOVQ R13, CX
SHLQ CL, R15
MOVQ R14, 24(R11)
ORQ R15, R12
MOVL (R13)(R14*1), R14
MOVQ R12, CX
SHLQ CL, R14
MOVQ R13, 24(R10)
ORQ R14, R11
// exhausted = exhausted || (br0.off < 4)
CMPQ R14, $0x04
SETLT AL
ORB AL, DL
// exhausted += (br0.off < 4)
CMPQ R13, $0x04
ADCB $+0, DL
skip_fill0:
// val0 := br0.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v0 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br0.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// val1 := br0.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v1 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br0.advance(uint8(v1.entry)
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
BSWAPL AX
// val2 := br0.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v2 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br0.advance(uint8(v2.entry)
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// val3 := br0.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v3 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br0.advance(uint8(v3.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
BSWAPL AX
// these four writes get coalesced
@ -365,88 +354,86 @@ skip_fill0:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
MOVL AX, (R8)
MOVL AX, (BX)
// update the bitreader structure
MOVQ R12, 32(R11)
MOVB R13, 40(R11)
ADDQ R9, R8
MOVQ R11, 32(R10)
MOVB R12, 40(R10)
// br1.fillFast32()
MOVQ 80(R11), R12
MOVBQZX 88(R11), R13
CMPQ R13, $0x20
MOVQ 80(R10), R11
MOVBQZX 88(R10), R12
CMPQ R12, $0x20
JBE skip_fill1
MOVQ 72(R11), R14
SUBQ $0x20, R13
SUBQ $0x04, R14
MOVQ 48(R11), R15
MOVQ 72(R10), R13
SUBQ $0x20, R12
SUBQ $0x04, R13
MOVQ 48(R10), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (R14)(R15*1), R15
MOVQ R13, CX
SHLQ CL, R15
MOVQ R14, 72(R11)
ORQ R15, R12
MOVL (R13)(R14*1), R14
MOVQ R12, CX
SHLQ CL, R14
MOVQ R13, 72(R10)
ORQ R14, R11
// exhausted = exhausted || (br1.off < 4)
CMPQ R14, $0x04
SETLT AL
ORB AL, DL
// exhausted += (br1.off < 4)
CMPQ R13, $0x04
ADCB $+0, DL
skip_fill1:
// val0 := br1.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v0 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br1.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// val1 := br1.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v1 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br1.advance(uint8(v1.entry)
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
BSWAPL AX
// val2 := br1.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v2 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br1.advance(uint8(v2.entry)
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// val3 := br1.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v3 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br1.advance(uint8(v3.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
BSWAPL AX
// these four writes get coalesced
@ -454,88 +441,86 @@ skip_fill1:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
MOVL AX, (R8)
MOVL AX, (BX)(R8*1)
// update the bitreader structure
MOVQ R12, 80(R11)
MOVB R13, 88(R11)
ADDQ R9, R8
MOVQ R11, 80(R10)
MOVB R12, 88(R10)
// br2.fillFast32()
MOVQ 128(R11), R12
MOVBQZX 136(R11), R13
CMPQ R13, $0x20
MOVQ 128(R10), R11
MOVBQZX 136(R10), R12
CMPQ R12, $0x20
JBE skip_fill2
MOVQ 120(R11), R14
SUBQ $0x20, R13
SUBQ $0x04, R14
MOVQ 96(R11), R15
MOVQ 120(R10), R13
SUBQ $0x20, R12
SUBQ $0x04, R13
MOVQ 96(R10), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (R14)(R15*1), R15
MOVQ R13, CX
SHLQ CL, R15
MOVQ R14, 120(R11)
ORQ R15, R12
MOVL (R13)(R14*1), R14
MOVQ R12, CX
SHLQ CL, R14
MOVQ R13, 120(R10)
ORQ R14, R11
// exhausted = exhausted || (br2.off < 4)
CMPQ R14, $0x04
SETLT AL
ORB AL, DL
// exhausted += (br2.off < 4)
CMPQ R13, $0x04
ADCB $+0, DL
skip_fill2:
// val0 := br2.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v0 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br2.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// val1 := br2.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v1 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br2.advance(uint8(v1.entry)
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
BSWAPL AX
// val2 := br2.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v2 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br2.advance(uint8(v2.entry)
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// val3 := br2.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v3 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br2.advance(uint8(v3.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
BSWAPL AX
// these four writes get coalesced
@ -543,88 +528,86 @@ skip_fill2:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
MOVL AX, (R8)
MOVL AX, (BX)(R8*2)
// update the bitreader structure
MOVQ R12, 128(R11)
MOVB R13, 136(R11)
ADDQ R9, R8
MOVQ R11, 128(R10)
MOVB R12, 136(R10)
// br3.fillFast32()
MOVQ 176(R11), R12
MOVBQZX 184(R11), R13
CMPQ R13, $0x20
MOVQ 176(R10), R11
MOVBQZX 184(R10), R12
CMPQ R12, $0x20
JBE skip_fill3
MOVQ 168(R11), R14
SUBQ $0x20, R13
SUBQ $0x04, R14
MOVQ 144(R11), R15
MOVQ 168(R10), R13
SUBQ $0x20, R12
SUBQ $0x04, R13
MOVQ 144(R10), R14
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVL (R14)(R15*1), R15
MOVQ R13, CX
SHLQ CL, R15
MOVQ R14, 168(R11)
ORQ R15, R12
MOVL (R13)(R14*1), R14
MOVQ R12, CX
SHLQ CL, R14
MOVQ R13, 168(R10)
ORQ R14, R11
// exhausted = exhausted || (br3.off < 4)
CMPQ R14, $0x04
SETLT AL
ORB AL, DL
// exhausted += (br3.off < 4)
CMPQ R13, $0x04
ADCB $+0, DL
skip_fill3:
// val0 := br3.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v0 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br3.advance(uint8(v0.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// val1 := br3.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v1 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br3.advance(uint8(v1.entry)
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
BSWAPL AX
// val2 := br3.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v2 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br3.advance(uint8(v2.entry)
MOVB CH, AH
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
// val3 := br3.peekTopBits(peekBits)
MOVQ R12, R14
MOVQ R11, R13
MOVQ DI, CX
SHRQ CL, R14
SHRQ CL, R13
// v3 := table[val0&mask]
MOVW (R10)(R14*2), CX
MOVW (R9)(R13*2), CX
// br3.advance(uint8(v3.entry)
MOVB CH, AL
SHLQ CL, R12
ADDB CL, R13
SHLQ CL, R11
ADDB CL, R12
BSWAPL AX
// these four writes get coalesced
@ -632,11 +615,12 @@ skip_fill3:
// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
MOVL AX, (R8)
LEAQ (R8)(R8*2), CX
MOVL AX, (BX)(CX*1)
// update the bitreader structure
MOVQ R12, 176(R11)
MOVB R13, 184(R11)
MOVQ R11, 176(R10)
MOVB R12, 184(R10)
ADDQ $0x04, BX
TESTB DL, DL
JZ main_loop
@ -652,7 +636,7 @@ TEXT ·decompress1x_main_loop_amd64(SB), $0-8
MOVQ 16(CX), DX
MOVQ 24(CX), BX
CMPQ BX, $0x04
JB error_max_decoded_size_exeeded
JB error_max_decoded_size_exceeded
LEAQ (DX)(BX*1), BX
MOVQ (CX), SI
MOVQ (SI), R8
@ -667,7 +651,7 @@ main_loop:
// Check if we have room for 4 bytes in the output buffer
LEAQ 4(DX), CX
CMPQ CX, BX
JGE error_max_decoded_size_exeeded
JGE error_max_decoded_size_exceeded
// Decode 4 values
CMPQ R11, $0x20
@ -744,7 +728,7 @@ loop_condition:
RET
// Report error
error_max_decoded_size_exeeded:
error_max_decoded_size_exceeded:
MOVQ ctx+0(FP), AX
MOVQ $-1, CX
MOVQ CX, 40(AX)
@ -757,7 +741,7 @@ TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
MOVQ 16(CX), DX
MOVQ 24(CX), BX
CMPQ BX, $0x04
JB error_max_decoded_size_exeeded
JB error_max_decoded_size_exceeded
LEAQ (DX)(BX*1), BX
MOVQ (CX), SI
MOVQ (SI), R8
@ -772,7 +756,7 @@ main_loop:
// Check if we have room for 4 bytes in the output buffer
LEAQ 4(DX), CX
CMPQ CX, BX
JGE error_max_decoded_size_exeeded
JGE error_max_decoded_size_exceeded
// Decode 4 values
CMPQ R11, $0x20
@ -839,7 +823,7 @@ loop_condition:
RET
// Report error
error_max_decoded_size_exeeded:
error_max_decoded_size_exceeded:
MOVQ ctx+0(FP), AX
MOVQ $-1, CX
MOVQ CX, 40(AX)

View file

@ -87,22 +87,32 @@ func emitCopy(dst []byte, offset, length int) int {
return i + 2
}
// extendMatch returns the largest k such that k <= len(src) and that
// src[i:i+k-j] and src[j:k] have the same contents.
//
// It assumes that:
//
// 0 <= i && i < j && j <= len(src)
func extendMatch(src []byte, i, j int) int {
for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
}
return j
}
func hash(u, shift uint32) uint32 {
return (u * 0x1e35a7bd) >> shift
}
// EncodeBlockInto exposes encodeBlock but checks dst size.
func EncodeBlockInto(dst, src []byte) (d int) {
if MaxEncodedLen(len(src)) > len(dst) {
return 0
}
// encodeBlock breaks on too big blocks, so split.
for len(src) > 0 {
p := src
src = nil
if len(p) > maxBlockSize {
p, src = p[:maxBlockSize], p[maxBlockSize:]
}
if len(p) < minNonLiteralBlockSize {
d += emitLiteral(dst[d:], p)
} else {
d += encodeBlock(dst[d:], p)
}
}
return d
}
// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.

View file

@ -304,7 +304,7 @@ import "github.com/klauspost/compress/zstd"
// Create a reader that caches decompressors.
// For this operation type we supply a nil Reader.
var decoder, _ = zstd.NewReader(nil, WithDecoderConcurrency(0))
var decoder, _ = zstd.NewReader(nil, zstd.WithDecoderConcurrency(0))
// Decompress a buffer. We don't supply a destination buffer,
// so it will be allocated by the decoder.

View file

@ -17,7 +17,6 @@ import (
// for aligning the input.
type bitReader struct {
in []byte
off uint // next byte to read is at in[off - 1]
value uint64 // Maybe use [16]byte, but shifting is awkward.
bitsRead uint8
}
@ -28,7 +27,6 @@ func (b *bitReader) init(in []byte) error {
return errors.New("corrupt stream: too short")
}
b.in = in
b.off = uint(len(in))
// The highest bit of the last byte indicates where to start
v := in[len(in)-1]
if v == 0 {
@ -69,21 +67,19 @@ func (b *bitReader) fillFast() {
if b.bitsRead < 32 {
return
}
// 2 bounds checks.
v := b.in[b.off-4:]
v = v[:4]
v := b.in[len(b.in)-4:]
b.in = b.in[:len(b.in)-4]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value = (b.value << 32) | uint64(low)
b.bitsRead -= 32
b.off -= 4
}
// fillFastStart() assumes the bitreader is empty and there is at least 8 bytes to read.
func (b *bitReader) fillFastStart() {
// Do single re-slice to avoid bounds checks.
b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
v := b.in[len(b.in)-8:]
b.in = b.in[:len(b.in)-8]
b.value = binary.LittleEndian.Uint64(v)
b.bitsRead = 0
b.off -= 8
}
// fill() will make sure at least 32 bits are available.
@ -91,25 +87,25 @@ func (b *bitReader) fill() {
if b.bitsRead < 32 {
return
}
if b.off >= 4 {
v := b.in[b.off-4:]
v = v[:4]
if len(b.in) >= 4 {
v := b.in[len(b.in)-4:]
b.in = b.in[:len(b.in)-4]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value = (b.value << 32) | uint64(low)
b.bitsRead -= 32
b.off -= 4
return
}
for b.off > 0 {
b.value = (b.value << 8) | uint64(b.in[b.off-1])
b.bitsRead -= 8
b.off--
b.bitsRead -= uint8(8 * len(b.in))
for len(b.in) > 0 {
b.value = (b.value << 8) | uint64(b.in[len(b.in)-1])
b.in = b.in[:len(b.in)-1]
}
}
// finished returns true if all bits have been read from the bit stream.
func (b *bitReader) finished() bool {
return b.off == 0 && b.bitsRead >= 64
return len(b.in) == 0 && b.bitsRead >= 64
}
// overread returns true if more bits have been requested than is on the stream.
@ -119,7 +115,7 @@ func (b *bitReader) overread() bool {
// remain returns the number of bits remaining.
func (b *bitReader) remain() uint {
return b.off*8 + 64 - uint(b.bitsRead)
return 8*uint(len(b.in)) + 64 - uint(b.bitsRead)
}
// close the bitstream and returns an error if out-of-buffer reads occurred.

View file

@ -97,12 +97,11 @@ func (b *bitWriter) flushAlign() {
// close will write the alignment bit and write the final byte(s)
// to the output.
func (b *bitWriter) close() error {
func (b *bitWriter) close() {
// End mark
b.addBits16Clean(1, 1)
// flush until next byte.
b.flushAlign()
return nil
}
// reset and continue writing by appending to out.

View file

@ -9,6 +9,7 @@ import (
"encoding/binary"
"errors"
"fmt"
"hash/crc32"
"io"
"os"
"path/filepath"
@ -82,8 +83,9 @@ type blockDec struct {
err error
// Check against this crc
checkCRC []byte
// Check against this crc, if hasCRC is true.
checkCRC uint32
hasCRC bool
// Frame to use for singlethreaded decoding.
// Should not be used by the decoder itself since parent may be another frame.
@ -191,16 +193,14 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
}
// Read block data.
if cap(b.dataStorage) < cSize {
if _, ok := br.(*byteBuf); !ok && cap(b.dataStorage) < cSize {
// byteBuf doesn't need a destination buffer.
if b.lowMem || cSize > maxCompressedBlockSize {
b.dataStorage = make([]byte, 0, cSize+compressedBlockOverAlloc)
} else {
b.dataStorage = make([]byte, 0, maxCompressedBlockSizeAlloc)
}
}
if cap(b.dst) <= maxSize {
b.dst = make([]byte, 0, maxSize+1)
}
b.data, err = br.readBig(cSize, b.dataStorage)
if err != nil {
if debugDecoder {
@ -209,6 +209,9 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
}
return err
}
if cap(b.dst) <= maxSize {
b.dst = make([]byte, 0, maxSize+1)
}
return nil
}
@ -440,6 +443,9 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
}
}
var err error
if debugDecoder {
println("huff table input:", len(literals), "CRC:", crc32.ChecksumIEEE(literals))
}
huff, literals, err = huff0.ReadTable(literals, huff)
if err != nil {
println("reading huffman table:", err)
@ -586,7 +592,7 @@ func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) {
}
seq.fse.setRLE(symb)
if debugDecoder {
printf("RLE set to %+v, code: %v", symb, v)
printf("RLE set to 0x%x, code: %v", symb, v)
}
case compModeFSE:
println("Reading table for", tableIndex(i))

View file

@ -361,14 +361,21 @@ func (b *blockEnc) encodeLits(lits []byte, raw bool) error {
if len(lits) >= 1024 {
// Use 4 Streams.
out, reUsed, err = huff0.Compress4X(lits, b.litEnc)
} else if len(lits) > 32 {
} else if len(lits) > 16 {
// Use 1 stream
single = true
out, reUsed, err = huff0.Compress1X(lits, b.litEnc)
} else {
err = huff0.ErrIncompressible
}
if err == nil && len(out)+5 > len(lits) {
// If we are close, we may still be worse or equal to raw.
var lh literalsHeader
lh.setSizes(len(out), len(lits), single)
if len(out)+lh.size() >= len(lits) {
err = huff0.ErrIncompressible
}
}
switch err {
case huff0.ErrIncompressible:
if debugEncoder {
@ -473,7 +480,7 @@ func (b *blockEnc) encode(org []byte, raw, rawAllLits bool) error {
return b.encodeLits(b.literals, rawAllLits)
}
// We want some difference to at least account for the headers.
saved := b.size - len(b.literals) - (b.size >> 5)
saved := b.size - len(b.literals) - (b.size >> 6)
if saved < 16 {
if org == nil {
return errIncompressible
@ -503,7 +510,7 @@ func (b *blockEnc) encode(org []byte, raw, rawAllLits bool) error {
if len(b.literals) >= 1024 && !raw {
// Use 4 Streams.
out, reUsed, err = huff0.Compress4X(b.literals, b.litEnc)
} else if len(b.literals) > 32 && !raw {
} else if len(b.literals) > 16 && !raw {
// Use 1 stream
single = true
out, reUsed, err = huff0.Compress1X(b.literals, b.litEnc)
@ -511,6 +518,17 @@ func (b *blockEnc) encode(org []byte, raw, rawAllLits bool) error {
err = huff0.ErrIncompressible
}
if err == nil && len(out)+5 > len(b.literals) {
// If we are close, we may still be worse or equal to raw.
var lh literalsHeader
lh.setSize(len(b.literals))
szRaw := lh.size()
lh.setSizes(len(out), len(b.literals), single)
szComp := lh.size()
if len(out)+szComp >= len(b.literals)+szRaw {
err = huff0.ErrIncompressible
}
}
switch err {
case huff0.ErrIncompressible:
lh.setType(literalsBlockRaw)
@ -773,16 +791,16 @@ func (b *blockEnc) encode(org []byte, raw, rawAllLits bool) error {
ml.flush(mlEnc.actualTableLog)
of.flush(ofEnc.actualTableLog)
ll.flush(llEnc.actualTableLog)
err = wr.close()
if err != nil {
return err
}
wr.close()
b.output = wr.out
// Maybe even add a bigger margin.
if len(b.output)-3-bhOffset >= b.size {
// Maybe even add a bigger margin.
// Discard and encode as raw block.
b.output = b.encodeRawTo(b.output[:bhOffset], org)
b.popOffsets()
b.litEnc.Reuse = huff0.ReusePolicyNone
return errIncompressible
return nil
}
// Size is output minus block header.

View file

@ -54,7 +54,7 @@ func (b *byteBuf) readBig(n int, dst []byte) ([]byte, error) {
func (b *byteBuf) readByte() (byte, error) {
bb := *b
if len(bb) < 1 {
return 0, nil
return 0, io.ErrUnexpectedEOF
}
r := bb[0]
*b = bb[1:]
@ -109,7 +109,7 @@ func (r *readerWrapper) readBig(n int, dst []byte) ([]byte, error) {
}
func (r *readerWrapper) readByte() (byte, error) {
n2, err := r.r.Read(r.tmp[:1])
n2, err := io.ReadFull(r.r, r.tmp[:1])
if err != nil {
if err == io.EOF {
err = io.ErrUnexpectedEOF

View file

@ -4,7 +4,6 @@
package zstd
import (
"bytes"
"encoding/binary"
"errors"
"io"
@ -102,8 +101,8 @@ func (h *Header) Decode(in []byte) error {
}
h.HeaderSize += 4
b, in := in[:4], in[4:]
if !bytes.Equal(b, frameMagic) {
if !bytes.Equal(b[1:4], skippableFrameMagic) || b[0]&0xf0 != 0x50 {
if string(b) != frameMagic {
if string(b[1:4]) != skippableFrameMagic || b[0]&0xf0 != 0x50 {
return ErrMagicMismatch
}
if len(in) < 4 {
@ -153,7 +152,7 @@ func (h *Header) Decode(in []byte) error {
}
b, in = in[:size], in[size:]
h.HeaderSize += int(size)
switch size {
switch len(b) {
case 1:
h.DictionaryID = uint32(b[0])
case 2:
@ -183,7 +182,7 @@ func (h *Header) Decode(in []byte) error {
}
b, in = in[:fcsSize], in[fcsSize:]
h.HeaderSize += int(fcsSize)
switch fcsSize {
switch len(b) {
case 1:
h.FrameContentSize = uint64(b[0])
case 2:

View file

@ -5,7 +5,6 @@
package zstd
import (
"bytes"
"context"
"encoding/binary"
"io"
@ -41,8 +40,7 @@ type Decoder struct {
frame *frameDec
// Custom dictionaries.
// Always uses copies.
dicts map[uint32]dict
dicts map[uint32]*dict
// streamWg is the waitgroup for all streams
streamWg sync.WaitGroup
@ -104,7 +102,7 @@ func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) {
}
// Transfer option dicts.
d.dicts = make(map[uint32]dict, len(d.o.dicts))
d.dicts = make(map[uint32]*dict, len(d.o.dicts))
for _, dc := range d.o.dicts {
d.dicts[dc.id] = dc
}
@ -342,15 +340,8 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
}
return dst, err
}
if frame.DictionaryID != nil {
dict, ok := d.dicts[*frame.DictionaryID]
if !ok {
return nil, ErrUnknownDictionary
}
if debugDecoder {
println("setting dict", frame.DictionaryID)
}
frame.history.setDict(&dict)
if err = d.setDict(frame); err != nil {
return nil, err
}
if frame.WindowSize > d.o.maxWindowSize {
if debugDecoder {
@ -459,26 +450,23 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) {
println("got", len(d.current.b), "bytes, error:", d.current.err, "data crc:", tmp)
}
if !d.o.ignoreChecksum && len(next.b) > 0 {
n, err := d.current.crc.Write(next.b)
if err == nil {
if n != len(next.b) {
d.current.err = io.ErrShortWrite
}
}
if d.o.ignoreChecksum {
return true
}
if next.err == nil && next.d != nil && len(next.d.checkCRC) != 0 {
got := d.current.crc.Sum64()
var tmp [4]byte
binary.LittleEndian.PutUint32(tmp[:], uint32(got))
if !d.o.ignoreChecksum && !bytes.Equal(tmp[:], next.d.checkCRC) {
if len(next.b) > 0 {
d.current.crc.Write(next.b)
}
if next.err == nil && next.d != nil && next.d.hasCRC {
got := uint32(d.current.crc.Sum64())
if got != next.d.checkCRC {
if debugDecoder {
println("CRC Check Failed:", tmp[:], " (got) !=", next.d.checkCRC, "(on stream)")
printf("CRC Check Failed: %08x (got) != %08x (on stream)\n", got, next.d.checkCRC)
}
d.current.err = ErrCRCMismatch
} else {
if debugDecoder {
println("CRC ok", tmp[:])
printf("CRC ok %08x\n", got)
}
}
}
@ -494,18 +482,12 @@ func (d *Decoder) nextBlockSync() (ok bool) {
if !d.syncStream.inFrame {
d.frame.history.reset()
d.current.err = d.frame.reset(&d.syncStream.br)
if d.current.err == nil {
d.current.err = d.setDict(d.frame)
}
if d.current.err != nil {
return false
}
if d.frame.DictionaryID != nil {
dict, ok := d.dicts[*d.frame.DictionaryID]
if !ok {
d.current.err = ErrUnknownDictionary
return false
} else {
d.frame.history.setDict(&dict)
}
}
if d.frame.WindowSize > d.o.maxDecodedSize || d.frame.WindowSize > d.o.maxWindowSize {
d.current.err = ErrDecoderSizeExceeded
return false
@ -864,13 +846,8 @@ decodeStream:
if debugDecoder && err != nil {
println("Frame decoder returned", err)
}
if err == nil && frame.DictionaryID != nil {
dict, ok := d.dicts[*frame.DictionaryID]
if !ok {
err = ErrUnknownDictionary
} else {
frame.history.setDict(&dict)
}
if err == nil {
err = d.setDict(frame)
}
if err == nil && d.frame.WindowSize > d.o.maxWindowSize {
if debugDecoder {
@ -918,18 +895,22 @@ decodeStream:
println("next block returned error:", err)
}
dec.err = err
dec.checkCRC = nil
dec.hasCRC = false
if dec.Last && frame.HasCheckSum && err == nil {
crc, err := frame.rawInput.readSmall(4)
if err != nil {
if len(crc) < 4 {
if err == nil {
err = io.ErrUnexpectedEOF
}
println("CRC missing?", err)
dec.err = err
}
var tmp [4]byte
copy(tmp[:], crc)
dec.checkCRC = tmp[:]
if debugDecoder {
println("found crc to check:", dec.checkCRC)
} else {
dec.checkCRC = binary.LittleEndian.Uint32(crc)
dec.hasCRC = true
if debugDecoder {
printf("found crc to check: %08x\n", dec.checkCRC)
}
}
}
err = dec.err
@ -948,3 +929,20 @@ decodeStream:
hist.reset()
d.frame.history.b = frameHistCache
}
func (d *Decoder) setDict(frame *frameDec) (err error) {
dict, ok := d.dicts[frame.DictionaryID]
if ok {
if debugDecoder {
println("setting dict", frame.DictionaryID)
}
frame.history.setDict(dict)
} else if frame.DictionaryID != 0 {
// A zero or missing dictionary id is ambiguous:
// either dictionary zero, or no dictionary. In particular,
// zstd --patch-from uses this id for the source file,
// so only return an error if the dictionary id is not zero.
err = ErrUnknownDictionary
}
return err
}

View file

@ -6,6 +6,8 @@ package zstd
import (
"errors"
"fmt"
"math/bits"
"runtime"
)
@ -18,7 +20,7 @@ type decoderOptions struct {
concurrent int
maxDecodedSize uint64
maxWindowSize uint64
dicts []dict
dicts []*dict
ignoreChecksum bool
limitToCap bool
decodeBufsBelow int
@ -85,7 +87,13 @@ func WithDecoderMaxMemory(n uint64) DOption {
}
// WithDecoderDicts allows to register one or more dictionaries for the decoder.
// If several dictionaries with the same ID is provided the last one will be used.
//
// Each slice in dict must be in the [dictionary format] produced by
// "zstd --train" from the Zstandard reference implementation.
//
// If several dictionaries with the same ID are provided, the last one will be used.
//
// [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
func WithDecoderDicts(dicts ...[]byte) DOption {
return func(o *decoderOptions) error {
for _, b := range dicts {
@ -93,12 +101,24 @@ func WithDecoderDicts(dicts ...[]byte) DOption {
if err != nil {
return err
}
o.dicts = append(o.dicts, *d)
o.dicts = append(o.dicts, d)
}
return nil
}
}
// WithDecoderDictRaw registers a dictionary that may be used by the decoder.
// The slice content can be arbitrary data.
func WithDecoderDictRaw(id uint32, content []byte) DOption {
return func(o *decoderOptions) error {
if bits.UintSize > 32 && uint(len(content)) > dictMaxLength {
return fmt.Errorf("dictionary of size %d > 2GiB too large", len(content))
}
o.dicts = append(o.dicts, &dict{id: id, content: content, offsets: [3]int{1, 4, 8}})
return nil
}
}
// WithDecoderMaxWindow allows to set a maximum window size for decodes.
// This allows rejecting packets that will cause big memory usage.
// The Decoder will likely allocate more memory based on the WithDecoderLowmem setting.

View file

@ -6,6 +6,8 @@ import (
"errors"
"fmt"
"io"
"math"
"sort"
"github.com/klauspost/compress/huff0"
)
@ -15,12 +17,14 @@ type dict struct {
litEnc *huff0.Scratch
llDec, ofDec, mlDec sequenceDec
//llEnc, ofEnc, mlEnc []*fseEncoder
offsets [3]int
content []byte
offsets [3]int
content []byte
}
var dictMagic = [4]byte{0x37, 0xa4, 0x30, 0xec}
const dictMagic = "\x37\xa4\x30\xec"
// Maximum dictionary size for the reference implementation (1.5.3) is 2 GiB.
const dictMaxLength = 1 << 31
// ID returns the dictionary id or 0 if d is nil.
func (d *dict) ID() uint32 {
@ -30,14 +34,38 @@ func (d *dict) ID() uint32 {
return d.id
}
// DictContentSize returns the dictionary content size or 0 if d is nil.
func (d *dict) DictContentSize() int {
// ContentSize returns the dictionary content size or 0 if d is nil.
func (d *dict) ContentSize() int {
if d == nil {
return 0
}
return len(d.content)
}
// Content returns the dictionary content.
func (d *dict) Content() []byte {
if d == nil {
return nil
}
return d.content
}
// Offsets returns the initial offsets.
func (d *dict) Offsets() [3]int {
if d == nil {
return [3]int{}
}
return d.offsets
}
// LitEncoder returns the literal encoder.
func (d *dict) LitEncoder() *huff0.Scratch {
if d == nil {
return nil
}
return d.litEnc
}
// Load a dictionary as described in
// https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format
func loadDict(b []byte) (*dict, error) {
@ -50,7 +78,7 @@ func loadDict(b []byte) (*dict, error) {
ofDec: sequenceDec{fse: &fseDecoder{}},
mlDec: sequenceDec{fse: &fseDecoder{}},
}
if !bytes.Equal(b[:4], dictMagic[:]) {
if string(b[:4]) != dictMagic {
return nil, ErrMagicMismatch
}
d.id = binary.LittleEndian.Uint32(b[4:8])
@ -62,7 +90,7 @@ func loadDict(b []byte) (*dict, error) {
var err error
d.litEnc, b, err = huff0.ReadTable(b[8:], nil)
if err != nil {
return nil, err
return nil, fmt.Errorf("loading literal table: %w", err)
}
d.litEnc.Reuse = huff0.ReusePolicyMust
@ -120,3 +148,387 @@ func loadDict(b []byte) (*dict, error) {
return &d, nil
}
// InspectDictionary loads a zstd dictionary and provides functions to inspect the content.
func InspectDictionary(b []byte) (interface {
ID() uint32
ContentSize() int
Content() []byte
Offsets() [3]int
LitEncoder() *huff0.Scratch
}, error) {
initPredefined()
d, err := loadDict(b)
return d, err
}
type BuildDictOptions struct {
// Dictionary ID.
ID uint32
// Content to use to create dictionary tables.
Contents [][]byte
// History to use for all blocks.
History []byte
// Offsets to use.
Offsets [3]int
// CompatV155 will make the dictionary compatible with Zstd v1.5.5 and earlier.
// See https://github.com/facebook/zstd/issues/3724
CompatV155 bool
// Use the specified encoder level.
// The dictionary will be built using the specified encoder level,
// which will reflect speed and make the dictionary tailored for that level.
// If not set SpeedBestCompression will be used.
Level EncoderLevel
// DebugOut will write stats and other details here if set.
DebugOut io.Writer
}
func BuildDict(o BuildDictOptions) ([]byte, error) {
initPredefined()
hist := o.History
contents := o.Contents
debug := o.DebugOut != nil
println := func(args ...interface{}) {
if o.DebugOut != nil {
fmt.Fprintln(o.DebugOut, args...)
}
}
printf := func(s string, args ...interface{}) {
if o.DebugOut != nil {
fmt.Fprintf(o.DebugOut, s, args...)
}
}
print := func(args ...interface{}) {
if o.DebugOut != nil {
fmt.Fprint(o.DebugOut, args...)
}
}
if int64(len(hist)) > dictMaxLength {
return nil, fmt.Errorf("dictionary of size %d > %d", len(hist), int64(dictMaxLength))
}
if len(hist) < 8 {
return nil, fmt.Errorf("dictionary of size %d < %d", len(hist), 8)
}
if len(contents) == 0 {
return nil, errors.New("no content provided")
}
d := dict{
id: o.ID,
litEnc: nil,
llDec: sequenceDec{},
ofDec: sequenceDec{},
mlDec: sequenceDec{},
offsets: o.Offsets,
content: hist,
}
block := blockEnc{lowMem: false}
block.init()
enc := encoder(&bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(maxMatchLen), bufferReset: math.MaxInt32 - int32(maxMatchLen*2), lowMem: false}})
if o.Level != 0 {
eOpts := encoderOptions{
level: o.Level,
blockSize: maxMatchLen,
windowSize: maxMatchLen,
dict: &d,
lowMem: false,
}
enc = eOpts.encoder()
} else {
o.Level = SpeedBestCompression
}
var (
remain [256]int
ll [256]int
ml [256]int
of [256]int
)
addValues := func(dst *[256]int, src []byte) {
for _, v := range src {
dst[v]++
}
}
addHist := func(dst *[256]int, src *[256]uint32) {
for i, v := range src {
dst[i] += int(v)
}
}
seqs := 0
nUsed := 0
litTotal := 0
newOffsets := make(map[uint32]int, 1000)
for _, b := range contents {
block.reset(nil)
if len(b) < 8 {
continue
}
nUsed++
enc.Reset(&d, true)
enc.Encode(&block, b)
addValues(&remain, block.literals)
litTotal += len(block.literals)
seqs += len(block.sequences)
block.genCodes()
addHist(&ll, block.coders.llEnc.Histogram())
addHist(&ml, block.coders.mlEnc.Histogram())
addHist(&of, block.coders.ofEnc.Histogram())
for i, seq := range block.sequences {
if i > 3 {
break
}
offset := seq.offset
if offset == 0 {
continue
}
if offset > 3 {
newOffsets[offset-3]++
} else {
newOffsets[uint32(o.Offsets[offset-1])]++
}
}
}
// Find most used offsets.
var sortedOffsets []uint32
for k := range newOffsets {
sortedOffsets = append(sortedOffsets, k)
}
sort.Slice(sortedOffsets, func(i, j int) bool {
a, b := sortedOffsets[i], sortedOffsets[j]
if a == b {
// Prefer the longer offset
return sortedOffsets[i] > sortedOffsets[j]
}
return newOffsets[sortedOffsets[i]] > newOffsets[sortedOffsets[j]]
})
if len(sortedOffsets) > 3 {
if debug {
print("Offsets:")
for i, v := range sortedOffsets {
if i > 20 {
break
}
printf("[%d: %d],", v, newOffsets[v])
}
println("")
}
sortedOffsets = sortedOffsets[:3]
}
for i, v := range sortedOffsets {
o.Offsets[i] = int(v)
}
if debug {
println("New repeat offsets", o.Offsets)
}
if nUsed == 0 || seqs == 0 {
return nil, fmt.Errorf("%d blocks, %d sequences found", nUsed, seqs)
}
if debug {
println("Sequences:", seqs, "Blocks:", nUsed, "Literals:", litTotal)
}
if seqs/nUsed < 512 {
// Use 512 as minimum.
nUsed = seqs / 512
}
copyHist := func(dst *fseEncoder, src *[256]int) ([]byte, error) {
hist := dst.Histogram()
var maxSym uint8
var maxCount int
var fakeLength int
for i, v := range src {
if v > 0 {
v = v / nUsed
if v == 0 {
v = 1
}
}
if v > maxCount {
maxCount = v
}
if v != 0 {
maxSym = uint8(i)
}
fakeLength += v
hist[i] = uint32(v)
}
dst.HistogramFinished(maxSym, maxCount)
dst.reUsed = false
dst.useRLE = false
err := dst.normalizeCount(fakeLength)
if err != nil {
return nil, err
}
if debug {
println("RAW:", dst.count[:maxSym+1], "NORM:", dst.norm[:maxSym+1], "LEN:", fakeLength)
}
return dst.writeCount(nil)
}
if debug {
print("Literal lengths: ")
}
llTable, err := copyHist(block.coders.llEnc, &ll)
if err != nil {
return nil, err
}
if debug {
print("Match lengths: ")
}
mlTable, err := copyHist(block.coders.mlEnc, &ml)
if err != nil {
return nil, err
}
if debug {
print("Offsets: ")
}
ofTable, err := copyHist(block.coders.ofEnc, &of)
if err != nil {
return nil, err
}
// Literal table
avgSize := litTotal
if avgSize > huff0.BlockSizeMax/2 {
avgSize = huff0.BlockSizeMax / 2
}
huffBuff := make([]byte, 0, avgSize)
// Target size
div := litTotal / avgSize
if div < 1 {
div = 1
}
if debug {
println("Huffman weights:")
}
for i, n := range remain[:] {
if n > 0 {
n = n / div
// Allow all entries to be represented.
if n == 0 {
n = 1
}
huffBuff = append(huffBuff, bytes.Repeat([]byte{byte(i)}, n)...)
if debug {
printf("[%d: %d], ", i, n)
}
}
}
if o.CompatV155 && remain[255]/div == 0 {
huffBuff = append(huffBuff, 255)
}
scratch := &huff0.Scratch{TableLog: 11}
for tries := 0; tries < 255; tries++ {
scratch = &huff0.Scratch{TableLog: 11}
_, _, err = huff0.Compress1X(huffBuff, scratch)
if err == nil {
break
}
if debug {
printf("Try %d: Huffman error: %v\n", tries+1, err)
}
huffBuff = huffBuff[:0]
if tries == 250 {
if debug {
println("Huffman: Bailing out with predefined table")
}
// Bail out.... Just generate something
huffBuff = append(huffBuff, bytes.Repeat([]byte{255}, 10000)...)
for i := 0; i < 128; i++ {
huffBuff = append(huffBuff, byte(i))
}
continue
}
if errors.Is(err, huff0.ErrIncompressible) {
// Try truncating least common.
for i, n := range remain[:] {
if n > 0 {
n = n / (div * (i + 1))
if n > 0 {
huffBuff = append(huffBuff, bytes.Repeat([]byte{byte(i)}, n)...)
}
}
}
if o.CompatV155 && len(huffBuff) > 0 && huffBuff[len(huffBuff)-1] != 255 {
huffBuff = append(huffBuff, 255)
}
if len(huffBuff) == 0 {
huffBuff = append(huffBuff, 0, 255)
}
}
if errors.Is(err, huff0.ErrUseRLE) {
for i, n := range remain[:] {
n = n / (div * (i + 1))
// Allow all entries to be represented.
if n == 0 {
n = 1
}
huffBuff = append(huffBuff, bytes.Repeat([]byte{byte(i)}, n)...)
}
}
}
var out bytes.Buffer
out.Write([]byte(dictMagic))
out.Write(binary.LittleEndian.AppendUint32(nil, o.ID))
out.Write(scratch.OutTable)
if debug {
println("huff table:", len(scratch.OutTable), "bytes")
println("of table:", len(ofTable), "bytes")
println("ml table:", len(mlTable), "bytes")
println("ll table:", len(llTable), "bytes")
}
out.Write(ofTable)
out.Write(mlTable)
out.Write(llTable)
out.Write(binary.LittleEndian.AppendUint32(nil, uint32(o.Offsets[0])))
out.Write(binary.LittleEndian.AppendUint32(nil, uint32(o.Offsets[1])))
out.Write(binary.LittleEndian.AppendUint32(nil, uint32(o.Offsets[2])))
out.Write(hist)
if debug {
_, err := loadDict(out.Bytes())
if err != nil {
panic(err)
}
i, err := InspectDictionary(out.Bytes())
if err != nil {
panic(err)
}
println("ID:", i.ID())
println("Content size:", i.ContentSize())
println("Encoder:", i.LitEncoder() != nil)
println("Offsets:", i.Offsets())
var totalSize int
for _, b := range contents {
totalSize += len(b)
}
encWith := func(opts ...EOption) int {
enc, err := NewWriter(nil, opts...)
if err != nil {
panic(err)
}
defer enc.Close()
var dst []byte
var totalSize int
for _, b := range contents {
dst = enc.EncodeAll(b, dst[:0])
totalSize += len(dst)
}
return totalSize
}
plain := encWith(WithEncoderLevel(o.Level))
withDict := encWith(WithEncoderLevel(o.Level), WithEncoderDict(out.Bytes()))
println("Input size:", totalSize)
println("Plain Compressed:", plain)
println("Dict Compressed:", withDict)
println("Saved:", plain-withDict, (plain-withDict)/len(contents), "bytes per input (rounded down)")
}
return out.Bytes(), nil
}

View file

@ -16,6 +16,7 @@ type fastBase struct {
cur int32
// maximum offset. Should be at least 2x block size.
maxMatchOff int32
bufferReset int32
hist []byte
crc *xxhash.Digest
tmp [8]byte
@ -56,8 +57,8 @@ func (e *fastBase) Block() *blockEnc {
}
func (e *fastBase) addBlock(src []byte) int32 {
if debugAsserts && e.cur > bufferReset {
panic(fmt.Sprintf("ecur (%d) > buffer reset (%d)", e.cur, bufferReset))
if debugAsserts && e.cur > e.bufferReset {
panic(fmt.Sprintf("ecur (%d) > buffer reset (%d)", e.cur, e.bufferReset))
}
// check if we have space already
if len(e.hist)+len(src) > cap(e.hist) {
@ -126,24 +127,7 @@ func (e *fastBase) matchlen(s, t int32, src []byte) int32 {
panic(fmt.Sprintf("len(src)-s (%d) > maxCompressedBlockSize (%d)", len(src)-int(s), maxCompressedBlockSize))
}
}
a := src[s:]
b := src[t:]
b = b[:len(a)]
end := int32((len(a) >> 3) << 3)
for i := int32(0); i < end; i += 8 {
if diff := load6432(a, i) ^ load6432(b, i); diff != 0 {
return i + int32(bits.TrailingZeros64(diff)>>3)
}
}
a = a[end:]
b = b[end:]
for i := range a {
if a[i] != b[i] {
return int32(i) + end
}
}
return int32(len(a)) + end
return int32(matchLen(src[s:], src[t:]))
}
// Reset the encoding table.
@ -160,18 +144,19 @@ func (e *fastBase) resetBase(d *dict, singleBlock bool) {
} else {
e.crc.Reset()
}
e.blk.dictLitEnc = nil
if d != nil {
low := e.lowMem
if singleBlock {
e.lowMem = true
}
e.ensureHist(d.DictContentSize() + maxCompressedBlockSize)
e.ensureHist(d.ContentSize() + maxCompressedBlockSize)
e.lowMem = low
}
// We offset current position so everything will be out of reach.
// If above reset line, history will be purged.
if e.cur < bufferReset {
if e.cur < e.bufferReset {
e.cur += e.maxMatchOff + int32(len(e.hist))
}
e.hist = e.hist[:0]

View file

@ -32,10 +32,9 @@ type match struct {
length int32
rep int32
est int32
_ [12]byte // Aligned size to cache line: 4+4+4+4+4 bytes + 12 bytes padding = 32 bytes
}
const highScore = 25000
const highScore = maxMatchLen * 8
// estBits will estimate output bits from predefined tables.
func (m *match) estBits(bitsPerByte int32) {
@ -85,14 +84,10 @@ func (e *bestFastEncoder) Encode(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
for e.cur >= bufferReset {
for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
for i := range e.table[:] {
e.table[i] = prevEntry{}
}
for i := range e.longTable[:] {
e.longTable[i] = prevEntry{}
}
e.table = [bestShortTableSize]prevEntry{}
e.longTable = [bestLongTableSize]prevEntry{}
e.cur = e.maxMatchOff
break
}
@ -164,7 +159,6 @@ func (e *bestFastEncoder) Encode(blk *blockEnc, src []byte) {
// nextEmit is where in src the next emitLiteral should start from.
nextEmit := s
cv := load6432(src, s)
// Relative offsets
offset1 := int32(blk.recentOffsets[0])
@ -178,7 +172,6 @@ func (e *bestFastEncoder) Encode(blk *blockEnc, src []byte) {
blk.literals = append(blk.literals, src[nextEmit:until]...)
s.litLen = uint32(until - nextEmit)
}
_ = addLiterals
if debugEncoder {
println("recent offsets:", blk.recentOffsets)
@ -193,49 +186,97 @@ encodeLoop:
panic("offset0 was 0")
}
bestOf := func(a, b match) match {
if a.est+(a.s-b.s)*bitsPerByte>>10 < b.est+(b.s-a.s)*bitsPerByte>>10 {
return a
}
return b
}
const goodEnough = 100
const goodEnough = 250
cv := load6432(src, s)
nextHashL := hashLen(cv, bestLongTableBits, bestLongLen)
nextHashS := hashLen(cv, bestShortTableBits, bestShortLen)
candidateL := e.longTable[nextHashL]
candidateS := e.table[nextHashS]
matchAt := func(offset int32, s int32, first uint32, rep int32) match {
if s-offset >= e.maxMatchOff || load3232(src, offset) != first {
return match{s: s, est: highScore}
// Set m to a match at offset if it looks like that will improve compression.
improve := func(m *match, offset int32, s int32, first uint32, rep int32) {
delta := s - offset
if delta >= e.maxMatchOff || delta <= 0 || load3232(src, offset) != first {
return
}
if debugAsserts {
if offset >= s {
panic(fmt.Sprintf("offset: %d - s:%d - rep: %d - cur :%d - max: %d", offset, s, rep, e.cur, e.maxMatchOff))
}
if !bytes.Equal(src[s:s+4], src[offset:offset+4]) {
panic(fmt.Sprintf("first match mismatch: %v != %v, first: %08x", src[s:s+4], src[offset:offset+4], first))
}
}
m := match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep}
m.estBits(bitsPerByte)
return m
// Try to quick reject if we already have a long match.
if m.length > 16 {
left := len(src) - int(m.s+m.length)
// If we are too close to the end, keep as is.
if left <= 0 {
return
}
checkLen := m.length - (s - m.s) - 8
if left > 2 && checkLen > 4 {
// Check 4 bytes, 4 bytes from the end of the current match.
a := load3232(src, offset+checkLen)
b := load3232(src, s+checkLen)
if a != b {
return
}
}
}
l := 4 + e.matchlen(s+4, offset+4, src)
if rep < 0 {
// Extend candidate match backwards as far as possible.
tMin := s - e.maxMatchOff
if tMin < 0 {
tMin = 0
}
for offset > tMin && s > nextEmit && src[offset-1] == src[s-1] && l < maxMatchLength {
s--
offset--
l++
}
}
cand := match{offset: offset, s: s, length: l, rep: rep}
cand.estBits(bitsPerByte)
if m.est >= highScore || cand.est-m.est+(cand.s-m.s)*bitsPerByte>>10 < 0 {
*m = cand
}
}
best := bestOf(matchAt(candidateL.offset-e.cur, s, uint32(cv), -1), matchAt(candidateL.prev-e.cur, s, uint32(cv), -1))
best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1))
best = bestOf(best, matchAt(candidateS.prev-e.cur, s, uint32(cv), -1))
best := match{s: s, est: highScore}
improve(&best, candidateL.offset-e.cur, s, uint32(cv), -1)
improve(&best, candidateL.prev-e.cur, s, uint32(cv), -1)
improve(&best, candidateS.offset-e.cur, s, uint32(cv), -1)
improve(&best, candidateS.prev-e.cur, s, uint32(cv), -1)
if canRepeat && best.length < goodEnough {
cv32 := uint32(cv >> 8)
spp := s + 1
best = bestOf(best, matchAt(spp-offset1, spp, cv32, 1))
best = bestOf(best, matchAt(spp-offset2, spp, cv32, 2))
best = bestOf(best, matchAt(spp-offset3, spp, cv32, 3))
if best.length > 0 {
cv32 = uint32(cv >> 24)
spp += 2
best = bestOf(best, matchAt(spp-offset1, spp, cv32, 1))
best = bestOf(best, matchAt(spp-offset2, spp, cv32, 2))
best = bestOf(best, matchAt(spp-offset3, spp, cv32, 3))
if s == nextEmit {
// Check repeats straight after a match.
improve(&best, s-offset2, s, uint32(cv), 1|4)
improve(&best, s-offset3, s, uint32(cv), 2|4)
if offset1 > 1 {
improve(&best, s-(offset1-1), s, uint32(cv), 3|4)
}
}
// If either no match or a non-repeat match, check at + 1
if best.rep <= 0 {
cv32 := uint32(cv >> 8)
spp := s + 1
improve(&best, spp-offset1, spp, cv32, 1)
improve(&best, spp-offset2, spp, cv32, 2)
improve(&best, spp-offset3, spp, cv32, 3)
if best.rep < 0 {
cv32 = uint32(cv >> 24)
spp += 2
improve(&best, spp-offset1, spp, cv32, 1)
improve(&best, spp-offset2, spp, cv32, 2)
improve(&best, spp-offset3, spp, cv32, 3)
}
}
}
// Load next and check...
@ -250,40 +291,45 @@ encodeLoop:
if s >= sLimit {
break encodeLoop
}
cv = load6432(src, s)
continue
}
s++
candidateS = e.table[hashLen(cv>>8, bestShortTableBits, bestShortLen)]
cv = load6432(src, s)
cv2 := load6432(src, s+1)
cv = load6432(src, s+1)
cv2 := load6432(src, s+2)
candidateL = e.longTable[hashLen(cv, bestLongTableBits, bestLongLen)]
candidateL2 := e.longTable[hashLen(cv2, bestLongTableBits, bestLongLen)]
// Short at s+1
best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1))
improve(&best, candidateS.offset-e.cur, s+1, uint32(cv), -1)
// Long at s+1, s+2
best = bestOf(best, matchAt(candidateL.offset-e.cur, s, uint32(cv), -1))
best = bestOf(best, matchAt(candidateL.prev-e.cur, s, uint32(cv), -1))
best = bestOf(best, matchAt(candidateL2.offset-e.cur, s+1, uint32(cv2), -1))
best = bestOf(best, matchAt(candidateL2.prev-e.cur, s+1, uint32(cv2), -1))
improve(&best, candidateL.offset-e.cur, s+1, uint32(cv), -1)
improve(&best, candidateL.prev-e.cur, s+1, uint32(cv), -1)
improve(&best, candidateL2.offset-e.cur, s+2, uint32(cv2), -1)
improve(&best, candidateL2.prev-e.cur, s+2, uint32(cv2), -1)
if false {
// Short at s+3.
// Too often worse...
best = bestOf(best, matchAt(e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1))
improve(&best, e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+3, uint32(cv2>>8), -1)
}
// See if we can find a better match by checking where the current best ends.
// Use that offset to see if we can find a better full match.
if sAt := best.s + best.length; sAt < sLimit {
nextHashL := hashLen(load6432(src, sAt), bestLongTableBits, bestLongLen)
candidateEnd := e.longTable[nextHashL]
if pos := candidateEnd.offset - e.cur - best.length; pos >= 0 {
bestEnd := bestOf(best, matchAt(pos, best.s, load3232(src, best.s), -1))
if pos := candidateEnd.prev - e.cur - best.length; pos >= 0 {
bestEnd = bestOf(bestEnd, matchAt(pos, best.s, load3232(src, best.s), -1))
// Start check at a fixed offset to allow for a few mismatches.
// For this compression level 2 yields the best results.
// We cannot do this if we have already indexed this position.
const skipBeginning = 2
if best.s > s-skipBeginning {
// See if we can find a better match by checking where the current best ends.
// Use that offset to see if we can find a better full match.
if sAt := best.s + best.length; sAt < sLimit {
nextHashL := hashLen(load6432(src, sAt), bestLongTableBits, bestLongLen)
candidateEnd := e.longTable[nextHashL]
if off := candidateEnd.offset - e.cur - best.length + skipBeginning; off >= 0 {
improve(&best, off, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
if off := candidateEnd.prev - e.cur - best.length + skipBeginning; off >= 0 {
improve(&best, off, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
}
}
best = bestEnd
}
}
}
@ -296,51 +342,34 @@ encodeLoop:
// We have a match, we can store the forward value
if best.rep > 0 {
s = best.s
var seq seq
seq.matchLen = uint32(best.length - zstdMinMatch)
// We might be able to match backwards.
// Extend as long as we can.
start := best.s
// We end the search early, so we don't risk 0 literals
// and have to do special offset treatment.
startLimit := nextEmit + 1
tMin := s - e.maxMatchOff
if tMin < 0 {
tMin = 0
if debugAsserts && s < nextEmit {
panic("s < nextEmit")
}
repIndex := best.offset
for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
repIndex--
start--
seq.matchLen++
}
addLiterals(&seq, start)
addLiterals(&seq, best.s)
// rep 0
seq.offset = uint32(best.rep)
// Repeat. If bit 4 is set, this is a non-lit repeat.
seq.offset = uint32(best.rep & 3)
if debugSequences {
println("repeat sequence", seq, "next s:", s)
}
blk.sequences = append(blk.sequences, seq)
// Index match start+1 (long) -> s - 1
index0 := s
// Index old s + 1 -> s - 1
index0 := s + 1
s = best.s + best.length
nextEmit = s
if s >= sLimit {
if debugEncoder {
println("repeat ended", s, best.length)
}
break encodeLoop
}
// Index skipped...
off := index0 + e.cur
for index0 < s-1 {
for index0 < s {
cv0 := load6432(src, index0)
h0 := hashLen(cv0, bestLongTableBits, bestLongLen)
h1 := hashLen(cv0, bestShortTableBits, bestShortLen)
@ -350,17 +379,19 @@ encodeLoop:
index0++
}
switch best.rep {
case 2:
case 2, 4 | 1:
offset1, offset2 = offset2, offset1
case 3:
case 3, 4 | 2:
offset1, offset2, offset3 = offset3, offset1, offset2
case 4 | 3:
offset1, offset2, offset3 = offset1-1, offset1, offset2
}
cv = load6432(src, s)
continue
}
// A 4-byte match has been found. Update recent offsets.
// We'll later see if more than 4 bytes.
index0 := s + 1
s = best.s
t := best.offset
offset1, offset2, offset3 = s-t, offset1, offset2
@ -373,22 +404,9 @@ encodeLoop:
panic("invalid offset")
}
// Extend the n-byte match as long as possible.
l := best.length
// Extend backwards
tMin := s - e.maxMatchOff
if tMin < 0 {
tMin = 0
}
for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
s--
t--
l++
}
// Write our sequence
var seq seq
l := best.length
seq.litLen = uint32(s - nextEmit)
seq.matchLen = uint32(l - zstdMinMatch)
if seq.litLen > 0 {
@ -405,10 +423,8 @@ encodeLoop:
break encodeLoop
}
// Index match start+1 (long) -> s - 1
index0 := s - l + 1
// every entry
for index0 < s-1 {
// Index old s + 1 -> s - 1
for index0 < s {
cv0 := load6432(src, index0)
h0 := hashLen(cv0, bestLongTableBits, bestLongLen)
h1 := hashLen(cv0, bestShortTableBits, bestShortLen)
@ -417,50 +433,6 @@ encodeLoop:
e.table[h1] = prevEntry{offset: off, prev: e.table[h1].offset}
index0++
}
cv = load6432(src, s)
if !canRepeat {
continue
}
// Check offset 2
for {
o2 := s - offset2
if load3232(src, o2) != uint32(cv) {
// Do regular search
break
}
// Store this, since we have it.
nextHashS := hashLen(cv, bestShortTableBits, bestShortLen)
nextHashL := hashLen(cv, bestLongTableBits, bestLongLen)
// We have at least 4 byte match.
// No need to check backwards. We come straight from a match
l := 4 + e.matchlen(s+4, o2+4, src)
e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: e.longTable[nextHashL].offset}
e.table[nextHashS] = prevEntry{offset: s + e.cur, prev: e.table[nextHashS].offset}
seq.matchLen = uint32(l) - zstdMinMatch
seq.litLen = 0
// Since litlen is always 0, this is offset 1.
seq.offset = 1
s += l
nextEmit = s
if debugSequences {
println("sequence", seq, "next s:", s)
}
blk.sequences = append(blk.sequences, seq)
// Swap offset 1 and 2.
offset1, offset2 = offset2, offset1
if s >= sLimit {
// Finished
break encodeLoop
}
cv = load6432(src, s)
}
}
if int(nextEmit) < len(src) {

View file

@ -62,14 +62,10 @@ func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
for e.cur >= bufferReset {
for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
for i := range e.table[:] {
e.table[i] = tableEntry{}
}
for i := range e.longTable[:] {
e.longTable[i] = prevEntry{}
}
e.table = [betterShortTableSize]tableEntry{}
e.longTable = [betterLongTableSize]prevEntry{}
e.cur = e.maxMatchOff
break
}
@ -587,7 +583,7 @@ func (e *betterFastEncoderDict) Encode(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
for e.cur >= bufferReset {
for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
for i := range e.table[:] {
e.table[i] = tableEntry{}

View file

@ -44,14 +44,10 @@ func (e *doubleFastEncoder) Encode(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
for e.cur >= bufferReset {
for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
for i := range e.table[:] {
e.table[i] = tableEntry{}
}
for i := range e.longTable[:] {
e.longTable[i] = tableEntry{}
}
e.table = [dFastShortTableSize]tableEntry{}
e.longTable = [dFastLongTableSize]tableEntry{}
e.cur = e.maxMatchOff
break
}
@ -388,7 +384,7 @@ func (e *doubleFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
if e.cur >= bufferReset {
if e.cur >= e.bufferReset {
for i := range e.table[:] {
e.table[i] = tableEntry{}
}
@ -685,7 +681,7 @@ encodeLoop:
}
// We do not store history, so we must offset e.cur to avoid false matches for next user.
if e.cur < bufferReset {
if e.cur < e.bufferReset {
e.cur += int32(len(src))
}
}
@ -700,7 +696,7 @@ func (e *doubleFastEncoderDict) Encode(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
for e.cur >= bufferReset {
for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
for i := range e.table[:] {
e.table[i] = tableEntry{}
@ -1088,7 +1084,7 @@ func (e *doubleFastEncoderDict) Reset(d *dict, singleBlock bool) {
}
}
e.lastDictID = d.id
e.allDirty = true
allDirty = true
}
// Reset table to initial state
e.cur = e.maxMatchOff

View file

@ -43,7 +43,7 @@ func (e *fastEncoder) Encode(blk *blockEnc, src []byte) {
)
// Protect against e.cur wraparound.
for e.cur >= bufferReset {
for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
for i := range e.table[:] {
e.table[i] = tableEntry{}
@ -133,8 +133,7 @@ encodeLoop:
if canRepeat && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>16) {
// Consider history as well.
var seq seq
var length int32
length = 4 + e.matchlen(s+6, repIndex+4, src)
length := 4 + e.matchlen(s+6, repIndex+4, src)
seq.matchLen = uint32(length - zstdMinMatch)
// We might be able to match backwards.
@ -310,7 +309,7 @@ func (e *fastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
}
// Protect against e.cur wraparound.
if e.cur >= bufferReset {
if e.cur >= e.bufferReset {
for i := range e.table[:] {
e.table[i] = tableEntry{}
}
@ -538,7 +537,7 @@ encodeLoop:
println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
}
// We do not store history, so we must offset e.cur to avoid false matches for next user.
if e.cur < bufferReset {
if e.cur < e.bufferReset {
e.cur += int32(len(src))
}
}
@ -555,11 +554,9 @@ func (e *fastEncoderDict) Encode(blk *blockEnc, src []byte) {
return
}
// Protect against e.cur wraparound.
for e.cur >= bufferReset {
for e.cur >= e.bufferReset-int32(len(e.hist)) {
if len(e.hist) == 0 {
for i := range e.table[:] {
e.table[i] = tableEntry{}
}
e.table = [tableSize]tableEntry{}
e.cur = e.maxMatchOff
break
}
@ -647,8 +644,7 @@ encodeLoop:
if canRepeat && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>16) {
// Consider history as well.
var seq seq
var length int32
length = 4 + e.matchlen(s+6, repIndex+4, src)
length := 4 + e.matchlen(s+6, repIndex+4, src)
seq.matchLen = uint32(length - zstdMinMatch)
@ -833,13 +829,12 @@ func (e *fastEncoderDict) Reset(d *dict, singleBlock bool) {
}
if true {
end := e.maxMatchOff + int32(len(d.content)) - 8
for i := e.maxMatchOff; i < end; i += 3 {
for i := e.maxMatchOff; i < end; i += 2 {
const hashLog = tableBits
cv := load6432(d.content, i-e.maxMatchOff)
nextHash := hashLen(cv, hashLog, tableFastHashLen) // 0 -> 5
nextHash1 := hashLen(cv>>8, hashLog, tableFastHashLen) // 1 -> 6
nextHash2 := hashLen(cv>>16, hashLog, tableFastHashLen) // 2 -> 7
nextHash := hashLen(cv, hashLog, tableFastHashLen) // 0 -> 6
nextHash1 := hashLen(cv>>8, hashLog, tableFastHashLen) // 1 -> 7
e.dictTable[nextHash] = tableEntry{
val: uint32(cv),
offset: i,
@ -848,10 +843,6 @@ func (e *fastEncoderDict) Reset(d *dict, singleBlock bool) {
val: uint32(cv >> 8),
offset: i + 1,
}
e.dictTable[nextHash2] = tableEntry{
val: uint32(cv >> 16),
offset: i + 2,
}
}
}
e.lastDictID = d.id

View file

@ -8,6 +8,7 @@ import (
"crypto/rand"
"fmt"
"io"
"math"
rdebug "runtime/debug"
"sync"
@ -226,10 +227,7 @@ func (e *Encoder) nextBlock(final bool) error {
DictID: e.o.dict.ID(),
}
dst, err := fh.appendTo(tmp[:0])
if err != nil {
return err
}
dst := fh.appendTo(tmp[:0])
s.headerWritten = true
s.wWg.Wait()
var n2 int
@ -276,23 +274,9 @@ func (e *Encoder) nextBlock(final bool) error {
s.eofWritten = true
}
err := errIncompressible
// If we got the exact same number of literals as input,
// assume the literals cannot be compressed.
if len(src) != len(blk.literals) || len(src) != e.o.blockSize {
err = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy)
}
switch err {
case errIncompressible:
if debugEncoder {
println("Storing incompressible block as raw")
}
blk.encodeRaw(src)
// In fast mode, we do not transfer offsets, so we don't have to deal with changing the.
case nil:
default:
s.err = err
return err
s.err = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy)
if s.err != nil {
return s.err
}
_, s.err = s.w.Write(blk.output)
s.nWritten += int64(len(blk.output))
@ -342,22 +326,8 @@ func (e *Encoder) nextBlock(final bool) error {
}
s.wWg.Done()
}()
err := errIncompressible
// If we got the exact same number of literals as input,
// assume the literals cannot be compressed.
if len(src) != len(blk.literals) || len(src) != e.o.blockSize {
err = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy)
}
switch err {
case errIncompressible:
if debugEncoder {
println("Storing incompressible block as raw")
}
blk.encodeRaw(src)
// In fast mode, we do not transfer offsets, so we don't have to deal with changing the.
case nil:
default:
s.writeErr = err
s.writeErr = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy)
if s.writeErr != nil {
return
}
_, s.writeErr = s.w.Write(blk.output)
@ -510,7 +480,7 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
Checksum: false,
DictID: 0,
}
dst, _ = fh.appendTo(dst)
dst = fh.appendTo(dst)
// Write raw block as last one only.
var blk blockHeader
@ -545,10 +515,7 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
if len(dst) == 0 && cap(dst) == 0 && len(src) < 1<<20 && !e.o.lowMem {
dst = make([]byte, 0, len(src))
}
dst, err := fh.appendTo(dst)
if err != nil {
panic(err)
}
dst = fh.appendTo(dst)
// If we can do everything in one block, prefer that.
if len(src) <= e.o.blockSize {
@ -567,25 +534,15 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
// If we got the exact same number of literals as input,
// assume the literals cannot be compressed.
err := errIncompressible
oldout := blk.output
if len(blk.literals) != len(src) || len(src) != e.o.blockSize {
// Output directly to dst
blk.output = dst
err = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy)
}
// Output directly to dst
blk.output = dst
switch err {
case errIncompressible:
if debugEncoder {
println("Storing incompressible block as raw")
}
dst = blk.encodeRawTo(dst, src)
case nil:
dst = blk.output
default:
err := blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy)
if err != nil {
panic(err)
}
dst = blk.output
blk.output = oldout
} else {
enc.Reset(e.o.dict, false)
@ -604,25 +561,11 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
if len(src) == 0 {
blk.last = true
}
err := errIncompressible
// If we got the exact same number of literals as input,
// assume the literals cannot be compressed.
if len(blk.literals) != len(todo) || len(todo) != e.o.blockSize {
err = blk.encode(todo, e.o.noEntropy, !e.o.allLitEntropy)
}
switch err {
case errIncompressible:
if debugEncoder {
println("Storing incompressible block as raw")
}
dst = blk.encodeRawTo(dst, todo)
blk.popOffsets()
case nil:
dst = append(dst, blk.output...)
default:
err := blk.encode(todo, e.o.noEntropy, !e.o.allLitEntropy)
if err != nil {
panic(err)
}
dst = append(dst, blk.output...)
blk.reset(nil)
}
}
@ -632,6 +575,7 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
// Add padding with content from crypto/rand.Reader
if e.o.pad > 0 {
add := calcSkippableFrame(int64(len(dst)), int64(e.o.pad))
var err error
dst, err = skippableFrame(dst, add, rand.Reader)
if err != nil {
panic(err)
@ -639,3 +583,37 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
}
return dst
}
// MaxEncodedSize returns the expected maximum
// size of an encoded block or stream.
func (e *Encoder) MaxEncodedSize(size int) int {
frameHeader := 4 + 2 // magic + frame header & window descriptor
if e.o.dict != nil {
frameHeader += 4
}
// Frame content size:
if size < 256 {
frameHeader++
} else if size < 65536+256 {
frameHeader += 2
} else if size < math.MaxInt32 {
frameHeader += 4
} else {
frameHeader += 8
}
// Final crc
if e.o.crc {
frameHeader += 4
}
// Max overhead is 3 bytes/block.
// There cannot be 0 blocks.
blocks := (size + e.o.blockSize) / e.o.blockSize
// Combine, add padding.
maxSz := frameHeader + 3*blocks + size
if e.o.pad > 1 {
maxSz += calcSkippableFrame(int64(maxSz), int64(e.o.pad))
}
return maxSz
}

View file

@ -3,6 +3,8 @@ package zstd
import (
"errors"
"fmt"
"math"
"math/bits"
"runtime"
"strings"
)
@ -37,7 +39,7 @@ func (o *encoderOptions) setDefault() {
blockSize: maxCompressedBlockSize,
windowSize: 8 << 20,
level: SpeedDefault,
allLitEntropy: true,
allLitEntropy: false,
lowMem: false,
}
}
@ -47,22 +49,22 @@ func (o encoderOptions) encoder() encoder {
switch o.level {
case SpeedFastest:
if o.dict != nil {
return &fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
return &fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}
}
return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}
case SpeedDefault:
if o.dict != nil {
return &doubleFastEncoderDict{fastEncoderDict: fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}}
return &doubleFastEncoderDict{fastEncoderDict: fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}}
}
return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}
case SpeedBetterCompression:
if o.dict != nil {
return &betterFastEncoderDict{betterFastEncoder: betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
return &betterFastEncoderDict{betterFastEncoder: betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}
}
return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}
case SpeedBestCompression:
return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}
}
panic("unknown compression level")
}
@ -127,7 +129,7 @@ func WithEncoderPadding(n int) EOption {
}
// No need to waste our time.
if n == 1 {
o.pad = 0
n = 0
}
if n > 1<<30 {
return fmt.Errorf("padding must less than 1GB (1<<30 bytes) ")
@ -236,7 +238,7 @@ func WithEncoderLevel(l EncoderLevel) EOption {
}
}
if !o.customALEntropy {
o.allLitEntropy = l > SpeedFastest
o.allLitEntropy = l > SpeedDefault
}
return nil
@ -304,7 +306,13 @@ func WithLowerEncoderMem(b bool) EOption {
}
// WithEncoderDict allows to register a dictionary that will be used for the encode.
//
// The slice dict must be in the [dictionary format] produced by
// "zstd --train" from the Zstandard reference implementation.
//
// The encoder *may* choose to use no dictionary instead for certain payloads.
//
// [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
func WithEncoderDict(dict []byte) EOption {
return func(o *encoderOptions) error {
d, err := loadDict(dict)
@ -315,3 +323,17 @@ func WithEncoderDict(dict []byte) EOption {
return nil
}
}
// WithEncoderDictRaw registers a dictionary that may be used by the encoder.
//
// The slice content may contain arbitrary data. It will be used as an initial
// history.
func WithEncoderDictRaw(id uint32, content []byte) EOption {
return func(o *encoderOptions) error {
if bits.UintSize > 32 && uint(len(content)) > dictMaxLength {
return fmt.Errorf("dictionary of size %d > 2GiB too large", len(content))
}
o.dict = &dict{id: id, content: content, offsets: [3]int{1, 4, 8}}
return nil
}
}

View file

@ -5,7 +5,7 @@
package zstd
import (
"bytes"
"encoding/binary"
"encoding/hex"
"errors"
"io"
@ -29,7 +29,7 @@ type frameDec struct {
FrameContentSize uint64
DictionaryID *uint32
DictionaryID uint32
HasCheckSum bool
SingleSegment bool
}
@ -43,9 +43,9 @@ const (
MaxWindowSize = 1 << 29
)
var (
frameMagic = []byte{0x28, 0xb5, 0x2f, 0xfd}
skippableFrameMagic = []byte{0x2a, 0x4d, 0x18}
const (
frameMagic = "\x28\xb5\x2f\xfd"
skippableFrameMagic = "\x2a\x4d\x18"
)
func newFrameDec(o decoderOptions) *frameDec {
@ -73,25 +73,25 @@ func (d *frameDec) reset(br byteBuffer) error {
switch err {
case io.EOF, io.ErrUnexpectedEOF:
return io.EOF
default:
return err
case nil:
signature[0] = b[0]
default:
return err
}
// Read the rest, don't allow io.ErrUnexpectedEOF
b, err = br.readSmall(3)
switch err {
case io.EOF:
return io.EOF
default:
return err
case nil:
copy(signature[1:], b)
default:
return err
}
if !bytes.Equal(signature[1:4], skippableFrameMagic) || signature[0]&0xf0 != 0x50 {
if string(signature[1:4]) != skippableFrameMagic || signature[0]&0xf0 != 0x50 {
if debugDecoder {
println("Not skippable", hex.EncodeToString(signature[:]), hex.EncodeToString(skippableFrameMagic))
println("Not skippable", hex.EncodeToString(signature[:]), hex.EncodeToString([]byte(skippableFrameMagic)))
}
// Break if not skippable frame.
break
@ -114,9 +114,9 @@ func (d *frameDec) reset(br byteBuffer) error {
return err
}
}
if !bytes.Equal(signature[:], frameMagic) {
if string(signature[:]) != frameMagic {
if debugDecoder {
println("Got magic numbers: ", signature, "want:", frameMagic)
println("Got magic numbers: ", signature, "want:", []byte(frameMagic))
}
return ErrMagicMismatch
}
@ -155,7 +155,7 @@ func (d *frameDec) reset(br byteBuffer) error {
// Read Dictionary_ID
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary_id
d.DictionaryID = nil
d.DictionaryID = 0
if size := fhd & 3; size != 0 {
if size == 3 {
size = 4
@ -167,7 +167,7 @@ func (d *frameDec) reset(br byteBuffer) error {
return err
}
var id uint32
switch size {
switch len(b) {
case 1:
id = uint32(b[0])
case 2:
@ -178,11 +178,7 @@ func (d *frameDec) reset(br byteBuffer) error {
if debugDecoder {
println("Dict size", size, "ID:", id)
}
if id > 0 {
// ID 0 means "sorry, no dictionary anyway".
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
d.DictionaryID = &id
}
d.DictionaryID = id
}
// Read Frame_Content_Size
@ -204,7 +200,7 @@ func (d *frameDec) reset(br byteBuffer) error {
println("Reading Frame content", err)
return err
}
switch fcsSize {
switch len(b) {
case 1:
d.FrameContentSize = uint64(b[0])
case 2:
@ -297,55 +293,38 @@ func (d *frameDec) next(block *blockDec) error {
return nil
}
// checkCRC will check the checksum if the frame has one.
// checkCRC will check the checksum, assuming the frame has one.
// Will return ErrCRCMismatch if crc check failed, otherwise nil.
func (d *frameDec) checkCRC() error {
if !d.HasCheckSum {
return nil
}
// We can overwrite upper tmp now
want, err := d.rawInput.readSmall(4)
buf, err := d.rawInput.readSmall(4)
if err != nil {
println("CRC missing?", err)
return err
}
if d.o.ignoreChecksum {
return nil
}
want := binary.LittleEndian.Uint32(buf[:4])
got := uint32(d.crc.Sum64())
var tmp [4]byte
got := d.crc.Sum64()
// Flip to match file order.
tmp[0] = byte(got >> 0)
tmp[1] = byte(got >> 8)
tmp[2] = byte(got >> 16)
tmp[3] = byte(got >> 24)
if !bytes.Equal(tmp[:], want) {
if got != want {
if debugDecoder {
println("CRC Check Failed:", tmp[:], "!=", want)
printf("CRC check failed: got %08x, want %08x\n", got, want)
}
return ErrCRCMismatch
}
if debugDecoder {
println("CRC ok", tmp[:])
printf("CRC ok %08x\n", got)
}
return nil
}
// consumeCRC reads the checksum data if the frame has one.
// consumeCRC skips over the checksum, assuming the frame has one.
func (d *frameDec) consumeCRC() error {
if d.HasCheckSum {
_, err := d.rawInput.readSmall(4)
if err != nil {
println("CRC missing?", err)
return err
}
_, err := d.rawInput.readSmall(4)
if err != nil {
println("CRC missing?", err)
}
return nil
return err
}
// runDecoder will run the decoder for the remainder of the frame.
@ -424,15 +403,8 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
if d.o.ignoreChecksum {
err = d.consumeCRC()
} else {
var n int
n, err = d.crc.Write(dst[crcStart:])
if err == nil {
if n != len(dst)-crcStart {
err = io.ErrShortWrite
} else {
err = d.checkCRC()
}
}
d.crc.Write(dst[crcStart:])
err = d.checkCRC()
}
}
}

View file

@ -22,7 +22,7 @@ type frameHeader struct {
const maxHeaderSize = 14
func (f frameHeader) appendTo(dst []byte) ([]byte, error) {
func (f frameHeader) appendTo(dst []byte) []byte {
dst = append(dst, frameMagic...)
var fhd uint8
if f.Checksum {
@ -88,7 +88,7 @@ func (f frameHeader) appendTo(dst []byte) ([]byte, error) {
default:
panic("invalid fcs")
}
return dst, nil
return dst
}
const skippableFrameHeader = 4 + 4

View file

@ -2,12 +2,7 @@
VENDORED: Go to [github.com/cespare/xxhash](https://github.com/cespare/xxhash) for original package.
[![GoDoc](https://godoc.org/github.com/cespare/xxhash?status.svg)](https://godoc.org/github.com/cespare/xxhash)
[![Build Status](https://travis-ci.org/cespare/xxhash.svg?branch=master)](https://travis-ci.org/cespare/xxhash)
xxhash is a Go implementation of the 64-bit
[xxHash](http://cyan4973.github.io/xxHash/) algorithm, XXH64. This is a
xxhash is a Go implementation of the 64-bit [xxHash] algorithm, XXH64. This is a
high-quality hashing algorithm that is much faster than anything in the Go
standard library.
@ -28,31 +23,49 @@ func (*Digest) WriteString(string) (int, error)
func (*Digest) Sum64() uint64
```
This implementation provides a fast pure-Go implementation and an even faster
assembly implementation for amd64.
The package is written with optimized pure Go and also contains even faster
assembly implementations for amd64 and arm64. If desired, the `purego` build tag
opts into using the Go code even on those architectures.
[xxHash]: http://cyan4973.github.io/xxHash/
## Compatibility
This package is in a module and the latest code is in version 2 of the module.
You need a version of Go with at least "minimal module compatibility" to use
github.com/cespare/xxhash/v2:
* 1.9.7+ for Go 1.9
* 1.10.3+ for Go 1.10
* Go 1.11 or later
I recommend using the latest release of Go.
## Benchmarks
Here are some quick benchmarks comparing the pure-Go and assembly
implementations of Sum64.
| input size | purego | asm |
| --- | --- | --- |
| 5 B | 979.66 MB/s | 1291.17 MB/s |
| 100 B | 7475.26 MB/s | 7973.40 MB/s |
| 4 KB | 17573.46 MB/s | 17602.65 MB/s |
| 10 MB | 17131.46 MB/s | 17142.16 MB/s |
| input size | purego | asm |
| ---------- | --------- | --------- |
| 4 B | 1.3 GB/s | 1.2 GB/s |
| 16 B | 2.9 GB/s | 3.5 GB/s |
| 100 B | 6.9 GB/s | 8.1 GB/s |
| 4 KB | 11.7 GB/s | 16.7 GB/s |
| 10 MB | 12.0 GB/s | 17.3 GB/s |
These numbers were generated on Ubuntu 18.04 with an Intel i7-8700K CPU using
the following commands under Go 1.11.2:
These numbers were generated on Ubuntu 20.04 with an Intel Xeon Platinum 8252C
CPU using the following commands under Go 1.19.2:
```
$ go test -tags purego -benchtime 10s -bench '/xxhash,direct,bytes'
$ go test -benchtime 10s -bench '/xxhash,direct,bytes'
benchstat <(go test -tags purego -benchtime 500ms -count 15 -bench 'Sum64$')
benchstat <(go test -benchtime 500ms -count 15 -bench 'Sum64$')
```
## Projects using this package
- [InfluxDB](https://github.com/influxdata/influxdb)
- [Prometheus](https://github.com/prometheus/prometheus)
- [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
- [FreeCache](https://github.com/coocood/freecache)
- [FastCache](https://github.com/VictoriaMetrics/fastcache)

View file

@ -18,19 +18,11 @@ const (
prime5 uint64 = 2870177450012600261
)
// NOTE(caleb): I'm using both consts and vars of the primes. Using consts where
// possible in the Go code is worth a small (but measurable) performance boost
// by avoiding some MOVQs. Vars are needed for the asm and also are useful for
// convenience in the Go code in a few places where we need to intentionally
// avoid constant arithmetic (e.g., v1 := prime1 + prime2 fails because the
// result overflows a uint64).
var (
prime1v = prime1
prime2v = prime2
prime3v = prime3
prime4v = prime4
prime5v = prime5
)
// Store the primes in an array as well.
//
// The consts are used when possible in Go code to avoid MOVs but we need a
// contiguous array of the assembly code.
var primes = [...]uint64{prime1, prime2, prime3, prime4, prime5}
// Digest implements hash.Hash64.
type Digest struct {
@ -52,10 +44,10 @@ func New() *Digest {
// Reset clears the Digest's state so that it can be reused.
func (d *Digest) Reset() {
d.v1 = prime1v + prime2
d.v1 = primes[0] + prime2
d.v2 = prime2
d.v3 = 0
d.v4 = -prime1v
d.v4 = -primes[0]
d.total = 0
d.n = 0
}
@ -71,21 +63,23 @@ func (d *Digest) Write(b []byte) (n int, err error) {
n = len(b)
d.total += uint64(n)
memleft := d.mem[d.n&(len(d.mem)-1):]
if d.n+n < 32 {
// This new data doesn't even fill the current block.
copy(d.mem[d.n:], b)
copy(memleft, b)
d.n += n
return
}
if d.n > 0 {
// Finish off the partial block.
copy(d.mem[d.n:], b)
c := copy(memleft, b)
d.v1 = round(d.v1, u64(d.mem[0:8]))
d.v2 = round(d.v2, u64(d.mem[8:16]))
d.v3 = round(d.v3, u64(d.mem[16:24]))
d.v4 = round(d.v4, u64(d.mem[24:32]))
b = b[32-d.n:]
b = b[c:]
d.n = 0
}
@ -135,21 +129,20 @@ func (d *Digest) Sum64() uint64 {
h += d.total
i, end := 0, d.n
for ; i+8 <= end; i += 8 {
k1 := round(0, u64(d.mem[i:i+8]))
b := d.mem[:d.n&(len(d.mem)-1)]
for ; len(b) >= 8; b = b[8:] {
k1 := round(0, u64(b[:8]))
h ^= k1
h = rol27(h)*prime1 + prime4
}
if i+4 <= end {
h ^= uint64(u32(d.mem[i:i+4])) * prime1
if len(b) >= 4 {
h ^= uint64(u32(b[:4])) * prime1
h = rol23(h)*prime2 + prime3
i += 4
b = b[4:]
}
for i < end {
h ^= uint64(d.mem[i]) * prime5
for ; len(b) > 0; b = b[1:] {
h ^= uint64(b[0]) * prime5
h = rol11(h) * prime1
i++
}
h ^= h >> 33

View file

@ -1,3 +1,4 @@
//go:build !appengine && gc && !purego && !noasm
// +build !appengine
// +build gc
// +build !purego
@ -5,212 +6,205 @@
#include "textflag.h"
// Register allocation:
// AX h
// SI pointer to advance through b
// DX n
// BX loop end
// R8 v1, k1
// R9 v2
// R10 v3
// R11 v4
// R12 tmp
// R13 prime1v
// R14 prime2v
// DI prime4v
// Registers:
#define h AX
#define d AX
#define p SI // pointer to advance through b
#define n DX
#define end BX // loop end
#define v1 R8
#define v2 R9
#define v3 R10
#define v4 R11
#define x R12
#define prime1 R13
#define prime2 R14
#define prime4 DI
// round reads from and advances the buffer pointer in SI.
// It assumes that R13 has prime1v and R14 has prime2v.
#define round(r) \
MOVQ (SI), R12 \
ADDQ $8, SI \
IMULQ R14, R12 \
ADDQ R12, r \
ROLQ $31, r \
IMULQ R13, r
#define round(acc, x) \
IMULQ prime2, x \
ADDQ x, acc \
ROLQ $31, acc \
IMULQ prime1, acc
// mergeRound applies a merge round on the two registers acc and val.
// It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v.
#define mergeRound(acc, val) \
IMULQ R14, val \
ROLQ $31, val \
IMULQ R13, val \
XORQ val, acc \
IMULQ R13, acc \
ADDQ DI, acc
// round0 performs the operation x = round(0, x).
#define round0(x) \
IMULQ prime2, x \
ROLQ $31, x \
IMULQ prime1, x
// mergeRound applies a merge round on the two registers acc and x.
// It assumes that prime1, prime2, and prime4 have been loaded.
#define mergeRound(acc, x) \
round0(x) \
XORQ x, acc \
IMULQ prime1, acc \
ADDQ prime4, acc
// blockLoop processes as many 32-byte blocks as possible,
// updating v1, v2, v3, and v4. It assumes that there is at least one block
// to process.
#define blockLoop() \
loop: \
MOVQ +0(p), x \
round(v1, x) \
MOVQ +8(p), x \
round(v2, x) \
MOVQ +16(p), x \
round(v3, x) \
MOVQ +24(p), x \
round(v4, x) \
ADDQ $32, p \
CMPQ p, end \
JLE loop
// func Sum64(b []byte) uint64
TEXT ·Sum64(SB), NOSPLIT, $0-32
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
// Load fixed primes.
MOVQ ·prime1v(SB), R13
MOVQ ·prime2v(SB), R14
MOVQ ·prime4v(SB), DI
MOVQ ·primes+0(SB), prime1
MOVQ ·primes+8(SB), prime2
MOVQ ·primes+24(SB), prime4
// Load slice.
MOVQ b_base+0(FP), SI
MOVQ b_len+8(FP), DX
LEAQ (SI)(DX*1), BX
MOVQ b_base+0(FP), p
MOVQ b_len+8(FP), n
LEAQ (p)(n*1), end
// The first loop limit will be len(b)-32.
SUBQ $32, BX
SUBQ $32, end
// Check whether we have at least one block.
CMPQ DX, $32
CMPQ n, $32
JLT noBlocks
// Set up initial state (v1, v2, v3, v4).
MOVQ R13, R8
ADDQ R14, R8
MOVQ R14, R9
XORQ R10, R10
XORQ R11, R11
SUBQ R13, R11
MOVQ prime1, v1
ADDQ prime2, v1
MOVQ prime2, v2
XORQ v3, v3
XORQ v4, v4
SUBQ prime1, v4
// Loop until SI > BX.
blockLoop:
round(R8)
round(R9)
round(R10)
round(R11)
blockLoop()
CMPQ SI, BX
JLE blockLoop
MOVQ v1, h
ROLQ $1, h
MOVQ v2, x
ROLQ $7, x
ADDQ x, h
MOVQ v3, x
ROLQ $12, x
ADDQ x, h
MOVQ v4, x
ROLQ $18, x
ADDQ x, h
MOVQ R8, AX
ROLQ $1, AX
MOVQ R9, R12
ROLQ $7, R12
ADDQ R12, AX
MOVQ R10, R12
ROLQ $12, R12
ADDQ R12, AX
MOVQ R11, R12
ROLQ $18, R12
ADDQ R12, AX
mergeRound(AX, R8)
mergeRound(AX, R9)
mergeRound(AX, R10)
mergeRound(AX, R11)
mergeRound(h, v1)
mergeRound(h, v2)
mergeRound(h, v3)
mergeRound(h, v4)
JMP afterBlocks
noBlocks:
MOVQ ·prime5v(SB), AX
MOVQ ·primes+32(SB), h
afterBlocks:
ADDQ DX, AX
ADDQ n, h
// Right now BX has len(b)-32, and we want to loop until SI > len(b)-8.
ADDQ $24, BX
ADDQ $24, end
CMPQ p, end
JG try4
CMPQ SI, BX
JG fourByte
loop8:
MOVQ (p), x
ADDQ $8, p
round0(x)
XORQ x, h
ROLQ $27, h
IMULQ prime1, h
ADDQ prime4, h
wordLoop:
// Calculate k1.
MOVQ (SI), R8
ADDQ $8, SI
IMULQ R14, R8
ROLQ $31, R8
IMULQ R13, R8
CMPQ p, end
JLE loop8
XORQ R8, AX
ROLQ $27, AX
IMULQ R13, AX
ADDQ DI, AX
try4:
ADDQ $4, end
CMPQ p, end
JG try1
CMPQ SI, BX
JLE wordLoop
MOVL (p), x
ADDQ $4, p
IMULQ prime1, x
XORQ x, h
fourByte:
ADDQ $4, BX
CMPQ SI, BX
JG singles
ROLQ $23, h
IMULQ prime2, h
ADDQ ·primes+16(SB), h
MOVL (SI), R8
ADDQ $4, SI
IMULQ R13, R8
XORQ R8, AX
ROLQ $23, AX
IMULQ R14, AX
ADDQ ·prime3v(SB), AX
singles:
ADDQ $4, BX
CMPQ SI, BX
try1:
ADDQ $4, end
CMPQ p, end
JGE finalize
singlesLoop:
MOVBQZX (SI), R12
ADDQ $1, SI
IMULQ ·prime5v(SB), R12
XORQ R12, AX
loop1:
MOVBQZX (p), x
ADDQ $1, p
IMULQ ·primes+32(SB), x
XORQ x, h
ROLQ $11, h
IMULQ prime1, h
ROLQ $11, AX
IMULQ R13, AX
CMPQ SI, BX
JL singlesLoop
CMPQ p, end
JL loop1
finalize:
MOVQ AX, R12
SHRQ $33, R12
XORQ R12, AX
IMULQ R14, AX
MOVQ AX, R12
SHRQ $29, R12
XORQ R12, AX
IMULQ ·prime3v(SB), AX
MOVQ AX, R12
SHRQ $32, R12
XORQ R12, AX
MOVQ h, x
SHRQ $33, x
XORQ x, h
IMULQ prime2, h
MOVQ h, x
SHRQ $29, x
XORQ x, h
IMULQ ·primes+16(SB), h
MOVQ h, x
SHRQ $32, x
XORQ x, h
MOVQ AX, ret+24(FP)
MOVQ h, ret+24(FP)
RET
// writeBlocks uses the same registers as above except that it uses AX to store
// the d pointer.
// func writeBlocks(d *Digest, b []byte) int
TEXT ·writeBlocks(SB), NOSPLIT, $0-40
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
// Load fixed primes needed for round.
MOVQ ·prime1v(SB), R13
MOVQ ·prime2v(SB), R14
MOVQ ·primes+0(SB), prime1
MOVQ ·primes+8(SB), prime2
// Load slice.
MOVQ b_base+8(FP), SI
MOVQ b_len+16(FP), DX
LEAQ (SI)(DX*1), BX
SUBQ $32, BX
MOVQ b_base+8(FP), p
MOVQ b_len+16(FP), n
LEAQ (p)(n*1), end
SUBQ $32, end
// Load vN from d.
MOVQ d+0(FP), AX
MOVQ 0(AX), R8 // v1
MOVQ 8(AX), R9 // v2
MOVQ 16(AX), R10 // v3
MOVQ 24(AX), R11 // v4
MOVQ s+0(FP), d
MOVQ 0(d), v1
MOVQ 8(d), v2
MOVQ 16(d), v3
MOVQ 24(d), v4
// We don't need to check the loop condition here; this function is
// always called with at least one block of data to process.
blockLoop:
round(R8)
round(R9)
round(R10)
round(R11)
CMPQ SI, BX
JLE blockLoop
blockLoop()
// Copy vN back to d.
MOVQ R8, 0(AX)
MOVQ R9, 8(AX)
MOVQ R10, 16(AX)
MOVQ R11, 24(AX)
MOVQ v1, 0(d)
MOVQ v2, 8(d)
MOVQ v3, 16(d)
MOVQ v4, 24(d)
// The number of bytes written is SI minus the old base pointer.
SUBQ b_base+8(FP), SI
MOVQ SI, ret+32(FP)
// The number of bytes written is p minus the old base pointer.
SUBQ b_base+8(FP), p
MOVQ p, ret+32(FP)
RET

View file

@ -1,13 +1,17 @@
// +build gc,!purego,!noasm
//go:build !appengine && gc && !purego && !noasm
// +build !appengine
// +build gc
// +build !purego
// +build !noasm
#include "textflag.h"
// Register allocation.
// Registers:
#define digest R1
#define h R2 // Return value.
#define p R3 // Input pointer.
#define len R4
#define nblocks R5 // len / 32.
#define h R2 // return value
#define p R3 // input pointer
#define n R4 // input length
#define nblocks R5 // n / 32
#define prime1 R7
#define prime2 R8
#define prime3 R9
@ -25,60 +29,52 @@
#define round(acc, x) \
MADD prime2, acc, x, acc \
ROR $64-31, acc \
MUL prime1, acc \
MUL prime1, acc
// x = round(0, x).
// round0 performs the operation x = round(0, x).
#define round0(x) \
MUL prime2, x \
ROR $64-31, x \
MUL prime1, x \
MUL prime1, x
#define mergeRound(x) \
round0(x) \
EOR x, h \
MADD h, prime4, prime1, h \
#define mergeRound(acc, x) \
round0(x) \
EOR x, acc \
MADD acc, prime4, prime1, acc
// Update v[1-4] with 32-byte blocks. Assumes len >= 32.
#define blocksLoop() \
LSR $5, len, nblocks \
PCALIGN $16 \
loop: \
LDP.P 32(p), (x1, x2) \
round(v1, x1) \
LDP -16(p), (x3, x4) \
round(v2, x2) \
SUB $1, nblocks \
round(v3, x3) \
round(v4, x4) \
CBNZ nblocks, loop \
// The primes are repeated here to ensure that they're stored
// in a contiguous array, so we can load them with LDP.
DATA primes<> +0(SB)/8, $11400714785074694791
DATA primes<> +8(SB)/8, $14029467366897019727
DATA primes<>+16(SB)/8, $1609587929392839161
DATA primes<>+24(SB)/8, $9650029242287828579
DATA primes<>+32(SB)/8, $2870177450012600261
GLOBL primes<>(SB), NOPTR+RODATA, $40
// blockLoop processes as many 32-byte blocks as possible,
// updating v1, v2, v3, and v4. It assumes that n >= 32.
#define blockLoop() \
LSR $5, n, nblocks \
PCALIGN $16 \
loop: \
LDP.P 16(p), (x1, x2) \
LDP.P 16(p), (x3, x4) \
round(v1, x1) \
round(v2, x2) \
round(v3, x3) \
round(v4, x4) \
SUB $1, nblocks \
CBNZ nblocks, loop
// func Sum64(b []byte) uint64
TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32
LDP b_base+0(FP), (p, len)
TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
LDP b_base+0(FP), (p, n)
LDP primes<> +0(SB), (prime1, prime2)
LDP primes<>+16(SB), (prime3, prime4)
MOVD primes<>+32(SB), prime5
LDP ·primes+0(SB), (prime1, prime2)
LDP ·primes+16(SB), (prime3, prime4)
MOVD ·primes+32(SB), prime5
CMP $32, len
CSEL LO, prime5, ZR, h // if len < 32 { h = prime5 } else { h = 0 }
BLO afterLoop
CMP $32, n
CSEL LT, prime5, ZR, h // if n < 32 { h = prime5 } else { h = 0 }
BLT afterLoop
ADD prime1, prime2, v1
MOVD prime2, v2
MOVD $0, v3
NEG prime1, v4
blocksLoop()
blockLoop()
ROR $64-1, v1, x1
ROR $64-7, v2, x2
@ -88,71 +84,75 @@ TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32
ADD x3, x4
ADD x2, x4, h
mergeRound(v1)
mergeRound(v2)
mergeRound(v3)
mergeRound(v4)
mergeRound(h, v1)
mergeRound(h, v2)
mergeRound(h, v3)
mergeRound(h, v4)
afterLoop:
ADD len, h
ADD n, h
TBZ $4, len, try8
TBZ $4, n, try8
LDP.P 16(p), (x1, x2)
round0(x1)
// NOTE: here and below, sequencing the EOR after the ROR (using a
// rotated register) is worth a small but measurable speedup for small
// inputs.
ROR $64-27, h
EOR x1 @> 64-27, h, h
MADD h, prime4, prime1, h
round0(x2)
ROR $64-27, h
EOR x2 @> 64-27, h
EOR x2 @> 64-27, h, h
MADD h, prime4, prime1, h
try8:
TBZ $3, len, try4
TBZ $3, n, try4
MOVD.P 8(p), x1
round0(x1)
ROR $64-27, h
EOR x1 @> 64-27, h
EOR x1 @> 64-27, h, h
MADD h, prime4, prime1, h
try4:
TBZ $2, len, try2
TBZ $2, n, try2
MOVWU.P 4(p), x2
MUL prime1, x2
ROR $64-23, h
EOR x2 @> 64-23, h
EOR x2 @> 64-23, h, h
MADD h, prime3, prime2, h
try2:
TBZ $1, len, try1
TBZ $1, n, try1
MOVHU.P 2(p), x3
AND $255, x3, x1
LSR $8, x3, x2
MUL prime5, x1
ROR $64-11, h
EOR x1 @> 64-11, h
EOR x1 @> 64-11, h, h
MUL prime1, h
MUL prime5, x2
ROR $64-11, h
EOR x2 @> 64-11, h
EOR x2 @> 64-11, h, h
MUL prime1, h
try1:
TBZ $0, len, end
TBZ $0, n, finalize
MOVBU (p), x4
MUL prime5, x4
ROR $64-11, h
EOR x4 @> 64-11, h
EOR x4 @> 64-11, h, h
MUL prime1, h
end:
finalize:
EOR h >> 33, h
MUL prime2, h
EOR h >> 29, h
@ -163,24 +163,22 @@ end:
RET
// func writeBlocks(d *Digest, b []byte) int
//
// Assumes len(b) >= 32.
TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40
LDP primes<>(SB), (prime1, prime2)
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
LDP ·primes+0(SB), (prime1, prime2)
// Load state. Assume v[1-4] are stored contiguously.
MOVD d+0(FP), digest
LDP 0(digest), (v1, v2)
LDP 16(digest), (v3, v4)
LDP b_base+8(FP), (p, len)
LDP b_base+8(FP), (p, n)
blocksLoop()
blockLoop()
// Store updated state.
STP (v1, v2), 0(digest)
STP (v3, v4), 16(digest)
BIC $31, len
MOVD len, ret+32(FP)
BIC $31, n
MOVD n, ret+32(FP)
RET

View file

@ -13,4 +13,4 @@ package xxhash
func Sum64(b []byte) uint64
//go:noescape
func writeBlocks(d *Digest, b []byte) int
func writeBlocks(s *Digest, b []byte) int

View file

@ -15,10 +15,10 @@ func Sum64(b []byte) uint64 {
var h uint64
if n >= 32 {
v1 := prime1v + prime2
v1 := primes[0] + prime2
v2 := prime2
v3 := uint64(0)
v4 := -prime1v
v4 := -primes[0]
for len(b) >= 32 {
v1 = round(v1, u64(b[0:8:len(b)]))
v2 = round(v2, u64(b[8:16:len(b)]))
@ -37,19 +37,18 @@ func Sum64(b []byte) uint64 {
h += uint64(n)
i, end := 0, len(b)
for ; i+8 <= end; i += 8 {
k1 := round(0, u64(b[i:i+8:len(b)]))
for ; len(b) >= 8; b = b[8:] {
k1 := round(0, u64(b[:8]))
h ^= k1
h = rol27(h)*prime1 + prime4
}
if i+4 <= end {
h ^= uint64(u32(b[i:i+4:len(b)])) * prime1
if len(b) >= 4 {
h ^= uint64(u32(b[:4])) * prime1
h = rol23(h)*prime2 + prime3
i += 4
b = b[4:]
}
for ; i < end; i++ {
h ^= uint64(b[i]) * prime5
for ; len(b) > 0; b = b[1:] {
h ^= uint64(b[0]) * prime5
h = rol11(h) * prime1
}

View file

@ -0,0 +1,16 @@
//go:build amd64 && !appengine && !noasm && gc
// +build amd64,!appengine,!noasm,gc
// Copyright 2019+ Klaus Post. All rights reserved.
// License information can be found in the LICENSE file.
package zstd
// matchLen returns how many bytes match in a and b
//
// It assumes that:
//
// len(a) <= len(b) and len(a) > 0
//
//go:noescape
func matchLen(a []byte, b []byte) int

View file

@ -0,0 +1,68 @@
// Copied from S2 implementation.
//go:build !appengine && !noasm && gc && !noasm
#include "textflag.h"
// func matchLen(a []byte, b []byte) int
// Requires: BMI
TEXT ·matchLen(SB), NOSPLIT, $0-56
MOVQ a_base+0(FP), AX
MOVQ b_base+24(FP), CX
MOVQ a_len+8(FP), DX
// matchLen
XORL SI, SI
CMPL DX, $0x08
JB matchlen_match4_standalone
matchlen_loopback_standalone:
MOVQ (AX)(SI*1), BX
XORQ (CX)(SI*1), BX
TESTQ BX, BX
JZ matchlen_loop_standalone
#ifdef GOAMD64_v3
TZCNTQ BX, BX
#else
BSFQ BX, BX
#endif
SARQ $0x03, BX
LEAL (SI)(BX*1), SI
JMP gen_match_len_end
matchlen_loop_standalone:
LEAL -8(DX), DX
LEAL 8(SI), SI
CMPL DX, $0x08
JAE matchlen_loopback_standalone
matchlen_match4_standalone:
CMPL DX, $0x04
JB matchlen_match2_standalone
MOVL (AX)(SI*1), BX
CMPL (CX)(SI*1), BX
JNE matchlen_match2_standalone
LEAL -4(DX), DX
LEAL 4(SI), SI
matchlen_match2_standalone:
CMPL DX, $0x02
JB matchlen_match1_standalone
MOVW (AX)(SI*1), BX
CMPW (CX)(SI*1), BX
JNE matchlen_match1_standalone
LEAL -2(DX), DX
LEAL 2(SI), SI
matchlen_match1_standalone:
CMPL DX, $0x01
JB gen_match_len_end
MOVB (AX)(SI*1), BL
CMPB (CX)(SI*1), BL
JNE gen_match_len_end
INCL SI
gen_match_len_end:
MOVQ SI, ret+48(FP)
RET

View file

@ -0,0 +1,33 @@
//go:build !amd64 || appengine || !gc || noasm
// +build !amd64 appengine !gc noasm
// Copyright 2019+ Klaus Post. All rights reserved.
// License information can be found in the LICENSE file.
package zstd
import (
"encoding/binary"
"math/bits"
)
// matchLen returns the maximum common prefix length of a and b.
// a must be the shortest of the two.
func matchLen(a, b []byte) (n int) {
for ; len(a) >= 8 && len(b) >= 8; a, b = a[8:], b[8:] {
diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b)
if diff != 0 {
return n + bits.TrailingZeros64(diff)>>3
}
n += 8
}
for i := range a {
if a[i] != b[i] {
break
}
n++
}
return n
}

View file

@ -236,13 +236,16 @@ func (s *sequenceDecs) decodeSync(hist []byte) error {
maxBlockSize = s.windowSize
}
if debugDecoder {
println("decodeSync: decoding", seqs, "sequences", br.remain(), "bits remain on stream")
}
for i := seqs - 1; i >= 0; i-- {
if br.overread() {
printf("reading sequence %d, exceeded available data\n", seqs-i)
printf("reading sequence %d, exceeded available data. Overread by %d\n", seqs-i, -br.remain())
return io.ErrUnexpectedEOF
}
var ll, mo, ml int
if br.off > 4+((maxOffsetBits+16+16)>>3) {
if len(br.in) > 4+((maxOffsetBits+16+16)>>3) {
// inlined function:
// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
@ -314,9 +317,6 @@ func (s *sequenceDecs) decodeSync(hist []byte) error {
}
size := ll + ml + len(out)
if size-startSize > maxBlockSize {
if size-startSize == 424242 {
panic("here")
}
return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
if size > cap(out) {
@ -427,8 +427,7 @@ func (s *sequenceDecs) decodeSync(hist []byte) error {
}
}
// Check if space for literals
if size := len(s.literals) + len(s.out) - startSize; size > maxBlockSize {
if size := len(s.literals) + len(out) - startSize; size > maxBlockSize {
return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
@ -453,18 +452,13 @@ func (s *sequenceDecs) next(br *bitReader, llState, mlState, ofState decSymbol)
// extra bits are stored in reverse order.
br.fill()
if s.maxBits <= 32 {
mo += br.getBits(moB)
ml += br.getBits(mlB)
ll += br.getBits(llB)
} else {
mo += br.getBits(moB)
mo += br.getBits(moB)
if s.maxBits > 32 {
br.fill()
// matchlength+literal length, max 32 bits
ml += br.getBits(mlB)
ll += br.getBits(llB)
}
// matchlength+literal length, max 32 bits
ml += br.getBits(mlB)
ll += br.getBits(llB)
mo = s.adjustOffset(mo, ll, moB)
return
}

View file

@ -5,6 +5,7 @@ package zstd
import (
"fmt"
"io"
"github.com/klauspost/compress/internal/cpuinfo"
)
@ -134,6 +135,9 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
return true, fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available",
ctx.ll, ctx.litRemain+ctx.ll)
case errorOverread:
return true, io.ErrUnexpectedEOF
case errorNotEnoughSpace:
size := ctx.outPosition + ctx.ll + ctx.ml
if debugDecoder {
@ -148,7 +152,6 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
s.seqSize += ctx.litRemain
if s.seqSize > maxBlockSize {
return true, fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
err := br.close()
if err != nil {
@ -203,6 +206,9 @@ const errorNotEnoughLiterals = 4
// error reported when capacity of `out` is too small
const errorNotEnoughSpace = 5
// error reported when bits are overread.
const errorOverread = 6
// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm.
//
// Please refer to seqdec_generic.go for the reference implementation.
@ -248,6 +254,10 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
litRemain: len(s.literals),
}
if debugDecoder {
println("decode: decoding", len(seqs), "sequences", br.remain(), "bits remain on stream")
}
s.seqSize = 0
lte56bits := s.maxBits+s.offsets.fse.actualTableLog+s.matchLengths.fse.actualTableLog+s.litLengths.fse.actualTableLog <= 56
var errCode int
@ -278,6 +288,8 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
case errorNotEnoughLiterals:
ll := ctx.seqs[i].ll
return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, ctx.litRemain+ll)
case errorOverread:
return io.ErrUnexpectedEOF
}
return fmt.Errorf("sequenceDecs_decode_amd64 returned erronous code %d", errCode)
@ -292,6 +304,9 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
if s.seqSize > maxBlockSize {
return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
}
if debugDecoder {
println("decode: ", br.remain(), "bits remain on stream. code:", errCode)
}
err := br.close()
if err != nil {
printf("Closing sequences: %v, %+v\n", err, *br)

View file

@ -5,11 +5,11 @@
// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
TEXT ·sequenceDecs_decode_amd64(SB), $8-32
MOVQ br+8(FP), AX
MOVQ 32(AX), DX
MOVBQZX 40(AX), BX
MOVQ 24(AX), SI
MOVQ (AX), AX
MOVQ br+8(FP), CX
MOVQ 24(CX), DX
MOVBQZX 32(CX), BX
MOVQ (CX), AX
MOVQ 8(CX), SI
ADDQ SI, AX
MOVQ AX, (SP)
MOVQ ctx+16(FP), AX
@ -38,7 +38,7 @@ sequenceDecs_decode_amd64_main_loop:
sequenceDecs_decode_amd64_fill_byte_by_byte:
CMPQ SI, $0x00
JLE sequenceDecs_decode_amd64_fill_end
JLE sequenceDecs_decode_amd64_fill_check_overread
CMPQ BX, $0x07
JLE sequenceDecs_decode_amd64_fill_end
SHLQ $0x08, DX
@ -49,6 +49,10 @@ sequenceDecs_decode_amd64_fill_byte_by_byte:
ORQ AX, DX
JMP sequenceDecs_decode_amd64_fill_byte_by_byte
sequenceDecs_decode_amd64_fill_check_overread:
CMPQ BX, $0x40
JA error_overread
sequenceDecs_decode_amd64_fill_end:
// Update offset
MOVQ R9, AX
@ -105,7 +109,7 @@ sequenceDecs_decode_amd64_ml_update_zero:
sequenceDecs_decode_amd64_fill_2_byte_by_byte:
CMPQ SI, $0x00
JLE sequenceDecs_decode_amd64_fill_2_end
JLE sequenceDecs_decode_amd64_fill_2_check_overread
CMPQ BX, $0x07
JLE sequenceDecs_decode_amd64_fill_2_end
SHLQ $0x08, DX
@ -116,6 +120,10 @@ sequenceDecs_decode_amd64_fill_2_byte_by_byte:
ORQ AX, DX
JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte
sequenceDecs_decode_amd64_fill_2_check_overread:
CMPQ BX, $0x40
JA error_overread
sequenceDecs_decode_amd64_fill_2_end:
// Update literal length
MOVQ DI, AX
@ -293,9 +301,9 @@ sequenceDecs_decode_amd64_match_len_ofs_ok:
MOVQ R12, 152(AX)
MOVQ R13, 160(AX)
MOVQ br+8(FP), AX
MOVQ DX, 32(AX)
MOVB BL, 40(AX)
MOVQ SI, 24(AX)
MOVQ DX, 24(AX)
MOVB BL, 32(AX)
MOVQ SI, 8(AX)
// Return success
MOVQ $0x00000000, ret+24(FP)
@ -320,18 +328,19 @@ error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP)
RET
// Return with not enough output space error
MOVQ $0x00000005, ret+24(FP)
// Return with overread error
error_overread:
MOVQ $0x00000006, ret+24(FP)
RET
// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
MOVQ br+8(FP), AX
MOVQ 32(AX), DX
MOVBQZX 40(AX), BX
MOVQ 24(AX), SI
MOVQ (AX), AX
MOVQ br+8(FP), CX
MOVQ 24(CX), DX
MOVBQZX 32(CX), BX
MOVQ (CX), AX
MOVQ 8(CX), SI
ADDQ SI, AX
MOVQ AX, (SP)
MOVQ ctx+16(FP), AX
@ -360,7 +369,7 @@ sequenceDecs_decode_56_amd64_main_loop:
sequenceDecs_decode_56_amd64_fill_byte_by_byte:
CMPQ SI, $0x00
JLE sequenceDecs_decode_56_amd64_fill_end
JLE sequenceDecs_decode_56_amd64_fill_check_overread
CMPQ BX, $0x07
JLE sequenceDecs_decode_56_amd64_fill_end
SHLQ $0x08, DX
@ -371,6 +380,10 @@ sequenceDecs_decode_56_amd64_fill_byte_by_byte:
ORQ AX, DX
JMP sequenceDecs_decode_56_amd64_fill_byte_by_byte
sequenceDecs_decode_56_amd64_fill_check_overread:
CMPQ BX, $0x40
JA error_overread
sequenceDecs_decode_56_amd64_fill_end:
// Update offset
MOVQ R9, AX
@ -590,9 +603,9 @@ sequenceDecs_decode_56_amd64_match_len_ofs_ok:
MOVQ R12, 152(AX)
MOVQ R13, 160(AX)
MOVQ br+8(FP), AX
MOVQ DX, 32(AX)
MOVB BL, 40(AX)
MOVQ SI, 24(AX)
MOVQ DX, 24(AX)
MOVB BL, 32(AX)
MOVQ SI, 8(AX)
// Return success
MOVQ $0x00000000, ret+24(FP)
@ -617,18 +630,19 @@ error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP)
RET
// Return with not enough output space error
MOVQ $0x00000005, ret+24(FP)
// Return with overread error
error_overread:
MOVQ $0x00000006, ret+24(FP)
RET
// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
MOVQ br+8(FP), CX
MOVQ 32(CX), AX
MOVBQZX 40(CX), DX
MOVQ 24(CX), BX
MOVQ (CX), CX
MOVQ br+8(FP), BX
MOVQ 24(BX), AX
MOVBQZX 32(BX), DX
MOVQ (BX), CX
MOVQ 8(BX), BX
ADDQ BX, CX
MOVQ CX, (SP)
MOVQ ctx+16(FP), CX
@ -657,7 +671,7 @@ sequenceDecs_decode_bmi2_main_loop:
sequenceDecs_decode_bmi2_fill_byte_by_byte:
CMPQ BX, $0x00
JLE sequenceDecs_decode_bmi2_fill_end
JLE sequenceDecs_decode_bmi2_fill_check_overread
CMPQ DX, $0x07
JLE sequenceDecs_decode_bmi2_fill_end
SHLQ $0x08, AX
@ -668,6 +682,10 @@ sequenceDecs_decode_bmi2_fill_byte_by_byte:
ORQ CX, AX
JMP sequenceDecs_decode_bmi2_fill_byte_by_byte
sequenceDecs_decode_bmi2_fill_check_overread:
CMPQ DX, $0x40
JA error_overread
sequenceDecs_decode_bmi2_fill_end:
// Update offset
MOVQ $0x00000808, CX
@ -708,7 +726,7 @@ sequenceDecs_decode_bmi2_fill_end:
sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
CMPQ BX, $0x00
JLE sequenceDecs_decode_bmi2_fill_2_end
JLE sequenceDecs_decode_bmi2_fill_2_check_overread
CMPQ DX, $0x07
JLE sequenceDecs_decode_bmi2_fill_2_end
SHLQ $0x08, AX
@ -719,6 +737,10 @@ sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
ORQ CX, AX
JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte
sequenceDecs_decode_bmi2_fill_2_check_overread:
CMPQ DX, $0x40
JA error_overread
sequenceDecs_decode_bmi2_fill_2_end:
// Update literal length
MOVQ $0x00000808, CX
@ -870,9 +892,9 @@ sequenceDecs_decode_bmi2_match_len_ofs_ok:
MOVQ R11, 152(CX)
MOVQ R12, 160(CX)
MOVQ br+8(FP), CX
MOVQ AX, 32(CX)
MOVB DL, 40(CX)
MOVQ BX, 24(CX)
MOVQ AX, 24(CX)
MOVB DL, 32(CX)
MOVQ BX, 8(CX)
// Return success
MOVQ $0x00000000, ret+24(FP)
@ -897,18 +919,19 @@ error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP)
RET
// Return with not enough output space error
MOVQ $0x00000005, ret+24(FP)
// Return with overread error
error_overread:
MOVQ $0x00000006, ret+24(FP)
RET
// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
MOVQ br+8(FP), CX
MOVQ 32(CX), AX
MOVBQZX 40(CX), DX
MOVQ 24(CX), BX
MOVQ (CX), CX
MOVQ br+8(FP), BX
MOVQ 24(BX), AX
MOVBQZX 32(BX), DX
MOVQ (BX), CX
MOVQ 8(BX), BX
ADDQ BX, CX
MOVQ CX, (SP)
MOVQ ctx+16(FP), CX
@ -937,7 +960,7 @@ sequenceDecs_decode_56_bmi2_main_loop:
sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
CMPQ BX, $0x00
JLE sequenceDecs_decode_56_bmi2_fill_end
JLE sequenceDecs_decode_56_bmi2_fill_check_overread
CMPQ DX, $0x07
JLE sequenceDecs_decode_56_bmi2_fill_end
SHLQ $0x08, AX
@ -948,6 +971,10 @@ sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
ORQ CX, AX
JMP sequenceDecs_decode_56_bmi2_fill_byte_by_byte
sequenceDecs_decode_56_bmi2_fill_check_overread:
CMPQ DX, $0x40
JA error_overread
sequenceDecs_decode_56_bmi2_fill_end:
// Update offset
MOVQ $0x00000808, CX
@ -1125,9 +1152,9 @@ sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
MOVQ R11, 152(CX)
MOVQ R12, 160(CX)
MOVQ br+8(FP), CX
MOVQ AX, 32(CX)
MOVB DL, 40(CX)
MOVQ BX, 24(CX)
MOVQ AX, 24(CX)
MOVB DL, 32(CX)
MOVQ BX, 8(CX)
// Return success
MOVQ $0x00000000, ret+24(FP)
@ -1152,8 +1179,9 @@ error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP)
RET
// Return with not enough output space error
MOVQ $0x00000005, ret+24(FP)
// Return with overread error
error_overread:
MOVQ $0x00000006, ret+24(FP)
RET
// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
@ -1389,8 +1417,7 @@ loop_finished:
MOVQ ctx+0(FP), AX
MOVQ DX, 24(AX)
MOVQ DI, 104(AX)
MOVQ 80(AX), CX
SUBQ CX, SI
SUBQ 80(AX), SI
MOVQ SI, 112(AX)
RET
@ -1402,8 +1429,7 @@ error_match_off_too_big:
MOVQ ctx+0(FP), AX
MOVQ DX, 24(AX)
MOVQ DI, 104(AX)
MOVQ 80(AX), CX
SUBQ CX, SI
SUBQ 80(AX), SI
MOVQ SI, 112(AX)
RET
@ -1747,8 +1773,7 @@ loop_finished:
MOVQ ctx+0(FP), AX
MOVQ DX, 24(AX)
MOVQ DI, 104(AX)
MOVQ 80(AX), CX
SUBQ CX, SI
SUBQ 80(AX), SI
MOVQ SI, 112(AX)
RET
@ -1760,8 +1785,7 @@ error_match_off_too_big:
MOVQ ctx+0(FP), AX
MOVQ DX, 24(AX)
MOVQ DI, 104(AX)
MOVQ 80(AX), CX
SUBQ CX, SI
SUBQ 80(AX), SI
MOVQ SI, 112(AX)
RET
@ -1773,11 +1797,11 @@ empty_seqs:
// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: CMOV, SSE
TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
MOVQ br+8(FP), AX
MOVQ 32(AX), DX
MOVBQZX 40(AX), BX
MOVQ 24(AX), SI
MOVQ (AX), AX
MOVQ br+8(FP), CX
MOVQ 24(CX), DX
MOVBQZX 32(CX), BX
MOVQ (CX), AX
MOVQ 8(CX), SI
ADDQ SI, AX
MOVQ AX, (SP)
MOVQ ctx+16(FP), AX
@ -1824,7 +1848,7 @@ sequenceDecs_decodeSync_amd64_main_loop:
sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
CMPQ SI, $0x00
JLE sequenceDecs_decodeSync_amd64_fill_end
JLE sequenceDecs_decodeSync_amd64_fill_check_overread
CMPQ BX, $0x07
JLE sequenceDecs_decodeSync_amd64_fill_end
SHLQ $0x08, DX
@ -1835,6 +1859,10 @@ sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
ORQ AX, DX
JMP sequenceDecs_decodeSync_amd64_fill_byte_by_byte
sequenceDecs_decodeSync_amd64_fill_check_overread:
CMPQ BX, $0x40
JA error_overread
sequenceDecs_decodeSync_amd64_fill_end:
// Update offset
MOVQ R9, AX
@ -1891,7 +1919,7 @@ sequenceDecs_decodeSync_amd64_ml_update_zero:
sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
CMPQ SI, $0x00
JLE sequenceDecs_decodeSync_amd64_fill_2_end
JLE sequenceDecs_decodeSync_amd64_fill_2_check_overread
CMPQ BX, $0x07
JLE sequenceDecs_decodeSync_amd64_fill_2_end
SHLQ $0x08, DX
@ -1902,6 +1930,10 @@ sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
ORQ AX, DX
JMP sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
sequenceDecs_decodeSync_amd64_fill_2_check_overread:
CMPQ BX, $0x40
JA error_overread
sequenceDecs_decodeSync_amd64_fill_2_end:
// Update literal length
MOVQ DI, AX
@ -2263,9 +2295,9 @@ handle_loop:
loop_finished:
MOVQ br+8(FP), AX
MOVQ DX, 32(AX)
MOVB BL, 40(AX)
MOVQ SI, 24(AX)
MOVQ DX, 24(AX)
MOVB BL, 32(AX)
MOVQ SI, 8(AX)
// Update the context
MOVQ ctx+16(FP), AX
@ -2311,6 +2343,11 @@ error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP)
RET
// Return with overread error
error_overread:
MOVQ $0x00000006, ret+24(FP)
RET
// Return with not enough output space error
error_not_enough_space:
MOVQ ctx+16(FP), AX
@ -2325,11 +2362,11 @@ error_not_enough_space:
// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: BMI, BMI2, CMOV, SSE
TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
MOVQ br+8(FP), CX
MOVQ 32(CX), AX
MOVBQZX 40(CX), DX
MOVQ 24(CX), BX
MOVQ (CX), CX
MOVQ br+8(FP), BX
MOVQ 24(BX), AX
MOVBQZX 32(BX), DX
MOVQ (BX), CX
MOVQ 8(BX), BX
ADDQ BX, CX
MOVQ CX, (SP)
MOVQ ctx+16(FP), CX
@ -2376,7 +2413,7 @@ sequenceDecs_decodeSync_bmi2_main_loop:
sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
CMPQ BX, $0x00
JLE sequenceDecs_decodeSync_bmi2_fill_end
JLE sequenceDecs_decodeSync_bmi2_fill_check_overread
CMPQ DX, $0x07
JLE sequenceDecs_decodeSync_bmi2_fill_end
SHLQ $0x08, AX
@ -2387,6 +2424,10 @@ sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
ORQ CX, AX
JMP sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
sequenceDecs_decodeSync_bmi2_fill_check_overread:
CMPQ DX, $0x40
JA error_overread
sequenceDecs_decodeSync_bmi2_fill_end:
// Update offset
MOVQ $0x00000808, CX
@ -2427,7 +2468,7 @@ sequenceDecs_decodeSync_bmi2_fill_end:
sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
CMPQ BX, $0x00
JLE sequenceDecs_decodeSync_bmi2_fill_2_end
JLE sequenceDecs_decodeSync_bmi2_fill_2_check_overread
CMPQ DX, $0x07
JLE sequenceDecs_decodeSync_bmi2_fill_2_end
SHLQ $0x08, AX
@ -2438,6 +2479,10 @@ sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
ORQ CX, AX
JMP sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
sequenceDecs_decodeSync_bmi2_fill_2_check_overread:
CMPQ DX, $0x40
JA error_overread
sequenceDecs_decodeSync_bmi2_fill_2_end:
// Update literal length
MOVQ $0x00000808, CX
@ -2773,9 +2818,9 @@ handle_loop:
loop_finished:
MOVQ br+8(FP), CX
MOVQ AX, 32(CX)
MOVB DL, 40(CX)
MOVQ BX, 24(CX)
MOVQ AX, 24(CX)
MOVB DL, 32(CX)
MOVQ BX, 8(CX)
// Update the context
MOVQ ctx+16(FP), AX
@ -2821,6 +2866,11 @@ error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP)
RET
// Return with overread error
error_overread:
MOVQ $0x00000006, ret+24(FP)
RET
// Return with not enough output space error
error_not_enough_space:
MOVQ ctx+16(FP), AX
@ -2835,11 +2885,11 @@ error_not_enough_space:
// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: CMOV, SSE
TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
MOVQ br+8(FP), AX
MOVQ 32(AX), DX
MOVBQZX 40(AX), BX
MOVQ 24(AX), SI
MOVQ (AX), AX
MOVQ br+8(FP), CX
MOVQ 24(CX), DX
MOVBQZX 32(CX), BX
MOVQ (CX), AX
MOVQ 8(CX), SI
ADDQ SI, AX
MOVQ AX, (SP)
MOVQ ctx+16(FP), AX
@ -2886,7 +2936,7 @@ sequenceDecs_decodeSync_safe_amd64_main_loop:
sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
CMPQ SI, $0x00
JLE sequenceDecs_decodeSync_safe_amd64_fill_end
JLE sequenceDecs_decodeSync_safe_amd64_fill_check_overread
CMPQ BX, $0x07
JLE sequenceDecs_decodeSync_safe_amd64_fill_end
SHLQ $0x08, DX
@ -2897,6 +2947,10 @@ sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
ORQ AX, DX
JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
sequenceDecs_decodeSync_safe_amd64_fill_check_overread:
CMPQ BX, $0x40
JA error_overread
sequenceDecs_decodeSync_safe_amd64_fill_end:
// Update offset
MOVQ R9, AX
@ -2953,7 +3007,7 @@ sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
CMPQ SI, $0x00
JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
JLE sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread
CMPQ BX, $0x07
JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
SHLQ $0x08, DX
@ -2964,6 +3018,10 @@ sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
ORQ AX, DX
JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread:
CMPQ BX, $0x40
JA error_overread
sequenceDecs_decodeSync_safe_amd64_fill_2_end:
// Update literal length
MOVQ DI, AX
@ -3427,9 +3485,9 @@ handle_loop:
loop_finished:
MOVQ br+8(FP), AX
MOVQ DX, 32(AX)
MOVB BL, 40(AX)
MOVQ SI, 24(AX)
MOVQ DX, 24(AX)
MOVB BL, 32(AX)
MOVQ SI, 8(AX)
// Update the context
MOVQ ctx+16(FP), AX
@ -3475,6 +3533,11 @@ error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP)
RET
// Return with overread error
error_overread:
MOVQ $0x00000006, ret+24(FP)
RET
// Return with not enough output space error
error_not_enough_space:
MOVQ ctx+16(FP), AX
@ -3489,11 +3552,11 @@ error_not_enough_space:
// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: BMI, BMI2, CMOV, SSE
TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
MOVQ br+8(FP), CX
MOVQ 32(CX), AX
MOVBQZX 40(CX), DX
MOVQ 24(CX), BX
MOVQ (CX), CX
MOVQ br+8(FP), BX
MOVQ 24(BX), AX
MOVBQZX 32(BX), DX
MOVQ (BX), CX
MOVQ 8(BX), BX
ADDQ BX, CX
MOVQ CX, (SP)
MOVQ ctx+16(FP), CX
@ -3540,7 +3603,7 @@ sequenceDecs_decodeSync_safe_bmi2_main_loop:
sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
CMPQ BX, $0x00
JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
JLE sequenceDecs_decodeSync_safe_bmi2_fill_check_overread
CMPQ DX, $0x07
JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
SHLQ $0x08, AX
@ -3551,6 +3614,10 @@ sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
ORQ CX, AX
JMP sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
sequenceDecs_decodeSync_safe_bmi2_fill_check_overread:
CMPQ DX, $0x40
JA error_overread
sequenceDecs_decodeSync_safe_bmi2_fill_end:
// Update offset
MOVQ $0x00000808, CX
@ -3591,7 +3658,7 @@ sequenceDecs_decodeSync_safe_bmi2_fill_end:
sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
CMPQ BX, $0x00
JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread
CMPQ DX, $0x07
JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
SHLQ $0x08, AX
@ -3602,6 +3669,10 @@ sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
ORQ CX, AX
JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread:
CMPQ DX, $0x40
JA error_overread
sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
// Update literal length
MOVQ $0x00000808, CX
@ -4039,9 +4110,9 @@ handle_loop:
loop_finished:
MOVQ br+8(FP), CX
MOVQ AX, 32(CX)
MOVB DL, 40(CX)
MOVQ BX, 24(CX)
MOVQ AX, 24(CX)
MOVB DL, 32(CX)
MOVQ BX, 8(CX)
// Update the context
MOVQ ctx+16(FP), AX
@ -4087,6 +4158,11 @@ error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP)
RET
// Return with overread error
error_overread:
MOVQ $0x00000006, ret+24(FP)
RET
// Return with not enough output space error
error_not_enough_space:
MOVQ ctx+16(FP), AX

View file

@ -29,7 +29,7 @@ func (s *sequenceDecs) decode(seqs []seqVals) error {
}
for i := range seqs {
var ll, mo, ml int
if br.off > 4+((maxOffsetBits+16+16)>>3) {
if len(br.in) > 4+((maxOffsetBits+16+16)>>3) {
// inlined function:
// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)

View file

@ -95,10 +95,9 @@ func (r *SnappyConverter) Convert(in io.Reader, w io.Writer) (int64, error) {
var written int64
var readHeader bool
{
var header []byte
var n int
header, r.err = frameHeader{WindowSize: snappyMaxBlockSize}.appendTo(r.buf[:0])
header := frameHeader{WindowSize: snappyMaxBlockSize}.appendTo(r.buf[:0])
var n int
n, r.err = w.Write(header)
if r.err != nil {
return written, r.err

View file

@ -9,7 +9,6 @@ import (
"errors"
"log"
"math"
"math/bits"
)
// enable debug printing
@ -36,9 +35,6 @@ const forcePreDef = false
// zstdMinMatch is the minimum zstd match length.
const zstdMinMatch = 3
// Reset the buffer offset when reaching this.
const bufferReset = math.MaxInt32 - MaxWindowSize
// fcsUnknown is used for unknown frame content size.
const fcsUnknown = math.MaxUint64
@ -75,7 +71,6 @@ var (
ErrDecoderSizeExceeded = errors.New("decompressed size exceeds configured limit")
// ErrUnknownDictionary is returned if the dictionary ID is unknown.
// For the time being dictionaries are not supported.
ErrUnknownDictionary = errors.New("unknown dictionary")
// ErrFrameSizeExceeded is returned if the stated frame size is exceeded.
@ -110,38 +105,12 @@ func printf(format string, a ...interface{}) {
}
}
// matchLen returns the maximum length.
// a must be the shortest of the two.
// The function also returns whether all bytes matched.
func matchLen(a, b []byte) int {
b = b[:len(a)]
for i := 0; i < len(a)-7; i += 8 {
if diff := load64(a, i) ^ load64(b, i); diff != 0 {
return i + (bits.TrailingZeros64(diff) >> 3)
}
}
checked := (len(a) >> 3) << 3
a = a[checked:]
b = b[checked:]
for i := range a {
if a[i] != b[i] {
return i + checked
}
}
return len(a) + checked
}
func load3232(b []byte, i int32) uint32 {
return binary.LittleEndian.Uint32(b[i:])
return binary.LittleEndian.Uint32(b[:len(b):len(b)][i:])
}
func load6432(b []byte, i int32) uint64 {
return binary.LittleEndian.Uint64(b[i:])
}
func load64(b []byte, i int) uint64 {
return binary.LittleEndian.Uint64(b[i:])
return binary.LittleEndian.Uint64(b[:len(b):len(b)][i:])
}
type byter interface {

4
vendor/modules.txt vendored
View file

@ -467,8 +467,8 @@ github.com/ishidawataru/sctp
# github.com/jmespath/go-jmespath v0.4.0
## explicit; go 1.14
github.com/jmespath/go-jmespath
# github.com/klauspost/compress v1.15.12
## explicit; go 1.17
# github.com/klauspost/compress v1.17.2
## explicit; go 1.18
github.com/klauspost/compress
github.com/klauspost/compress/fse
github.com/klauspost/compress/huff0