View source

Merge pull request #44043 from thaJeztah/bump_klauspost_compress

vendor: github.com/klauspost/compress v1.15.9
Sebastiaan van Stijn, 2 years ago
Parent
Commit 0db50996b7
47 changed files with 6405 additions and 2296 deletions
  1. vendor.mod (+1 -1)
  2. vendor.sum (+2 -1)
  3. vendor/github.com/klauspost/compress/.gitignore (+7 -0)
  4. vendor/github.com/klauspost/compress/README.md (+71 -0)
  5. vendor/github.com/klauspost/compress/huff0/autogen.go (+0 -5)
  6. vendor/github.com/klauspost/compress/huff0/bitreader.go (+0 -10)
  7. vendor/github.com/klauspost/compress/huff0/bitwriter.go (+0 -115)
  8. vendor/github.com/klauspost/compress/huff0/bytereader.go (+0 -10)
  9. vendor/github.com/klauspost/compress/huff0/compress.go (+1 -0)
  10. vendor/github.com/klauspost/compress/huff0/decompress.go (+1 -112)
  11. vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s (+0 -488)
  12. vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in (+0 -197)
  13. vendor/github.com/klauspost/compress/huff0/decompress_amd64.go (+108 -67)
  14. vendor/github.com/klauspost/compress/huff0/decompress_amd64.s (+747 -406)
  15. vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in (+0 -195)
  16. vendor/github.com/klauspost/compress/huff0/decompress_generic.go (+102 -0)
  17. vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go (+34 -0)
  18. vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go (+11 -0)
  19. vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s (+36 -0)
  20. vendor/github.com/klauspost/compress/zstd/README.md (+21 -37)
  21. vendor/github.com/klauspost/compress/zstd/bitreader.go (+0 -7)
  22. vendor/github.com/klauspost/compress/zstd/bitwriter.go (+0 -76)
  23. vendor/github.com/klauspost/compress/zstd/blockdec.go (+48 -25)
  24. vendor/github.com/klauspost/compress/zstd/bytebuf.go (+9 -10)
  25. vendor/github.com/klauspost/compress/zstd/bytereader.go (+0 -6)
  26. vendor/github.com/klauspost/compress/zstd/decoder.go (+62 -62)
  27. vendor/github.com/klauspost/compress/zstd/decoder_options.go (+11 -2)
  28. vendor/github.com/klauspost/compress/zstd/enc_better.go (+4 -4)
  29. vendor/github.com/klauspost/compress/zstd/enc_dfast.go (+5 -5)
  30. vendor/github.com/klauspost/compress/zstd/encoder.go (+3 -3)
  31. vendor/github.com/klauspost/compress/zstd/encoder_options.go (+1 -1)
  32. vendor/github.com/klauspost/compress/zstd/framedec.go (+67 -24)
  33. vendor/github.com/klauspost/compress/zstd/fse_decoder.go (+25 -103)
  34. vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go (+64 -0)
  35. vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s (+127 -0)
  36. vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go (+72 -0)
  37. vendor/github.com/klauspost/compress/zstd/fse_encoder.go (+0 -23)
  38. vendor/github.com/klauspost/compress/zstd/fuzz.go (+0 -11)
  39. vendor/github.com/klauspost/compress/zstd/fuzz_none.go (+0 -11)
  40. vendor/github.com/klauspost/compress/zstd/hash.go (+0 -6)
  41. vendor/github.com/klauspost/compress/zstd/seqdec.go (+18 -242)
  42. vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go (+368 -0)
  43. vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s (+4100 -0)
  44. vendor/github.com/klauspost/compress/zstd/seqdec_generic.go (+237 -0)
  45. vendor/github.com/klauspost/compress/zstd/zip.go (+39 -18)
  46. vendor/github.com/klauspost/compress/zstd/zstd.go (+0 -11)
  47. vendor/modules.txt (+3 -2)

+ 1 - 1
vendor.mod

@@ -47,7 +47,7 @@ require (
 	github.com/hashicorp/serf v0.8.5
 	github.com/imdario/mergo v0.3.12
 	github.com/ishidawataru/sctp v0.0.0-20210707070123-9a39160e9062
-	github.com/klauspost/compress v1.15.1
+	github.com/klauspost/compress v1.15.9
 	github.com/miekg/dns v1.1.27
 	github.com/mistifyio/go-zfs v2.1.2-0.20190413222219-f784269be439+incompatible
 	github.com/moby/buildkit v0.10.4

+ 2 - 1
vendor.sum

@@ -691,8 +691,9 @@ github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI
 github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
 github.com/klauspost/compress v1.11.3/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
 github.com/klauspost/compress v1.11.13/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
-github.com/klauspost/compress v1.15.1 h1:y9FcTHGyrebwfP0ZZqFiaxTaiDnUrGkJkI+f583BL1A=
 github.com/klauspost/compress v1.15.1/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk=
+github.com/klauspost/compress v1.15.9 h1:wKRjX6JRtDdrE9qwa4b/Cip7ACOshUI4smpCQanqjSY=
+github.com/klauspost/compress v1.15.9/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU=
 github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
 github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
 github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=

+ 7 - 0
vendor/github.com/klauspost/compress/.gitignore

@@ -23,3 +23,10 @@ _testmain.go
 *.test
 *.prof
 /s2/cmd/_s2sx/sfx-exe
+
+# Linux perf files
+perf.data
+perf.data.old
+
+# gdb history
+.gdb_history

+ 71 - 0
vendor/github.com/klauspost/compress/README.md

@@ -17,6 +17,72 @@ This package provides various compression algorithms.
 
 # changelog
 
+* July 13, 2022 (v1.15.8)
+
+	* gzip: fix stack exhaustion bug in Reader.Read https://github.com/klauspost/compress/pull/641
+	* s2: Add Index header trim/restore https://github.com/klauspost/compress/pull/638
+	* zstd: Optimize seqdeq amd64 asm by @greatroar in https://github.com/klauspost/compress/pull/636
+	* zstd: Improve decoder memcopy https://github.com/klauspost/compress/pull/637
+	* huff0: Pass a single bitReader pointer to asm by @greatroar in https://github.com/klauspost/compress/pull/634
+	* zstd: Branchless getBits for amd64 w/o BMI2 by @greatroar in https://github.com/klauspost/compress/pull/640
+	* gzhttp: Remove header before writing https://github.com/klauspost/compress/pull/639
+
+* June 29, 2022 (v1.15.7)
+
+	* s2: Fix absolute forward seeks  https://github.com/klauspost/compress/pull/633
+	* zip: Merge upstream  https://github.com/klauspost/compress/pull/631
+	* zip: Re-add zip64 fix https://github.com/klauspost/compress/pull/624
+	* zstd: translate fseDecoder.buildDtable into asm by @WojciechMula in https://github.com/klauspost/compress/pull/598
+	* flate: Faster histograms  https://github.com/klauspost/compress/pull/620
+	* deflate: Use compound hcode  https://github.com/klauspost/compress/pull/622
+
+* June 3, 2022 (v1.15.6)
+	* s2: Improve coding for long, close matches https://github.com/klauspost/compress/pull/613
+	* s2c: Add Snappy/S2 stream recompression https://github.com/klauspost/compress/pull/611
+	* zstd: Always use configured block size https://github.com/klauspost/compress/pull/605
+	* zstd: Fix incorrect hash table placement for dict encoding in default https://github.com/klauspost/compress/pull/606
+	* zstd: Apply default config to ZipDecompressor without options https://github.com/klauspost/compress/pull/608
+	* gzhttp: Exclude more common archive formats https://github.com/klauspost/compress/pull/612
+	* s2: Add ReaderIgnoreCRC https://github.com/klauspost/compress/pull/609
+	* s2: Remove sanity load on index creation https://github.com/klauspost/compress/pull/607
+	* snappy: Use dedicated function for scoring https://github.com/klauspost/compress/pull/614
+	* s2c+s2d: Use official snappy framed extension https://github.com/klauspost/compress/pull/610
+
+* May 25, 2022 (v1.15.5)
+	* s2: Add concurrent stream decompression https://github.com/klauspost/compress/pull/602
+	* s2: Fix final emit oob read crash on amd64 https://github.com/klauspost/compress/pull/601
+	* huff0: asm implementation of Decompress1X by @WojciechMula https://github.com/klauspost/compress/pull/596
+	* zstd: Use 1 less goroutine for stream decoding https://github.com/klauspost/compress/pull/588
+	* zstd: Copy literal in 16 byte blocks when possible https://github.com/klauspost/compress/pull/592
+	* zstd: Speed up when WithDecoderLowmem(false) https://github.com/klauspost/compress/pull/599
+	* zstd: faster next state update in BMI2 version of decode by @WojciechMula in https://github.com/klauspost/compress/pull/593
+	* huff0: Do not check max size when reading table. https://github.com/klauspost/compress/pull/586
+	* flate: Inplace hashing for level 7-9 by @klauspost in https://github.com/klauspost/compress/pull/590
+
+
+* May 11, 2022 (v1.15.4)
+	* huff0: decompress directly into output by @WojciechMula in [#577](https://github.com/klauspost/compress/pull/577)
+	* inflate: Keep dict on stack [#581](https://github.com/klauspost/compress/pull/581)
+	* zstd: Faster decoding memcopy in asm [#583](https://github.com/klauspost/compress/pull/583)
+	* zstd: Fix ignored crc [#580](https://github.com/klauspost/compress/pull/580)
+
+* May 5, 2022 (v1.15.3)
+	* zstd: Allow to ignore checksum checking by @WojciechMula [#572](https://github.com/klauspost/compress/pull/572)
+	* s2: Fix incorrect seek for io.SeekEnd in [#575](https://github.com/klauspost/compress/pull/575)
+
+* Apr 26, 2022 (v1.15.2)
+	* zstd: Add x86-64 assembly for decompression on streams and blocks. Contributed by [@WojciechMula](https://github.com/WojciechMula). Typically 2x faster.  [#528](https://github.com/klauspost/compress/pull/528) [#531](https://github.com/klauspost/compress/pull/531) [#545](https://github.com/klauspost/compress/pull/545) [#537](https://github.com/klauspost/compress/pull/537)
+	* zstd: Add options to ZipDecompressor and fixes [#539](https://github.com/klauspost/compress/pull/539)
+	* s2: Use sorted search for index [#555](https://github.com/klauspost/compress/pull/555)
+	* Minimum version is Go 1.16, added CI test on 1.18.
+
+* Mar 11, 2022 (v1.15.1)
+	* huff0: Add x86 assembly of Decode4X by @WojciechMula in [#512](https://github.com/klauspost/compress/pull/512)
+	* zstd: Reuse zip decoders in [#514](https://github.com/klauspost/compress/pull/514)
+	* zstd: Detect extra block data and report as corrupted in [#520](https://github.com/klauspost/compress/pull/520)
+	* zstd: Handle zero sized frame content size stricter in [#521](https://github.com/klauspost/compress/pull/521)
+	* zstd: Add stricter block size checks in [#523](https://github.com/klauspost/compress/pull/523)
+
 * Mar 3, 2022 (v1.15.0)
 	* zstd: Refactor decoder by @klauspost in [#498](https://github.com/klauspost/compress/pull/498)
 	* zstd: Add stream encoding without goroutines by @klauspost in [#505](https://github.com/klauspost/compress/pull/505)
@@ -60,6 +126,9 @@ While the release has been extensively tested, it is recommended to testing when
 	* zstd: add arm64 xxhash assembly in [#464](https://github.com/klauspost/compress/pull/464)
 	* Add garbled for binaries for s2 in [#445](https://github.com/klauspost/compress/pull/445)
 
+<details>
+	<summary>See changes to v1.13.x</summary>
+	
 * Aug 30, 2021 (v1.13.5)
 	* gz/zlib/flate: Alias stdlib errors [#425](https://github.com/klauspost/compress/pull/425)
 	* s2: Add block support to commandline tools [#413](https://github.com/klauspost/compress/pull/413)
@@ -88,6 +157,8 @@ While the release has been extensively tested, it is recommended to testing when
 	* Added [gzhttp](https://github.com/klauspost/compress/tree/master/gzhttp#gzip-handler) which allows wrapping HTTP servers and clients with GZIP compressors.
 	* zstd: Detect short invalid signatures [#382](https://github.com/klauspost/compress/pull/382)
 	* zstd: Spawn decoder goroutine only if needed. [#380](https://github.com/klauspost/compress/pull/380)
+</details>
+
 
 <details>
 	<summary>See changes to v1.12.x</summary>
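The changelog entries above reference several public decoder options (goroutine count for stream decoding, low-memory mode, checksum handling). As orientation only, here is a hedged sketch of how those options from the github.com/klauspost/compress/zstd API are typically used; it is illustrative and not part of this diff:

```go
package main

import (
	"fmt"

	"github.com/klauspost/compress/zstd"
)

func main() {
	// Produce a small zstd frame to decode.
	enc, err := zstd.NewWriter(nil)
	if err != nil {
		panic(err)
	}
	payload := enc.EncodeAll([]byte("hello zstd"), nil)
	enc.Close()

	// Options named in the changelog above (sketch; defaults are fine too).
	dec, err := zstd.NewReader(nil,
		zstd.WithDecoderConcurrency(1), // fewer goroutines for stream decoding (cf. #588)
		zstd.WithDecoderLowmem(false),  // trade memory for speed (cf. #599)
	)
	if err != nil {
		panic(err)
	}
	defer dec.Close()

	out, err := dec.DecodeAll(payload, nil)
	fmt.Println(string(out), err)
}
```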

+ 0 - 5
vendor/github.com/klauspost/compress/huff0/autogen.go

@@ -1,5 +0,0 @@
-package huff0
-
-//go:generate go run generate.go
-//go:generate asmfmt -w decompress_amd64.s
-//go:generate asmfmt -w decompress_8b_amd64.s

+ 0 - 10
vendor/github.com/klauspost/compress/huff0/bitreader.go

@@ -165,11 +165,6 @@ func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 {
 	return uint16(b.value >> ((64 - n) & 63))
 }
 
-// peekTopBits(n) is equvialent to peekBitFast(64 - n)
-func (b *bitReaderShifted) peekTopBits(n uint8) uint16 {
-	return uint16(b.value >> n)
-}
-
 func (b *bitReaderShifted) advance(n uint8) {
 	b.bitsRead += n
 	b.value <<= n & 63
@@ -220,11 +215,6 @@ func (b *bitReaderShifted) fill() {
 	}
 }
 
-// finished returns true if all bits have been read from the bit stream.
-func (b *bitReaderShifted) finished() bool {
-	return b.off == 0 && b.bitsRead >= 64
-}
-
 func (b *bitReaderShifted) remaining() uint {
 	return b.off*8 + uint(64-b.bitsRead)
 }
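The two helpers deleted here belonged to huff0's left-justified bit reader: unread bits always occupy the top of a 64-bit container, so a peek is a single right shift and an advance is a left shift. A minimal self-contained sketch of that pattern (field and method names are illustrative, not the vendored ones):

```go
package main

import "fmt"

// leftBits keeps unread bits in the high end of value, so peeking n bits
// is one shift right and advancing is one shift left.
type leftBits struct {
	value    uint64
	bitsRead uint8
}

func (b *leftBits) peek(n uint8) uint16 { return uint16(b.value >> ((64 - n) & 63)) }

func (b *leftBits) advance(n uint8) {
	b.bitsRead += n
	b.value <<= n & 63
}

func main() {
	b := leftBits{value: 0xABCD << 48}
	fmt.Printf("%x\n", b.peek(16)) // abcd
	b.advance(4)
	fmt.Printf("%x\n", b.peek(12)) // bcd
}
```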

+ 0 - 115
vendor/github.com/klauspost/compress/huff0/bitwriter.go

@@ -5,8 +5,6 @@
 
 package huff0
 
-import "fmt"
-
 // bitWriter will write bits.
 // First bit will be LSB of the first byte of output.
 type bitWriter struct {
@@ -23,14 +21,6 @@ var bitMask16 = [32]uint16{
 	0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
 	0xFFFF, 0xFFFF} /* up to 16 bits */
 
-// addBits16NC will add up to 16 bits.
-// It will not check if there is space for them,
-// so the caller must ensure that it has flushed recently.
-func (b *bitWriter) addBits16NC(value uint16, bits uint8) {
-	b.bitContainer |= uint64(value&bitMask16[bits&31]) << (b.nBits & 63)
-	b.nBits += bits
-}
-
 // addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
 // It will not check if there is space for them, so the caller must ensure that it has flushed recently.
 func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
@@ -70,104 +60,6 @@ func (b *bitWriter) encTwoSymbols(ct cTable, av, bv byte) {
 	b.nBits += encA.nBits + encB.nBits
 }
 
-// addBits16ZeroNC will add up to 16 bits.
-// It will not check if there is space for them,
-// so the caller must ensure that it has flushed recently.
-// This is fastest if bits can be zero.
-func (b *bitWriter) addBits16ZeroNC(value uint16, bits uint8) {
-	if bits == 0 {
-		return
-	}
-	value <<= (16 - bits) & 15
-	value >>= (16 - bits) & 15
-	b.bitContainer |= uint64(value) << (b.nBits & 63)
-	b.nBits += bits
-}
-
-// flush will flush all pending full bytes.
-// There will be at least 56 bits available for writing when this has been called.
-// Using flush32 is faster, but leaves less space for writing.
-func (b *bitWriter) flush() {
-	v := b.nBits >> 3
-	switch v {
-	case 0:
-		return
-	case 1:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-		)
-		b.bitContainer >>= 1 << 3
-	case 2:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-		)
-		b.bitContainer >>= 2 << 3
-	case 3:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-			byte(b.bitContainer>>16),
-		)
-		b.bitContainer >>= 3 << 3
-	case 4:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-			byte(b.bitContainer>>16),
-			byte(b.bitContainer>>24),
-		)
-		b.bitContainer >>= 4 << 3
-	case 5:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-			byte(b.bitContainer>>16),
-			byte(b.bitContainer>>24),
-			byte(b.bitContainer>>32),
-		)
-		b.bitContainer >>= 5 << 3
-	case 6:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-			byte(b.bitContainer>>16),
-			byte(b.bitContainer>>24),
-			byte(b.bitContainer>>32),
-			byte(b.bitContainer>>40),
-		)
-		b.bitContainer >>= 6 << 3
-	case 7:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-			byte(b.bitContainer>>16),
-			byte(b.bitContainer>>24),
-			byte(b.bitContainer>>32),
-			byte(b.bitContainer>>40),
-			byte(b.bitContainer>>48),
-		)
-		b.bitContainer >>= 7 << 3
-	case 8:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-			byte(b.bitContainer>>16),
-			byte(b.bitContainer>>24),
-			byte(b.bitContainer>>32),
-			byte(b.bitContainer>>40),
-			byte(b.bitContainer>>48),
-			byte(b.bitContainer>>56),
-		)
-		b.bitContainer = 0
-		b.nBits = 0
-		return
-	default:
-		panic(fmt.Errorf("bits (%d) > 64", b.nBits))
-	}
-	b.nBits &= 7
-}
-
 // flush32 will flush out, so there are at least 32 bits available for writing.
 func (b *bitWriter) flush32() {
 	if b.nBits < 32 {
@@ -201,10 +93,3 @@ func (b *bitWriter) close() error {
 	b.flushAlign()
 	return nil
 }
-
-// reset and continue writing by appending to out.
-func (b *bitWriter) reset(out []byte) {
-	b.bitContainer = 0
-	b.nBits = 0
-	b.out = out
-}
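The deleted flush() drained pending bits byte-by-byte through a nine-way switch; the retained flush32 path flushes whole 32-bit chunks instead, which is why the comment above calls it faster. A hedged sketch of the chunked variant (illustrative names, little-endian byte order as in huff0):

```go
package main

import (
	"encoding/binary"
	"fmt"
)

type bw struct {
	container uint64
	nBits     uint8
	out       []byte
}

// flush32 writes four whole bytes whenever at least 32 bits are pending,
// guaranteeing at least 32 bits of headroom afterwards.
func (b *bw) flush32() {
	if b.nBits < 32 {
		return
	}
	b.out = binary.LittleEndian.AppendUint32(b.out, uint32(b.container))
	b.container >>= 32
	b.nBits -= 32
}

func main() {
	b := &bw{container: 0x1122334455, nBits: 40}
	b.flush32()
	fmt.Printf("% x, pending=%d\n", b.out, b.nBits) // 55 44 33 22, pending=8
}
```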

+ 0 - 10
vendor/github.com/klauspost/compress/huff0/bytereader.go

@@ -20,11 +20,6 @@ func (b *byteReader) init(in []byte) {
 	b.off = 0
 }
 
-// advance the stream b n bytes.
-func (b *byteReader) advance(n uint) {
-	b.off += int(n)
-}
-
 // Int32 returns a little endian int32 starting at current offset.
 func (b byteReader) Int32() int32 {
 	v3 := int32(b.b[b.off+3])
@@ -43,11 +38,6 @@ func (b byteReader) Uint32() uint32 {
 	return (v3 << 24) | (v2 << 16) | (v1 << 8) | v0
 }
 
-// unread returns the unread portion of the input.
-func (b byteReader) unread() []byte {
-	return b.b[b.off:]
-}
-
 // remain will return the number of bytes remaining.
 func (b byteReader) remain() int {
 	return len(b.b) - b.off
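byteReader assembles little-endian integers by shifting individual bytes, which keeps the file free of imports. For reference, the equivalent reads expressed with the standard library look like this (a sketch, not the vendored code):

```go
package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	buf := []byte{0x78, 0x56, 0x34, 0x12}
	// Equivalent of byteReader.Uint32/Int32 at offset 0.
	u := binary.LittleEndian.Uint32(buf)
	fmt.Printf("%#x %d\n", u, int32(u)) // 0x12345678 305419896
}
```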

+ 1 - 0
vendor/github.com/klauspost/compress/huff0/compress.go

@@ -404,6 +404,7 @@ func (s *Scratch) canUseTable(c cTable) bool {
 	return true
 }
 
+//lint:ignore U1000 used for debugging
 func (s *Scratch) validateTable(c cTable) bool {
 	if len(c) < int(s.symbolLen) {
 		return false
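The //lint:ignore U1000 directive added here tells staticcheck to skip its unused-code check (U1000) for the declaration immediately below it, so the debugging-only validateTable survives linting. The general form, as a standalone sketch:

```go
package main

// staticcheck directive form: //lint:ignore <check> <reason>, placed
// directly above the declaration it exempts (U1000 is the "unused" check).
//
//lint:ignore U1000 kept for debugging
func debugOnlyHelper() {}

func main() {}
```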

+ 1 - 112
vendor/github.com/klauspost/compress/huff0/decompress.go

@@ -11,7 +11,6 @@ import (
 
 type dTable struct {
 	single []dEntrySingle
-	double []dEntryDouble
 }
 
 // single-symbols decoding
@@ -19,13 +18,6 @@ type dEntrySingle struct {
 	entry uint16
 }
 
-// double-symbols decoding
-type dEntryDouble struct {
-	seq   [4]byte
-	nBits uint8
-	len   uint8
-}
-
 // Uses special code for all tables that are < 8 bits.
 const use8BitTables = true
 
@@ -35,7 +27,7 @@ const use8BitTables = true
 // If no Scratch is provided a new one is allocated.
 // The returned Scratch can be used for encoding or decoding input using this table.
 func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
-	s, err = s.prepare(in)
+	s, err = s.prepare(nil)
 	if err != nil {
 		return s, nil, err
 	}
@@ -236,108 +228,6 @@ func (d *Decoder) buffer() *[4][256]byte {
 	return &[4][256]byte{}
 }
 
-// Decompress1X will decompress a 1X encoded stream.
-// The cap of the output buffer will be the maximum decompressed size.
-// The length of the supplied input must match the end of a block exactly.
-func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
-	if len(d.dt.single) == 0 {
-		return nil, errors.New("no table loaded")
-	}
-	if use8BitTables && d.actualTableLog <= 8 {
-		return d.decompress1X8Bit(dst, src)
-	}
-	var br bitReaderShifted
-	err := br.init(src)
-	if err != nil {
-		return dst, err
-	}
-	maxDecodedSize := cap(dst)
-	dst = dst[:0]
-
-	// Avoid bounds check by always having full sized table.
-	const tlSize = 1 << tableLogMax
-	const tlMask = tlSize - 1
-	dt := d.dt.single[:tlSize]
-
-	// Use temp table to avoid bound checks/append penalty.
-	bufs := d.buffer()
-	buf := &bufs[0]
-	var off uint8
-
-	for br.off >= 8 {
-		br.fillFast()
-		v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
-		br.advance(uint8(v.entry))
-		buf[off+0] = uint8(v.entry >> 8)
-
-		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
-		br.advance(uint8(v.entry))
-		buf[off+1] = uint8(v.entry >> 8)
-
-		// Refill
-		br.fillFast()
-
-		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
-		br.advance(uint8(v.entry))
-		buf[off+2] = uint8(v.entry >> 8)
-
-		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
-		br.advance(uint8(v.entry))
-		buf[off+3] = uint8(v.entry >> 8)
-
-		off += 4
-		if off == 0 {
-			if len(dst)+256 > maxDecodedSize {
-				br.close()
-				d.bufs.Put(bufs)
-				return nil, ErrMaxDecodedSizeExceeded
-			}
-			dst = append(dst, buf[:]...)
-		}
-	}
-
-	if len(dst)+int(off) > maxDecodedSize {
-		d.bufs.Put(bufs)
-		br.close()
-		return nil, ErrMaxDecodedSizeExceeded
-	}
-	dst = append(dst, buf[:off]...)
-
-	// br < 8, so uint8 is fine
-	bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
-	for bitsLeft > 0 {
-		br.fill()
-		if false && br.bitsRead >= 32 {
-			if br.off >= 4 {
-				v := br.in[br.off-4:]
-				v = v[:4]
-				low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-				br.value = (br.value << 32) | uint64(low)
-				br.bitsRead -= 32
-				br.off -= 4
-			} else {
-				for br.off > 0 {
-					br.value = (br.value << 8) | uint64(br.in[br.off-1])
-					br.bitsRead -= 8
-					br.off--
-				}
-			}
-		}
-		if len(dst) >= maxDecodedSize {
-			d.bufs.Put(bufs)
-			br.close()
-			return nil, ErrMaxDecodedSizeExceeded
-		}
-		v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
-		nBits := uint8(v.entry)
-		br.advance(nBits)
-		bitsLeft -= nBits
-		dst = append(dst, uint8(v.entry>>8))
-	}
-	d.bufs.Put(bufs)
-	return dst, br.close()
-}
-
 // decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8.
 // The cap of the output buffer will be the maximum decompressed size.
 // The length of the supplied input must match the end of a block exactly.
@@ -995,7 +885,6 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
 
 	const shift = 56
 	const tlSize = 1 << 8
-	const tlMask = tlSize - 1
 	single := d.dt.single[:tlSize]
 
 	// Use temp table to avoid bound checks/append penalty.
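Decompress1X leaves this file because its hot loop now lives in decompress_amd64.go/.s, with a portable twin in the new decompress_generic.go. The core of that loop is classic table-driven Huffman decoding: peek tableLog bits, look up a packed (symbol, length) entry, advance by the length. A toy hedged sketch with a 1-bit table (mirroring the dEntrySingle packing, but not the vendored loop):

```go
package main

import "fmt"

// Each table entry packs the code length in the low byte and the decoded
// symbol in the high byte, like huff0's dEntrySingle.entry.
func main() {
	const tableLog = 1
	table := [1 << tableLog]uint16{
		0: 1 | uint16('a')<<8, // bit 0 -> 'a', 1 bit long
		1: 1 | uint16('b')<<8, // bit 1 -> 'b', 1 bit long
	}

	// Bits are left-justified in a 64-bit container: 0110... decodes to "abba".
	value := uint64(0b0110) << 60
	var out []byte
	for i := 0; i < 4; i++ {
		v := table[value>>(64-tableLog)] // peek tableLog bits
		out = append(out, byte(v>>8))    // emit symbol
		value <<= v & 0xff               // advance by code length
	}
	fmt.Println(string(out)) // abba
}
```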

+ 0 - 488
vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s

@@ -1,488 +0,0 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-#define bufoff      256 // see decompress.go, we're using [4][256]byte table
-
-// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-//	peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
-#define off             R8
-#define buffer          DI
-#define table           SI
-
-#define br_bits_read    R9
-#define br_value        R10
-#define br_offset       R11
-#define peek_bits       R12
-#define exhausted       DX
-
-#define br0             R13
-#define br1             R14
-#define br2             R15
-#define br3             BP
-
-	MOVQ BP, 0(SP)
-
-	XORQ exhausted, exhausted // exhausted = false
-	XORQ off, off             // off = 0
-
-	MOVBQZX peekBits+32(FP), peek_bits
-	MOVQ    buf+40(FP), buffer
-	MOVQ    tbl+48(FP), table
-
-	MOVQ pbr0+0(FP), br0
-	MOVQ pbr1+8(FP), br1
-	MOVQ pbr2+16(FP), br2
-	MOVQ pbr3+24(FP), br3
-
-main_loop:
-
-	// const stream = 0
-	// br0.fillFast()
-	MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
-	MOVQ    bitReaderShifted_value(br0), br_value
-	MOVQ    bitReaderShifted_off(br0), br_offset
-
-	// if b.bitsRead >= 32 {
-	CMPQ br_bits_read, $32
-	JB   skip_fill0
-
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
-	SUBQ $4, br_offset     // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	MOVQ bitReaderShifted_in(br0), AX
-	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVQ br_bits_read, CX
-	SHLQ CL, AX
-	ORQ  AX, br_value
-
-	// exhausted = exhausted || (br0.off < 4)
-	CMPQ  br_offset, $4
-	SETLT DL
-	ORB   DL, DH
-
-	// }
-skip_fill0:
-
-	// val0 := br0.peekTopBits(peekBits)
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v0 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br0.advance(uint8(v0.entry))
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// val1 := br0.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v1 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
-
-	// br0.advance(uint8(v1.entry))
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CX, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// these two writes get coalesced
-	// buf[stream][off] = uint8(v0.entry >> 8)
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
-	MOVW BX, 0(buffer)(off*1)
-
-	// SECOND PART:
-	// val2 := br0.peekTopBits(peekBits)
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v2 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br0.advance(uint8(v0.entry))
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// val3 := br0.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v3 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
-
-	// br0.advance(uint8(v1.entry))
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CX, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// these two writes get coalesced
-	// buf[stream][off+2] = uint8(v2.entry >> 8)
-	// buf[stream][off+3] = uint8(v3.entry >> 8)
-	MOVW BX, 0+2(buffer)(off*1)
-
-	// update the bitrader reader structure
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
-	MOVQ br_value, bitReaderShifted_value(br0)
-	MOVQ br_offset, bitReaderShifted_off(br0)
-
-	// const stream = 1
-	// br1.fillFast()
-	MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
-	MOVQ    bitReaderShifted_value(br1), br_value
-	MOVQ    bitReaderShifted_off(br1), br_offset
-
-	// if b.bitsRead >= 32 {
-	CMPQ br_bits_read, $32
-	JB   skip_fill1
-
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
-	SUBQ $4, br_offset     // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	MOVQ bitReaderShifted_in(br1), AX
-	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVQ br_bits_read, CX
-	SHLQ CL, AX
-	ORQ  AX, br_value
-
-	// exhausted = exhausted || (br1.off < 4)
-	CMPQ  br_offset, $4
-	SETLT DL
-	ORB   DL, DH
-
-	// }
-skip_fill1:
-
-	// val0 := br1.peekTopBits(peekBits)
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v0 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br1.advance(uint8(v0.entry))
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// val1 := br1.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v1 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
-
-	// br1.advance(uint8(v1.entry))
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CX, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// these two writes get coalesced
-	// buf[stream][off] = uint8(v0.entry >> 8)
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
-	MOVW BX, 256(buffer)(off*1)
-
-	// SECOND PART:
-	// val2 := br1.peekTopBits(peekBits)
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v2 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br1.advance(uint8(v0.entry))
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// val3 := br1.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v3 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
-
-	// br1.advance(uint8(v1.entry))
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CX, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// these two writes get coalesced
-	// buf[stream][off+2] = uint8(v2.entry >> 8)
-	// buf[stream][off+3] = uint8(v3.entry >> 8)
-	MOVW BX, 256+2(buffer)(off*1)
-
-	// update the bitrader reader structure
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
-	MOVQ br_value, bitReaderShifted_value(br1)
-	MOVQ br_offset, bitReaderShifted_off(br1)
-
-	// const stream = 2
-	// br2.fillFast()
-	MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
-	MOVQ    bitReaderShifted_value(br2), br_value
-	MOVQ    bitReaderShifted_off(br2), br_offset
-
-	// if b.bitsRead >= 32 {
-	CMPQ br_bits_read, $32
-	JB   skip_fill2
-
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
-	SUBQ $4, br_offset     // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	MOVQ bitReaderShifted_in(br2), AX
-	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVQ br_bits_read, CX
-	SHLQ CL, AX
-	ORQ  AX, br_value
-
-	// exhausted = exhausted || (br2.off < 4)
-	CMPQ  br_offset, $4
-	SETLT DL
-	ORB   DL, DH
-
-	// }
-skip_fill2:
-
-	// val0 := br2.peekTopBits(peekBits)
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v0 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br2.advance(uint8(v0.entry))
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// val1 := br2.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v1 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
-
-	// br2.advance(uint8(v1.entry))
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CX, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// these two writes get coalesced
-	// buf[stream][off] = uint8(v0.entry >> 8)
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
-	MOVW BX, 512(buffer)(off*1)
-
-	// SECOND PART:
-	// val2 := br2.peekTopBits(peekBits)
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v2 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br2.advance(uint8(v0.entry))
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// val3 := br2.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v3 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
-
-	// br2.advance(uint8(v1.entry))
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CX, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// these two writes get coalesced
-	// buf[stream][off+2] = uint8(v2.entry >> 8)
-	// buf[stream][off+3] = uint8(v3.entry >> 8)
-	MOVW BX, 512+2(buffer)(off*1)
-
-	// update the bitrader reader structure
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
-	MOVQ br_value, bitReaderShifted_value(br2)
-	MOVQ br_offset, bitReaderShifted_off(br2)
-
-	// const stream = 3
-	// br3.fillFast()
-	MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
-	MOVQ    bitReaderShifted_value(br3), br_value
-	MOVQ    bitReaderShifted_off(br3), br_offset
-
-	// if b.bitsRead >= 32 {
-	CMPQ br_bits_read, $32
-	JB   skip_fill3
-
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
-	SUBQ $4, br_offset     // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	MOVQ bitReaderShifted_in(br3), AX
-	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVQ br_bits_read, CX
-	SHLQ CL, AX
-	ORQ  AX, br_value
-
-	// exhausted = exhausted || (br3.off < 4)
-	CMPQ  br_offset, $4
-	SETLT DL
-	ORB   DL, DH
-
-	// }
-skip_fill3:
-
-	// val0 := br3.peekTopBits(peekBits)
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v0 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br3.advance(uint8(v0.entry))
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// val1 := br3.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v1 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
-
-	// br3.advance(uint8(v1.entry))
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CX, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// these two writes get coalesced
-	// buf[stream][off] = uint8(v0.entry >> 8)
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
-	MOVW BX, 768(buffer)(off*1)
-
-	// SECOND PART:
-	// val2 := br3.peekTopBits(peekBits)
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v2 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br3.advance(uint8(v0.entry))
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// val3 := br3.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-	// v3 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
-
-	// br3.advance(uint8(v1.entry))
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
-	MOVBQZX AL, CX
-	SHLQ    CX, br_value     // value <<= n
-	ADDQ    CX, br_bits_read // bits_read += n
-
-	// these two writes get coalesced
-	// buf[stream][off+2] = uint8(v2.entry >> 8)
-	// buf[stream][off+3] = uint8(v3.entry >> 8)
-	MOVW BX, 768+2(buffer)(off*1)
-
-	// update the bitrader reader structure
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
-	MOVQ br_value, bitReaderShifted_value(br3)
-	MOVQ br_offset, bitReaderShifted_off(br3)
-
-	ADDQ $4, off // off += 2
-
-	TESTB DH, DH // any br[i].ofs < 4?
-	JNZ   end
-
-	CMPQ off, $bufoff
-	JL   main_loop
-
-end:
-	MOVQ 0(SP), BP
-
-	MOVB off, ret+56(FP)
-	RET
-
-#undef off
-#undef buffer
-#undef table
-
-#undef br_bits_read
-#undef br_value
-#undef br_offset
-#undef peek_bits
-#undef exhausted
-
-#undef br0
-#undef br1
-#undef br2
-#undef br3

+ 0 - 197
vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in

@@ -1,197 +0,0 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-
-#define bufoff      256     // see decompress.go, we're using [4][256]byte table
-
-//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-//	peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
-#define off             R8
-#define buffer          DI
-#define table           SI
-
-#define br_bits_read    R9
-#define br_value        R10
-#define br_offset       R11
-#define peek_bits       R12
-#define exhausted       DX
-
-#define br0             R13
-#define br1             R14
-#define br2             R15
-#define br3             BP
-
-    MOVQ    BP, 0(SP)
-
-    XORQ    exhausted, exhausted    // exhausted = false
-    XORQ    off, off                // off = 0
-
-    MOVBQZX peekBits+32(FP), peek_bits
-    MOVQ    buf+40(FP), buffer
-    MOVQ    tbl+48(FP), table
-
-    MOVQ    pbr0+0(FP), br0
-    MOVQ    pbr1+8(FP), br1
-    MOVQ    pbr2+16(FP), br2
-    MOVQ    pbr3+24(FP), br3
-
-main_loop:
-{{ define "decode_2_values_x86" }}
-    // const stream = {{ var "id" }}
-    // br{{ var "id"}}.fillFast()
-    MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
-    MOVQ    bitReaderShifted_value(br{{ var "id" }}), br_value
-    MOVQ    bitReaderShifted_off(br{{ var "id" }}), br_offset
-
-	// if b.bitsRead >= 32 {
-    CMPQ    br_bits_read, $32
-    JB      skip_fill{{ var "id" }}
-
-    SUBQ    $32, br_bits_read       // b.bitsRead -= 32
-    SUBQ    $4, br_offset           // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-    MOVQ    bitReaderShifted_in(br{{ var "id" }}), AX
-    MOVL    0(br_offset)(AX*1), AX  // AX = uint32(b.in[b.off:b.off+4])
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-    MOVQ    br_bits_read, CX
-    SHLQ    CL, AX
-    ORQ     AX, br_value
-
-    // exhausted = exhausted || (br{{ var "id"}}.off < 4)
-    CMPQ    br_offset, $4
-    SETLT   DL
-    ORB     DL, DH
-    // }
-skip_fill{{ var "id" }}:
-
-    // val0 := br{{ var "id"}}.peekTopBits(peekBits)
-    MOVQ    br_value, AX
-    MOVQ    peek_bits, CX
-    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
-
-    // v0 := table[val0&mask]
-    MOVW    0(table)(AX*2), AX      // AX - v0
-
-    // br{{ var "id"}}.advance(uint8(v0.entry))
-    MOVB    AH, BL                  // BL = uint8(v0.entry >> 8)
-    MOVBQZX AL, CX
-    SHLQ    CL, br_value            // value <<= n
-    ADDQ    CX, br_bits_read        // bits_read += n
-
-    // val1 := br{{ var "id"}}.peekTopBits(peekBits)
-    MOVQ    peek_bits, CX
-    MOVQ    br_value, AX
-    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
-
-    // v1 := table[val1&mask]
-    MOVW    0(table)(AX*2), AX      // AX - v1
-
-    // br{{ var "id"}}.advance(uint8(v1.entry))
-    MOVB    AH, BH                  // BH = uint8(v1.entry >> 8)
-    MOVBQZX AL, CX
-    SHLQ    CX, br_value            // value <<= n
-    ADDQ    CX, br_bits_read        // bits_read += n
-
-
-    // these two writes get coalesced
-    // buf[stream][off] = uint8(v0.entry >> 8)
-    // buf[stream][off+1] = uint8(v1.entry >> 8)
-    MOVW    BX, {{ var "bufofs" }}(buffer)(off*1)
-
-    // SECOND PART:
-    // val2 := br{{ var "id"}}.peekTopBits(peekBits)
-    MOVQ    br_value, AX
-    MOVQ    peek_bits, CX
-    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
-
-    // v2 := table[val0&mask]
-    MOVW    0(table)(AX*2), AX      // AX - v0
-
-    // br{{ var "id"}}.advance(uint8(v0.entry))
-    MOVB    AH, BL                  // BL = uint8(v0.entry >> 8)
-    MOVBQZX AL, CX
-    SHLQ    CL, br_value            // value <<= n
-    ADDQ    CX, br_bits_read        // bits_read += n
-
-    // val3 := br{{ var "id"}}.peekTopBits(peekBits)
-    MOVQ    peek_bits, CX
-    MOVQ    br_value, AX
-    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
-
-    // v3 := table[val1&mask]
-    MOVW    0(table)(AX*2), AX      // AX - v1
-
-    // br{{ var "id"}}.advance(uint8(v1.entry))
-    MOVB    AH, BH                  // BH = uint8(v1.entry >> 8)
-    MOVBQZX AL, CX
-    SHLQ    CX, br_value            // value <<= n
-    ADDQ    CX, br_bits_read        // bits_read += n
-
-
-    // these two writes get coalesced
-    // buf[stream][off+2] = uint8(v2.entry >> 8)
-    // buf[stream][off+3] = uint8(v3.entry >> 8)
-    MOVW    BX, {{ var "bufofs" }}+2(buffer)(off*1)
-
-    // update the bitrader reader structure
-    MOVB    br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
-    MOVQ    br_value, bitReaderShifted_value(br{{ var "id" }})
-    MOVQ    br_offset, bitReaderShifted_off(br{{ var "id" }})
-{{ end }}
-
-    {{ set "id" "0" }}
-    {{ set "ofs" "0" }}
-    {{ set "bufofs" "0" }} {{/* id * bufoff */}}
-    {{ template "decode_2_values_x86" . }}
-
-    {{ set "id" "1" }}
-    {{ set "ofs" "8" }}
-    {{ set "bufofs" "256" }}
-    {{ template "decode_2_values_x86" . }}
-
-    {{ set "id" "2" }}
-    {{ set "ofs" "16" }}
-    {{ set "bufofs" "512" }}
-    {{ template "decode_2_values_x86" . }}
-
-    {{ set "id" "3" }}
-    {{ set "ofs" "24" }}
-    {{ set "bufofs" "768" }}
-    {{ template "decode_2_values_x86" . }}
-
-    ADDQ    $4, off     // off += 2
-
-    TESTB   DH, DH      // any br[i].ofs < 4?
-    JNZ     end
-
-    CMPQ    off, $bufoff
-    JL      main_loop
-end:
-    MOVQ    0(SP), BP
-
-    MOVB    off, ret+56(FP)
-    RET
-#undef  off
-#undef  buffer
-#undef  table
-
-#undef  br_bits_read
-#undef  br_value
-#undef  br_offset
-#undef  peek_bits
-#undef  exhausted
-
-#undef  br0
-#undef  br1
-#undef  br2
-#undef  br3
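Both hand-written 8-bit assembly files and their .s.in text templates are deleted; the replacement decompress_amd64.s below is machine-generated (its header reads "Code generated by command: go run gen.go ... DO NOT EDIT"), which klauspost/compress produces with the avo generator. For orientation, the canonical minimal avo program looks like this; it is avo's standard example, not the actual gen.go:

```go
//go:build ignore

// Minimal avo generator: `go run gen.go -out add_amd64.s` emits Go assembly
// plus a stub declaration for the function defined below.
package main

import . "github.com/mmcloughlin/avo/build"

func main() {
	TEXT("Add", NOSPLIT, "func(x, y uint64) uint64")
	Doc("Add returns x+y, implemented in generated assembly.")
	x := Load(Param("x"), GP64())
	y := Load(Param("y"), GP64())
	ADDQ(x, y)               // y += x
	Store(y, ReturnIndex(0)) // return y
	RET()
	Generate()
}
```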

+ 108 - 67
vendor/github.com/klauspost/compress/huff0/decompress_amd64.go

@@ -2,30 +2,40 @@
 // +build amd64,!appengine,!noasm,gc
 
 // This file contains the specialisation of Decoder.Decompress4X
-// that uses an asm implementation of its main loop.
+// and Decoder.Decompress1X that use an asm implementation of thir main loops.
 package huff0
 
 import (
 	"errors"
 	"fmt"
+
+	"github.com/klauspost/compress/internal/cpuinfo"
 )
 
 // decompress4x_main_loop_x86 is an x86 assembler implementation
 // of Decompress4X when tablelog > 8.
-// go:noescape
-func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+//go:noescape
+func decompress4x_main_loop_amd64(ctx *decompress4xContext)
 
 // decompress4x_8b_loop_x86 is an x86 assembler implementation
 // of Decompress4X when tablelog <= 8 which decodes 4 entries
 // per loop.
-// go:noescape
-func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+//go:noescape
+func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
 
 // fallback8BitSize is the size where using Go version is faster.
 const fallback8BitSize = 800
 
+type decompress4xContext struct {
+	pbr      *[4]bitReaderShifted
+	peekBits uint8
+	out      *byte
+	dstEvery int
+	tbl      *dEntrySingle
+	decoded  int
+	limit    *byte
+}
+
 // Decompress4X will decompress a 4X encoded stream.
 // The length of the supplied input must match the end of a block exactly.
 // The *capacity* of the dst slice must match the destination size of
@@ -42,6 +52,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 	if cap(dst) < fallback8BitSize && use8BitTables {
 		return d.decompress4X8bit(dst, src)
 	}
+
 	var br [4]bitReaderShifted
 	// Decode "jump table"
 	start := 6
@@ -71,70 +82,25 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 	const tlMask = tlSize - 1
 	single := d.dt.single[:tlSize]
 
-	// Use temp table to avoid bound checks/append penalty.
-	buf := d.buffer()
-	var off uint8
 	var decoded int
 
-	const debug = false
-
-	// see: bitReaderShifted.peekBitsFast()
-	peekBits := uint8((64 - d.actualTableLog) & 63)
-
-	// Decode 2 values from each decoder/loop.
-	const bufoff = 256
-	for {
-		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
-			break
+	if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
+		ctx := decompress4xContext{
+			pbr:      &br,
+			peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
+			out:      &out[0],
+			dstEvery: dstEvery,
+			tbl:      &single[0],
+			limit:    &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last.
 		}
-
 		if use8BitTables {
-			off = decompress4x_8b_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
+			decompress4x_8b_main_loop_amd64(&ctx)
 		} else {
-			off = decompress4x_main_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
-		}
-		if debug {
-			fmt.Print("DEBUG: ")
-			fmt.Printf("off=%d,", off)
-			for i := 0; i < 4; i++ {
-				fmt.Printf(" br[%d]={bitsRead=%d, value=%x, off=%d}",
-					i, br[i].bitsRead, br[i].value, br[i].off)
-			}
-			fmt.Println("")
-		}
-
-		if off != 0 {
-			break
+			decompress4x_main_loop_amd64(&ctx)
 		}
 
-		if bufoff > dstEvery {
-			d.bufs.Put(buf)
-			return nil, errors.New("corruption detected: stream overrun 1")
-		}
-		copy(out, buf[0][:])
-		copy(out[dstEvery:], buf[1][:])
-		copy(out[dstEvery*2:], buf[2][:])
-		copy(out[dstEvery*3:], buf[3][:])
-		out = out[bufoff:]
-		decoded += bufoff * 4
-		// There must at least be 3 buffers left.
-		if len(out) < dstEvery*3 {
-			d.bufs.Put(buf)
-			return nil, errors.New("corruption detected: stream overrun 2")
-		}
-	}
-	if off > 0 {
-		ioff := int(off)
-		if len(out) < dstEvery*3+ioff {
-			d.bufs.Put(buf)
-			return nil, errors.New("corruption detected: stream overrun 3")
-		}
-		copy(out, buf[0][:off])
-		copy(out[dstEvery:], buf[1][:off])
-		copy(out[dstEvery*2:], buf[2][:off])
-		copy(out[dstEvery*3:], buf[3][:off])
-		decoded += int(off) * 4
-		out = out[off:]
+		decoded = ctx.decoded
+		out = out[decoded/4:]
 	}
 
 	// Decode remaining.
@@ -150,7 +116,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 		for bitsLeft > 0 {
 			br.fill()
 			if offset >= endsAt {
-				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 4")
 			}
 
@@ -164,7 +129,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 			offset++
 		}
 		if offset != endsAt {
-			d.bufs.Put(buf)
 			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
 		}
 		decoded += offset - dstEvery*i
@@ -173,9 +137,86 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 			return nil, err
 		}
 	}
-	d.bufs.Put(buf)
 	if dstSize != decoded {
 		return nil, errors.New("corruption detected: short output block")
 	}
 	return dst, nil
 }
+
+// decompress4x_main_loop_x86 is an x86 assembler implementation
+// of Decompress1X when tablelog > 8.
+//go:noescape
+func decompress1x_main_loop_amd64(ctx *decompress1xContext)
+
+// decompress4x_main_loop_x86 is an x86 with BMI2 assembler implementation
+// of Decompress1X when tablelog > 8.
+//go:noescape
+func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
+
+type decompress1xContext struct {
+	pbr      *bitReaderShifted
+	peekBits uint8
+	out      *byte
+	outCap   int
+	tbl      *dEntrySingle
+	decoded  int
+}
+
+// Error reported by asm implementations
+const error_max_decoded_size_exeeded = -1
+
+// Decompress1X will decompress a 1X encoded stream.
+// The cap of the output buffer will be the maximum decompressed size.
+// The length of the supplied input must match the end of a block exactly.
+func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
+	if len(d.dt.single) == 0 {
+		return nil, errors.New("no table loaded")
+	}
+	var br bitReaderShifted
+	err := br.init(src)
+	if err != nil {
+		return dst, err
+	}
+	maxDecodedSize := cap(dst)
+	dst = dst[:maxDecodedSize]
+
+	const tlSize = 1 << tableLogMax
+	const tlMask = tlSize - 1
+
+	if maxDecodedSize >= 4 {
+		ctx := decompress1xContext{
+			pbr:      &br,
+			out:      &dst[0],
+			outCap:   maxDecodedSize,
+			peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
+			tbl:      &d.dt.single[0],
+		}
+
+		if cpuinfo.HasBMI2() {
+			decompress1x_main_loop_bmi2(&ctx)
+		} else {
+			decompress1x_main_loop_amd64(&ctx)
+		}
+		if ctx.decoded == error_max_decoded_size_exeeded {
+			return nil, ErrMaxDecodedSizeExceeded
+		}
+
+		dst = dst[:ctx.decoded]
+	}
+
+	// br < 8, so uint8 is fine
+	bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
+	for bitsLeft > 0 {
+		br.fill()
+		if len(dst) >= maxDecodedSize {
+			br.close()
+			return nil, ErrMaxDecodedSizeExceeded
+		}
+		v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
+		nBits := uint8(v.entry)
+		br.advance(nBits)
+		bitsLeft -= nBits
+		dst = append(dst, uint8(v.entry>>8))
+	}
+	return dst, br.close()
+}
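The new Decompress1X dispatches at run time between a baseline amd64 loop and a BMI2 variant via the vendored internal/cpuinfo package. Outside this repository the same pattern is usually written against golang.org/x/sys/cpu; a hedged sketch of the dispatch (stand-in function names, not the vendored ones):

```go
package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

func decodeGeneric(dst, src []byte) int { /* baseline shift/mask loop */ return 0 }
func decodeBMI2(dst, src []byte) int    { /* SHLX/SHRX variant */ return 0 }

// decode selects the implementation per call; real code often caches the
// choice in a function variable at init time instead.
func decode(dst, src []byte) int {
	if cpu.X86.HasBMI2 {
		return decodeBMI2(dst, src)
	}
	return decodeGeneric(dst, src)
}

func main() {
	fmt.Println("BMI2:", cpu.X86.HasBMI2, decode(nil, nil))
}
```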

+ 747 - 406
vendor/github.com/klauspost/compress/huff0/decompress_amd64.s

@@ -1,506 +1,847 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-#ifdef GOAMD64_v4
-#ifndef GOAMD64_v3
-#define GOAMD64_v3
-#endif
-#endif
-
-#define bufoff      256 // see decompress.go, we're using [4][256]byte table
-
-// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-//	peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
-#define off             R8
-#define buffer          DI
-#define table           SI
-
-#define br_bits_read    R9
-#define br_value        R10
-#define br_offset       R11
-#define peek_bits       R12
-#define exhausted       DX
-
-#define br0             R13
-#define br1             R14
-#define br2             R15
-#define br3             BP
-
-	MOVQ BP, 0(SP)
-
-	XORQ exhausted, exhausted // exhausted = false
-	XORQ off, off             // off = 0
-
-	MOVBQZX peekBits+32(FP), peek_bits
-	MOVQ    buf+40(FP), buffer
-	MOVQ    tbl+48(FP), table
-
-	MOVQ pbr0+0(FP), br0
-	MOVQ pbr1+8(FP), br1
-	MOVQ pbr2+16(FP), br2
-	MOVQ pbr3+24(FP), br3
+// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
 
-main_loop:
-
-	// const stream = 0
-	// br0.fillFast()
-	MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
-	MOVQ    bitReaderShifted_value(br0), br_value
-	MOVQ    bitReaderShifted_off(br0), br_offset
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
 
-	// We must have at least 2 * max tablelog left
-	CMPQ br_bits_read, $64-22
-	JBE  skip_fill0
+// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
+TEXT ·decompress4x_main_loop_amd64(SB), $0-8
+	XORQ DX, DX
 
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
-	SUBQ $4, br_offset     // b.off -= 4
+	// Preload values
+	MOVQ    ctx+0(FP), AX
+	MOVBQZX 8(AX), DI
+	MOVQ    16(AX), SI
+	MOVQ    48(AX), BX
+	MOVQ    24(AX), R9
+	MOVQ    32(AX), R10
+	MOVQ    (AX), R11
 
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	MOVQ bitReaderShifted_in(br0), AX
+	// Main loop
+main_loop:
+	MOVQ  SI, R8
+	CMPQ  R8, BX
+	SETGE DL
+
+	// br0.fillFast32()
+	MOVQ    32(R11), R12
+	MOVBQZX 40(R11), R13
+	CMPQ    R13, $0x20
+	JBE     skip_fill0
+	MOVQ    24(R11), AX
+	SUBQ    $0x20, R13
+	SUBQ    $0x04, AX
+	MOVQ    (R11), R14
 
 	// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
-	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
-	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-	MOVQ br_bits_read, CX
-	SHLQ CL, AX
-
-#endif
-
-	ORQ AX, br_value
+	MOVL (AX)(R14*1), R14
+	MOVQ R13, CX
+	SHLQ CL, R14
+	MOVQ AX, 24(R11)
+	ORQ  R14, R12
 
 	// exhausted = exhausted || (br0.off < 4)
-	CMPQ  br_offset, $4
-	SETLT DL
-	ORB   DL, DH
+	CMPQ  AX, $0x04
+	SETLT AL
+	ORB   AL, DL
 
-	// }
 skip_fill0:
-
 	// val0 := br0.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-#endif
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
 
 	// v0 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br0.advance(uint8(v0.entry))
-	MOVB AH, BL // BL = uint8(v0.entry >> 8)
-
-#ifdef GOAMD64_v3
-	MOVBQZX AL, CX
-	SHLXQ   AX, br_value, br_value // value <<= n
-
-#else
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value // value <<= n
-
-#endif
-
-	ADDQ CX, br_bits_read // bits_read += n
+	MOVW (R10)(R14*2), CX
 
-#ifdef GOAMD64_v3
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+	// br0.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R12
+	ADDB CL, R13
 
-#else
 	// val1 := br0.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-#endif
+	MOVQ DI, CX
+	MOVQ R12, R14
+	SHRQ CL, R14
 
 	// v1 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
+	MOVW (R10)(R14*2), CX
 
 	// br0.advance(uint8(v1.entry))
-	MOVB AH, BH // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
-	MOVBQZX AL, CX
-	SHLXQ   AX, br_value, br_value // value <<= n
-
-#else
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value // value <<= n
-
-#endif
-
-	ADDQ CX, br_bits_read // bits_read += n
+	MOVB CH, AH
+	SHLQ CL, R12
+	ADDB CL, R13
 
 	// these two writes get coalesced
-	// buf[stream][off] = uint8(v0.entry >> 8)
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
-	MOVW BX, 0(buffer)(off*1)
-
-	// update the bitrader reader structure
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
-	MOVQ br_value, bitReaderShifted_value(br0)
-	MOVQ br_offset, bitReaderShifted_off(br0)
-
-	// const stream = 1
-	// br1.fillFast()
-	MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
-	MOVQ    bitReaderShifted_value(br1), br_value
-	MOVQ    bitReaderShifted_off(br1), br_offset
-
-	// We must have at least 2 * max tablelog left
-	CMPQ br_bits_read, $64-22
-	JBE  skip_fill1
-
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
-	SUBQ $4, br_offset     // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	MOVQ bitReaderShifted_in(br1), AX
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	MOVW AX, (R8)
+
+	// update the bitreader structure
+	MOVQ R12, 32(R11)
+	MOVB R13, 40(R11)
+	ADDQ R9, R8
+
+	// br1.fillFast32()
+	MOVQ    80(R11), R12
+	MOVBQZX 88(R11), R13
+	CMPQ    R13, $0x20
+	JBE     skip_fill1
+	MOVQ    72(R11), AX
+	SUBQ    $0x20, R13
+	SUBQ    $0x04, AX
+	MOVQ    48(R11), R14
 
 	// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
-	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
-	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-	MOVQ br_bits_read, CX
-	SHLQ CL, AX
-
-#endif
-
-	ORQ AX, br_value
+	MOVL (AX)(R14*1), R14
+	MOVQ R13, CX
+	SHLQ CL, R14
+	MOVQ AX, 72(R11)
+	ORQ  R14, R12
 
 	// exhausted = exhausted || (br1.off < 4)
-	CMPQ  br_offset, $4
-	SETLT DL
-	ORB   DL, DH
+	CMPQ  AX, $0x04
+	SETLT AL
+	ORB   AL, DL
 
-	// }
 skip_fill1:
-
 	// val0 := br1.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-#endif
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
 
 	// v0 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br1.advance(uint8(v0.entry))
-	MOVB AH, BL // BL = uint8(v0.entry >> 8)
-
-#ifdef GOAMD64_v3
-	MOVBQZX AL, CX
-	SHLXQ   AX, br_value, br_value // value <<= n
-
-#else
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value // value <<= n
-
-#endif
-
-	ADDQ CX, br_bits_read // bits_read += n
+	MOVW (R10)(R14*2), CX
 
-#ifdef GOAMD64_v3
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+	// br1.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R12
+	ADDB CL, R13
 
-#else
 	// val1 := br1.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-#endif
+	MOVQ DI, CX
+	MOVQ R12, R14
+	SHRQ CL, R14
 
 	// v1 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
+	MOVW (R10)(R14*2), CX
 
 	// br1.advance(uint8(v1.entry))
-	MOVB AH, BH // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
-	MOVBQZX AL, CX
-	SHLXQ   AX, br_value, br_value // value <<= n
-
-#else
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value // value <<= n
-
-#endif
-
-	ADDQ CX, br_bits_read // bits_read += n
+	MOVB CH, AH
+	SHLQ CL, R12
+	ADDB CL, R13
 
 	// these two writes get coalesced
-	// buf[stream][off] = uint8(v0.entry >> 8)
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
-	MOVW BX, 256(buffer)(off*1)
-
-	// update the bitrader reader structure
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
-	MOVQ br_value, bitReaderShifted_value(br1)
-	MOVQ br_offset, bitReaderShifted_off(br1)
-
-	// const stream = 2
-	// br2.fillFast()
-	MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
-	MOVQ    bitReaderShifted_value(br2), br_value
-	MOVQ    bitReaderShifted_off(br2), br_offset
-
-	// We must have at least 2 * max tablelog left
-	CMPQ br_bits_read, $64-22
-	JBE  skip_fill2
-
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
-	SUBQ $4, br_offset     // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	MOVQ bitReaderShifted_in(br2), AX
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	MOVW AX, (R8)
+
+	// update the bitreader structure
+	MOVQ R12, 80(R11)
+	MOVB R13, 88(R11)
+	ADDQ R9, R8
+
+	// br2.fillFast32()
+	MOVQ    128(R11), R12
+	MOVBQZX 136(R11), R13
+	CMPQ    R13, $0x20
+	JBE     skip_fill2
+	MOVQ    120(R11), AX
+	SUBQ    $0x20, R13
+	SUBQ    $0x04, AX
+	MOVQ    96(R11), R14
 
 	// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
-	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-
-#else
-	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-	MOVQ br_bits_read, CX
-	SHLQ CL, AX
-
-#endif
-
-	ORQ AX, br_value
+	MOVL (AX)(R14*1), R14
+	MOVQ R13, CX
+	SHLQ CL, R14
+	MOVQ AX, 120(R11)
+	ORQ  R14, R12
 
 	// exhausted = exhausted || (br2.off < 4)
-	CMPQ  br_offset, $4
-	SETLT DL
-	ORB   DL, DH
+	CMPQ  AX, $0x04
+	SETLT AL
+	ORB   AL, DL
 
-	// }
 skip_fill2:
-
 	// val0 := br2.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-#endif
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
 
 	// v0 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
-
-	// br2.advance(uint8(v0.entry))
-	MOVB AH, BL // BL = uint8(v0.entry >> 8)
-
-#ifdef GOAMD64_v3
-	MOVBQZX AL, CX
-	SHLXQ   AX, br_value, br_value // value <<= n
-
-#else
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value // value <<= n
-
-#endif
+	MOVW (R10)(R14*2), CX
 
-	ADDQ CX, br_bits_read // bits_read += n
+	// br2.advance(uint8(v0.entry))
+	MOVB CH, AL
+	SHLQ CL, R12
+	ADDB CL, R13
 
-#ifdef GOAMD64_v3
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-
-#else
 	// val1 := br2.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
-
-#endif
+	MOVQ DI, CX
+	MOVQ R12, R14
+	SHRQ CL, R14
 
 	// v1 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
+	MOVW (R10)(R14*2), CX
 
 	// br2.advance(uint8(v1.entry))
-	MOVB AH, BH // BH = uint8(v1.entry >> 8)
+	MOVB CH, AH
+	SHLQ CL, R12
+	ADDB CL, R13
 
-#ifdef GOAMD64_v3
-	MOVBQZX AL, CX
-	SHLXQ   AX, br_value, br_value // value <<= n
-
-#else
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value // value <<= n
+	// these two writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	MOVW AX, (R8)
+
+	// update the bitreader structure
+	MOVQ R12, 128(R11)
+	MOVB R13, 136(R11)
+	ADDQ R9, R8
+
+	// br3.fillFast32()
+	MOVQ    176(R11), R12
+	MOVBQZX 184(R11), R13
+	CMPQ    R13, $0x20
+	JBE     skip_fill3
+	MOVQ    168(R11), AX
+	SUBQ    $0x20, R13
+	SUBQ    $0x04, AX
+	MOVQ    144(R11), R14
 
-#endif
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (AX)(R14*1), R14
+	MOVQ R13, CX
+	SHLQ CL, R14
+	MOVQ AX, 168(R11)
+	ORQ  R14, R12
 
-	ADDQ CX, br_bits_read // bits_read += n
+	// exhausted = exhausted || (br3.off < 4)
+	CMPQ  AX, $0x04
+	SETLT AL
+	ORB   AL, DL
 
-	// these two writes get coalesced
-	// buf[stream][off] = uint8(v0.entry >> 8)
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
-	MOVW BX, 512(buffer)(off*1)
+skip_fill3:
+	// val0 := br3.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
 
-	// update the bitrader reader structure
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
-	MOVQ br_value, bitReaderShifted_value(br2)
-	MOVQ br_offset, bitReaderShifted_off(br2)
+	// v0 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
 
-	// const stream = 3
-	// br3.fillFast()
-	MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
-	MOVQ    bitReaderShifted_value(br3), br_value
-	MOVQ    bitReaderShifted_off(br3), br_offset
+	// br3.advance(uint8(v0.entry))
+	MOVB CH, AL
+	SHLQ CL, R12
+	ADDB CL, R13
 
-	// We must have at least 2 * max tablelog left
-	CMPQ br_bits_read, $64-22
-	JBE  skip_fill3
+	// val1 := br3.peekTopBits(peekBits)
+	MOVQ DI, CX
+	MOVQ R12, R14
+	SHRQ CL, R14
 
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
-	SUBQ $4, br_offset     // b.off -= 4
+	// v1 := table[val1&mask]
+	MOVW (R10)(R14*2), CX
 
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	MOVQ bitReaderShifted_in(br3), AX
+	// br3.advance(uint8(v1.entry))
+	MOVB CH, AH
+	SHLQ CL, R12
+	ADDB CL, R13
 
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
-	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
+	// these two writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	MOVW AX, (R8)
+
+	// update the bitreader structure
+	MOVQ  R12, 176(R11)
+	MOVB  R13, 184(R11)
+	ADDQ  $0x02, SI
+	TESTB DL, DL
+	JZ    main_loop
+	MOVQ  ctx+0(FP), AX
+	SUBQ  16(AX), SI
+	SHLQ  $0x02, SI
+	MOVQ  SI, 40(AX)
+	RET
 
-#else
-	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
-	MOVQ br_bits_read, CX
-	SHLQ CL, AX
+// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
+TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
+	XORQ DX, DX
 
-#endif
+	// Preload values
+	MOVQ    ctx+0(FP), CX
+	MOVBQZX 8(CX), DI
+	MOVQ    16(CX), BX
+	MOVQ    48(CX), SI
+	MOVQ    24(CX), R9
+	MOVQ    32(CX), R10
+	MOVQ    (CX), R11
 
-	ORQ AX, br_value
+	// Main loop
+main_loop:
+	MOVQ  BX, R8
+	CMPQ  R8, SI
+	SETGE DL
+
+	// br0.fillFast32()
+	MOVQ    32(R11), R12
+	MOVBQZX 40(R11), R13
+	CMPQ    R13, $0x20
+	JBE     skip_fill0
+	MOVQ    24(R11), R14
+	SUBQ    $0x20, R13
+	SUBQ    $0x04, R14
+	MOVQ    (R11), R15
 
-	// exhausted = exhausted || (br3.off < 4)
-	CMPQ  br_offset, $4
-	SETLT DL
-	ORB   DL, DH
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (R14)(R15*1), R15
+	MOVQ R13, CX
+	SHLQ CL, R15
+	MOVQ R14, 24(R11)
+	ORQ  R15, R12
 
-	// }
-skip_fill3:
+	// exhausted = exhausted || (br0.off < 4)
+	CMPQ  R14, $0x04
+	SETLT AL
+	ORB   AL, DL
 
-	// val0 := br3.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+skip_fill0:
+	// val0 := br0.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
 
-#else
-	MOVQ br_value, AX
-	MOVQ peek_bits, CX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+	// v0 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
 
-#endif
+	// br0.advance(uint8(v0.entry))
+	MOVB CH, AL
+	SHLQ CL, R12
+	ADDB CL, R13
 
-	// v0 := table[val0&mask]
-	MOVW 0(table)(AX*2), AX // AX - v0
+	// val1 := br0.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v1 := table[val1&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br0.advance(uint8(v1.entry))
+	MOVB   CH, AH
+	SHLQ   CL, R12
+	ADDB   CL, R13
+	BSWAPL AX
+
+	// val2 := br0.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v2 := table[val2&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br0.advance(uint8(v2.entry))
+	MOVB CH, AH
+	SHLQ CL, R12
+	ADDB CL, R13
+
+	// val3 := br0.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v3 := table[val3&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br0.advance(uint8(v3.entry))
+	MOVB   CH, AL
+	SHLQ   CL, R12
+	ADDB   CL, R13
+	BSWAPL AX
+
+	// these four writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
+	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
+	MOVL AX, (R8)
+
+	// update the bitreader structure
+	MOVQ R12, 32(R11)
+	MOVB R13, 40(R11)
+	ADDQ R9, R8
+
+	// br1.fillFast32()
+	MOVQ    80(R11), R12
+	MOVBQZX 88(R11), R13
+	CMPQ    R13, $0x20
+	JBE     skip_fill1
+	MOVQ    72(R11), R14
+	SUBQ    $0x20, R13
+	SUBQ    $0x04, R14
+	MOVQ    48(R11), R15
 
-	// br3.advance(uint8(v0.entry))
-	MOVB AH, BL // BL = uint8(v0.entry >> 8)
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (R14)(R15*1), R15
+	MOVQ R13, CX
+	SHLQ CL, R15
+	MOVQ R14, 72(R11)
+	ORQ  R15, R12
 
-#ifdef GOAMD64_v3
-	MOVBQZX AL, CX
-	SHLXQ   AX, br_value, br_value // value <<= n
+	// exhausted = exhausted || (br1.off < 4)
+	CMPQ  R14, $0x04
+	SETLT AL
+	ORB   AL, DL
 
-#else
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value // value <<= n
+skip_fill1:
+	// val0 := br1.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
 
-#endif
+	// v0 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
 
-	ADDQ CX, br_bits_read // bits_read += n
+	// br1.advance(uint8(v0.entry))
+	MOVB CH, AL
+	SHLQ CL, R12
+	ADDB CL, R13
 
-#ifdef GOAMD64_v3
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+	// val1 := br1.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v1 := table[val1&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br1.advance(uint8(v1.entry))
+	MOVB   CH, AH
+	SHLQ   CL, R12
+	ADDB   CL, R13
+	BSWAPL AX
+
+	// val2 := br1.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v2 := table[val2&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br1.advance(uint8(v2.entry))
+	MOVB CH, AH
+	SHLQ CL, R12
+	ADDB CL, R13
+
+	// val3 := br1.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v3 := table[val3&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br1.advance(uint8(v3.entry))
+	MOVB   CH, AL
+	SHLQ   CL, R12
+	ADDB   CL, R13
+	BSWAPL AX
+
+	// these four writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
+	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
+	MOVL AX, (R8)
+
+	// update the bitreader structure
+	MOVQ R12, 80(R11)
+	MOVB R13, 88(R11)
+	ADDQ R9, R8
+
+	// br2.fillFast32()
+	MOVQ    128(R11), R12
+	MOVBQZX 136(R11), R13
+	CMPQ    R13, $0x20
+	JBE     skip_fill2
+	MOVQ    120(R11), R14
+	SUBQ    $0x20, R13
+	SUBQ    $0x04, R14
+	MOVQ    96(R11), R15
 
-#else
-	// val1 := br3.peekTopBits(peekBits)
-	MOVQ peek_bits, CX
-	MOVQ br_value, AX
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (R14)(R15*1), R15
+	MOVQ R13, CX
+	SHLQ CL, R15
+	MOVQ R14, 120(R11)
+	ORQ  R15, R12
 
-#endif
+	// exhausted = exhausted || (br2.off < 4)
+	CMPQ  R14, $0x04
+	SETLT AL
+	ORB   AL, DL
 
-	// v1 := table[val1&mask]
-	MOVW 0(table)(AX*2), AX // AX - v1
+skip_fill2:
+	// val0 := br2.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
 
-	// br3.advance(uint8(v1.entry))
-	MOVB AH, BH // BH = uint8(v1.entry >> 8)
+	// v0 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
 
-#ifdef GOAMD64_v3
-	MOVBQZX AL, CX
-	SHLXQ   AX, br_value, br_value // value <<= n
+	// br2.advance(uint8(v0.entry))
+	MOVB CH, AL
+	SHLQ CL, R12
+	ADDB CL, R13
 
-#else
-	MOVBQZX AL, CX
-	SHLQ    CL, br_value // value <<= n
+	// val1 := br2.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v1 := table[val1&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br2.advance(uint8(v1.entry))
+	MOVB   CH, AH
+	SHLQ   CL, R12
+	ADDB   CL, R13
+	BSWAPL AX
+
+	// val2 := br2.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v2 := table[val2&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br2.advance(uint8(v2.entry))
+	MOVB CH, AH
+	SHLQ CL, R12
+	ADDB CL, R13
+
+	// val3 := br2.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v3 := table[val3&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br2.advance(uint8(v3.entry))
+	MOVB   CH, AL
+	SHLQ   CL, R12
+	ADDB   CL, R13
+	BSWAPL AX
+
+	// these four writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
+	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
+	MOVL AX, (R8)
+
+	// update the bitreader structure
+	MOVQ R12, 128(R11)
+	MOVB R13, 136(R11)
+	ADDQ R9, R8
+
+	// br3.fillFast32()
+	MOVQ    176(R11), R12
+	MOVBQZX 184(R11), R13
+	CMPQ    R13, $0x20
+	JBE     skip_fill3
+	MOVQ    168(R11), R14
+	SUBQ    $0x20, R13
+	SUBQ    $0x04, R14
+	MOVQ    144(R11), R15
 
-#endif
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (R14)(R15*1), R15
+	MOVQ R13, CX
+	SHLQ CL, R15
+	MOVQ R14, 168(R11)
+	ORQ  R15, R12
 
-	ADDQ CX, br_bits_read // bits_read += n
+	// exhausted = exhausted || (br3.off < 4)
+	CMPQ  R14, $0x04
+	SETLT AL
+	ORB   AL, DL
 
-	// these two writes get coalesced
-	// buf[stream][off] = uint8(v0.entry >> 8)
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
-	MOVW BX, 768(buffer)(off*1)
+skip_fill3:
+	// val0 := br3.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
 
-	// update the bitrader reader structure
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
-	MOVQ br_value, bitReaderShifted_value(br3)
-	MOVQ br_offset, bitReaderShifted_off(br3)
+	// v0 := table[val0&mask]
+	MOVW (R10)(R14*2), CX
 
-	ADDQ $2, off // off += 2
+	// br3.advance(uint8(v0.entry))
+	MOVB CH, AL
+	SHLQ CL, R12
+	ADDB CL, R13
 
-	TESTB DH, DH // any br[i].ofs < 4?
-	JNZ   end
+	// val1 := br3.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v1 := table[val1&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br3.advance(uint8(v1.entry))
+	MOVB   CH, AH
+	SHLQ   CL, R12
+	ADDB   CL, R13
+	BSWAPL AX
+
+	// val2 := br3.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v2 := table[val2&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br3.advance(uint8(v2.entry))
+	MOVB CH, AH
+	SHLQ CL, R12
+	ADDB CL, R13
+
+	// val3 := br3.peekTopBits(peekBits)
+	MOVQ R12, R14
+	MOVQ DI, CX
+	SHRQ CL, R14
+
+	// v3 := table[val3&mask]
+	MOVW (R10)(R14*2), CX
+
+	// br3.advance(uint8(v3.entry))
+	MOVB   CH, AL
+	SHLQ   CL, R12
+	ADDB   CL, R13
+	BSWAPL AX
+
+	// these four writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
+	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
+	MOVL AX, (R8)
+
+	// update the bitreader structure
+	MOVQ  R12, 176(R11)
+	MOVB  R13, 184(R11)
+	ADDQ  $0x04, BX
+	TESTB DL, DL
+	JZ    main_loop
+	MOVQ  ctx+0(FP), AX
+	SUBQ  16(AX), BX
+	SHLQ  $0x02, BX
+	MOVQ  BX, 40(AX)
+	RET
 
-	CMPQ off, $bufoff
-	JL   main_loop
+// func decompress1x_main_loop_amd64(ctx *decompress1xContext)
+TEXT ·decompress1x_main_loop_amd64(SB), $0-8
+	MOVQ    ctx+0(FP), CX
+	MOVQ    16(CX), DX
+	MOVQ    24(CX), BX
+	CMPQ    BX, $0x04
+	JB      error_max_decoded_size_exceeded
+	LEAQ    (DX)(BX*1), BX
+	MOVQ    (CX), SI
+	MOVQ    (SI), R8
+	MOVQ    24(SI), R9
+	MOVQ    32(SI), R10
+	MOVBQZX 40(SI), R11
+	MOVQ    32(CX), SI
+	MOVBQZX 8(CX), DI
+	JMP     loop_condition
 
-end:
-	MOVQ 0(SP), BP
+main_loop:
+	// Check if we have room for 4 bytes in the output buffer
+	LEAQ 4(DX), CX
+	CMPQ CX, BX
+	JGE  error_max_decoded_size_exeeded
+
+	// Decode 4 values
+	CMPQ R11, $0x20
+	JL   bitReader_fillFast_1_end
+	SUBQ $0x20, R11
+	SUBQ $0x04, R9
+	MOVL (R8)(R9*1), R12
+	MOVQ R11, CX
+	SHLQ CL, R12
+	ORQ  R12, R10
+
+bitReader_fillFast_1_end:
+	MOVQ    DI, CX
+	MOVQ    R10, R12
+	SHRQ    CL, R12
+	MOVW    (SI)(R12*2), CX
+	MOVB    CH, AL
+	MOVBQZX CL, CX
+	ADDQ    CX, R11
+	SHLQ    CL, R10
+	MOVQ    DI, CX
+	MOVQ    R10, R12
+	SHRQ    CL, R12
+	MOVW    (SI)(R12*2), CX
+	MOVB    CH, AH
+	MOVBQZX CL, CX
+	ADDQ    CX, R11
+	SHLQ    CL, R10
+	BSWAPL  AX
+	CMPQ    R11, $0x20
+	JL      bitReader_fillFast_2_end
+	SUBQ    $0x20, R11
+	SUBQ    $0x04, R9
+	MOVL    (R8)(R9*1), R12
+	MOVQ    R11, CX
+	SHLQ    CL, R12
+	ORQ     R12, R10
+
+bitReader_fillFast_2_end:
+	MOVQ    DI, CX
+	MOVQ    R10, R12
+	SHRQ    CL, R12
+	MOVW    (SI)(R12*2), CX
+	MOVB    CH, AH
+	MOVBQZX CL, CX
+	ADDQ    CX, R11
+	SHLQ    CL, R10
+	MOVQ    DI, CX
+	MOVQ    R10, R12
+	SHRQ    CL, R12
+	MOVW    (SI)(R12*2), CX
+	MOVB    CH, AL
+	MOVBQZX CL, CX
+	ADDQ    CX, R11
+	SHLQ    CL, R10
+	BSWAPL  AX
+
+	// Store the decoded values
+	MOVL AX, (DX)
+	ADDQ $0x04, DX
+
+loop_condition:
+	CMPQ R9, $0x08
+	JGE  main_loop
+
+	// Update ctx structure
+	MOVQ ctx+0(FP), AX
+	SUBQ 16(AX), DX
+	MOVQ DX, 40(AX)
+	MOVQ (AX), AX
+	MOVQ R9, 24(AX)
+	MOVQ R10, 32(AX)
+	MOVB R11, 40(AX)
+	RET
 
-	MOVB off, ret+56(FP)
+	// Report error
+error_max_decoded_size_exceeded:
+	MOVQ ctx+0(FP), AX
+	MOVQ $-1, CX
+	MOVQ CX, 40(AX)
 	RET
 
-#undef off
-#undef buffer
-#undef table
+// func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
+// Requires: BMI2
+TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
+	MOVQ    ctx+0(FP), CX
+	MOVQ    16(CX), DX
+	MOVQ    24(CX), BX
+	CMPQ    BX, $0x04
+	JB      error_max_decoded_size_exceeded
+	LEAQ    (DX)(BX*1), BX
+	MOVQ    (CX), SI
+	MOVQ    (SI), R8
+	MOVQ    24(SI), R9
+	MOVQ    32(SI), R10
+	MOVBQZX 40(SI), R11
+	MOVQ    32(CX), SI
+	MOVBQZX 8(CX), DI
+	JMP     loop_condition
 
-#undef br_bits_read
-#undef br_value
-#undef br_offset
-#undef peek_bits
-#undef exhausted
+main_loop:
+	// Check if we have room for 4 bytes in the output buffer
+	LEAQ 4(DX), CX
+	CMPQ CX, BX
+	JGE  error_max_decoded_size_exceeded
+
+	// Decode 4 values
+	CMPQ  R11, $0x20
+	JL    bitReader_fillFast_1_end
+	SUBQ  $0x20, R11
+	SUBQ  $0x04, R9
+	MOVL  (R8)(R9*1), CX
+	SHLXQ R11, CX, CX
+	ORQ   CX, R10
+
+bitReader_fillFast_1_end:
+	SHRXQ   DI, R10, CX
+	MOVW    (SI)(CX*2), CX
+	MOVB    CH, AL
+	MOVBQZX CL, CX
+	ADDQ    CX, R11
+	SHLXQ   CX, R10, R10
+	SHRXQ   DI, R10, CX
+	MOVW    (SI)(CX*2), CX
+	MOVB    CH, AH
+	MOVBQZX CL, CX
+	ADDQ    CX, R11
+	SHLXQ   CX, R10, R10
+	BSWAPL  AX
+	CMPQ    R11, $0x20
+	JL      bitReader_fillFast_2_end
+	SUBQ    $0x20, R11
+	SUBQ    $0x04, R9
+	MOVL    (R8)(R9*1), CX
+	SHLXQ   R11, CX, CX
+	ORQ     CX, R10
+
+bitReader_fillFast_2_end:
+	SHRXQ   DI, R10, CX
+	MOVW    (SI)(CX*2), CX
+	MOVB    CH, AH
+	MOVBQZX CL, CX
+	ADDQ    CX, R11
+	SHLXQ   CX, R10, R10
+	SHRXQ   DI, R10, CX
+	MOVW    (SI)(CX*2), CX
+	MOVB    CH, AL
+	MOVBQZX CL, CX
+	ADDQ    CX, R11
+	SHLXQ   CX, R10, R10
+	BSWAPL  AX
+
+	// Store the decoded values
+	MOVL AX, (DX)
+	ADDQ $0x04, DX
+
+loop_condition:
+	CMPQ R9, $0x08
+	JGE  main_loop
+
+	// Update ctx structure
+	MOVQ ctx+0(FP), AX
+	SUBQ 16(AX), DX
+	MOVQ DX, 40(AX)
+	MOVQ (AX), AX
+	MOVQ R9, 24(AX)
+	MOVQ R10, 32(AX)
+	MOVB R11, 40(AX)
+	RET
 
-#undef br0
-#undef br1
-#undef br2
-#undef br3
+	// Report error
+error_max_decoded_size_exceeded:
+	MOVQ ctx+0(FP), AX
+	MOVQ $-1, CX
+	MOVQ CX, 40(AX)
+	RET
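
The rewritten assembly above no longer takes individual arguments; each function receives a single `ctx` pointer and reads everything through fixed offsets. A hedged sketch of the context layouts, inferred purely from the frame-pointer offsets the assembly uses (the authoritative definitions live in the accompanying `decompress_amd64.go`; field names here are guesses):

```go
// Sketch only: field order inferred from the offsets read above.
// bitReaderShifted and dEntrySingle are the huff0 package's own types.
type decompress1xContext struct {
	pbr      *bitReaderShifted // (CX); in at +0, off at +24, value at +32, bitsRead at +40
	peekBits uint8             // 8(CX)
	out      *byte             // 16(CX)
	outCap   int               // 24(CX)
	tbl      *dEntrySingle     // 32(CX)
	decoded  int               // 40(CX); set to -1 on max-size overflow
}

type decompress4xContext struct {
	pbr      *[4]bitReaderShifted // (CX); 48-byte stride per reader
	peekBits uint8                // 8(CX)
	out      *byte                // 16(CX)
	dstEvery int                  // 24(CX): distance between the 4 output streams
	tbl      *dEntrySingle        // 32(CX)
	decoded  int                  // 40(CX); written back as (out - start) * 4
	limit    *byte                // 48(CX): main loop stops before out reaches this
}
```

Packing the arguments into one struct keeps the assembly signature stable (`$0-8`) while letting the Go side evolve independently.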

+ 0 - 195
vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in

@@ -1,195 +0,0 @@
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-#include "funcdata.h"
-#include "go_asm.h"
-
-#ifdef GOAMD64_v4
-#ifndef GOAMD64_v3
-#define GOAMD64_v3
-#endif
-#endif
-
-#define bufoff      256     // see decompress.go, we're using [4][256]byte table
-
-//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
-//	peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
-TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
-#define off             R8
-#define buffer          DI
-#define table           SI
-
-#define br_bits_read    R9
-#define br_value        R10
-#define br_offset       R11
-#define peek_bits       R12
-#define exhausted       DX
-
-#define br0             R13
-#define br1             R14
-#define br2             R15
-#define br3             BP
-
-    MOVQ    BP, 0(SP)
-
-    XORQ    exhausted, exhausted    // exhausted = false
-    XORQ    off, off                // off = 0
-
-    MOVBQZX peekBits+32(FP), peek_bits
-    MOVQ    buf+40(FP), buffer
-    MOVQ    tbl+48(FP), table
-
-    MOVQ    pbr0+0(FP), br0
-    MOVQ    pbr1+8(FP), br1
-    MOVQ    pbr2+16(FP), br2
-    MOVQ    pbr3+24(FP), br3
-
-main_loop:
-{{ define "decode_2_values_x86" }}
-    // const stream = {{ var "id" }}
-    // br{{ var "id"}}.fillFast()
-    MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
-    MOVQ    bitReaderShifted_value(br{{ var "id" }}), br_value
-    MOVQ    bitReaderShifted_off(br{{ var "id" }}), br_offset
-
-    // We must have at least 2 * max tablelog left
-    CMPQ    br_bits_read, $64-22
-    JBE     skip_fill{{ var "id" }}
-
-    SUBQ    $32, br_bits_read       // b.bitsRead -= 32
-    SUBQ    $4, br_offset           // b.off -= 4
-
-	// v := b.in[b.off-4 : b.off]
-	// v = v[:4]
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-    MOVQ    bitReaderShifted_in(br{{ var "id" }}), AX
-
-	// b.value |= uint64(low) << (b.bitsRead & 63)
-#ifdef GOAMD64_v3
-    SHLXQ   br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
-#else
-    MOVL    0(br_offset)(AX*1), AX  // AX = uint32(b.in[b.off:b.off+4])
-    MOVQ    br_bits_read, CX
-    SHLQ    CL, AX
-#endif
-
-    ORQ     AX, br_value
-
-    // exhausted = exhausted || (br{{ var "id"}}.off < 4)
-    CMPQ    br_offset, $4
-    SETLT   DL
-    ORB     DL, DH
-    // }
-skip_fill{{ var "id" }}:
-
-    // val0 := br{{ var "id"}}.peekTopBits(peekBits)
-#ifdef GOAMD64_v3
-    SHRXQ   peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
-#else
-    MOVQ    br_value, AX
-    MOVQ    peek_bits, CX
-    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
-#endif
-
-    // v0 := table[val0&mask]
-    MOVW    0(table)(AX*2), AX      // AX - v0
-
-    // br{{ var "id"}}.advance(uint8(v0.entry))
-    MOVB    AH, BL                  // BL = uint8(v0.entry >> 8)
-
-#ifdef GOAMD64_v3
-    MOVBQZX AL, CX
-    SHLXQ   AX, br_value, br_value // value <<= n
-#else
-    MOVBQZX AL, CX
-    SHLQ    CL, br_value            // value <<= n
-#endif
-
-    ADDQ    CX, br_bits_read        // bits_read += n
-
-
-#ifdef GOAMD64_v3
-    SHRXQ    peek_bits, br_value, AX  // AX = (value >> peek_bits) & mask
-#else
-    // val1 := br{{ var "id"}}.peekTopBits(peekBits)
-    MOVQ    peek_bits, CX
-    MOVQ    br_value, AX
-    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
-#endif
-
-    // v1 := table[val1&mask]
-    MOVW    0(table)(AX*2), AX      // AX - v1
-
-    // br{{ var "id"}}.advance(uint8(v1.entry))
-    MOVB    AH, BH                  // BH = uint8(v1.entry >> 8)
-
-#ifdef GOAMD64_v3
-    MOVBQZX AL, CX
-    SHLXQ   AX, br_value, br_value // value <<= n
-#else
-    MOVBQZX AL, CX
-    SHLQ    CL, br_value            // value <<= n
-#endif
-
-    ADDQ    CX, br_bits_read        // bits_read += n
-
-
-    // these two writes get coalesced
-    // buf[stream][off] = uint8(v0.entry >> 8)
-    // buf[stream][off+1] = uint8(v1.entry >> 8)
-    MOVW    BX, {{ var "bufofs" }}(buffer)(off*1)
-
-    // update the bitrader reader structure
-    MOVB    br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
-    MOVQ    br_value, bitReaderShifted_value(br{{ var "id" }})
-    MOVQ    br_offset, bitReaderShifted_off(br{{ var "id" }})
-{{ end }}
-
-    {{ set "id" "0" }}
-    {{ set "ofs" "0" }}
-    {{ set "bufofs" "0" }} {{/* id * bufoff */}}
-    {{ template "decode_2_values_x86" . }}
-
-    {{ set "id" "1" }}
-    {{ set "ofs" "8" }}
-    {{ set "bufofs" "256" }}
-    {{ template "decode_2_values_x86" . }}
-
-    {{ set "id" "2" }}
-    {{ set "ofs" "16" }}
-    {{ set "bufofs" "512" }}
-    {{ template "decode_2_values_x86" . }}
-
-    {{ set "id" "3" }}
-    {{ set "ofs" "24" }}
-    {{ set "bufofs" "768" }}
-    {{ template "decode_2_values_x86" . }}
-
-    ADDQ    $2, off     // off += 2
-
-    TESTB   DH, DH      // any br[i].ofs < 4?
-    JNZ     end
-
-    CMPQ    off, $bufoff
-    JL      main_loop
-end:
-    MOVQ    0(SP), BP
-
-    MOVB    off, ret+56(FP)
-    RET
-#undef  off
-#undef  buffer
-#undef  table
-
-#undef  br_bits_read
-#undef  br_value
-#undef  br_offset
-#undef  peek_bits
-#undef  exhausted
-
-#undef  br0
-#undef  br1
-#undef  br2
-#undef  br3

+ 102 - 0
vendor/github.com/klauspost/compress/huff0/decompress_generic.go

@@ -191,3 +191,105 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
 	}
 	return dst, nil
 }
+
+// Decompress1X will decompress a 1X encoded stream.
+// The cap of the output buffer will be the maximum decompressed size.
+// The length of the supplied input must match the end of a block exactly.
+func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
+	if len(d.dt.single) == 0 {
+		return nil, errors.New("no table loaded")
+	}
+	if use8BitTables && d.actualTableLog <= 8 {
+		return d.decompress1X8Bit(dst, src)
+	}
+	var br bitReaderShifted
+	err := br.init(src)
+	if err != nil {
+		return dst, err
+	}
+	maxDecodedSize := cap(dst)
+	dst = dst[:0]
+
+	// Avoid bounds check by always having full sized table.
+	const tlSize = 1 << tableLogMax
+	const tlMask = tlSize - 1
+	dt := d.dt.single[:tlSize]
+
+	// Use a temp buffer to avoid bounds checks/append penalty.
+	bufs := d.buffer()
+	buf := &bufs[0]
+	var off uint8
+
+	for br.off >= 8 {
+		br.fillFast()
+		v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+		br.advance(uint8(v.entry))
+		buf[off+0] = uint8(v.entry >> 8)
+
+		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+		br.advance(uint8(v.entry))
+		buf[off+1] = uint8(v.entry >> 8)
+
+		// Refill
+		br.fillFast()
+
+		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+		br.advance(uint8(v.entry))
+		buf[off+2] = uint8(v.entry >> 8)
+
+		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+		br.advance(uint8(v.entry))
+		buf[off+3] = uint8(v.entry >> 8)
+
+		off += 4
+		if off == 0 {
+			if len(dst)+256 > maxDecodedSize {
+				br.close()
+				d.bufs.Put(bufs)
+				return nil, ErrMaxDecodedSizeExceeded
+			}
+			dst = append(dst, buf[:]...)
+		}
+	}
+
+	if len(dst)+int(off) > maxDecodedSize {
+		d.bufs.Put(bufs)
+		br.close()
+		return nil, ErrMaxDecodedSizeExceeded
+	}
+	dst = append(dst, buf[:off]...)
+
+	// br.off < 8, so the remaining bit count fits in a uint8
+	bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
+	for bitsLeft > 0 {
+		br.fill()
+		if false && br.bitsRead >= 32 {
+			if br.off >= 4 {
+				v := br.in[br.off-4:]
+				v = v[:4]
+				low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+				br.value = (br.value << 32) | uint64(low)
+				br.bitsRead -= 32
+				br.off -= 4
+			} else {
+				for br.off > 0 {
+					br.value = (br.value << 8) | uint64(br.in[br.off-1])
+					br.bitsRead -= 8
+					br.off--
+				}
+			}
+		}
+		if len(dst) >= maxDecodedSize {
+			d.bufs.Put(bufs)
+			br.close()
+			return nil, ErrMaxDecodedSizeExceeded
+		}
+		v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
+		nBits := uint8(v.entry)
+		br.advance(nBits)
+		bitsLeft -= nBits
+		dst = append(dst, uint8(v.entry>>8))
+	}
+	d.bufs.Put(bufs)
+	return dst, br.close()
+}
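
Since `maxDecodedSize` is taken from `cap(dst)`, the caller controls the output limit through the capacity it passes in. A minimal usage sketch (`maxOut` is an assumed caller-side bound; the decoder comes from a prepared `huff0.Scratch`):

```go
import "github.com/klauspost/compress/huff0"

// decodeOne decompresses a single huff0 stream; cap(dst) bounds the
// decoded size for this call.
func decodeOne(dec *huff0.Decoder, compressed []byte, maxOut int) ([]byte, error) {
	dst := make([]byte, 0, maxOut)
	// Decompress1X returns ErrMaxDecodedSizeExceeded if the stream
	// would decode to more than maxOut bytes.
	return dec.Decompress1X(dst, compressed)
}
```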

+ 34 - 0
vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go

@@ -0,0 +1,34 @@
+// Package cpuinfo gives runtime info about the current CPU.
+//
+// This is a very limited module meant for use internally
+// in this project. For a more versatile solution, check
+// https://github.com/klauspost/cpuid.
+package cpuinfo
+
+// HasBMI1 checks whether an x86 CPU supports the BMI1 extension.
+func HasBMI1() bool {
+	return hasBMI1
+}
+
+// HasBMI2 checks whether an x86 CPU supports the BMI2 extension.
+func HasBMI2() bool {
+	return hasBMI2
+}
+
+// DisableBMI2 will disable BMI2 for testing purposes.
+// Call the returned function to restore the previous state.
+func DisableBMI2() func() {
+	old := hasBMI2
+	hasBMI2 = false
+	return func() {
+		hasBMI2 = old
+	}
+}
+
+// HasBMI checks whether an x86 CPU supports both BMI1 and BMI2 extensions.
+func HasBMI() bool {
+	return HasBMI1() && HasBMI2()
+}
+
+var hasBMI1 bool
+var hasBMI2 bool
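
A typical use of this package is to pick a code path once, based on detected CPU features, the way the new BMI2 assembly loops are gated. A hedged sketch (the dispatch and the two decode functions are invented for illustration; note the package is `internal`, so it only compiles inside the klauspost/compress module):

```go
import "github.com/klauspost/compress/internal/cpuinfo"

// Decide once at init time which implementation to use.
var useBMI2 = cpuinfo.HasBMI2()

func decodeDispatch(dst, src []byte) {
	if useBMI2 {
		decodeBMI2(dst, src) // hypothetical BMI2 (SHLX/SHRX) fast path
	} else {
		decodeGeneric(dst, src) // hypothetical portable fallback
	}
}
```

`DisableBMI2` exists so tests can exercise the generic path on BMI2-capable machines and restore the detected state afterwards.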

+ 11 - 0
vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go

@@ -0,0 +1,11 @@
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+package cpuinfo
+
+//go:noescape
+func x86extensions() (bmi1, bmi2 bool)
+
+func init() {
+	hasBMI1, hasBMI2 = x86extensions()
+}

+ 36 - 0
vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s

@@ -0,0 +1,36 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+#include "funcdata.h"
+#include "go_asm.h"
+
+TEXT ·x86extensions(SB), NOSPLIT, $0
+	// 1. determine max EAX value
+	XORQ AX, AX
+	CPUID
+
+	CMPQ AX, $7
+	JB   unsupported
+
+	// 2. EAX = 7, ECX = 0 --- see Table 3-8 "Information Returned by CPUID Instruction"
+	MOVQ $7, AX
+	MOVQ $0, CX
+	CPUID
+
+	BTQ   $3, BX // bit 3 = BMI1
+	SETCS AL
+
+	BTQ   $8, BX // bit 8 = BMI2
+	SETCS AH
+
+	MOVB AL, bmi1+0(FP)
+	MOVB AH, bmi2+1(FP)
+	RET
+
+unsupported:
+	XORQ AX, AX
+	MOVB AL, bmi1+0(FP)
+	MOVB AL, bmi2+1(FP)
+	RET

+ 21 - 37
vendor/github.com/klauspost/compress/zstd/README.md

@@ -386,47 +386,31 @@ In practice this means that concurrency is often limited to utilizing about 3 co
   
 ### Benchmarks
 
-These are some examples of performance compared to [datadog cgo library](https://github.com/DataDog/zstd).
-
 The first two are streaming decodes and the last are smaller inputs. 
- 
+
+Running on AMD Ryzen 9 3950X 16-Core Processor. AMD64 assembly used.
+
 ```
-BenchmarkDecoderSilesia-8                          3     385000067 ns/op     550.51 MB/s        5498 B/op          8 allocs/op
-BenchmarkDecoderSilesiaCgo-8                       6     197666567 ns/op    1072.25 MB/s      270672 B/op          8 allocs/op
-
-BenchmarkDecoderEnwik9-8                           1    2027001600 ns/op     493.34 MB/s       10496 B/op         18 allocs/op
-BenchmarkDecoderEnwik9Cgo-8                        2     979499200 ns/op    1020.93 MB/s      270672 B/op          8 allocs/op
-
-Concurrent performance:
-
-BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-16                28915         42469 ns/op    4340.07 MB/s         114 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-16           116505          9965 ns/op    11900.16 MB/s         16 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-16              8952        134272 ns/op    3588.70 MB/s         915 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-16               11820        102538 ns/op    4161.90 MB/s         594 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-16             34782         34184 ns/op    3661.88 MB/s          60 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-16              27712         43447 ns/op    3500.58 MB/s          99 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-16                 62826         18750 ns/op    21845.10 MB/s        104 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-16          631545          1794 ns/op    57078.74 MB/s          2 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-16         1690140           712 ns/op    172938.13 MB/s         1 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-16                 10432        113593 ns/op    6180.73 MB/s        1143 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/html.zst-16                    113206         10671 ns/op    9596.27 MB/s          15 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-16          1530615           779 ns/op    5229.49 MB/s           0 B/op          0 allocs/op
-
-BenchmarkDecoder_DecodeAllParallelCgo/kppkn.gtb.zst-16             65217         16192 ns/op    11383.34 MB/s         46 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/geo.protodata.zst-16        292671          4039 ns/op    29363.19 MB/s          6 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/plrabn12.txt.zst-16          26314         46021 ns/op    10470.43 MB/s        293 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/lcet10.txt.zst-16            33897         34900 ns/op    12227.96 MB/s        205 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/asyoulik.txt.zst-16         104348         11433 ns/op    10949.01 MB/s         20 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/alice29.txt.zst-16           75949         15510 ns/op    9805.60 MB/s          32 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/html_x_4.zst-16             173910          6756 ns/op    60624.29 MB/s         37 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/paper-100k.pdf.zst-16       923076          1339 ns/op    76474.87 MB/s          1 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/fireworks.jpeg.zst-16       922920          1351 ns/op    91102.57 MB/s          2 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/urls.10K.zst-16              27649         43618 ns/op    16096.19 MB/s        407 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/html.zst-16                 279073          4160 ns/op    24614.18 MB/s          6 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/comp-data.bin.zst-16        749938          1579 ns/op    2581.71 MB/s           0 B/op          0 allocs/op
+BenchmarkDecoderSilesia-32    	                   5	 206878840 ns/op	1024.50 MB/s	   49808 B/op	      43 allocs/op
+BenchmarkDecoderEnwik9-32                          1	1271809000 ns/op	 786.28 MB/s	   72048 B/op	      52 allocs/op
+
+Concurrent blocks, performance:
+
+BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-32         	   67356	     17857 ns/op	10321.96 MB/s	        22.48 pct	     102 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-32     	  266656	      4421 ns/op	26823.21 MB/s	        11.89 pct	      19 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-32      	   20992	     56842 ns/op	8477.17 MB/s	        39.90 pct	     754 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-32        	   27456	     43932 ns/op	9714.01 MB/s	        33.27 pct	     524 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-32      	   78432	     15047 ns/op	8319.15 MB/s	        40.34 pct	      66 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-32       	   65800	     18436 ns/op	8249.63 MB/s	        37.75 pct	      88 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-32          	  102993	     11523 ns/op	35546.09 MB/s	         3.637 pct	     143 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-32    	 1000000	      1070 ns/op	95720.98 MB/s	        80.53 pct	       3 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-32    	  749802	      1752 ns/op	70272.35 MB/s	       100.0 pct	       5 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-32          	   22640	     52934 ns/op	13263.37 MB/s	        26.25 pct	    1014 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/html.zst-32              	  226412	      5232 ns/op	19572.27 MB/s	        14.49 pct	      20 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-32     	  923041	      1276 ns/op	3194.71 MB/s	        31.26 pct	       0 B/op	       0 allocs/op
 ```
 
-This reflects the performance around May 2020, but this may be out of date.
+This reflects the performance around May 2022, but this may be out of date.
 
 ## Zstd inside ZIP files
 

+ 0 - 7
vendor/github.com/klauspost/compress/zstd/bitreader.go

@@ -63,13 +63,6 @@ func (b *bitReader) get32BitsFast(n uint8) uint32 {
 	return v
 }
 
-func (b *bitReader) get16BitsFast(n uint8) uint16 {
-	const regMask = 64 - 1
-	v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
-	b.bitsRead += n
-	return v
-}
-
 // fillFast() will make sure at least 32 bits are available.
 // There must be at least 4 bytes available.
 func (b *bitReader) fillFast() {

+ 0 - 76
vendor/github.com/klauspost/compress/zstd/bitwriter.go

@@ -5,8 +5,6 @@
 
 package zstd
 
-import "fmt"
-
 // bitWriter will write bits.
 // First bit will be LSB of the first byte of output.
 type bitWriter struct {
@@ -73,80 +71,6 @@ func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
 	b.nBits += bits
 }
 
-// flush will flush all pending full bytes.
-// There will be at least 56 bits available for writing when this has been called.
-// Using flush32 is faster, but leaves less space for writing.
-func (b *bitWriter) flush() {
-	v := b.nBits >> 3
-	switch v {
-	case 0:
-	case 1:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-		)
-	case 2:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-		)
-	case 3:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-			byte(b.bitContainer>>16),
-		)
-	case 4:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-			byte(b.bitContainer>>16),
-			byte(b.bitContainer>>24),
-		)
-	case 5:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-			byte(b.bitContainer>>16),
-			byte(b.bitContainer>>24),
-			byte(b.bitContainer>>32),
-		)
-	case 6:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-			byte(b.bitContainer>>16),
-			byte(b.bitContainer>>24),
-			byte(b.bitContainer>>32),
-			byte(b.bitContainer>>40),
-		)
-	case 7:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-			byte(b.bitContainer>>16),
-			byte(b.bitContainer>>24),
-			byte(b.bitContainer>>32),
-			byte(b.bitContainer>>40),
-			byte(b.bitContainer>>48),
-		)
-	case 8:
-		b.out = append(b.out,
-			byte(b.bitContainer),
-			byte(b.bitContainer>>8),
-			byte(b.bitContainer>>16),
-			byte(b.bitContainer>>24),
-			byte(b.bitContainer>>32),
-			byte(b.bitContainer>>40),
-			byte(b.bitContainer>>48),
-			byte(b.bitContainer>>56),
-		)
-	default:
-		panic(fmt.Errorf("bits (%d) > 64", b.nBits))
-	}
-	b.bitContainer >>= v << 3
-	b.nBits &= 7
-}
-
 // flush32 will flush out, so there are at least 32 bits available for writing.
 func (b *bitWriter) flush32() {
 	if b.nBits < 32 {

+ 48 - 25
vendor/github.com/klauspost/compress/zstd/blockdec.go

@@ -5,9 +5,14 @@
 package zstd
 
 import (
+	"bytes"
+	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
+	"io/ioutil"
+	"os"
+	"path/filepath"
 	"sync"
 
 	"github.com/klauspost/compress/huff0"
@@ -38,14 +43,14 @@ const (
 	// maxCompressedBlockSize is the biggest allowed compressed block size (128KB)
 	maxCompressedBlockSize = 128 << 10
 
+	compressedBlockOverAlloc    = 16
+	maxCompressedBlockSizeAlloc = 128<<10 + compressedBlockOverAlloc
+
 	// Maximum possible block size (all Raw+Uncompressed).
 	maxBlockSize = (1 << 21) - 1
 
-	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#literals_section_header
-	maxCompressedLiteralSize = 1 << 18
-	maxRLELiteralSize        = 1 << 20
-	maxMatchLen              = 131074
-	maxSequences             = 0x7f00 + 0xffff
+	maxMatchLen  = 131074
+	maxSequences = 0x7f00 + 0xffff
 
 	// We support slightly less than the reference decoder to be able to
 	// use ints on 32 bit archs.
@@ -97,7 +102,6 @@ type blockDec struct {
 
 	// Block is RLE, this is the size.
 	RLESize uint32
-	tmp     [4]byte
 
 	Type blockType
 
@@ -136,7 +140,7 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
 	b.Type = blockType((bh >> 1) & 3)
 	// find size.
 	cSize := int(bh >> 3)
-	maxSize := maxBlockSize
+	maxSize := maxCompressedBlockSizeAlloc
 	switch b.Type {
 	case blockTypeReserved:
 		return ErrReservedBlockType
@@ -157,9 +161,9 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
 			println("Data size on stream:", cSize)
 		}
 		b.RLESize = 0
-		maxSize = maxCompressedBlockSize
+		maxSize = maxCompressedBlockSizeAlloc
 		if windowSize < maxCompressedBlockSize && b.lowMem {
-			maxSize = int(windowSize)
+			maxSize = int(windowSize) + compressedBlockOverAlloc
 		}
 		if cSize > maxCompressedBlockSize || uint64(cSize) > b.WindowSize {
 			if debugDecoder {
@@ -190,9 +194,9 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
 	// Read block data.
 	if cap(b.dataStorage) < cSize {
 		if b.lowMem || cSize > maxCompressedBlockSize {
-			b.dataStorage = make([]byte, 0, cSize)
+			b.dataStorage = make([]byte, 0, cSize+compressedBlockOverAlloc)
 		} else {
-			b.dataStorage = make([]byte, 0, maxCompressedBlockSize)
+			b.dataStorage = make([]byte, 0, maxCompressedBlockSizeAlloc)
 		}
 	}
 	if cap(b.dst) <= maxSize {
@@ -360,14 +364,9 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
 		}
 		if cap(b.literalBuf) < litRegenSize {
 			if b.lowMem {
-				b.literalBuf = make([]byte, litRegenSize)
+				b.literalBuf = make([]byte, litRegenSize, litRegenSize+compressedBlockOverAlloc)
 			} else {
-				if litRegenSize > maxCompressedLiteralSize {
-					// Exceptional
-					b.literalBuf = make([]byte, litRegenSize)
-				} else {
-					b.literalBuf = make([]byte, litRegenSize, maxCompressedLiteralSize)
-				}
+				b.literalBuf = make([]byte, litRegenSize, maxCompressedBlockSize+compressedBlockOverAlloc)
 			}
 		}
 		literals = b.literalBuf[:litRegenSize]
@@ -397,14 +396,14 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
 		// Ensure we have space to store it.
 		if cap(b.literalBuf) < litRegenSize {
 			if b.lowMem {
-				b.literalBuf = make([]byte, 0, litRegenSize)
+				b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc)
 			} else {
-				b.literalBuf = make([]byte, 0, maxCompressedLiteralSize)
+				b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc)
 			}
 		}
 		var err error
 		// Use our out buffer.
-		huff.MaxDecodedSize = maxCompressedBlockSize
+		huff.MaxDecodedSize = litRegenSize
 		if fourStreams {
 			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
 		} else {
@@ -429,9 +428,9 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
 		// Ensure we have space to store it.
 		if cap(b.literalBuf) < litRegenSize {
 			if b.lowMem {
-				b.literalBuf = make([]byte, 0, litRegenSize)
+				b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc)
 			} else {
-				b.literalBuf = make([]byte, 0, maxCompressedBlockSize)
+				b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc)
 			}
 		}
 		huff := hist.huffTree
@@ -448,7 +447,7 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
 			return in, err
 		}
 		hist.huffTree = huff
-		huff.MaxDecodedSize = maxCompressedBlockSize
+		huff.MaxDecodedSize = litRegenSize
 		// Use our out buffer.
 		if fourStreams {
 			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
@@ -463,6 +462,8 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
 		if len(literals) != litRegenSize {
 			return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
 		}
+		// Re-slice from literalBuf so the over-allocated capacity is kept.
+		literals = b.literalBuf[:len(literals)]
 		if debugDecoder {
 			printf("Decompressed %d literals into %d bytes\n", litCompSize, litRegenSize)
 		}
@@ -486,10 +487,15 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		b.dst = append(b.dst, hist.decoders.literals...)
 		return nil
 	}
-	err = hist.decoders.decodeSync(hist)
+	before := len(hist.decoders.out)
+	err = hist.decoders.decodeSync(hist.b[hist.ignoreBuffer:])
 	if err != nil {
 		return err
 	}
+	if hist.decoders.maxSyncLen > 0 {
+		hist.decoders.maxSyncLen += uint64(before)
+		hist.decoders.maxSyncLen -= uint64(len(hist.decoders.out))
+	}
 	b.dst = hist.decoders.out
 	hist.recentOffsets = hist.decoders.prevOffset
 	return nil
@@ -632,6 +638,22 @@ func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) {
 		println("initializing sequences:", err)
 		return err
 	}
+	// Extract blocks...
+	if false && hist.dict == nil {
+		fatalErr := func(err error) {
+			if err != nil {
+				panic(err)
+			}
+		}
+		fn := fmt.Sprintf("n-%d-lits-%d-prev-%d-%d-%d-win-%d.blk", hist.decoders.nSeqs, len(hist.decoders.literals), hist.recentOffsets[0], hist.recentOffsets[1], hist.recentOffsets[2], hist.windowSize)
+		var buf bytes.Buffer
+		fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.litLengths.fse))
+		fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.matchLengths.fse))
+		fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.offsets.fse))
+		buf.Write(in)
+		ioutil.WriteFile(filepath.Join("testdata", "seqs", fn), buf.Bytes(), os.ModePerm)
+	}
+
 	return nil
 }
 
@@ -650,6 +672,7 @@ func (b *blockDec) decodeSequences(hist *history) error {
 	}
 	hist.decoders.windowSize = hist.windowSize
 	hist.decoders.prevOffset = hist.recentOffsets
+
 	err := hist.decoders.decode(b.sequence)
 	hist.recentOffsets = hist.decoders.prevOffset
 	return err
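
The recurring `+compressedBlockOverAlloc` additions in this file give every literal and data buffer a little spare capacity, so the assembly decoders can read a few bytes past the valid length without per-byte bounds checks. A minimal sketch of the pattern:

```go
// Spare capacity for assembly over-reads (value taken from this diff).
const compressedBlockOverAlloc = 16

// allocLiterals returns a buffer of length n whose capacity includes
// the over-allocation, so slices of it remain safe for wide loads.
func allocLiterals(n int) []byte {
	return make([]byte, n, n+compressedBlockOverAlloc)
}
```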

+ 9 - 10
vendor/github.com/klauspost/compress/zstd/bytebuf.go

@@ -23,7 +23,7 @@ type byteBuffer interface {
 	readByte() (byte, error)
 
 	// Skip n bytes.
-	skipN(n int) error
+	skipN(n int64) error
 }
 
 // in-memory buffer
@@ -52,10 +52,6 @@ func (b *byteBuf) readBig(n int, dst []byte) ([]byte, error) {
 	return r, nil
 }
 
-func (b *byteBuf) remain() []byte {
-	return *b
-}
-
 func (b *byteBuf) readByte() (byte, error) {
 	bb := *b
 	if len(bb) < 1 {
@@ -66,9 +62,12 @@ func (b *byteBuf) readByte() (byte, error) {
 	return r, nil
 }
 
-func (b *byteBuf) skipN(n int) error {
+func (b *byteBuf) skipN(n int64) error {
 	bb := *b
-	if len(bb) < n {
+	if n < 0 {
+		return fmt.Errorf("negative skip (%d) requested", n)
+	}
+	if int64(len(bb)) < n {
 		return io.ErrUnexpectedEOF
 	}
 	*b = bb[n:]
@@ -124,9 +123,9 @@ func (r *readerWrapper) readByte() (byte, error) {
 	return r.tmp[0], nil
 }
 
-func (r *readerWrapper) skipN(n int) error {
-	n2, err := io.CopyN(ioutil.Discard, r.r, int64(n))
-	if n2 != int64(n) {
+func (r *readerWrapper) skipN(n int64) error {
+	n2, err := io.CopyN(ioutil.Discard, r.r, n)
+	if n2 != n {
 		err = io.ErrUnexpectedEOF
 	}
 	return err

+ 0 - 6
vendor/github.com/klauspost/compress/zstd/bytereader.go

@@ -13,12 +13,6 @@ type byteReader struct {
 	off int
 }
 
-// init will initialize the reader and set the input.
-func (b *byteReader) init(in []byte) {
-	b.b = in
-	b.off = 0
-}
-
 // advance the stream b n bytes.
 func (b *byteReader) advance(n uint) {
 	b.off += int(n)

+ 62 - 62
vendor/github.com/klauspost/compress/zstd/decoder.go

@@ -347,18 +347,23 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
 			}
 			frame.history.setDict(&dict)
 		}
-
-		if frame.FrameContentSize != fcsUnknown && frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
-			return dst, ErrDecoderSizeExceeded
+		if frame.WindowSize > d.o.maxWindowSize {
+			if debugDecoder {
+				println("window size exceeded:", frame.WindowSize, ">", d.o.maxWindowSize)
+			}
+			return dst, ErrWindowSizeExceeded
 		}
-		if frame.FrameContentSize < 1<<30 {
-			// Never preallocate more than 1 GB up front.
+		if frame.FrameContentSize != fcsUnknown {
+			if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
+				return dst, ErrDecoderSizeExceeded
+			}
 			if cap(dst)-len(dst) < int(frame.FrameContentSize) {
-				dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize))
+				dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize)+compressedBlockOverAlloc)
 				copy(dst2, dst)
 				dst = dst2
 			}
 		}
+
 		if cap(dst) == 0 {
 			// Allocate len(input) * 2 by default if nothing is provided
 			// and we didn't get frame content size.
@@ -437,7 +442,7 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) {
 		println("got", len(d.current.b), "bytes, error:", d.current.err, "data crc:", tmp)
 	}
 
-	if len(next.b) > 0 {
+	if !d.o.ignoreChecksum && len(next.b) > 0 {
 		n, err := d.current.crc.Write(next.b)
 		if err == nil {
 			if n != len(next.b) {
@@ -449,7 +454,7 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) {
 		got := d.current.crc.Sum64()
 		var tmp [4]byte
 		binary.LittleEndian.PutUint32(tmp[:], uint32(got))
-		if !bytes.Equal(tmp[:], next.d.checkCRC) && !ignoreCRC {
+		if !d.o.ignoreChecksum && !bytes.Equal(tmp[:], next.d.checkCRC) {
 			if debugDecoder {
 				println("CRC Check Failed:", tmp[:], " (got) !=", next.d.checkCRC, "(on stream)")
 			}
@@ -533,9 +538,15 @@ func (d *Decoder) nextBlockSync() (ok bool) {
 
 		// Update/Check CRC
 		if d.frame.HasCheckSum {
-			d.frame.crc.Write(d.current.b)
+			if !d.o.ignoreChecksum {
+				d.frame.crc.Write(d.current.b)
+			}
 			if d.current.d.Last {
-				d.current.err = d.frame.checkCRC()
+				if !d.o.ignoreChecksum {
+					d.current.err = d.frame.checkCRC()
+				} else {
+					d.current.err = d.frame.consumeCRC()
+				}
 				if d.current.err != nil {
 					println("CRC error:", d.current.err)
 					return false
@@ -629,60 +640,18 @@ func (d *Decoder) startSyncDecoder(r io.Reader) error {
 
 // Create Decoder:
 // ASYNC:
-// Spawn 4 go routines.
-// 0: Read frames and decode blocks.
-// 1: Decode block and literals. Receives hufftree and seqdecs, returns seqdecs and huff tree.
-// 2: Wait for recentOffsets if needed. Decode sequences, send recentOffsets.
-// 3: Wait for stream history, execute sequences, send stream history.
+// Spawn 3 goroutines.
+// 0: Read frames and decode block literals.
+// 1: Decode sequences.
+// 2: Execute sequences, send to output.
 func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output chan decodeOutput) {
 	defer d.streamWg.Done()
 	br := readerWrapper{r: r}
 
-	var seqPrepare = make(chan *blockDec, d.o.concurrent)
 	var seqDecode = make(chan *blockDec, d.o.concurrent)
 	var seqExecute = make(chan *blockDec, d.o.concurrent)
 
-	// Async 1: Prepare blocks...
-	go func() {
-		var hist history
-		var hasErr bool
-		for block := range seqPrepare {
-			if hasErr {
-				if block != nil {
-					seqDecode <- block
-				}
-				continue
-			}
-			if block.async.newHist != nil {
-				if debugDecoder {
-					println("Async 1: new history")
-				}
-				hist.reset()
-				if block.async.newHist.dict != nil {
-					hist.setDict(block.async.newHist.dict)
-				}
-			}
-			if block.err != nil || block.Type != blockTypeCompressed {
-				hasErr = block.err != nil
-				seqDecode <- block
-				continue
-			}
-
-			remain, err := block.decodeLiterals(block.data, &hist)
-			block.err = err
-			hasErr = block.err != nil
-			if err == nil {
-				block.async.literals = hist.decoders.literals
-				block.async.seqData = remain
-			} else if debugDecoder {
-				println("decodeLiterals error:", err)
-			}
-			seqDecode <- block
-		}
-		close(seqDecode)
-	}()
-
-	// Async 2: Decode sequences...
+	// Async 1: Decode sequences...
 	go func() {
 		var hist history
 		var hasErr bool
@@ -696,7 +665,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
 			}
 			if block.async.newHist != nil {
 				if debugDecoder {
-					println("Async 2: new history, recent:", block.async.newHist.recentOffsets)
+					println("Async 1: new history, recent:", block.async.newHist.recentOffsets)
 				}
 				hist.decoders = block.async.newHist.decoders
 				hist.recentOffsets = block.async.newHist.recentOffsets
@@ -750,7 +719,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
 			}
 			if block.async.newHist != nil {
 				if debugDecoder {
-					println("Async 3: new history")
+					println("Async 2: new history")
 				}
 				hist.windowSize = block.async.newHist.windowSize
 				hist.allocFrameBuffer = block.async.newHist.allocFrameBuffer
@@ -837,6 +806,33 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
 
 decodeStream:
 	for {
+		var hist history
+		var hasErr bool
+
+		decodeBlock := func(block *blockDec) {
+			if hasErr {
+				if block != nil {
+					seqDecode <- block
+				}
+				return
+			}
+			if block.err != nil || block.Type != blockTypeCompressed {
+				hasErr = block.err != nil
+				seqDecode <- block
+				return
+			}
+
+			remain, err := block.decodeLiterals(block.data, &hist)
+			block.err = err
+			hasErr = block.err != nil
+			if err == nil {
+				block.async.literals = hist.decoders.literals
+				block.async.seqData = remain
+			} else if debugDecoder {
+				println("decodeLiterals error:", err)
+			}
+			seqDecode <- block
+		}
 		frame := d.frame
 		if debugDecoder {
 			println("New frame...")
@@ -863,7 +859,7 @@ decodeStream:
 			case <-ctx.Done():
 			case dec := <-d.decoders:
 				dec.sendErr(err)
-				seqPrepare <- dec
+				decodeBlock(dec)
 			}
 			break decodeStream
 		}
@@ -883,6 +879,10 @@ decodeStream:
 				if debugDecoder {
 					println("Alloc History:", h.allocFrameBuffer)
 				}
+				hist.reset()
+				if h.dict != nil {
+					hist.setDict(h.dict)
+				}
 				dec.async.newHist = &h
 				dec.async.fcs = frame.FrameContentSize
 				historySent = true
@@ -909,7 +909,7 @@ decodeStream:
 			}
 			err = dec.err
 			last := dec.Last
-			seqPrepare <- dec
+			decodeBlock(dec)
 			if err != nil {
 				break decodeStream
 			}
@@ -918,7 +918,7 @@ decodeStream:
 			}
 		}
 	}
-	close(seqPrepare)
+	close(seqDecode)
 	wg.Wait()
 	d.frame.history.b = frameHistCache
 }
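
The restructured stream decoder is a classic channel pipeline: the main goroutine now reads frames and decodes literals itself, then hands blocks to a sequence-decoding goroutine and an execution goroutine over the `seqDecode`/`seqExecute` channels. A self-contained, hedged sketch of the shape (string payloads stand in for `*blockDec`; this is not the library's actual code):

```go
package main

import "fmt"

func main() {
	const concurrent = 4
	seqDecode := make(chan string, concurrent)
	seqExecute := make(chan string, concurrent)
	output := make(chan string, concurrent)

	// Stage 1: decode sequences for each block, in order.
	go func() {
		for b := range seqDecode {
			seqExecute <- b + "+seqs"
		}
		close(seqExecute)
	}()

	// Stage 2: execute sequences and emit decompressed output.
	go func() {
		for b := range seqExecute {
			output <- b + "+executed"
		}
		close(output)
	}()

	// Main goroutine: read frames, decode literals, feed the pipeline.
	for _, b := range []string{"block1", "block2"} {
		seqDecode <- b + "+lits"
	}
	close(seqDecode)

	for b := range output {
		fmt.Println(b)
	}
}
```

Folding the old literal-preparation goroutine into the reader removes one hand-off per block while keeping the ordering guarantees the channels provide.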

+ 11 - 2
vendor/github.com/klauspost/compress/zstd/decoder_options.go

@@ -19,6 +19,7 @@ type decoderOptions struct {
 	maxDecodedSize uint64
 	maxWindowSize  uint64
 	dicts          []dict
+	ignoreChecksum bool
 }
 
 func (o *decoderOptions) setDefault() {
@@ -31,7 +32,7 @@ func (o *decoderOptions) setDefault() {
 	if o.concurrent > 4 {
 		o.concurrent = 4
 	}
-	o.maxDecodedSize = 1 << 63
+	o.maxDecodedSize = 64 << 30
 }
 
 // WithDecoderLowmem will set whether to use a lower amount of memory,
@@ -66,7 +67,7 @@ func WithDecoderConcurrency(n int) DOption {
 // WithDecoderMaxMemory allows to set a maximum decoded size for in-memory
 // non-streaming operations or maximum window size for streaming operations.
 // This can be used to control memory usage of potentially hostile content.
-// Maximum and default is 1 << 63 bytes.
+// Maximum is 1 << 63 bytes. Default is 64GiB.
 func WithDecoderMaxMemory(n uint64) DOption {
 	return func(o *decoderOptions) error {
 		if n == 0 {
@@ -112,3 +113,11 @@ func WithDecoderMaxWindow(size uint64) DOption {
 		return nil
 	}
 }
+
+// IgnoreChecksum allows checksum verification to be forcibly ignored.
+func IgnoreChecksum(b bool) DOption {
+	return func(o *decoderOptions) error {
+		o.ignoreChecksum = b
+		return nil
+	}
+}
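
Both new knobs compose like any other `DOption`. A short example combining the checksum override with the (now 64 GiB by default) memory cap; `NewReader` with a nil reader is the usual setup for `DecodeAll`-only use:

```go
import "github.com/klauspost/compress/zstd"

// decodeAll skips CRC verification and caps decoded memory at 1 GiB.
func decodeAll(compressed []byte) ([]byte, error) {
	dec, err := zstd.NewReader(nil,
		zstd.IgnoreChecksum(true),
		zstd.WithDecoderMaxMemory(1<<30),
	)
	if err != nil {
		return nil, err
	}
	defer dec.Close()
	return dec.DecodeAll(compressed, nil)
}
```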

+ 4 - 4
vendor/github.com/klauspost/compress/zstd/enc_better.go

@@ -156,8 +156,8 @@ encodeLoop:
 				panic("offset0 was 0")
 			}
 
-			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
 			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
+			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
 			candidateL := e.longTable[nextHashL]
 			candidateS := e.table[nextHashS]
 
@@ -518,8 +518,8 @@ encodeLoop:
 			}
 
 			// Store this, since we have it.
-			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
 			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
+			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
 
 			// We have at least 4 byte match.
 			// No need to check backwards. We come straight from a match
@@ -674,8 +674,8 @@ encodeLoop:
 				panic("offset0 was 0")
 			}
 
-			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
 			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
+			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
 			candidateL := e.longTable[nextHashL]
 			candidateS := e.table[nextHashS]
 
@@ -1047,8 +1047,8 @@ encodeLoop:
 			}
 
 			// Store this, since we have it.
-			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
 			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
+			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
 
 			// We have at least 4 byte match.
 			// No need to check backwards. We come straight from a match

+ 5 - 5
vendor/github.com/klauspost/compress/zstd/enc_dfast.go

@@ -127,8 +127,8 @@ encodeLoop:
 				panic("offset0 was 0")
 			}
 
-			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
 			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
+			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
 			candidateL := e.longTable[nextHashL]
 			candidateS := e.table[nextHashS]
 
@@ -439,8 +439,8 @@ encodeLoop:
 		var t int32
 		for {
 
-			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
 			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
+			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
 			candidateL := e.longTable[nextHashL]
 			candidateS := e.table[nextHashS]
 
@@ -785,8 +785,8 @@ encodeLoop:
 				panic("offset0 was 0")
 			}
 
-			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
 			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
+			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
 			candidateL := e.longTable[nextHashL]
 			candidateS := e.table[nextHashS]
 
@@ -969,7 +969,7 @@ encodeLoop:
 		te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
 		te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
 		longHash1 := hashLen(cv0, dFastLongTableBits, dFastLongLen)
-		longHash2 := hashLen(cv0, dFastLongTableBits, dFastLongLen)
+		longHash2 := hashLen(cv1, dFastLongTableBits, dFastLongLen)
 		e.longTable[longHash1] = te0
 		e.longTable[longHash2] = te1
 		e.markLongShardDirty(longHash1)
@@ -1002,8 +1002,8 @@ encodeLoop:
 			}
 
 			// Store this, since we have it.
-			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
 			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
+			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
 
 			// We have at least 4 byte match.
 			// No need to check backwards. We come straight from a match
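
The `longHash2` change in this file is a genuine bug fix: both long-table updates previously hashed `cv0`, so `te1` overwrote `te0`'s slot and `cv1` was never indexed under its own hash. A toy sketch of the intended pattern, with a simplified stand-in table and hash rather than the encoder's real types:

```go
package main

import "fmt"

const tableBits = 6 // hypothetical table size: 64 entries

type tableEntry struct {
	offset int32
	val    uint32
}

// hash64 mimics hashLen for an 8-byte load: a multiply-shift
// that keeps the top tableBits bits.
func hash64(u uint64) uint32 {
	const prime8bytes = 0xcf1bbcdcb7a56463
	return uint32((u * prime8bytes) >> (64 - tableBits))
}

func main() {
	var longTable [1 << tableBits]tableEntry

	cv0 := uint64(0x0102030405060708) // bytes at index0
	cv1 := uint64(0x05060708090a0b0c) // bytes at index1

	te0 := tableEntry{offset: 100, val: uint32(cv0)}
	te1 := tableEntry{offset: 101, val: uint32(cv1)}

	// Fixed version: each entry is keyed by the hash of its own bytes.
	longTable[hash64(cv0)] = te0
	longTable[hash64(cv1)] = te1 // was hash64(cv0) before the fix

	fmt.Println(longTable[hash64(cv1)].offset) // 101: cv1 is now findable
}
```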

+ 3 - 3
vendor/github.com/klauspost/compress/zstd/encoder.go

@@ -528,8 +528,8 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 		// If a non-single block is needed the encoder will reset again.
 		e.encoders <- enc
 	}()
-	// Use single segments when above minimum window and below 1MB.
-	single := len(src) < 1<<20 && len(src) > MinWindowSize
+	// Use single segments when above minimum window and below window size.
+	single := len(src) <= e.o.windowSize && len(src) > MinWindowSize
 	if e.o.single != nil {
 		single = *e.o.single
 	}
@@ -551,7 +551,7 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 	}
 
 	// If we can do everything in one block, prefer that.
-	if len(src) <= maxCompressedBlockSize {
+	if len(src) <= e.o.blockSize {
 		enc.Reset(e.o.dict, true)
 		// Slightly faster with no history and everything in one block.
 		if e.o.crc {
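
The single-segment heuristic now keys off the configured window size instead of a hard-coded 1 MB, so `WithWindowSize` directly influences whether `EncodeAll` emits a single-segment frame. A hedged sketch of steering this from the caller; the sizes are illustrative:

```go
package main

import (
	"fmt"

	"github.com/klauspost/compress/zstd"
)

func main() {
	src := make([]byte, 256<<10) // 256 KiB of zeros

	// With a 128 KiB window, 256 KiB input no longer qualifies for
	// single-segment framing; force it back on explicitly if desired.
	enc, err := zstd.NewWriter(nil,
		zstd.WithWindowSize(128<<10),
		zstd.WithSingleSegment(true),
	)
	if err != nil {
		panic(err)
	}
	defer enc.Close()

	frame := enc.EncodeAll(src, nil)
	fmt.Println("compressed to", len(frame), "bytes")
}
```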

+ 1 - 1
vendor/github.com/klauspost/compress/zstd/encoder_options.go

@@ -283,7 +283,7 @@ func WithNoEntropyCompression(b bool) EOption {
 // a decoder is allowed to reject a compressed frame which requests a memory size beyond decoder's authorized range.
 // For broader compatibility, decoders are recommended to support memory sizes of at least 8 MB.
 // This is only a recommendation, each decoder is free to support higher or lower limits, depending on local limitations.
-// If this is not specified, block encodes will automatically choose this based on the input size.
+// If this is not specified, block encodes will automatically choose this based on the input size and the window size.
 // This setting has no effect on streamed encodes.
 func WithSingleSegment(b bool) EOption {
 	return func(o *encoderOptions) error {

+ 67 - 24
vendor/github.com/klauspost/compress/zstd/framedec.go

@@ -106,7 +106,7 @@ func (d *frameDec) reset(br byteBuffer) error {
 		}
 		n := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
 		println("Skipping frame with", n, "bytes.")
-		err = br.skipN(int(n))
+		err = br.skipN(int64(n))
 		if err != nil {
 			if debugDecoder {
 				println("Reading discarded frame", err)
@@ -231,20 +231,27 @@ func (d *frameDec) reset(br byteBuffer) error {
 		d.crc.Reset()
 	}
 
+	if d.WindowSize > d.o.maxWindowSize {
+		if debugDecoder {
+			printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
+		}
+		return ErrWindowSizeExceeded
+	}
+
 	if d.WindowSize == 0 && d.SingleSegment {
 		// We may not need window in this case.
 		d.WindowSize = d.FrameContentSize
 		if d.WindowSize < MinWindowSize {
 			d.WindowSize = MinWindowSize
 		}
-	}
-
-	if d.WindowSize > uint64(d.o.maxWindowSize) {
-		if debugDecoder {
-			printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
+		if d.WindowSize > d.o.maxDecodedSize {
+			if debugDecoder {
+				printf("window size %d > max %d\n", d.WindowSize, d.o.maxDecodedSize)
+			}
+			return ErrDecoderSizeExceeded
 		}
-		return ErrWindowSizeExceeded
 	}
+
 	// The minimum Window_Size is 1 KB.
 	if d.WindowSize < MinWindowSize {
 		if debugDecoder {
@@ -253,10 +260,11 @@ func (d *frameDec) reset(br byteBuffer) error {
 		return ErrWindowSizeTooSmall
 	}
 	d.history.windowSize = int(d.WindowSize)
-	if d.o.lowMem && d.history.windowSize < maxBlockSize {
+	if !d.o.lowMem || d.history.windowSize < maxBlockSize {
+		// Alloc 2x window size if not low-mem, or very small window size.
 		d.history.allocFrameBuffer = d.history.windowSize * 2
-		// TODO: Maybe use FrameContent size
 	} else {
+		// Alloc with one additional block
 		d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize
 	}
 
@@ -290,13 +298,6 @@ func (d *frameDec) checkCRC() error {
 	if !d.HasCheckSum {
 		return nil
 	}
-	var tmp [4]byte
-	got := d.crc.Sum64()
-	// Flip to match file order.
-	tmp[0] = byte(got >> 0)
-	tmp[1] = byte(got >> 8)
-	tmp[2] = byte(got >> 16)
-	tmp[3] = byte(got >> 24)
 
 	// Read the stored checksum.
 	want, err := d.rawInput.readSmall(4)
@@ -305,7 +306,19 @@ func (d *frameDec) checkCRC() error {
 		return err
 	}
 
-	if !bytes.Equal(tmp[:], want) && !ignoreCRC {
+	if d.o.ignoreChecksum {
+		return nil
+	}
+
+	var tmp [4]byte
+	got := d.crc.Sum64()
+	// Flip to match file order.
+	tmp[0] = byte(got >> 0)
+	tmp[1] = byte(got >> 8)
+	tmp[2] = byte(got >> 16)
+	tmp[3] = byte(got >> 24)
+
+	if !bytes.Equal(tmp[:], want) {
 		if debugDecoder {
 			println("CRC Check Failed:", tmp[:], "!=", want)
 		}
@@ -317,6 +330,19 @@ func (d *frameDec) checkCRC() error {
 	return nil
 }
 
+// consumeCRC reads the checksum data if the frame has one.
+func (d *frameDec) consumeCRC() error {
+	if d.HasCheckSum {
+		_, err := d.rawInput.readSmall(4)
+		if err != nil {
+			println("CRC missing?", err)
+			return err
+		}
+	}
+
+	return nil
+}
+
 // runDecoder will create a sync decoder that will decode a block of data.
 func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
 	saved := d.history.b
@@ -326,6 +352,19 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
 	d.history.ignoreBuffer = len(dst)
 	// Store input length, so we only check new data.
 	crcStart := len(dst)
+	d.history.decoders.maxSyncLen = 0
+	if d.FrameContentSize != fcsUnknown {
+		d.history.decoders.maxSyncLen = d.FrameContentSize + uint64(len(dst))
+		if d.history.decoders.maxSyncLen > d.o.maxDecodedSize {
+			return dst, ErrDecoderSizeExceeded
+		}
+		if uint64(cap(dst)) < d.history.decoders.maxSyncLen {
+			// Alloc for output
+			dst2 := make([]byte, len(dst), d.history.decoders.maxSyncLen+compressedBlockOverAlloc)
+			copy(dst2, dst)
+			dst = dst2
+		}
+	}
 	var err error
 	for {
 		err = dec.reset(d.rawInput, d.WindowSize)
@@ -360,13 +399,17 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
 		if d.FrameContentSize != fcsUnknown && uint64(len(d.history.b)-crcStart) != d.FrameContentSize {
 			err = ErrFrameSizeMismatch
 		} else if d.HasCheckSum {
-			var n int
-			n, err = d.crc.Write(dst[crcStart:])
-			if err == nil {
-				if n != len(dst)-crcStart {
-					err = io.ErrShortWrite
-				} else {
-					err = d.checkCRC()
+			if d.o.ignoreChecksum {
+				err = d.consumeCRC()
+			} else {
+				var n int
+				n, err = d.crc.Write(dst[crcStart:])
+				if err == nil {
+					if n != len(dst)-crcStart {
+						err = io.ErrShortWrite
+					} else {
+						err = d.checkCRC()
+					}
 				}
 			}
 		}
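
For reference, the checksum that `IgnoreChecksum` skips (and `consumeCRC` discards) is the low 32 bits of an xxhash64 digest, seed 0, of the decoded content, stored little-endian after the last block. A standalone sketch of the comparison `checkCRC` performs, using `github.com/cespare/xxhash/v2` as a stand-in for the vendored fork:

```go
package main

import (
	"encoding/binary"
	"fmt"

	"github.com/cespare/xxhash/v2" // stand-in for the decoder's vendored fork
)

// frameChecksum returns the 4 bytes a zstd frame stores for decoded
// content, in file (little-endian) order, matching what checkCRC
// rebuilds in tmp before comparing against the bytes read from input.
func frameChecksum(decoded []byte) [4]byte {
	got := xxhash.Sum64(decoded)
	var tmp [4]byte
	binary.LittleEndian.PutUint32(tmp[:], uint32(got))
	return tmp
}

func main() {
	want := frameChecksum([]byte("hello zstd"))
	fmt.Printf("stored checksum bytes: % x\n", want)
}
```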

+ 25 - 103
vendor/github.com/klauspost/compress/zstd/fse_decoder.go

@@ -5,8 +5,10 @@
 package zstd
 
 import (
+	"encoding/binary"
 	"errors"
 	"fmt"
+	"io"
 )
 
 const (
@@ -178,10 +180,32 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
 		return fmt.Errorf("corruption detected (total %d != %d)", gotTotal, 1<<s.actualTableLog)
 	}
 	b.advance((bitCount + 7) >> 3)
-	// println(s.norm[:s.symbolLen], s.symbolLen)
 	return s.buildDtable()
 }
 
+func (s *fseDecoder) mustReadFrom(r io.Reader) {
+	fatalErr := func(err error) {
+		if err != nil {
+			panic(err)
+		}
+	}
+	// 	dt             [maxTablesize]decSymbol // Decompression table.
+	//	symbolLen      uint16                  // Length of active part of the symbol table.
+	//	actualTableLog uint8                   // Selected tablelog.
+	//	maxBits        uint8                   // Maximum number of additional bits
+	//	// used for table creation to avoid allocations.
+	//	stateTable [256]uint16
+	//	norm       [maxSymbolValue + 1]int16
+	//	preDefined bool
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.dt))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.symbolLen))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.actualTableLog))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.maxBits))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.stateTable))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.norm))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.preDefined))
+}
+
 // decSymbol contains information about a state entry,
 // including the state offset base, the output symbol and
 // the number of bits to read for the low part of the destination state.
@@ -204,18 +228,10 @@ func (d decSymbol) newState() uint16 {
 	return uint16(d >> 16)
 }
 
-func (d decSymbol) baseline() uint32 {
-	return uint32(d >> 32)
-}
-
 func (d decSymbol) baselineInt() int {
 	return int(d >> 32)
 }
 
-func (d *decSymbol) set(nbits, addBits uint8, newState uint16, baseline uint32) {
-	*d = decSymbol(nbits) | (decSymbol(addBits) << 8) | (decSymbol(newState) << 16) | (decSymbol(baseline) << 32)
-}
-
 func (d *decSymbol) setNBits(nBits uint8) {
 	const mask = 0xffffffffffffff00
 	*d = (*d & mask) | decSymbol(nBits)
@@ -231,11 +247,6 @@ func (d *decSymbol) setNewState(state uint16) {
 	*d = (*d & mask) | decSymbol(state)<<16
 }
 
-func (d *decSymbol) setBaseline(baseline uint32) {
-	const mask = 0xffffffff
-	*d = (*d & mask) | decSymbol(baseline)<<32
-}
-
 func (d *decSymbol) setExt(addBits uint8, baseline uint32) {
 	const mask = 0xffff00ff
 	*d = (*d & mask) | (decSymbol(addBits) << 8) | (decSymbol(baseline) << 32)
@@ -257,68 +268,6 @@ func (s *fseDecoder) setRLE(symbol decSymbol) {
 	s.dt[0] = symbol
 }
 
-// buildDtable will build the decoding table.
-func (s *fseDecoder) buildDtable() error {
-	tableSize := uint32(1 << s.actualTableLog)
-	highThreshold := tableSize - 1
-	symbolNext := s.stateTable[:256]
-
-	// Init, lay down lowprob symbols
-	{
-		for i, v := range s.norm[:s.symbolLen] {
-			if v == -1 {
-				s.dt[highThreshold].setAddBits(uint8(i))
-				highThreshold--
-				symbolNext[i] = 1
-			} else {
-				symbolNext[i] = uint16(v)
-			}
-		}
-	}
-	// Spread symbols
-	{
-		tableMask := tableSize - 1
-		step := tableStep(tableSize)
-		position := uint32(0)
-		for ss, v := range s.norm[:s.symbolLen] {
-			for i := 0; i < int(v); i++ {
-				s.dt[position].setAddBits(uint8(ss))
-				position = (position + step) & tableMask
-				for position > highThreshold {
-					// lowprob area
-					position = (position + step) & tableMask
-				}
-			}
-		}
-		if position != 0 {
-			// position must reach all cells once, otherwise normalizedCounter is incorrect
-			return errors.New("corrupted input (position != 0)")
-		}
-	}
-
-	// Build Decoding table
-	{
-		tableSize := uint16(1 << s.actualTableLog)
-		for u, v := range s.dt[:tableSize] {
-			symbol := v.addBits()
-			nextState := symbolNext[symbol]
-			symbolNext[symbol] = nextState + 1
-			nBits := s.actualTableLog - byte(highBits(uint32(nextState)))
-			s.dt[u&maxTableMask].setNBits(nBits)
-			newState := (nextState << nBits) - tableSize
-			if newState > tableSize {
-				return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize)
-			}
-			if newState == uint16(u) && nBits == 0 {
-				// Seems weird that this is possible with nbits > 0.
-				return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u)
-			}
-			s.dt[u&maxTableMask].setNewState(newState)
-		}
-	}
-	return nil
-}
-
 // transform will transform the decoder table into a table usable for
 // decoding without having to apply the transformation while decoding.
 // The state will contain the base value and the number of bits to read.
@@ -352,34 +301,7 @@ func (s *fseState) init(br *bitReader, tableLog uint8, dt []decSymbol) {
 	s.state = dt[br.getBits(tableLog)]
 }
 
-// next returns the current symbol and sets the next state.
-// At least tablelog bits must be available in the bit reader.
-func (s *fseState) next(br *bitReader) {
-	lowBits := uint16(br.getBits(s.state.nbBits()))
-	s.state = s.dt[s.state.newState()+lowBits]
-}
-
-// finished returns true if all bits have been read from the bitstream
-// and the next state would require reading bits from the input.
-func (s *fseState) finished(br *bitReader) bool {
-	return br.finished() && s.state.nbBits() > 0
-}
-
-// final returns the current state symbol without decoding the next.
-func (s *fseState) final() (int, uint8) {
-	return s.state.baselineInt(), s.state.addBits()
-}
-
 // final returns the current state symbol without decoding the next.
 func (s decSymbol) final() (int, uint8) {
 	return s.baselineInt(), s.addBits()
 }
-
-// nextFast returns the next symbol and sets the next state.
-// This can only be used if no symbols are 0 bits.
-// At least tablelog bits must be available in the bit reader.
-func (s *fseState) nextFast(br *bitReader) (uint32, uint8) {
-	lowBits := br.get16BitsFast(s.state.nbBits())
-	s.state = s.dt[s.state.newState()+lowBits]
-	return s.state.baseline(), s.state.addBits()
-}
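
The new `mustReadFrom` deserializes a predefined decoder from a fixed-layout byte stream with `encoding/binary`. A minimal round-trip sketch of the same technique on a hypothetical struct (not `fseDecoder` itself): only fixed-size fields, written and read in the same order.

```go
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

// toyTable mirrors the idea behind fseDecoder's serialized layout.
type toyTable struct {
	Dt        [8]uint64
	SymbolLen uint16
	TableLog  uint8
}

func main() {
	src := toyTable{SymbolLen: 3, TableLog: 5}
	src.Dt[0] = 42

	var buf bytes.Buffer
	if err := binary.Write(&buf, binary.LittleEndian, &src); err != nil {
		panic(err)
	}

	var dst toyTable
	// Like mustReadFrom, any error here means the embedded data is
	// corrupt, so panicking is acceptable.
	if err := binary.Read(&buf, binary.LittleEndian, &dst); err != nil {
		panic(err)
	}
	fmt.Println(dst.Dt[0], dst.SymbolLen, dst.TableLog) // 42 3 5
}
```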

+ 64 - 0
vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go

@@ -0,0 +1,64 @@
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+package zstd
+
+import (
+	"fmt"
+)
+
+type buildDtableAsmContext struct {
+	// inputs
+	stateTable *uint16
+	norm       *int16
+	dt         *uint64
+
+	// outputs --- set by the procedure in the case of error;
+	// for interpretation please see the error handling part below
+	errParam1 uint64
+	errParam2 uint64
+}
+
+// buildDtable_asm is an x86 assembly implementation of fseDecoder.buildDtable.
+// The function returns a non-zero exit code on error.
+//go:noescape
+func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
+
+// please keep in sync with _generate/gen_fse.go
+const (
+	errorCorruptedNormalizedCounter = 1
+	errorNewStateTooBig             = 2
+	errorNewStateNoBits             = 3
+)
+
+// buildDtable will build the decoding table.
+func (s *fseDecoder) buildDtable() error {
+	ctx := buildDtableAsmContext{
+		stateTable: &s.stateTable[0],
+		norm:       &s.norm[0],
+		dt:         (*uint64)(&s.dt[0]),
+	}
+	code := buildDtable_asm(s, &ctx)
+
+	if code != 0 {
+		switch code {
+		case errorCorruptedNormalizedCounter:
+			position := ctx.errParam1
+			return fmt.Errorf("corrupted input (position=%d, expected 0)", position)
+
+		case errorNewStateTooBig:
+			newState := decSymbol(ctx.errParam1)
+			size := ctx.errParam2
+			return fmt.Errorf("newState (%d) outside table size (%d)", newState, size)
+
+		case errorNewStateNoBits:
+			newState := decSymbol(ctx.errParam1)
+			oldState := decSymbol(ctx.errParam2)
+			return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, oldState)
+
+		default:
+			return fmt.Errorf("buildDtable_asm returned unhandled nonzero code = %d", code)
+		}
+	}
+	return nil
+}

+ 127 - 0
vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s

@@ -0,0 +1,127 @@
+// Code generated by command: go run gen_fse.go -out ../fse_decoder_amd64.s -pkg=zstd. DO NOT EDIT.
+
+//go:build !appengine && !noasm && gc
+// +build !appengine,!noasm,gc
+
+// func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
+TEXT ·buildDtable_asm(SB), $0-24
+	MOVQ ctx+8(FP), CX
+	MOVQ s+0(FP), DI
+
+	// Load values
+	MOVBQZX 4098(DI), DX
+	XORQ    AX, AX
+	BTSQ    DX, AX
+	MOVQ    (CX), BX
+	MOVQ    16(CX), SI
+	LEAQ    -1(AX), R8
+	MOVQ    8(CX), CX
+	MOVWQZX 4096(DI), DI
+
+	// End load values
+	// Init, lay down lowprob symbols
+	XORQ R9, R9
+	JMP  init_main_loop_condition
+
+init_main_loop:
+	MOVWQSX (CX)(R9*2), R10
+	CMPW    R10, $-1
+	JNE     do_not_update_high_threshold
+	MOVB    R9, 1(SI)(R8*8)
+	DECQ    R8
+	MOVQ    $0x0000000000000001, R10
+
+do_not_update_high_threshold:
+	MOVW R10, (BX)(R9*2)
+	INCQ R9
+
+init_main_loop_condition:
+	CMPQ R9, DI
+	JL   init_main_loop
+
+	// Spread symbols
+	// Calculate table step
+	MOVQ AX, R9
+	SHRQ $0x01, R9
+	MOVQ AX, R10
+	SHRQ $0x03, R10
+	LEAQ 3(R9)(R10*1), R9
+
+	// Fill add bits values
+	LEAQ -1(AX), R10
+	XORQ R11, R11
+	XORQ R12, R12
+	JMP  spread_main_loop_condition
+
+spread_main_loop:
+	XORQ    R13, R13
+	MOVWQSX (CX)(R12*2), R14
+	JMP     spread_inner_loop_condition
+
+spread_inner_loop:
+	MOVB R12, 1(SI)(R11*8)
+
+adjust_position:
+	ADDQ R9, R11
+	ANDQ R10, R11
+	CMPQ R11, R8
+	JG   adjust_position
+	INCQ R13
+
+spread_inner_loop_condition:
+	CMPQ R13, R14
+	JL   spread_inner_loop
+	INCQ R12
+
+spread_main_loop_condition:
+	CMPQ  R12, DI
+	JL    spread_main_loop
+	TESTQ R11, R11
+	JZ    spread_check_ok
+	MOVQ  ctx+8(FP), AX
+	MOVQ  R11, 24(AX)
+	MOVQ  $+1, ret+16(FP)
+	RET
+
+spread_check_ok:
+	// Build Decoding table
+	XORQ DI, DI
+
+build_table_main_table:
+	MOVBQZX 1(SI)(DI*8), CX
+	MOVWQZX (BX)(CX*2), R8
+	LEAQ    1(R8), R9
+	MOVW    R9, (BX)(CX*2)
+	MOVQ    R8, R9
+	BSRQ    R9, R9
+	MOVQ    DX, CX
+	SUBQ    R9, CX
+	SHLQ    CL, R8
+	SUBQ    AX, R8
+	MOVB    CL, (SI)(DI*8)
+	MOVW    R8, 2(SI)(DI*8)
+	CMPQ    R8, AX
+	JLE     build_table_check1_ok
+	MOVQ    ctx+8(FP), CX
+	MOVQ    R8, 24(CX)
+	MOVQ    AX, 32(CX)
+	MOVQ    $+2, ret+16(FP)
+	RET
+
+build_table_check1_ok:
+	TESTB CL, CL
+	JNZ   build_table_check2_ok
+	CMPW  R8, DI
+	JNE   build_table_check2_ok
+	MOVQ  ctx+8(FP), AX
+	MOVQ  R8, 24(AX)
+	MOVQ  DI, 32(AX)
+	MOVQ  $+3, ret+16(FP)
+	RET
+
+build_table_check2_ok:
+	INCQ DI
+	CMPQ DI, AX
+	JL   build_table_main_table
+	MOVQ $+0, ret+16(FP)
+	RET

+ 72 - 0
vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go

@@ -0,0 +1,72 @@
+//go:build !amd64 || appengine || !gc || noasm
+// +build !amd64 appengine !gc noasm
+
+package zstd
+
+import (
+	"errors"
+	"fmt"
+)
+
+// buildDtable will build the decoding table.
+func (s *fseDecoder) buildDtable() error {
+	tableSize := uint32(1 << s.actualTableLog)
+	highThreshold := tableSize - 1
+	symbolNext := s.stateTable[:256]
+
+	// Init, lay down lowprob symbols
+	{
+		for i, v := range s.norm[:s.symbolLen] {
+			if v == -1 {
+				s.dt[highThreshold].setAddBits(uint8(i))
+				highThreshold--
+				symbolNext[i] = 1
+			} else {
+				symbolNext[i] = uint16(v)
+			}
+		}
+	}
+
+	// Spread symbols
+	{
+		tableMask := tableSize - 1
+		step := tableStep(tableSize)
+		position := uint32(0)
+		for ss, v := range s.norm[:s.symbolLen] {
+			for i := 0; i < int(v); i++ {
+				s.dt[position].setAddBits(uint8(ss))
+				position = (position + step) & tableMask
+				for position > highThreshold {
+					// lowprob area
+					position = (position + step) & tableMask
+				}
+			}
+		}
+		if position != 0 {
+			// position must reach all cells once, otherwise normalizedCounter is incorrect
+			return errors.New("corrupted input (position != 0)")
+		}
+	}
+
+	// Build Decoding table
+	{
+		tableSize := uint16(1 << s.actualTableLog)
+		for u, v := range s.dt[:tableSize] {
+			symbol := v.addBits()
+			nextState := symbolNext[symbol]
+			symbolNext[symbol] = nextState + 1
+			nBits := s.actualTableLog - byte(highBits(uint32(nextState)))
+			s.dt[u&maxTableMask].setNBits(nBits)
+			newState := (nextState << nBits) - tableSize
+			if newState > tableSize {
+				return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize)
+			}
+			if newState == uint16(u) && nBits == 0 {
+				// Seems weird that this is possible with nbits > 0.
+				return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u)
+			}
+			s.dt[u&maxTableMask].setNewState(newState)
+		}
+	}
+	return nil
+}
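
In the spread loop above, the stride from `tableStep` is odd for the table sizes used here, hence coprime with the power-of-two table size, so the walk visits every cell exactly once and ends back at position 0; any other final position signals a corrupt normalized counter. A small standalone sketch with a toy table (the counts are illustrative):

```go
package main

import "fmt"

func tableStep(tableSize uint32) uint32 {
	// Same formula the decoder uses: (size>>1) + (size>>3) + 3.
	return (tableSize >> 1) + (tableSize >> 3) + 3
}

func main() {
	const tableLog = 4
	tableSize := uint32(1 << tableLog) // 16 cells
	norm := []int16{7, 5, 4}           // toy normalized counts, sum == 16

	dt := make([]byte, tableSize)
	tableMask := tableSize - 1
	step := tableStep(tableSize) // 13, coprime with 16
	position := uint32(0)
	for ss, v := range norm {
		for i := int16(0); i < v; i++ {
			dt[position] = byte(ss)
			position = (position + step) & tableMask
		}
	}
	// position is 0 again only if every cell was hit exactly once.
	fmt.Println(dt, "position:", position)
}
```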

+ 0 - 23
vendor/github.com/klauspost/compress/zstd/fse_encoder.go

@@ -76,21 +76,6 @@ func (s *fseEncoder) HistogramFinished(maxSymbol uint8, maxCount int) {
 	s.clearCount = maxCount != 0
 }
 
-// prepare will prepare and allocate scratch tables used for both compression and decompression.
-func (s *fseEncoder) prepare() (*fseEncoder, error) {
-	if s == nil {
-		s = &fseEncoder{}
-	}
-	s.useRLE = false
-	if s.clearCount && s.maxCount == 0 {
-		for i := range s.count {
-			s.count[i] = 0
-		}
-		s.clearCount = false
-	}
-	return s, nil
-}
-
 // allocCtable will allocate tables needed for compression.
 // If existing tables are big enough, they are simply re-used.
 func (s *fseEncoder) allocCtable() {
@@ -709,14 +694,6 @@ func (c *cState) init(bw *bitWriter, ct *cTable, first symbolTransform) {
 	c.state = c.stateTable[lu]
 }
 
-// encode the output symbol provided and write it to the bitstream.
-func (c *cState) encode(symbolTT symbolTransform) {
-	nbBitsOut := (uint32(c.state) + symbolTT.deltaNbBits) >> 16
-	dstState := int32(c.state>>(nbBitsOut&15)) + int32(symbolTT.deltaFindState)
-	c.bw.addBits16NC(c.state, uint8(nbBitsOut))
-	c.state = c.stateTable[dstState]
-}
-
 // flush will write the tablelog to the output and flush the remaining full bytes.
 func (c *cState) flush(tableLog uint8) {
 	c.bw.flush32()

+ 0 - 11
vendor/github.com/klauspost/compress/zstd/fuzz.go

@@ -1,11 +0,0 @@
-//go:build ignorecrc
-// +build ignorecrc
-
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-// ignoreCRC can be used for fuzz testing to ignore CRC values...
-const ignoreCRC = true

+ 0 - 11
vendor/github.com/klauspost/compress/zstd/fuzz_none.go

@@ -1,11 +0,0 @@
-//go:build !ignorecrc
-// +build !ignorecrc
-
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-// ignoreCRC can be used for fuzz testing to ignore CRC values...
-const ignoreCRC = false

+ 0 - 6
vendor/github.com/klauspost/compress/zstd/hash.go

@@ -33,9 +33,3 @@ func hashLen(u uint64, length, mls uint8) uint32 {
 		return (uint32(u) * prime4bytes) >> (32 - length)
 	}
 }
-
-// hash3 returns the hash of the lower 3 bytes of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <32.
-func hash3(u uint32, h uint8) uint32 {
-	return ((u << (32 - 24)) * prime3bytes) >> ((32 - h) & 31)
-}

+ 18 - 242
vendor/github.com/klauspost/compress/zstd/seqdec.go

@@ -73,6 +73,7 @@ type sequenceDecs struct {
 	seqSize      int
 	windowSize   int
 	maxBits      uint8
+	maxSyncLen   uint64
 }
 
 // initialize all 3 decoders from the stream input.
@@ -98,153 +99,13 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, out []byte) erro
 	return nil
 }
 
-// decode sequences from the stream with the provided history.
-func (s *sequenceDecs) decode(seqs []seqVals) error {
-	br := s.br
-
-	// Grab full sizes tables, to avoid bounds checks.
-	llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
-	llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
-	s.seqSize = 0
-	litRemain := len(s.literals)
-	maxBlockSize := maxCompressedBlockSize
-	if s.windowSize < maxBlockSize {
-		maxBlockSize = s.windowSize
-	}
-	for i := range seqs {
-		var ll, mo, ml int
-		if br.off > 4+((maxOffsetBits+16+16)>>3) {
-			// inlined function:
-			// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
-
-			// Final will not read from stream.
-			var llB, mlB, moB uint8
-			ll, llB = llState.final()
-			ml, mlB = mlState.final()
-			mo, moB = ofState.final()
-
-			// extra bits are stored in reverse order.
-			br.fillFast()
-			mo += br.getBits(moB)
-			if s.maxBits > 32 {
-				br.fillFast()
-			}
-			ml += br.getBits(mlB)
-			ll += br.getBits(llB)
-
-			if moB > 1 {
-				s.prevOffset[2] = s.prevOffset[1]
-				s.prevOffset[1] = s.prevOffset[0]
-				s.prevOffset[0] = mo
-			} else {
-				// mo = s.adjustOffset(mo, ll, moB)
-				// Inlined for rather big speedup
-				if ll == 0 {
-					// There is an exception though, when current sequence's literals_length = 0.
-					// In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
-					// an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
-					mo++
-				}
-
-				if mo == 0 {
-					mo = s.prevOffset[0]
-				} else {
-					var temp int
-					if mo == 3 {
-						temp = s.prevOffset[0] - 1
-					} else {
-						temp = s.prevOffset[mo]
-					}
-
-					if temp == 0 {
-						// 0 is not valid; input is corrupted; force offset to 1
-						println("WARNING: temp was 0")
-						temp = 1
-					}
-
-					if mo != 1 {
-						s.prevOffset[2] = s.prevOffset[1]
-					}
-					s.prevOffset[1] = s.prevOffset[0]
-					s.prevOffset[0] = temp
-					mo = temp
-				}
-			}
-			br.fillFast()
-		} else {
-			if br.overread() {
-				if debugDecoder {
-					printf("reading sequence %d, exceeded available data\n", i)
-				}
-				return io.ErrUnexpectedEOF
-			}
-			ll, mo, ml = s.next(br, llState, mlState, ofState)
-			br.fill()
-		}
-
-		if debugSequences {
-			println("Seq", i, "Litlen:", ll, "mo:", mo, "(abs) ml:", ml)
-		}
-		// Evaluate.
-		// We might be doing this async, so do it early.
-		if mo == 0 && ml > 0 {
-			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
-		}
-		if ml > maxMatchLen {
-			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
-		}
-		s.seqSize += ll + ml
-		if s.seqSize > maxBlockSize {
-			return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
-		}
-		litRemain -= ll
-		if litRemain < 0 {
-			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, litRemain+ll)
-		}
-		seqs[i] = seqVals{
-			ll: ll,
-			ml: ml,
-			mo: mo,
-		}
-		if i == len(seqs)-1 {
-			// This is the last sequence, so we shouldn't update state.
-			break
-		}
-
-		// Manually inlined, ~ 5-20% faster
-		// Update all 3 states at once. Approx 20% faster.
-		nBits := llState.nbBits() + mlState.nbBits() + ofState.nbBits()
-		if nBits == 0 {
-			llState = llTable[llState.newState()&maxTableMask]
-			mlState = mlTable[mlState.newState()&maxTableMask]
-			ofState = ofTable[ofState.newState()&maxTableMask]
-		} else {
-			bits := br.get32BitsFast(nBits)
-			lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
-			llState = llTable[(llState.newState()+lowBits)&maxTableMask]
-
-			lowBits = uint16(bits >> (ofState.nbBits() & 31))
-			lowBits &= bitMask[mlState.nbBits()&15]
-			mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask]
-
-			lowBits = uint16(bits) & bitMask[ofState.nbBits()&15]
-			ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask]
-		}
-	}
-	s.seqSize += litRemain
-	if s.seqSize > maxBlockSize {
-		return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
-	}
-	err := br.close()
-	if err != nil {
-		printf("Closing sequences: %v, %+v\n", err, *br)
-	}
-	return err
-}
-
 // execute will execute the decoded sequence with the provided history.
 // The sequence must be evaluated before being sent.
 func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
+	if len(s.dict) == 0 {
+		return s.executeSimple(seqs, hist)
+	}
+
 	// Ensure we have enough output size...
 	if len(s.out)+s.seqSize > cap(s.out) {
 		addBytes := s.seqSize + len(s.out)
@@ -327,6 +188,7 @@ func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
 			}
 		}
 	}
+
 	// Add final literals
 	copy(out[t:], s.literals)
 	if debugDecoder {
@@ -341,14 +203,18 @@ func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
 }
 
 // decode sequences from the stream with the provided history.
-func (s *sequenceDecs) decodeSync(history *history) error {
+func (s *sequenceDecs) decodeSync(hist []byte) error {
+	supported, err := s.decodeSyncSimple(hist)
+	if supported {
+		return err
+	}
+
 	br := s.br
 	seqs := s.nSeqs
 	startSize := len(s.out)
 	// Grab full sizes tables, to avoid bounds checks.
 	llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
 	llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
-	hist := history.b[history.ignoreBuffer:]
 	out := s.out
 	maxBlockSize := maxCompressedBlockSize
 	if s.windowSize < maxBlockSize {
@@ -433,7 +299,7 @@ func (s *sequenceDecs) decodeSync(history *history) error {
 		}
 		size := ll + ml + len(out)
 		if size-startSize > maxBlockSize {
-			return fmt.Errorf("output (%d) bigger than max block size (%d)", size, maxBlockSize)
+			return fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
 		}
 		if size > cap(out) {
 			// Not enough size, which can happen under high volume block streaming conditions
@@ -463,13 +329,13 @@ func (s *sequenceDecs) decodeSync(history *history) error {
 
 		if mo > len(out)+len(hist) || mo > s.windowSize {
 			if len(s.dict) == 0 {
-				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist))
+				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist)-startSize)
 			}
 
 			// we may be in dictionary.
 			dictO := len(s.dict) - (mo - (len(out) + len(hist)))
 			if dictO < 0 || dictO >= len(s.dict) {
-				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist))
+				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist)-startSize)
 			}
 			end := dictO + ml
 			if end > len(s.dict) {
@@ -530,6 +396,7 @@ func (s *sequenceDecs) decodeSync(history *history) error {
 			ofState = ofTable[ofState.newState()&maxTableMask]
 		} else {
 			bits := br.get32BitsFast(nBits)
+
 			lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
 			llState = llTable[(llState.newState()+lowBits)&maxTableMask]
 
@@ -543,8 +410,8 @@ func (s *sequenceDecs) decodeSync(history *history) error {
 	}
 
 	// Check if space for literals
-	if len(s.literals)+len(s.out)-startSize > maxBlockSize {
-		return fmt.Errorf("output (%d) bigger than max block size (%d)", len(s.out), maxBlockSize)
+	if size := len(s.literals) + len(s.out) - startSize; size > maxBlockSize {
+		return fmt.Errorf("output (%d) bigger than max block size (%d)", size, maxBlockSize)
 	}
 
 	// Add final literals
@@ -552,16 +419,6 @@ func (s *sequenceDecs) decodeSync(history *history) error {
 	return br.close()
 }
 
-// update states, at least 27 bits must be available.
-func (s *sequenceDecs) update(br *bitReader) {
-	// Max 8 bits
-	s.litLengths.state.next(br)
-	// Max 9 bits
-	s.matchLengths.state.next(br)
-	// Max 8 bits
-	s.offsets.state.next(br)
-}
-
 var bitMask [16]uint16
 
 func init() {
@@ -570,87 +427,6 @@ func init() {
 	}
 }
 
-// update states, at least 27 bits must be available.
-func (s *sequenceDecs) updateAlt(br *bitReader) {
-	// Update all 3 states at once. Approx 20% faster.
-	a, b, c := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
-
-	nBits := a.nbBits() + b.nbBits() + c.nbBits()
-	if nBits == 0 {
-		s.litLengths.state.state = s.litLengths.state.dt[a.newState()]
-		s.matchLengths.state.state = s.matchLengths.state.dt[b.newState()]
-		s.offsets.state.state = s.offsets.state.dt[c.newState()]
-		return
-	}
-	bits := br.get32BitsFast(nBits)
-	lowBits := uint16(bits >> ((c.nbBits() + b.nbBits()) & 31))
-	s.litLengths.state.state = s.litLengths.state.dt[a.newState()+lowBits]
-
-	lowBits = uint16(bits >> (c.nbBits() & 31))
-	lowBits &= bitMask[b.nbBits()&15]
-	s.matchLengths.state.state = s.matchLengths.state.dt[b.newState()+lowBits]
-
-	lowBits = uint16(bits) & bitMask[c.nbBits()&15]
-	s.offsets.state.state = s.offsets.state.dt[c.newState()+lowBits]
-}
-
-// nextFast will return new states when there are at least 4 unused bytes left on the stream when done.
-func (s *sequenceDecs) nextFast(br *bitReader, llState, mlState, ofState decSymbol) (ll, mo, ml int) {
-	// Final will not read from stream.
-	ll, llB := llState.final()
-	ml, mlB := mlState.final()
-	mo, moB := ofState.final()
-
-	// extra bits are stored in reverse order.
-	br.fillFast()
-	mo += br.getBits(moB)
-	if s.maxBits > 32 {
-		br.fillFast()
-	}
-	ml += br.getBits(mlB)
-	ll += br.getBits(llB)
-
-	if moB > 1 {
-		s.prevOffset[2] = s.prevOffset[1]
-		s.prevOffset[1] = s.prevOffset[0]
-		s.prevOffset[0] = mo
-		return
-	}
-	// mo = s.adjustOffset(mo, ll, moB)
-	// Inlined for rather big speedup
-	if ll == 0 {
-		// There is an exception though, when current sequence's literals_length = 0.
-		// In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
-		// an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
-		mo++
-	}
-
-	if mo == 0 {
-		mo = s.prevOffset[0]
-		return
-	}
-	var temp int
-	if mo == 3 {
-		temp = s.prevOffset[0] - 1
-	} else {
-		temp = s.prevOffset[mo]
-	}
-
-	if temp == 0 {
-		// 0 is not valid; input is corrupted; force offset to 1
-		println("temp was 0")
-		temp = 1
-	}
-
-	if mo != 1 {
-		s.prevOffset[2] = s.prevOffset[1]
-	}
-	s.prevOffset[1] = s.prevOffset[0]
-	s.prevOffset[0] = temp
-	mo = temp
-	return
-}
-
 func (s *sequenceDecs) next(br *bitReader, llState, mlState, ofState decSymbol) (ll, mo, ml int) {
 	// Final will not read from stream.
 	ll, llB := llState.final()
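
The offset handling that remains in `next` implements the repeat-offset rules from RFC 8878: small offset values select from a three-entry history, shifted by one when the literal length is zero. A self-contained sketch of just that rule; the function name is illustrative:

```go
package main

import "fmt"

// adjustOffset resolves a decoded offset value against the three most
// recent offsets, following the repeat-offset rules the decoder inlines.
func adjustOffset(prev *[3]int, offset, litLen int, offsetB uint8) int {
	if offsetB > 1 {
		// A literal offset: rotate the history and use it directly.
		prev[2], prev[1], prev[0] = prev[1], prev[0], offset
		return offset
	}
	if litLen == 0 {
		// With zero literals, the repeat codes are shifted by one.
		offset++
	}
	if offset == 0 {
		return prev[0] // Repeated_Offset1; history is unchanged.
	}
	var temp int
	if offset == 3 {
		temp = prev[0] - 1
	} else {
		temp = prev[offset]
	}
	if temp == 0 {
		temp = 1 // corrupt input; the decoder forces offset 1
	}
	if offset != 1 {
		prev[2] = prev[1]
	}
	prev[1] = prev[0]
	prev[0] = temp
	return temp
}

func main() {
	hist := [3]int{1, 4, 8}
	fmt.Println(adjustOffset(&hist, 2, 5, 1)) // 8; history becomes [8 1 4]
}
```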

+ 368 - 0
vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go

@@ -0,0 +1,368 @@
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+package zstd
+
+import (
+	"fmt"
+
+	"github.com/klauspost/compress/internal/cpuinfo"
+)
+
+type decodeSyncAsmContext struct {
+	llTable     []decSymbol
+	mlTable     []decSymbol
+	ofTable     []decSymbol
+	llState     uint64
+	mlState     uint64
+	ofState     uint64
+	iteration   int
+	litRemain   int
+	out         []byte
+	outPosition int
+	literals    []byte
+	litPosition int
+	history     []byte
+	windowSize  int
+	ll          int // set on error (not for all errors, please refer to _generate/gen.go)
+	ml          int // set on error (not for all errors, please refer to _generate/gen.go)
+	mo          int // set on error (not for all errors, please refer to _generate/gen.go)
+}
+
+// sequenceDecs_decodeSync_amd64 implements the main loop of sequenceDecs.decodeSync in x86 asm.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// sequenceDecs_decodeSync_bmi2 implements the main loop of sequenceDecs.decodeSync in x86 asm with BMI2 extensions.
+//go:noescape
+func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// sequenceDecs_decodeSync_safe_amd64 does the same as above, but never writes beyond the output buffer.
+//go:noescape
+func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// sequenceDecs_decodeSync_safe_bmi2 does the same as above, but never writes beyond the output buffer.
+//go:noescape
+func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// decode sequences from the stream with the provided history but without a dictionary.
+func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
+	if len(s.dict) > 0 {
+		return false, nil
+	}
+	if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSize {
+		return false, nil
+	}
+
+	// FIXME: Using unsafe memory copies leads to rare, random crashes
+	// with fuzz testing. It is therefore disabled for now.
+	const useSafe = true
+	/*
+		useSafe := false
+		if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc {
+			useSafe = true
+		}
+		if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) {
+			useSafe = true
+		}
+		if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
+			useSafe = true
+		}
+	*/
+
+	br := s.br
+
+	maxBlockSize := maxCompressedBlockSize
+	if s.windowSize < maxBlockSize {
+		maxBlockSize = s.windowSize
+	}
+
+	ctx := decodeSyncAsmContext{
+		llTable:     s.litLengths.fse.dt[:maxTablesize],
+		mlTable:     s.matchLengths.fse.dt[:maxTablesize],
+		ofTable:     s.offsets.fse.dt[:maxTablesize],
+		llState:     uint64(s.litLengths.state.state),
+		mlState:     uint64(s.matchLengths.state.state),
+		ofState:     uint64(s.offsets.state.state),
+		iteration:   s.nSeqs - 1,
+		litRemain:   len(s.literals),
+		out:         s.out,
+		outPosition: len(s.out),
+		literals:    s.literals,
+		windowSize:  s.windowSize,
+		history:     hist,
+	}
+
+	s.seqSize = 0
+	startSize := len(s.out)
+
+	var errCode int
+	if cpuinfo.HasBMI2() {
+		if useSafe {
+			errCode = sequenceDecs_decodeSync_safe_bmi2(s, br, &ctx)
+		} else {
+			errCode = sequenceDecs_decodeSync_bmi2(s, br, &ctx)
+		}
+	} else {
+		if useSafe {
+			errCode = sequenceDecs_decodeSync_safe_amd64(s, br, &ctx)
+		} else {
+			errCode = sequenceDecs_decodeSync_amd64(s, br, &ctx)
+		}
+	}
+	switch errCode {
+	case noError:
+		break
+
+	case errorMatchLenOfsMismatch:
+		return true, fmt.Errorf("zero matchoff and matchlen (%d) > 0", ctx.ml)
+
+	case errorMatchLenTooBig:
+		return true, fmt.Errorf("match len (%d) bigger than max allowed length", ctx.ml)
+
+	case errorMatchOffTooBig:
+		return true, fmt.Errorf("match offset (%d) bigger than current history (%d)",
+			ctx.mo, ctx.outPosition+len(hist)-startSize)
+
+	case errorNotEnoughLiterals:
+		return true, fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available",
+			ctx.ll, ctx.litRemain+ctx.ll)
+
+	case errorNotEnoughSpace:
+		size := ctx.outPosition + ctx.ll + ctx.ml
+		if debugDecoder {
+			println("msl:", s.maxSyncLen, "cap", cap(s.out), "bef:", startSize, "sz:", size-startSize, "mbs:", maxBlockSize, "outsz:", cap(s.out)-startSize)
+		}
+		return true, fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
+
+	default:
+		return true, fmt.Errorf("sequenceDecs_decode returned erroneous code %d", errCode)
+	}
+
+	s.seqSize += ctx.litRemain
+	if s.seqSize > maxBlockSize {
+		return true, fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+	}
+	err := br.close()
+	if err != nil {
+		printf("Closing sequences: %v, %+v\n", err, *br)
+		return true, err
+	}
+
+	s.literals = s.literals[ctx.litPosition:]
+	t := ctx.outPosition
+	s.out = s.out[:t]
+
+	// Add final literals
+	s.out = append(s.out, s.literals...)
+	if debugDecoder {
+		t += len(s.literals)
+		if t != len(s.out) {
+			panic(fmt.Errorf("length mismatch, want %d, got %d", len(s.out), t))
+		}
+	}
+
+	return true, nil
+}
+
+// --------------------------------------------------------------------------------
+
+type decodeAsmContext struct {
+	llTable   []decSymbol
+	mlTable   []decSymbol
+	ofTable   []decSymbol
+	llState   uint64
+	mlState   uint64
+	ofState   uint64
+	iteration int
+	seqs      []seqVals
+	litRemain int
+}
+
+const noError = 0
+
+// error reported when mo == 0 && ml > 0
+const errorMatchLenOfsMismatch = 1
+
+// error reported when ml > maxMatchLen
+const errorMatchLenTooBig = 2
+
+// error reported when mo > available history or mo > s.windowSize
+const errorMatchOffTooBig = 3
+
+// error reported when the sum of literal lengths exceeds the literal buffer size
+const errorNotEnoughLiterals = 4
+
+// error reported when capacity of `out` is too small
+const errorNotEnoughSpace = 5
+
+// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+
+// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+
+// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions.
+//go:noescape
+func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+
+// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions.
+//go:noescape
+func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+
+// decode sequences from the stream without the provided history.
+func (s *sequenceDecs) decode(seqs []seqVals) error {
+	br := s.br
+
+	maxBlockSize := maxCompressedBlockSize
+	if s.windowSize < maxBlockSize {
+		maxBlockSize = s.windowSize
+	}
+
+	ctx := decodeAsmContext{
+		llTable:   s.litLengths.fse.dt[:maxTablesize],
+		mlTable:   s.matchLengths.fse.dt[:maxTablesize],
+		ofTable:   s.offsets.fse.dt[:maxTablesize],
+		llState:   uint64(s.litLengths.state.state),
+		mlState:   uint64(s.matchLengths.state.state),
+		ofState:   uint64(s.offsets.state.state),
+		seqs:      seqs,
+		iteration: len(seqs) - 1,
+		litRemain: len(s.literals),
+	}
+
+	s.seqSize = 0
+	lte56bits := s.maxBits+s.offsets.fse.actualTableLog+s.matchLengths.fse.actualTableLog+s.litLengths.fse.actualTableLog <= 56
+	var errCode int
+	if cpuinfo.HasBMI2() {
+		if lte56bits {
+			errCode = sequenceDecs_decode_56_bmi2(s, br, &ctx)
+		} else {
+			errCode = sequenceDecs_decode_bmi2(s, br, &ctx)
+		}
+	} else {
+		if lte56bits {
+			errCode = sequenceDecs_decode_56_amd64(s, br, &ctx)
+		} else {
+			errCode = sequenceDecs_decode_amd64(s, br, &ctx)
+		}
+	}
+	if errCode != 0 {
+		i := len(seqs) - ctx.iteration - 1
+		switch errCode {
+		case errorMatchLenOfsMismatch:
+			ml := ctx.seqs[i].ml
+			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
+
+		case errorMatchLenTooBig:
+			ml := ctx.seqs[i].ml
+			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
+
+		case errorNotEnoughLiterals:
+			ll := ctx.seqs[i].ll
+			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, ctx.litRemain+ll)
+		}
+
+		return fmt.Errorf("sequenceDecs_decode_amd64 returned erroneous code %d", errCode)
+	}
+
+	if ctx.litRemain < 0 {
+		return fmt.Errorf("literal count is too big: total available %d, total requested %d",
+			len(s.literals), len(s.literals)-ctx.litRemain)
+	}
+
+	s.seqSize += ctx.litRemain
+	if s.seqSize > maxBlockSize {
+		return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+	}
+	err := br.close()
+	if err != nil {
+		printf("Closing sequences: %v, %+v\n", err, *br)
+	}
+	return err
+}
+
+// --------------------------------------------------------------------------------
+
+type executeAsmContext struct {
+	seqs        []seqVals
+	seqIndex    int
+	out         []byte
+	history     []byte
+	literals    []byte
+	outPosition int
+	litPosition int
+	windowSize  int
+}
+
+// sequenceDecs_executeSimple_amd64 implements the main loop of sequenceDecs.executeSimple in x86 asm.
+//
+// Returns false if a match offset is too big.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
+
+// Same as above, but with safe memcopies
+//go:noescape
+func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
+
+// executeSimple handles cases when dictionary is not used.
+func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error {
+	// Ensure we have enough output size...
+	if len(s.out)+s.seqSize+compressedBlockOverAlloc > cap(s.out) {
+		addBytes := s.seqSize + len(s.out) + compressedBlockOverAlloc
+		s.out = append(s.out, make([]byte, addBytes)...)
+		s.out = s.out[:len(s.out)-addBytes]
+	}
+
+	if debugDecoder {
+		printf("Execute %d seqs with literals: %d into %d bytes\n", len(seqs), len(s.literals), s.seqSize)
+	}
+
+	var t = len(s.out)
+	out := s.out[:t+s.seqSize]
+
+	ctx := executeAsmContext{
+		seqs:        seqs,
+		seqIndex:    0,
+		out:         out,
+		history:     hist,
+		outPosition: t,
+		litPosition: 0,
+		literals:    s.literals,
+		windowSize:  s.windowSize,
+	}
+	var ok bool
+	if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
+		ok = sequenceDecs_executeSimple_safe_amd64(&ctx)
+	} else {
+		ok = sequenceDecs_executeSimple_amd64(&ctx)
+	}
+	if !ok {
+		return fmt.Errorf("match offset (%d) bigger than current history (%d)",
+			seqs[ctx.seqIndex].mo, ctx.outPosition+len(hist))
+	}
+	s.literals = s.literals[ctx.litPosition:]
+	t = ctx.outPosition
+
+	// Add final literals
+	copy(out[t:], s.literals)
+	if debugDecoder {
+		t += len(s.literals)
+		if t != len(out) {
+			panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize))
+		}
+	}
+	s.out = out
+
+	return nil
+}
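
The dispatch in `decode` and `decodeSyncSimple` is the pattern the whole port uses: probe the CPU once via the new `internal/cpuinfo` package, then choose a BMI2 or plain AMD64 body. Since `internal/cpuinfo` is not importable from outside the module, this sketch wires the same idea with `golang.org/x/sys/cpu`; the decode functions are hypothetical stand-ins:

```go
package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

// decodeGeneric is the portable fallback; decodeBMI2 would be the
// assembly fast path in a real port. Both are stand-ins here.
func decodeGeneric(data []byte) int { return len(data) }
func decodeBMI2(data []byte) int    { return len(data) }

// decode is selected once at startup, mirroring how
// sequenceDecs.decode picks between the _bmi2 and _amd64 bodies.
var decode = func() func([]byte) int {
	if cpu.X86.HasBMI2 {
		return decodeBMI2
	}
	return decodeGeneric
}()

func main() {
	fmt.Println(decode([]byte("payload")))
}
```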

+ 4100 - 0
vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s

@@ -0,0 +1,4100 @@
+// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
+
+//go:build !appengine && !noasm && gc
+// +build !appengine,!noasm,gc
+
+// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+// Requires: CMOV
+TEXT ·sequenceDecs_decode_amd64(SB), $8-32
+	MOVQ    br+8(FP), AX
+	MOVQ    32(AX), DX
+	MOVBQZX 40(AX), BX
+	MOVQ    24(AX), SI
+	MOVQ    (AX), AX
+	ADDQ    SI, AX
+	MOVQ    AX, (SP)
+	MOVQ    ctx+16(FP), AX
+	MOVQ    72(AX), DI
+	MOVQ    80(AX), R8
+	MOVQ    88(AX), R9
+	MOVQ    104(AX), R10
+	MOVQ    s+0(FP), AX
+	MOVQ    144(AX), R11
+	MOVQ    152(AX), R12
+	MOVQ    160(AX), R13
+
+sequenceDecs_decode_amd64_main_loop:
+	MOVQ (SP), R14
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decode_amd64_fill_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R14
+	MOVQ (R14), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decode_amd64_fill_end
+
+sequenceDecs_decode_amd64_fill_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decode_amd64_fill_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decode_amd64_fill_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R14
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R14), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decode_amd64_fill_byte_by_byte
+
+sequenceDecs_decode_amd64_fill_end:
+	// Update offset
+	MOVQ  R9, AX
+	MOVQ  BX, CX
+	MOVQ  DX, R15
+	SHLQ  CL, R15
+	MOVB  AH, CL
+	SHRQ  $0x20, AX
+	TESTQ CX, CX
+	JZ    sequenceDecs_decode_amd64_of_update_zero
+	ADDQ  CX, BX
+	CMPQ  BX, $0x40
+	JA    sequenceDecs_decode_amd64_of_update_zero
+	CMPQ  CX, $0x40
+	JAE   sequenceDecs_decode_amd64_of_update_zero
+	NEGQ  CX
+	SHRQ  CL, R15
+	ADDQ  R15, AX
+
+sequenceDecs_decode_amd64_of_update_zero:
+	MOVQ AX, 16(R10)
+
+	// Update match length
+	MOVQ  R8, AX
+	MOVQ  BX, CX
+	MOVQ  DX, R15
+	SHLQ  CL, R15
+	MOVB  AH, CL
+	SHRQ  $0x20, AX
+	TESTQ CX, CX
+	JZ    sequenceDecs_decode_amd64_ml_update_zero
+	ADDQ  CX, BX
+	CMPQ  BX, $0x40
+	JA    sequenceDecs_decode_amd64_ml_update_zero
+	CMPQ  CX, $0x40
+	JAE   sequenceDecs_decode_amd64_ml_update_zero
+	NEGQ  CX
+	SHRQ  CL, R15
+	ADDQ  R15, AX
+
+sequenceDecs_decode_amd64_ml_update_zero:
+	MOVQ AX, 8(R10)
+
+	// Fill bitreader to have enough for the remaining
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decode_amd64_fill_2_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R14
+	MOVQ (R14), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decode_amd64_fill_2_end
+
+sequenceDecs_decode_amd64_fill_2_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decode_amd64_fill_2_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decode_amd64_fill_2_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R14
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R14), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decode_amd64_fill_2_byte_by_byte
+
+sequenceDecs_decode_amd64_fill_2_end:
+	// Update literal length
+	MOVQ  DI, AX
+	MOVQ  BX, CX
+	MOVQ  DX, R15
+	SHLQ  CL, R15
+	MOVB  AH, CL
+	SHRQ  $0x20, AX
+	TESTQ CX, CX
+	JZ    sequenceDecs_decode_amd64_ll_update_zero
+	ADDQ  CX, BX
+	CMPQ  BX, $0x40
+	JA    sequenceDecs_decode_amd64_ll_update_zero
+	CMPQ  CX, $0x40
+	JAE   sequenceDecs_decode_amd64_ll_update_zero
+	NEGQ  CX
+	SHRQ  CL, R15
+	ADDQ  R15, AX
+
+sequenceDecs_decode_amd64_ll_update_zero:
+	MOVQ AX, (R10)
+
+	// Fill bitreader for state updates
+	MOVQ    R14, (SP)
+	MOVQ    R9, AX
+	SHRQ    $0x08, AX
+	MOVBQZX AL, AX
+	MOVQ    ctx+16(FP), CX
+	CMPQ    96(CX), $0x00
+	JZ      sequenceDecs_decode_amd64_skip_update
+
+	// Update Literal Length State
+	MOVBQZX DI, R14
+	SHRQ    $0x10, DI
+	MOVWQZX DI, DI
+	LEAQ    (BX)(R14*1), CX
+	MOVQ    DX, R15
+	MOVQ    CX, BX
+	ROLQ    CL, R15
+	MOVL    $0x00000001, BP
+	MOVB    R14, CL
+	SHLL    CL, BP
+	DECL    BP
+	ANDQ    BP, R15
+	ADDQ    R15, DI
+
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Match Length State
+	MOVBQZX R8, R14
+	SHRQ    $0x10, R8
+	MOVWQZX R8, R8
+	LEAQ    (BX)(R14*1), CX
+	MOVQ    DX, R15
+	MOVQ    CX, BX
+	ROLQ    CL, R15
+	MOVL    $0x00000001, BP
+	MOVB    R14, CL
+	SHLL    CL, BP
+	DECL    BP
+	ANDQ    BP, R15
+	ADDQ    R15, R8
+
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+	// Update Offset State
+	MOVBQZX R9, R14
+	SHRQ    $0x10, R9
+	MOVWQZX R9, R9
+	LEAQ    (BX)(R14*1), CX
+	MOVQ    DX, R15
+	MOVQ    CX, BX
+	ROLQ    CL, R15
+	MOVL    $0x00000001, BP
+	MOVB    R14, CL
+	SHLL    CL, BP
+	DECL    BP
+	ANDQ    BP, R15
+	ADDQ    R15, R9
+
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decode_amd64_skip_update:
+	// Adjust offset
+	MOVQ 16(R10), CX
+	CMPQ AX, $0x01
+	JBE  sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
+	MOVQ R12, R13
+	MOVQ R11, R12
+	MOVQ CX, R11
+	JMP  sequenceDecs_decode_amd64_after_adjust
+
+sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
+	CMPQ (R10), $0x00000000
+	JNE  sequenceDecs_decode_amd64_adjust_offset_maybezero
+	INCQ CX
+	JMP  sequenceDecs_decode_amd64_adjust_offset_nonzero
+
+sequenceDecs_decode_amd64_adjust_offset_maybezero:
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_amd64_adjust_offset_nonzero
+	MOVQ  R11, CX
+	JMP   sequenceDecs_decode_amd64_after_adjust
+
+sequenceDecs_decode_amd64_adjust_offset_nonzero:
+	CMPQ CX, $0x01
+	JB   sequenceDecs_decode_amd64_adjust_zero
+	JEQ  sequenceDecs_decode_amd64_adjust_one
+	CMPQ CX, $0x02
+	JA   sequenceDecs_decode_amd64_adjust_three
+	JMP  sequenceDecs_decode_amd64_adjust_two
+
+sequenceDecs_decode_amd64_adjust_zero:
+	MOVQ R11, AX
+	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_amd64_adjust_one:
+	MOVQ R12, AX
+	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_amd64_adjust_two:
+	MOVQ R13, AX
+	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_amd64_adjust_three:
+	LEAQ -1(R11), AX
+
+sequenceDecs_decode_amd64_adjust_test_temp_valid:
+	TESTQ AX, AX
+	JNZ   sequenceDecs_decode_amd64_adjust_temp_valid
+	MOVQ  $0x00000001, AX
+
+sequenceDecs_decode_amd64_adjust_temp_valid:
+	CMPQ    CX, $0x01
+	CMOVQNE R12, R13
+	MOVQ    R11, R12
+	MOVQ    AX, R11
+	MOVQ    AX, CX
+
+sequenceDecs_decode_amd64_after_adjust:
+	MOVQ CX, 16(R10)
+
+	// Check values
+	MOVQ  8(R10), AX
+	MOVQ  (R10), R14
+	LEAQ  (AX)(R14*1), R15
+	MOVQ  s+0(FP), BP
+	ADDQ  R15, 256(BP)
+	MOVQ  ctx+16(FP), R15
+	SUBQ  R14, 128(R15)
+	JS    error_not_enough_literals
+	CMPQ  AX, $0x00020002
+	JA    sequenceDecs_decode_amd64_error_match_len_too_big
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_amd64_match_len_ofs_ok
+	TESTQ AX, AX
+	JNZ   sequenceDecs_decode_amd64_error_match_len_ofs_mismatch
+
+sequenceDecs_decode_amd64_match_len_ofs_ok:
+	ADDQ $0x18, R10
+	MOVQ ctx+16(FP), AX
+	DECQ 96(AX)
+	JNS  sequenceDecs_decode_amd64_main_loop
+	MOVQ s+0(FP), AX
+	MOVQ R11, 144(AX)
+	MOVQ R12, 152(AX)
+	MOVQ R13, 160(AX)
+	MOVQ br+8(FP), AX
+	MOVQ DX, 32(AX)
+	MOVB BL, 40(AX)
+	MOVQ SI, 24(AX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error
+sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decode_amd64_error_match_len_too_big:
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+// Requires: CMOV
+TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
+	MOVQ    br+8(FP), AX
+	MOVQ    32(AX), DX
+	MOVBQZX 40(AX), BX
+	MOVQ    24(AX), SI
+	MOVQ    (AX), AX
+	ADDQ    SI, AX
+	MOVQ    AX, (SP)
+	MOVQ    ctx+16(FP), AX
+	MOVQ    72(AX), DI
+	MOVQ    80(AX), R8
+	MOVQ    88(AX), R9
+	MOVQ    104(AX), R10
+	MOVQ    s+0(FP), AX
+	MOVQ    144(AX), R11
+	MOVQ    152(AX), R12
+	MOVQ    160(AX), R13
+
+sequenceDecs_decode_56_amd64_main_loop:
+	MOVQ (SP), R14
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decode_56_amd64_fill_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R14
+	MOVQ (R14), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decode_56_amd64_fill_end
+
+sequenceDecs_decode_56_amd64_fill_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decode_56_amd64_fill_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decode_56_amd64_fill_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R14
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R14), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decode_56_amd64_fill_byte_by_byte
+
+sequenceDecs_decode_56_amd64_fill_end:
+	// Update offset
+	MOVQ  R9, AX
+	MOVQ  BX, CX
+	MOVQ  DX, R15
+	SHLQ  CL, R15
+	MOVB  AH, CL
+	SHRQ  $0x20, AX
+	TESTQ CX, CX
+	JZ    sequenceDecs_decode_56_amd64_of_update_zero
+	ADDQ  CX, BX
+	CMPQ  BX, $0x40
+	JA    sequenceDecs_decode_56_amd64_of_update_zero
+	CMPQ  CX, $0x40
+	JAE   sequenceDecs_decode_56_amd64_of_update_zero
+	NEGQ  CX
+	SHRQ  CL, R15
+	ADDQ  R15, AX
+
+sequenceDecs_decode_56_amd64_of_update_zero:
+	MOVQ AX, 16(R10)
+
+	// Update match length
+	MOVQ  R8, AX
+	MOVQ  BX, CX
+	MOVQ  DX, R15
+	SHLQ  CL, R15
+	MOVB  AH, CL
+	SHRQ  $0x20, AX
+	TESTQ CX, CX
+	JZ    sequenceDecs_decode_56_amd64_ml_update_zero
+	ADDQ  CX, BX
+	CMPQ  BX, $0x40
+	JA    sequenceDecs_decode_56_amd64_ml_update_zero
+	CMPQ  CX, $0x40
+	JAE   sequenceDecs_decode_56_amd64_ml_update_zero
+	NEGQ  CX
+	SHRQ  CL, R15
+	ADDQ  R15, AX
+
+sequenceDecs_decode_56_amd64_ml_update_zero:
+	MOVQ AX, 8(R10)
+
+	// Update literal length
+	MOVQ  DI, AX
+	MOVQ  BX, CX
+	MOVQ  DX, R15
+	SHLQ  CL, R15
+	MOVB  AH, CL
+	SHRQ  $0x20, AX
+	TESTQ CX, CX
+	JZ    sequenceDecs_decode_56_amd64_ll_update_zero
+	ADDQ  CX, BX
+	CMPQ  BX, $0x40
+	JA    sequenceDecs_decode_56_amd64_ll_update_zero
+	CMPQ  CX, $0x40
+	JAE   sequenceDecs_decode_56_amd64_ll_update_zero
+	NEGQ  CX
+	SHRQ  CL, R15
+	ADDQ  R15, AX
+
+sequenceDecs_decode_56_amd64_ll_update_zero:
+	MOVQ AX, (R10)
+
+	// Fill bitreader for state updates
+	MOVQ    R14, (SP)
+	MOVQ    R9, AX
+	SHRQ    $0x08, AX
+	MOVBQZX AL, AX
+	MOVQ    ctx+16(FP), CX
+	CMPQ    96(CX), $0x00
+	JZ      sequenceDecs_decode_56_amd64_skip_update
+
+	// Update Literal Length State
+	MOVBQZX DI, R14
+	SHRQ    $0x10, DI
+	MOVWQZX DI, DI
+	LEAQ    (BX)(R14*1), CX
+	MOVQ    DX, R15
+	MOVQ    CX, BX
+	ROLQ    CL, R15
+	MOVL    $0x00000001, BP
+	MOVB    R14, CL
+	SHLL    CL, BP
+	DECL    BP
+	ANDQ    BP, R15
+	ADDQ    R15, DI
+
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Match Length State
+	MOVBQZX R8, R14
+	SHRQ    $0x10, R8
+	MOVWQZX R8, R8
+	LEAQ    (BX)(R14*1), CX
+	MOVQ    DX, R15
+	MOVQ    CX, BX
+	ROLQ    CL, R15
+	MOVL    $0x00000001, BP
+	MOVB    R14, CL
+	SHLL    CL, BP
+	DECL    BP
+	ANDQ    BP, R15
+	ADDQ    R15, R8
+
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+	// Update Offset State
+	MOVBQZX R9, R14
+	SHRQ    $0x10, R9
+	MOVWQZX R9, R9
+	LEAQ    (BX)(R14*1), CX
+	MOVQ    DX, R15
+	MOVQ    CX, BX
+	ROLQ    CL, R15
+	MOVL    $0x00000001, BP
+	MOVB    R14, CL
+	SHLL    CL, BP
+	DECL    BP
+	ANDQ    BP, R15
+	ADDQ    R15, R9
+
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decode_56_amd64_skip_update:
+	// Adjust offset
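+	// offsetB > 1 is a new offset: push it onto the three-entry
+	// repeat-offset history (R11/R12/R13). offsetB <= 1 selects a
+	// repeated offset: the raw value is bumped by one when the literal
+	// length is zero, value 0 reuses prevOffset[0], 1 and 2 pick older
+	// entries, and 3 means prevOffset[0]-1, clamped to at least 1.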
+	MOVQ 16(R10), CX
+	CMPQ AX, $0x01
+	JBE  sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0
+	MOVQ R12, R13
+	MOVQ R11, R12
+	MOVQ CX, R11
+	JMP  sequenceDecs_decode_56_amd64_after_adjust
+
+sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
+	CMPQ (R10), $0x00000000
+	JNE  sequenceDecs_decode_56_amd64_adjust_offset_maybezero
+	INCQ CX
+	JMP  sequenceDecs_decode_56_amd64_adjust_offset_nonzero
+
+sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_56_amd64_adjust_offset_nonzero
+	MOVQ  R11, CX
+	JMP   sequenceDecs_decode_56_amd64_after_adjust
+
+sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
+	CMPQ CX, $0x01
+	JB   sequenceDecs_decode_56_amd64_adjust_zero
+	JEQ  sequenceDecs_decode_56_amd64_adjust_one
+	CMPQ CX, $0x02
+	JA   sequenceDecs_decode_56_amd64_adjust_three
+	JMP  sequenceDecs_decode_56_amd64_adjust_two
+
+sequenceDecs_decode_56_amd64_adjust_zero:
+	MOVQ R11, AX
+	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_56_amd64_adjust_one:
+	MOVQ R12, AX
+	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_56_amd64_adjust_two:
+	MOVQ R13, AX
+	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_56_amd64_adjust_three:
+	LEAQ -1(R11), AX
+
+sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
+	TESTQ AX, AX
+	JNZ   sequenceDecs_decode_56_amd64_adjust_temp_valid
+	MOVQ  $0x00000001, AX
+
+sequenceDecs_decode_56_amd64_adjust_temp_valid:
+	CMPQ    CX, $0x01
+	CMOVQNE R12, R13
+	MOVQ    R11, R12
+	MOVQ    AX, R11
+	MOVQ    AX, CX
+
+sequenceDecs_decode_56_amd64_after_adjust:
+	MOVQ CX, 16(R10)
+
+	// Check values
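+	// Shared sanity checks: add ll+ml to the running output-size total on
+	// s, fail with error 4 if the block holds fewer literals than ll,
+	// error 2 if ml exceeds the maximum match length (0x00020002), and
+	// error 1 if a non-zero ml arrives with a zero offset.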
+	MOVQ  8(R10), AX
+	MOVQ  (R10), R14
+	LEAQ  (AX)(R14*1), R15
+	MOVQ  s+0(FP), BP
+	ADDQ  R15, 256(BP)
+	MOVQ  ctx+16(FP), R15
+	SUBQ  R14, 128(R15)
+	JS    error_not_enough_literals
+	CMPQ  AX, $0x00020002
+	JA    sequenceDecs_decode_56_amd64_error_match_len_too_big
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_56_amd64_match_len_ofs_ok
+	TESTQ AX, AX
+	JNZ   sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch
+
+sequenceDecs_decode_56_amd64_match_len_ofs_ok:
+	ADDQ $0x18, R10
+	MOVQ ctx+16(FP), AX
+	DECQ 96(AX)
+	JNS  sequenceDecs_decode_56_amd64_main_loop
+	MOVQ s+0(FP), AX
+	MOVQ R11, 144(AX)
+	MOVQ R12, 152(AX)
+	MOVQ R13, 160(AX)
+	MOVQ br+8(FP), AX
+	MOVQ DX, 32(AX)
+	MOVB BL, 40(AX)
+	MOVQ SI, 24(AX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length/offset mismatch error (zero offset, non-zero match length)
+sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decode_56_amd64_error_match_len_too_big:
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error (unreachable: no label points at this block)
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error (unreachable: no label points at this block)
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+// Requires: BMI, BMI2, CMOV
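+// Same algorithm as sequenceDecs_decode_amd64 above, rebuilt on BMI2:
+// BEXTR/BZHI/SHRX replace the shift-and-mask sequences, trimming the
+// register shuffling around CL and several flag-dependent branches.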
+TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
+	MOVQ    br+8(FP), CX
+	MOVQ    32(CX), AX
+	MOVBQZX 40(CX), DX
+	MOVQ    24(CX), BX
+	MOVQ    (CX), CX
+	ADDQ    BX, CX
+	MOVQ    CX, (SP)
+	MOVQ    ctx+16(FP), CX
+	MOVQ    72(CX), SI
+	MOVQ    80(CX), DI
+	MOVQ    88(CX), R8
+	MOVQ    104(CX), R9
+	MOVQ    s+0(FP), CX
+	MOVQ    144(CX), R10
+	MOVQ    152(CX), R11
+	MOVQ    160(CX), R12
+
+sequenceDecs_decode_bmi2_main_loop:
+	MOVQ (SP), R13
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decode_bmi2_fill_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R13
+	MOVQ (R13), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP  sequenceDecs_decode_bmi2_fill_end
+
+sequenceDecs_decode_bmi2_fill_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decode_bmi2_fill_end
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decode_bmi2_fill_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R13), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decode_bmi2_fill_byte_by_byte
+
+sequenceDecs_decode_bmi2_fill_end:
+	// Update offset
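+	// BEXTR with control word 0x0808 (start bit 8, length 8) pulls the
+	// addBits field straight out of the state word, and BZHI keeps the
+	// low addBits bits of the rotated bit buffer, so the BMI2 path needs
+	// neither the CL shift register nor an explicit (1<<n)-1 mask.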
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, R8, R14
+	MOVQ   AX, R15
+	LEAQ   (DX)(R14*1), CX
+	ROLQ   CL, R15
+	BZHIQ  R14, R15, R15
+	MOVQ   CX, DX
+	MOVQ   R8, CX
+	SHRQ   $0x20, CX
+	ADDQ   R15, CX
+	MOVQ   CX, 16(R9)
+
+	// Update match length
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, DI, R14
+	MOVQ   AX, R15
+	LEAQ   (DX)(R14*1), CX
+	ROLQ   CL, R15
+	BZHIQ  R14, R15, R15
+	MOVQ   CX, DX
+	MOVQ   DI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R15, CX
+	MOVQ   CX, 8(R9)
+
+	// Fill bitreader to have enough for the remaining (literal length)
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decode_bmi2_fill_2_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R13
+	MOVQ (R13), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP  sequenceDecs_decode_bmi2_fill_2_end
+
+sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decode_bmi2_fill_2_end
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decode_bmi2_fill_2_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R13), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decode_bmi2_fill_2_byte_by_byte
+
+sequenceDecs_decode_bmi2_fill_2_end:
+	// Update literal length
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, SI, R14
+	MOVQ   AX, R15
+	LEAQ   (DX)(R14*1), CX
+	ROLQ   CL, R15
+	BZHIQ  R14, R15, R15
+	MOVQ   CX, DX
+	MOVQ   SI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R15, CX
+	MOVQ   CX, (R9)
+
+	// Fill bitreader for state updates
+	MOVQ    R13, (SP)
+	MOVQ    $0x00000808, CX
+	BEXTRQ  CX, R8, R13
+	MOVQ    ctx+16(FP), CX
+	CMPQ    96(CX), $0x00
+	JZ      sequenceDecs_decode_bmi2_skip_update
+	LEAQ    (SI)(DI*1), R14
+	ADDQ    R8, R14
+	MOVBQZX R14, R14
+	LEAQ    (DX)(R14*1), CX
+	MOVQ    AX, R15
+	MOVQ    CX, DX
+	ROLQ    CL, R15
+	BZHIQ   R14, R15, R15
+
+	// Update Offset State
+	BZHIQ  R8, R15, CX
+	SHRXQ  R8, R15, R15
+	MOVQ   $0x00001010, R14
+	BEXTRQ R14, R8, R8
+	ADDQ   CX, R8
+
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+	// Update Match Length State
+	BZHIQ  DI, R15, CX
+	SHRXQ  DI, R15, R15
+	MOVQ   $0x00001010, R14
+	BEXTRQ R14, DI, DI
+	ADDQ   CX, DI
+
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Literal Length State
+	BZHIQ  SI, R15, CX
+	MOVQ   $0x00001010, R14
+	BEXTRQ R14, SI, SI
+	ADDQ   CX, SI
+
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(SI*8), SI
+
+sequenceDecs_decode_bmi2_skip_update:
+	// Adjust offset
+	MOVQ 16(R9), CX
+	CMPQ R13, $0x01
+	JBE  sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
+	MOVQ R11, R12
+	MOVQ R10, R11
+	MOVQ CX, R10
+	JMP  sequenceDecs_decode_bmi2_after_adjust
+
+sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
+	CMPQ (R9), $0x00000000
+	JNE  sequenceDecs_decode_bmi2_adjust_offset_maybezero
+	INCQ CX
+	JMP  sequenceDecs_decode_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decode_bmi2_adjust_offset_maybezero:
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_bmi2_adjust_offset_nonzero
+	MOVQ  R10, CX
+	JMP   sequenceDecs_decode_bmi2_after_adjust
+
+sequenceDecs_decode_bmi2_adjust_offset_nonzero:
+	CMPQ CX, $0x01
+	JB   sequenceDecs_decode_bmi2_adjust_zero
+	JEQ  sequenceDecs_decode_bmi2_adjust_one
+	CMPQ CX, $0x02
+	JA   sequenceDecs_decode_bmi2_adjust_three
+	JMP  sequenceDecs_decode_bmi2_adjust_two
+
+sequenceDecs_decode_bmi2_adjust_zero:
+	MOVQ R10, R13
+	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_bmi2_adjust_one:
+	MOVQ R11, R13
+	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_bmi2_adjust_two:
+	MOVQ R12, R13
+	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_bmi2_adjust_three:
+	LEAQ -1(R10), R13
+
+sequenceDecs_decode_bmi2_adjust_test_temp_valid:
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decode_bmi2_adjust_temp_valid
+	MOVQ  $0x00000001, R13
+
+sequenceDecs_decode_bmi2_adjust_temp_valid:
+	CMPQ    CX, $0x01
+	CMOVQNE R11, R12
+	MOVQ    R10, R11
+	MOVQ    R13, R10
+	MOVQ    R13, CX
+
+sequenceDecs_decode_bmi2_after_adjust:
+	MOVQ CX, 16(R9)
+
+	// Check values
+	MOVQ  8(R9), R13
+	MOVQ  (R9), R14
+	LEAQ  (R13)(R14*1), R15
+	MOVQ  s+0(FP), BP
+	ADDQ  R15, 256(BP)
+	MOVQ  ctx+16(FP), R15
+	SUBQ  R14, 128(R15)
+	JS    error_not_enough_literals
+	CMPQ  R13, $0x00020002
+	JA    sequenceDecs_decode_bmi2_error_match_len_too_big
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_bmi2_match_len_ofs_ok
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch
+
+sequenceDecs_decode_bmi2_match_len_ofs_ok:
+	ADDQ $0x18, R9
+	MOVQ ctx+16(FP), CX
+	DECQ 96(CX)
+	JNS  sequenceDecs_decode_bmi2_main_loop
+	MOVQ s+0(FP), CX
+	MOVQ R10, 144(CX)
+	MOVQ R11, 152(CX)
+	MOVQ R12, 160(CX)
+	MOVQ br+8(FP), CX
+	MOVQ AX, 32(CX)
+	MOVB DL, 40(CX)
+	MOVQ BX, 24(CX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length/offset mismatch error (zero offset, non-zero match length)
+sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decode_bmi2_error_match_len_too_big:
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error (unreachable: no label points at this block)
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error (unreachable: no label points at this block)
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+// Requires: BMI, BMI2, CMOV
+TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
+	MOVQ    br+8(FP), CX
+	MOVQ    32(CX), AX
+	MOVBQZX 40(CX), DX
+	MOVQ    24(CX), BX
+	MOVQ    (CX), CX
+	ADDQ    BX, CX
+	MOVQ    CX, (SP)
+	MOVQ    ctx+16(FP), CX
+	MOVQ    72(CX), SI
+	MOVQ    80(CX), DI
+	MOVQ    88(CX), R8
+	MOVQ    104(CX), R9
+	MOVQ    s+0(FP), CX
+	MOVQ    144(CX), R10
+	MOVQ    152(CX), R11
+	MOVQ    160(CX), R12
+
+sequenceDecs_decode_56_bmi2_main_loop:
+	MOVQ (SP), R13
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decode_56_bmi2_fill_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R13
+	MOVQ (R13), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP  sequenceDecs_decode_56_bmi2_fill_end
+
+sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decode_56_bmi2_fill_end
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decode_56_bmi2_fill_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R13), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decode_56_bmi2_fill_byte_by_byte
+
+sequenceDecs_decode_56_bmi2_fill_end:
+	// Update offset
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, R8, R14
+	MOVQ   AX, R15
+	LEAQ   (DX)(R14*1), CX
+	ROLQ   CL, R15
+	BZHIQ  R14, R15, R15
+	MOVQ   CX, DX
+	MOVQ   R8, CX
+	SHRQ   $0x20, CX
+	ADDQ   R15, CX
+	MOVQ   CX, 16(R9)
+
+	// Update match length
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, DI, R14
+	MOVQ   AX, R15
+	LEAQ   (DX)(R14*1), CX
+	ROLQ   CL, R15
+	BZHIQ  R14, R15, R15
+	MOVQ   CX, DX
+	MOVQ   DI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R15, CX
+	MOVQ   CX, 8(R9)
+
+	// Update literal length
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, SI, R14
+	MOVQ   AX, R15
+	LEAQ   (DX)(R14*1), CX
+	ROLQ   CL, R15
+	BZHIQ  R14, R15, R15
+	MOVQ   CX, DX
+	MOVQ   SI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R15, CX
+	MOVQ   CX, (R9)
+
+	// Fill bitreader for state updates
+	MOVQ    R13, (SP)
+	MOVQ    $0x00000808, CX
+	BEXTRQ  CX, R8, R13
+	MOVQ    ctx+16(FP), CX
+	CMPQ    96(CX), $0x00
+	JZ      sequenceDecs_decode_56_bmi2_skip_update
+	LEAQ    (SI)(DI*1), R14
+	ADDQ    R8, R14
+	MOVBQZX R14, R14
+	LEAQ    (DX)(R14*1), CX
+	MOVQ    AX, R15
+	MOVQ    CX, DX
+	ROLQ    CL, R15
+	BZHIQ   R14, R15, R15
+
+	// Update Offset State
+	BZHIQ  R8, R15, CX
+	SHRXQ  R8, R15, R15
+	MOVQ   $0x00001010, R14
+	BEXTRQ R14, R8, R8
+	ADDQ   CX, R8
+
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+	// Update Match Length State
+	BZHIQ  DI, R15, CX
+	SHRXQ  DI, R15, R15
+	MOVQ   $0x00001010, R14
+	BEXTRQ R14, DI, DI
+	ADDQ   CX, DI
+
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Literal Length State
+	BZHIQ  SI, R15, CX
+	MOVQ   $0x00001010, R14
+	BEXTRQ R14, SI, SI
+	ADDQ   CX, SI
+
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(SI*8), SI
+
+sequenceDecs_decode_56_bmi2_skip_update:
+	// Adjust offset
+	MOVQ 16(R9), CX
+	CMPQ R13, $0x01
+	JBE  sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0
+	MOVQ R11, R12
+	MOVQ R10, R11
+	MOVQ CX, R10
+	JMP  sequenceDecs_decode_56_bmi2_after_adjust
+
+sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
+	CMPQ (R9), $0x00000000
+	JNE  sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
+	INCQ CX
+	JMP  sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
+	MOVQ  R10, CX
+	JMP   sequenceDecs_decode_56_bmi2_after_adjust
+
+sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
+	CMPQ CX, $0x01
+	JB   sequenceDecs_decode_56_bmi2_adjust_zero
+	JEQ  sequenceDecs_decode_56_bmi2_adjust_one
+	CMPQ CX, $0x02
+	JA   sequenceDecs_decode_56_bmi2_adjust_three
+	JMP  sequenceDecs_decode_56_bmi2_adjust_two
+
+sequenceDecs_decode_56_bmi2_adjust_zero:
+	MOVQ R10, R13
+	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_56_bmi2_adjust_one:
+	MOVQ R11, R13
+	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_56_bmi2_adjust_two:
+	MOVQ R12, R13
+	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_56_bmi2_adjust_three:
+	LEAQ -1(R10), R13
+
+sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decode_56_bmi2_adjust_temp_valid
+	MOVQ  $0x00000001, R13
+
+sequenceDecs_decode_56_bmi2_adjust_temp_valid:
+	CMPQ    CX, $0x01
+	CMOVQNE R11, R12
+	MOVQ    R10, R11
+	MOVQ    R13, R10
+	MOVQ    R13, CX
+
+sequenceDecs_decode_56_bmi2_after_adjust:
+	MOVQ CX, 16(R9)
+
+	// Check values
+	MOVQ  8(R9), R13
+	MOVQ  (R9), R14
+	LEAQ  (R13)(R14*1), R15
+	MOVQ  s+0(FP), BP
+	ADDQ  R15, 256(BP)
+	MOVQ  ctx+16(FP), R15
+	SUBQ  R14, 128(R15)
+	JS    error_not_enough_literals
+	CMPQ  R13, $0x00020002
+	JA    sequenceDecs_decode_56_bmi2_error_match_len_too_big
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_56_bmi2_match_len_ofs_ok
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch
+
+sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
+	ADDQ $0x18, R9
+	MOVQ ctx+16(FP), CX
+	DECQ 96(CX)
+	JNS  sequenceDecs_decode_56_bmi2_main_loop
+	MOVQ s+0(FP), CX
+	MOVQ R10, 144(CX)
+	MOVQ R11, 152(CX)
+	MOVQ R12, 160(CX)
+	MOVQ br+8(FP), CX
+	MOVQ AX, 32(CX)
+	MOVB DL, 40(CX)
+	MOVQ BX, 24(CX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length/offset mismatch error (zero offset, non-zero match length)
+sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decode_56_bmi2_error_match_len_too_big:
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error (unreachable: no label points at this block)
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error (unreachable: no label points at this block)
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
+// Requires: SSE
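+// Applies already-decoded sequences to the output buffer: per sequence,
+// copy ll literal bytes, then ml match bytes taken from the history
+// buffer and/or earlier output. Bulk copies run in 16-byte SSE chunks
+// and may overshoot the exact length, which is why the _safe_ twin
+// below exists for buffers without slack.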
+TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
+	MOVQ  ctx+0(FP), R10
+	MOVQ  8(R10), CX
+	TESTQ CX, CX
+	JZ    empty_seqs
+	MOVQ  (R10), AX
+	MOVQ  24(R10), DX
+	MOVQ  32(R10), BX
+	MOVQ  80(R10), SI
+	MOVQ  104(R10), DI
+	MOVQ  120(R10), R8
+	MOVQ  56(R10), R9
+	MOVQ  64(R10), R10
+	ADDQ  R10, R9
+
+	// seqsBase += 24 * seqIndex
+	LEAQ (DX)(DX*2), R11
+	SHLQ $0x03, R11
+	ADDQ R11, AX
+
+	// outBase += outPosition
+	ADDQ DI, BX
+
+main_loop:
+	MOVQ (AX), R11
+	MOVQ 16(AX), R12
+	MOVQ 8(AX), R13
+
+	// Copy literals
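+	// copy_1 rounds the literal copy up to whole 16-byte MOVUPS chunks;
+	// overshooting the destination is acceptable here because this
+	// non-safe variant is only used while s.out still has slack.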
+	TESTQ R11, R11
+	JZ    check_offset
+	XORQ  R14, R14
+
+copy_1:
+	MOVUPS (SI)(R14*1), X0
+	MOVUPS X0, (BX)(R14*1)
+	ADDQ   $0x10, R14
+	CMPQ   R14, R11
+	JB     copy_1
+	ADDQ   R11, SI
+	ADDQ   R11, BX
+	ADDQ   R11, DI
+
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
+check_offset:
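+	// A match may reach back at most over everything produced so far
+	// (output position + len(hist)) and never further than the window
+	// size; offsets beyond either limit mean corrupt input.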
+	LEAQ (DI)(R10*1), R11
+	CMPQ R12, R11
+	JG   error_match_off_too_big
+	CMPQ R12, R8
+	JG   error_match_off_too_big
+
+	// Copy match from history
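+	// R11 = mo - output position: when <= 0 the whole match lies inside
+	// already-written output (copy_match). Otherwise the first R11 bytes
+	// come from the history buffer; copy_all_from_history covers a match
+	// that starts in history and spills over into the current output.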
+	MOVQ R12, R11
+	SUBQ DI, R11
+	JLS  copy_match
+	MOVQ R9, R14
+	SUBQ R11, R14
+	CMPQ R13, R11
+	JG   copy_all_from_history
+	MOVQ R13, R11
+	SUBQ $0x10, R11
+	JB   copy_4_small
+
+copy_4_loop:
+	MOVUPS (R14), X0
+	MOVUPS X0, (BX)
+	ADDQ   $0x10, R14
+	ADDQ   $0x10, BX
+	SUBQ   $0x10, R11
+	JAE    copy_4_loop
+	LEAQ   16(R14)(R11*1), R14
+	LEAQ   16(BX)(R11*1), BX
+	MOVUPS -16(R14), X0
+	MOVUPS X0, -16(BX)
+	JMP    copy_4_end
+
+copy_4_small:
+	CMPQ R13, $0x03
+	JE   copy_4_move_3
+	CMPQ R13, $0x08
+	JB   copy_4_move_4through7
+	JMP  copy_4_move_8through16
+
+copy_4_move_3:
+	MOVW (R14), R11
+	MOVB 2(R14), R12
+	MOVW R11, (BX)
+	MOVB R12, 2(BX)
+	ADDQ R13, R14
+	ADDQ R13, BX
+	JMP  copy_4_end
+
+copy_4_move_4through7:
+	MOVL (R14), R11
+	MOVL -4(R14)(R13*1), R12
+	MOVL R11, (BX)
+	MOVL R12, -4(BX)(R13*1)
+	ADDQ R13, R14
+	ADDQ R13, BX
+	JMP  copy_4_end
+
+copy_4_move_8through16:
+	MOVQ (R14), R11
+	MOVQ -8(R14)(R13*1), R12
+	MOVQ R11, (BX)
+	MOVQ R12, -8(BX)(R13*1)
+	ADDQ R13, R14
+	ADDQ R13, BX
+
+copy_4_end:
+	ADDQ R13, DI
+	ADDQ $0x18, AX
+	INCQ DX
+	CMPQ DX, CX
+	JB   main_loop
+	JMP  loop_finished
+
+copy_all_from_history:
+	MOVQ R11, R15
+	SUBQ $0x10, R15
+	JB   copy_5_small
+
+copy_5_loop:
+	MOVUPS (R14), X0
+	MOVUPS X0, (BX)
+	ADDQ   $0x10, R14
+	ADDQ   $0x10, BX
+	SUBQ   $0x10, R15
+	JAE    copy_5_loop
+	LEAQ   16(R14)(R15*1), R14
+	LEAQ   16(BX)(R15*1), BX
+	MOVUPS -16(R14), X0
+	MOVUPS X0, -16(BX)
+	JMP    copy_5_end
+
+copy_5_small:
+	CMPQ R11, $0x03
+	JE   copy_5_move_3
+	JB   copy_5_move_1or2
+	CMPQ R11, $0x08
+	JB   copy_5_move_4through7
+	JMP  copy_5_move_8through16
+
+copy_5_move_1or2:
+	MOVB (R14), R15
+	MOVB -1(R14)(R11*1), BP
+	MOVB R15, (BX)
+	MOVB BP, -1(BX)(R11*1)
+	ADDQ R11, R14
+	ADDQ R11, BX
+	JMP  copy_5_end
+
+copy_5_move_3:
+	MOVW (R14), R15
+	MOVB 2(R14), BP
+	MOVW R15, (BX)
+	MOVB BP, 2(BX)
+	ADDQ R11, R14
+	ADDQ R11, BX
+	JMP  copy_5_end
+
+copy_5_move_4through7:
+	MOVL (R14), R15
+	MOVL -4(R14)(R11*1), BP
+	MOVL R15, (BX)
+	MOVL BP, -4(BX)(R11*1)
+	ADDQ R11, R14
+	ADDQ R11, BX
+	JMP  copy_5_end
+
+copy_5_move_8through16:
+	MOVQ (R14), R15
+	MOVQ -8(R14)(R11*1), BP
+	MOVQ R15, (BX)
+	MOVQ BP, -8(BX)(R11*1)
+	ADDQ R11, R14
+	ADDQ R11, BX
+
+copy_5_end:
+	ADDQ R11, DI
+	SUBQ R11, R13
+
+	// Copy match from the current buffer
+copy_match:
+	MOVQ BX, R11
+	SUBQ R12, R11
+
+	// ml <= mo
+	CMPQ R13, R12
+	JA   copy_overlapping_match
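+
+	// When ml > mo the source range overlaps bytes this match is still
+	// producing, so fall back to the byte-by-byte loop; otherwise whole
+	// 16-byte chunks are safe.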
+
+	// Copy non-overlapping match
+	ADDQ R13, DI
+	MOVQ BX, R12
+	ADDQ R13, BX
+
+copy_2:
+	MOVUPS (R11), X0
+	MOVUPS X0, (R12)
+	ADDQ   $0x10, R11
+	ADDQ   $0x10, R12
+	SUBQ   $0x10, R13
+	JHI    copy_2
+	JMP    handle_loop
+
+	// Copy overlapping match
+copy_overlapping_match:
+	ADDQ R13, DI
+
+copy_slow_3:
+	MOVB (R11), R12
+	MOVB R12, (BX)
+	INCQ R11
+	INCQ BX
+	DECQ R13
+	JNZ  copy_slow_3
+
+handle_loop:
+	ADDQ $0x18, AX
+	INCQ DX
+	CMPQ DX, CX
+	JB   main_loop
+
+loop_finished:
+	// Return value
+	MOVB $0x01, ret+8(FP)
+
+	// Update the context
+	MOVQ ctx+0(FP), AX
+	MOVQ DX, 24(AX)
+	MOVQ DI, 104(AX)
+	MOVQ 80(AX), CX
+	SUBQ CX, SI
+	MOVQ SI, 112(AX)
+	RET
+
+error_match_off_too_big:
+	// Return value
+	MOVB $0x00, ret+8(FP)
+
+	// Update the context
+	MOVQ ctx+0(FP), AX
+	MOVQ DX, 24(AX)
+	MOVQ DI, 104(AX)
+	MOVQ 80(AX), CX
+	SUBQ CX, SI
+	MOVQ SI, 112(AX)
+	RET
+
+empty_seqs:
+	// Return value
+	MOVB $0x01, ret+8(FP)
+	RET
+
+// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
+// Requires: SSE
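+// Safe twin of sequenceDecs_executeSimple_amd64: same control flow, but
+// every copy moves the exact byte count (dedicated 1-2/3/4-7/8-16 byte
+// moves for short lengths), so it never writes past the requested
+// length near the end of s.out.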
+TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
+	MOVQ  ctx+0(FP), R10
+	MOVQ  8(R10), CX
+	TESTQ CX, CX
+	JZ    empty_seqs
+	MOVQ  (R10), AX
+	MOVQ  24(R10), DX
+	MOVQ  32(R10), BX
+	MOVQ  80(R10), SI
+	MOVQ  104(R10), DI
+	MOVQ  120(R10), R8
+	MOVQ  56(R10), R9
+	MOVQ  64(R10), R10
+	ADDQ  R10, R9
+
+	// seqsBase += 24 * seqIndex
+	LEAQ (DX)(DX*2), R11
+	SHLQ $0x03, R11
+	ADDQ R11, AX
+
+	// outBase += outPosition
+	ADDQ DI, BX
+
+main_loop:
+	MOVQ (AX), R11
+	MOVQ 16(AX), R12
+	MOVQ 8(AX), R13
+
+	// Copy literals
+	TESTQ R11, R11
+	JZ    check_offset
+	MOVQ  R11, R14
+	SUBQ  $0x10, R14
+	JB    copy_1_small
+
+copy_1_loop:
+	MOVUPS (SI), X0
+	MOVUPS X0, (BX)
+	ADDQ   $0x10, SI
+	ADDQ   $0x10, BX
+	SUBQ   $0x10, R14
+	JAE    copy_1_loop
+	LEAQ   16(SI)(R14*1), SI
+	LEAQ   16(BX)(R14*1), BX
+	MOVUPS -16(SI), X0
+	MOVUPS X0, -16(BX)
+	JMP    copy_1_end
+
+copy_1_small:
+	CMPQ R11, $0x03
+	JE   copy_1_move_3
+	JB   copy_1_move_1or2
+	CMPQ R11, $0x08
+	JB   copy_1_move_4through7
+	JMP  copy_1_move_8through16
+
+copy_1_move_1or2:
+	MOVB (SI), R14
+	MOVB -1(SI)(R11*1), R15
+	MOVB R14, (BX)
+	MOVB R15, -1(BX)(R11*1)
+	ADDQ R11, SI
+	ADDQ R11, BX
+	JMP  copy_1_end
+
+copy_1_move_3:
+	MOVW (SI), R14
+	MOVB 2(SI), R15
+	MOVW R14, (BX)
+	MOVB R15, 2(BX)
+	ADDQ R11, SI
+	ADDQ R11, BX
+	JMP  copy_1_end
+
+copy_1_move_4through7:
+	MOVL (SI), R14
+	MOVL -4(SI)(R11*1), R15
+	MOVL R14, (BX)
+	MOVL R15, -4(BX)(R11*1)
+	ADDQ R11, SI
+	ADDQ R11, BX
+	JMP  copy_1_end
+
+copy_1_move_8through16:
+	MOVQ (SI), R14
+	MOVQ -8(SI)(R11*1), R15
+	MOVQ R14, (BX)
+	MOVQ R15, -8(BX)(R11*1)
+	ADDQ R11, SI
+	ADDQ R11, BX
+
+copy_1_end:
+	ADDQ R11, DI
+
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
+check_offset:
+	LEAQ (DI)(R10*1), R11
+	CMPQ R12, R11
+	JG   error_match_off_too_big
+	CMPQ R12, R8
+	JG   error_match_off_too_big
+
+	// Copy match from history
+	MOVQ R12, R11
+	SUBQ DI, R11
+	JLS  copy_match
+	MOVQ R9, R14
+	SUBQ R11, R14
+	CMPQ R13, R11
+	JG   copy_all_from_history
+	MOVQ R13, R11
+	SUBQ $0x10, R11
+	JB   copy_4_small
+
+copy_4_loop:
+	MOVUPS (R14), X0
+	MOVUPS X0, (BX)
+	ADDQ   $0x10, R14
+	ADDQ   $0x10, BX
+	SUBQ   $0x10, R11
+	JAE    copy_4_loop
+	LEAQ   16(R14)(R11*1), R14
+	LEAQ   16(BX)(R11*1), BX
+	MOVUPS -16(R14), X0
+	MOVUPS X0, -16(BX)
+	JMP    copy_4_end
+
+copy_4_small:
+	CMPQ R13, $0x03
+	JE   copy_4_move_3
+	CMPQ R13, $0x08
+	JB   copy_4_move_4through7
+	JMP  copy_4_move_8through16
+
+copy_4_move_3:
+	MOVW (R14), R11
+	MOVB 2(R14), R12
+	MOVW R11, (BX)
+	MOVB R12, 2(BX)
+	ADDQ R13, R14
+	ADDQ R13, BX
+	JMP  copy_4_end
+
+copy_4_move_4through7:
+	MOVL (R14), R11
+	MOVL -4(R14)(R13*1), R12
+	MOVL R11, (BX)
+	MOVL R12, -4(BX)(R13*1)
+	ADDQ R13, R14
+	ADDQ R13, BX
+	JMP  copy_4_end
+
+copy_4_move_8through16:
+	MOVQ (R14), R11
+	MOVQ -8(R14)(R13*1), R12
+	MOVQ R11, (BX)
+	MOVQ R12, -8(BX)(R13*1)
+	ADDQ R13, R14
+	ADDQ R13, BX
+
+copy_4_end:
+	ADDQ R13, DI
+	ADDQ $0x18, AX
+	INCQ DX
+	CMPQ DX, CX
+	JB   main_loop
+	JMP  loop_finished
+
+copy_all_from_history:
+	MOVQ R11, R15
+	SUBQ $0x10, R15
+	JB   copy_5_small
+
+copy_5_loop:
+	MOVUPS (R14), X0
+	MOVUPS X0, (BX)
+	ADDQ   $0x10, R14
+	ADDQ   $0x10, BX
+	SUBQ   $0x10, R15
+	JAE    copy_5_loop
+	LEAQ   16(R14)(R15*1), R14
+	LEAQ   16(BX)(R15*1), BX
+	MOVUPS -16(R14), X0
+	MOVUPS X0, -16(BX)
+	JMP    copy_5_end
+
+copy_5_small:
+	CMPQ R11, $0x03
+	JE   copy_5_move_3
+	JB   copy_5_move_1or2
+	CMPQ R11, $0x08
+	JB   copy_5_move_4through7
+	JMP  copy_5_move_8through16
+
+copy_5_move_1or2:
+	MOVB (R14), R15
+	MOVB -1(R14)(R11*1), BP
+	MOVB R15, (BX)
+	MOVB BP, -1(BX)(R11*1)
+	ADDQ R11, R14
+	ADDQ R11, BX
+	JMP  copy_5_end
+
+copy_5_move_3:
+	MOVW (R14), R15
+	MOVB 2(R14), BP
+	MOVW R15, (BX)
+	MOVB BP, 2(BX)
+	ADDQ R11, R14
+	ADDQ R11, BX
+	JMP  copy_5_end
+
+copy_5_move_4through7:
+	MOVL (R14), R15
+	MOVL -4(R14)(R11*1), BP
+	MOVL R15, (BX)
+	MOVL BP, -4(BX)(R11*1)
+	ADDQ R11, R14
+	ADDQ R11, BX
+	JMP  copy_5_end
+
+copy_5_move_8through16:
+	MOVQ (R14), R15
+	MOVQ -8(R14)(R11*1), BP
+	MOVQ R15, (BX)
+	MOVQ BP, -8(BX)(R11*1)
+	ADDQ R11, R14
+	ADDQ R11, BX
+
+copy_5_end:
+	ADDQ R11, DI
+	SUBQ R11, R13
+
+	// Copy match from the current buffer
+copy_match:
+	MOVQ BX, R11
+	SUBQ R12, R11
+
+	// ml <= mo
+	CMPQ R13, R12
+	JA   copy_overlapping_match
+
+	// Copy non-overlapping match
+	ADDQ R13, DI
+	MOVQ R13, R12
+	SUBQ $0x10, R12
+	JB   copy_2_small
+
+copy_2_loop:
+	MOVUPS (R11), X0
+	MOVUPS X0, (BX)
+	ADDQ   $0x10, R11
+	ADDQ   $0x10, BX
+	SUBQ   $0x10, R12
+	JAE    copy_2_loop
+	LEAQ   16(R11)(R12*1), R11
+	LEAQ   16(BX)(R12*1), BX
+	MOVUPS -16(R11), X0
+	MOVUPS X0, -16(BX)
+	JMP    copy_2_end
+
+copy_2_small:
+	CMPQ R13, $0x03
+	JE   copy_2_move_3
+	JB   copy_2_move_1or2
+	CMPQ R13, $0x08
+	JB   copy_2_move_4through7
+	JMP  copy_2_move_8through16
+
+copy_2_move_1or2:
+	MOVB (R11), R12
+	MOVB -1(R11)(R13*1), R14
+	MOVB R12, (BX)
+	MOVB R14, -1(BX)(R13*1)
+	ADDQ R13, R11
+	ADDQ R13, BX
+	JMP  copy_2_end
+
+copy_2_move_3:
+	MOVW (R11), R12
+	MOVB 2(R11), R14
+	MOVW R12, (BX)
+	MOVB R14, 2(BX)
+	ADDQ R13, R11
+	ADDQ R13, BX
+	JMP  copy_2_end
+
+copy_2_move_4through7:
+	MOVL (R11), R12
+	MOVL -4(R11)(R13*1), R14
+	MOVL R12, (BX)
+	MOVL R14, -4(BX)(R13*1)
+	ADDQ R13, R11
+	ADDQ R13, BX
+	JMP  copy_2_end
+
+copy_2_move_8through16:
+	MOVQ (R11), R12
+	MOVQ -8(R11)(R13*1), R14
+	MOVQ R12, (BX)
+	MOVQ R14, -8(BX)(R13*1)
+	ADDQ R13, R11
+	ADDQ R13, BX
+
+copy_2_end:
+	JMP handle_loop
+
+	// Copy overlapping match
+copy_overlapping_match:
+	ADDQ R13, DI
+
+copy_slow_3:
+	MOVB (R11), R12
+	MOVB R12, (BX)
+	INCQ R11
+	INCQ BX
+	DECQ R13
+	JNZ  copy_slow_3
+
+handle_loop:
+	ADDQ $0x18, AX
+	INCQ DX
+	CMPQ DX, CX
+	JB   main_loop
+
+loop_finished:
+	// Return value
+	MOVB $0x01, ret+8(FP)
+
+	// Update the context
+	MOVQ ctx+0(FP), AX
+	MOVQ DX, 24(AX)
+	MOVQ DI, 104(AX)
+	MOVQ 80(AX), CX
+	SUBQ CX, SI
+	MOVQ SI, 112(AX)
+	RET
+
+error_match_off_too_big:
+	// Return value
+	MOVB $0x00, ret+8(FP)
+
+	// Update the context
+	MOVQ ctx+0(FP), AX
+	MOVQ DX, 24(AX)
+	MOVQ DI, 104(AX)
+	MOVQ 80(AX), CX
+	SUBQ CX, SI
+	MOVQ SI, 112(AX)
+	RET
+
+empty_seqs:
+	// Return value
+	MOVB $0x01, ret+8(FP)
+	RET
+
+// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: CMOV, SSE
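+// decodeSync fuses decoding and execution: each sequence is decoded into
+// scratch slots at 8(SP)/16(SP)/24(SP) (offset, match length, literal
+// length) and applied to the output immediately, instead of
+// materializing a seqVals slice for a separate execute pass.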
+TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
+	MOVQ    br+8(FP), AX
+	MOVQ    32(AX), DX
+	MOVBQZX 40(AX), BX
+	MOVQ    24(AX), SI
+	MOVQ    (AX), AX
+	ADDQ    SI, AX
+	MOVQ    AX, (SP)
+	MOVQ    ctx+16(FP), AX
+	MOVQ    72(AX), DI
+	MOVQ    80(AX), R8
+	MOVQ    88(AX), R9
+	XORQ    CX, CX
+	MOVQ    CX, 8(SP)
+	MOVQ    CX, 16(SP)
+	MOVQ    CX, 24(SP)
+	MOVQ    112(AX), R10
+	MOVQ    128(AX), CX
+	MOVQ    CX, 32(SP)
+	MOVQ    144(AX), R11
+	MOVQ    136(AX), R12
+	MOVQ    200(AX), CX
+	MOVQ    CX, 56(SP)
+	MOVQ    176(AX), CX
+	MOVQ    CX, 48(SP)
+	MOVQ    184(AX), AX
+	MOVQ    AX, 40(SP)
+	MOVQ    40(SP), AX
+	ADDQ    AX, 48(SP)
+
+	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
+	ADDQ R10, 32(SP)
+
+	// outBase += outPosition
+	ADDQ R12, R10
+
+sequenceDecs_decodeSync_amd64_main_loop:
+	MOVQ (SP), R13
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decodeSync_amd64_fill_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R13
+	MOVQ (R13), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decodeSync_amd64_fill_end
+
+sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decodeSync_amd64_fill_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decodeSync_amd64_fill_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R13), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decodeSync_amd64_fill_byte_by_byte
+
+sequenceDecs_decodeSync_amd64_fill_end:
+	// Update offset
+	MOVQ  R9, AX
+	MOVQ  BX, CX
+	MOVQ  DX, R14
+	SHLQ  CL, R14
+	MOVB  AH, CL
+	SHRQ  $0x20, AX
+	TESTQ CX, CX
+	JZ    sequenceDecs_decodeSync_amd64_of_update_zero
+	ADDQ  CX, BX
+	CMPQ  BX, $0x40
+	JA    sequenceDecs_decodeSync_amd64_of_update_zero
+	CMPQ  CX, $0x40
+	JAE   sequenceDecs_decodeSync_amd64_of_update_zero
+	NEGQ  CX
+	SHRQ  CL, R14
+	ADDQ  R14, AX
+
+sequenceDecs_decodeSync_amd64_of_update_zero:
+	MOVQ AX, 8(SP)
+
+	// Update match length
+	MOVQ  R8, AX
+	MOVQ  BX, CX
+	MOVQ  DX, R14
+	SHLQ  CL, R14
+	MOVB  AH, CL
+	SHRQ  $0x20, AX
+	TESTQ CX, CX
+	JZ    sequenceDecs_decodeSync_amd64_ml_update_zero
+	ADDQ  CX, BX
+	CMPQ  BX, $0x40
+	JA    sequenceDecs_decodeSync_amd64_ml_update_zero
+	CMPQ  CX, $0x40
+	JAE   sequenceDecs_decodeSync_amd64_ml_update_zero
+	NEGQ  CX
+	SHRQ  CL, R14
+	ADDQ  R14, AX
+
+sequenceDecs_decodeSync_amd64_ml_update_zero:
+	MOVQ AX, 16(SP)
+
+	// Fill bitreader to have enough for the remaining (literal length)
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R13
+	MOVQ (R13), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decodeSync_amd64_fill_2_end
+
+sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decodeSync_amd64_fill_2_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decodeSync_amd64_fill_2_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R13), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_amd64_fill_2_end:
+	// Update literal length
+	MOVQ  DI, AX
+	MOVQ  BX, CX
+	MOVQ  DX, R14
+	SHLQ  CL, R14
+	MOVB  AH, CL
+	SHRQ  $0x20, AX
+	TESTQ CX, CX
+	JZ    sequenceDecs_decodeSync_amd64_ll_update_zero
+	ADDQ  CX, BX
+	CMPQ  BX, $0x40
+	JA    sequenceDecs_decodeSync_amd64_ll_update_zero
+	CMPQ  CX, $0x40
+	JAE   sequenceDecs_decodeSync_amd64_ll_update_zero
+	NEGQ  CX
+	SHRQ  CL, R14
+	ADDQ  R14, AX
+
+sequenceDecs_decodeSync_amd64_ll_update_zero:
+	MOVQ AX, 24(SP)
+
+	// Fill bitreader for state updates
+	MOVQ    R13, (SP)
+	MOVQ    R9, AX
+	SHRQ    $0x08, AX
+	MOVBQZX AL, AX
+	MOVQ    ctx+16(FP), CX
+	CMPQ    96(CX), $0x00
+	JZ      sequenceDecs_decodeSync_amd64_skip_update
+
+	// Update Literal Length State
+	MOVBQZX DI, R13
+	SHRQ    $0x10, DI
+	MOVWQZX DI, DI
+	LEAQ    (BX)(R13*1), CX
+	MOVQ    DX, R14
+	MOVQ    CX, BX
+	ROLQ    CL, R14
+	MOVL    $0x00000001, R15
+	MOVB    R13, CL
+	SHLL    CL, R15
+	DECL    R15
+	ANDQ    R15, R14
+	ADDQ    R14, DI
+
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Match Length State
+	MOVBQZX R8, R13
+	SHRQ    $0x10, R8
+	MOVWQZX R8, R8
+	LEAQ    (BX)(R13*1), CX
+	MOVQ    DX, R14
+	MOVQ    CX, BX
+	ROLQ    CL, R14
+	MOVL    $0x00000001, R15
+	MOVB    R13, CL
+	SHLL    CL, R15
+	DECL    R15
+	ANDQ    R15, R14
+	ADDQ    R14, R8
+
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+	// Update Offset State
+	MOVBQZX R9, R13
+	SHRQ    $0x10, R9
+	MOVWQZX R9, R9
+	LEAQ    (BX)(R13*1), CX
+	MOVQ    DX, R14
+	MOVQ    CX, BX
+	ROLQ    CL, R14
+	MOVL    $0x00000001, R15
+	MOVB    R13, CL
+	SHLL    CL, R15
+	DECL    R15
+	ANDQ    R15, R14
+	ADDQ    R14, R9
+
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decodeSync_amd64_skip_update:
+	// Adjust offset
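+	// In decodeSync the repeat-offset history lives directly in s
+	// (offsets 144/152/160): the MOVUPS pair shifts prevOffset[0:2] down
+	// to prevOffset[1:3] in one 16-byte move while the new offset lands
+	// in prevOffset[0].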
+	MOVQ   s+0(FP), CX
+	MOVQ   8(SP), R13
+	CMPQ   AX, $0x01
+	JBE    sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
+	MOVUPS 144(CX), X0
+	MOVQ   R13, 144(CX)
+	MOVUPS X0, 152(CX)
+	JMP    sequenceDecs_decodeSync_amd64_after_adjust
+
+sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
+	CMPQ 24(SP), $0x00000000
+	JNE  sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
+	INCQ R13
+	JMP  sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
+	MOVQ  144(CX), R13
+	JMP   sequenceDecs_decodeSync_amd64_after_adjust
+
+sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
+	MOVQ    R13, AX
+	XORQ    R14, R14
+	MOVQ    $-1, R15
+	CMPQ    R13, $0x03
+	CMOVQEQ R14, AX
+	CMOVQEQ R15, R14
+	ADDQ    144(CX)(AX*8), R14
+	JNZ     sequenceDecs_decodeSync_amd64_adjust_temp_valid
+	MOVQ    $0x00000001, R14
+
+sequenceDecs_decodeSync_amd64_adjust_temp_valid:
+	CMPQ R13, $0x01
+	JZ   sequenceDecs_decodeSync_amd64_adjust_skip
+	MOVQ 152(CX), AX
+	MOVQ AX, 160(CX)
+
+sequenceDecs_decodeSync_amd64_adjust_skip:
+	MOVQ 144(CX), AX
+	MOVQ AX, 152(CX)
+	MOVQ R14, 144(CX)
+	MOVQ R14, R13
+
+sequenceDecs_decodeSync_amd64_after_adjust:
+	MOVQ R13, 8(SP)
+
+	// Check values
+	MOVQ  16(SP), AX
+	MOVQ  24(SP), CX
+	LEAQ  (AX)(CX*1), R14
+	MOVQ  s+0(FP), R15
+	ADDQ  R14, 256(R15)
+	MOVQ  ctx+16(FP), R14
+	SUBQ  CX, 104(R14)
+	JS    error_not_enough_literals
+	CMPQ  AX, $0x00020002
+	JA    sequenceDecs_decodeSync_amd64_error_match_len_too_big
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_amd64_match_len_ofs_ok
+	TESTQ AX, AX
+	JNZ   sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
+	MOVQ 24(SP), AX
+	MOVQ 8(SP), CX
+	MOVQ 16(SP), R13
+
+	// Check if we have enough space in s.out
+	LEAQ (AX)(R13*1), R14
+	ADDQ R10, R14
+	CMPQ R14, 32(SP)
+	JA   error_not_enough_space
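+
+	// 32(SP) was set up as a past-end pointer for cap(s.out); if this
+	// sequence's ll+ml would run past it, return error 5 (not enough
+	// output space) rather than write out of bounds.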
+
+	// Copy literals
+	TESTQ AX, AX
+	JZ    check_offset
+	XORQ  R14, R14
+
+copy_1:
+	MOVUPS (R11)(R14*1), X0
+	MOVUPS X0, (R10)(R14*1)
+	ADDQ   $0x10, R14
+	CMPQ   R14, AX
+	JB     copy_1
+	ADDQ   AX, R11
+	ADDQ   AX, R10
+	ADDQ   AX, R12
+
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
+check_offset:
+	MOVQ R12, AX
+	ADDQ 40(SP), AX
+	CMPQ CX, AX
+	JG   error_match_off_too_big
+	CMPQ CX, 56(SP)
+	JG   error_match_off_too_big
+
+	// Copy match from history
+	MOVQ CX, AX
+	SUBQ R12, AX
+	JLS  copy_match
+	MOVQ 48(SP), R14
+	SUBQ AX, R14
+	CMPQ R13, AX
+	JG   copy_all_from_history
+	MOVQ R13, AX
+	SUBQ $0x10, AX
+	JB   copy_4_small
+
+copy_4_loop:
+	MOVUPS (R14), X0
+	MOVUPS X0, (R10)
+	ADDQ   $0x10, R14
+	ADDQ   $0x10, R10
+	SUBQ   $0x10, AX
+	JAE    copy_4_loop
+	LEAQ   16(R14)(AX*1), R14
+	LEAQ   16(R10)(AX*1), R10
+	MOVUPS -16(R14), X0
+	MOVUPS X0, -16(R10)
+	JMP    copy_4_end
+
+copy_4_small:
+	CMPQ R13, $0x03
+	JE   copy_4_move_3
+	CMPQ R13, $0x08
+	JB   copy_4_move_4through7
+	JMP  copy_4_move_8through16
+
+copy_4_move_3:
+	MOVW (R14), AX
+	MOVB 2(R14), CL
+	MOVW AX, (R10)
+	MOVB CL, 2(R10)
+	ADDQ R13, R14
+	ADDQ R13, R10
+	JMP  copy_4_end
+
+copy_4_move_4through7:
+	MOVL (R14), AX
+	MOVL -4(R14)(R13*1), CX
+	MOVL AX, (R10)
+	MOVL CX, -4(R10)(R13*1)
+	ADDQ R13, R14
+	ADDQ R13, R10
+	JMP  copy_4_end
+
+copy_4_move_8through16:
+	MOVQ (R14), AX
+	MOVQ -8(R14)(R13*1), CX
+	MOVQ AX, (R10)
+	MOVQ CX, -8(R10)(R13*1)
+	ADDQ R13, R14
+	ADDQ R13, R10
+
+copy_4_end:
+	ADDQ R13, R12
+	JMP  handle_loop
+	JMP  loop_finished
+
+copy_all_from_history:
+	MOVQ AX, R15
+	SUBQ $0x10, R15
+	JB   copy_5_small
+
+copy_5_loop:
+	MOVUPS (R14), X0
+	MOVUPS X0, (R10)
+	ADDQ   $0x10, R14
+	ADDQ   $0x10, R10
+	SUBQ   $0x10, R15
+	JAE    copy_5_loop
+	LEAQ   16(R14)(R15*1), R14
+	LEAQ   16(R10)(R15*1), R10
+	MOVUPS -16(R14), X0
+	MOVUPS X0, -16(R10)
+	JMP    copy_5_end
+
+copy_5_small:
+	CMPQ AX, $0x03
+	JE   copy_5_move_3
+	JB   copy_5_move_1or2
+	CMPQ AX, $0x08
+	JB   copy_5_move_4through7
+	JMP  copy_5_move_8through16
+
+copy_5_move_1or2:
+	MOVB (R14), R15
+	MOVB -1(R14)(AX*1), BP
+	MOVB R15, (R10)
+	MOVB BP, -1(R10)(AX*1)
+	ADDQ AX, R14
+	ADDQ AX, R10
+	JMP  copy_5_end
+
+copy_5_move_3:
+	MOVW (R14), R15
+	MOVB 2(R14), BP
+	MOVW R15, (R10)
+	MOVB BP, 2(R10)
+	ADDQ AX, R14
+	ADDQ AX, R10
+	JMP  copy_5_end
+
+copy_5_move_4through7:
+	MOVL (R14), R15
+	MOVL -4(R14)(AX*1), BP
+	MOVL R15, (R10)
+	MOVL BP, -4(R10)(AX*1)
+	ADDQ AX, R14
+	ADDQ AX, R10
+	JMP  copy_5_end
+
+copy_5_move_8through16:
+	MOVQ (R14), R15
+	MOVQ -8(R14)(AX*1), BP
+	MOVQ R15, (R10)
+	MOVQ BP, -8(R10)(AX*1)
+	ADDQ AX, R14
+	ADDQ AX, R10
+
+copy_5_end:
+	ADDQ AX, R12
+	SUBQ AX, R13
+
+	// Copy match from the current buffer
+copy_match:
+	MOVQ R10, AX
+	SUBQ CX, AX
+
+	// ml <= mo
+	CMPQ R13, CX
+	JA   copy_overlapping_match
+
+	// Copy non-overlapping match
+	ADDQ R13, R12
+	MOVQ R10, CX
+	ADDQ R13, R10
+
+copy_2:
+	MOVUPS (AX), X0
+	MOVUPS X0, (CX)
+	ADDQ   $0x10, AX
+	ADDQ   $0x10, CX
+	SUBQ   $0x10, R13
+	JHI    copy_2
+	JMP    handle_loop
+
+	// Copy overlapping match
+copy_overlapping_match:
+	ADDQ R13, R12
+
+copy_slow_3:
+	MOVB (AX), CL
+	MOVB CL, (R10)
+	INCQ AX
+	INCQ R10
+	DECQ R13
+	JNZ  copy_slow_3
+
+handle_loop:
+	MOVQ ctx+16(FP), AX
+	DECQ 96(AX)
+	JNS  sequenceDecs_decodeSync_amd64_main_loop
+
+loop_finished:
+	MOVQ br+8(FP), AX
+	MOVQ DX, 32(AX)
+	MOVB BL, 40(AX)
+	MOVQ SI, 24(AX)
+
+	// Update the context
+	MOVQ ctx+16(FP), AX
+	MOVQ R12, 136(AX)
+	MOVQ 144(AX), CX
+	SUBQ CX, R11
+	MOVQ R11, 168(AX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length/offset mismatch error (zero offset, non-zero match length)
+sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
+	MOVQ 16(SP), AX
+	MOVQ ctx+16(FP), CX
+	MOVQ AX, 216(CX)
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decodeSync_amd64_error_match_len_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error
+error_match_off_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 8(SP), CX
+	MOVQ CX, 224(AX)
+	MOVQ R12, 136(AX)
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error
+error_not_enough_space:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ R12, 136(AX)
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: BMI, BMI2, CMOV, SSE
+TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
+	MOVQ    br+8(FP), CX
+	MOVQ    32(CX), AX
+	MOVBQZX 40(CX), DX
+	MOVQ    24(CX), BX
+	MOVQ    (CX), CX
+	ADDQ    BX, CX
+	MOVQ    CX, (SP)
+	MOVQ    ctx+16(FP), CX
+	MOVQ    72(CX), SI
+	MOVQ    80(CX), DI
+	MOVQ    88(CX), R8
+	XORQ    R9, R9
+	MOVQ    R9, 8(SP)
+	MOVQ    R9, 16(SP)
+	MOVQ    R9, 24(SP)
+	MOVQ    112(CX), R9
+	MOVQ    128(CX), R10
+	MOVQ    R10, 32(SP)
+	MOVQ    144(CX), R10
+	MOVQ    136(CX), R11
+	MOVQ    200(CX), R12
+	MOVQ    R12, 56(SP)
+	MOVQ    176(CX), R12
+	MOVQ    R12, 48(SP)
+	MOVQ    184(CX), CX
+	MOVQ    CX, 40(SP)
+	MOVQ    40(SP), CX
+	ADDQ    CX, 48(SP)
+
+	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
+	ADDQ R9, 32(SP)
+
+	// outBase += outPosition
+	ADDQ R11, R9
+
+sequenceDecs_decodeSync_bmi2_main_loop:
+	MOVQ (SP), R12
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R12
+	MOVQ (R12), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP  sequenceDecs_decodeSync_bmi2_fill_end
+
+sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decodeSync_bmi2_fill_end
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decodeSync_bmi2_fill_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R12
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R12), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
+
+sequenceDecs_decodeSync_bmi2_fill_end:
+	// Update offset
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, R8, R13
+	MOVQ   AX, R14
+	LEAQ   (DX)(R13*1), CX
+	ROLQ   CL, R14
+	BZHIQ  R13, R14, R14
+	MOVQ   CX, DX
+	MOVQ   R8, CX
+	SHRQ   $0x20, CX
+	ADDQ   R14, CX
+	MOVQ   CX, 8(SP)
+
+	// Update match length
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, DI, R13
+	MOVQ   AX, R14
+	LEAQ   (DX)(R13*1), CX
+	ROLQ   CL, R14
+	BZHIQ  R13, R14, R14
+	MOVQ   CX, DX
+	MOVQ   DI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R14, CX
+	MOVQ   CX, 16(SP)
+
+	// Fill bitreader to have enough for the remaining (literal length)
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R12
+	MOVQ (R12), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP  sequenceDecs_decodeSync_bmi2_fill_2_end
+
+sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decodeSync_bmi2_fill_2_end
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decodeSync_bmi2_fill_2_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R12
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R12), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_bmi2_fill_2_end:
+	// Update literal length
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, SI, R13
+	MOVQ   AX, R14
+	LEAQ   (DX)(R13*1), CX
+	ROLQ   CL, R14
+	BZHIQ  R13, R14, R14
+	MOVQ   CX, DX
+	MOVQ   SI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R14, CX
+	MOVQ   CX, 24(SP)
+
+	// Fill bitreader for state updates
+	MOVQ    R12, (SP)
+	MOVQ    $0x00000808, CX
+	BEXTRQ  CX, R8, R12
+	MOVQ    ctx+16(FP), CX
+	CMPQ    96(CX), $0x00
+	JZ      sequenceDecs_decodeSync_bmi2_skip_update
+	LEAQ    (SI)(DI*1), R13
+	ADDQ    R8, R13
+	MOVBQZX R13, R13
+	LEAQ    (DX)(R13*1), CX
+	MOVQ    AX, R14
+	MOVQ    CX, DX
+	ROLQ    CL, R14
+	BZHIQ   R13, R14, R14
+
+	// Update Offset State
+	BZHIQ  R8, R14, CX
+	SHRXQ  R8, R14, R14
+	MOVQ   $0x00001010, R13
+	BEXTRQ R13, R8, R8
+	ADDQ   CX, R8
+
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+	// Update Match Length State
+	BZHIQ  DI, R14, CX
+	SHRXQ  DI, R14, R14
+	MOVQ   $0x00001010, R13
+	BEXTRQ R13, DI, DI
+	ADDQ   CX, DI
+
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Literal Length State
+	BZHIQ  SI, R14, CX
+	MOVQ   $0x00001010, R13
+	BEXTRQ R13, SI, SI
+	ADDQ   CX, SI
+
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(SI*8), SI
+
+sequenceDecs_decodeSync_bmi2_skip_update:
+	// Adjust offset
+	MOVQ   s+0(FP), CX
+	MOVQ   8(SP), R13
+	CMPQ   R12, $0x01
+	JBE    sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
+	MOVUPS 144(CX), X0
+	MOVQ   R13, 144(CX)
+	MOVUPS X0, 152(CX)
+	JMP    sequenceDecs_decodeSync_bmi2_after_adjust
+
+sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
+	CMPQ 24(SP), $0x00000000
+	JNE  sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
+	INCQ R13
+	JMP  sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
+	MOVQ  144(CX), R13
+	JMP   sequenceDecs_decodeSync_bmi2_after_adjust
+
+sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
+	MOVQ    R13, R12
+	XORQ    R14, R14
+	MOVQ    $-1, R15
+	CMPQ    R13, $0x03
+	CMOVQEQ R14, R12
+	CMOVQEQ R15, R14
+	ADDQ    144(CX)(R12*8), R14
+	JNZ     sequenceDecs_decodeSync_bmi2_adjust_temp_valid
+	MOVQ    $0x00000001, R14
+
+sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
+	CMPQ R13, $0x01
+	JZ   sequenceDecs_decodeSync_bmi2_adjust_skip
+	MOVQ 152(CX), R12
+	MOVQ R12, 160(CX)
+
+sequenceDecs_decodeSync_bmi2_adjust_skip:
+	MOVQ 144(CX), R12
+	MOVQ R12, 152(CX)
+	MOVQ R14, 144(CX)
+	MOVQ R14, R13
+
+sequenceDecs_decodeSync_bmi2_after_adjust:
+	MOVQ R13, 8(SP)
+
+	// Check values
+	MOVQ  16(SP), CX
+	MOVQ  24(SP), R12
+	LEAQ  (CX)(R12*1), R14
+	MOVQ  s+0(FP), R15
+	ADDQ  R14, 256(R15)
+	MOVQ  ctx+16(FP), R14
+	SUBQ  R12, 104(R14)
+	JS    error_not_enough_literals
+	CMPQ  CX, $0x00020002
+	JA    sequenceDecs_decodeSync_bmi2_error_match_len_too_big
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
+	MOVQ 24(SP), CX
+	MOVQ 8(SP), R12
+	MOVQ 16(SP), R13
+
+	// Check if we have enough space in s.out
+	LEAQ (CX)(R13*1), R14
+	ADDQ R9, R14
+	CMPQ R14, 32(SP)
+	JA   error_not_enough_space
+
+	// Copy literals
+	TESTQ CX, CX
+	JZ    check_offset
+	XORQ  R14, R14
+
+copy_1:
+	MOVUPS (R10)(R14*1), X0
+	MOVUPS X0, (R9)(R14*1)
+	ADDQ   $0x10, R14
+	CMPQ   R14, CX
+	JB     copy_1
+	ADDQ   CX, R10
+	ADDQ   CX, R9
+	ADDQ   CX, R11
+
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
+check_offset:
+	MOVQ R11, CX
+	ADDQ 40(SP), CX
+	CMPQ R12, CX
+	JG   error_match_off_too_big
+	CMPQ R12, 56(SP)
+	JG   error_match_off_too_big
+
+	// Copy match from history
+	MOVQ R12, CX
+	SUBQ R11, CX
+	JLS  copy_match
+	MOVQ 48(SP), R14
+	SUBQ CX, R14
+	CMPQ R13, CX
+	JG   copy_all_from_history
+	MOVQ R13, CX
+	SUBQ $0x10, CX
+	JB   copy_4_small
+
+copy_4_loop:
+	MOVUPS (R14), X0
+	MOVUPS X0, (R9)
+	ADDQ   $0x10, R14
+	ADDQ   $0x10, R9
+	SUBQ   $0x10, CX
+	JAE    copy_4_loop
+	LEAQ   16(R14)(CX*1), R14
+	LEAQ   16(R9)(CX*1), R9
+	MOVUPS -16(R14), X0
+	MOVUPS X0, -16(R9)
+	JMP    copy_4_end
+
+copy_4_small:
+	CMPQ R13, $0x03
+	JE   copy_4_move_3
+	CMPQ R13, $0x08
+	JB   copy_4_move_4through7
+	JMP  copy_4_move_8through16
+
+copy_4_move_3:
+	MOVW (R14), CX
+	MOVB 2(R14), R12
+	MOVW CX, (R9)
+	MOVB R12, 2(R9)
+	ADDQ R13, R14
+	ADDQ R13, R9
+	JMP  copy_4_end
+
+copy_4_move_4through7:
+	MOVL (R14), CX
+	MOVL -4(R14)(R13*1), R12
+	MOVL CX, (R9)
+	MOVL R12, -4(R9)(R13*1)
+	ADDQ R13, R14
+	ADDQ R13, R9
+	JMP  copy_4_end
+
+copy_4_move_8through16:
+	MOVQ (R14), CX
+	MOVQ -8(R14)(R13*1), R12
+	MOVQ CX, (R9)
+	MOVQ R12, -8(R9)(R13*1)
+	ADDQ R13, R14
+	ADDQ R13, R9
+
+copy_4_end:
+	ADDQ R13, R11
+	JMP  handle_loop
+	JMP  loop_finished
+
+copy_all_from_history:
+	MOVQ CX, R15
+	SUBQ $0x10, R15
+	JB   copy_5_small
+
+copy_5_loop:
+	MOVUPS (R14), X0
+	MOVUPS X0, (R9)
+	ADDQ   $0x10, R14
+	ADDQ   $0x10, R9
+	SUBQ   $0x10, R15
+	JAE    copy_5_loop
+	LEAQ   16(R14)(R15*1), R14
+	LEAQ   16(R9)(R15*1), R9
+	MOVUPS -16(R14), X0
+	MOVUPS X0, -16(R9)
+	JMP    copy_5_end
+
+copy_5_small:
+	CMPQ CX, $0x03
+	JE   copy_5_move_3
+	JB   copy_5_move_1or2
+	CMPQ CX, $0x08
+	JB   copy_5_move_4through7
+	JMP  copy_5_move_8through16
+
+copy_5_move_1or2:
+	MOVB (R14), R15
+	MOVB -1(R14)(CX*1), BP
+	MOVB R15, (R9)
+	MOVB BP, -1(R9)(CX*1)
+	ADDQ CX, R14
+	ADDQ CX, R9
+	JMP  copy_5_end
+
+copy_5_move_3:
+	MOVW (R14), R15
+	MOVB 2(R14), BP
+	MOVW R15, (R9)
+	MOVB BP, 2(R9)
+	ADDQ CX, R14
+	ADDQ CX, R9
+	JMP  copy_5_end
+
+copy_5_move_4through7:
+	MOVL (R14), R15
+	MOVL -4(R14)(CX*1), BP
+	MOVL R15, (R9)
+	MOVL BP, -4(R9)(CX*1)
+	ADDQ CX, R14
+	ADDQ CX, R9
+	JMP  copy_5_end
+
+copy_5_move_8through16:
+	MOVQ (R14), R15
+	MOVQ -8(R14)(CX*1), BP
+	MOVQ R15, (R9)
+	MOVQ BP, -8(R9)(CX*1)
+	ADDQ CX, R14
+	ADDQ CX, R9
+
+copy_5_end:
+	ADDQ CX, R11
+	SUBQ CX, R13
+
+	// Copy match from the current buffer
+copy_match:
+	MOVQ R9, CX
+	SUBQ R12, CX
+
+	// ml <= mo
+	CMPQ R13, R12
+	JA   copy_overlapping_match
+
+	// Copy non-overlapping match
+	ADDQ R13, R11
+	MOVQ R9, R12
+	ADDQ R13, R9
+
+copy_2:
+	MOVUPS (CX), X0
+	MOVUPS X0, (R12)
+	ADDQ   $0x10, CX
+	ADDQ   $0x10, R12
+	SUBQ   $0x10, R13
+	JHI    copy_2
+	JMP    handle_loop
+
+	// Copy overlapping match
+copy_overlapping_match:
+	ADDQ R13, R11
+
+copy_slow_3:
+	MOVB (CX), R12
+	MOVB R12, (R9)
+	INCQ CX
+	INCQ R9
+	DECQ R13
+	JNZ  copy_slow_3
+
+handle_loop:
+	MOVQ ctx+16(FP), CX
+	DECQ 96(CX)
+	JNS  sequenceDecs_decodeSync_bmi2_main_loop
+
+loop_finished:
+	MOVQ br+8(FP), CX
+	MOVQ AX, 32(CX)
+	MOVB DL, 40(CX)
+	MOVQ BX, 24(CX)
+
+	// Update the context
+	MOVQ ctx+16(FP), AX
+	MOVQ R11, 136(AX)
+	MOVQ 144(AX), CX
+	SUBQ CX, R10
+	MOVQ R10, 168(AX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length/offset mismatch error (zero offset, non-zero match length)
+sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
+	MOVQ 16(SP), AX
+	MOVQ ctx+16(FP), CX
+	MOVQ AX, 216(CX)
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error
+error_match_off_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 8(SP), CX
+	MOVQ CX, 224(AX)
+	MOVQ R11, 136(AX)
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error
+error_not_enough_space:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ R11, 136(AX)
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: CMOV, SSE
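+// Safe twin of sequenceDecs_decodeSync_amd64: identical decoding, but
+// literal and match copies move exact lengths (see the small-size move
+// ladders below), for use when s.out has no 16-byte slack.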
+TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
+	MOVQ    br+8(FP), AX
+	MOVQ    32(AX), DX
+	MOVBQZX 40(AX), BX
+	MOVQ    24(AX), SI
+	MOVQ    (AX), AX
+	ADDQ    SI, AX
+	MOVQ    AX, (SP)
+	MOVQ    ctx+16(FP), AX
+	MOVQ    72(AX), DI
+	MOVQ    80(AX), R8
+	MOVQ    88(AX), R9
+	XORQ    CX, CX
+	MOVQ    CX, 8(SP)
+	MOVQ    CX, 16(SP)
+	MOVQ    CX, 24(SP)
+	MOVQ    112(AX), R10
+	MOVQ    128(AX), CX
+	MOVQ    CX, 32(SP)
+	MOVQ    144(AX), R11
+	MOVQ    136(AX), R12
+	MOVQ    200(AX), CX
+	MOVQ    CX, 56(SP)
+	MOVQ    176(AX), CX
+	MOVQ    CX, 48(SP)
+	MOVQ    184(AX), AX
+	MOVQ    AX, 40(SP)
+	MOVQ    40(SP), AX
+	ADDQ    AX, 48(SP)
+
+	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
+	ADDQ R10, 32(SP)
+
+	// outBase += outPosition
+	ADDQ R12, R10
+
+sequenceDecs_decodeSync_safe_amd64_main_loop:
+	MOVQ (SP), R13
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R13
+	MOVQ (R13), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decodeSync_safe_amd64_fill_end
+
+sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decodeSync_safe_amd64_fill_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decodeSync_safe_amd64_fill_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R13), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
+
+sequenceDecs_decodeSync_safe_amd64_fill_end:
+	// Update offset
+	MOVQ  R9, AX
+	MOVQ  BX, CX
+	MOVQ  DX, R14
+	SHLQ  CL, R14
+	MOVB  AH, CL
+	SHRQ  $0x20, AX
+	TESTQ CX, CX
+	JZ    sequenceDecs_decodeSync_safe_amd64_of_update_zero
+	ADDQ  CX, BX
+	CMPQ  BX, $0x40
+	JA    sequenceDecs_decodeSync_safe_amd64_of_update_zero
+	CMPQ  CX, $0x40
+	JAE   sequenceDecs_decodeSync_safe_amd64_of_update_zero
+	NEGQ  CX
+	SHRQ  CL, R14
+	ADDQ  R14, AX
+
+sequenceDecs_decodeSync_safe_amd64_of_update_zero:
+	MOVQ AX, 8(SP)
+
+	// Update match length
+	MOVQ  R8, AX
+	MOVQ  BX, CX
+	MOVQ  DX, R14
+	SHLQ  CL, R14
+	MOVB  AH, CL
+	SHRQ  $0x20, AX
+	TESTQ CX, CX
+	JZ    sequenceDecs_decodeSync_safe_amd64_ml_update_zero
+	ADDQ  CX, BX
+	CMPQ  BX, $0x40
+	JA    sequenceDecs_decodeSync_safe_amd64_ml_update_zero
+	CMPQ  CX, $0x40
+	JAE   sequenceDecs_decodeSync_safe_amd64_ml_update_zero
+	NEGQ  CX
+	SHRQ  CL, R14
+	ADDQ  R14, AX
+
+sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
+	MOVQ AX, 16(SP)
+
+	// Fill bitreader to have enough for the remaining (literal length)
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R13
+	MOVQ (R13), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decodeSync_safe_amd64_fill_2_end
+
+sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R13), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_safe_amd64_fill_2_end:
+	// Update literal length
+	MOVQ  DI, AX
+	MOVQ  BX, CX
+	MOVQ  DX, R14
+	SHLQ  CL, R14
+	MOVB  AH, CL
+	SHRQ  $0x20, AX
+	TESTQ CX, CX
+	JZ    sequenceDecs_decodeSync_safe_amd64_ll_update_zero
+	ADDQ  CX, BX
+	CMPQ  BX, $0x40
+	JA    sequenceDecs_decodeSync_safe_amd64_ll_update_zero
+	CMPQ  CX, $0x40
+	JAE   sequenceDecs_decodeSync_safe_amd64_ll_update_zero
+	NEGQ  CX
+	SHRQ  CL, R14
+	ADDQ  R14, AX
+
+sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
+	MOVQ AX, 24(SP)
+
+	// Fill bitreader for state updates
+	MOVQ    R13, (SP)
+	MOVQ    R9, AX
+	SHRQ    $0x08, AX
+	MOVBQZX AL, AX
+	MOVQ    ctx+16(FP), CX
+	CMPQ    96(CX), $0x00
+	JZ      sequenceDecs_decodeSync_safe_amd64_skip_update
+
+	// Update Literal Length State
+	MOVBQZX DI, R13
+	SHRQ    $0x10, DI
+	MOVWQZX DI, DI
+	LEAQ    (BX)(R13*1), CX
+	MOVQ    DX, R14
+	MOVQ    CX, BX
+	ROLQ    CL, R14
+	MOVL    $0x00000001, R15
+	MOVB    R13, CL
+	SHLL    CL, R15
+	DECL    R15
+	ANDQ    R15, R14
+	ADDQ    R14, DI
+
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Match Length State
+	MOVBQZX R8, R13
+	SHRQ    $0x10, R8
+	MOVWQZX R8, R8
+	LEAQ    (BX)(R13*1), CX
+	MOVQ    DX, R14
+	MOVQ    CX, BX
+	ROLQ    CL, R14
+	MOVL    $0x00000001, R15
+	MOVB    R13, CL
+	SHLL    CL, R15
+	DECL    R15
+	ANDQ    R15, R14
+	ADDQ    R14, R8
+
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+	// Update Offset State
+	MOVBQZX R9, R13
+	SHRQ    $0x10, R9
+	MOVWQZX R9, R9
+	LEAQ    (BX)(R13*1), CX
+	MOVQ    DX, R14
+	MOVQ    CX, BX
+	ROLQ    CL, R14
+	MOVL    $0x00000001, R15
+	MOVB    R13, CL
+	SHLL    CL, R15
+	DECL    R15
+	ANDQ    R15, R14
+	ADDQ    R14, R9
+
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decodeSync_safe_amd64_skip_update:
+	// Adjust offset
+	MOVQ   s+0(FP), CX
+	MOVQ   8(SP), R13
+	CMPQ   AX, $0x01
+	JBE    sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
+	MOVUPS 144(CX), X0
+	MOVQ   R13, 144(CX)
+	MOVUPS X0, 152(CX)
+	JMP    sequenceDecs_decodeSync_safe_amd64_after_adjust
+
+sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
+	CMPQ 24(SP), $0x00000000
+	JNE  sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
+	INCQ R13
+	JMP  sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
+	MOVQ  144(CX), R13
+	JMP   sequenceDecs_decodeSync_safe_amd64_after_adjust
+
+sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
+	MOVQ    R13, AX
+	XORQ    R14, R14
+	MOVQ    $-1, R15
+	CMPQ    R13, $0x03
+	CMOVQEQ R14, AX
+	CMOVQEQ R15, R14
+	ADDQ    144(CX)(AX*8), R14
+	JNZ     sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
+	MOVQ    $0x00000001, R14
+
+sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
+	CMPQ R13, $0x01
+	JZ   sequenceDecs_decodeSync_safe_amd64_adjust_skip
+	MOVQ 152(CX), AX
+	MOVQ AX, 160(CX)
+
+sequenceDecs_decodeSync_safe_amd64_adjust_skip:
+	MOVQ 144(CX), AX
+	MOVQ AX, 152(CX)
+	MOVQ R14, 144(CX)
+	MOVQ R14, R13
+
+sequenceDecs_decodeSync_safe_amd64_after_adjust:
+	MOVQ R13, 8(SP)
+
+	// Check values
+	MOVQ  16(SP), AX
+	MOVQ  24(SP), CX
+	LEAQ  (AX)(CX*1), R14
+	MOVQ  s+0(FP), R15
+	ADDQ  R14, 256(R15)
+	MOVQ  ctx+16(FP), R14
+	SUBQ  CX, 104(R14)
+	JS    error_not_enough_literals
+	CMPQ  AX, $0x00020002
+	JA    sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
+	TESTQ AX, AX
+	JNZ   sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
+	MOVQ 24(SP), AX
+	MOVQ 8(SP), CX
+	MOVQ 16(SP), R13
+
+	// Check if we have enough space in s.out
+	LEAQ (AX)(R13*1), R14
+	ADDQ R10, R14
+	CMPQ R14, 32(SP)
+	JA   error_not_enough_space
+
+	// Copy literals
+	TESTQ AX, AX
+	JZ    check_offset
+	MOVQ  AX, R14
+	SUBQ  $0x10, R14
+	JB    copy_1_small
+
+copy_1_loop:
+	MOVUPS (R11), X0
+	MOVUPS X0, (R10)
+	ADDQ   $0x10, R11
+	ADDQ   $0x10, R10
+	SUBQ   $0x10, R14
+	JAE    copy_1_loop
+	LEAQ   16(R11)(R14*1), R11
+	LEAQ   16(R10)(R14*1), R10
+	MOVUPS -16(R11), X0
+	MOVUPS X0, -16(R10)
+	JMP    copy_1_end
+
+copy_1_small:
+	CMPQ AX, $0x03
+	JE   copy_1_move_3
+	JB   copy_1_move_1or2
+	CMPQ AX, $0x08
+	JB   copy_1_move_4through7
+	JMP  copy_1_move_8through16
+
+copy_1_move_1or2:
+	MOVB (R11), R14
+	MOVB -1(R11)(AX*1), R15
+	MOVB R14, (R10)
+	MOVB R15, -1(R10)(AX*1)
+	ADDQ AX, R11
+	ADDQ AX, R10
+	JMP  copy_1_end
+
+copy_1_move_3:
+	MOVW (R11), R14
+	MOVB 2(R11), R15
+	MOVW R14, (R10)
+	MOVB R15, 2(R10)
+	ADDQ AX, R11
+	ADDQ AX, R10
+	JMP  copy_1_end
+
+copy_1_move_4through7:
+	MOVL (R11), R14
+	MOVL -4(R11)(AX*1), R15
+	MOVL R14, (R10)
+	MOVL R15, -4(R10)(AX*1)
+	ADDQ AX, R11
+	ADDQ AX, R10
+	JMP  copy_1_end
+
+copy_1_move_8through16:
+	MOVQ (R11), R14
+	MOVQ -8(R11)(AX*1), R15
+	MOVQ R14, (R10)
+	MOVQ R15, -8(R10)(AX*1)
+	ADDQ AX, R11
+	ADDQ AX, R10
+
+copy_1_end:
+	ADDQ AX, R12
+
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
+check_offset:
+	MOVQ R12, AX
+	ADDQ 40(SP), AX
+	CMPQ CX, AX
+	JG   error_match_off_too_big
+	CMPQ CX, 56(SP)
+	JG   error_match_off_too_big
+
+	// Copy match from history
+	MOVQ CX, AX
+	SUBQ R12, AX
+	JLS  copy_match
+	MOVQ 48(SP), R14
+	SUBQ AX, R14
+	CMPQ R13, AX
+	JG   copy_all_from_history
+	MOVQ R13, AX
+	SUBQ $0x10, AX
+	JB   copy_4_small
+
+copy_4_loop:
+	MOVUPS (R14), X0
+	MOVUPS X0, (R10)
+	ADDQ   $0x10, R14
+	ADDQ   $0x10, R10
+	SUBQ   $0x10, AX
+	JAE    copy_4_loop
+	LEAQ   16(R14)(AX*1), R14
+	LEAQ   16(R10)(AX*1), R10
+	MOVUPS -16(R14), X0
+	MOVUPS X0, -16(R10)
+	JMP    copy_4_end
+
+copy_4_small:
+	CMPQ R13, $0x03
+	JE   copy_4_move_3
+	CMPQ R13, $0x08
+	JB   copy_4_move_4through7
+	JMP  copy_4_move_8through16
+
+copy_4_move_3:
+	MOVW (R14), AX
+	MOVB 2(R14), CL
+	MOVW AX, (R10)
+	MOVB CL, 2(R10)
+	ADDQ R13, R14
+	ADDQ R13, R10
+	JMP  copy_4_end
+
+copy_4_move_4through7:
+	MOVL (R14), AX
+	MOVL -4(R14)(R13*1), CX
+	MOVL AX, (R10)
+	MOVL CX, -4(R10)(R13*1)
+	ADDQ R13, R14
+	ADDQ R13, R10
+	JMP  copy_4_end
+
+copy_4_move_8through16:
+	MOVQ (R14), AX
+	MOVQ -8(R14)(R13*1), CX
+	MOVQ AX, (R10)
+	MOVQ CX, -8(R10)(R13*1)
+	ADDQ R13, R14
+	ADDQ R13, R10
+
+copy_4_end:
+	ADDQ R13, R12
+	JMP  handle_loop
+	JMP  loop_finished
+
+copy_all_from_history:
+	MOVQ AX, R15
+	SUBQ $0x10, R15
+	JB   copy_5_small
+
+copy_5_loop:
+	MOVUPS (R14), X0
+	MOVUPS X0, (R10)
+	ADDQ   $0x10, R14
+	ADDQ   $0x10, R10
+	SUBQ   $0x10, R15
+	JAE    copy_5_loop
+	LEAQ   16(R14)(R15*1), R14
+	LEAQ   16(R10)(R15*1), R10
+	MOVUPS -16(R14), X0
+	MOVUPS X0, -16(R10)
+	JMP    copy_5_end
+
+copy_5_small:
+	CMPQ AX, $0x03
+	JE   copy_5_move_3
+	JB   copy_5_move_1or2
+	CMPQ AX, $0x08
+	JB   copy_5_move_4through7
+	JMP  copy_5_move_8through16
+
+copy_5_move_1or2:
+	MOVB (R14), R15
+	MOVB -1(R14)(AX*1), BP
+	MOVB R15, (R10)
+	MOVB BP, -1(R10)(AX*1)
+	ADDQ AX, R14
+	ADDQ AX, R10
+	JMP  copy_5_end
+
+copy_5_move_3:
+	MOVW (R14), R15
+	MOVB 2(R14), BP
+	MOVW R15, (R10)
+	MOVB BP, 2(R10)
+	ADDQ AX, R14
+	ADDQ AX, R10
+	JMP  copy_5_end
+
+copy_5_move_4through7:
+	MOVL (R14), R15
+	MOVL -4(R14)(AX*1), BP
+	MOVL R15, (R10)
+	MOVL BP, -4(R10)(AX*1)
+	ADDQ AX, R14
+	ADDQ AX, R10
+	JMP  copy_5_end
+
+copy_5_move_8through16:
+	MOVQ (R14), R15
+	MOVQ -8(R14)(AX*1), BP
+	MOVQ R15, (R10)
+	MOVQ BP, -8(R10)(AX*1)
+	ADDQ AX, R14
+	ADDQ AX, R10
+
+copy_5_end:
+	ADDQ AX, R12
+	SUBQ AX, R13
+
+	// Copy match from the current buffer
+copy_match:
+	MOVQ R10, AX
+	SUBQ CX, AX
+
+	// ml <= mo
+	CMPQ R13, CX
+	JA   copy_overlapping_match
+
+	// Copy non-overlapping match
+	ADDQ R13, R12
+	MOVQ R13, CX
+	SUBQ $0x10, CX
+	JB   copy_2_small
+
+copy_2_loop:
+	MOVUPS (AX), X0
+	MOVUPS X0, (R10)
+	ADDQ   $0x10, AX
+	ADDQ   $0x10, R10
+	SUBQ   $0x10, CX
+	JAE    copy_2_loop
+	LEAQ   16(AX)(CX*1), AX
+	LEAQ   16(R10)(CX*1), R10
+	MOVUPS -16(AX), X0
+	MOVUPS X0, -16(R10)
+	JMP    copy_2_end
+
+copy_2_small:
+	CMPQ R13, $0x03
+	JE   copy_2_move_3
+	JB   copy_2_move_1or2
+	CMPQ R13, $0x08
+	JB   copy_2_move_4through7
+	JMP  copy_2_move_8through16
+
+copy_2_move_1or2:
+	MOVB (AX), CL
+	MOVB -1(AX)(R13*1), R14
+	MOVB CL, (R10)
+	MOVB R14, -1(R10)(R13*1)
+	ADDQ R13, AX
+	ADDQ R13, R10
+	JMP  copy_2_end
+
+copy_2_move_3:
+	MOVW (AX), CX
+	MOVB 2(AX), R14
+	MOVW CX, (R10)
+	MOVB R14, 2(R10)
+	ADDQ R13, AX
+	ADDQ R13, R10
+	JMP  copy_2_end
+
+copy_2_move_4through7:
+	MOVL (AX), CX
+	MOVL -4(AX)(R13*1), R14
+	MOVL CX, (R10)
+	MOVL R14, -4(R10)(R13*1)
+	ADDQ R13, AX
+	ADDQ R13, R10
+	JMP  copy_2_end
+
+copy_2_move_8through16:
+	MOVQ (AX), CX
+	MOVQ -8(AX)(R13*1), R14
+	MOVQ CX, (R10)
+	MOVQ R14, -8(R10)(R13*1)
+	ADDQ R13, AX
+	ADDQ R13, R10
+
+copy_2_end:
+	JMP handle_loop
+
+	// Copy overlapping match
+copy_overlapping_match:
+	ADDQ R13, R12
+
+copy_slow_3:
+	MOVB (AX), CL
+	MOVB CL, (R10)
+	INCQ AX
+	INCQ R10
+	DECQ R13
+	JNZ  copy_slow_3
+
+handle_loop:
+	MOVQ ctx+16(FP), AX
+	DECQ 96(AX)
+	JNS  sequenceDecs_decodeSync_safe_amd64_main_loop
+
+loop_finished:
+	MOVQ br+8(FP), AX
+	MOVQ DX, 32(AX)
+	MOVB BL, 40(AX)
+	MOVQ SI, 24(AX)
+
+	// Update the context
+	MOVQ ctx+16(FP), AX
+	MOVQ R12, 136(AX)
+	MOVQ 144(AX), CX
+	SUBQ CX, R11
+	MOVQ R11, 168(AX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error
+sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
+	MOVQ 16(SP), AX
+	MOVQ ctx+16(FP), CX
+	MOVQ AX, 216(CX)
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error
+error_match_off_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 8(SP), CX
+	MOVQ CX, 224(AX)
+	MOVQ R12, 136(AX)
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error
+error_not_enough_space:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ R12, 136(AX)
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: BMI, BMI2, CMOV, SSE
+TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
+	MOVQ    br+8(FP), CX
+	MOVQ    32(CX), AX
+	MOVBQZX 40(CX), DX
+	MOVQ    24(CX), BX
+	MOVQ    (CX), CX
+	ADDQ    BX, CX
+	MOVQ    CX, (SP)
+	MOVQ    ctx+16(FP), CX
+	MOVQ    72(CX), SI
+	MOVQ    80(CX), DI
+	MOVQ    88(CX), R8
+	XORQ    R9, R9
+	MOVQ    R9, 8(SP)
+	MOVQ    R9, 16(SP)
+	MOVQ    R9, 24(SP)
+	MOVQ    112(CX), R9
+	MOVQ    128(CX), R10
+	MOVQ    R10, 32(SP)
+	MOVQ    144(CX), R10
+	MOVQ    136(CX), R11
+	MOVQ    200(CX), R12
+	MOVQ    R12, 56(SP)
+	MOVQ    176(CX), R12
+	MOVQ    R12, 48(SP)
+	MOVQ    184(CX), CX
+	MOVQ    CX, 40(SP)
+	MOVQ    40(SP), CX
+	ADDQ    CX, 48(SP)
+
+	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
+	ADDQ R9, 32(SP)
+
+	// outBase += outPosition
+	ADDQ R11, R9
+
+sequenceDecs_decodeSync_safe_bmi2_main_loop:
+	MOVQ (SP), R12
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R12
+	MOVQ (R12), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP  sequenceDecs_decodeSync_safe_bmi2_fill_end
+
+sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_end
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R12
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R12), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
+
+sequenceDecs_decodeSync_safe_bmi2_fill_end:
+	// Update offset
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, R8, R13
+	MOVQ   AX, R14
+	LEAQ   (DX)(R13*1), CX
+	ROLQ   CL, R14
+	BZHIQ  R13, R14, R14
+	MOVQ   CX, DX
+	MOVQ   R8, CX
+	SHRQ   $0x20, CX
+	ADDQ   R14, CX
+	MOVQ   CX, 8(SP)
+
+	// Update match length
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, DI, R13
+	MOVQ   AX, R14
+	LEAQ   (DX)(R13*1), CX
+	ROLQ   CL, R14
+	BZHIQ  R13, R14, R14
+	MOVQ   CX, DX
+	MOVQ   DI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R14, CX
+	MOVQ   CX, 16(SP)
+
+	// Fill bitreader to have enough for the remaining (literal length).
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R12
+	MOVQ (R12), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP  sequenceDecs_decodeSync_safe_bmi2_fill_2_end
+
+sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_end
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R12
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R12), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
+	// Update literal length
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, SI, R13
+	MOVQ   AX, R14
+	LEAQ   (DX)(R13*1), CX
+	ROLQ   CL, R14
+	BZHIQ  R13, R14, R14
+	MOVQ   CX, DX
+	MOVQ   SI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R14, CX
+	MOVQ   CX, 24(SP)
+
+	// Fill bitreader for state updates
+	MOVQ    R12, (SP)
+	MOVQ    $0x00000808, CX
+	BEXTRQ  CX, R8, R12
+	MOVQ    ctx+16(FP), CX
+	CMPQ    96(CX), $0x00
+	JZ      sequenceDecs_decodeSync_safe_bmi2_skip_update
+	LEAQ    (SI)(DI*1), R13
+	ADDQ    R8, R13
+	MOVBQZX R13, R13
+	LEAQ    (DX)(R13*1), CX
+	MOVQ    AX, R14
+	MOVQ    CX, DX
+	ROLQ    CL, R14
+	BZHIQ   R13, R14, R14
+
+	// Update Offset State
+	BZHIQ  R8, R14, CX
+	SHRXQ  R8, R14, R14
+	MOVQ   $0x00001010, R13
+	BEXTRQ R13, R8, R8
+	ADDQ   CX, R8
+
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+	// Update Match Length State
+	BZHIQ  DI, R14, CX
+	SHRXQ  DI, R14, R14
+	MOVQ   $0x00001010, R13
+	BEXTRQ R13, DI, DI
+	ADDQ   CX, DI
+
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Literal Length State
+	BZHIQ  SI, R14, CX
+	MOVQ   $0x00001010, R13
+	BEXTRQ R13, SI, SI
+	ADDQ   CX, SI
+
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(SI*8), SI
+
+sequenceDecs_decodeSync_safe_bmi2_skip_update:
+	// Adjust offset
+	MOVQ   s+0(FP), CX
+	MOVQ   8(SP), R13
+	CMPQ   R12, $0x01
+	JBE    sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
+	MOVUPS 144(CX), X0
+	MOVQ   R13, 144(CX)
+	MOVUPS X0, 152(CX)
+	JMP    sequenceDecs_decodeSync_safe_bmi2_after_adjust
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
+	CMPQ 24(SP), $0x00000000
+	JNE  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
+	INCQ R13
+	JMP  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
+	MOVQ  144(CX), R13
+	JMP   sequenceDecs_decodeSync_safe_bmi2_after_adjust
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
+	MOVQ    R13, R12
+	XORQ    R14, R14
+	MOVQ    $-1, R15
+	CMPQ    R13, $0x03
+	CMOVQEQ R14, R12
+	CMOVQEQ R15, R14
+	ADDQ    144(CX)(R12*8), R14
+	JNZ     sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
+	MOVQ    $0x00000001, R14
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
+	CMPQ R13, $0x01
+	JZ   sequenceDecs_decodeSync_safe_bmi2_adjust_skip
+	MOVQ 152(CX), R12
+	MOVQ R12, 160(CX)
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
+	MOVQ 144(CX), R12
+	MOVQ R12, 152(CX)
+	MOVQ R14, 144(CX)
+	MOVQ R14, R13
+
+sequenceDecs_decodeSync_safe_bmi2_after_adjust:
+	MOVQ R13, 8(SP)
+
+	// Check values
+	MOVQ  16(SP), CX
+	MOVQ  24(SP), R12
+	LEAQ  (CX)(R12*1), R14
+	MOVQ  s+0(FP), R15
+	ADDQ  R14, 256(R15)
+	MOVQ  ctx+16(FP), R14
+	SUBQ  R12, 104(R14)
+	JS    error_not_enough_literals
+	CMPQ  CX, $0x00020002
+	JA    sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
+	MOVQ 24(SP), CX
+	MOVQ 8(SP), R12
+	MOVQ 16(SP), R13
+
+	// Check if we have enough space in s.out
+	LEAQ (CX)(R13*1), R14
+	ADDQ R9, R14
+	CMPQ R14, 32(SP)
+	JA   error_not_enough_space
+
+	// Copy literals
+	TESTQ CX, CX
+	JZ    check_offset
+	MOVQ  CX, R14
+	SUBQ  $0x10, R14
+	JB    copy_1_small
+
+copy_1_loop:
+	MOVUPS (R10), X0
+	MOVUPS X0, (R9)
+	ADDQ   $0x10, R10
+	ADDQ   $0x10, R9
+	SUBQ   $0x10, R14
+	JAE    copy_1_loop
+	LEAQ   16(R10)(R14*1), R10
+	LEAQ   16(R9)(R14*1), R9
+	MOVUPS -16(R10), X0
+	MOVUPS X0, -16(R9)
+	JMP    copy_1_end
+
+copy_1_small:
+	CMPQ CX, $0x03
+	JE   copy_1_move_3
+	JB   copy_1_move_1or2
+	CMPQ CX, $0x08
+	JB   copy_1_move_4through7
+	JMP  copy_1_move_8through16
+
+copy_1_move_1or2:
+	MOVB (R10), R14
+	MOVB -1(R10)(CX*1), R15
+	MOVB R14, (R9)
+	MOVB R15, -1(R9)(CX*1)
+	ADDQ CX, R10
+	ADDQ CX, R9
+	JMP  copy_1_end
+
+copy_1_move_3:
+	MOVW (R10), R14
+	MOVB 2(R10), R15
+	MOVW R14, (R9)
+	MOVB R15, 2(R9)
+	ADDQ CX, R10
+	ADDQ CX, R9
+	JMP  copy_1_end
+
+copy_1_move_4through7:
+	MOVL (R10), R14
+	MOVL -4(R10)(CX*1), R15
+	MOVL R14, (R9)
+	MOVL R15, -4(R9)(CX*1)
+	ADDQ CX, R10
+	ADDQ CX, R9
+	JMP  copy_1_end
+
+copy_1_move_8through16:
+	MOVQ (R10), R14
+	MOVQ -8(R10)(CX*1), R15
+	MOVQ R14, (R9)
+	MOVQ R15, -8(R9)(CX*1)
+	ADDQ CX, R10
+	ADDQ CX, R9
+
+copy_1_end:
+	ADDQ CX, R11
+
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
+check_offset:
+	MOVQ R11, CX
+	ADDQ 40(SP), CX
+	CMPQ R12, CX
+	JG   error_match_off_too_big
+	CMPQ R12, 56(SP)
+	JG   error_match_off_too_big
+
+	// Copy match from history
+	MOVQ R12, CX
+	SUBQ R11, CX
+	JLS  copy_match
+	MOVQ 48(SP), R14
+	SUBQ CX, R14
+	CMPQ R13, CX
+	JG   copy_all_from_history
+	MOVQ R13, CX
+	SUBQ $0x10, CX
+	JB   copy_4_small
+
+copy_4_loop:
+	MOVUPS (R14), X0
+	MOVUPS X0, (R9)
+	ADDQ   $0x10, R14
+	ADDQ   $0x10, R9
+	SUBQ   $0x10, CX
+	JAE    copy_4_loop
+	LEAQ   16(R14)(CX*1), R14
+	LEAQ   16(R9)(CX*1), R9
+	MOVUPS -16(R14), X0
+	MOVUPS X0, -16(R9)
+	JMP    copy_4_end
+
+copy_4_small:
+	CMPQ R13, $0x03
+	JE   copy_4_move_3
+	CMPQ R13, $0x08
+	JB   copy_4_move_4through7
+	JMP  copy_4_move_8through16
+
+copy_4_move_3:
+	MOVW (R14), CX
+	MOVB 2(R14), R12
+	MOVW CX, (R9)
+	MOVB R12, 2(R9)
+	ADDQ R13, R14
+	ADDQ R13, R9
+	JMP  copy_4_end
+
+copy_4_move_4through7:
+	MOVL (R14), CX
+	MOVL -4(R14)(R13*1), R12
+	MOVL CX, (R9)
+	MOVL R12, -4(R9)(R13*1)
+	ADDQ R13, R14
+	ADDQ R13, R9
+	JMP  copy_4_end
+
+copy_4_move_8through16:
+	MOVQ (R14), CX
+	MOVQ -8(R14)(R13*1), R12
+	MOVQ CX, (R9)
+	MOVQ R12, -8(R9)(R13*1)
+	ADDQ R13, R14
+	ADDQ R13, R9
+
+copy_4_end:
+	ADDQ R13, R11
+	JMP  handle_loop
+	JMP  loop_finished
+
+copy_all_from_history:
+	MOVQ CX, R15
+	SUBQ $0x10, R15
+	JB   copy_5_small
+
+copy_5_loop:
+	MOVUPS (R14), X0
+	MOVUPS X0, (R9)
+	ADDQ   $0x10, R14
+	ADDQ   $0x10, R9
+	SUBQ   $0x10, R15
+	JAE    copy_5_loop
+	LEAQ   16(R14)(R15*1), R14
+	LEAQ   16(R9)(R15*1), R9
+	MOVUPS -16(R14), X0
+	MOVUPS X0, -16(R9)
+	JMP    copy_5_end
+
+copy_5_small:
+	CMPQ CX, $0x03
+	JE   copy_5_move_3
+	JB   copy_5_move_1or2
+	CMPQ CX, $0x08
+	JB   copy_5_move_4through7
+	JMP  copy_5_move_8through16
+
+copy_5_move_1or2:
+	MOVB (R14), R15
+	MOVB -1(R14)(CX*1), BP
+	MOVB R15, (R9)
+	MOVB BP, -1(R9)(CX*1)
+	ADDQ CX, R14
+	ADDQ CX, R9
+	JMP  copy_5_end
+
+copy_5_move_3:
+	MOVW (R14), R15
+	MOVB 2(R14), BP
+	MOVW R15, (R9)
+	MOVB BP, 2(R9)
+	ADDQ CX, R14
+	ADDQ CX, R9
+	JMP  copy_5_end
+
+copy_5_move_4through7:
+	MOVL (R14), R15
+	MOVL -4(R14)(CX*1), BP
+	MOVL R15, (R9)
+	MOVL BP, -4(R9)(CX*1)
+	ADDQ CX, R14
+	ADDQ CX, R9
+	JMP  copy_5_end
+
+copy_5_move_8through16:
+	MOVQ (R14), R15
+	MOVQ -8(R14)(CX*1), BP
+	MOVQ R15, (R9)
+	MOVQ BP, -8(R9)(CX*1)
+	ADDQ CX, R14
+	ADDQ CX, R9
+
+copy_5_end:
+	ADDQ CX, R11
+	SUBQ CX, R13
+
+	// Copy match from the current buffer
+copy_match:
+	MOVQ R9, CX
+	SUBQ R12, CX
+
+	// ml <= mo
+	CMPQ R13, R12
+	JA   copy_overlapping_match
+
+	// Copy non-overlapping match
+	ADDQ R13, R11
+	MOVQ R13, R12
+	SUBQ $0x10, R12
+	JB   copy_2_small
+
+copy_2_loop:
+	MOVUPS (CX), X0
+	MOVUPS X0, (R9)
+	ADDQ   $0x10, CX
+	ADDQ   $0x10, R9
+	SUBQ   $0x10, R12
+	JAE    copy_2_loop
+	LEAQ   16(CX)(R12*1), CX
+	LEAQ   16(R9)(R12*1), R9
+	MOVUPS -16(CX), X0
+	MOVUPS X0, -16(R9)
+	JMP    copy_2_end
+
+copy_2_small:
+	CMPQ R13, $0x03
+	JE   copy_2_move_3
+	JB   copy_2_move_1or2
+	CMPQ R13, $0x08
+	JB   copy_2_move_4through7
+	JMP  copy_2_move_8through16
+
+copy_2_move_1or2:
+	MOVB (CX), R12
+	MOVB -1(CX)(R13*1), R14
+	MOVB R12, (R9)
+	MOVB R14, -1(R9)(R13*1)
+	ADDQ R13, CX
+	ADDQ R13, R9
+	JMP  copy_2_end
+
+copy_2_move_3:
+	MOVW (CX), R12
+	MOVB 2(CX), R14
+	MOVW R12, (R9)
+	MOVB R14, 2(R9)
+	ADDQ R13, CX
+	ADDQ R13, R9
+	JMP  copy_2_end
+
+copy_2_move_4through7:
+	MOVL (CX), R12
+	MOVL -4(CX)(R13*1), R14
+	MOVL R12, (R9)
+	MOVL R14, -4(R9)(R13*1)
+	ADDQ R13, CX
+	ADDQ R13, R9
+	JMP  copy_2_end
+
+copy_2_move_8through16:
+	MOVQ (CX), R12
+	MOVQ -8(CX)(R13*1), R14
+	MOVQ R12, (R9)
+	MOVQ R14, -8(R9)(R13*1)
+	ADDQ R13, CX
+	ADDQ R13, R9
+
+copy_2_end:
+	JMP handle_loop
+
+	// Copy overlapping match
+copy_overlapping_match:
+	ADDQ R13, R11
+
+copy_slow_3:
+	MOVB (CX), R12
+	MOVB R12, (R9)
+	INCQ CX
+	INCQ R9
+	DECQ R13
+	JNZ  copy_slow_3
+
+handle_loop:
+	MOVQ ctx+16(FP), CX
+	DECQ 96(CX)
+	JNS  sequenceDecs_decodeSync_safe_bmi2_main_loop
+
+loop_finished:
+	MOVQ br+8(FP), CX
+	MOVQ AX, 32(CX)
+	MOVB DL, 40(CX)
+	MOVQ BX, 24(CX)
+
+	// Update the context
+	MOVQ ctx+16(FP), AX
+	MOVQ R11, 136(AX)
+	MOVQ 144(AX), CX
+	SUBQ CX, R10
+	MOVQ R10, 168(AX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error
+sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
+	MOVQ 16(SP), AX
+	MOVQ ctx+16(FP), CX
+	MOVQ AX, 216(CX)
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error
+error_match_off_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 8(SP), CX
+	MOVQ CX, 224(AX)
+	MOVQ R11, 136(AX)
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error
+error_not_enough_space:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ R11, 136(AX)
+	MOVQ $0x00000005, ret+24(FP)
+	RET
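
Both decodeSync_safe variants report their status through ret+24(FP): 0 from loop_finished, and codes 1-5 from the labeled error returns above. For orientation, a minimal Go sketch of that mapping; the constant and function names are illustrative, not the package's own:

	package main

	import "fmt"

	// Status codes written to ret+24(FP) by the decodeSync assembly above.
	// Names are hypothetical; the real handling lives in the Go wrapper.
	const (
		seqDecodeOK          = 0 // loop_finished: all sequences decoded
		seqErrLenOfsMismatch = 1 // match length > 0 while offset == 0
		seqErrMatchLenTooBig = 2 // match length above $0x00020002 (131074)
		seqErrMatchOffTooBig = 3 // offset beyond history and window size
		seqErrNotEnoughLits  = 4 // literal count exceeds remaining literals
		seqErrNotEnoughSpace = 5 // output would exceed cap(s.out)
	)

	func seqDecodeError(code int) error {
		switch code {
		case seqDecodeOK:
			return nil
		case seqErrLenOfsMismatch:
			return fmt.Errorf("zero match offset with nonzero match length")
		case seqErrMatchLenTooBig:
			return fmt.Errorf("match length bigger than maximum allowed")
		case seqErrMatchOffTooBig:
			return fmt.Errorf("match offset bigger than history and window size")
		case seqErrNotEnoughLits:
			return fmt.Errorf("not enough literals to satisfy sequence")
		case seqErrNotEnoughSpace:
			return fmt.Errorf("not enough space in output buffer")
		default:
			return fmt.Errorf("unknown decodeSync status %d", code)
		}
	}

	func main() {
		fmt.Println(seqDecodeError(3))
	}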

+ 237 - 0
vendor/github.com/klauspost/compress/zstd/seqdec_generic.go

@@ -0,0 +1,237 @@
+//go:build !amd64 || appengine || !gc || noasm
+// +build !amd64 appengine !gc noasm
+
+package zstd
+
+import (
+	"fmt"
+	"io"
+)
+
+// decodeSyncSimple is implemented only as an assembly fast path; this
+// generic fallback reports it as unavailable, so callers use decode
+// followed by executeSimple instead.
+func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
+	return false, nil
+}
+
+// decode reads all sequences from the stream into seqs without executing them.
+func (s *sequenceDecs) decode(seqs []seqVals) error {
+	br := s.br
+
+	// Grab full-size tables to avoid bounds checks.
+	llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
+	llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
+	s.seqSize = 0
+	litRemain := len(s.literals)
+
+	maxBlockSize := maxCompressedBlockSize
+	if s.windowSize < maxBlockSize {
+		maxBlockSize = s.windowSize
+	}
+	for i := range seqs {
+		var ll, mo, ml int
+		if br.off > 4+((maxOffsetBits+16+16)>>3) {
+			// inlined function:
+			// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
+
+			// Final will not read from stream.
+			var llB, mlB, moB uint8
+			ll, llB = llState.final()
+			ml, mlB = mlState.final()
+			mo, moB = ofState.final()
+
+			// extra bits are stored in reverse order.
+			br.fillFast()
+			mo += br.getBits(moB)
+			if s.maxBits > 32 {
+				br.fillFast()
+			}
+			ml += br.getBits(mlB)
+			ll += br.getBits(llB)
+
+			if moB > 1 {
+				s.prevOffset[2] = s.prevOffset[1]
+				s.prevOffset[1] = s.prevOffset[0]
+				s.prevOffset[0] = mo
+			} else {
+				// mo = s.adjustOffset(mo, ll, moB)
+				// Inlined for rather big speedup
+				if ll == 0 {
+					// There is an exception though, when current sequence's literals_length = 0.
+					// In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
+					// an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
+					mo++
+				}
+
+				if mo == 0 {
+					mo = s.prevOffset[0]
+				} else {
+					var temp int
+					if mo == 3 {
+						temp = s.prevOffset[0] - 1
+					} else {
+						temp = s.prevOffset[mo]
+					}
+
+					if temp == 0 {
+						// 0 is not valid; input is corrupted; force offset to 1
+						println("WARNING: temp was 0")
+						temp = 1
+					}
+
+					if mo != 1 {
+						s.prevOffset[2] = s.prevOffset[1]
+					}
+					s.prevOffset[1] = s.prevOffset[0]
+					s.prevOffset[0] = temp
+					mo = temp
+				}
+			}
+			br.fillFast()
+		} else {
+			if br.overread() {
+				if debugDecoder {
+					printf("reading sequence %d, exceeded available data\n", i)
+				}
+				return io.ErrUnexpectedEOF
+			}
+			ll, mo, ml = s.next(br, llState, mlState, ofState)
+			br.fill()
+		}
+
+		if debugSequences {
+			println("Seq", i, "Litlen:", ll, "mo:", mo, "(abs) ml:", ml)
+		}
+		// Evaluate.
+		// We might be doing this async, so do it early.
+		if mo == 0 && ml > 0 {
+			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
+		}
+		if ml > maxMatchLen {
+			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
+		}
+		s.seqSize += ll + ml
+		if s.seqSize > maxBlockSize {
+			return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+		}
+		litRemain -= ll
+		if litRemain < 0 {
+			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, litRemain+ll)
+		}
+		seqs[i] = seqVals{
+			ll: ll,
+			ml: ml,
+			mo: mo,
+		}
+		if i == len(seqs)-1 {
+			// This is the last sequence, so we shouldn't update state.
+			break
+		}
+
+		// Manually inlined, ~ 5-20% faster
+		// Update all 3 states at once. Approx 20% faster.
+		nBits := llState.nbBits() + mlState.nbBits() + ofState.nbBits()
+		if nBits == 0 {
+			llState = llTable[llState.newState()&maxTableMask]
+			mlState = mlTable[mlState.newState()&maxTableMask]
+			ofState = ofTable[ofState.newState()&maxTableMask]
+		} else {
+			bits := br.get32BitsFast(nBits)
+			lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
+			llState = llTable[(llState.newState()+lowBits)&maxTableMask]
+
+			lowBits = uint16(bits >> (ofState.nbBits() & 31))
+			lowBits &= bitMask[mlState.nbBits()&15]
+			mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask]
+
+			lowBits = uint16(bits) & bitMask[ofState.nbBits()&15]
+			ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask]
+		}
+	}
+	s.seqSize += litRemain
+	if s.seqSize > maxBlockSize {
+		return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+	}
+	err := br.close()
+	if err != nil {
+		printf("Closing sequences: %v, %+v\n", err, *br)
+	}
+	return err
+}
+
+// executeSimple handles cases when a dictionary is not used.
+func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error {
+	// Ensure we have enough output size...
+	if len(s.out)+s.seqSize > cap(s.out) {
+		addBytes := s.seqSize + len(s.out)
+		s.out = append(s.out, make([]byte, addBytes)...)
+		s.out = s.out[:len(s.out)-addBytes]
+	}
+
+	if debugDecoder {
+		printf("Execute %d seqs with literals: %d into %d bytes\n", len(seqs), len(s.literals), s.seqSize)
+	}
+
+	var t = len(s.out)
+	out := s.out[:t+s.seqSize]
+
+	for _, seq := range seqs {
+		// Add literals
+		copy(out[t:], s.literals[:seq.ll])
+		t += seq.ll
+		s.literals = s.literals[seq.ll:]
+
+		// Malformed input
+		if seq.mo > t+len(hist) || seq.mo > s.windowSize {
+			return fmt.Errorf("match offset (%d) bigger than current history (%d)", seq.mo, t+len(hist))
+		}
+
+		// Copy from history.
+		if v := seq.mo - t; v > 0 {
+			// v is the start position in history from end.
+			start := len(hist) - v
+			if seq.ml > v {
+				// Some goes into the current block.
+				// Copy remainder of history
+				copy(out[t:], hist[start:])
+				t += v
+				seq.ml -= v
+			} else {
+				copy(out[t:], hist[start:start+seq.ml])
+				t += seq.ml
+				continue
+			}
+		}
+
+		// We must be in the current buffer now
+		if seq.ml > 0 {
+			start := t - seq.mo
+			if seq.ml <= t-start {
+				// No overlap
+				copy(out[t:], out[start:start+seq.ml])
+				t += seq.ml
+			} else {
+				// Overlapping copy
+				// Extend destination slice and copy one byte at a time.
+				src := out[start : start+seq.ml]
+				dst := out[t:]
+				dst = dst[:len(src)]
+				t += len(src)
+				// Destination is the space we just added.
+				for i := range src {
+					dst[i] = src[i]
+				}
+			}
+		}
+	}
+	// Add final literals
+	copy(out[t:], s.literals)
+	if debugDecoder {
+		t += len(s.literals)
+		if t != len(out) {
+			panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize))
+		}
+	}
+	s.out = out
+
+	return nil
+}
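
The byte-by-byte loop in the overlapping branch above is deliberate: Go's built-in copy behaves like memmove and preserves the original source bytes, while a zstd match with offset < length must replicate bytes it has just produced. A self-contained illustration (buffer contents are made up):

	package main

	import "fmt"

	func main() {
		// Match at position t=1 with offset mo=1 and length ml=7:
		// zstd semantics replicate the single preceding byte.
		out := []byte{'a', 0, 0, 0, 0, 0, 0, 0}
		src := out[0:7] // out[t-mo : t-mo+ml]
		dst := out[1:8] // out[t : t+ml]
		for i := range src {
			dst[i] = src[i] // forward copy reads bytes it just wrote
		}
		fmt.Printf("%q\n", out) // "aaaaaaaa"

		// copy() would not replicate; it copies as if through a temp buffer.
		out2 := []byte{'a', 0, 0, 0, 0, 0, 0, 0}
		copy(out2[1:8], out2[0:7])
		fmt.Printf("%q\n", out2) // "aa\x00\x00\x00\x00\x00\x00"
	}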

+ 39 - 18
vendor/github.com/klauspost/compress/zstd/zip.go

@@ -18,26 +18,44 @@ const ZipMethodWinZip = 93
 // See https://pkware.cachefly.net/webdocs/APPNOTE/APPNOTE-6.3.9.TXT
 const ZipMethodPKWare = 20
 
-var zipReaderPool sync.Pool
+// zipReaderPool is the default reader pool.
+var zipReaderPool = sync.Pool{New: func() interface{} {
+	z, err := NewReader(nil, WithDecoderLowmem(true), WithDecoderMaxWindow(128<<20), WithDecoderConcurrency(1))
+	if err != nil {
+		panic(err)
+	}
+	return z
+}}
 
 // newZipReader returns a function that creates pooled zip decompressors.
-func newZipReader(r io.Reader) io.ReadCloser {
-	dec, ok := zipReaderPool.Get().(*Decoder)
-	if ok {
-		dec.Reset(r)
-	} else {
-		d, err := NewReader(r, WithDecoderConcurrency(1), WithDecoderLowmem(true))
-		if err != nil {
-			panic(err)
+func newZipReader(opts ...DOption) func(r io.Reader) io.ReadCloser {
+	pool := &zipReaderPool
+	if len(opts) > 0 {
+		opts = append([]DOption{WithDecoderLowmem(true), WithDecoderMaxWindow(128 << 20)}, opts...)
+		// Force concurrency 1
+		opts = append(opts, WithDecoderConcurrency(1))
+		// Create our own pool
+		pool = &sync.Pool{}
+	}
+	return func(r io.Reader) io.ReadCloser {
+		dec, ok := pool.Get().(*Decoder)
+		if ok {
+			dec.Reset(r)
+		} else {
+			d, err := NewReader(r, opts...)
+			if err != nil {
+				panic(err)
+			}
+			dec = d
 		}
-		dec = d
+		return &pooledZipReader{dec: dec, pool: pool}
 	}
-	return &pooledZipReader{dec: dec}
 }
 
 type pooledZipReader struct {
-	mu  sync.Mutex // guards Close and Read
-	dec *Decoder
+	mu   sync.Mutex // guards Close and Read
+	pool *sync.Pool
+	dec  *Decoder
 }
 
 func (r *pooledZipReader) Read(p []byte) (n int, err error) {
@@ -48,8 +66,8 @@ func (r *pooledZipReader) Read(p []byte) (n int, err error) {
 	}
 	dec, err := r.dec.Read(p)
 	if err == io.EOF {
-		err = r.dec.Reset(nil)
-		zipReaderPool.Put(r.dec)
+		r.dec.Reset(nil)
+		r.pool.Put(r.dec)
 		r.dec = nil
 	}
 	return dec, err
@@ -61,7 +79,7 @@ func (r *pooledZipReader) Close() error {
 	var err error
 	if r.dec != nil {
 		err = r.dec.Reset(nil)
-		zipReaderPool.Put(r.dec)
+		r.pool.Put(r.dec)
 		r.dec = nil
 	}
 	return err
@@ -115,6 +133,9 @@ func ZipCompressor(opts ...EOption) func(w io.Writer) (io.WriteCloser, error) {
 
 // ZipDecompressor returns a decompressor that can be registered with zip libraries.
 // See ZipCompressor for example.
-func ZipDecompressor() func(r io.Reader) io.ReadCloser {
-	return newZipReader
+// Options can be specified. WithDecoderConcurrency(1) is forced,
+// and by default a 128MB maximum decompression window is specified.
+// The window size can be overridden if required.
+func ZipDecompressor(opts ...DOption) func(r io.Reader) io.ReadCloser {
+	return newZipReader(opts...)
 }
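
A round-trip sketch of how these hooks plug into the standard library's archive/zip; the entry name and payload are made up:

	package main

	import (
		"archive/zip"
		"bytes"
		"fmt"
		"io"

		"github.com/klauspost/compress/zstd"
	)

	func main() {
		// Write an archive with one entry using the WinZip zstd method (93).
		var buf bytes.Buffer
		zw := zip.NewWriter(&buf)
		zw.RegisterCompressor(zstd.ZipMethodWinZip, zstd.ZipCompressor())
		w, err := zw.CreateHeader(&zip.FileHeader{
			Name:   "hello.txt",
			Method: zstd.ZipMethodWinZip,
		})
		if err != nil {
			panic(err)
		}
		io.WriteString(w, "hello zstd-in-zip")
		zw.Close()

		// Read it back; ZipDecompressor hands out pooled decoders,
		// by default capped at a 128MB decompression window.
		zr, err := zip.NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()))
		if err != nil {
			panic(err)
		}
		zr.RegisterDecompressor(zstd.ZipMethodWinZip, zstd.ZipDecompressor())
		rc, err := zr.File[0].Open()
		if err != nil {
			panic(err)
		}
		data, _ := io.ReadAll(rc)
		rc.Close()
		fmt.Println(string(data))
	}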

+ 0 - 11
vendor/github.com/klauspost/compress/zstd/zstd.go

@@ -110,17 +110,6 @@ func printf(format string, a ...interface{}) {
 	}
 }
 
-// matchLenFast does matching, but will not match the last up to 7 bytes.
-func matchLenFast(a, b []byte) int {
-	endI := len(a) & (math.MaxInt32 - 7)
-	for i := 0; i < endI; i += 8 {
-		if diff := load64(a, i) ^ load64(b, i); diff != 0 {
-			return i + bits.TrailingZeros64(diff)>>3
-		}
-	}
-	return endI
-}
-
 // matchLen returns the maximum length.
 // a must be the shortest of the two.
 // The function also returns whether all bytes matched.
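
The removed matchLenFast and the surviving matchLen rely on the same trick: XOR eight bytes at a time, and when a difference appears, the count of trailing zero bits of the XOR locates the first mismatching byte. A standalone sketch of the technique, using encoding/binary instead of the package's internal load64, so an approximation rather than the vendored code:

	package main

	import (
		"encoding/binary"
		"fmt"
		"math/bits"
	)

	// matchLen returns the number of leading bytes a and b have in common.
	func matchLen(a, b []byte) int {
		n := len(a)
		if len(b) < n {
			n = len(b)
		}
		var i int
		for ; i+8 <= n; i += 8 {
			x := binary.LittleEndian.Uint64(a[i:])
			y := binary.LittleEndian.Uint64(b[i:])
			if diff := x ^ y; diff != 0 {
				// With little-endian loads, the lowest set bit of the XOR
				// belongs to the first differing byte.
				return i + bits.TrailingZeros64(diff)>>3
			}
		}
		for ; i < n && a[i] == b[i]; i++ {
		}
		return i
	}

	func main() {
		fmt.Println(matchLen([]byte("abcdefgh12"), []byte("abcdefgh34"))) // 8
	}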

+ 3 - 2
vendor/modules.txt

@@ -454,11 +454,12 @@ github.com/ishidawataru/sctp
 # github.com/jmespath/go-jmespath v0.3.0
 ## explicit; go 1.14
 github.com/jmespath/go-jmespath
-# github.com/klauspost/compress v1.15.1
-## explicit; go 1.15
+# github.com/klauspost/compress v1.15.9
+## explicit; go 1.16
 github.com/klauspost/compress
 github.com/klauspost/compress/fse
 github.com/klauspost/compress/huff0
+github.com/klauspost/compress/internal/cpuinfo
 github.com/klauspost/compress/internal/snapref
 github.com/klauspost/compress/zstd
 github.com/klauspost/compress/zstd/internal/xxhash