Deflate.h 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224
  1. /*
  2. * Copyright (c) 2020, the SerenityOS developers.
  3. * Copyright (c) 2021, Idan Horowitz <idan.horowitz@serenityos.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #pragma once
  8. #include <AK/ByteBuffer.h>
  9. #include <AK/CircularBuffer.h>
  10. #include <AK/Endian.h>
  11. #include <AK/Forward.h>
  12. #include <AK/MaybeOwned.h>
  13. #include <AK/Stream.h>
  14. #include <AK/Vector.h>
  15. #include <LibCompress/DeflateTables.h>
  16. namespace Compress {
  17. class CanonicalCode {
  18. public:
  19. CanonicalCode() = default;
  20. ErrorOr<u32> read_symbol(LittleEndianInputBitStream&) const;
  21. ErrorOr<void> write_symbol(LittleEndianOutputBitStream&, u32) const;
  22. static CanonicalCode const& fixed_literal_codes();
  23. static CanonicalCode const& fixed_distance_codes();
  24. static ErrorOr<CanonicalCode> from_bytes(ReadonlyBytes);
  25. private:
  26. static constexpr size_t max_allowed_prefixed_code_length = 8;
  27. struct PrefixTableEntry {
  28. u16 symbol_value { 0 };
  29. u16 code_length { 0 };
  30. };
  31. // Decompression - indexed by code
  32. Vector<u16, 286> m_symbol_codes;
  33. Vector<u16, 286> m_symbol_values;
  34. Array<PrefixTableEntry, 1 << max_allowed_prefixed_code_length> m_prefix_table {};
  35. size_t m_max_prefixed_code_length { 0 };
  36. // Compression - indexed by symbol
  37. // Deflate uses a maximum of 288 symbols (maximum of 32 for distances),
  38. // but this is also used by webp, which can use up to 256 + 24 + (1 << 11) == 2328 symbols.
  39. Vector<u16, 288> m_bit_codes {};
  40. Vector<u16, 288> m_bit_code_lengths {};
  41. };
  42. class DeflateDecompressor final : public Stream {
  43. private:
  44. class CompressedBlock {
  45. public:
  46. CompressedBlock(DeflateDecompressor&, CanonicalCode literal_codes, Optional<CanonicalCode> distance_codes);
  47. ErrorOr<bool> try_read_more();
  48. private:
  49. bool m_eof { false };
  50. DeflateDecompressor& m_decompressor;
  51. CanonicalCode m_literal_codes;
  52. Optional<CanonicalCode> m_distance_codes;
  53. };
  54. class UncompressedBlock {
  55. public:
  56. UncompressedBlock(DeflateDecompressor&, size_t);
  57. ErrorOr<bool> try_read_more();
  58. private:
  59. DeflateDecompressor& m_decompressor;
  60. size_t m_bytes_remaining;
  61. };
  62. enum class State {
  63. Idle,
  64. ReadingCompressedBlock,
  65. ReadingUncompressedBlock
  66. };
  67. public:
  68. friend CompressedBlock;
  69. friend UncompressedBlock;
  70. static ErrorOr<NonnullOwnPtr<DeflateDecompressor>> construct(MaybeOwned<LittleEndianInputBitStream> stream);
  71. ~DeflateDecompressor();
  72. virtual ErrorOr<Bytes> read_some(Bytes) override;
  73. virtual ErrorOr<size_t> write_some(ReadonlyBytes) override;
  74. virtual bool is_eof() const override;
  75. virtual bool is_open() const override;
  76. virtual void close() override;
  77. static ErrorOr<ByteBuffer> decompress_all(ReadonlyBytes);
  78. private:
  79. DeflateDecompressor(MaybeOwned<LittleEndianInputBitStream> stream, CircularBuffer buffer);
  80. ErrorOr<u32> decode_length(u32);
  81. ErrorOr<u32> decode_distance(u32);
  82. ErrorOr<void> decode_codes(CanonicalCode& literal_code, Optional<CanonicalCode>& distance_code);
  83. static constexpr u16 max_back_reference_length = 258;
  84. bool m_read_final_block { false };
  85. State m_state { State::Idle };
  86. union {
  87. CompressedBlock m_compressed_block;
  88. UncompressedBlock m_uncompressed_block;
  89. };
  90. MaybeOwned<LittleEndianInputBitStream> m_input_stream;
  91. CircularBuffer m_output_buffer;
  92. };
  93. class DeflateCompressor final : public Stream {
  94. public:
  95. static constexpr size_t block_size = 32 * KiB - 1; // TODO: this can theoretically be increased to 64 KiB - 2
  96. static constexpr size_t window_size = block_size * 2;
  97. static constexpr size_t hash_bits = 15;
  98. static constexpr size_t max_huffman_literals = 288;
  99. static constexpr size_t max_huffman_distances = 32;
  100. static constexpr size_t min_match_length = 4; // matches smaller than these are not worth the size of the back reference
  101. static constexpr size_t max_match_length = 258; // matches longer than these cannot be encoded using huffman codes
  102. static constexpr u16 empty_slot = UINT16_MAX;
  103. struct CompressionConstants {
  104. size_t good_match_length; // Once we find a match of at least this length (a good enough match) we reduce max_chain to lower processing time
  105. size_t max_lazy_length; // If the match is at least this long we dont defer matching to the next byte (which takes time) as its good enough
  106. size_t great_match_length; // Once we find a match of at least this length (a great match) we can just stop searching for longer ones
  107. size_t max_chain; // We only check the actual length of the max_chain closest matches
  108. };
  109. // These constants were shamelessly "borrowed" from zlib
  110. static constexpr CompressionConstants compression_constants[] = {
  111. { 0, 0, 0, 0 },
  112. { 4, 4, 8, 4 },
  113. { 8, 16, 128, 128 },
  114. { 32, 258, 258, 4096 },
  115. { max_match_length, max_match_length, max_match_length, 1 << hash_bits } // disable all limits
  116. };
  117. enum class CompressionLevel : int {
  118. STORE = 0,
  119. FAST,
  120. GOOD,
  121. GREAT,
  122. BEST // WARNING: this one can take an unreasonable amount of time!
  123. };
  124. static ErrorOr<NonnullOwnPtr<DeflateCompressor>> construct(MaybeOwned<Stream>, CompressionLevel = CompressionLevel::GOOD);
  125. ~DeflateCompressor();
  126. virtual ErrorOr<Bytes> read_some(Bytes) override;
  127. virtual ErrorOr<size_t> write_some(ReadonlyBytes) override;
  128. virtual bool is_eof() const override;
  129. virtual bool is_open() const override;
  130. virtual void close() override;
  131. ErrorOr<void> final_flush();
  132. static ErrorOr<ByteBuffer> compress_all(ReadonlyBytes bytes, CompressionLevel = CompressionLevel::GOOD);
  133. private:
  134. DeflateCompressor(NonnullOwnPtr<LittleEndianOutputBitStream>, CompressionLevel = CompressionLevel::GOOD);
  135. Bytes pending_block() { return { m_rolling_window + block_size, block_size }; }
  136. // LZ77 Compression
  137. static u16 hash_sequence(u8 const* bytes);
  138. size_t compare_match_candidate(size_t start, size_t candidate, size_t prev_match_length, size_t max_match_length);
  139. size_t find_back_match(size_t start, u16 hash, size_t previous_match_length, size_t max_match_length, size_t& match_position);
  140. void lz77_compress_block();
  141. // Huffman Coding
  142. struct code_length_symbol {
  143. u8 symbol;
  144. u8 count; // used for special symbols 16-18
  145. };
  146. static u8 distance_to_base(u16 distance);
  147. template<size_t Size>
  148. static void generate_huffman_lengths(Array<u8, Size>& lengths, Array<u16, Size> const& frequencies, size_t max_bit_length, u16 frequency_cap = UINT16_MAX);
  149. size_t huffman_block_length(Array<u8, max_huffman_literals> const& literal_bit_lengths, Array<u8, max_huffman_distances> const& distance_bit_lengths);
  150. ErrorOr<void> write_huffman(CanonicalCode const& literal_code, Optional<CanonicalCode> const& distance_code);
  151. static size_t encode_huffman_lengths(Array<u8, max_huffman_literals + max_huffman_distances> const& lengths, size_t lengths_count, Array<code_length_symbol, max_huffman_literals + max_huffman_distances>& encoded_lengths);
  152. size_t encode_block_lengths(Array<u8, max_huffman_literals> const& literal_bit_lengths, Array<u8, max_huffman_distances> const& distance_bit_lengths, Array<code_length_symbol, max_huffman_literals + max_huffman_distances>& encoded_lengths, size_t& literal_code_count, size_t& distance_code_count);
  153. ErrorOr<void> write_dynamic_huffman(CanonicalCode const& literal_code, size_t literal_code_count, Optional<CanonicalCode> const& distance_code, size_t distance_code_count, Array<u8, 19> const& code_lengths_bit_lengths, size_t code_length_count, Array<code_length_symbol, max_huffman_literals + max_huffman_distances> const& encoded_lengths, size_t encoded_lengths_count);
  154. size_t uncompressed_block_length();
  155. size_t fixed_block_length();
  156. size_t dynamic_block_length(Array<u8, max_huffman_literals> const& literal_bit_lengths, Array<u8, max_huffman_distances> const& distance_bit_lengths, Array<u8, 19> const& code_lengths_bit_lengths, Array<u16, 19> const& code_lengths_frequencies, size_t code_lengths_count);
  157. ErrorOr<void> flush();
  158. bool m_finished { false };
  159. CompressionLevel m_compression_level;
  160. CompressionConstants m_compression_constants;
  161. NonnullOwnPtr<LittleEndianOutputBitStream> m_output_stream;
  162. u8 m_rolling_window[window_size];
  163. size_t m_pending_block_size { 0 };
  164. struct [[gnu::packed]] {
  165. u16 distance; // back reference length
  166. union {
  167. u16 literal; // literal byte or on of block symbol
  168. u16 length; // back reference length (if distance != 0)
  169. };
  170. } m_symbol_buffer[block_size + 1];
  171. size_t m_pending_symbol_size { 0 };
  172. Array<u16, max_huffman_literals> m_symbol_frequencies; // there are 286 valid symbol values (symbols 286-287 never occur)
  173. Array<u16, max_huffman_distances> m_distance_frequencies; // there are 30 valid distance values (distances 30-31 never occur)
  174. // LZ77 Chained hash table
  175. u16 m_hash_head[1 << hash_bits];
  176. u16 m_hash_prev[window_size];
  177. };
  178. }