Deflate.h 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208
  1. /*
  2. * Copyright (c) 2020, the SerenityOS developers.
  3. * Copyright (c) 2021, Idan Horowitz <idan.horowitz@serenityos.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #pragma once
  8. #include <AK/BitStream.h>
  9. #include <AK/ByteBuffer.h>
  10. #include <AK/CircularDuplexStream.h>
  11. #include <AK/Endian.h>
  12. #include <AK/Vector.h>
  13. #include <LibCompress/DeflateTables.h>
  14. namespace Compress {
  15. class CanonicalCode {
  16. public:
  17. CanonicalCode() = default;
  18. u32 read_symbol(InputBitStream&) const;
  19. void write_symbol(OutputBitStream&, u32) const;
  20. static const CanonicalCode& fixed_literal_codes();
  21. static const CanonicalCode& fixed_distance_codes();
  22. static Optional<CanonicalCode> from_bytes(ReadonlyBytes);
  23. private:
  24. // Decompression - indexed by code
  25. Vector<u16> m_symbol_codes;
  26. Vector<u16> m_symbol_values;
  27. // Compression - indexed by symbol
  28. Array<u16, 288> m_bit_codes {}; // deflate uses a maximum of 288 symbols (maximum of 32 for distances)
  29. Array<u16, 288> m_bit_code_lengths {};
  30. };
  31. class DeflateDecompressor final : public InputStream {
  32. private:
  33. class CompressedBlock {
  34. public:
  35. CompressedBlock(DeflateDecompressor&, CanonicalCode literal_codes, Optional<CanonicalCode> distance_codes);
  36. bool try_read_more();
  37. private:
  38. bool m_eof { false };
  39. DeflateDecompressor& m_decompressor;
  40. CanonicalCode m_literal_codes;
  41. Optional<CanonicalCode> m_distance_codes;
  42. };
  43. class UncompressedBlock {
  44. public:
  45. UncompressedBlock(DeflateDecompressor&, size_t);
  46. bool try_read_more();
  47. private:
  48. DeflateDecompressor& m_decompressor;
  49. size_t m_bytes_remaining;
  50. };
  51. enum class State {
  52. Idle,
  53. ReadingCompressedBlock,
  54. ReadingUncompressedBlock
  55. };
  56. public:
  57. friend CompressedBlock;
  58. friend UncompressedBlock;
  59. DeflateDecompressor(InputStream&);
  60. ~DeflateDecompressor();
  61. size_t read(Bytes) override;
  62. bool read_or_error(Bytes) override;
  63. bool discard_or_error(size_t) override;
  64. bool unreliable_eof() const override;
  65. bool handle_any_error() override;
  66. static Optional<ByteBuffer> decompress_all(ReadonlyBytes);
  67. private:
  68. u32 decode_length(u32);
  69. u32 decode_distance(u32);
  70. void decode_codes(CanonicalCode& literal_code, Optional<CanonicalCode>& distance_code);
  71. bool m_read_final_bock { false };
  72. State m_state { State::Idle };
  73. union {
  74. CompressedBlock m_compressed_block;
  75. UncompressedBlock m_uncompressed_block;
  76. };
  77. InputBitStream m_input_stream;
  78. CircularDuplexStream<32 * KiB> m_output_stream;
  79. };
  80. enum DeflateSpecialCodeLengths : u32 {
  81. COPY = 16,
  82. ZEROS = 17,
  83. LONG_ZEROS = 18
  84. };
  85. class DeflateCompressor final : public OutputStream {
  86. public:
  87. static constexpr size_t block_size = 32 * KiB - 1; // TODO: this can theoretically be increased to 64 KiB - 2
  88. static constexpr size_t window_size = block_size * 2;
  89. static constexpr size_t hash_bits = 15;
  90. static constexpr size_t max_huffman_literals = 288;
  91. static constexpr size_t max_huffman_distances = 32;
  92. static constexpr size_t min_match_length = 4; // matches smaller than these are not worth the size of the back reference
  93. static constexpr size_t max_match_length = 258; // matches longer than these cannot be encoded using huffman codes
  94. static constexpr u16 empty_slot = UINT16_MAX;
  95. struct CompressionConstants {
  96. size_t good_match_length; // Once we find a match of at least this length (a good enough match) we reduce max_chain to lower processing time
  97. size_t max_lazy_length; // If the match is at least this long we dont defer matching to the next byte (which takes time) as its good enough
  98. size_t great_match_length; // Once we find a match of at least this length (a great match) we can just stop searching for longer ones
  99. size_t max_chain; // We only check the actual length of the max_chain closest matches
  100. };
  101. // These constants were shamelessly "borrowed" from zlib
  102. static constexpr CompressionConstants compression_constants[] = {
  103. { 0, 0, 0, 0 },
  104. { 4, 4, 8, 4 },
  105. { 8, 16, 128, 128 },
  106. { 32, 258, 258, 4096 },
  107. { max_match_length, max_match_length, max_match_length, 1 << hash_bits } // disable all limits
  108. };
  109. enum class CompressionLevel : int {
  110. STORE = 0,
  111. FAST,
  112. GOOD,
  113. GREAT,
  114. BEST // WARNING: this one can take an unreasonable amount of time!
  115. };
  116. DeflateCompressor(OutputStream&, CompressionLevel = CompressionLevel::GOOD);
  117. ~DeflateCompressor();
  118. size_t write(ReadonlyBytes) override;
  119. bool write_or_error(ReadonlyBytes) override;
  120. void final_flush();
  121. static Optional<ByteBuffer> compress_all(const ReadonlyBytes& bytes, CompressionLevel = CompressionLevel::GOOD);
  122. private:
  123. Bytes pending_block() { return { m_rolling_window + block_size, block_size }; }
  124. // LZ77 Compression
  125. static u16 hash_sequence(const u8* bytes);
  126. size_t compare_match_candidate(size_t start, size_t candidate, size_t prev_match_length, size_t max_match_length);
  127. size_t find_back_match(size_t start, u16 hash, size_t previous_match_length, size_t max_match_length, size_t& match_position);
  128. void lz77_compress_block();
  129. // Huffman Coding
  130. struct code_length_symbol {
  131. u8 symbol;
  132. u8 count; // used for special symbols 16-18
  133. };
  134. static u8 distance_to_base(u16 distance);
  135. template<size_t Size>
  136. static void generate_huffman_lengths(Array<u8, Size>& lengths, const Array<u16, Size>& frequencies, size_t max_bit_length, u16 frequency_cap = UINT16_MAX);
  137. size_t huffman_block_length(const Array<u8, max_huffman_literals>& literal_bit_lengths, const Array<u8, max_huffman_distances>& distance_bit_lengths);
  138. void write_huffman(const CanonicalCode& literal_code, const Optional<CanonicalCode>& distance_code);
  139. static size_t encode_huffman_lengths(const Array<u8, max_huffman_literals + max_huffman_distances>& lengths, size_t lengths_count, Array<code_length_symbol, max_huffman_literals + max_huffman_distances>& encoded_lengths);
  140. size_t encode_block_lengths(const Array<u8, max_huffman_literals>& literal_bit_lengths, const Array<u8, max_huffman_distances>& distance_bit_lengths, Array<code_length_symbol, max_huffman_literals + max_huffman_distances>& encoded_lengths, size_t& literal_code_count, size_t& distance_code_count);
  141. void write_dynamic_huffman(const CanonicalCode& literal_code, size_t literal_code_count, const Optional<CanonicalCode>& distance_code, size_t distance_code_count, const Array<u8, 19>& code_lengths_bit_lengths, size_t code_length_count, const Array<code_length_symbol, max_huffman_literals + max_huffman_distances>& encoded_lengths, size_t encoded_lengths_count);
  142. size_t uncompressed_block_length();
  143. size_t fixed_block_length();
  144. size_t dynamic_block_length(const Array<u8, max_huffman_literals>& literal_bit_lengths, const Array<u8, max_huffman_distances>& distance_bit_lengths, const Array<u8, 19>& code_lengths_bit_lengths, const Array<u16, 19>& code_lengths_frequencies, size_t code_lengths_count);
  145. void flush();
  146. bool m_finished { false };
  147. CompressionLevel m_compression_level;
  148. CompressionConstants m_compression_constants;
  149. OutputBitStream m_output_stream;
  150. u8 m_rolling_window[window_size];
  151. size_t m_pending_block_size { 0 };
  152. struct [[gnu::packed]] {
  153. u16 distance; // back reference length
  154. union {
  155. u16 literal; // literal byte or on of block symbol
  156. u16 length; // back reference length (if distance != 0)
  157. };
  158. } m_symbol_buffer[block_size + 1];
  159. size_t m_pending_symbol_size { 0 };
  160. Array<u16, max_huffman_literals> m_symbol_frequencies; // there are 286 valid symbol values (symbols 286-287 never occur)
  161. Array<u16, max_huffman_distances> m_distance_frequencies; // there are 30 valid distance values (distances 30-31 never occur)
  162. // LZ77 Chained hash table
  163. u16 m_hash_head[1 << hash_bits];
  164. u16 m_hash_prev[window_size];
  165. };
  166. }