Lzma.h 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167
  1. /*
  2. * Copyright (c) 2023, Tim Schumacher <timschumi@gmx.de>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #pragma once
  7. #include <AK/CircularBuffer.h>
  8. #include <AK/FixedArray.h>
  9. #include <AK/MaybeOwned.h>
  10. #include <AK/NonnullOwnPtr.h>
  11. #include <AK/Stream.h>
  12. namespace Compress {
  13. // This implementation is mostly based on the LZMA specification contained in the 7-Zip SDK, which has been placed in the public domain.
  14. // LZMA Specification Draft (2015): https://www.7-zip.org/a/lzma-specification.7z
  15. struct LzmaModelProperties {
  16. u8 literal_context_bits;
  17. u8 literal_position_bits;
  18. u8 position_bits;
  19. };
  20. struct LzmaDecompressorOptions {
  21. u8 literal_context_bits { 0 };
  22. u8 literal_position_bits { 0 };
  23. u8 position_bits { 0 };
  24. u32 dictionary_size { 0 };
  25. Optional<u64> uncompressed_size;
  26. bool reject_end_of_stream_marker { false };
  27. };
  28. // Described in section "lzma file format".
  29. struct [[gnu::packed]] LzmaHeader {
  30. u32 dictionary_size() const;
  31. Optional<u64> uncompressed_size() const;
  32. ErrorOr<LzmaDecompressorOptions> as_decompressor_options() const;
  33. static ErrorOr<LzmaModelProperties> decode_model_properties(u8 input_bits);
  34. private:
  35. u8 m_encoded_model_properties;
  36. u32 m_dictionary_size;
  37. u64 m_uncompressed_size;
  38. };
  39. static_assert(sizeof(LzmaHeader) == 13);
  40. class LzmaDecompressor : public Stream {
  41. public:
  42. /// Creates a decompressor from a standalone LZMA container (.lzma file extension, occasionally known as an LZMA 'archive').
  43. static ErrorOr<NonnullOwnPtr<LzmaDecompressor>> create_from_container(MaybeOwned<Stream>, Optional<MaybeOwned<CircularBuffer>> dictionary = {});
  44. /// Creates a decompressor from a raw stream of LZMA-compressed data (found inside an LZMA container or embedded in other file formats).
  45. static ErrorOr<NonnullOwnPtr<LzmaDecompressor>> create_from_raw_stream(MaybeOwned<Stream>, LzmaDecompressorOptions const&, Optional<MaybeOwned<CircularBuffer>> dictionary = {});
  46. ErrorOr<void> append_input_stream(MaybeOwned<Stream>, Optional<u64> uncompressed_size);
  47. virtual ErrorOr<Bytes> read_some(Bytes) override;
  48. virtual ErrorOr<size_t> write_some(ReadonlyBytes) override;
  49. virtual bool is_eof() const override;
  50. virtual bool is_open() const override;
  51. virtual void close() override;
  52. private:
  53. // LZMA uses 11-bit probability counters, but they are usually stored in 16-bit variables.
  54. // Therefore, we can model probabilities with a resolution of up to 1 / 2^11 (which is equal to 1 / 2048).
  55. // The default probability for most counters is 0.5.
  56. using Probability = u16;
  57. static constexpr size_t probability_bit_count = 11;
  58. static constexpr Probability default_probability = (1 << probability_bit_count) / 2;
  59. static void initialize_to_default_probability(Span<Probability>);
  60. LzmaDecompressor(MaybeOwned<Stream>, LzmaDecompressorOptions, MaybeOwned<CircularBuffer>, FixedArray<Probability> literal_probabilities);
  61. MaybeOwned<Stream> m_stream;
  62. LzmaDecompressorOptions m_options;
  63. // This doubles as an output buffer, since we have to write all of our results into this anyways.
  64. MaybeOwned<CircularBuffer> m_dictionary;
  65. u64 m_total_decoded_bytes { 0 };
  66. bool m_found_end_of_stream_marker { false };
  67. bool is_range_decoder_in_clean_state() const;
  68. bool has_reached_expected_data_size() const;
  69. Optional<u16> m_leftover_match_length;
  70. // Range decoder state (initialized with stream data in LzmaDecompressor::create).
  71. u32 m_range_decoder_range { 0xFFFFFFFF };
  72. u32 m_range_decoder_code { 0 };
  73. ErrorOr<void> initialize_range_decoder();
  74. ErrorOr<void> normalize_range_decoder();
  75. ErrorOr<u8> decode_direct_bit();
  76. ErrorOr<u8> decode_bit_with_probability(Probability& probability);
  77. // Decodes a multi-bit symbol using a given probability tree (either in normal or in reverse order).
  78. // The specification states that "unsigned" is at least 16 bits in size, our implementation assumes this as the maximum symbol size.
  79. ErrorOr<u16> decode_symbol_using_bit_tree(size_t bit_count, Span<Probability> probability_tree);
  80. ErrorOr<u16> decode_symbol_using_reverse_bit_tree(size_t bit_count, Span<Probability> probability_tree);
  81. ErrorOr<void> decode_literal_to_output_buffer();
  82. static constexpr size_t literal_probability_table_size = 0x300;
  83. FixedArray<Probability> m_literal_probabilities;
  84. struct LzmaLengthDecoderState {
  85. public:
  86. LzmaLengthDecoderState();
  87. Probability m_first_choice_probability { default_probability };
  88. Probability m_second_choice_probability { default_probability };
  89. static constexpr size_t maximum_number_of_position_bits = 4;
  90. Array<Array<Probability, (1 << 3)>, (1 << maximum_number_of_position_bits)> m_low_length_probabilities;
  91. Array<Array<Probability, (1 << 3)>, (1 << maximum_number_of_position_bits)> m_medium_length_probabilities;
  92. Array<Probability, (1 << 8)> m_high_length_probabilities;
  93. };
  94. LzmaLengthDecoderState m_length_decoder;
  95. LzmaLengthDecoderState m_rep_length_decoder;
  96. static constexpr u16 normalized_to_real_match_length_offset = 2;
  97. ErrorOr<u16> decode_normalized_match_length(LzmaLengthDecoderState&);
  98. static constexpr size_t number_of_length_to_position_states = 4;
  99. Array<Array<Probability, (1 << 6)>, number_of_length_to_position_states> m_length_to_position_states;
  100. static constexpr size_t first_position_slot_with_binary_tree_bits = 4;
  101. static constexpr size_t first_position_slot_with_direct_encoded_bits = 14;
  102. // This is a bit wasteful on memory and not in the specification, but it makes the math easier.
  103. static constexpr size_t number_of_binary_tree_distance_slots = first_position_slot_with_direct_encoded_bits - first_position_slot_with_binary_tree_bits;
  104. static constexpr size_t largest_number_of_binary_tree_distance_bits = 5;
  105. Array<Array<Probability, (1 << largest_number_of_binary_tree_distance_bits)>, number_of_binary_tree_distance_slots> m_binary_tree_distance_probabilities;
  106. static constexpr size_t number_of_alignment_bits = 4;
  107. Array<Probability, (1 << number_of_alignment_bits)> m_alignment_bit_probabilities;
  108. // This deviates from the specification, which states that "unsigned" is at least 16-bit.
  109. // However, the match distance needs to be at least 32-bit, at the very least to hold the 0xFFFFFFFF end marker value.
  110. static constexpr u32 normalized_to_real_match_distance_offset = 1;
  111. ErrorOr<u32> decode_normalized_match_distance(u16 normalized_match_length);
  112. // LZ state tracking.
  113. u16 m_state { 0 };
  114. u32 m_rep0 { 0 };
  115. u32 m_rep1 { 0 };
  116. u32 m_rep2 { 0 };
  117. u32 m_rep3 { 0 };
  118. u32 current_repetition_offset() const;
  119. static constexpr size_t maximum_number_of_position_bits = 4;
  120. static constexpr size_t number_of_states = 12;
  121. Array<Probability, (number_of_states << maximum_number_of_position_bits)> m_is_match_probabilities;
  122. Array<Probability, number_of_states> m_is_rep_probabilities;
  123. Array<Probability, number_of_states> m_is_rep_g0_probabilities;
  124. Array<Probability, number_of_states> m_is_rep_g1_probabilities;
  125. Array<Probability, number_of_states> m_is_rep_g2_probabilities;
  126. Array<Probability, (number_of_states << maximum_number_of_position_bits)> m_is_rep0_long_probabilities;
  127. };
  128. }
  129. template<>
  130. struct AK::Traits<Compress::LzmaHeader> : public AK::GenericTraits<Compress::LzmaHeader> {
  131. static constexpr bool is_trivially_serializable() { return true; }
  132. };