Pārlūkot izejas kodu

LibCompress: Add an LZMA encoder

Tim Schumacher 2 gadi atpakaļ
vecāks
revīzija
85a54cc796

+ 520 - 2
Userland/Libraries/LibCompress/Lzma.cpp

@@ -4,6 +4,8 @@
  * SPDX-License-Identifier: BSD-2-Clause
  */
 
+#include <AK/Debug.h>
+#include <AK/IntegralMath.h>
 #include <LibCompress/Lzma.h>
 
 namespace Compress {
@@ -29,7 +31,7 @@ Optional<u64> LzmaHeader::uncompressed_size() const
     // "If "Uncompressed size" field contains ones in all 64 bits, it means that
     //  uncompressed size is unknown and there is the "end marker" in stream,
     //  that indicates the end of decoding point."
-    if (uncompressed_size == UINT64_MAX)
+    if (uncompressed_size == placeholder_for_unknown_uncompressed_size)
         return {};
 
     // "In opposite case, if the value from "Uncompressed size" field is not
@@ -71,6 +73,20 @@ ErrorOr<LzmaModelProperties> LzmaHeader::decode_model_properties(u8 input_bits)
     };
 }
 
+ErrorOr<u8> LzmaHeader::encode_model_properties(LzmaModelProperties const& model_properties)
+{
+    // Packs the three model parameters into a single byte as `(pb * 5 + lp) * 9 + lc`.
+    // Every parameter is range-checked first, so the largest possible result is (4 * 5 + 4) * 9 + 8 = 224.
+    auto const lc = model_properties.literal_context_bits;
+    auto const lp = model_properties.literal_position_bits;
+    auto const pb = model_properties.position_bits;
+
+    if (lc > 8)
+        return Error::from_string_literal("LZMA literal context bits are too large to encode");
+    if (lp > 4)
+        return Error::from_string_literal("LZMA literal position bits are too large to encode");
+    if (pb > 4)
+        return Error::from_string_literal("LZMA position bits are too large to encode");
+
+    return (pb * 5 + lp) * 9 + lc;
+}
+
 ErrorOr<LzmaDecompressorOptions> LzmaHeader::as_decompressor_options() const
 {
     auto model_properties = TRY(decode_model_properties(encoded_model_properties));
@@ -85,6 +101,21 @@ ErrorOr<LzmaDecompressorOptions> LzmaHeader::as_decompressor_options() const
     };
 }
 
+ErrorOr<LzmaHeader> LzmaHeader::from_compressor_options(LzmaCompressorOptions const& options)
+{
+    // Builds the header fields for a stream that is compressed with the given options.
+    // Fails if the model properties of the options cannot be encoded.
+    LzmaModelProperties const model_properties {
+        .literal_context_bits = options.literal_context_bits,
+        .literal_position_bits = options.literal_position_bits,
+        .position_bits = options.position_bits,
+    };
+    auto const properties_byte = TRY(encode_model_properties(model_properties));
+
+    // An unknown uncompressed size is stored as the dedicated placeholder value.
+    auto const size_field = options.uncompressed_size.value_or(placeholder_for_unknown_uncompressed_size);
+
+    return LzmaHeader {
+        .encoded_model_properties = properties_byte,
+        .unchecked_dictionary_size = options.dictionary_size,
+        .encoded_uncompressed_size = size_field,
+    };
+}
+
 void LzmaState::initialize_to_default_probability(Span<Probability> span)
 {
     for (auto& entry : span)
@@ -218,6 +249,39 @@ ErrorOr<void> LzmaDecompressor::normalize_range_decoder()
     return {};
 }
 
+ErrorOr<void> LzmaCompressor::normalize_range_encoder()
+{
+    constexpr u32 minimum_range_value = 1 << 24;
+
+    // Upper end of the interval that the final code value can still end up in.
+    u64 const upper_bound = m_range_encoder_code + m_range_encoder_range;
+
+    // Every bit of the upper bound has to fit into the bits that we still hold on to.
+    // Otherwise a carry could reach a byte that has already been written to the stream.
+    u64 const held_bits_mask = (1ull << m_range_encoder_code_used_bits) - 1;
+    VERIFY((upper_bound & held_bits_mask) == upper_bound);
+
+    // Nothing to do while the range is still wide enough.
+    if (m_range_encoder_range >= minimum_range_value)
+        return {};
+
+    // All bits above the highest bit that differs between both ends of the interval can no longer change.
+    u64 const undecided_bit_count = count_required_bits(upper_bound ^ m_range_encoder_code);
+
+    // Write out leading bytes for as long as they consist of decided bits only.
+    // NOTE(review): Nothing here bounds m_range_encoder_code_used_bits. If no byte can be written out for several
+    //               rounds in a row, it would grow past the width of the 64-bit code; confirm whether that is reachable.
+    while (m_range_encoder_code_used_bits - 8 >= undecided_bit_count) {
+        auto const shift = m_range_encoder_code_used_bits - 8;
+        u8 const settled_byte = m_range_encoder_code >> shift;
+        m_range_encoder_code -= static_cast<u64>(settled_byte) << shift;
+        m_range_encoder_code_used_bits = shift;
+        TRY(m_stream->write_value(settled_byte));
+    }
+
+    // Make room for a fresh zero byte at the low end of both the range and the code.
+    m_range_encoder_range <<= 8;
+    m_range_encoder_code <<= 8;
+    m_range_encoder_code_used_bits += 8;
+
+    VERIFY(m_range_encoder_range >= minimum_range_value);
+    return {};
+}
+
 ErrorOr<u8> LzmaDecompressor::decode_direct_bit()
 {
     m_range_decoder_range >>= 1;
@@ -235,6 +299,18 @@ ErrorOr<u8> LzmaDecompressor::decode_direct_bit()
     return temp + 1;
 }
 
+ErrorOr<void> LzmaCompressor::encode_direct_bit(u8 value)
+{
+    // Direct bits use a fixed even split: halve the range and select the upper half for a set bit.
+    auto const halved_range = m_range_encoder_range >> 1;
+    m_range_encoder_range = halved_range;
+
+    if (value != 0)
+        m_range_encoder_code += halved_range;
+
+    return normalize_range_encoder();
+}
+
 ErrorOr<u8> LzmaDecompressor::decode_bit_with_probability(Probability& probability)
 {
     // "The LZMA decoder provides the pointer to CProb variable that contains
@@ -260,6 +336,25 @@ ErrorOr<u8> LzmaDecompressor::decode_bit_with_probability(Probability& probabili
     }
 }
 
+ErrorOr<void> LzmaCompressor::encode_bit_with_probability(Probability& probability, u8 value)
+{
+    // Encodes a single bit using an adaptive probability, which is then nudged towards the value that was just seen.
+    constexpr size_t adaptation_shift = 5;
+
+    // The range is split at this point: the lower part stands for a 0 bit, the upper part for a 1 bit.
+    u32 const split = (m_range_encoder_range >> probability_bit_count) * probability;
+
+    if (value != 0) {
+        probability -= probability >> adaptation_shift;
+        m_range_encoder_code += split;
+        m_range_encoder_range -= split;
+    } else {
+        probability += ((1 << probability_bit_count) - probability) >> adaptation_shift;
+        m_range_encoder_range = split;
+    }
+
+    return normalize_range_encoder();
+}
+
 ErrorOr<u16> LzmaDecompressor::decode_symbol_using_bit_tree(size_t bit_count, Span<Probability> probability_tree)
 {
     VERIFY(bit_count <= sizeof(u16) * 8);
@@ -280,6 +375,27 @@ ErrorOr<u16> LzmaDecompressor::decode_symbol_using_bit_tree(size_t bit_count, Sp
     return result;
 }
 
+ErrorOr<void> LzmaCompressor::encode_symbol_using_bit_tree(size_t bit_count, Span<Probability> probability_tree, u16 value)
+{
+    VERIFY(bit_count <= sizeof(u16) * 8);
+    VERIFY(probability_tree.size() >= 1ul << bit_count);
+    VERIFY(value <= (1 << bit_count) - 1);
+
+    // Walk the tree from its root (index 1), sending the bits of the value from most significant to least significant.
+    // The bits that were sent so far select the node whose probability is used for the next bit.
+    size_t node_index = 1;
+
+    for (size_t remaining_bits = bit_count; remaining_bits > 0; remaining_bits--) {
+        u8 const current_bit = (value >> (remaining_bits - 1)) & 1;
+        TRY(encode_bit_with_probability(probability_tree[node_index], current_bit));
+        node_index = (node_index << 1) | current_bit;
+    }
+
+    return {};
+}
+
 ErrorOr<u16> LzmaDecompressor::decode_symbol_using_reverse_bit_tree(size_t bit_count, Span<Probability> probability_tree)
 {
     VERIFY(bit_count <= sizeof(u16) * 8);
@@ -297,6 +413,24 @@ ErrorOr<u16> LzmaDecompressor::decode_symbol_using_reverse_bit_tree(size_t bit_c
     return result;
 }
 
+ErrorOr<void> LzmaCompressor::encode_symbol_using_reverse_bit_tree(size_t bit_count, Span<Probability> probability_tree, u16 value)
+{
+    VERIFY(bit_count <= sizeof(u16) * 8);
+    VERIFY(probability_tree.size() >= 1ul << bit_count);
+    VERIFY(value <= (1 << bit_count) - 1);
+
+    // Same tree walk as the regular bit tree coder, but the bits of the value are sent least significant bit first.
+    size_t node_index = 1;
+
+    for (size_t bit_index = 0; bit_index < bit_count; bit_index++) {
+        u8 const current_bit = (value >> bit_index) & 1;
+        TRY(encode_bit_with_probability(probability_tree[node_index], current_bit));
+        node_index = (node_index << 1) | current_bit;
+    }
+
+    return {};
+}
+
 ErrorOr<void> LzmaDecompressor::decode_literal_to_output_buffer()
 {
     u8 previous_byte = 0;
@@ -353,6 +487,139 @@ ErrorOr<void> LzmaDecompressor::decode_literal_to_output_buffer()
     return {};
 }
 
+ErrorOr<void> LzmaCompressor::encode_literal(u8 literal)
+{
+    // This is the encoding counterpart of `decode_literal_to_output_buffer`, so specification comments have been omitted.
+
+    TRY(encode_match_type(MatchType::Literal));
+
+    // Note: The literal has already been read from the input buffer, so it is part of the seekback buffer by now.
+    //       This shifts all seekback offsets by one, which is why the previous byte is found at offset 2.
+    u8 previous_byte = 0;
+    bool const has_previous_byte = m_dictionary->seekback_limit() - m_dictionary->used_space() > 1;
+    if (has_previous_byte) {
+        auto read_bytes = MUST(m_dictionary->read_with_seekback({ &previous_byte, sizeof(previous_byte) }, 2 + m_dictionary->used_space()));
+        VERIFY(read_bytes.size() == sizeof(previous_byte));
+    }
+
+    // The probability table is selected by the low bits of the position and the high bits of the previous byte.
+    u16 const position_part = m_total_processed_bytes & ((1 << m_options.literal_position_bits) - 1);
+    u16 const previous_byte_part = previous_byte >> (8 - m_options.literal_context_bits);
+    u16 const literal_state = position_part << m_options.literal_context_bits | previous_byte_part;
+
+    Span<Probability> selected_probability_table = m_literal_probabilities.span().slice(literal_probability_table_size * literal_state, literal_probability_table_size);
+
+    // For states of 7 and above, the byte at the current repetition offset is used as additional context
+    // for as long as its bits agree with the bits of the literal.
+    bool still_matching = m_state >= 7;
+    u8 matched_byte = 0;
+    if (still_matching) {
+        auto read_bytes = TRY(m_dictionary->read_with_seekback({ &matched_byte, sizeof(matched_byte) }, current_repetition_offset() + m_dictionary->used_space() + 1));
+        VERIFY(read_bytes.size() == sizeof(matched_byte));
+    }
+
+    // The symbol starts out as a lone marker bit and is complete once that marker reaches bit 8.
+    u16 symbol = 1;
+
+    while (symbol < 0x100) {
+        u8 const encoded_bit = (literal & 0x80) >> 7;
+        literal <<= 1;
+
+        if (still_matching) {
+            u8 const match_bit = (matched_byte >> 7) & 1;
+            matched_byte <<= 1;
+
+            TRY(encode_bit_with_probability(selected_probability_table[((1 + match_bit) << 8) + symbol], encoded_bit));
+
+            // After the first mismatch, all remaining bits are encoded without the matched byte.
+            still_matching = match_bit == encoded_bit;
+        } else {
+            TRY(encode_bit_with_probability(selected_probability_table[symbol], encoded_bit));
+        }
+
+        symbol = (symbol << 1) | encoded_bit;
+    }
+
+    m_total_processed_bytes += sizeof(literal);
+
+    update_state_after_literal();
+
+    return {};
+}
+
+ErrorOr<void> LzmaCompressor::encode_existing_match(size_t real_distance, size_t real_length)
+{
+    // Encodes a match whose distance equals one of the four remembered repetition distances.
+    // Passing any other distance is a programming error.
+    VERIFY(real_distance >= normalized_to_real_match_distance_offset);
+    u32 const normalized_distance = real_distance - normalized_to_real_match_distance_offset;
+
+    VERIFY(real_length >= normalized_to_real_match_length_offset);
+    u16 const normalized_length = real_length - normalized_to_real_match_length_offset;
+
+    // Find the first repetition slot that holds the distance. Lower slots take precedence.
+    auto match_type = MatchType::RepMatch0;
+    if (normalized_distance != m_rep0) {
+        if (normalized_distance == m_rep1)
+            match_type = MatchType::RepMatch1;
+        else if (normalized_distance == m_rep2)
+            match_type = MatchType::RepMatch2;
+        else if (normalized_distance == m_rep3)
+            match_type = MatchType::RepMatch3;
+        else
+            VERIFY_NOT_REACHED();
+    }
+
+    TRY(encode_match_type(match_type));
+
+    // Move the used distance to the front, shifting all the slots in front of it back by one.
+    switch (match_type) {
+    case MatchType::RepMatch3:
+        m_rep3 = m_rep2;
+        [[fallthrough]];
+    case MatchType::RepMatch2:
+        m_rep2 = m_rep1;
+        [[fallthrough]];
+    case MatchType::RepMatch1:
+        m_rep1 = m_rep0;
+        m_rep0 = normalized_distance;
+        break;
+    default:
+        // The distance already is at the front.
+        break;
+    }
+
+    TRY(encode_normalized_match_length(m_rep_length_coder, normalized_length));
+    update_state_after_rep();
+    MUST(m_dictionary->discard(real_length));
+    m_total_processed_bytes += real_length;
+
+    return {};
+}
+
+ErrorOr<void> LzmaCompressor::encode_new_match(size_t real_distance, size_t real_length)
+{
+    // Distance and length are passed in their real form, the bitstream uses the normalized (offset-free) form.
+    VERIFY(real_distance >= normalized_to_real_match_distance_offset);
+    VERIFY(real_length >= normalized_to_real_match_length_offset);
+
+    u32 const normalized_distance = real_distance - normalized_to_real_match_distance_offset;
+    u16 const normalized_length = real_length - normalized_to_real_match_length_offset;
+
+    TRY(encode_normalized_simple_match(normalized_distance, normalized_length));
+
+    // The matched bytes are encoded now, so discard them from the buffer and account for them.
+    MUST(m_dictionary->discard(real_length));
+    m_total_processed_bytes += real_length;
+    return {};
+}
+
+ErrorOr<void> LzmaCompressor::encode_normalized_simple_match(u32 normalized_distance, u16 normalized_length)
+{
+    // Encodes a match with an explicitly transmitted distance and makes that distance the new first repetition distance.
+    // Note that this does not touch the dictionary or the processed byte count.
+    TRY(encode_match_type(MatchType::SimpleMatch));
+
+    // Age the remembered distances by one slot, the oldest one gets dropped.
+    m_rep3 = m_rep2;
+    m_rep2 = m_rep1;
+    m_rep1 = m_rep0;
+
+    TRY(encode_normalized_match_length(m_length_coder, normalized_length));
+
+    update_state_after_match();
+
+    TRY(encode_normalized_match_distance(normalized_length, normalized_distance));
+    m_rep0 = normalized_distance;
+
+    return {};
+}
+
 LzmaState::LzmaLengthCoderState::LzmaLengthCoderState()
 {
     for (auto& array : m_low_length_probabilities)
@@ -387,6 +654,29 @@ ErrorOr<u16> LzmaDecompressor::decode_normalized_match_length(LzmaLengthCoderSta
     return TRY(decode_symbol_using_bit_tree(8, length_decoder_state.m_high_length_probabilities.span())) + 16;
 }
 
+ErrorOr<void> LzmaCompressor::encode_normalized_match_length(LzmaLengthCoderState& length_coder_state, u16 normalized_length)
+{
+    // Lengths are split into three ranges: 0-7 (low, 3 bits), 8-15 (medium, 3 bits) and 16 and above (high, 8 bits).
+    // Up to two choice bits announce which of the ranges is used.
+    u16 const position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1);
+
+    bool const is_low_length = normalized_length < 8;
+    TRY(encode_bit_with_probability(length_coder_state.m_first_choice_probability, is_low_length ? 0 : 1));
+    if (is_low_length)
+        return encode_symbol_using_bit_tree(3, length_coder_state.m_low_length_probabilities[position_state].span(), normalized_length);
+
+    bool const is_medium_length = normalized_length < 16;
+    TRY(encode_bit_with_probability(length_coder_state.m_second_choice_probability, is_medium_length ? 0 : 1));
+    if (is_medium_length)
+        return encode_symbol_using_bit_tree(3, length_coder_state.m_medium_length_probabilities[position_state].span(), normalized_length - 8);
+
+    return encode_symbol_using_bit_tree(8, length_coder_state.m_high_length_probabilities.span(), normalized_length - 16);
+}
+
 ErrorOr<u32> LzmaDecompressor::decode_normalized_match_distance(u16 normalized_match_length)
 {
     // "LZMA uses normalized match length (zero-based length)
@@ -460,6 +750,51 @@ ErrorOr<u32> LzmaDecompressor::decode_normalized_match_distance(u16 normalized_m
     return (distance_prefix << number_of_alignment_bits) | TRY(decode_symbol_using_reverse_bit_tree(number_of_alignment_bits, m_alignment_bit_probabilities));
 }
 
+ErrorOr<void> LzmaCompressor::encode_normalized_match_distance(u16 normalized_match_length, u32 normalized_match_distance)
+{
+    // The probabilities for the position slot are selected by the match length, with all longer lengths sharing the last set.
+    u16 const length_state = min(normalized_match_length, number_of_length_to_position_states - 1);
+    auto position_slot_probabilities = m_length_to_position_states[length_state].span();
+
+    // Small distances are their own position slot and need no further bits.
+    if (normalized_match_distance < first_position_slot_with_binary_tree_bits)
+        return encode_symbol_using_bit_tree(6, position_slot_probabilities, normalized_match_distance);
+
+    // Note: This has been deduced, there is no immediate relation to the decoding function.
+    // The position slot consists of the index of the highest set bit, followed by the value of the bit right below it.
+    u16 const highest_bit_index = AK::log2(normalized_match_distance);
+    u16 const second_highest_bit = (normalized_match_distance >> (highest_bit_index - 1)) & 1;
+    u16 const position_slot = (highest_bit_index << 1) + second_highest_bit;
+
+    TRY(encode_symbol_using_bit_tree(6, position_slot_probabilities, position_slot));
+
+    // The top two bits are already covered by the position slot, so only the bits below them remain.
+    u16 const remaining_bit_count = count_required_bits(normalized_match_distance) - 2;
+    u32 const remaining_bits = normalized_match_distance & ((1 << remaining_bit_count) - 1);
+
+    if (position_slot < first_position_slot_with_direct_encoded_bits) {
+        // The remaining bits get encoded using only a reverse bit tree coder.
+        auto& selected_probability_tree = m_binary_tree_distance_probabilities[position_slot - first_position_slot_with_binary_tree_bits];
+        return encode_symbol_using_reverse_bit_tree(remaining_bit_count, selected_probability_tree, remaining_bits);
+    }
+
+    // Everything above the alignment bits is sent as direct bits, starting at the most significant bit.
+    for (u16 bit_index = remaining_bit_count; bit_index > number_of_alignment_bits; bit_index--)
+        TRY(encode_direct_bit((remaining_bits >> (bit_index - 1)) & 1));
+
+    // The lowest bits (the alignment bits) go through their own reverse bit tree.
+    u32 const alignment_bits = remaining_bits & ((1 << number_of_alignment_bits) - 1);
+    return encode_symbol_using_reverse_bit_tree(number_of_alignment_bits, m_alignment_bit_probabilities, alignment_bits);
+}
+
 u32 LzmaState::current_repetition_offset() const
 {
     // LZMA never needs to read at offset 0 (i.e. the actual read head of the buffer).
@@ -554,6 +889,77 @@ ErrorOr<LzmaDecompressor::MatchType> LzmaDecompressor::decode_match_type()
     return MatchType::RepMatch3;
 }
 
+ErrorOr<void> LzmaCompressor::encode_match_type(MatchType match_type)
+{
+    // The match type is sent as a chain of yes/no decisions, each with its own adaptive probability.
+    // The chain stops as soon as the match type is fully determined.
+    u16 const position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1);
+    u16 const state2 = (m_state << maximum_number_of_position_bits) + position_state;
+
+    auto encode_decision = [this](Probability& probability, bool decision) {
+        return encode_bit_with_probability(probability, decision ? 1 : 0);
+    };
+
+    // Literal or match?
+    bool const is_literal = match_type == MatchType::Literal;
+    TRY(encode_decision(m_is_match_probabilities[state2], !is_literal));
+    if (is_literal)
+        return {};
+
+    // Simple match or repetition match?
+    bool const is_simple_match = match_type == MatchType::SimpleMatch;
+    TRY(encode_decision(m_is_rep_probabilities[m_state], !is_simple_match));
+    if (is_simple_match)
+        return {};
+
+    // Repetition with the first distance (either short or long) or with one of the others?
+    bool const uses_first_distance = match_type == MatchType::ShortRepMatch || match_type == MatchType::RepMatch0;
+    TRY(encode_decision(m_is_rep_g0_probabilities[m_state], !uses_first_distance));
+    if (uses_first_distance)
+        return encode_decision(m_is_rep0_long_probabilities[state2], match_type == MatchType::RepMatch0);
+
+    // Second distance or one of the last two?
+    bool const uses_second_distance = match_type == MatchType::RepMatch1;
+    TRY(encode_decision(m_is_rep_g1_probabilities[m_state], !uses_second_distance));
+    if (uses_second_distance)
+        return {};
+
+    // Third distance, or the fourth one otherwise.
+    return encode_decision(m_is_rep_g2_probabilities[m_state], match_type != MatchType::RepMatch2);
+}
+
+ErrorOr<void> LzmaCompressor::encode_once()
+{
+    // Encodes a single packet from the pending input: a repetition match if one of the remembered distances fits,
+    // otherwise a new match if one can be found, otherwise the next byte as a literal.
+
+    // Without pending input there is nothing to encode. Bail out here, as the literal path below would otherwise
+    // encode a zero byte that has never been part of the input.
+    if (m_dictionary->used_space() == 0)
+        return {};
+
+    // Check if any of our existing match distances are currently usable.
+    Vector<size_t> const existing_distance_hints {
+        m_rep0 + normalized_to_real_match_distance_offset,
+        m_rep1 + normalized_to_real_match_distance_offset,
+        m_rep2 + normalized_to_real_match_distance_offset,
+        m_rep3 + normalized_to_real_match_distance_offset,
+    };
+    auto existing_distance_results = TRY(m_dictionary->find_copy_in_seekback(m_dictionary->used_space(), normalized_to_real_match_length_offset, existing_distance_hints));
+
+    if (existing_distance_results.size() > 0) {
+        auto selected_match = existing_distance_results[0];
+        TRY(encode_existing_match(selected_match.distance, selected_match.length));
+        return {};
+    }
+
+    // If we weren't able to find any viable existing offsets, we now have to search the rest of the dictionary for possible new offsets.
+    auto new_distance_results = TRY(m_dictionary->find_copy_in_seekback(m_dictionary->used_space(), normalized_to_real_match_length_offset));
+
+    if (new_distance_results.size() > 0) {
+        auto selected_match = new_distance_results[0];
+        TRY(encode_new_match(selected_match.distance, selected_match.length));
+        return {};
+    }
+
+    // If we weren't able to find any matches, we don't have any other choice than to encode the next byte as a literal.
+    // At least one byte is pending at this point (see the check at the top), so the read yields a real input byte.
+    u8 next_byte { 0 };
+    m_dictionary->read({ &next_byte, sizeof(next_byte) });
+    TRY(encode_literal(next_byte));
+    return {};
+}
+
 ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
 {
     while (m_dictionary->used_space() < bytes.size() && m_dictionary->empty_space() != 0) {
@@ -628,7 +1034,7 @@ ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
             // "If the value of "rep0" is equal to 0xFFFFFFFF, it means that we have
             //  "End of stream" marker, so we can stop decoding and check finishing
             //  condition in Range Decoder"
-            if (m_rep0 == 0xFFFFFFFF) {
+            if (m_rep0 == end_of_stream_marker) {
                 // If we should reject end-of-stream markers, do so now.
                 // Note that this is not part of LZMA, as LZMA allows end-of-stream markers in all contexts, so pure LZMA should never set this option.
                 if (m_options.reject_end_of_stream_marker)
@@ -744,4 +1150,116 @@ void LzmaDecompressor::close()
 {
 }
 
+ErrorOr<NonnullOwnPtr<LzmaCompressor>> LzmaCompressor::create_container(MaybeOwned<Stream> stream, LzmaCompressorOptions const& options)
+{
+    // Creates a compressor that writes a standalone LZMA container to the given stream.
+    // The header and the leading null byte are written right away, the compressed data follows on writes and flushes.
+
+    // Building the header validates the model properties. Do this before anything else, as the size of the
+    // literal probability table below is computed by shifting by the (otherwise unchecked) bit counts.
+    auto header = TRY(LzmaHeader::from_compressor_options(options));
+
+    auto dictionary = TRY(try_make<CircularBuffer>(TRY(CircularBuffer::create_empty(options.dictionary_size + largest_real_match_length))));
+
+    // "The LZMA Decoder uses (1 << (lc + lp)) tables with CProb values, where each table contains 0x300 CProb values."
+    auto literal_probabilities = TRY(FixedArray<Probability>::create(literal_probability_table_size * (1 << (options.literal_context_bits + options.literal_position_bits))));
+
+    TRY(stream->write_value(header));
+
+    // Note: The reference LZMA implementation has a starting null byte due to how their overflow reservoir is implemented and subsequently wrote it into the specification.
+    //       Therefore, we just have to add it manually.
+    TRY(stream->write_value<u8>(0x00));
+
+    auto compressor = TRY(adopt_nonnull_own_or_enomem(new (nothrow) LzmaCompressor(move(stream), options, move(dictionary), move(literal_probabilities))));
+
+    return compressor;
+}
+
+LzmaCompressor::LzmaCompressor(MaybeOwned<AK::Stream> stream, Compress::LzmaCompressorOptions options, MaybeOwned<CircularBuffer> dictionary, FixedArray<Compress::LzmaState::Probability> literal_probabilities)
+    : LzmaState(move(literal_probabilities))
+    , m_stream(move(stream))
+    , m_options(move(options))
+    , m_dictionary(move(dictionary))
+{
+    // Only takes ownership of the passed resources. Writing the header is left to `create_container`.
+}
+
+ErrorOr<Bytes> LzmaCompressor::read_some(Bytes)
+{
+    // The compressor can only be written to, so every read attempt fails with EBADF.
+    return Error::from_errno(EBADF);
+}
+
+ErrorOr<size_t> LzmaCompressor::write_some(ReadonlyBytes bytes)
+{
+    // Buffers as much of the given data as fits into the input buffer and encodes a single packet from it.
+    // Returns the number of bytes that have been accepted, which may be less than what was offered.
+
+    // Flushing drains the range encoder completely, so there is nothing left to encode further data with.
+    if (m_has_flushed_data)
+        return Error::from_string_literal("Tried to write to an already flushed LZMA stream");
+
+    // Fill the input buffer until it's full or until we can't read any more data.
+    size_t processed_bytes = min(bytes.size(), largest_real_match_length - m_dictionary->used_space());
+    bytes = bytes.trim(processed_bytes);
+
+    while (bytes.size() > 0) {
+        auto const written_bytes = m_dictionary->write(bytes);
+        bytes = bytes.slice(written_bytes);
+    }
+
+    VERIFY(m_dictionary->used_space() <= largest_real_match_length);
+
+    if (m_options.uncompressed_size.has_value() && m_total_processed_bytes + m_dictionary->used_space() > m_options.uncompressed_size.value())
+        return Error::from_string_literal("Tried to compress more LZMA data than announced");
+
+    // An empty write with nothing pending must not encode anything, otherwise a byte that
+    // was never part of the input would end up in the stream as a literal.
+    if (m_dictionary->used_space() > 0)
+        TRY(encode_once());
+
+    // If we read enough data to reach the final uncompressed size, flush automatically.
+    // Flushing will handle encoding the remaining data for us and finalize the stream.
+    if (m_options.uncompressed_size.has_value() && m_total_processed_bytes + m_dictionary->used_space() >= m_options.uncompressed_size.value())
+        TRY(flush());
+
+    return processed_bytes;
+}
+
+ErrorOr<void> LzmaCompressor::flush()
+{
+    // Finalizes the stream: encodes all pending input, optionally adds the end-of-stream marker
+    // and writes out everything that the range encoder still holds. This can only be done once.
+    if (m_has_flushed_data)
+        return Error::from_string_literal("Flushed an LZMA stream twice");
+
+    // Encode whatever input is still buffered.
+    while (m_dictionary->used_space() != 0)
+        TRY(encode_once());
+
+    bool const has_known_size = m_options.uncompressed_size.has_value();
+
+    if (has_known_size && m_total_processed_bytes < m_options.uncompressed_size.value())
+        return Error::from_string_literal("Flushing LZMA data with known but unreached uncompressed size");
+
+    // The LZMA specification technically also allows both a known size and an end-of-stream marker simultaneously,
+    // but LZMA2 rejects them, so skip emitting the end-of-stream marker if we know the uncompressed size.
+    if (!has_known_size)
+        TRY(encode_normalized_simple_match(end_of_stream_marker, 0));
+
+    // Write out all bytes that the range encoder still holds, most significant byte first.
+    while (m_range_encoder_code_used_bits > 0) {
+        VERIFY(m_range_encoder_code_used_bits >= 8);
+        auto const shift = m_range_encoder_code_used_bits - 8;
+        u8 const next_byte = m_range_encoder_code >> shift;
+        m_range_encoder_code -= static_cast<u64>(next_byte) << shift;
+        m_range_encoder_code_used_bits = shift;
+        TRY(m_stream->write_value(next_byte));
+    }
+
+    m_has_flushed_data = true;
+    return {};
+}
+
+bool LzmaCompressor::is_eof() const
+{
+    // Always reports end-of-file, as reading from the compressor is not supported (see `read_some`).
+    return true;
+}
+
+bool LzmaCompressor::is_open() const
+{
+    // The stream counts as open until it has been finalized by a successful flush.
+    return !m_has_flushed_data;
+}
+
+void LzmaCompressor::close()
+{
+    // Closing finalizes the stream, unless an earlier flush has already done so.
+    if (m_has_flushed_data)
+        return;
+
+    // Note: We need a better API for specifying things like this.
+    flush().release_value_but_fixme_should_propagate_errors();
+}
+
+LzmaCompressor::~LzmaCompressor()
+{
+    // Make sure that the stream gets finalized even if nobody flushed or closed it explicitly.
+    if (m_has_flushed_data)
+        return;
+
+    // Note: We need a better API for specifying things like this.
+    flush().release_value_but_fixme_should_propagate_errors();
+}
+
 }

+ 71 - 0
Userland/Libraries/LibCompress/Lzma.h

@@ -32,18 +32,31 @@ struct LzmaDecompressorOptions {
     bool reject_end_of_stream_marker { false };
 };
 
+struct LzmaCompressorOptions {
+    // Note: The default settings have been chosen based on the default settings of other LZMA compressors.
+    // The three bit counts below end up in the model properties byte of the header.
+    // Encoding them fails for more than 8 literal context bits or more than 4 bits for either of the other two.
+    u8 literal_context_bits { 3 };
+    u8 literal_position_bits { 0 };
+    u8 position_bits { 2 };
+    // Written to the header as-is. The compressor sizes its buffer from this value.
+    u32 dictionary_size { 8 * MiB };
+    // Total size of the input, if known in advance. If this is empty, the header stores the placeholder for an unknown size.
+    Optional<u64> uncompressed_size {};
+};
+
 // Described in section "lzma file format".
 struct [[gnu::packed]] LzmaHeader {
     u32 dictionary_size() const;
     Optional<u64> uncompressed_size() const;
 
     ErrorOr<LzmaDecompressorOptions> as_decompressor_options() const;
+    static ErrorOr<LzmaHeader> from_compressor_options(LzmaCompressorOptions const&);
 
     static ErrorOr<LzmaModelProperties> decode_model_properties(u8 input_bits);
+    static ErrorOr<u8> encode_model_properties(LzmaModelProperties const&);
 
     u8 encoded_model_properties;
     u32 unchecked_dictionary_size;
     u64 encoded_uncompressed_size;
+
+    static constexpr u64 placeholder_for_unknown_uncompressed_size = UINT64_MAX;
 };
 static_assert(sizeof(LzmaHeader) == 13);
 
@@ -83,6 +96,13 @@ protected:
     static constexpr u16 normalized_to_real_match_length_offset = 2;
     static constexpr u32 normalized_to_real_match_distance_offset = 1;
 
+    // According to the specification, the largest possible normalized match length is provided by the high coder,
+    // which processes 8 bits (0 to 255) and adds a displacement of 16 on top.
+    // This is the minimum size that our input buffer has to have to not miss any possible repetitions while encoding.
+    static constexpr u16 largest_real_match_length = 255 + 16 + normalized_to_real_match_length_offset;
+
+    static constexpr u32 end_of_stream_marker = 0xFFFFFFFF;
+
     static constexpr size_t number_of_length_to_position_states = 4;
     Array<Array<Probability, (1 << 6)>, number_of_length_to_position_states> m_length_to_position_states;
 
@@ -185,6 +205,57 @@ private:
     ErrorOr<u32> decode_normalized_match_distance(u16 normalized_match_length);
 };
 
+class LzmaCompressor : public Stream
+    , LzmaState {
+public:
+    /// Creates a compressor for a standalone LZMA container (.lzma file extension, occasionally known as an LZMA 'archive').
+    static ErrorOr<NonnullOwnPtr<LzmaCompressor>> create_container(MaybeOwned<Stream>, LzmaCompressorOptions const&);
+
+    /// Finishes the archive by writing out the remaining data from the range coder.
+    /// Fails if the stream has already been flushed, or if a known uncompressed size has not been reached yet.
+    ErrorOr<void> flush();
+
+    // Reading is not supported and always fails with EBADF.
+    virtual ErrorOr<Bytes> read_some(Bytes) override;
+    // Accepts at most as many bytes as currently fit into the input buffer and returns that count.
+    virtual ErrorOr<size_t> write_some(ReadonlyBytes) override;
+    virtual bool is_eof() const override;
+    // Returns false once the stream has been flushed.
+    virtual bool is_open() const override;
+    // Flushes the stream if that has not happened yet.
+    virtual void close() override;
+
+    // Flushes the stream if that has not happened yet.
+    virtual ~LzmaCompressor();
+
+private:
+    LzmaCompressor(MaybeOwned<Stream>, LzmaCompressorOptions, MaybeOwned<CircularBuffer>, FixedArray<Probability> literal_probabilities);
+
+    // Range encoder primitives.
+    ErrorOr<void> normalize_range_encoder();
+    ErrorOr<void> encode_direct_bit(u8 value);
+    ErrorOr<void> encode_bit_with_probability(Probability&, u8 value);
+    ErrorOr<void> encode_symbol_using_bit_tree(size_t bit_count, Span<Probability> probability_tree, u16 value);
+    ErrorOr<void> encode_symbol_using_reverse_bit_tree(size_t bit_count, Span<Probability> probability_tree, u16 value);
+    ErrorOr<void> encode_normalized_match_length(LzmaLengthCoderState&, u16 normalized_length);
+    ErrorOr<void> encode_normalized_match_distance(u16 normalized_match_length, u32 normalized_match_distance);
+
+    // Packet-level encoding. "Real" values include the normalization offsets, "normalized" values do not.
+    ErrorOr<void> encode_match_type(MatchType);
+    ErrorOr<void> encode_literal(u8 literal);
+    ErrorOr<void> encode_existing_match(size_t real_distance, size_t real_length);
+    ErrorOr<void> encode_new_match(size_t real_distance, size_t real_length);
+    ErrorOr<void> encode_normalized_simple_match(u32 normalized_distance, u16 normalized_length);
+
+    // Encodes a single packet (repetition match, new match or literal) from the pending input.
+    ErrorOr<void> encode_once();
+
+    // Set by a successful flush, after which the stream is finalized.
+    bool m_has_flushed_data { false };
+
+    MaybeOwned<Stream> m_stream;
+    LzmaCompressorOptions m_options;
+
+    // This doubles as an input buffer, which is appended at the very front of the buffer.
+    // Therefore, the size of this should at least be the dictionary size + the largest possible repetition length.
+    MaybeOwned<CircularBuffer> m_dictionary;
+
+    // Range encoder state.
+    // The code holds the bytes that have not been written to the stream yet,
+    // and the used bit count tells how many of its low bits are currently in use.
+    u32 m_range_encoder_range { 0xFFFFFFFF };
+    u64 m_range_encoder_code { 0 };
+    size_t m_range_encoder_code_used_bits { 32 };
+};
+
 }
 
 template<>