Browse Source

LibCompress: Implement proper handling of LZMA end-of-stream markers

Tim Schumacher 2 years ago
parent
commit
8ff36e5910

+ 2 - 3
Tests/LibCompress/TestLzma.cpp

@@ -166,7 +166,6 @@ TEST_CASE(specification_bad_incorrect_size_lzma_decompress)
 
     auto stream = MUST(try_make<FixedMemoryStream>(compressed));
     auto decompressor = MUST(Compress::LzmaDecompressor::create_from_container(move(stream)));
-    // FIXME: This should detect that there is still remaining (non-'end of stream') data after the expected data and reject the last read.
-    //        As of now, this test accepts any result, except for crashes.
-    (void)decompressor->read_until_eof(PAGE_SIZE);
+    auto buffer_or_error = decompressor->read_until_eof(PAGE_SIZE);
+    EXPECT(buffer_or_error.is_error());
 }

+ 28 - 9
Userland/Libraries/LibCompress/Lzma.cpp

@@ -81,6 +81,7 @@ ErrorOr<LzmaDecompressorOptions> LzmaHeader::as_decompressor_options() const
         .position_bits = model_properties.position_bits,
         .dictionary_size = dictionary_size(),
         .uncompressed_size = uncompressed_size(),
+        .reject_end_of_stream_marker = false,
     };
 }
 
@@ -457,17 +458,15 @@ ErrorOr<u32> LzmaDecompressor::decode_normalized_match_distance(u16 normalized_m
 ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
 {
     while (m_dictionary->used_space() < bytes.size() && m_dictionary->empty_space() != 0) {
-        if (m_found_end_of_stream_marker) {
-            if (m_options.uncompressed_size.has_value() && m_total_decoded_bytes < m_options.uncompressed_size.value())
-                return Error::from_string_literal("Found end-of-stream marker earlier than expected");
-
+        if (m_found_end_of_stream_marker)
             break;
-        }
 
         if (has_reached_expected_data_size()) {
-            // FIXME: This should validate that either EOF or the 'end of stream' marker follow immediately.
-            //        Both of those cases count as the 'end of stream' marker being found and should check for a clean decoder state.
-            break;
+            // If the decoder is in a clean state, we assume that this is fine.
+            if (is_range_decoder_in_clean_state())
+                break;
+
+            // Otherwise, we give it one last try to find the end marker in the remaining data.
         }
 
         // "The decoder calculates "state2" variable value to select exact variable from
@@ -543,6 +542,10 @@ ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
         //  IsMatch[state2] decode
         //   0 - the Literal"
         if (TRY(decode_bit_with_probability(m_is_match_probabilities[state2])) == 0) {
+            // If we are already past the expected uncompressed size, we are already in "look for EOS only" mode.
+            if (has_reached_expected_data_size())
+                return Error::from_string_literal("Found literal after reaching expected uncompressed size");
+
             // "At first the LZMA decoder must check that it doesn't exceed
             //  specified uncompressed size."
             // This is already checked for at the beginning of the loop.
@@ -577,11 +580,20 @@ ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
             //  "End of stream" marker, so we can stop decoding and check finishing
             //  condition in Range Decoder"
             if (m_rep0 == 0xFFFFFFFF) {
+                // If we should reject end-of-stream markers, do so now.
+                // Note that this is not part of LZMA, as LZMA allows end-of-stream markers in all contexts, so pure LZMA should never set this option.
+                if (m_options.reject_end_of_stream_marker)
+                    return Error::from_string_literal("An end-of-stream marker was found, but the LZMA stream is configured to reject them");
+
                 // The range decoder condition is checked after breaking out of the loop.
                 m_found_end_of_stream_marker = true;
                 continue;
             }
 
+            // If we are looking for EOS, but haven't found it here, the stream is corrupted.
+            if (has_reached_expected_data_size())
+                return Error::from_string_literal("First simple match after the expected uncompressed size is not the EOS marker");
+
             // "If uncompressed size is defined, LZMA decoder must check that it doesn't
             //  exceed that specified uncompressed size."
             // This is being checked for in the common "copy to buffer" implementation.
@@ -598,6 +610,10 @@ ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
             continue;
         }
 
+        // If we are looking for EOS, but find another match type, the stream is also corrupted.
+        if (has_reached_expected_data_size())
+            return Error::from_string_literal("First match type after the expected uncompressed size is not a simple match");
+
         // "     1 - Rep Match
         //         IsRepG0[state] decode
         //           0 - the distance is rep0"
@@ -666,7 +682,10 @@ ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
         TRY(copy_match_to_buffer(normalized_length + normalized_to_real_match_length_offset));
     }
 
-    if (m_found_end_of_stream_marker) {
+    if (m_found_end_of_stream_marker || has_reached_expected_data_size()) {
+        if (m_options.uncompressed_size.has_value() && m_total_decoded_bytes < m_options.uncompressed_size.value())
+            return Error::from_string_literal("Found end-of-stream marker earlier than expected");
+
         if (!is_range_decoder_in_clean_state())
             return Error::from_string_literal("LZMA stream ends in an unclean state");
     }

+ 1 - 0
Userland/Libraries/LibCompress/Lzma.h

@@ -29,6 +29,7 @@ struct LzmaDecompressorOptions {
     u8 position_bits { 0 };
     u32 dictionary_size { 0 };
     Optional<u64> uncompressed_size;
+    bool reject_end_of_stream_marker { false };
 };
 
 // Described in section "lzma file format".

+ 3 - 0
Userland/Libraries/LibCompress/Lzma2.cpp

@@ -106,6 +106,9 @@ ErrorOr<Bytes> Lzma2Decompressor::read_some(Bytes bytes)
                     .position_bits = properties.position_bits,
                     .dictionary_size = static_cast<u32>(dictionary_size),
                     .uncompressed_size = uncompressed_size,
+
+                    // Note: This is not specified anywhere. However, it is apparently tested by bad-1-lzma2-7.xz from the XZ utils test files.
+                    .reject_end_of_stream_marker = true,
                 };
                 [[fallthrough]];
             }