Lzma2.cpp 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189
  1. /*
  2. * Copyright (c) 2023, Tim Schumacher <timschumi@gmx.de>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/ConstrainedStream.h>
  7. #include <AK/Endian.h>
  8. #include <LibCompress/Lzma2.h>
  9. namespace Compress {
  10. ErrorOr<NonnullOwnPtr<Lzma2Decompressor>> Lzma2Decompressor::create_from_raw_stream(MaybeOwned<Stream> stream, u32 dictionary_size)
  11. {
  12. auto dictionary = TRY(CircularBuffer::create_empty(dictionary_size));
  13. auto decompressor = TRY(adopt_nonnull_own_or_enomem(new (nothrow) Lzma2Decompressor(move(stream), move(dictionary))));
  14. return decompressor;
  15. }
  16. Lzma2Decompressor::Lzma2Decompressor(MaybeOwned<Stream> stream, CircularBuffer dictionary)
  17. : m_stream(move(stream))
  18. , m_dictionary(move(dictionary))
  19. {
  20. }
  21. ErrorOr<Bytes> Lzma2Decompressor::read_some(Bytes bytes)
  22. {
  23. if (!m_current_chunk_stream.has_value() || (*m_current_chunk_stream)->is_eof()) {
  24. // "LZMA2 data consists of packets starting with a control byte, with the following values:"
  25. auto const control_byte = TRY(m_stream->read_value<u8>());
  26. if (control_byte == 0) {
  27. // " - 0 denotes the end of the file"
  28. m_found_end_of_stream = true;
  29. return bytes.trim(0);
  30. }
  31. if (control_byte == 1) {
  32. // " - 1 denotes a dictionary reset followed by an uncompressed chunk"
  33. m_dictionary.clear();
  34. m_dictionary_initialized = true;
  35. // The XZ utils test files (bad-1-lzma2-8.xz) check that the decompressor
  36. // requires a new set of properties after a dictionary reset.
  37. m_last_lzma_options = {};
  38. }
  39. if (control_byte == 1 || control_byte == 2) {
  40. // " - 2 denotes an uncompressed chunk without a dictionary reset"
  41. if (!m_dictionary_initialized)
  42. return Error::from_string_literal("LZMA2 stream uses dictionary without ever resetting it");
  43. // "Uncompressed chunks consist of:
  44. // - A 16-bit big-endian value encoding the data size minus one
  45. // - The data to be copied verbatim into the dictionary and the output"
  46. u32 data_size = TRY(m_stream->read_value<BigEndian<u16>>()) + 1;
  47. m_in_uncompressed_chunk = true;
  48. m_current_chunk_stream = TRY(try_make<ConstrainedStream>(MaybeOwned { *m_stream }, data_size));
  49. }
  50. if (3 <= control_byte && control_byte <= 0x7f) {
  51. // " - 3-0x7f are invalid values"
  52. return Error::from_string_literal("Invalid control byte in LZMA2 stream");
  53. }
  54. if (0x80 <= control_byte) {
  55. // " - 0x80-0xff denotes an LZMA chunk, where the lowest 5 bits are used as bit 16-20
  56. // of the uncompressed size minus one, and bit 5-6 indicates what should be reset."
  57. auto encoded_uncompressed_size_high = control_byte & 0b11111;
  58. auto reset_indicator = (control_byte & 0b1100000) >> 5;
  59. // "LZMA chunks consist of:
  60. // - A 16-bit big-endian value encoding the low 16-bits of the uncompressed size minus one
  61. // - A 16-bit big-endian value encoding the compressed size minus one
  62. // - A properties/lclppb byte if bit 6 in the control byte is set
  63. // - The LZMA compressed data, starting with the 5 bytes (of which the first is ignored)
  64. // used to initialize the range coder (which are included in the compressed size)"
  65. u16 encoded_uncompressed_size_low = TRY(m_stream->read_value<BigEndian<u16>>());
  66. u16 encoded_compressed_size = TRY(m_stream->read_value<BigEndian<u16>>());
  67. u64 uncompressed_size = ((encoded_uncompressed_size_high << 16) | encoded_uncompressed_size_low) + 1;
  68. u32 compressed_size = encoded_compressed_size + 1;
  69. m_current_chunk_stream = TRY(try_make<ConstrainedStream>(MaybeOwned { *m_stream }, compressed_size));
  70. // "Bits 5-6 for LZMA chunks can be:"
  71. switch (reset_indicator) {
  72. case 3: {
  73. // " - 3: state reset, properties reset using properties byte, dictionary reset"
  74. m_dictionary.clear();
  75. m_dictionary_initialized = true;
  76. [[fallthrough]];
  77. }
  78. case 2: {
  79. // " - 2: state reset, properties reset using properties byte"
  80. // Update the stored LZMA options with the new settings, the stream will be recreated later.
  81. auto encoded_properties = TRY(m_stream->read_value<u8>());
  82. auto properties = TRY(LzmaHeader::decode_model_properties(encoded_properties));
  83. auto dictionary_size = m_dictionary.capacity();
  84. VERIFY(dictionary_size <= NumericLimits<u32>::max());
  85. m_last_lzma_options = LzmaDecompressorOptions {
  86. .literal_context_bits = properties.literal_context_bits,
  87. .literal_position_bits = properties.literal_position_bits,
  88. .position_bits = properties.position_bits,
  89. .dictionary_size = static_cast<u32>(dictionary_size),
  90. .uncompressed_size = uncompressed_size,
  91. // Note: This is not specified anywhere. However, it is apparently tested by bad-1-lzma2-7.xz from the XZ utils test files.
  92. .reject_end_of_stream_marker = true,
  93. };
  94. [[fallthrough]];
  95. }
  96. case 1: {
  97. // " - 1: state reset"
  98. if (!m_last_lzma_options.has_value())
  99. return Error::from_string_literal("LZMA2 stream contains LZMA chunk without settings");
  100. if (!m_dictionary_initialized)
  101. return Error::from_string_literal("LZMA2 stream uses dictionary without ever resetting it");
  102. m_last_lzma_options->uncompressed_size = uncompressed_size;
  103. m_last_lzma_stream = TRY(LzmaDecompressor::create_from_raw_stream(m_current_chunk_stream.release_value(), *m_last_lzma_options, MaybeOwned<CircularBuffer> { m_dictionary }));
  104. break;
  105. }
  106. case 0: {
  107. // " - 0: nothing reset"
  108. if (!m_last_lzma_stream.has_value())
  109. return Error::from_string_literal("LZMA2 stream contains no-reset LZMA chunk without previous state");
  110. if (!m_dictionary_initialized)
  111. return Error::from_string_literal("LZMA2 stream uses dictionary without ever resetting it");
  112. TRY((*m_last_lzma_stream)->append_input_stream(m_current_chunk_stream.release_value(), uncompressed_size));
  113. break;
  114. }
  115. }
  116. m_in_uncompressed_chunk = false;
  117. m_current_chunk_stream = MaybeOwned<Stream> { **m_last_lzma_stream };
  118. }
  119. }
  120. auto result = TRY((*m_current_chunk_stream)->read_some(bytes));
  121. // For an uncompressed block we are reading directly from the input stream,
  122. // so we need to capture the 'uncompressed' data into the dictionary manually.
  123. // Since we only care about having the correct value in the seekback buffer,
  124. // we can also immediately discard the written data and only ever have to write
  125. // the last <dictionary size> bytes into it.
  126. if (m_in_uncompressed_chunk) {
  127. VERIFY(m_dictionary.used_space() == 0);
  128. auto relevant_data = result;
  129. if (relevant_data.size() > m_dictionary.capacity())
  130. relevant_data = relevant_data.slice(relevant_data.size() - m_dictionary.capacity(), relevant_data.size());
  131. auto written_bytes = m_dictionary.write(relevant_data);
  132. VERIFY(written_bytes == relevant_data.size());
  133. MUST(m_dictionary.discard(written_bytes));
  134. }
  135. return result;
  136. }
  137. ErrorOr<size_t> Lzma2Decompressor::write_some(ReadonlyBytes)
  138. {
  139. return Error::from_errno(EBADF);
  140. }
  141. bool Lzma2Decompressor::is_eof() const
  142. {
  143. return m_found_end_of_stream;
  144. }
  145. bool Lzma2Decompressor::is_open() const
  146. {
  147. return true;
  148. }
  149. void Lzma2Decompressor::close()
  150. {
  151. }
  152. }