Lzma.cpp 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
  1. /*
  2. * Copyright (c) 2023, Tim Schumacher <timschumi@gmx.de>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <LibCompress/Lzma.h>
  7. namespace Compress {
  8. u32 LzmaHeader::dictionary_size() const
  9. {
  10. // "If the value of dictionary size in properties is smaller than (1 << 12),
  11. // the LZMA decoder must set the dictionary size variable to (1 << 12)."
  12. constexpr u32 minimum_dictionary_size = (1 << 12);
  13. if (m_dictionary_size < minimum_dictionary_size)
  14. return minimum_dictionary_size;
  15. return m_dictionary_size;
  16. }
  17. Optional<u64> LzmaHeader::uncompressed_size() const
  18. {
  19. // We are making a copy of the packed field here because we would otherwise
  20. // pass an unaligned reference to the constructor of Optional, which is
  21. // undefined behavior.
  22. auto uncompressed_size = m_uncompressed_size;
  23. // "If "Uncompressed size" field contains ones in all 64 bits, it means that
  24. // uncompressed size is unknown and there is the "end marker" in stream,
  25. // that indicates the end of decoding point."
  26. if (uncompressed_size == UINT64_MAX)
  27. return {};
  28. // "In opposite case, if the value from "Uncompressed size" field is not
  29. // equal to ((2^64) - 1), the LZMA stream decoding must be finished after
  30. // specified number of bytes (Uncompressed size) is decoded. And if there
  31. // is the "end marker", the LZMA decoder must read that marker also."
  32. return uncompressed_size;
  33. }
  34. ErrorOr<LzmaModelProperties> LzmaHeader::decode_model_properties(u8 input_bits)
  35. {
  36. // "Decodes the following values from the encoded model properties field:
  37. //
  38. // name Range Description
  39. // lc [0, 8] the number of "literal context" bits
  40. // lp [0, 4] the number of "literal pos" bits
  41. // pb [0, 4] the number of "pos" bits
  42. //
  43. // Encoded using `((pb * 5 + lp) * 9 + lc)`."
  44. if (input_bits >= (9 * 5 * 5))
  45. return Error::from_string_literal("Encoded model properties value is larger than the highest possible value");
  46. u8 literal_context_bits = input_bits % 9;
  47. input_bits /= 9;
  48. VERIFY(literal_context_bits >= 0 && literal_context_bits <= 8);
  49. u8 literal_position_bits = input_bits % 5;
  50. input_bits /= 5;
  51. VERIFY(literal_position_bits >= 0 && literal_position_bits <= 4);
  52. u8 position_bits = input_bits;
  53. VERIFY(position_bits >= 0 && position_bits <= 4);
  54. return LzmaModelProperties {
  55. .literal_context_bits = literal_context_bits,
  56. .literal_position_bits = literal_position_bits,
  57. .position_bits = position_bits,
  58. };
  59. }
  60. ErrorOr<LzmaDecompressorOptions> LzmaHeader::as_decompressor_options() const
  61. {
  62. auto model_properties = TRY(decode_model_properties(m_encoded_model_properties));
  63. return Compress::LzmaDecompressorOptions {
  64. .literal_context_bits = model_properties.literal_context_bits,
  65. .literal_position_bits = model_properties.literal_position_bits,
  66. .position_bits = model_properties.position_bits,
  67. .dictionary_size = dictionary_size(),
  68. .uncompressed_size = uncompressed_size(),
  69. };
  70. }
  71. void LzmaDecompressor::initialize_to_default_probability(Span<Probability> span)
  72. {
  73. for (auto& entry : span)
  74. entry = default_probability;
  75. }
  76. ErrorOr<NonnullOwnPtr<LzmaDecompressor>> LzmaDecompressor::create_from_container(MaybeOwned<Stream> stream, Optional<MaybeOwned<CircularBuffer>> dictionary)
  77. {
  78. auto header = TRY(stream->read_value<LzmaHeader>());
  79. return TRY(LzmaDecompressor::create_from_raw_stream(move(stream), TRY(header.as_decompressor_options()), move(dictionary)));
  80. }
  81. ErrorOr<NonnullOwnPtr<LzmaDecompressor>> LzmaDecompressor::create_from_raw_stream(MaybeOwned<Stream> stream, LzmaDecompressorOptions const& options, Optional<MaybeOwned<CircularBuffer>> dictionary)
  82. {
  83. if (!dictionary.has_value()) {
  84. auto new_dictionary = TRY(CircularBuffer::create_empty(options.dictionary_size));
  85. dictionary = TRY(try_make<CircularBuffer>(move(new_dictionary)));
  86. }
  87. VERIFY((*dictionary)->capacity() >= options.dictionary_size);
  88. // "The LZMA Decoder uses (1 << (lc + lp)) tables with CProb values, where each table contains 0x300 CProb values."
  89. auto literal_probabilities = TRY(FixedArray<Probability>::create(literal_probability_table_size * (1 << (options.literal_context_bits + options.literal_position_bits))));
  90. auto decompressor = TRY(adopt_nonnull_own_or_enomem(new (nothrow) LzmaDecompressor(move(stream), options, dictionary.release_value(), move(literal_probabilities))));
  91. TRY(decompressor->initialize_range_decoder());
  92. return decompressor;
  93. }
  94. LzmaDecompressor::LzmaDecompressor(MaybeOwned<Stream> stream, LzmaDecompressorOptions options, MaybeOwned<CircularBuffer> dictionary, FixedArray<Probability> literal_probabilities)
  95. : m_stream(move(stream))
  96. , m_options(move(options))
  97. , m_dictionary(move(dictionary))
  98. , m_literal_probabilities(move(literal_probabilities))
  99. {
  100. initialize_to_default_probability(m_literal_probabilities.span());
  101. for (auto& array : m_length_to_position_states)
  102. initialize_to_default_probability(array);
  103. for (auto& array : m_binary_tree_distance_probabilities)
  104. initialize_to_default_probability(array);
  105. initialize_to_default_probability(m_alignment_bit_probabilities);
  106. initialize_to_default_probability(m_is_match_probabilities);
  107. initialize_to_default_probability(m_is_rep_probabilities);
  108. initialize_to_default_probability(m_is_rep_g0_probabilities);
  109. initialize_to_default_probability(m_is_rep_g1_probabilities);
  110. initialize_to_default_probability(m_is_rep_g2_probabilities);
  111. initialize_to_default_probability(m_is_rep0_long_probabilities);
  112. }
  113. bool LzmaDecompressor::is_range_decoder_in_clean_state() const
  114. {
  115. return m_range_decoder_code == 0;
  116. }
  117. bool LzmaDecompressor::has_reached_expected_data_size() const
  118. {
  119. if (!m_options.uncompressed_size.has_value())
  120. return false;
  121. return m_total_decoded_bytes >= m_options.uncompressed_size.value();
  122. }
  123. ErrorOr<void> LzmaDecompressor::initialize_range_decoder()
  124. {
  125. // "The LZMA Encoder always writes ZERO in initial byte of compressed stream.
  126. // That scheme allows to simplify the code of the Range Encoder in the
  127. // LZMA Encoder. If initial byte is not equal to ZERO, the LZMA Decoder must
  128. // stop decoding and report error."
  129. {
  130. auto byte = TRY(m_stream->read_value<u8>());
  131. if (byte != 0)
  132. return Error::from_string_literal("Initial byte of data stream is not zero");
  133. }
  134. // Read the initial bytes into the range decoder.
  135. m_range_decoder_code = 0;
  136. for (size_t i = 0; i < 4; i++) {
  137. auto byte = TRY(m_stream->read_value<u8>());
  138. m_range_decoder_code = m_range_decoder_code << 8 | byte;
  139. }
  140. m_range_decoder_range = 0xFFFFFFFF;
  141. return {};
  142. }
  143. ErrorOr<void> LzmaDecompressor::append_input_stream(MaybeOwned<Stream> stream, Optional<u64> uncompressed_size)
  144. {
  145. m_stream = move(stream);
  146. TRY(initialize_range_decoder());
  147. if (m_options.uncompressed_size.has_value() != uncompressed_size.has_value())
  148. return Error::from_string_literal("Appending LZMA streams with mismatching uncompressed size status");
  149. if (uncompressed_size.has_value())
  150. *m_options.uncompressed_size += *uncompressed_size;
  151. return {};
  152. }
  153. ErrorOr<void> LzmaDecompressor::normalize_range_decoder()
  154. {
  155. // "The value of the "Range" variable before each bit decoding can not be smaller
  156. // than ((UInt32)1 << 24). The Normalize() function keeps the "Range" value in
  157. // described range."
  158. constexpr u32 minimum_range_value = 1 << 24;
  159. if (m_range_decoder_range >= minimum_range_value)
  160. return {};
  161. m_range_decoder_range <<= 8;
  162. m_range_decoder_code <<= 8;
  163. m_range_decoder_code |= TRY(m_stream->read_value<u8>());
  164. VERIFY(m_range_decoder_range >= minimum_range_value);
  165. return {};
  166. }
  167. ErrorOr<u8> LzmaDecompressor::decode_direct_bit()
  168. {
  169. m_range_decoder_range >>= 1;
  170. m_range_decoder_code -= m_range_decoder_range;
  171. u32 temp = 0 - (m_range_decoder_code >> 31);
  172. m_range_decoder_code += m_range_decoder_range & temp;
  173. if (m_range_decoder_code == m_range_decoder_range)
  174. return Error::from_string_literal("Reached an invalid state while decoding LZMA stream");
  175. TRY(normalize_range_decoder());
  176. return temp + 1;
  177. }
  178. ErrorOr<u8> LzmaDecompressor::decode_bit_with_probability(Probability& probability)
  179. {
  180. // "The LZMA decoder provides the pointer to CProb variable that contains
  181. // information about estimated probability for symbol 0 and the Range Decoder
  182. // updates that CProb variable after decoding."
  183. // The significance of the shift width is not explained and appears to be a magic constant.
  184. constexpr size_t probability_shift_width = 5;
  185. u32 bound = (m_range_decoder_range >> probability_bit_count) * probability;
  186. if (m_range_decoder_code < bound) {
  187. probability += ((1 << probability_bit_count) - probability) >> probability_shift_width;
  188. m_range_decoder_range = bound;
  189. TRY(normalize_range_decoder());
  190. return 0;
  191. } else {
  192. probability -= probability >> probability_shift_width;
  193. m_range_decoder_code -= bound;
  194. m_range_decoder_range -= bound;
  195. TRY(normalize_range_decoder());
  196. return 1;
  197. }
  198. }
  199. ErrorOr<u16> LzmaDecompressor::decode_symbol_using_bit_tree(size_t bit_count, Span<Probability> probability_tree)
  200. {
  201. VERIFY(bit_count <= sizeof(u16) * 8);
  202. VERIFY(probability_tree.size() >= 1ul << bit_count);
  203. // This has been modified from the reference implementation to unlink the result and the tree index,
  204. // which should allow for better readability.
  205. u16 result = 0;
  206. size_t tree_index = 1;
  207. for (size_t i = 0; i < bit_count; i++) {
  208. u16 next_bit = TRY(decode_bit_with_probability(probability_tree[tree_index]));
  209. result = (result << 1) | next_bit;
  210. tree_index = (tree_index << 1) | next_bit;
  211. }
  212. return result;
  213. }
  214. ErrorOr<u16> LzmaDecompressor::decode_symbol_using_reverse_bit_tree(size_t bit_count, Span<Probability> probability_tree)
  215. {
  216. VERIFY(bit_count <= sizeof(u16) * 8);
  217. VERIFY(probability_tree.size() >= 1ul << bit_count);
  218. u16 result = 0;
  219. size_t tree_index = 1;
  220. for (size_t i = 0; i < bit_count; i++) {
  221. u16 next_bit = TRY(decode_bit_with_probability(probability_tree[tree_index]));
  222. result |= next_bit << i;
  223. tree_index = (tree_index << 1) | next_bit;
  224. }
  225. return result;
  226. }
  227. ErrorOr<void> LzmaDecompressor::decode_literal_to_output_buffer()
  228. {
  229. u8 previous_byte = 0;
  230. if (m_total_decoded_bytes > 0) {
  231. auto read_bytes = MUST(m_dictionary->read_with_seekback({ &previous_byte, sizeof(previous_byte) }, 1));
  232. VERIFY(read_bytes.size() == sizeof(previous_byte));
  233. }
  234. // "To select the table for decoding it uses the context that consists of
  235. // (lc) high bits from previous literal and (lp) low bits from value that
  236. // represents current position in outputStream."
  237. u16 literal_state_bits_from_position = m_total_decoded_bytes & ((1 << m_options.literal_position_bits) - 1);
  238. u16 literal_state_bits_from_output = previous_byte >> (8 - m_options.literal_context_bits);
  239. u16 literal_state = literal_state_bits_from_position << m_options.literal_context_bits | literal_state_bits_from_output;
  240. Span<Probability> selected_probability_table = m_literal_probabilities.span().slice(literal_probability_table_size * literal_state, literal_probability_table_size);
  241. // The result is defined as u16 here and initialized to 1, but we will cut off the top bits before queueing them into the output buffer.
  242. // The top bit is only used to track how much we have decoded already, and to select the correct probability table.
  243. u16 result = 1;
  244. // "If (State > 7), the Literal Decoder also uses "matchByte" that represents
  245. // the byte in OutputStream at position the is the DISTANCE bytes before
  246. // current position, where the DISTANCE is the distance in DISTANCE-LENGTH pair
  247. // of latest decoded match."
  248. // Note: The specification says `(State > 7)`, but the reference implementation does `(State >= 7)`, which is a mismatch.
  249. // Testing `(State > 7)` with actual test files yields errors, so the reference implementation appears to be the correct one.
  250. if (m_state >= 7) {
  251. u8 matched_byte = 0;
  252. auto read_bytes = TRY(m_dictionary->read_with_seekback({ &matched_byte, sizeof(matched_byte) }, m_rep0 + 1));
  253. VERIFY(read_bytes.size() == sizeof(matched_byte));
  254. do {
  255. u8 match_bit = (matched_byte >> 7) & 1;
  256. matched_byte <<= 1;
  257. u8 decoded_bit = TRY(decode_bit_with_probability(selected_probability_table[((1 + match_bit) << 8) + result]));
  258. result = result << 1 | decoded_bit;
  259. if (match_bit != decoded_bit)
  260. break;
  261. } while (result < 0x100);
  262. }
  263. while (result < 0x100)
  264. result = (result << 1) | TRY(decode_bit_with_probability(selected_probability_table[result]));
  265. u8 actual_result = result - 0x100;
  266. size_t written_bytes = m_dictionary->write({ &actual_result, sizeof(actual_result) });
  267. VERIFY(written_bytes == sizeof(actual_result));
  268. m_total_decoded_bytes += sizeof(actual_result);
  269. return {};
  270. }
  271. LzmaDecompressor::LzmaLengthDecoderState::LzmaLengthDecoderState()
  272. {
  273. for (auto& array : m_low_length_probabilities)
  274. initialize_to_default_probability(array);
  275. for (auto& array : m_medium_length_probabilities)
  276. initialize_to_default_probability(array);
  277. initialize_to_default_probability(m_high_length_probabilities);
  278. }
  279. ErrorOr<u16> LzmaDecompressor::decode_normalized_match_length(LzmaLengthDecoderState& length_decoder_state)
  280. {
  281. // "LZMA uses "posState" value as context to select the binary tree
  282. // from LowCoder and MidCoder binary tree arrays:"
  283. u16 position_state = m_total_decoded_bytes & ((1 << m_options.position_bits) - 1);
  284. // "The following scheme is used for the match length encoding:
  285. //
  286. // Binary encoding Binary Tree structure Zero-based match length
  287. // sequence (binary + decimal):
  288. //
  289. // 0 xxx LowCoder[posState] xxx
  290. if (TRY(decode_bit_with_probability(length_decoder_state.m_first_choice_probability)) == 0)
  291. return TRY(decode_symbol_using_bit_tree(3, length_decoder_state.m_low_length_probabilities[position_state].span()));
  292. // 1 0 yyy MidCoder[posState] yyy + 8
  293. if (TRY(decode_bit_with_probability(length_decoder_state.m_second_choice_probability)) == 0)
  294. return TRY(decode_symbol_using_bit_tree(3, length_decoder_state.m_medium_length_probabilities[position_state].span())) + 8;
  295. // 1 1 zzzzzzzz HighCoder zzzzzzzz + 16"
  296. return TRY(decode_symbol_using_bit_tree(8, length_decoder_state.m_high_length_probabilities.span())) + 16;
  297. }
  298. ErrorOr<u32> LzmaDecompressor::decode_normalized_match_distance(u16 normalized_match_length)
  299. {
  300. // "LZMA uses normalized match length (zero-based length)
  301. // to calculate the context state "lenState" do decode the distance value."
  302. u16 length_state = min(normalized_match_length, number_of_length_to_position_states - 1);
  303. // "At first stage the distance decoder decodes 6-bit "posSlot" value with bit
  304. // tree decoder from PosSlotDecoder array."
  305. u16 position_slot = TRY(decode_symbol_using_bit_tree(6, m_length_to_position_states[length_state].span()));
  306. // "The encoding scheme for distance value is shown in the following table:
  307. //
  308. // posSlot (decimal) /
  309. // zero-based distance (binary)
  310. // 0 0
  311. // 1 1
  312. // 2 10
  313. // 3 11
  314. //
  315. // 4 10 x
  316. // 5 11 x
  317. // 6 10 xx
  318. // 7 11 xx
  319. // 8 10 xxx
  320. // 9 11 xxx
  321. // 10 10 xxxx
  322. // 11 11 xxxx
  323. // 12 10 xxxxx
  324. // 13 11 xxxxx
  325. //
  326. // 14 10 yy zzzz
  327. // 15 11 yy zzzz
  328. // 16 10 yyy zzzz
  329. // 17 11 yyy zzzz
  330. // ...
  331. // 62 10 yyyyyyyyyyyyyyyyyyyyyyyyyy zzzz
  332. // 63 11 yyyyyyyyyyyyyyyyyyyyyyyyyy zzzz
  333. //
  334. // where
  335. // "x ... x" means the sequence of binary symbols encoded with binary tree and
  336. // "Reverse" scheme. It uses separated binary tree for each posSlot from 4 to 13.
  337. // "y" means direct bit encoded with range coder.
  338. // "zzzz" means the sequence of four binary symbols encoded with binary
  339. // tree with "Reverse" scheme, where one common binary tree "AlignDecoder"
  340. // is used for all posSlot values."
  341. // "If (posSlot < 4), the "dist" value is equal to posSlot value."
  342. if (position_slot < first_position_slot_with_binary_tree_bits)
  343. return position_slot;
  344. // From here on, the first bit of the distance is always set and the second bit is set if the last bit of the position slot is set.
  345. u32 distance_prefix = ((1 << 1) | ((position_slot & 1) << 0));
  346. // "If (posSlot >= 4), the decoder uses "posSlot" value to calculate the value of
  347. // the high bits of "dist" value and the number of the low bits.
  348. // If (4 <= posSlot < kEndPosModelIndex), the decoder uses bit tree decoders.
  349. // (one separated bit tree decoder per one posSlot value) and "Reverse" scheme."
  350. if (position_slot < first_position_slot_with_direct_encoded_bits) {
  351. size_t number_of_bits_to_decode = (position_slot / 2) - 1;
  352. auto& selected_probability_tree = m_binary_tree_distance_probabilities[position_slot - first_position_slot_with_binary_tree_bits];
  353. return (distance_prefix << number_of_bits_to_decode) | TRY(decode_symbol_using_reverse_bit_tree(number_of_bits_to_decode, selected_probability_tree));
  354. }
  355. // " if (posSlot >= kEndPosModelIndex), the middle bits are decoded as direct
  356. // bits from RangeDecoder and the low 4 bits are decoded with a bit tree
  357. // decoder "AlignDecoder" with "Reverse" scheme."
  358. size_t number_of_direct_bits_to_decode = ((position_slot - first_position_slot_with_direct_encoded_bits) / 2) + 2;
  359. for (size_t i = 0; i < number_of_direct_bits_to_decode; i++) {
  360. distance_prefix = (distance_prefix << 1) | TRY(decode_direct_bit());
  361. }
  362. return (distance_prefix << number_of_alignment_bits) | TRY(decode_symbol_using_reverse_bit_tree(number_of_alignment_bits, m_alignment_bit_probabilities));
  363. }
  364. ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
  365. {
  366. while (m_dictionary->used_space() < bytes.size() && m_dictionary->empty_space() != 0) {
  367. if (m_found_end_of_stream_marker) {
  368. if (m_options.uncompressed_size.has_value() && m_total_decoded_bytes < m_options.uncompressed_size.value())
  369. return Error::from_string_literal("Found end-of-stream marker earlier than expected");
  370. break;
  371. }
  372. if (has_reached_expected_data_size()) {
  373. // FIXME: This should validate that either EOF or the 'end of stream' marker follow immediately.
  374. // Both of those cases count as the 'end of stream' marker being found and should check for a clean decoder state.
  375. break;
  376. }
  377. // "The decoder calculates "state2" variable value to select exact variable from
  378. // "IsMatch" and "IsRep0Long" arrays."
  379. u16 position_state = m_total_decoded_bytes & ((1 << m_options.position_bits) - 1);
  380. u16 state2 = (m_state << maximum_number_of_position_bits) + position_state;
  381. auto update_state_after_literal = [&] {
  382. if (m_state < 4)
  383. m_state = 0;
  384. else if (m_state < 10)
  385. m_state -= 3;
  386. else
  387. m_state -= 6;
  388. };
  389. auto update_state_after_match = [&] {
  390. if (m_state < 7)
  391. m_state = 7;
  392. else
  393. m_state = 10;
  394. };
  395. auto update_state_after_rep = [&] {
  396. if (m_state < 7)
  397. m_state = 8;
  398. else
  399. m_state = 11;
  400. };
  401. auto update_state_after_short_rep = [&] {
  402. if (m_state < 7)
  403. m_state = 9;
  404. else
  405. m_state = 11;
  406. };
  407. auto copy_match_to_buffer = [&](u16 real_length) -> ErrorOr<void> {
  408. VERIFY(!m_leftover_match_length.has_value());
  409. if (m_options.uncompressed_size.has_value() && m_options.uncompressed_size.value() < m_total_decoded_bytes + real_length)
  410. return Error::from_string_literal("Tried to copy match beyond expected uncompressed file size");
  411. while (real_length > 0) {
  412. if (m_dictionary->empty_space() == 0) {
  413. m_leftover_match_length = real_length;
  414. break;
  415. }
  416. u8 byte;
  417. auto read_bytes = TRY(m_dictionary->read_with_seekback({ &byte, sizeof(byte) }, m_rep0 + 1));
  418. VERIFY(read_bytes.size() == sizeof(byte));
  419. auto written_bytes = m_dictionary->write({ &byte, sizeof(byte) });
  420. VERIFY(written_bytes == sizeof(byte));
  421. m_total_decoded_bytes++;
  422. real_length--;
  423. }
  424. return {};
  425. };
  426. // If we have a leftover part of a repeating match, we should finish that first.
  427. if (m_leftover_match_length.has_value()) {
  428. TRY(copy_match_to_buffer(m_leftover_match_length.release_value()));
  429. continue;
  430. }
  431. // "The decoder uses the following code flow scheme to select exact
  432. // type of LITERAL or MATCH:
  433. //
  434. // IsMatch[state2] decode
  435. // 0 - the Literal"
  436. if (TRY(decode_bit_with_probability(m_is_match_probabilities[state2])) == 0) {
  437. // "At first the LZMA decoder must check that it doesn't exceed
  438. // specified uncompressed size."
  439. // This is already checked for at the beginning of the loop.
  440. // "Then it decodes literal value and puts it to sliding window."
  441. TRY(decode_literal_to_output_buffer());
  442. // "Then the decoder must update the "state" value."
  443. update_state_after_literal();
  444. continue;
  445. }
  446. // " 1 - the Match
  447. // IsRep[state] decode
  448. // 0 - Simple Match"
  449. if (TRY(decode_bit_with_probability(m_is_rep_probabilities[m_state])) == 0) {
  450. // "The distance history table is updated with the following scheme:"
  451. m_rep3 = m_rep2;
  452. m_rep2 = m_rep1;
  453. m_rep1 = m_rep0;
  454. // "The zero-based length is decoded with "LenDecoder"."
  455. u16 normalized_length = TRY(decode_normalized_match_length(m_length_decoder));
  456. // "The state is update with UpdateState_Match function."
  457. update_state_after_match();
  458. // "and the new "rep0" value is decoded with DecodeDistance."
  459. m_rep0 = TRY(decode_normalized_match_distance(normalized_length));
  460. // "If the value of "rep0" is equal to 0xFFFFFFFF, it means that we have
  461. // "End of stream" marker, so we can stop decoding and check finishing
  462. // condition in Range Decoder"
  463. if (m_rep0 == 0xFFFFFFFF) {
  464. // The range decoder condition is checked after breaking out of the loop.
  465. m_found_end_of_stream_marker = true;
  466. continue;
  467. }
  468. // "If uncompressed size is defined, LZMA decoder must check that it doesn't
  469. // exceed that specified uncompressed size."
  470. // This is being checked for in the common "copy to buffer" implementation.
  471. // "Also the decoder must check that "rep0" value is not larger than dictionary size
  472. // and is not larger than the number of already decoded bytes."
  473. if (m_rep0 > m_dictionary->seekback_limit())
  474. return Error::from_string_literal("rep0 value is larger than the possible lookback size");
  475. // "Then the decoder must copy match bytes as described in
  476. // "The match symbols copying" section."
  477. TRY(copy_match_to_buffer(normalized_length + normalized_to_real_match_length_offset));
  478. continue;
  479. }
  480. // " 1 - Rep Match
  481. // IsRepG0[state] decode
  482. // 0 - the distance is rep0"
  483. if (TRY(decode_bit_with_probability(m_is_rep_g0_probabilities[m_state])) == 0) {
  484. // "LZMA doesn't update the distance history."
  485. // " IsRep0Long[state2] decode
  486. // 0 - Short Rep Match"
  487. if (TRY(decode_bit_with_probability(m_is_rep0_long_probabilities[state2])) == 0) {
  488. // "If the subtype is "Short Rep Match", the decoder updates the state, puts
  489. // the one byte from window to current position in window and goes to next
  490. // MATCH/LITERAL symbol."
  491. update_state_after_short_rep();
  492. u8 byte;
  493. auto read_bytes = TRY(m_dictionary->read_with_seekback({ &byte, sizeof(byte) }, m_rep0 + 1));
  494. VERIFY(read_bytes.size() == sizeof(byte));
  495. auto written_bytes = m_dictionary->write({ &byte, sizeof(byte) });
  496. VERIFY(written_bytes == sizeof(byte));
  497. m_total_decoded_bytes++;
  498. continue;
  499. }
  500. // " 1 - Rep Match 0"
  501. // Intentional fallthrough, we just need to make sure to not run the detection for other match types and to not switch around the distance history.
  502. } else {
  503. // " 1 -
  504. // IsRepG1[state] decode
  505. // 0 - Rep Match 1"
  506. if (TRY(decode_bit_with_probability(m_is_rep_g1_probabilities[m_state])) == 0) {
  507. u32 distance = m_rep1;
  508. m_rep1 = m_rep0;
  509. m_rep0 = distance;
  510. }
  511. // " 1 -
  512. // IsRepG2[state] decode
  513. // 0 - Rep Match 2"
  514. else if (TRY(decode_bit_with_probability(m_is_rep_g2_probabilities[m_state])) == 0) {
  515. u32 distance = m_rep2;
  516. m_rep2 = m_rep1;
  517. m_rep1 = m_rep0;
  518. m_rep0 = distance;
  519. }
  520. // " 1 - Rep Match 3"
  521. else {
  522. u32 distance = m_rep3;
  523. m_rep3 = m_rep2;
  524. m_rep2 = m_rep1;
  525. m_rep1 = m_rep0;
  526. m_rep0 = distance;
  527. }
  528. }
  529. // "In other cases (Rep Match 0/1/2/3), it decodes the zero-based
  530. // length of match with "RepLenDecoder" decoder."
  531. u16 normalized_length = TRY(decode_normalized_match_length(m_rep_length_decoder));
  532. // "Then it updates the state."
  533. update_state_after_rep();
  534. // "Then the decoder must copy match bytes as described in
  535. // "The Match symbols copying" section."
  536. TRY(copy_match_to_buffer(normalized_length + normalized_to_real_match_length_offset));
  537. }
  538. if (m_found_end_of_stream_marker) {
  539. if (!is_range_decoder_in_clean_state())
  540. return Error::from_string_literal("LZMA stream ends in an unclean state");
  541. }
  542. return m_dictionary->read(bytes);
  543. }
  544. ErrorOr<size_t> LzmaDecompressor::write_some(ReadonlyBytes)
  545. {
  546. return Error::from_errno(EBADF);
  547. }
  548. bool LzmaDecompressor::is_eof() const
  549. {
  550. if (m_dictionary->used_space() > 0)
  551. return false;
  552. if (has_reached_expected_data_size())
  553. return true;
  554. return m_found_end_of_stream_marker;
  555. }
  556. bool LzmaDecompressor::is_open() const
  557. {
  558. return true;
  559. }
  560. void LzmaDecompressor::close()
  561. {
  562. }
  563. }