2023-03-05 13:34:05 +00:00
/*
* Copyright ( c ) 2023 , Tim Schumacher < timschumi @ gmx . de >
*
* SPDX - License - Identifier : BSD - 2 - Clause
*/
2023-05-03 09:15:37 +00:00
# include <AK/Debug.h>
# include <AK/IntegralMath.h>
2023-03-05 13:34:05 +00:00
# include <LibCompress/Lzma.h>
namespace Compress {
// Returns the dictionary size stored in the header, raised to the minimum the
// specification allows when the stored value is smaller than that.
u32 LzmaHeader::dictionary_size() const
{
    // "If the value of dictionary size in properties is smaller than (1 << 12),
    //  the LZMA decoder must set the dictionary size variable to (1 << 12)."
    constexpr u32 smallest_allowed_size = 1u << 12;

    auto const stored_size = unchecked_dictionary_size;
    if (stored_size >= smallest_allowed_size)
        return stored_size;

    return smallest_allowed_size;
}
// Returns the uncompressed size recorded in the header, or an empty Optional
// if the header carries the "unknown size" placeholder.
Optional<u64> LzmaHeader::uncompressed_size() const
{
    // The field lives in a packed structure. Copy it into a local first so that no
    // unaligned reference is ever handed to the Optional constructor (undefined behavior).
    auto size_from_header = encoded_uncompressed_size;

    // "In opposite case, if the value from "Uncompressed size" field is not
    //  equal to ((2^64) - 1), the LZMA stream decoding must be finished after
    //  specified number of bytes (Uncompressed size) is decoded. And if there
    //  is the "end marker", the LZMA decoder must read that marker also."
    if (size_from_header != placeholder_for_unknown_uncompressed_size)
        return size_from_header;

    // "If "Uncompressed size" field contains ones in all 64 bits, it means that
    //  uncompressed size is unknown and there is the "end marker" in stream,
    //  that indicates the end of decoding point."
    return {};
}
2023-03-14 01:36:03 +00:00
// Splits the single encoded properties byte into its three components.
// Returns an error if the byte is outside of the encodable range.
ErrorOr<LzmaModelProperties> LzmaHeader::decode_model_properties(u8 input_bits)
{
    // "Decodes the following values from the encoded model properties field:
    //
    //      name  Range          Description
    //        lc  [0, 8]         the number of "literal context" bits
    //        lp  [0, 4]         the number of "literal pos" bits
    //        pb  [0, 4]         the number of "pos" bits
    //
    //  Encoded using `((pb * 5 + lp) * 9 + lc)`."
    constexpr u8 literal_context_radix = 9;
    constexpr u8 literal_position_radix = 5;
    constexpr u8 position_radix = 5;

    if (input_bits >= (literal_context_radix * literal_position_radix * position_radix))
        return Error::from_string_literal(" Encoded model properties value is larger than the highest possible value ");

    // Peel the components off from least to most significant "digit".
    u8 const lc = input_bits % literal_context_radix;
    u8 const upper_digits = input_bits / literal_context_radix;
    u8 const lp = upper_digits % literal_position_radix;
    u8 const pb = upper_digits / literal_position_radix;

    // The values are unsigned, so only the upper bounds need to be checked.
    VERIFY(lc <= 8);
    VERIFY(lp <= 4);
    VERIFY(pb <= 4);

    return LzmaModelProperties {
        .literal_context_bits = lc,
        .literal_position_bits = lp,
        .position_bits = pb,
    };
}
2023-05-03 09:15:37 +00:00
// Packs the three model properties into a single byte using
// `((pb * 5 + lp) * 9 + lc)`. Returns an error if any component is out of range.
ErrorOr<u8> LzmaHeader::encode_model_properties(LzmaModelProperties const& model_properties)
{
    auto const lc = model_properties.literal_context_bits;
    auto const lp = model_properties.literal_position_bits;
    auto const pb = model_properties.position_bits;

    if (lc > 8)
        return Error::from_string_literal(" LZMA literal context bits are too large to encode ");
    if (lp > 4)
        return Error::from_string_literal(" LZMA literal position bits are too large to encode ");
    if (pb > 4)
        return Error::from_string_literal(" LZMA position bits are too large to encode ");

    auto const combined_position_digits = pb * 5 + lp;
    return combined_position_digits * 9 + lc;
}
2023-03-05 13:34:05 +00:00
// Translates the header fields into the option set consumed by the decompressor.
// Fails if the encoded model properties byte cannot be decoded.
ErrorOr<LzmaDecompressorOptions> LzmaHeader::as_decompressor_options() const
{
    auto const decoded_properties = TRY(decode_model_properties(encoded_model_properties));

    LzmaDecompressorOptions decompressor_options {
        .literal_context_bits = decoded_properties.literal_context_bits,
        .literal_position_bits = decoded_properties.literal_position_bits,
        .position_bits = decoded_properties.position_bits,
        .dictionary_size = dictionary_size(),
        .uncompressed_size = uncompressed_size(),
        .reject_end_of_stream_marker = false,
    };
    return decompressor_options;
}
2023-05-03 09:15:37 +00:00
// Builds a header from compressor options. Fails if the model properties
// in the options cannot be encoded into a single byte.
ErrorOr<LzmaHeader> LzmaHeader::from_compressor_options(LzmaCompressorOptions const& options)
{
    LzmaModelProperties const requested_properties {
        .literal_context_bits = options.literal_context_bits,
        .literal_position_bits = options.literal_position_bits,
        .position_bits = options.position_bits,
    };
    auto const packed_properties = TRY(encode_model_properties(requested_properties));

    // A missing uncompressed size is stored as the dedicated placeholder value.
    auto const size_field = options.uncompressed_size.value_or(placeholder_for_unknown_uncompressed_size);

    return LzmaHeader {
        .encoded_model_properties = packed_properties,
        .unchecked_dictionary_size = options.dictionary_size,
        .encoded_uncompressed_size = size_field,
    };
}
2023-05-01 11:01:06 +00:00
void LzmaState : : initialize_to_default_probability ( Span < Probability > span )
2023-03-05 13:34:05 +00:00
{
for ( auto & entry : span )
entry = default_probability ;
}
2023-03-16 12:07:47 +00:00
// Reads an LzmaHeader from the front of the stream, converts it into decompressor
// options and then creates a decompressor for the remainder of the stream.
ErrorOr<NonnullOwnPtr<LzmaDecompressor>> LzmaDecompressor::create_from_container(MaybeOwned<Stream> stream, Optional<MaybeOwned<CircularBuffer>> dictionary)
{
    auto const container_header = TRY(stream->read_value<LzmaHeader>());
    auto const decompressor_options = TRY(container_header.as_decompressor_options());

    return LzmaDecompressor::create_from_raw_stream(move(stream), decompressor_options, move(dictionary));
}
2023-03-16 12:07:47 +00:00
// Creates a decompressor for a headerless LZMA stream using the given options.
// If no dictionary is passed in, a new one of `options.dictionary_size` is allocated.
// The range decoder is primed from the stream before the decompressor is returned.
ErrorOr<NonnullOwnPtr<LzmaDecompressor>> LzmaDecompressor::create_from_raw_stream(MaybeOwned<Stream> stream, LzmaDecompressorOptions const& options, Optional<MaybeOwned<CircularBuffer>> dictionary)
{
    if (!dictionary.has_value()) {
        auto fresh_buffer = TRY(CircularBuffer::create_empty(options.dictionary_size));
        dictionary = TRY(try_make<CircularBuffer>(move(fresh_buffer)));
    }

    // Whichever dictionary we ended up with has to be large enough for the requested size.
    VERIFY((*dictionary)->capacity() >= options.dictionary_size);

    // "The LZMA Decoder uses (1 << (lc + lp)) tables with CProb values, where each table contains 0x300 CProb values."
    auto const literal_table_count = 1 << (options.literal_context_bits + options.literal_position_bits);
    auto literal_probabilities = TRY(FixedArray<Probability>::create(literal_probability_table_size * literal_table_count));

    auto* raw_decompressor = new (nothrow) LzmaDecompressor(move(stream), options, dictionary.release_value(), move(literal_probabilities));
    auto decompressor = TRY(adopt_nonnull_own_or_enomem(raw_decompressor));

    TRY(decompressor->initialize_range_decoder());

    return decompressor;
}
2023-05-01 11:01:06 +00:00
// Takes ownership of the literal probability table and resets every probability
// held by this state object to `default_probability`.
LzmaState::LzmaState(FixedArray<Probability> literal_probabilities)
    : m_literal_probabilities(move(literal_probabilities))
{
    // Flat probability tables.
    initialize_to_default_probability(m_literal_probabilities.span());
    initialize_to_default_probability(m_alignment_bit_probabilities);
    initialize_to_default_probability(m_is_match_probabilities);
    initialize_to_default_probability(m_is_rep_probabilities);
    initialize_to_default_probability(m_is_rep_g0_probabilities);
    initialize_to_default_probability(m_is_rep_g1_probabilities);
    initialize_to_default_probability(m_is_rep_g2_probabilities);
    initialize_to_default_probability(m_is_rep0_long_probabilities);

    // Tables that consist of several sub-tables.
    for (auto& position_slot_tree : m_length_to_position_states)
        initialize_to_default_probability(position_slot_tree);

    for (auto& distance_tree : m_binary_tree_distance_probabilities)
        initialize_to_default_probability(distance_tree);
}
2023-05-01 11:01:06 +00:00
// Stores the given stream, options and dictionary in the decompressor and forwards the
// literal probability table to the LzmaState base class. The constructor body is empty:
// nothing is read from the stream here (create_from_raw_stream() calls
// initialize_range_decoder() separately after construction).
LzmaDecompressor : : LzmaDecompressor ( MaybeOwned < Stream > stream , LzmaDecompressorOptions options , MaybeOwned < CircularBuffer > dictionary , FixedArray < Probability > literal_probabilities )
: LzmaState ( move ( literal_probabilities ) )
, m_stream ( move ( stream ) )
, m_options ( move ( options ) )
, m_dictionary ( move ( dictionary ) )
{
}
2023-03-29 21:37:59 +00:00
bool LzmaDecompressor : : is_range_decoder_in_clean_state ( ) const
{
return m_range_decoder_code = = 0 ;
}
bool LzmaDecompressor : : has_reached_expected_data_size ( ) const
{
if ( ! m_options . uncompressed_size . has_value ( ) )
return false ;
2023-05-01 11:01:06 +00:00
return m_total_processed_bytes > = m_options . uncompressed_size . value ( ) ;
2023-03-29 21:37:59 +00:00
}
2023-03-14 03:12:20 +00:00
// Consumes the five leading bytes of the stream: a mandatory zero byte followed by the
// four bytes that seed the range decoder code. The range is reset to its maximum.
ErrorOr<void> LzmaDecompressor::initialize_range_decoder()
{
    // "The LZMA Encoder always writes ZERO in initial byte of compressed stream.
    //  That scheme allows to simplify the code of the Range Encoder in the
    //  LZMA Encoder. If initial byte is not equal to ZERO, the LZMA Decoder must
    //  stop decoding and report error."
    auto const leading_byte = TRY(m_stream->read_value<u8>());
    if (leading_byte != 0)
        return Error::from_string_literal(" Initial byte of data stream is not zero ");

    // Shift the next four bytes into the code, most significant byte first.
    constexpr size_t initial_code_byte_count = 4;
    m_range_decoder_code = 0;
    size_t bytes_left = initial_code_byte_count;
    while (bytes_left > 0) {
        auto const next_byte = TRY(m_stream->read_value<u8>());
        m_range_decoder_code = (m_range_decoder_code << 8) | next_byte;
        --bytes_left;
    }

    m_range_decoder_range = 0xFFFFFFFF;

    return {};
}
// Continues decompression with a new input stream, keeping the existing dictionary and
// probability state. The range decoder is re-initialized from the new stream, and the
// given uncompressed size (if any) is added to the expected total.
//
// Returns an error if exactly one of the current and the appended stream has a known
// uncompressed size, or if the range decoder cannot be initialized from the new stream.
ErrorOr<void> LzmaDecompressor::append_input_stream(MaybeOwned<Stream> stream, Optional<u64> uncompressed_size)
{
    // Validate the arguments before touching any state, so that a rejected call leaves the
    // decompressor with its previous stream and range decoder instead of a half-applied update.
    if (m_options.uncompressed_size.has_value() != uncompressed_size.has_value())
        return Error::from_string_literal(" Appending LZMA streams with mismatching uncompressed size status ");

    m_stream = move(stream);

    TRY(initialize_range_decoder());

    if (uncompressed_size.has_value())
        *m_options.uncompressed_size += *uncompressed_size;

    return {};
}
2023-03-05 13:34:05 +00:00
// "The Normalize() function keeps the "Range" value in described range."
// If the range dropped below `minimum_range_value`, both range and code are shifted up by
// one byte and a fresh byte from the stream is placed into the low bits of the code.
ErrorOr<void> LzmaDecompressor::normalize_range_decoder()
{
    bool const needs_refill = m_range_decoder_range < minimum_range_value;

    if (needs_refill) {
        m_range_decoder_range <<= 8;
        m_range_decoder_code <<= 8;

        m_range_decoder_code |= TRY(m_stream->read_value<u8>());

        // A single shift is expected to be enough to get back into the valid range.
        VERIFY(m_range_decoder_range >= minimum_range_value);
    }

    return {};
}
2023-05-18 23:27:31 +00:00
// Moves the top byte of the 32-bit code out of the range encoder. Bytes are not written
// immediately: one byte is cached and a run of 0xFF bytes is counted, because a later
// overflow (carry) may still have to be propagated into them.
ErrorOr<void> LzmaCompressor::shift_range_encoder()
{
    // Writes `leading_byte` followed by the whole pending chain (each element written as
    // `chain_byte`), then starts a new chain with the current top byte as the cached byte.
    auto flush_pending_chain = [this](u8 leading_byte, u8 chain_byte) -> ErrorOr<void> {
        TRY(m_stream->write_value<u8>(leading_byte));
        for (size_t written = 0; written < m_range_encoder_ff_chain_length; written++)
            TRY(m_stream->write_value<u8>(chain_byte));

        m_range_encoder_ff_chain_length = 0;
        m_range_encoder_cached_byte = (m_range_encoder_code >> 24);
        return {};
    };

    bool const has_overflowed = (m_range_encoder_code >> 32) == 0x01;
    bool const top_byte_is_ff = (m_range_encoder_code >> 24) == 0xFF;

    if (has_overflowed) {
        // The overflow settles the pending chain: the cached byte is incremented by the carry
        // and every chained 0xFF byte wraps around to 0x00.
        VERIFY(m_range_encoder_cached_byte != 0xFF);
        TRY(flush_pending_chain(static_cast<u8>(m_range_encoder_cached_byte + 1), 0x00));
    } else if (top_byte_is_ff) {
        // A 0xFF byte could still pass an overflow on to the bytes before it, so it only
        // extends the chain for now.
        m_range_encoder_ff_chain_length++;
    } else {
        // Any other byte stops future overflows from reaching the chain, so the chain is final
        // and can be written out unchanged.
        TRY(flush_pending_chain(m_range_encoder_cached_byte, 0xFF));
    }

    // The top byte has been accounted for in all three cases. Shift it out and bring in a
    // zero byte at the bottom.
    m_range_encoder_range <<= 8;
    m_range_encoder_code <<= 8;

    // The code is stored in a 64-bit variable, so it has to be cut back to 32 bits manually.
    m_range_encoder_code &= 0xFFFFFFFF;

    return {};
}
2023-05-03 09:15:37 +00:00
// Encoder-side counterpart of range normalization: if the range dropped below
// `minimum_range_value`, one byte is shifted out of the encoder.
ErrorOr<void> LzmaCompressor::normalize_range_encoder()
{
    // The highest value that is currently representable must not carry further than one
    // bit past the 32-bit boundary.
    u64 const maximum_range_value = m_range_encoder_code + m_range_encoder_range;
    VERIFY((maximum_range_value >> 32) <= 0x01);

    if (m_range_encoder_range < minimum_range_value) {
        TRY(shift_range_encoder());

        // A single shift is expected to be enough to get back into the valid range.
        VERIFY(m_range_encoder_range >= minimum_range_value);
    }

    return {};
}
2023-03-05 13:34:05 +00:00
// Decodes one bit without a probability model by halving the range.
// Returns an error if the decoder ends up with the code being equal to the range.
ErrorOr<u8> LzmaDecompressor::decode_direct_bit()
{
    dbgln_if(LZMA_DEBUG, " Decoding direct bit {} with code = {:#x}, range = {:#x} ", 1 - ((m_range_decoder_code - (m_range_decoder_range >> 1)) >> 31), m_range_decoder_code, m_range_decoder_range);

    m_range_decoder_range >>= 1;

    // Subtract the halved range from the code. `underflow_mask` is derived from the top bit
    // of the result and decides whether the subtraction is undone again.
    m_range_decoder_code -= m_range_decoder_range;
    u32 const underflow_mask = 0 - (m_range_decoder_code >> 31);
    m_range_decoder_code += m_range_decoder_range & underflow_mask;

    if (m_range_decoder_code == m_range_decoder_range)
        return Error::from_string_literal(" Reached an invalid state while decoding LZMA stream ");

    TRY(normalize_range_decoder());

    // The decoded bit is the mask plus one.
    return underflow_mask + 1;
}
2023-05-03 09:15:37 +00:00
// Encodes one bit without a probability model: the range is halved, and for a non-zero
// bit the halved range is added onto the code.
ErrorOr<void> LzmaCompressor::encode_direct_bit(u8 value)
{
    dbgln_if(LZMA_DEBUG, " Encoding direct bit {} with code = {:#x}, range = {:#x} ", value, m_range_encoder_code, m_range_encoder_range);

    bool const bit_is_set = value != 0;

    m_range_encoder_range >>= 1;
    if (bit_is_set)
        m_range_encoder_code += m_range_encoder_range;

    return normalize_range_encoder();
}
2023-03-05 13:34:05 +00:00
// Decodes a single bit using the given adaptive probability and updates that
// probability according to the decoded result.
ErrorOr<u8> LzmaDecompressor::decode_bit_with_probability(Probability& probability)
{
    // "The LZMA decoder provides the pointer to CProb variable that contains
    //  information about estimated probability for symbol 0 and the Range Decoder
    //  updates that CProb variable after decoding."

    u32 const bound = (m_range_decoder_range >> probability_bit_count) * probability;
    bool const is_zero_bit = m_range_decoder_code < bound;

    dbgln_if(LZMA_DEBUG, " Decoding bit {} with probability = {:#x}, bound = {:#x}, code = {:#x}, range = {:#x} ", is_zero_bit ? 0 : 1, probability, bound, m_range_decoder_code, m_range_decoder_range);

    if (!is_zero_bit) {
        // A one was decoded: lower the probability and continue in the part above the bound.
        probability -= probability >> probability_shift_width;
        m_range_decoder_code -= bound;
        m_range_decoder_range -= bound;
        TRY(normalize_range_decoder());
        return 1;
    }

    // A zero was decoded: raise the probability and continue in the part below the bound.
    probability += ((1 << probability_bit_count) - probability) >> probability_shift_width;
    m_range_decoder_range = bound;
    TRY(normalize_range_decoder());
    return 0;
}
2023-05-03 09:15:37 +00:00
// Encodes a single bit using the given adaptive probability and updates that
// probability according to the encoded value.
ErrorOr<void> LzmaCompressor::encode_bit_with_probability(Probability& probability, u8 value)
{
    u32 const bound = (m_range_encoder_range >> probability_bit_count) * probability;

    dbgln_if(LZMA_DEBUG, " Encoding bit {} with probability = {:#x}, bound = {:#x}, code = {:#x}, range = {:#x} ", value, probability, bound, m_range_encoder_code, m_range_encoder_range);

    if (value != 0) {
        // Non-zero bit: lower the probability and move into the part above the bound.
        probability -= probability >> probability_shift_width;
        m_range_encoder_code += bound;
        m_range_encoder_range -= bound;
    } else {
        // Zero bit: raise the probability and shrink the range to the part below the bound.
        probability += ((1 << probability_bit_count) - probability) >> probability_shift_width;
        m_range_encoder_range = bound;
    }

    return normalize_range_encoder();
}
2023-03-05 13:34:05 +00:00
// Decodes a `bit_count`-bit symbol, most significant bit first. Every decoded bit selects
// the child node of the probability tree that is used for the following bit.
ErrorOr<u16> LzmaDecompressor::decode_symbol_using_bit_tree(size_t bit_count, Span<Probability> probability_tree)
{
    VERIFY(bit_count <= sizeof(u16) * 8);
    VERIFY(probability_tree.size() >= 1ul << bit_count);

    // Unlike the reference implementation, the decoded symbol and the position inside of
    // the tree are tracked separately to keep the code easier to follow.
    u16 symbol = 0;
    size_t node = 1;

    size_t bits_left = bit_count;
    while (bits_left > 0) {
        u16 const bit = TRY(decode_bit_with_probability(probability_tree[node]));
        symbol = (symbol << 1) | bit;
        node = (node << 1) | bit;
        --bits_left;
    }

    dbgln_if(LZMA_DEBUG, " Decoded value {:#x} with {} bits using bit tree ", symbol, bit_count);

    return symbol;
}
2023-05-03 09:15:37 +00:00
// Encodes the lowest `bit_count` bits of `value`, most significant bit first. Every encoded
// bit selects the child node of the probability tree that is used for the following bit.
ErrorOr<void> LzmaCompressor::encode_symbol_using_bit_tree(size_t bit_count, Span<Probability> probability_tree, u16 value)
{
    VERIFY(bit_count <= sizeof(u16) * 8);
    VERIFY(probability_tree.size() >= 1ul << bit_count);
    VERIFY(value <= (1 << bit_count) - 1);

    size_t node = 1;

    // Walk from the highest relevant bit down to bit zero, so that the most significant
    // bit is the first one to be sent.
    for (size_t bits_left = bit_count; bits_left > 0; --bits_left) {
        u8 const bit = (value >> (bits_left - 1)) & 1;

        TRY(encode_bit_with_probability(probability_tree[node], bit));
        node = (node << 1) | bit;
    }

    dbgln_if(LZMA_DEBUG, " Encoded value {:#x} with {} bits using bit tree ", value, bit_count);

    return {};
}
2023-03-05 13:34:05 +00:00
// Decodes a `bit_count`-bit symbol, least significant bit first. The probability tree is
// walked the same way as in the regular variant; only the bit placement in the result differs.
ErrorOr<u16> LzmaDecompressor::decode_symbol_using_reverse_bit_tree(size_t bit_count, Span<Probability> probability_tree)
{
    VERIFY(bit_count <= sizeof(u16) * 8);
    VERIFY(probability_tree.size() >= 1ul << bit_count);

    u16 symbol = 0;
    size_t node = 1;

    size_t bit_position = 0;
    while (bit_position < bit_count) {
        u16 const bit = TRY(decode_bit_with_probability(probability_tree[node]));
        symbol |= bit << bit_position;
        node = (node << 1) | bit;
        ++bit_position;
    }

    dbgln_if(LZMA_DEBUG, " Decoded value {:#x} with {} bits using reverse bit tree ", symbol, bit_count);

    return symbol;
}
2023-05-03 09:15:37 +00:00
// Encodes the lowest `bit_count` bits of `value`, least significant bit first. The
// probability tree is walked the same way as in the regular variant.
ErrorOr<void> LzmaCompressor::encode_symbol_using_reverse_bit_tree(size_t bit_count, Span<Probability> probability_tree, u16 value)
{
    VERIFY(bit_count <= sizeof(u16) * 8);
    VERIFY(probability_tree.size() >= 1ul << bit_count);
    VERIFY(value <= (1 << bit_count) - 1);

    size_t node = 1;

    for (size_t bit_position = 0; bit_position < bit_count; ++bit_position) {
        u8 const bit = (value >> bit_position) & 1;

        TRY(encode_bit_with_probability(probability_tree[node], bit));
        node = (node << 1) | bit;
    }

    dbgln_if(LZMA_DEBUG, " Encoded value {:#x} with {} bits using reverse bit tree ", value, bit_count);

    return {};
}
2023-03-05 13:34:05 +00:00
// Decodes a single literal byte and appends it to the dictionary.
ErrorOr<void> LzmaDecompressor::decode_literal_to_output_buffer()
{
    // The byte that was written last takes part in the context. It stays zero if the
    // dictionary has nothing to seek back to.
    u8 previous_byte = 0;
    if (m_dictionary->seekback_limit() > 0) {
        auto previous_byte_span = MUST(m_dictionary->read_with_seekback({ &previous_byte, sizeof(previous_byte) }, 1));
        VERIFY(previous_byte_span.size() == sizeof(previous_byte));
    }

    // "To select the table for decoding it uses the context that consists of
    //  (lc) high bits from previous literal and (lp) low bits from value that
    //  represents current position in outputStream."
    u16 const position_part = m_total_processed_bytes & ((1 << m_options.literal_position_bits) - 1);
    u16 const output_part = previous_byte >> (8 - m_options.literal_context_bits);
    u16 const literal_state = position_part << m_options.literal_context_bits | output_part;

    Span<Probability> literal_table = m_literal_probabilities.span().slice(literal_probability_table_size * literal_state, literal_probability_table_size);

    // The symbol starts out as 1 and is kept in 16 bits: the extra leading bit marks how many
    // bits have been decoded so far and doubles as the index into the probability table.
    // It is removed again before the byte is written to the dictionary.
    u16 symbol = 1;

    // "If (State > 7), the Literal Decoder also uses "matchByte" that represents
    //  the byte in OutputStream at position the is the DISTANCE bytes before
    //  current position, where the DISTANCE is the distance in DISTANCE-LENGTH pair
    //  of latest decoded match."
    // Note: While the specification text says `(State > 7)`, the reference implementation checks
    //       for `(State >= 7)`. Decoding real files with `(State > 7)` fails, so the reference
    //       implementation seems to be the one that is right.
    if (m_state >= 7) {
        u8 matched_byte = 0;
        auto matched_byte_span = TRY(m_dictionary->read_with_seekback({ &matched_byte, sizeof(matched_byte) }, current_repetition_offset()));
        VERIFY(matched_byte_span.size() == sizeof(matched_byte));

        dbgln_if(LZMA_DEBUG, " Decoding literal using match byte {:#x} ", matched_byte);

        // Use the match-aware part of the table for as long as the decoded bits agree with
        // the match byte. The symbol is below 0x100 on entry, so this runs at least once.
        while (symbol < 0x100) {
            u8 const match_bit = (matched_byte >> 7) & 1;
            matched_byte <<= 1;

            u8 const decoded_bit = TRY(decode_bit_with_probability(literal_table[((1 + match_bit) << 8) + symbol]));
            symbol = symbol << 1 | decoded_bit;

            if (match_bit != decoded_bit)
                break;
        }
    }

    // Whatever is still missing is decoded with the plain part of the table.
    while (symbol < 0x100) {
        u8 const decoded_bit = TRY(decode_bit_with_probability(literal_table[symbol]));
        symbol = (symbol << 1) | decoded_bit;
    }

    u8 actual_result = symbol - 0x100;

    size_t const written_bytes = m_dictionary->write({ &actual_result, sizeof(actual_result) });
    VERIFY(written_bytes == sizeof(actual_result));
    m_total_processed_bytes += sizeof(actual_result);

    dbgln_if(LZMA_DEBUG, " Decoded literal {:#x} in state {} using literal state {:#x} (previous byte is {:#x}) ", actual_result, m_state, literal_state, previous_byte);

    return {};
}
2023-05-03 09:15:37 +00:00
// Encodes a single literal byte. The structure follows `decode_literal_to_output_buffer`,
// which is where the matching quotes from the specification can be found.
ErrorOr<void> LzmaCompressor::encode_literal(u8 literal)
{
    TRY(encode_match_type(MatchType::Literal));

    // Note: The byte that is being encoded has already been read from the input, which means
    //       that it is part of the seekback buffer and moves all seekback offsets by one.
    u8 previous_byte = 0;
    if (m_dictionary->seekback_limit() - m_dictionary->used_space() > 1) {
        auto previous_byte_span = MUST(m_dictionary->read_with_seekback({ &previous_byte, sizeof(previous_byte) }, 2 + m_dictionary->used_space()));
        VERIFY(previous_byte_span.size() == sizeof(previous_byte));
    }

    u16 const position_part = m_total_processed_bytes & ((1 << m_options.literal_position_bits) - 1);
    u16 const output_part = previous_byte >> (8 - m_options.literal_context_bits);
    u16 const literal_state = position_part << m_options.literal_context_bits | output_part;

    Span<Probability> literal_table = m_literal_probabilities.span().slice(literal_probability_table_size * literal_state, literal_probability_table_size);

    // `literal` is consumed bit by bit below, so remember the untouched value for logging.
    auto const original_literal = literal;

    // Removes the most significant bit from `literal` and returns it.
    auto take_next_literal_bit = [&literal]() -> u8 {
        u8 const top_bit = (literal & 0x80) >> 7;
        literal <<= 1;
        return top_bit;
    };

    u16 symbol = 1;

    if (m_state >= 7) {
        u8 matched_byte = 0;
        auto matched_byte_span = TRY(m_dictionary->read_with_seekback({ &matched_byte, sizeof(matched_byte) }, current_repetition_offset() + m_dictionary->used_space() + 1));
        VERIFY(matched_byte_span.size() == sizeof(matched_byte));

        dbgln_if(LZMA_DEBUG, " Encoding literal using match byte {:#x} ", matched_byte);

        // Use the match-aware part of the table for as long as the literal bits agree with
        // the match byte. The symbol is below 0x100 on entry, so this runs at least once.
        while (symbol < 0x100) {
            u8 const match_bit = (matched_byte >> 7) & 1;
            matched_byte <<= 1;

            u8 const encoded_bit = take_next_literal_bit();

            TRY(encode_bit_with_probability(literal_table[((1 + match_bit) << 8) + symbol], encoded_bit));
            symbol = symbol << 1 | encoded_bit;

            if (match_bit != encoded_bit)
                break;
        }
    }

    // Whatever is still left is encoded with the plain part of the table.
    while (symbol < 0x100) {
        u8 const encoded_bit = take_next_literal_bit();

        TRY(encode_bit_with_probability(literal_table[symbol], encoded_bit));
        symbol = (symbol << 1) | encoded_bit;
    }

    m_total_processed_bytes += sizeof(literal);

    dbgln_if(LZMA_DEBUG, " Encoded literal {:#x} in state {} using literal state {:#x} (previous byte is {:#x}) ", original_literal, m_state, literal_state, previous_byte);

    update_state_after_literal();

    return {};
}
// Encodes a match whose distance is equal to one of the four remembered repetition
// distances. The distance that was used is moved to the front of that list.
// Calling this with a distance that is not in the list is a programming error.
ErrorOr<void> LzmaCompressor::encode_existing_match(size_t real_distance, size_t real_length)
{
    VERIFY(real_distance >= normalized_to_real_match_distance_offset);
    VERIFY(real_length >= normalized_to_real_match_length_offset);

    u32 const normalized_distance = real_distance - normalized_to_real_match_distance_offset;
    u16 const normalized_length = real_length - normalized_to_real_match_length_offset;

    if (normalized_distance == m_rep0) {
        // Already at the front, nothing has to be moved.
        TRY(encode_match_type(MatchType::RepMatch0));
    } else if (normalized_distance == m_rep1) {
        TRY(encode_match_type(MatchType::RepMatch1));

        u32 const promoted_distance = m_rep1;
        m_rep1 = m_rep0;
        m_rep0 = promoted_distance;
    } else if (normalized_distance == m_rep2) {
        TRY(encode_match_type(MatchType::RepMatch2));

        u32 const promoted_distance = m_rep2;
        m_rep2 = m_rep1;
        m_rep1 = m_rep0;
        m_rep0 = promoted_distance;
    } else if (normalized_distance == m_rep3) {
        TRY(encode_match_type(MatchType::RepMatch3));

        u32 const promoted_distance = m_rep3;
        m_rep3 = m_rep2;
        m_rep2 = m_rep1;
        m_rep1 = m_rep0;
        m_rep0 = promoted_distance;
    } else {
        VERIFY_NOT_REACHED();
    }

    TRY(encode_normalized_match_length(m_rep_length_coder, normalized_length));
    update_state_after_rep();

    // The matched bytes have been handled, so drop them from the dictionary and count them.
    MUST(m_dictionary->discard(real_length));
    m_total_processed_bytes += real_length;

    return {};
}
// Encodes a match with an explicitly transmitted distance. Distance and length are
// converted to their normalized forms before they are passed on.
ErrorOr<void> LzmaCompressor::encode_new_match(size_t real_distance, size_t real_length)
{
    VERIFY(real_distance >= normalized_to_real_match_distance_offset);
    VERIFY(real_length >= normalized_to_real_match_length_offset);

    u32 const normalized_distance = real_distance - normalized_to_real_match_distance_offset;
    u16 const normalized_length = real_length - normalized_to_real_match_length_offset;

    TRY(encode_normalized_simple_match(normalized_distance, normalized_length));

    // The matched bytes have been handled, so drop them from the dictionary and count them.
    MUST(m_dictionary->discard(real_length));
    m_total_processed_bytes += real_length;

    return {};
}
// Encodes a "simple match", i.e. a match whose distance is written out in full.
// Both the distance and the length are expected in their normalized form.
// Any error from the individual encoding steps is passed on to the caller.
ErrorOr < void > LzmaCompressor : : encode_normalized_simple_match ( u32 normalized_distance , u16 normalized_length )
{
// Announce the match type first.
TRY ( encode_match_type ( MatchType : : SimpleMatch ) ) ;
// Move the remembered distances down by one slot (the oldest one is dropped).
// m_rep0 is only overwritten at the end of this function.
m_rep3 = m_rep2 ;
m_rep2 = m_rep1 ;
m_rep1 = m_rep0 ;
// The length is encoded first, followed by the state update, followed by the distance.
TRY ( encode_normalized_match_length ( m_length_coder , normalized_length ) ) ;
update_state_after_match ( ) ;
TRY ( encode_normalized_match_distance ( normalized_length , normalized_distance ) ) ;
// The new distance becomes the most recent remembered distance.
m_rep0 = normalized_distance ;
return { } ;
}
2023-05-01 11:01:06 +00:00
// Resets all low, medium and high length probability tables to `default_probability`.
LzmaState::LzmaLengthCoderState::LzmaLengthCoderState()
{
    initialize_to_default_probability(m_high_length_probabilities);

    for (auto& medium_tree : m_medium_length_probabilities)
        initialize_to_default_probability(medium_tree);

    for (auto& low_tree : m_low_length_probabilities)
        initialize_to_default_probability(low_tree);
}
2023-05-01 11:01:06 +00:00
// Decodes a normalized (zero-based) match length using the given length coder state.
ErrorOr<u16> LzmaDecompressor::decode_normalized_match_length(LzmaLengthCoderState& length_decoder_state)
{
    // "LZMA uses "posState" value as context to select the binary tree
    //  from LowCoder and MidCoder binary tree arrays:"
    u16 const position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1);

    // "The following scheme is used for the match length encoding:
    //
    //    Binary encoding    Binary Tree structure    Zero-based match length
    //    sequence                                    (binary + decimal):
    //
    //    0 xxx              LowCoder[posState]       xxx
    //    1 0 yyy            MidCoder[posState]       yyy + 8
    //    1 1 zzzzzzzz       HighCoder                zzzzzzzz + 16"

    u8 const first_choice = TRY(decode_bit_with_probability(length_decoder_state.m_first_choice_probability));
    if (first_choice == 0) {
        // Low range: three bits, no offset.
        return decode_symbol_using_bit_tree(3, length_decoder_state.m_low_length_probabilities[position_state].span());
    }

    u8 const second_choice = TRY(decode_bit_with_probability(length_decoder_state.m_second_choice_probability));
    if (second_choice == 0) {
        // Medium range: three bits, offset by 8.
        u16 const medium_symbol = TRY(decode_symbol_using_bit_tree(3, length_decoder_state.m_medium_length_probabilities[position_state].span()));
        return medium_symbol + 8;
    }

    // High range: eight bits, offset by 16.
    u16 const high_symbol = TRY(decode_symbol_using_bit_tree(8, length_decoder_state.m_high_length_probabilities.span()));
    return high_symbol + 16;
}
2023-05-03 09:15:37 +00:00
// Encodes a normalized (zero-based) match length using the given length coder state.
// Lengths below 8 use the low coder, lengths below 16 the medium coder and everything
// else the high coder, each preceded by the corresponding choice bits.
ErrorOr<void> LzmaCompressor::encode_normalized_match_length(LzmaLengthCoderState& length_coder_state, u16 normalized_length)
{
    constexpr u16 first_medium_length = 8;
    constexpr u16 first_high_length = 16;

    u16 const position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1);

    if (normalized_length < first_medium_length) {
        // Encoded as "0 xxx".
        TRY(encode_bit_with_probability(length_coder_state.m_first_choice_probability, 0));
        TRY(encode_symbol_using_bit_tree(3, length_coder_state.m_low_length_probabilities[position_state].span(), normalized_length));
    } else if (normalized_length < first_high_length) {
        // Encoded as "1 0 yyy".
        TRY(encode_bit_with_probability(length_coder_state.m_first_choice_probability, 1));
        TRY(encode_bit_with_probability(length_coder_state.m_second_choice_probability, 0));
        TRY(encode_symbol_using_bit_tree(3, length_coder_state.m_medium_length_probabilities[position_state].span(), normalized_length - 8));
    } else {
        // Encoded as "1 1 zzzzzzzz".
        TRY(encode_bit_with_probability(length_coder_state.m_first_choice_probability, 1));
        TRY(encode_bit_with_probability(length_coder_state.m_second_choice_probability, 1));
        TRY(encode_symbol_using_bit_tree(8, length_coder_state.m_high_length_probabilities.span(), normalized_length - 16));
    }

    return {};
}
2023-03-05 13:34:05 +00:00
// Decodes the zero-based distance of a simple match.
//
// The match length (capped to the number of length-to-position states) selects
// the bit tree from which a 6-bit "position slot" is decoded. The slot then
// determines how the rest of the distance is stored:
//
//   posSlot      zero-based distance (binary)
//   0..3         the slot value itself
//   4..13        1x followed by (posSlot / 2) - 1 bits from a reverse bit tree
//                that is specific to the slot
//   14..63       1x followed by direct bits from the range decoder ("y") and
//                four bits from the shared reverse "alignment" bit tree ("zzzz")
//
// where "x" is the lowest bit of the position slot.
ErrorOr<u32> LzmaDecompressor::decode_normalized_match_distance(u16 normalized_match_length)
{
    u16 const length_state = min(normalized_match_length, number_of_length_to_position_states - 1);
    u16 const position_slot = TRY(decode_symbol_using_bit_tree(6, m_length_to_position_states[length_state].span()));

    // The first few slots encode the distance directly.
    if (position_slot < first_position_slot_with_binary_tree_bits)
        return position_slot;

    // All remaining distances start with a set bit, followed by the lowest bit of the slot.
    u32 distance = (1 << 1) | (position_slot & 1);

    if (position_slot < first_position_slot_with_direct_encoded_bits) {
        // Medium distances: every low bit comes from the reverse bit tree that belongs to this slot.
        size_t const tree_bit_count = (position_slot / 2) - 1;
        auto& slot_tree = m_binary_tree_distance_probabilities[position_slot - first_position_slot_with_binary_tree_bits];
        auto const low_bits = TRY(decode_symbol_using_reverse_bit_tree(tree_bit_count, slot_tree));
        return (distance << tree_bit_count) | low_bits;
    }

    // Large distances: the middle bits are read as direct bits, most significant bit first...
    size_t const direct_bit_count = ((position_slot - first_position_slot_with_direct_encoded_bits) / 2) + 2;
    for (size_t remaining = direct_bit_count; remaining > 0; --remaining)
        distance = (distance << 1) | TRY(decode_direct_bit());

    // ...and the lowest bits come from the shared alignment tree.
    auto const alignment_bits = TRY(decode_symbol_using_reverse_bit_tree(number_of_alignment_bits, m_alignment_bit_probabilities));
    return (distance << number_of_alignment_bits) | alignment_bits;
}
2023-05-03 09:15:37 +00:00
// Encodes a zero-based match distance.
//
// The distance is written as a 6-bit position slot (bit tree selected by the
// capped match length) followed by a slot-dependent number of low bits.
// The slot formula has been deduced rather than taken from the decoder: it is
// made up of the index of the highest set bit and the value of the bit below it.
ErrorOr<void> LzmaCompressor::encode_normalized_match_distance(u16 normalized_match_length, u32 normalized_match_distance)
{
    u16 const length_state = min(normalized_match_length, number_of_length_to_position_states - 1);
    auto& slot_probabilities = m_length_to_position_states[length_state];

    // Small distances are used as the position slot directly and need no extra bits.
    if (normalized_match_distance < first_position_slot_with_binary_tree_bits) {
        TRY(encode_symbol_using_bit_tree(6, slot_probabilities.span(), normalized_match_distance));
        return {};
    }

    u16 const highest_set_bit = AK::log2(normalized_match_distance);
    u16 const total_bit_count = count_required_bits(normalized_match_distance);
    u16 const second_highest_bit = (normalized_match_distance >> (highest_set_bit - 1)) & 1;
    u16 const position_slot = (highest_set_bit << 1) + second_highest_bit;

    TRY(encode_symbol_using_bit_tree(6, slot_probabilities.span(), position_slot));

    // The top two bits are implied by the position slot, so only the bits below them are left to write.
    u16 const remaining_bit_count = total_bit_count - 2;
    u32 const remaining_bits = normalized_match_distance & ((1 << remaining_bit_count) - 1);

    if (position_slot < first_position_slot_with_direct_encoded_bits) {
        // Everything that is left goes through the reverse bit tree that belongs to this slot.
        auto& slot_tree = m_binary_tree_distance_probabilities[position_slot - first_position_slot_with_binary_tree_bits];
        TRY(encode_symbol_using_reverse_bit_tree(remaining_bit_count, slot_tree, remaining_bits));
        return {};
    }

    // Otherwise the value is split into alignment bits (the lowest bits) and direct bits (everything above them).
    auto const alignment_mask = (1 << number_of_alignment_bits) - 1;
    auto pending_direct_bits = remaining_bits & ~alignment_mask;
    auto const alignment_bits = remaining_bits & alignment_mask;

    // Move the first direct bit to the most significant position so that it can be peeled off from the top.
    pending_direct_bits <<= sizeof(pending_direct_bits) * 8 - remaining_bit_count;

    auto const direct_bit_count = remaining_bit_count - number_of_alignment_bits;
    for (auto i = 0u; i < direct_bit_count; i++) {
        TRY(encode_direct_bit((pending_direct_bits & 0x80000000) ? 1 : 0));
        pending_direct_bits <<= 1;
    }

    TRY(encode_symbol_using_reverse_bit_tree(number_of_alignment_bits, m_alignment_bit_probabilities, alignment_bits));
    return {};
}
2023-05-01 11:01:06 +00:00
// Returns the real (non-normalized) distance that belongs to the current rep0 value.
//
// LZMA never reads at offset 0 (the read head itself), so the stored rep values
// are shifted by `normalized_to_real_match_distance_offset`. The rep value
// 0xFFFFFFFF is reserved for the end-of-stream marker and is not expected here,
// which is why the addition is asserted not to overflow.
u32 LzmaState::current_repetition_offset() const
{
    auto const largest_mappable_rep = NumericLimits<u32>::max() - normalized_to_real_match_distance_offset;
    VERIFY(m_rep0 <= largest_mappable_rep);

    return m_rep0 + normalized_to_real_match_distance_offset;
}
2023-05-01 11:01:06 +00:00
void LzmaState : : update_state_after_literal ( )
{
if ( m_state < 4 )
m_state = 0 ;
else if ( m_state < 10 )
m_state - = 3 ;
else
m_state - = 6 ;
}
void LzmaState : : update_state_after_match ( )
{
if ( m_state < 7 )
m_state = 7 ;
else
m_state = 10 ;
2023-07-08 02:48:11 +00:00
}
2023-05-01 11:01:06 +00:00
void LzmaState : : update_state_after_rep ( )
{
if ( m_state < 7 )
m_state = 8 ;
else
m_state = 11 ;
}
void LzmaState : : update_state_after_short_rep ( )
{
if ( m_state < 7 )
m_state = 9 ;
else
m_state = 11 ;
}
2023-05-01 13:51:04 +00:00
// Decodes the type of the next packet by walking the following decision tree,
// reading one probability-coded bit per level:
//
//   IsMatch[state2]            0 -> Literal
//   1: IsRep[state]            0 -> Simple Match
//   1: IsRepG0[state]          0 -> distance is rep0:
//        IsRep0Long[state2]    0 -> Short Rep Match
//                              1 -> Rep Match 0
//   1: IsRepG1[state]          0 -> Rep Match 1
//   1: IsRepG2[state]          0 -> Rep Match 2
//                              1 -> Rep Match 3
//
// "state2" combines the current state with the position state and indexes the
// IsMatch and IsRep0Long tables; all other tables are indexed by the state alone.
ErrorOr<LzmaDecompressor::MatchType> LzmaDecompressor::decode_match_type()
{
    u16 position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1);
    u16 state2 = (m_state << maximum_number_of_position_bits) + position_state;

    auto const is_match = TRY(decode_bit_with_probability(m_is_match_probabilities[state2]));
    if (is_match == 0) {
        dbgln_if(LZMA_DEBUG, " Decoded match type 'Literal' ");
        return MatchType::Literal;
    }

    auto const is_rep = TRY(decode_bit_with_probability(m_is_rep_probabilities[m_state]));
    if (is_rep == 0) {
        dbgln_if(LZMA_DEBUG, " Decoded match type 'SimpleMatch' ");
        return MatchType::SimpleMatch;
    }

    auto const is_rep_beyond_0 = TRY(decode_bit_with_probability(m_is_rep_g0_probabilities[m_state]));
    if (is_rep_beyond_0 == 0) {
        // The distance is rep0, one more bit distinguishes the short from the long variant.
        auto const is_rep0_long = TRY(decode_bit_with_probability(m_is_rep0_long_probabilities[state2]));
        if (is_rep0_long == 0) {
            dbgln_if(LZMA_DEBUG, " Decoded match type 'ShortRepMatch' ");
            return MatchType::ShortRepMatch;
        }

        dbgln_if(LZMA_DEBUG, " Decoded match type 'RepMatch0' ");
        return MatchType::RepMatch0;
    }

    auto const is_rep_beyond_1 = TRY(decode_bit_with_probability(m_is_rep_g1_probabilities[m_state]));
    if (is_rep_beyond_1 == 0) {
        dbgln_if(LZMA_DEBUG, " Decoded match type 'RepMatch1' ");
        return MatchType::RepMatch1;
    }

    auto const is_rep_beyond_2 = TRY(decode_bit_with_probability(m_is_rep_g2_probabilities[m_state]));
    if (is_rep_beyond_2 == 0) {
        dbgln_if(LZMA_DEBUG, " Decoded match type 'RepMatch2' ");
        return MatchType::RepMatch2;
    }

    dbgln_if(LZMA_DEBUG, " Decoded match type 'RepMatch3' ");
    return MatchType::RepMatch3;
}
2023-05-03 09:15:37 +00:00
// Encodes the type of the next packet. This writes the path through the
// following decision tree, one probability-coded bit per level:
//
//   IsMatch[state2]            0 -> Literal
//   1: IsRep[state]            0 -> Simple Match
//   1: IsRepG0[state]          0 -> distance is rep0:
//        IsRep0Long[state2]    0 -> Short Rep Match
//                              1 -> Rep Match 0
//   1: IsRepG1[state]          0 -> Rep Match 1
//   1: IsRepG2[state]          0 -> Rep Match 2
//                              1 -> Rep Match 3
//
// "state2" combines the current state with the position state and indexes the
// IsMatch and IsRep0Long tables; all other tables are indexed by the state alone.
ErrorOr<void> LzmaCompressor::encode_match_type(MatchType match_type)
{
    u16 position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1);
    u16 state2 = (m_state << maximum_number_of_position_bits) + position_state;

    if (match_type == MatchType::Literal) {
        TRY(encode_bit_with_probability(m_is_match_probabilities[state2], 0));
        dbgln_if(LZMA_DEBUG, " Encoded match type 'Literal' ");
        return {};
    }
    TRY(encode_bit_with_probability(m_is_match_probabilities[state2], 1));

    if (match_type == MatchType::SimpleMatch) {
        TRY(encode_bit_with_probability(m_is_rep_probabilities[m_state], 0));
        dbgln_if(LZMA_DEBUG, " Encoded match type 'SimpleMatch' ");
        return {};
    }
    TRY(encode_bit_with_probability(m_is_rep_probabilities[m_state], 1));

    if (match_type == MatchType::ShortRepMatch || match_type == MatchType::RepMatch0) {
        // Both types reuse rep0, the IsRep0Long bit tells them apart (set for the long variant).
        TRY(encode_bit_with_probability(m_is_rep_g0_probabilities[m_state], 0));
        TRY(encode_bit_with_probability(m_is_rep0_long_probabilities[state2], match_type == MatchType::RepMatch0));
        if constexpr (LZMA_DEBUG) {
            // The enumerator is qualified like every other use in this file, so this
            // branch does not depend on the enumerators being visible unqualified.
            if (match_type == MatchType::RepMatch0)
                dbgln(" Encoded match type 'RepMatch0' ");
            else
                dbgln(" Encoded match type 'ShortRepMatch' ");
        }
        return {};
    }
    TRY(encode_bit_with_probability(m_is_rep_g0_probabilities[m_state], 1));

    if (match_type == MatchType::RepMatch1) {
        TRY(encode_bit_with_probability(m_is_rep_g1_probabilities[m_state], 0));
        dbgln_if(LZMA_DEBUG, " Encoded match type 'RepMatch1' ");
        return {};
    }
    TRY(encode_bit_with_probability(m_is_rep_g1_probabilities[m_state], 1));

    if (match_type == MatchType::RepMatch2) {
        TRY(encode_bit_with_probability(m_is_rep_g2_probabilities[m_state], 0));
        dbgln_if(LZMA_DEBUG, " Encoded match type 'RepMatch2' ");
        return {};
    }

    // Only Rep Match 3 is left at this point.
    TRY(encode_bit_with_probability(m_is_rep_g2_probabilities[m_state], 1));
    dbgln_if(LZMA_DEBUG, " Encoded match type 'RepMatch3' ");
    return {};
}
// Encodes exactly one packet from the data that is buffered in the dictionary.
//
// The options are tried in this order:
//   1. a match at one of the four remembered distances (rep0..rep3),
//   2. a match at any other distance in the seekback area,
//   3. a literal for the next buffered byte.
ErrorOr<void> LzmaCompressor::encode_once()
{
    // The remembered distances are stored normalized, the dictionary expects real distances.
    Vector<size_t> const remembered_distances {
        m_rep0 + normalized_to_real_match_distance_offset,
        m_rep1 + normalized_to_real_match_distance_offset,
        m_rep2 + normalized_to_real_match_distance_offset,
        m_rep3 + normalized_to_real_match_distance_offset,
    };

    if (auto remembered_result = m_dictionary->find_copy_in_seekback(remembered_distances, m_dictionary->used_space(), normalized_to_real_match_length_offset); remembered_result.has_value()) {
        auto const match = remembered_result.release_value();
        TRY(encode_existing_match(match.distance, match.length));
        return {};
    }

    // None of the remembered distances were usable, so look through the rest of the dictionary.
    if (auto fresh_result = m_dictionary->find_copy_in_seekback(m_dictionary->used_space(), normalized_to_real_match_length_offset); fresh_result.has_value()) {
        auto const match = fresh_result.release_value();
        TRY(encode_new_match(match.distance, match.length));
        return {};
    }

    // Without any match, the only option left is to emit the next byte as a literal.
    u8 literal { 0 };
    TRY(m_dictionary->read({ &literal, sizeof(literal) }));
    TRY(encode_literal(literal));
    return {};
}
2023-03-05 13:34:05 +00:00
// Decodes LZMA packets into the dictionary and then hands buffered output to the caller.
//
// Packets are decoded until the dictionary holds at least `bytes.size()` bytes of pending
// output, the dictionary has no empty space left, the end-of-stream marker has been found,
// or the expected data size has been reached while the range decoder is in a clean state.
// The return value is whatever `m_dictionary->read(bytes)` yields afterwards.
//
// Errors are returned for failures of the underlying decode helpers and for the stream
// inconsistencies that are detected below (see the individual string literals).
ErrorOr < Bytes > LzmaDecompressor : : read_some ( Bytes bytes )
{
2023-03-16 12:07:47 +00:00
while ( m_dictionary - > used_space ( ) < bytes . size ( ) & & m_dictionary - > empty_space ( ) ! = 0 ) {
2023-03-29 22:39:36 +00:00
if ( m_found_end_of_stream_marker )
2023-03-05 13:34:05 +00:00
break ;
2023-03-29 21:37:59 +00:00
if ( has_reached_expected_data_size ( ) ) {
2023-03-29 22:39:36 +00:00
// If the decoder is in a clean state, we assume that this is fine.
if ( is_range_decoder_in_clean_state ( ) )
break ;
// Otherwise, we give it one last try to find the end marker in the remaining data.
2023-03-05 13:34:05 +00:00
}
// Shared helper for all match types: copies `real_length` bytes from the current repetition
// offset. `real_length` is taken by value; whatever the dictionary could not copy in one go
// is remembered in `m_leftover_match_length` and finished on a later loop iteration.
auto copy_match_to_buffer = [ & ] ( u16 real_length ) - > ErrorOr < void > {
VERIFY ( ! m_leftover_match_length . has_value ( ) ) ;
2023-05-01 11:01:06 +00:00
if ( m_options . uncompressed_size . has_value ( ) & & m_options . uncompressed_size . value ( ) < m_total_processed_bytes + real_length )
2023-03-05 13:34:05 +00:00
return Error : : from_string_literal ( " Tried to copy match beyond expected uncompressed file size " ) ;
2023-04-04 17:50:17 +00:00
auto copied_length = TRY ( m_dictionary - > copy_from_seekback ( current_repetition_offset ( ) , real_length ) ) ;
2023-05-01 11:01:06 +00:00
m_total_processed_bytes + = copied_length ;
2023-04-04 17:50:17 +00:00
real_length - = copied_length ;
if ( real_length > 0 )
m_leftover_match_length = real_length ;
2023-03-05 13:34:05 +00:00
return { } ;
} ;
// If we have a leftover part of a repeating match, we should finish that first.
if ( m_leftover_match_length . has_value ( ) ) {
// release_value() empties the optional first, which satisfies the VERIFY inside the helper.
TRY ( copy_match_to_buffer ( m_leftover_match_length . release_value ( ) ) ) ;
continue ;
}
2023-05-01 13:51:04 +00:00
auto const match_type = TRY ( decode_match_type ( ) ) ;
2023-03-29 22:39:36 +00:00
2023-05-01 13:51:04 +00:00
// If we are looking for EOS, but find another match type, the stream is also corrupted.
if ( has_reached_expected_data_size ( ) & & match_type ! = MatchType : : SimpleMatch )
return Error : : from_string_literal ( " First match type after the expected uncompressed size is not a simple match " ) ;
if ( match_type = = MatchType : : Literal ) {
2023-03-05 13:34:05 +00:00
// "At first the LZMA decoder must check that it doesn't exceed
// specified uncompressed size."
// This is already checked for at the beginning of the loop.
// "Then it decodes literal value and puts it to sliding window."
TRY ( decode_literal_to_output_buffer ( ) ) ;
// "Then the decoder must update the "state" value."
update_state_after_literal ( ) ;
continue ;
}
2023-05-01 13:51:04 +00:00
if ( match_type = = MatchType : : SimpleMatch ) {
2023-03-05 13:34:05 +00:00
// "The distance history table is updated with the following scheme:"
m_rep3 = m_rep2 ;
m_rep2 = m_rep1 ;
m_rep1 = m_rep0 ;
// "The zero-based length is decoded with "LenDecoder"."
2023-05-01 11:01:06 +00:00
u16 normalized_length = TRY ( decode_normalized_match_length ( m_length_coder ) ) ;
2023-03-05 13:34:05 +00:00
// "The state is update with UpdateState_Match function."
update_state_after_match ( ) ;
// "and the new "rep0" value is decoded with DecodeDistance."
m_rep0 = TRY ( decode_normalized_match_distance ( normalized_length ) ) ;
// "If the value of "rep0" is equal to 0xFFFFFFFF, it means that we have
// "End of stream" marker, so we can stop decoding and check finishing
// condition in Range Decoder"
2023-05-03 09:15:37 +00:00
if ( m_rep0 = = end_of_stream_marker ) {
2023-03-29 22:39:36 +00:00
// If we should reject end-of-stream markers, do so now.
// Note that this is not part of LZMA, as LZMA allows end-of-stream markers in all contexts, so pure LZMA should never set this option.
if ( m_options . reject_end_of_stream_marker )
return Error : : from_string_literal ( " An end-of-stream marker was found, but the LZMA stream is configured to reject them " ) ;
2023-03-05 13:34:05 +00:00
// The range decoder condition is checked after breaking out of the loop.
m_found_end_of_stream_marker = true ;
// The flag is picked up at the top of the next iteration (or by the loop condition), which ends the loop.
continue ;
}
2023-03-29 22:39:36 +00:00
// If we are looking for EOS, but haven't found it here, the stream is corrupted.
if ( has_reached_expected_data_size ( ) )
return Error : : from_string_literal ( " First simple match after the expected uncompressed size is not the EOS marker " ) ;
2023-03-05 13:34:05 +00:00
// "If uncompressed size is defined, LZMA decoder must check that it doesn't
// exceed that specified uncompressed size."
// This is being checked for in the common "copy to buffer" implementation.
// "Also the decoder must check that "rep0" value is not larger than dictionary size
// and is not larger than the number of already decoded bytes."
2023-03-29 12:46:36 +00:00
if ( current_repetition_offset ( ) > m_dictionary - > seekback_limit ( ) )
2023-03-05 13:34:05 +00:00
return Error : : from_string_literal ( " rep0 value is larger than the possible lookback size " ) ;
// "Then the decoder must copy match bytes as described in
// "The match symbols copying" section."
TRY ( copy_match_to_buffer ( normalized_length + normalized_to_real_match_length_offset ) ) ;
continue ;
}
2023-05-01 13:51:04 +00:00
if ( match_type = = MatchType : : ShortRepMatch ) {
2023-03-05 13:34:05 +00:00
// "LZMA doesn't update the distance history."
2023-05-01 13:51:04 +00:00
// "If the subtype is "Short Rep Match", the decoder updates the state, puts
// the one byte from window to current position in window and goes to next
// MATCH/LITERAL symbol."
update_state_after_short_rep ( ) ;
2023-03-05 13:34:05 +00:00
2023-05-01 13:51:04 +00:00
TRY ( copy_match_to_buffer ( 1 ) ) ;
2023-03-05 13:34:05 +00:00
2023-05-01 13:51:04 +00:00
continue ;
}
2023-03-05 13:34:05 +00:00
2023-05-01 13:51:04 +00:00
// Note: We don't need to do anything specific for "Rep Match 0", we just need to make sure to not
// run the detection for other match types and to not switch around the distance history.
2023-03-05 13:34:05 +00:00
// For "Rep Match 1/2/3" the selected distance moves to the front of the history (rep0)
// and the entries that were in front of it each shift back by one position.
2023-05-01 13:51:04 +00:00
if ( match_type = = MatchType : : RepMatch1 ) {
u32 distance = m_rep1 ;
m_rep1 = m_rep0 ;
m_rep0 = distance ;
}
if ( match_type = = MatchType : : RepMatch2 ) {
u32 distance = m_rep2 ;
m_rep2 = m_rep1 ;
m_rep1 = m_rep0 ;
m_rep0 = distance ;
}
if ( match_type = = MatchType : : RepMatch3 ) {
u32 distance = m_rep3 ;
m_rep3 = m_rep2 ;
m_rep2 = m_rep1 ;
m_rep1 = m_rep0 ;
m_rep0 = distance ;
2023-03-05 13:34:05 +00:00
}
// "In other cases (Rep Match 0/1/2/3), it decodes the zero-based
// length of match with "RepLenDecoder" decoder."
2023-05-01 11:01:06 +00:00
u16 normalized_length = TRY ( decode_normalized_match_length ( m_rep_length_coder ) ) ;
2023-03-05 13:34:05 +00:00
// "Then it updates the state."
update_state_after_rep ( ) ;
// "Then the decoder must copy match bytes as described in
// "The Match symbols copying" section."
TRY ( copy_match_to_buffer ( normalized_length + normalized_to_real_match_length_offset ) ) ;
}
// Final consistency checks once the stream is considered finished.
// NOTE(review): the loop can also end because enough output is buffered. If that coincides with
// reaching the expected size while an end-of-stream marker is still pending, this presumably
// reports an unclean state without having tried to read the marker - confirm whether that
// combination can occur in practice.
2023-03-29 22:39:36 +00:00
if ( m_found_end_of_stream_marker | | has_reached_expected_data_size ( ) ) {
2023-05-01 11:01:06 +00:00
if ( m_options . uncompressed_size . has_value ( ) & & m_total_processed_bytes < m_options . uncompressed_size . value ( ) )
2023-03-29 22:39:36 +00:00
return Error : : from_string_literal ( " Found end-of-stream marker earlier than expected " ) ;
2023-03-29 21:37:59 +00:00
if ( ! is_range_decoder_in_clean_state ( ) )
2023-03-05 13:34:05 +00:00
return Error : : from_string_literal ( " LZMA stream ends in an unclean state " ) ;
}
2023-03-16 12:07:47 +00:00
return m_dictionary - > read ( bytes ) ;
2023-03-05 13:34:05 +00:00
}
// The decompressor is a read-only stream, so every write attempt is rejected with EBADF.
ErrorOr<size_t> LzmaDecompressor::write_some(ReadonlyBytes)
{
    return Error::from_errno(EBADF);
}
bool LzmaDecompressor : : is_eof ( ) const
{
2023-03-16 12:07:47 +00:00
if ( m_dictionary - > used_space ( ) > 0 )
2023-03-05 13:34:05 +00:00
return false ;
2023-03-29 21:37:59 +00:00
if ( has_reached_expected_data_size ( ) )
return true ;
2023-03-05 13:34:05 +00:00
return m_found_end_of_stream_marker ;
}
bool LzmaDecompressor : : is_open ( ) const
{
return true ;
}
void LzmaDecompressor : : close ( )
{
}
2023-05-03 09:15:37 +00:00
// Creates a compressor that writes to `stream`, starting with the LZMA header
// that is derived from `options`.
//
// The header is written to the stream before the compressor object is
// allocated. Any failure (allocation, header creation, stream write) is
// returned as an error.
ErrorOr<NonnullOwnPtr<LzmaCompressor>> LzmaCompressor::create_container(MaybeOwned<Stream> stream, LzmaCompressorOptions const& options)
{
    // The buffer is sized as the dictionary size plus the largest real match length.
    auto const dictionary_capacity = options.dictionary_size + largest_real_match_length;
    auto dictionary = TRY(try_make<SearchableCircularBuffer>(TRY(SearchableCircularBuffer::create_empty(dictionary_capacity))));

    // "The LZMA Decoder uses (1 << (lc + lp)) tables with CProb values, where each table contains 0x300 CProb values."
    auto const literal_table_count = 1 << (options.literal_context_bits + options.literal_position_bits);
    auto literal_probabilities = TRY(FixedArray<Probability>::create(literal_probability_table_size * literal_table_count));

    auto header = TRY(LzmaHeader::from_compressor_options(options));
    TRY(stream->write_value(header));

    return adopt_nonnull_own_or_enomem(new (nothrow) LzmaCompressor(move(stream), options, move(dictionary), move(literal_probabilities)));
}
2023-06-01 18:48:35 +00:00
// Takes over the output stream, the options, the dictionary and the literal
// probability table. The probability table is handed to the LzmaState base
// class, everything else is stored in the compressor itself.
LzmaCompressor::LzmaCompressor(MaybeOwned<AK::Stream> stream, Compress::LzmaCompressorOptions options, MaybeOwned<SearchableCircularBuffer> dictionary, FixedArray<Compress::LzmaState::Probability> literal_probabilities)
    : LzmaState(move(literal_probabilities))
    , m_stream(move(stream))
    , m_options(move(options))
    , m_dictionary(move(dictionary))
{
}
// The compressor is a write-only stream, so every read attempt is rejected with EBADF.
ErrorOr<Bytes> LzmaCompressor::read_some(Bytes)
{
    return Error::from_errno(EBADF);
}
// Buffers as much of `bytes` as fits into the look-ahead window, encodes one
// packet from the buffered data and returns the number of input bytes that
// were accepted (which may be less than `bytes.size()`).
//
// If the uncompressed size has been announced up front, buffering more data
// than announced is an error, and the stream is flushed automatically as soon
// as the announced size has been buffered completely.
ErrorOr<size_t> LzmaCompressor::write_some(ReadonlyBytes bytes)
{
    // Fill the input buffer until it's full or until we can't read any more data.
    size_t processed_bytes = min(bytes.size(), largest_real_match_length - m_dictionary->used_space());
    bytes = bytes.trim(processed_bytes);

    while (bytes.size() > 0) {
        auto const written_bytes = m_dictionary->write(bytes);
        bytes = bytes.slice(written_bytes);
    }

    VERIFY(m_dictionary->used_space() <= largest_real_match_length);

    if (m_options.uncompressed_size.has_value() && m_total_processed_bytes + m_dictionary->used_space() > m_options.uncompressed_size.value())
        return Error::from_string_literal(" Tried to compress more LZMA data than announced ");

    // Only encode if there is buffered data. encode_once() falls back to emitting a literal without
    // checking whether the dictionary actually handed out a byte, so an empty write on an empty
    // buffer would otherwise add a byte to the stream that was never part of the input.
    if (m_dictionary->used_space() > 0)
        TRY(encode_once());

    // If we read enough data to reach the final uncompressed size, flush automatically.
    // Flushing will handle encoding the remaining data for us and finalize the stream.
    if (m_options.uncompressed_size.has_value() && m_total_processed_bytes + m_dictionary->used_space() >= m_options.uncompressed_size.value())
        TRY(flush());

    return processed_bytes;
}
// Finalizes the stream: encodes everything that is still buffered, emits the
// end-of-stream marker if the uncompressed size was not announced, and writes
// out the pending state of the range encoder.
//
// Flushing a second time is an error, as is flushing before an announced
// uncompressed size has been reached.
ErrorOr<void> LzmaCompressor::flush()
{
    if (m_has_flushed_data)
        return Error::from_string_literal(" Flushed an LZMA stream twice ");

    // Drain the look-ahead buffer, one packet at a time.
    while (m_dictionary->used_space() > 0)
        TRY(encode_once());

    auto const& announced_size = m_options.uncompressed_size;
    if (announced_size.has_value() && m_total_processed_bytes < announced_size.value())
        return Error::from_string_literal(" Flushing LZMA data with known but unreached uncompressed size ");

    // The LZMA specification technically also allows both a known size and an end-of-stream marker simultaneously,
    // but LZMA2 rejects them, so skip emitting the end-of-stream marker if we know the uncompressed size.
    if (!announced_size.has_value())
        TRY(encode_normalized_simple_match(end_of_stream_marker, 0));

    // Shifting the range encoder using the normal operation handles any pending overflows.
    TRY(shift_range_encoder());

    // Now, the remaining bytes are the cached byte, the chain of 0xFF, and the upper 3 bytes of the current `code`.
    // Incrementing the values does not have to be considered as no overflows are pending. The fourth byte is the
    // null byte that we just shifted in, which should not be flushed as it would be extraneous junk data.
    TRY(m_stream->write_value<u8>(m_range_encoder_cached_byte));

    for (auto remaining = m_range_encoder_ff_chain_length; remaining > 0; --remaining)
        TRY(m_stream->write_value<u8>(0xFF));

    // Upper three bytes of `code`, most significant byte first.
    for (int shift = 24; shift >= 8; shift -= 8)
        TRY(m_stream->write_value<u8>(m_range_encoder_code >> shift));

    m_has_flushed_data = true;
    return {};
}
bool LzmaCompressor : : is_eof ( ) const
{
return true ;
}
bool LzmaCompressor : : is_open ( ) const
{
return ! m_has_flushed_data ;
}
void LzmaCompressor : : close ( )
{
if ( ! m_has_flushed_data ) {
// Note: We need a better API for specifying things like this.
flush ( ) . release_value_but_fixme_should_propagate_errors ( ) ;
}
}
// Makes sure that the stream gets finalized even if the owner never flushed or
// closed the compressor. Errors from the flush cannot be reported from here.
LzmaCompressor::~LzmaCompressor()
{
    if (m_has_flushed_data)
        return;

    // Note: We need a better API for specifying things like this.
    flush().release_value_but_fixme_should_propagate_errors();
}
2023-03-05 13:34:05 +00:00
}