/*
 * Copyright (c) 2021, Ali Mohammad Pur <mpfard@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/Debug.h>
#include <AK/Function.h>
#include <AK/Queue.h>
#include <AK/QuickSort.h>
#include <AK/RedBlackTree.h>
#include <AK/Stack.h>
#include <AK/Trie.h>
#include <LibRegex/Regex.h>
#include <LibRegex/RegexBytecodeStreamOptimizer.h>
#include <LibUnicode/CharacterTypes.h>

#if REGEX_DEBUG
#    include <AK/ScopeGuard.h>
#    include <AK/ScopeLogger.h>
#endif

namespace regex {

using Detail::Block;

template<typename Parser>
void Regex<Parser>::run_optimization_passes()
{
    parser_result.bytecode.flatten();

    auto blocks = split_basic_blocks(parser_result.bytecode);

    if (attempt_rewrite_entire_match_as_substring_search(blocks))
        return;

    // Rewrite fork loops as atomic groups
    // e.g. a*b -> (ATOMIC a*)b
    attempt_rewrite_loops_as_atomic_groups(blocks);

    // FIXME: "There are a few more conditions this can be true in (e.g. within an arbitrarily nested capture group)"
    MatchState state;
    auto& opcode = parser_result.bytecode.get_opcode(state);
    if (opcode.opcode_id() == OpCodeId::CheckBegin)
        parser_result.optimization_data.only_start_of_line = true;

    parser_result.bytecode.flatten();
}

template<typename Parser>
typename Regex<Parser>::BasicBlockList Regex<Parser>::split_basic_blocks(ByteCode const& bytecode)
{
    BasicBlockList block_boundaries;
    size_t end_of_last_block = 0;

    auto bytecode_size = bytecode.size();

    MatchState state;
    state.instruction_position = 0;
    auto check_jump = [&]<typename T>(OpCode const& opcode) {
        auto& op = static_cast<T const&>(opcode);
        ssize_t jump_offset = op.size() + op.offset();
        if (jump_offset >= 0) {
            block_boundaries.append({ end_of_last_block, state.instruction_position });
            end_of_last_block = state.instruction_position + opcode.size();
        } else {
            // This op jumps back, see if that's within this "block".
            if (jump_offset + state.instruction_position > end_of_last_block) {
                // Split the block!
                block_boundaries.append({ end_of_last_block, jump_offset + state.instruction_position });
                block_boundaries.append({ jump_offset + state.instruction_position, state.instruction_position });
                end_of_last_block = state.instruction_position + opcode.size();
            } else {
                // Nope, it's just a jump to another block
                block_boundaries.append({ end_of_last_block, state.instruction_position });
                end_of_last_block = state.instruction_position + opcode.size();
            }
        }
    };
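    // Rough sketch of what this traversal produces (offsets are made up for illustration):
    // for something like /a+b/ the bytecode is, conceptually,
    //     ip 0: Compare 'a'
    //     ip j: ForkX -> 0      (a backwards jump, which closes off the block [0, j))
    //     ip k: Compare 'b'
    // so the boundaries come out as { 0, j } and { j + size(ForkX), end }, which the quick_sort
    // at the bottom then orders by start position.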
    for (;;) {
        auto& opcode = bytecode.get_opcode(state);

        switch (opcode.opcode_id()) {
        case OpCodeId::Jump:
            check_jump.template operator()<OpCode_Jump>(opcode);
            break;
        case OpCodeId::JumpNonEmpty:
            check_jump.template operator()<OpCode_JumpNonEmpty>(opcode);
            break;
        case OpCodeId::ForkJump:
            check_jump.template operator()<OpCode_ForkJump>(opcode);
            break;
        case OpCodeId::ForkStay:
            check_jump.template operator()<OpCode_ForkStay>(opcode);
            break;
        case OpCodeId::FailForks:
            block_boundaries.append({ end_of_last_block, state.instruction_position });
            end_of_last_block = state.instruction_position + opcode.size();
            break;
        case OpCodeId::Repeat: {
            // Repeat produces two blocks, one containing its repeated expr, and one after that.
            auto repeat_start = state.instruction_position - static_cast<OpCode_Repeat const&>(opcode).offset();
            if (repeat_start > end_of_last_block)
                block_boundaries.append({ end_of_last_block, repeat_start });
            block_boundaries.append({ repeat_start, state.instruction_position });
            end_of_last_block = state.instruction_position + opcode.size();
            break;
        }
        default:
            break;
        }

        auto next_ip = state.instruction_position + opcode.size();
        if (next_ip < bytecode_size)
            state.instruction_position = next_ip;
        else
            break;
    }

    if (end_of_last_block < bytecode_size)
        block_boundaries.append({ end_of_last_block, bytecode_size });

    quick_sort(block_boundaries, [](auto& a, auto& b) { return a.start < b.start; });

    return block_boundaries;
}

static bool has_overlap(Vector<CompareTypeAndValuePair> const& lhs, Vector<CompareTypeAndValuePair> const& rhs)
{
    // We have to fully interpret the two sequences to determine if they overlap (that is, keep track of inversion state and what ranges they cover).
    bool inverse { false };
    bool temporary_inverse { false };
    bool reset_temporary_inverse { false };

    auto current_lhs_inversion_state = [&]() -> bool { return temporary_inverse ^ inverse; };

    RedBlackTree<u32, u32> lhs_ranges;
    RedBlackTree<u32, u32> lhs_negated_ranges;
    HashTable<CharClass> lhs_char_classes;
    HashTable<CharClass> lhs_negated_char_classes;

    auto has_any_unicode_property = false;
    HashTable<Unicode::GeneralCategory> lhs_unicode_general_categories;
    HashTable<Unicode::Property> lhs_unicode_properties;
    HashTable<Unicode::Script> lhs_unicode_scripts;
    HashTable<Unicode::Script> lhs_unicode_script_extensions;
    HashTable<Unicode::GeneralCategory> lhs_negated_unicode_general_categories;
    HashTable<Unicode::Property> lhs_negated_unicode_properties;
    HashTable<Unicode::Script> lhs_negated_unicode_scripts;
    HashTable<Unicode::Script> lhs_negated_unicode_script_extensions;

    auto any_unicode_property_matches = [&](u32 code_point) {
        if (any_of(lhs_negated_unicode_general_categories, [code_point](auto category) { return Unicode::code_point_has_general_category(code_point, category); }))
            return false;
        if (any_of(lhs_negated_unicode_properties, [code_point](auto property) { return Unicode::code_point_has_property(code_point, property); }))
            return false;
        if (any_of(lhs_negated_unicode_scripts, [code_point](auto script) { return Unicode::code_point_has_script(code_point, script); }))
            return false;
        if (any_of(lhs_negated_unicode_script_extensions, [code_point](auto script) { return Unicode::code_point_has_script_extension(code_point, script); }))
            return false;

        if (any_of(lhs_unicode_general_categories, [code_point](auto category) { return Unicode::code_point_has_general_category(code_point, category); }))
            return true;
        if (any_of(lhs_unicode_properties, [code_point](auto property) { return Unicode::code_point_has_property(code_point, property); }))
            return true;
        if (any_of(lhs_unicode_scripts, [code_point](auto script) { return Unicode::code_point_has_script(code_point, script); }))
            return true;
        if (any_of(lhs_unicode_script_extensions, [code_point](auto script) { return Unicode::code_point_has_script_extension(code_point, script); }))
            return true;

        return false;
    };

    auto range_contains = [&]<typename T>(T& value) -> bool {
        u32 start;
        u32 end;

        if constexpr (IsSame<T, CharRange>) {
            start = value.from;
            end = value.to;
        } else {
            start = value;
            end = value;
        }

        if (has_any_unicode_property) {
            // We have some properties, and a range is present
            // Instead of checking every single code point in the range, assume it's a match.
            return start != end || any_unicode_property_matches(start);
        }

        auto* max = lhs_ranges.find_smallest_not_below(start);
        return max && *max <= end;
    };

    auto char_class_contains = [&](CharClass const& value) -> bool {
        if (lhs_char_classes.contains(value))
            return true;

        if (lhs_negated_char_classes.contains(value))
            return false;

        if (lhs_ranges.is_empty())
            return false;

        for (auto it = lhs_ranges.begin(); it != lhs_ranges.end(); ++it) {
            auto start = it.key();
            auto end = *it;
            for (u32 ch = start; ch <= end; ++ch) {
                if (OpCode_Compare::matches_character_class(value, ch, false))
                    return true;
            }
        }

        return false;
    };
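    // For orientation, an illustrative flat compare list (informal notation, not actual encoded values):
    // a class like [a-cx] shows up here roughly as
    //     { CharRange, 'a'..'c' }, { Char, 'x' }
    // while a negated class carries one of the inversion markers (Inverse / TemporaryInverse) in front
    // of its compares, which is why the inversion flags above have to be tracked while walking the pairs.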
    for (auto const& pair : lhs) {
        if (reset_temporary_inverse) {
            reset_temporary_inverse = false;
            temporary_inverse = false;
        } else {
            reset_temporary_inverse = true;
        }

        switch (pair.type) {
        case CharacterCompareType::Inverse:
            inverse = !inverse;
            break;
        case CharacterCompareType::TemporaryInverse:
            temporary_inverse = true;
            reset_temporary_inverse = true;
            break;
        case CharacterCompareType::AnyChar:
            // Special case: if not inverted, AnyChar is always in the range.
            if (!current_lhs_inversion_state())
                return true;
            break;
        case CharacterCompareType::Char:
            if (!current_lhs_inversion_state())
                lhs_ranges.insert(pair.value, pair.value);
            else
                lhs_negated_ranges.insert(pair.value, pair.value);
            break;
        case CharacterCompareType::String:
            // FIXME: We just need to look at the last character of this string, but we only have the first character here.
            //        Just bail out to avoid false positives.
            return true;
        case CharacterCompareType::CharClass:
            if (!current_lhs_inversion_state())
                lhs_char_classes.set(static_cast<CharClass>(pair.value));
            else
                lhs_negated_char_classes.set(static_cast<CharClass>(pair.value));
            break;
        case CharacterCompareType::CharRange: {
            auto range = CharRange(pair.value);
            if (!current_lhs_inversion_state())
                lhs_ranges.insert(range.from, range.to);
            else
                lhs_negated_ranges.insert(range.from, range.to);
            break;
        }
        case CharacterCompareType::LookupTable:
            // We've transformed this into a series of ranges in flat_compares(), so bail out if we see it.
            return true;
        case CharacterCompareType::Reference:
            // We've handled this before coming here.
            break;
        case CharacterCompareType::Property:
            has_any_unicode_property = true;
            if (!current_lhs_inversion_state())
                lhs_unicode_properties.set(static_cast<Unicode::Property>(pair.value));
            else
                lhs_negated_unicode_properties.set(static_cast<Unicode::Property>(pair.value));
            break;
        case CharacterCompareType::GeneralCategory:
            has_any_unicode_property = true;
            if (!current_lhs_inversion_state())
                lhs_unicode_general_categories.set(static_cast<Unicode::GeneralCategory>(pair.value));
            else
                lhs_negated_unicode_general_categories.set(static_cast<Unicode::GeneralCategory>(pair.value));
            break;
        case CharacterCompareType::Script:
            has_any_unicode_property = true;
            if (!current_lhs_inversion_state())
                lhs_unicode_scripts.set(static_cast<Unicode::Script>(pair.value));
            else
                lhs_negated_unicode_scripts.set(static_cast<Unicode::Script>(pair.value));
            break;
        case CharacterCompareType::ScriptExtension:
            has_any_unicode_property = true;
            if (!current_lhs_inversion_state())
                lhs_unicode_script_extensions.set(static_cast<Unicode::Script>(pair.value));
            else
                lhs_negated_unicode_script_extensions.set(static_cast<Unicode::Script>(pair.value));
            break;
        case CharacterCompareType::Or:
        case CharacterCompareType::EndAndOr:
            // These are the default behaviour for [...], so we don't need to do anything (unless we add support for 'And' below).
            break;
        case CharacterCompareType::And:
            // FIXME: These are too difficult to handle, so bail out.
            return true;
        case CharacterCompareType::Undefined:
        case CharacterCompareType::RangeExpressionDummy:
            // These do not occur in valid bytecode.
            VERIFY_NOT_REACHED();
        }
    }

    if constexpr (REGEX_DEBUG) {
        dbgln("lhs ranges:");
        for (auto it = lhs_ranges.begin(); it != lhs_ranges.end(); ++it)
            dbgln("  {}..{}", it.key(), *it);
        dbgln("lhs negated ranges:");
        for (auto it = lhs_negated_ranges.begin(); it != lhs_negated_ranges.end(); ++it)
            dbgln("  {}..{}", it.key(), *it);
    }

    temporary_inverse = false;
    reset_temporary_inverse = false;
    inverse = false;

    auto in_or = false; // We're in an OR block, so we should wait for the EndAndOr to decide if we would match.
    auto matched_in_or = false;
    auto inverse_matched_in_or = false;

    for (auto const& pair : rhs) {
        if (reset_temporary_inverse) {
            reset_temporary_inverse = false;
            temporary_inverse = false;
        } else {
            reset_temporary_inverse = true;
        }

        if constexpr (REGEX_DEBUG) {
            dbgln("check {} ({}) [inverted? {}] against {{", character_compare_type_name(pair.type), pair.value, current_lhs_inversion_state());
            for (auto it = lhs_ranges.begin(); it != lhs_ranges.end(); ++it)
                dbgln("  {}..{}", it.key(), *it);
            for (auto it = lhs_negated_ranges.begin(); it != lhs_negated_ranges.end(); ++it)
                dbgln("  ^[{}..{}]", it.key(), *it);
            for (auto& char_class : lhs_char_classes)
                dbgln("  {}", character_class_name(char_class));
            for (auto& char_class : lhs_negated_char_classes)
                dbgln("  ^{}", character_class_name(char_class));
            dbgln("}}, in or: {}, matched in or: {}, inverse matched in or: {}", in_or, matched_in_or, inverse_matched_in_or);
        }

        switch (pair.type) {
        case CharacterCompareType::Inverse:
            inverse = !inverse;
            break;
        case CharacterCompareType::TemporaryInverse:
            temporary_inverse = true;
            reset_temporary_inverse = true;
            break;
        case CharacterCompareType::AnyChar:
            // Special case: if not inverted, AnyChar is always in the range.
            if (!in_or && !current_lhs_inversion_state())
                return true;
            if (in_or) {
                matched_in_or = true;
                inverse_matched_in_or = false;
            }
            break;
        case CharacterCompareType::Char: {
            auto matched = range_contains(pair.value);
            if (!in_or && (current_lhs_inversion_state() ^ matched))
                return true;
            if (in_or) {
                matched_in_or |= matched;
                inverse_matched_in_or |= !matched;
            }
            break;
        }
        case CharacterCompareType::String:
            // FIXME: We just need to look at the last character of this string, but we only have the first character here.
            //        Just bail out to avoid false positives.
            return true;
        case CharacterCompareType::CharClass: {
            auto contains = char_class_contains(static_cast<CharClass>(pair.value));
            if (!in_or && (current_lhs_inversion_state() ^ contains))
                return true;
            if (in_or) {
                matched_in_or |= contains;
                inverse_matched_in_or |= !contains;
            }
            break;
        }
        case CharacterCompareType::CharRange: {
            auto range = CharRange(pair.value);
            auto contains = range_contains(range);
            if (!in_or && (contains ^ current_lhs_inversion_state()))
                return true;
            if (in_or) {
                matched_in_or |= contains;
                inverse_matched_in_or |= !contains;
            }
            break;
        }
        case CharacterCompareType::LookupTable:
            // We've transformed this into a series of ranges in flat_compares(), so bail out if we see it.
            return true;
        case CharacterCompareType::Reference:
            // We've handled this before coming here.
            break;
        case CharacterCompareType::Property:
            // The only reasonable scenario where we can check these properties without spending too much time is if:
            //  - the ranges are empty
            //  - the char classes are empty
            //  - the unicode properties are empty or contain only this property
            if (!lhs_ranges.is_empty() || !lhs_negated_ranges.is_empty() || !lhs_char_classes.is_empty() || !lhs_negated_char_classes.is_empty())
                return true;
            if (has_any_unicode_property && !lhs_unicode_properties.is_empty() && !lhs_negated_unicode_properties.is_empty()) {
                auto contains = lhs_unicode_properties.contains(static_cast<Unicode::Property>(pair.value));
                if (!in_or && (current_lhs_inversion_state() ^ contains))
                    return true;

                auto inverse_contains = lhs_negated_unicode_properties.contains(static_cast<Unicode::Property>(pair.value));
                if (!in_or && !(current_lhs_inversion_state() ^ inverse_contains))
                    return true;

                if (in_or) {
                    matched_in_or |= contains;
                    inverse_matched_in_or |= inverse_contains;
                }
            }
            break;
        case CharacterCompareType::GeneralCategory:
            if (!lhs_ranges.is_empty() || !lhs_negated_ranges.is_empty() || !lhs_char_classes.is_empty() || !lhs_negated_char_classes.is_empty())
                return true;
            if (has_any_unicode_property && !lhs_unicode_general_categories.is_empty() && !lhs_negated_unicode_general_categories.is_empty()) {
                auto contains = lhs_unicode_general_categories.contains(static_cast<Unicode::GeneralCategory>(pair.value));
                if (!in_or && (current_lhs_inversion_state() ^ contains))
                    return true;

                auto inverse_contains = lhs_negated_unicode_general_categories.contains(static_cast<Unicode::GeneralCategory>(pair.value));
                if (!in_or && !(current_lhs_inversion_state() ^ inverse_contains))
                    return true;

                if (in_or) {
                    matched_in_or |= contains;
                    inverse_matched_in_or |= inverse_contains;
                }
            }
            break;
        case CharacterCompareType::Script:
            if (!lhs_ranges.is_empty() || !lhs_negated_ranges.is_empty() || !lhs_char_classes.is_empty() || !lhs_negated_char_classes.is_empty())
                return true;
            if (has_any_unicode_property && !lhs_unicode_scripts.is_empty() && !lhs_negated_unicode_scripts.is_empty()) {
                auto contains = lhs_unicode_scripts.contains(static_cast<Unicode::Script>(pair.value));
                if (!in_or && (current_lhs_inversion_state() ^ contains))
                    return true;

                auto inverse_contains = lhs_negated_unicode_scripts.contains(static_cast<Unicode::Script>(pair.value));
                if (!in_or && !(current_lhs_inversion_state() ^ inverse_contains))
                    return true;

                if (in_or) {
                    matched_in_or |= contains;
                    inverse_matched_in_or |= inverse_contains;
                }
            }
            break;
        case CharacterCompareType::ScriptExtension:
            if (!lhs_ranges.is_empty() || !lhs_negated_ranges.is_empty() || !lhs_char_classes.is_empty() || !lhs_negated_char_classes.is_empty())
                return true;
            if (has_any_unicode_property && !lhs_unicode_script_extensions.is_empty() && !lhs_negated_unicode_script_extensions.is_empty()) {
                auto contains = lhs_unicode_script_extensions.contains(static_cast<Unicode::Script>(pair.value));
                if (!in_or && (current_lhs_inversion_state() ^ contains))
                    return true;

                auto inverse_contains = lhs_negated_unicode_script_extensions.contains(static_cast<Unicode::Script>(pair.value));
                if (!in_or && !(current_lhs_inversion_state() ^ inverse_contains))
                    return true;

                if (in_or) {
                    matched_in_or |= contains;
                    inverse_matched_in_or |= inverse_contains;
                }
            }
            break;
        case CharacterCompareType::Or:
            in_or = true;
            break;
        case CharacterCompareType::EndAndOr:
            // FIXME: Handle And when we support it below.
            VERIFY(in_or);
            in_or = false;
            if (current_lhs_inversion_state()) {
                if (!inverse_matched_in_or)
                    return true;
            } else {
                if (matched_in_or)
                    return true;
            }
            break;
        case CharacterCompareType::And:
            // FIXME: These are too difficult to handle, so bail out.
            return true;
        case CharacterCompareType::Undefined:
        case CharacterCompareType::RangeExpressionDummy:
            // These do not occur in valid bytecode.
            VERIFY_NOT_REACHED();
        }
    }

    return false;
}
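// A hedged illustration of the intent (values spelled informally, not as encoded pairs):
//   has_overlap({ {Char, 'a'} }, { {CharRange, 'a'..'z'} }) -> true   (both can match 'a')
//   has_overlap({ {Char, 'a'} }, { {Char, 'b'} })           -> false
//   has_overlap({ {Char, 'a'} }, { {String, ...} })         -> true   (conservative bail-out)
// Returning true is always the safe answer here: it only blocks the atomic-loop rewrite below,
// it never changes match results.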

enum class AtomicRewritePreconditionResult {
    SatisfiedWithProperHeader,
    SatisfiedWithEmptyHeader,
    NotSatisfied,
};
static AtomicRewritePreconditionResult block_satisfies_atomic_rewrite_precondition(ByteCode const& bytecode, Block const& repeated_block, Block const& following_block)
{
    Vector<Vector<CompareTypeAndValuePair>> repeated_values;
    HashTable<size_t> active_capture_groups;
    MatchState state;
    auto has_seen_actionable_opcode = false;
    for (state.instruction_position = repeated_block.start; state.instruction_position < repeated_block.end;) {
        auto& opcode = bytecode.get_opcode(state);
        switch (opcode.opcode_id()) {
        case OpCodeId::Compare: {
            has_seen_actionable_opcode = true;
            auto compares = static_cast<OpCode_Compare const&>(opcode).flat_compares();
            if (repeated_values.is_empty() && any_of(compares, [](auto& compare) { return compare.type == CharacterCompareType::AnyChar; }))
                return AtomicRewritePreconditionResult::NotSatisfied;
            repeated_values.append(move(compares));
            break;
        }
        case OpCodeId::CheckBegin:
        case OpCodeId::CheckEnd:
            has_seen_actionable_opcode = true;
            if (repeated_values.is_empty())
                return AtomicRewritePreconditionResult::SatisfiedWithProperHeader;
            break;
        case OpCodeId::CheckBoundary:
            // FIXME: What should we do with these? for now, let's fail.
            return AtomicRewritePreconditionResult::NotSatisfied;
        case OpCodeId::Restore:
        case OpCodeId::GoBack:
            return AtomicRewritePreconditionResult::NotSatisfied;
        case OpCodeId::SaveRightCaptureGroup:
            active_capture_groups.set(static_cast<OpCode_SaveRightCaptureGroup const&>(opcode).id());
            break;
        case OpCodeId::SaveLeftCaptureGroup:
            active_capture_groups.set(static_cast<OpCode_SaveLeftCaptureGroup const&>(opcode).id());
            break;
        case OpCodeId::ForkJump:
        case OpCodeId::ForkReplaceJump:
        case OpCodeId::JumpNonEmpty:
            // We could attempt to recursively resolve the follow set, but pretending that this just goes nowhere is faster.
            if (!has_seen_actionable_opcode)
                return AtomicRewritePreconditionResult::NotSatisfied;
            break;
        default:
            break;
        }

        state.instruction_position += opcode.size();
    }
    dbgln_if(REGEX_DEBUG, "Found {} entries in reference", repeated_values.size());
    dbgln_if(REGEX_DEBUG, "Found {} active capture groups", active_capture_groups.size());

    bool following_block_has_at_least_one_compare = false;
    // Find the first compare in the following block, it must NOT match any of the values in `repeated_values'.
    auto final_instruction = following_block.start;
    for (state.instruction_position = following_block.start; state.instruction_position < following_block.end;) {
        final_instruction = state.instruction_position;
        auto& opcode = bytecode.get_opcode(state);
        switch (opcode.opcode_id()) {
        // Note: These have to exist since we're effectively repeating the following block as well
        case OpCodeId::SaveRightCaptureGroup:
            active_capture_groups.set(static_cast<OpCode_SaveRightCaptureGroup const&>(opcode).id());
            break;
        case OpCodeId::SaveLeftCaptureGroup:
            active_capture_groups.set(static_cast<OpCode_SaveLeftCaptureGroup const&>(opcode).id());
            break;
        case OpCodeId::Compare: {
            following_block_has_at_least_one_compare = true;
            // We found a compare, let's see what it has.
            auto compares = static_cast<OpCode_Compare const&>(opcode).flat_compares();
            if (compares.is_empty())
                break;

            if (any_of(compares, [&](auto& compare) {
                    return compare.type == CharacterCompareType::AnyChar
                        || (compare.type == CharacterCompareType::Reference && active_capture_groups.contains(compare.value));
                }))
                return AtomicRewritePreconditionResult::NotSatisfied;

            if (any_of(repeated_values, [&](auto& repeated_value) { return has_overlap(compares, repeated_value); }))
                return AtomicRewritePreconditionResult::NotSatisfied;

            return AtomicRewritePreconditionResult::SatisfiedWithProperHeader;
        }
        case OpCodeId::CheckBegin:
        case OpCodeId::CheckEnd:
            return AtomicRewritePreconditionResult::SatisfiedWithProperHeader; // Nothing can match the end!
        case OpCodeId::CheckBoundary:
            // FIXME: What should we do with these? For now, consider them a failure.
            return AtomicRewritePreconditionResult::NotSatisfied;
        case OpCodeId::ForkJump:
        case OpCodeId::ForkReplaceJump:
        case OpCodeId::JumpNonEmpty:
            // See note in the previous switch, same cases.
            if (!following_block_has_at_least_one_compare)
                return AtomicRewritePreconditionResult::NotSatisfied;
            break;
        default:
            break;
        }

        state.instruction_position += opcode.size();
    }

    // If the following block falls through, we can't rewrite it.
    state.instruction_position = final_instruction;
    switch (bytecode.get_opcode(state).opcode_id()) {
    case OpCodeId::Jump:
    case OpCodeId::JumpNonEmpty:
    case OpCodeId::ForkJump:
    case OpCodeId::ForkReplaceJump:
        break;
    default:
        return AtomicRewritePreconditionResult::NotSatisfied;
    }

    if (following_block_has_at_least_one_compare)
        return AtomicRewritePreconditionResult::SatisfiedWithProperHeader;
    return AtomicRewritePreconditionResult::SatisfiedWithEmptyHeader;
}
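// Illustrative cases (block layout simplified): for /a+b/ the repeated block only ever compares 'a'
// and the following block starts by comparing 'b', so the precondition holds and the loop can be
// made atomic. For /a+ab/ the following block starts with 'a', which overlaps what the loop consumes,
// so this returns NotSatisfied and the fork is left untouched.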

template<typename Parser>
bool Regex<Parser>::attempt_rewrite_entire_match_as_substring_search(BasicBlockList const& basic_blocks)
{
    // If there's no jumps, we can probably rewrite this as a substring search (Compare { string = str }).
    if (basic_blocks.size() > 1)
        return false;

    if (basic_blocks.is_empty()) {
        parser_result.optimization_data.pure_substring_search = ""sv;
        return true; // Empty regex, sure.
    }

    auto& bytecode = parser_result.bytecode;
    auto is_unicode = parser_result.options.has_flag_set(AllFlags::Unicode);

    // We have a single basic block, let's see if it's a series of character or string compares.
    StringBuilder final_string;
    MatchState state;
    while (state.instruction_position < bytecode.size()) {
        auto& opcode = bytecode.get_opcode(state);
        switch (opcode.opcode_id()) {
        case OpCodeId::Compare: {
            auto& compare = static_cast<OpCode_Compare const&>(opcode);
            for (auto& flat_compare : compare.flat_compares()) {
                if (flat_compare.type != CharacterCompareType::Char)
                    return false;

                if (is_unicode || flat_compare.value <= 0x7f)
                    final_string.append_code_point(flat_compare.value);
                else
                    final_string.append(bit_cast<char>(static_cast<u8>(flat_compare.value)));
            }
            break;
        }
        default:
            return false;
        }

        state.instruction_position += opcode.size();
    }

    parser_result.optimization_data.pure_substring_search = final_string.to_byte_string();
    return true;
}
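// Illustrative example: /hello/ compiles to a single block of Char compares, so the loop above collects
// "hello" and matching can later fall back to a plain substring search instead of running the bytecode.
// Alternation or loops produce more than one block, and any opcode other than a plain Char compare
// (classes, anchors, ...) makes this bail out to the normal engine.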

template<typename Parser>
void Regex<Parser>::attempt_rewrite_loops_as_atomic_groups(BasicBlockList const& basic_blocks)
{
    auto& bytecode = parser_result.bytecode;
    if constexpr (REGEX_DEBUG) {
        RegexDebug dbg;
        dbg.print_bytecode(*this);
        for (auto const& block : basic_blocks)
            dbgln("block from {} to {}", block.start, block.end);
    }

    // A pattern such as:
    //     bb0       |  RE0
    //               |  ForkX bb0
    //     -------------------------
    //     bb1       |  RE1
    // can be rewritten as:
    //     -------------------------
    //     bb0       |  RE0
    //               |  ForkReplaceX bb0
    //     -------------------------
    //     bb1       |  RE1
    // provided that first(RE1) not-in end(RE0), which is to say
    // that RE1 cannot start with whatever RE0 has matched (ever).
    //
    // Alternatively, a second form of this pattern can also occur:
    //     bb0 | *
    //         | ForkX bb2
    //     ------------------------
    //     bb1 | RE0
    //         | Jump bb0
    //     ------------------------
    //     bb2 | RE1
    // which can be transformed (with the same preconditions) to:
    //     bb0 | *
    //         | ForkReplaceX bb2
    //     ------------------------
    //     bb1 | RE0
    //         | Jump bb0
    //     ------------------------
    //     bb2 | RE1
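    //
    // A concrete (illustrative) instance of the first form: for /a+b/,
    //     bb0 | Compare 'a'
    //         | ForkX bb0
    //     bb1 | Compare 'b'
    // once the fork becomes ForkReplaceX, a failed attempt at 'b' no longer retries with fewer 'a's;
    // since 'a' and 'b' can never match the same character, those retries could never succeed anyway,
    // so dropping them is safe and avoids needless re-matching.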

    enum class AlternateForm {
        DirectLoopWithoutHeader,               // loop without proper header, a block forking to itself. i.e. the first form.
        DirectLoopWithoutHeaderAndEmptyFollow, // loop without proper header, a block forking to itself. i.e. the first form but with RE1 being empty.
        DirectLoopWithHeader,                  // loop with proper header, i.e. the second form.
    };
    struct CandidateBlock {
        Block forking_block;
        Optional<Block> new_target_block;
        AlternateForm form;
    };
    Vector<CandidateBlock> candidate_blocks;

    auto is_an_eligible_jump = [](OpCode const& opcode, size_t ip, size_t block_start, AlternateForm alternate_form) {
        switch (opcode.opcode_id()) {
        case OpCodeId::JumpNonEmpty: {
            auto const& op = static_cast<OpCode_JumpNonEmpty const&>(opcode);
            auto form = op.form();
            if (form != OpCodeId::Jump && alternate_form == AlternateForm::DirectLoopWithHeader)
                return false;
            if (form != OpCodeId::ForkJump && form != OpCodeId::ForkStay && alternate_form == AlternateForm::DirectLoopWithoutHeader)
                return false;
            return op.offset() + ip + opcode.size() == block_start;
        }
        case OpCodeId::ForkJump:
            if (alternate_form == AlternateForm::DirectLoopWithHeader)
                return false;
            return static_cast<OpCode_ForkJump const&>(opcode).offset() + ip + opcode.size() == block_start;
        case OpCodeId::ForkStay:
            if (alternate_form == AlternateForm::DirectLoopWithHeader)
                return false;
            return static_cast<OpCode_ForkStay const&>(opcode).offset() + ip + opcode.size() == block_start;
        case OpCodeId::Jump:
            // Infinite loop does *not* produce forks.
            if (alternate_form == AlternateForm::DirectLoopWithoutHeader)
                return false;
            if (alternate_form == AlternateForm::DirectLoopWithHeader)
                return static_cast<OpCode_Jump const&>(opcode).offset() + ip + opcode.size() == block_start;
            VERIFY_NOT_REACHED();
        default:
            return false;
        }
    };

    for (size_t i = 0; i < basic_blocks.size(); ++i) {
        auto forking_block = basic_blocks[i];
        Optional<Block> fork_fallback_block;
        if (i + 1 < basic_blocks.size())
            fork_fallback_block = basic_blocks[i + 1];
        MatchState state;
        // Check if the last instruction in this block is a jump to the block itself:
        {
            state.instruction_position = forking_block.end;
            auto& opcode = bytecode.get_opcode(state);
            if (is_an_eligible_jump(opcode, state.instruction_position, forking_block.start, AlternateForm::DirectLoopWithoutHeader)) {
                // We've found RE0 (and RE1 is just the following block, if any), let's see if the precondition applies.
                // if RE1 is empty, there's no first(RE1), so this is an automatic pass.
                if (!fork_fallback_block.has_value()
                    || (fork_fallback_block->end == fork_fallback_block->start && block_satisfies_atomic_rewrite_precondition(bytecode, forking_block, *fork_fallback_block) != AtomicRewritePreconditionResult::NotSatisfied)) {
                    candidate_blocks.append({ forking_block, fork_fallback_block, AlternateForm::DirectLoopWithoutHeader });
                    break;
                }

                auto precondition = block_satisfies_atomic_rewrite_precondition(bytecode, forking_block, *fork_fallback_block);
                if (precondition == AtomicRewritePreconditionResult::SatisfiedWithProperHeader) {
                    candidate_blocks.append({ forking_block, fork_fallback_block, AlternateForm::DirectLoopWithoutHeader });
                    break;
                }
                if (precondition == AtomicRewritePreconditionResult::SatisfiedWithEmptyHeader) {
                    candidate_blocks.append({ forking_block, fork_fallback_block, AlternateForm::DirectLoopWithoutHeaderAndEmptyFollow });
                    break;
                }
            }
        }
        // Check if the last instruction in the last block is a direct jump to this block
        if (fork_fallback_block.has_value()) {
            state.instruction_position = fork_fallback_block->end;
            auto& opcode = bytecode.get_opcode(state);
            if (is_an_eligible_jump(opcode, state.instruction_position, forking_block.start, AlternateForm::DirectLoopWithHeader)) {
                // We've found bb1 and bb0, let's just make sure that bb0 forks to bb2.
                state.instruction_position = forking_block.end;
                auto& opcode = bytecode.get_opcode(state);
                if (opcode.opcode_id() == OpCodeId::ForkJump || opcode.opcode_id() == OpCodeId::ForkStay) {
                    Optional<Block> block_following_fork_fallback;
                    if (i + 2 < basic_blocks.size())
                        block_following_fork_fallback = basic_blocks[i + 2];
                    if (!block_following_fork_fallback.has_value()
                        || block_satisfies_atomic_rewrite_precondition(bytecode, *fork_fallback_block, *block_following_fork_fallback) != AtomicRewritePreconditionResult::NotSatisfied) {
                        candidate_blocks.append({ forking_block, {}, AlternateForm::DirectLoopWithHeader });
                        break;
                    }
                }
            }

            // We've found a slightly degenerate case, where the next block jumps back to the _jump_ instruction in the forking block.
            // This is a direct loop without a proper header that is posing as a loop with a header.
            if (is_an_eligible_jump(opcode, state.instruction_position, forking_block.end, AlternateForm::DirectLoopWithHeader)) {
                // We've found bb1 and bb0, let's just make sure that bb0 forks to bb2.
                state.instruction_position = forking_block.end;
                auto& opcode = bytecode.get_opcode(state);
                if (opcode.opcode_id() == OpCodeId::ForkJump || opcode.opcode_id() == OpCodeId::ForkStay) {
                    Optional<Block> block_following_fork_fallback;
                    if (i + 2 < basic_blocks.size())
                        block_following_fork_fallback = basic_blocks[i + 2];
                    if (!block_following_fork_fallback.has_value()
                        || block_satisfies_atomic_rewrite_precondition(bytecode, *fork_fallback_block, *block_following_fork_fallback) != AtomicRewritePreconditionResult::NotSatisfied) {
                        candidate_blocks.append({ forking_block, {}, AlternateForm::DirectLoopWithoutHeader });
                        break;
                    }
                }
            }
        }
    }

    dbgln_if(REGEX_DEBUG, "Found {} candidate blocks", candidate_blocks.size());

    if (candidate_blocks.is_empty()) {
        dbgln_if(REGEX_DEBUG, "Failed to find anything for {}", pattern_value);
        return;
    }

    RedBlackTree<size_t, size_t> needed_patches;

    // Reverse the blocks, so we can patch the bytecode without messing with the latter patches.
    quick_sort(candidate_blocks, [](auto& a, auto& b) { return b.forking_block.start > a.forking_block.start; });
    for (auto& candidate : candidate_blocks) {
        // Note that both forms share a ForkReplace patch in forking_block.
        // Patch the ForkX in forking_block to be a ForkReplaceX instead.
        auto& opcode_id = bytecode[candidate.forking_block.end];
        if (opcode_id == (ByteCodeValueType)OpCodeId::ForkStay) {
            opcode_id = (ByteCodeValueType)OpCodeId::ForkReplaceStay;
        } else if (opcode_id == (ByteCodeValueType)OpCodeId::ForkJump) {
            opcode_id = (ByteCodeValueType)OpCodeId::ForkReplaceJump;
        } else if (opcode_id == (ByteCodeValueType)OpCodeId::JumpNonEmpty) {
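            // JumpNonEmpty carries (offset, checkpoint, form) as its arguments, so the fork opcode to
            // patch lives 3 slots past the instruction word; this mirrors the argument order the rest
            // of this optimizer assumes (see the patch_points pushed for offset/checkpoint below).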
            auto& jump_opcode_id = bytecode[candidate.forking_block.end + 3];
            if (jump_opcode_id == (ByteCodeValueType)OpCodeId::ForkStay)
                jump_opcode_id = (ByteCodeValueType)OpCodeId::ForkReplaceStay;
            else if (jump_opcode_id == (ByteCodeValueType)OpCodeId::ForkJump)
                jump_opcode_id = (ByteCodeValueType)OpCodeId::ForkReplaceJump;
            else
                VERIFY_NOT_REACHED();
        } else {
            VERIFY_NOT_REACHED();
        }
    }

    if (!needed_patches.is_empty()) {
        MatchState state;
        auto bytecode_size = bytecode.size();
        state.instruction_position = 0;
        struct Patch {
            ssize_t value;
            size_t offset;
            bool should_negate { false };
        };
        for (;;) {
            if (state.instruction_position >= bytecode_size)
                break;

            auto& opcode = bytecode.get_opcode(state);

            Stack<Patch, 2> patch_points;

            switch (opcode.opcode_id()) {
            case OpCodeId::Jump:
                patch_points.push({ static_cast<OpCode_Jump const&>(opcode).offset(), state.instruction_position + 1 });
                break;
            case OpCodeId::JumpNonEmpty:
                patch_points.push({ static_cast<OpCode_JumpNonEmpty const&>(opcode).offset(), state.instruction_position + 1 });
                patch_points.push({ static_cast<OpCode_JumpNonEmpty const&>(opcode).checkpoint(), state.instruction_position + 2 });
                break;
            case OpCodeId::ForkJump:
                patch_points.push({ static_cast<OpCode_ForkJump const&>(opcode).offset(), state.instruction_position + 1 });
                break;
            case OpCodeId::ForkStay:
                patch_points.push({ static_cast<OpCode_ForkStay const&>(opcode).offset(), state.instruction_position + 1 });
                break;
            case OpCodeId::Repeat:
                patch_points.push({ -(ssize_t)static_cast<OpCode_Repeat const&>(opcode).offset(), state.instruction_position + 1, true });
                break;
            default:
                break;
            }

            while (!patch_points.is_empty()) {
                auto& patch_point = patch_points.top();
                auto target_offset = patch_point.value + state.instruction_position + opcode.size();

                constexpr auto do_patch = [](auto& patch_it, auto& patch_point, auto& target_offset, auto& bytecode, auto ip) {
                    if (patch_it.key() == ip)
                        return;

                    if (patch_point.value < 0 && target_offset <= patch_it.key() && ip > patch_it.key())
                        bytecode[patch_point.offset] += (patch_point.should_negate ? 1 : -1) * (*patch_it);
                    else if (patch_point.value > 0 && target_offset >= patch_it.key() && ip < patch_it.key())
                        bytecode[patch_point.offset] += (patch_point.should_negate ? -1 : 1) * (*patch_it);
                };

                if (auto patch_it = needed_patches.find_largest_not_above_iterator(target_offset); !patch_it.is_end())
                    do_patch(patch_it, patch_point, target_offset, bytecode, state.instruction_position);
                else if (auto patch_it = needed_patches.find_largest_not_above_iterator(state.instruction_position); !patch_it.is_end())
                    do_patch(patch_it, patch_point, target_offset, bytecode, state.instruction_position);

                patch_points.pop();
            }

            state.instruction_position += opcode.size();
        }
    }

    if constexpr (REGEX_DEBUG) {
        warnln("Transformed to:");
        RegexDebug dbg;
        dbg.print_bytecode(*this);
    }
}

void Optimizer::append_alternation(ByteCode& target, ByteCode&& left, ByteCode&& right)
{
    Array<ByteCode, 2> alternatives;
    alternatives[0] = move(left);
    alternatives[1] = move(right);

    append_alternation(target, alternatives);
}

template<typename K, typename V, typename KTraits>
using OrderedHashMapForTrie = OrderedHashMap<K, V, KTraits>;

void Optimizer::append_alternation(ByteCode& target, Span<ByteCode> alternatives)
{
    if (alternatives.size() == 0)
        return;

    if (alternatives.size() == 1)
        return target.extend(move(alternatives[0]));

    if (all_of(alternatives, [](auto& x) { return x.is_empty(); }))
        return;

    for (auto& entry : alternatives)
        entry.flatten();

#if REGEX_DEBUG
    ScopeLogger<true> log;
    warnln("Alternations:");
    RegexDebug dbg;
    for (auto& entry : alternatives) {
        warnln("----------");
        dbg.print_bytecode(entry);
    }
    ScopeGuard print_at_end {
        [&] {
            warnln("======================");
            RegexDebug dbg;
            dbg.print_bytecode(target);
        }
    };
#endif

    // First, find incoming jump edges.
    // We need them for two reasons:
    // - We need to distinguish between insn-A-jumped-to-by-insn-B and insn-A-jumped-to-by-insn-C (as otherwise we'd break trie invariants)
    // - We need to know which jumps to patch when we're done

    struct JumpEdge {
        Span<ByteCodeValueType const> jump_insn;
    };
    Vector<HashMap<size_t, Vector<JumpEdge>>> incoming_jump_edges_for_each_alternative;
    incoming_jump_edges_for_each_alternative.resize(alternatives.size());

    auto has_any_backwards_jump = false;

    MatchState state;

    for (size_t i = 0; i < alternatives.size(); ++i) {
        auto& alternative = alternatives[i];
        // Add a jump to the "end" of the block; this is implicit in the bytecode, but we need it to be explicit in the trie.
        // Jump{offset=0}
        alternative.append(static_cast<ByteCodeValueType>(OpCodeId::Jump));
        alternative.append(0);

        auto& incoming_jump_edges = incoming_jump_edges_for_each_alternative[i];

        auto alternative_bytes = alternative.spans<1>().singular_span();
        for (state.instruction_position = 0; state.instruction_position < alternative.size();) {
            auto& opcode = alternative.get_opcode(state);
            auto opcode_bytes = alternative_bytes.slice(state.instruction_position, opcode.size());

            switch (opcode.opcode_id()) {
            case OpCodeId::Jump:
                incoming_jump_edges.ensure(static_cast<OpCode_Jump const&>(opcode).offset() + state.instruction_position).append({ opcode_bytes });
                has_any_backwards_jump |= static_cast<OpCode_Jump const&>(opcode).offset() < 0;
                break;
            case OpCodeId::JumpNonEmpty:
                incoming_jump_edges.ensure(static_cast<OpCode_JumpNonEmpty const&>(opcode).offset() + state.instruction_position).append({ opcode_bytes });
                has_any_backwards_jump |= static_cast<OpCode_JumpNonEmpty const&>(opcode).offset() < 0;
                break;
            case OpCodeId::ForkJump:
                incoming_jump_edges.ensure(static_cast<OpCode_ForkJump const&>(opcode).offset() + state.instruction_position).append({ opcode_bytes });
                has_any_backwards_jump |= static_cast<OpCode_ForkJump const&>(opcode).offset() < 0;
                break;
            case OpCodeId::ForkStay:
                incoming_jump_edges.ensure(static_cast<OpCode_ForkStay const&>(opcode).offset() + state.instruction_position).append({ opcode_bytes });
                has_any_backwards_jump |= static_cast<OpCode_ForkStay const&>(opcode).offset() < 0;
                break;
            case OpCodeId::ForkReplaceJump:
                incoming_jump_edges.ensure(static_cast<OpCode_ForkReplaceJump const&>(opcode).offset() + state.instruction_position).append({ opcode_bytes });
                has_any_backwards_jump |= static_cast<OpCode_ForkReplaceJump const&>(opcode).offset() < 0;
                break;
            case OpCodeId::ForkReplaceStay:
                incoming_jump_edges.ensure(static_cast<OpCode_ForkReplaceStay const&>(opcode).offset() + state.instruction_position).append({ opcode_bytes });
                has_any_backwards_jump |= static_cast<OpCode_ForkReplaceStay const&>(opcode).offset() < 0;
                break;
            case OpCodeId::Repeat:
                incoming_jump_edges.ensure(state.instruction_position - static_cast<OpCode_Repeat const&>(opcode).offset()).append({ opcode_bytes });
                has_any_backwards_jump = true;
                break;
            default:
                break;
            }
            state.instruction_position += opcode.size();
        }
    }

    struct QualifiedIP {
        size_t alternative_index;
        size_t instruction_position;
    };
    using Tree = Trie<DisjointSpans<ByteCodeValueType const>, Vector<QualifiedIP>, Traits<DisjointSpans<ByteCodeValueType const>>, void, OrderedHashMapForTrie>;
    Tree trie { {} }; // Root node is empty, key{ instruction_bytes, dependent_instruction_bytes... } -> IP

    size_t common_hits = 0;
    size_t total_nodes = 0;
    size_t total_bytecode_entries_in_tree = 0;
    for (size_t i = 0; i < alternatives.size(); ++i) {
        auto& alternative = alternatives[i];
        auto& incoming_jump_edges = incoming_jump_edges_for_each_alternative[i];

        auto* active_node = &trie;
        auto alternative_span = alternative.spans<1>().singular_span();
        for (state.instruction_position = 0; state.instruction_position < alternative_span.size();) {
            total_nodes += 1;
            auto& opcode = alternative.get_opcode(state);
            auto opcode_bytes = alternative_span.slice(state.instruction_position, opcode.size());

            Vector<Span<ByteCodeValueType const>> node_key_bytes;
            node_key_bytes.append(opcode_bytes);

            if (auto edges = incoming_jump_edges.get(state.instruction_position); edges.has_value()) {
                for (auto& edge : *edges)
                    node_key_bytes.append(edge.jump_insn);
            }

            active_node = static_cast<decltype(active_node)>(MUST(active_node->ensure_child(DisjointSpans<ByteCodeValueType const> { move(node_key_bytes) })));

            if (active_node->has_metadata()) {
                active_node->metadata_value().append({ i, state.instruction_position });
                common_hits += 1;
            } else {
                active_node->set_metadata(Vector<QualifiedIP> { QualifiedIP { i, state.instruction_position } });
                total_bytecode_entries_in_tree += opcode.size();
            }

            state.instruction_position += opcode.size();
        }
    }
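    // An illustrative sketch of the result (assuming each literal character ends up in its own Compare
    // opcode; the parser may batch them differently): for the alternatives "abc" and "abd", the trie
    // shares the Compare 'a' and Compare 'b' nodes and only branches on the final character:
    //     (root) -> Compare 'a' -> Compare 'b' -> Compare 'c'
    //                                         \-> Compare 'd'
    // Every shared node records each (alternative, instruction position) it stands for in its
    // QualifiedIP metadata.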

    if constexpr (REGEX_DEBUG) {
        Function<void(decltype(trie)&, size_t)> print_tree = [&](decltype(trie)& node, size_t indent = 0) mutable {
            ByteString name = "(no ip)";
            ByteString insn;
            if (node.has_metadata()) {
                name = ByteString::formatted(
                    "{}@{} ({} node{})",
                    node.metadata_value().first().instruction_position,
                    node.metadata_value().first().alternative_index,
                    node.metadata_value().size(),
                    node.metadata_value().size() == 1 ? "" : "s");

                MatchState state;
                state.instruction_position = node.metadata_value().first().instruction_position;
                auto& opcode = alternatives[node.metadata_value().first().alternative_index].get_opcode(state);
                insn = ByteString::formatted("{} {}", opcode.to_byte_string(), opcode.arguments_string());
            }
            dbgln("{:->{}}| {} -- {}", "", indent * 2, name, insn);
            for (auto& child : node.children())
                print_tree(static_cast<decltype(trie)&>(*child.value), indent + 1);
        };

        print_tree(trie, 0);
    }

    // This is really only worth it if we don't blow up the size by the 2-extra-instruction-per-node scheme, similarly, if no nodes are shared, we're better off not using a tree.
    auto tree_cost = (total_nodes - common_hits) * 2;
    auto chain_cost = total_nodes + alternatives.size() * 2;
    dbgln_if(REGEX_DEBUG, "Total nodes: {}, common hits: {} (tree cost = {}, chain cost = {})", total_nodes, common_hits, tree_cost, chain_cost);
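    // Worked example with made-up numbers: 3 alternatives, 10 total nodes, 4 of them shared gives
    // tree_cost = (10 - 4) * 2 = 12 and chain_cost = 10 + 3 * 2 = 16, so the trie layout wins;
    // with no shared nodes at all (common_hits == 0) the plain chain is always used.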
    if (common_hits == 0 || tree_cost > chain_cost) {
        // It's better to lay these out as a normal sequence of instructions.
        auto patch_start = target.size();
        for (size_t i = 1; i < alternatives.size(); ++i) {
            target.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
            target.empend(0u); // To be filled later.
        }

        size_t size_to_jump = 0;
        bool seen_one_empty = false;
        for (size_t i = alternatives.size(); i > 0; --i) {
            auto& entry = alternatives[i - 1];
            if (entry.is_empty()) {
                if (seen_one_empty)
                    continue;
                seen_one_empty = true;
            }

            auto is_first = i == 1;
            auto instruction_size = entry.size() + (is_first ? 0 : 2); // Jump; -> +2
            size_to_jump += instruction_size;

            if (!is_first)
                target[patch_start + (i - 2) * 2 + 1] = size_to_jump + (alternatives.size() - i) * 2;

            dbgln_if(REGEX_DEBUG, "{} size = {}, cum={}", i - 1, instruction_size, size_to_jump);
        }

        seen_one_empty = false;
        for (size_t i = alternatives.size(); i > 0; --i) {
            auto& chunk = alternatives[i - 1];
            if (chunk.is_empty()) {
                if (seen_one_empty)
                    continue;
                seen_one_empty = true;
            }

            ByteCode* previous_chunk = nullptr;
            size_t j = i - 1;
            auto seen_one_empty_before = chunk.is_empty();
            while (j >= 1) {
                --j;
                auto& candidate_chunk = alternatives[j];
                if (candidate_chunk.is_empty()) {
                    if (seen_one_empty_before)
                        continue;
                }
                previous_chunk = &candidate_chunk;
                break;
            }

            size_to_jump -= chunk.size() + (previous_chunk ? 2 : 0);

            target.extend(move(chunk));
            target.empend(static_cast<ByteCodeValueType>(OpCodeId::Jump));
            target.empend(size_to_jump); // Jump to the _END label
        }
    } else {
        target.ensure_capacity(total_bytecode_entries_in_tree + common_hits * 6);

        auto node_is = [](Tree const* node, QualifiedIP ip) {
            if (!node->has_metadata())
                return false;
            for (auto& node_ip : node->metadata_value()) {
                if (node_ip.alternative_index == ip.alternative_index && node_ip.instruction_position == ip.instruction_position)
                    return true;
            }
            return false;
        };

        struct Patch {
            QualifiedIP source_ip;
            size_t target_ip;
            bool done { false };
        };
        Vector<Patch> patch_locations;
        patch_locations.ensure_capacity(total_nodes);

        auto add_patch_point = [&](Tree const* node, size_t target_ip) {
            if (!node->has_metadata())
                return;
            auto& node_ip = node->metadata_value().first();
            patch_locations.append({ node_ip, target_ip });
        };

        Queue<Tree*> nodes_to_visit;
        nodes_to_visit.enqueue(&trie);

        HashMap<size_t, NonnullOwnPtr<RedBlackTree<u64, u64>>> instruction_positions;
        if (has_any_backwards_jump)
            MUST(instruction_positions.try_ensure_capacity(alternatives.size()));

        auto ip_mapping_for_alternative = [&](size_t i) -> RedBlackTree<u64, u64>& {
            return *instruction_positions.ensure(i, [] {
                return make<RedBlackTree<u64, u64>>();
            });
        };
2023-07-28 17:32:34 +00:00
// each node:
// node.re
// forkjump child1
// forkjump child2
// ...
while ( ! nodes_to_visit . is_empty ( ) ) {
auto const * node = nodes_to_visit . dequeue ( ) ;
for ( auto & patch : patch_locations ) {
if ( ! patch . done & & node_is ( node , patch . source_ip ) ) {
auto value = static_cast < ByteCodeValueType > ( target . size ( ) - patch . target_ip - 1 ) ;
target [ patch . target_ip ] = value ;
patch . done = true ;
}
}
if ( ! node - > value ( ) . individual_spans ( ) . is_empty ( ) ) {
auto insn_bytes = node - > value ( ) . individual_spans ( ) . first ( ) ;
target . ensure_capacity ( target . size ( ) + insn_bytes . size ( ) ) ;
state . instruction_position = target . size ( ) ;
target . append ( insn_bytes ) ;
2023-08-05 14:02:47 +00:00
if ( has_any_backwards_jump ) {
for ( auto & ip : node - > metadata_value ( ) )
ip_mapping_for_alternative ( ip . alternative_index ) . insert ( ip . instruction_position , state . instruction_position ) ;
}
2023-07-28 17:32:34 +00:00
auto & opcode = target . get_opcode ( state ) ;
ssize_t jump_offset ;
auto is_jump = true ;
auto patch_location = state . instruction_position + 1 ;
switch ( opcode . opcode_id ( ) ) {
case OpCodeId : : Jump :
jump_offset = static_cast < OpCode_Jump const & > ( opcode ) . offset ( ) ;
break ;
case OpCodeId : : JumpNonEmpty :
jump_offset = static_cast < OpCode_JumpNonEmpty const & > ( opcode ) . offset ( ) ;
break ;
case OpCodeId : : ForkJump :
jump_offset = static_cast < OpCode_ForkJump const & > ( opcode ) . offset ( ) ;
break ;
case OpCodeId : : ForkStay :
jump_offset = static_cast < OpCode_ForkStay const & > ( opcode ) . offset ( ) ;
break ;
case OpCodeId : : ForkReplaceJump :
jump_offset = static_cast < OpCode_ForkReplaceJump const & > ( opcode ) . offset ( ) ;
break ;
case OpCodeId : : ForkReplaceStay :
jump_offset = static_cast < OpCode_ForkReplaceStay const & > ( opcode ) . offset ( ) ;
break ;
case OpCodeId : : Repeat :
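// Repeat jumps backwards: its stored offset is a positive distance, so flip
// the sign (and pre-subtract the opcode size that the intended_jump_ip
// computation below adds back).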
jump_offset = static_cast < ssize_t > ( 0 ) - static_cast < ssize_t > ( static_cast < OpCode_Repeat const & > ( opcode ) . offset ( ) ) - static_cast < ssize_t > ( opcode . size ( ) ) ;
break ;
default :
is_jump = false ;
break ;
}
if ( is_jump ) {
VERIFY ( node - > has_metadata ( ) ) ;
QualifiedIP ip = node - > metadata_value ( ) . first ( ) ;
auto intended_jump_ip = ip . instruction_position + jump_offset + opcode . size ( ) ;
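// Backwards jumps must land on bytecode that has already been emitted, so they
// are resolved immediately through the per-alternative IP mapping; forwards
// jumps become patch points filled in when the destination node is emitted.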
if ( jump_offset < 0 ) {
VERIFY ( has_any_backwards_jump ) ;
// We should've already seen this instruction, so we can just patch it in.
auto & ip_mapping = ip_mapping_for_alternative ( ip . alternative_index ) ;
auto target_ip = ip_mapping . find ( intended_jump_ip ) ;
if ( ! target_ip ) {
RegexDebug dbg ;
size_t x = 0 ;
for ( auto & entry : alternatives ) {
warnln ( " ----------- {} ---------- " , x + + ) ;
dbg . print_bytecode ( entry ) ;
}
dbgln ( " Regex Tree / Unknown backwards jump: {}@{} -> {} " ,
ip . instruction_position ,
ip . alternative_index ,
intended_jump_ip ) ;
VERIFY_NOT_REACHED ( ) ;
}
target [ patch_location ] = static_cast < ByteCodeValueType > ( * target_ip - patch_location - 1 ) ;
} else {
patch_locations . append ( { QualifiedIP { ip . alternative_index , intended_jump_ip } , patch_location } ) ;
}
}
}
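// Emit a ForkJump to each child; the child's final position is not known yet,
// so leave a zero placeholder and record a patch point for it.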
for ( auto const & child : node - > children ( ) ) {
auto * child_node = static_cast < Tree * > ( child . value . ptr ( ) ) ;
target . append ( static_cast < ByteCodeValueType > ( OpCodeId : : ForkJump ) ) ;
add_patch_point ( child_node , target . size ( ) ) ;
target . append ( static_cast < ByteCodeValueType > ( 0 ) ) ;
nodes_to_visit . enqueue ( child_node ) ;
}
}
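// Any patch still unresolved can only be a jump past the end of its original
// alternative; anything else means the tree was emitted incorrectly.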
for ( auto & patch : patch_locations ) {
if ( patch . done )
continue ;
auto & alternative = alternatives [ patch . source_ip . alternative_index ] ;
if ( patch . source_ip . instruction_position > = alternative . size ( ) ) {
// This just wants to jump to the end of the alternative, which is fine.
// Patch it to jump to the end of the target instead.
target [ patch . target_ip ] = static_cast < ByteCodeValueType > ( target . size ( ) - patch . target_ip - 1 ) ;
continue ;
}
dbgln ( " Regex Tree / Unpatched jump: {}@{} -> {}@{} " ,
patch . source_ip . instruction_position ,
patch . source_ip . alternative_index ,
patch . target_ip ,
target [ patch . target_ip ] ) ;
VERIFY_NOT_REACHED ( ) ;
}
}
}
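// Outcome of trying to fold one CompareTypeAndValuePair into a character-range
// lookup table: either it was absorbed, or the caller has to handle it
// (replace everything with AnyChar, toggle inversion, flush the tables, or emit
// the pair as a standalone compare argument).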
enum class LookupTableInsertionOutcome {
Successful ,
ReplaceWithAnyChar ,
TemporaryInversionNeeded ,
PermanentInversionNeeded ,
FlushOnInsertion ,
FinishFlushOnInsertion ,
CannotPlaceInTable ,
} ;
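// Try to fold a single comparison into the range table. Chars and CharRanges
// are keyed by their starting code point so overlapping ranges can be merged
// later; everything else is reported back to the caller via the outcome above.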
static LookupTableInsertionOutcome insert_into_lookup_table ( RedBlackTree < ByteCodeValueType , CharRange > & table , CompareTypeAndValuePair pair )
{
switch ( pair . type ) {
case CharacterCompareType : : Inverse :
return LookupTableInsertionOutcome : : PermanentInversionNeeded ;
case CharacterCompareType : : TemporaryInverse :
return LookupTableInsertionOutcome : : TemporaryInversionNeeded ;
case CharacterCompareType : : AnyChar :
return LookupTableInsertionOutcome : : ReplaceWithAnyChar ;
case CharacterCompareType : : CharClass :
return LookupTableInsertionOutcome : : CannotPlaceInTable ;
case CharacterCompareType : : Char :
table . insert ( pair . value , { ( u32 ) pair . value , ( u32 ) pair . value } ) ;
break ;
case CharacterCompareType : : CharRange : {
CharRange range { pair . value } ;
table . insert ( range . from , range ) ;
break ;
}
case CharacterCompareType : : EndAndOr :
return LookupTableInsertionOutcome : : FinishFlushOnInsertion ;
case CharacterCompareType : : And :
return LookupTableInsertionOutcome : : FlushOnInsertion ;
case CharacterCompareType : : Reference :
case CharacterCompareType : : Property :
case CharacterCompareType : : GeneralCategory :
case CharacterCompareType : : Script :
case CharacterCompareType : : ScriptExtension :
case CharacterCompareType : : Or :
return LookupTableInsertionOutcome : : CannotPlaceInTable ;
case CharacterCompareType : : Undefined :
case CharacterCompareType : : RangeExpressionDummy :
case CharacterCompareType : : String :
case CharacterCompareType : : LookupTable :
VERIFY_NOT_REACHED ( ) ;
}
return LookupTableInsertionOutcome : : Successful ;
}
void Optimizer : : append_character_class ( ByteCode & target , Vector < CompareTypeAndValuePair > & & pairs )
{
ByteCode arguments ;
size_t argument_count = 0 ;
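// A single comparison (or none at all) gains nothing from a lookup table; just
// copy it through as-is.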
if ( pairs . size ( ) < = 1 ) {
for ( auto & pair : pairs ) {
arguments . append ( to_underlying ( pair . type ) ) ;
if ( pair . type ! = CharacterCompareType : : AnyChar
& & pair . type ! = CharacterCompareType : : TemporaryInverse
& & pair . type ! = CharacterCompareType : : Inverse
& & pair . type ! = CharacterCompareType : : And
& & pair . type ! = CharacterCompareType : : Or
& & pair . type ! = CharacterCompareType : : EndAndOr )
arguments . append ( pair . value ) ;
+ + argument_count ;
}
} else {
RedBlackTree < ByteCodeValueType , CharRange > table ;
RedBlackTree < ByteCodeValueType , CharRange > inverted_table ;
auto * current_table = & table ;
auto * current_inverted_table = & inverted_table ;
bool invert_for_next_iteration = false ;
bool is_currently_inverted = false ;
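// flush_tables packs the accumulated ranges into LookupTable compare
// arguments: [LookupTable, range count, ranges...], merging adjacent or
// overlapping ranges as it goes (the RedBlackTree yields them sorted by
// starting code point). The inverted table, if any, is emitted behind a
// TemporaryInverse marker.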
auto flush_tables = [ & ] {
auto append_table = [ & ] ( auto & table ) {
+ + argument_count ;
arguments . append ( to_underlying ( CharacterCompareType : : LookupTable ) ) ;
auto size_index = arguments . size ( ) ;
arguments . append ( 0 ) ;
Optional < CharRange > active_range ;
size_t range_count = 0 ;
for ( auto & range : table ) {
if ( ! active_range . has_value ( ) ) {
active_range = range ;
continue ;
}
if ( range . from < = active_range - > to + 1 & & range . to + 1 > = active_range - > from ) {
active_range = CharRange { min ( range . from , active_range - > from ) , max ( range . to , active_range - > to ) } ;
} else {
+ + range_count ;
arguments . append ( active_range . release_value ( ) ) ;
active_range = range ;
}
}
if ( active_range . has_value ( ) ) {
+ + range_count ;
arguments . append ( active_range . release_value ( ) ) ;
}
arguments [ size_index ] = range_count ;
} ;
auto contains_regular_table = ! table . is_empty ( ) ;
auto contains_inverted_table = ! inverted_table . is_empty ( ) ;
if ( contains_regular_table )
append_table ( table ) ;
if ( contains_inverted_table ) {
+ + argument_count ;
arguments . append ( to_underlying ( CharacterCompareType : : TemporaryInverse ) ) ;
append_table ( inverted_table ) ;
}
table . clear ( ) ;
inverted_table . clear ( ) ;
} ;
auto flush_on_every_insertion = false ;
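// Walk the pairs, keeping two tables: one for plainly-matched ranges and one
// for ranges under a temporary inversion. A TemporaryInverse only applies to
// the comparison that follows it, so that comparison's ranges are collected in
// the inverted table and the tables swap back one iteration later; anything
// that cannot live in a table is appended as a raw compare argument instead.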
for ( auto & value : pairs ) {
auto should_invert_after_this_iteration = invert_for_next_iteration ;
invert_for_next_iteration = false ;
auto insertion_result = insert_into_lookup_table ( * current_table , value ) ;
switch ( insertion_result ) {
case LookupTableInsertionOutcome : : Successful :
if ( flush_on_every_insertion )
flush_tables ( ) ;
break ;
case LookupTableInsertionOutcome : : ReplaceWithAnyChar : {
table . clear ( ) ;
inverted_table . clear ( ) ;
arguments . append ( to_underlying ( CharacterCompareType : : AnyChar ) ) ;
+ + argument_count ;
break ;
}
case LookupTableInsertionOutcome : : TemporaryInversionNeeded :
swap ( current_table , current_inverted_table ) ;
invert_for_next_iteration = true ;
is_currently_inverted = ! is_currently_inverted ;
break ;
case LookupTableInsertionOutcome : : PermanentInversionNeeded :
flush_tables ( ) ;
arguments . append ( to_underlying ( CharacterCompareType : : Inverse ) ) ;
+ + argument_count ;
break ;
case LookupTableInsertionOutcome : : FlushOnInsertion :
case LookupTableInsertionOutcome : : FinishFlushOnInsertion :
flush_tables ( ) ;
flush_on_every_insertion = insertion_result = = LookupTableInsertionOutcome : : FlushOnInsertion ;
[[fallthrough]] ;
case LookupTableInsertionOutcome : : CannotPlaceInTable :
if ( is_currently_inverted ) {
arguments . append ( to_underlying ( CharacterCompareType : : TemporaryInverse ) ) ;
+ + argument_count ;
}
arguments . append ( to_underlying ( value . type ) ) ;
if ( value . type ! = CharacterCompareType : : AnyChar
& & value . type ! = CharacterCompareType : : TemporaryInverse
& & value . type ! = CharacterCompareType : : Inverse
& & value . type ! = CharacterCompareType : : And
& & value . type ! = CharacterCompareType : : Or
& & value . type ! = CharacterCompareType : : EndAndOr )
arguments . append ( value . value ) ;
+ + argument_count ;
break ;
}
if ( should_invert_after_this_iteration ) {
swap ( current_table , current_inverted_table ) ;
is_currently_inverted = ! is_currently_inverted ;
}
}
flush_tables ( ) ;
}
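// Finally wrap everything into a single Compare opcode:
// [Compare, argument count, total argument size, arguments...].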
target . empend ( static_cast < ByteCodeValueType > ( OpCodeId : : Compare ) ) ;
target . empend ( argument_count ) ; // number of arguments
target . empend ( arguments . size ( ) ) ; // size of arguments
target . extend ( move ( arguments ) ) ;
}
template void Regex < PosixBasicParser > : : run_optimization_passes ( ) ;
template void Regex < PosixExtendedParser > : : run_optimization_passes ( ) ;
template void Regex < ECMA262Parser > : : run_optimization_passes ( ) ;
}