mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-22 15:40:19 +00:00
LibRegex: Add the literal string search optimisation
This switches to using a simple string equality check if the regex pattern is strictly a string literal. Technically, this optimisation could also be applied to bounded literal patterns like /[abc]def/ or /abc|def/, but those are significantly more complex to implement due to our bytecode-only approach.
This commit is contained in:
parent
221c52c696
commit
18f4b6c670
Notes:
sideshowbarker
2024-07-17 05:09:48 +09:00
Author: https://github.com/alimpfard Commit: https://github.com/SerenityOS/serenity/commit/18f4b6c670 Pull-request: https://github.com/SerenityOS/serenity/pull/20240 Reviewed-by: https://github.com/awesomekling Reviewed-by: https://github.com/trflynn89
6 changed files with 82 additions and 4 deletions
|
@ -900,9 +900,8 @@ Vector<CompareTypeAndValuePair> OpCode_Compare::flat_compares() const
|
||||||
result.append({ compare_type, ref });
|
result.append({ compare_type, ref });
|
||||||
} else if (compare_type == CharacterCompareType::String) {
|
} else if (compare_type == CharacterCompareType::String) {
|
||||||
auto& length = m_bytecode->at(offset++);
|
auto& length = m_bytecode->at(offset++);
|
||||||
if (length > 0)
|
for (size_t k = 0; k < length; ++k)
|
||||||
result.append({ compare_type, m_bytecode->at(offset) });
|
result.append({ CharacterCompareType::Char, m_bytecode->at(offset + k) });
|
||||||
StringBuilder str_builder;
|
|
||||||
offset += length;
|
offset += length;
|
||||||
} else if (compare_type == CharacterCompareType::CharClass) {
|
} else if (compare_type == CharacterCompareType::CharClass) {
|
||||||
auto character_class = m_bytecode->at(offset++);
|
auto character_class = m_bytecode->at(offset++);
|
||||||
|
|
|
@ -190,6 +190,11 @@ public:
|
||||||
|
|
||||||
explicit RegexStringView(DeprecatedString&&) = delete;
|
explicit RegexStringView(DeprecatedString&&) = delete;
|
||||||
|
|
||||||
|
// Returns true when m_view currently holds a StringView (m_view.has<StringView>()).
// string_view() below calls m_view.get<StringView>(), so this can be checked first.
bool is_string_view() const
|
||||||
|
{
|
||||||
|
return m_view.has<StringView>();
|
||||||
|
}
|
||||||
|
|
||||||
StringView string_view() const
|
StringView string_view() const
|
||||||
{
|
{
|
||||||
return m_view.get<StringView>();
|
return m_view.get<StringView>();
|
||||||
|
|
|
@ -419,6 +419,26 @@ private:
|
||||||
template<class Parser>
|
template<class Parser>
|
||||||
bool Matcher<Parser>::execute(MatchInput const& input, MatchState& state, size_t& operations) const
|
bool Matcher<Parser>::execute(MatchInput const& input, MatchState& state, size_t& operations) const
|
||||||
{
|
{
|
||||||
|
if (m_pattern->parser_result.optimization_data.pure_substring_search.has_value() && input.view.is_string_view()) {
|
||||||
|
// Yay, we can do a simple substring search!
|
||||||
|
auto& needle = m_pattern->parser_result.optimization_data.pure_substring_search.value();
|
||||||
|
if (needle.length() + state.string_position > input.view.length())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
auto haystack = input.view.string_view().substring_view(state.string_position);
|
||||||
|
if (input.regex_options.has_flag_set(AllFlags::Insensitive)) {
|
||||||
|
if (!haystack.substring_view(0, needle.length()).equals_ignoring_ascii_case(needle))
|
||||||
|
return false;
|
||||||
|
} else {
|
||||||
|
if (!haystack.starts_with(needle))
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
state.string_position += needle.length();
|
||||||
|
state.string_position_in_code_units += needle.length();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
BumpAllocatedLinkedList<MatchState> states_to_try_next;
|
BumpAllocatedLinkedList<MatchState> states_to_try_next;
|
||||||
#if REGEX_DEBUG
|
#if REGEX_DEBUG
|
||||||
size_t recursion_level = 0;
|
size_t recursion_level = 0;
|
||||||
|
|
|
@ -231,6 +231,7 @@ public:
|
||||||
private:
|
private:
|
||||||
void run_optimization_passes();
|
void run_optimization_passes();
|
||||||
void attempt_rewrite_loops_as_atomic_groups(BasicBlockList const&);
|
void attempt_rewrite_loops_as_atomic_groups(BasicBlockList const&);
|
||||||
|
bool attempt_rewrite_entire_match_as_substring_search(BasicBlockList const&);
|
||||||
};
|
};
|
||||||
|
|
||||||
// free standing functions for match, search and has_match
|
// free standing functions for match, search and has_match
|
||||||
|
|
|
@ -25,9 +25,13 @@ void Regex<Parser>::run_optimization_passes()
|
||||||
{
|
{
|
||||||
parser_result.bytecode.flatten();
|
parser_result.bytecode.flatten();
|
||||||
|
|
||||||
|
auto blocks = split_basic_blocks(parser_result.bytecode);
|
||||||
|
if (attempt_rewrite_entire_match_as_substring_search(blocks))
|
||||||
|
return;
|
||||||
|
|
||||||
// Rewrite fork loops as atomic groups
|
// Rewrite fork loops as atomic groups
|
||||||
// e.g. a*b -> (ATOMIC a*)b
|
// e.g. a*b -> (ATOMIC a*)b
|
||||||
attempt_rewrite_loops_as_atomic_groups(split_basic_blocks(parser_result.bytecode));
|
attempt_rewrite_loops_as_atomic_groups(blocks);
|
||||||
|
|
||||||
parser_result.bytecode.flatten();
|
parser_result.bytecode.flatten();
|
||||||
}
|
}
|
||||||
|
@ -520,6 +524,51 @@ static AtomicRewritePreconditionResult block_satisfies_atomic_rewrite_preconditi
|
||||||
return AtomicRewritePreconditionResult::SatisfiedWithEmptyHeader;
|
return AtomicRewritePreconditionResult::SatisfiedWithEmptyHeader;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Optimization pass: checks whether the whole pattern is just a literal string and, if so,
// records that literal in parser_result.optimization_data.pure_substring_search.
//
// The pattern qualifies when there is at most one basic block and every opcode in the bytecode
// is a Compare whose flattened compares are all of type CharacterCompareType::Char.
// The bytecode is only read here; the only state written is pure_substring_search.
//
// Returns true when a literal (possibly the empty string) was recorded, false otherwise.
template<typename Parser>
|
||||||
|
bool Regex<Parser>::attempt_rewrite_entire_match_as_substring_search(BasicBlockList const& basic_blocks)
|
||||||
|
{
|
||||||
|
// If there's no jumps, we can probably rewrite this as a substring search (Compare { string = str }).
|
||||||
|
if (basic_blocks.size() > 1)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (basic_blocks.is_empty()) {
|
||||||
|
parser_result.optimization_data.pure_substring_search = ""sv;
|
||||||
|
return true; // Empty regex, sure.
|
||||||
|
}
|
||||||
|
|
||||||
|
auto& bytecode = parser_result.bytecode;
|
||||||
|
|
||||||
|
auto is_unicode = parser_result.options.has_flag_set(AllFlags::Unicode);
|
||||||
|
|
||||||
|
// We have a single basic block, let's see if it's a series of character or string compares.
|
||||||
|
StringBuilder final_string;
|
||||||
|
// Only state.instruction_position is used below, as the cursor for walking the opcodes.
MatchState state;
|
||||||
|
while (state.instruction_position < bytecode.size()) {
|
||||||
|
auto& opcode = bytecode.get_opcode(state);
|
||||||
|
switch (opcode.opcode_id()) {
|
||||||
|
case OpCodeId::Compare: {
|
||||||
|
auto& compare = static_cast<OpCode_Compare const&>(opcode);
|
||||||
|
for (auto& flat_compare : compare.flat_compares()) {
|
||||||
|
// Anything other than a plain character compare disqualifies the pattern.
if (flat_compare.type != CharacterCompareType::Char)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
// In unicode mode, or for values up to 0x7f, append the value as a code point;
// otherwise append it as a single raw byte (the value is narrowed to u8).
if (is_unicode || flat_compare.value <= 0x7f)
|
||||||
|
final_string.append_code_point(flat_compare.value);
|
||||||
|
else
|
||||||
|
final_string.append(bit_cast<char>(static_cast<u8>(flat_compare.value)));
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// Any opcode other than Compare means this is not a pure literal.
default:
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// Advance the cursor past the opcode that was just inspected.
state.instruction_position += opcode.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
parser_result.optimization_data.pure_substring_search = final_string.to_deprecated_string();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
template<typename Parser>
|
template<typename Parser>
|
||||||
void Regex<Parser>::attempt_rewrite_loops_as_atomic_groups(BasicBlockList const& basic_blocks)
|
void Regex<Parser>::attempt_rewrite_loops_as_atomic_groups(BasicBlockList const& basic_blocks)
|
||||||
{
|
{
|
||||||
|
|
|
@ -55,6 +55,10 @@ public:
|
||||||
Token error_token;
|
Token error_token;
|
||||||
Vector<DeprecatedFlyString> capture_groups;
|
Vector<DeprecatedFlyString> capture_groups;
|
||||||
AllOptions options;
|
AllOptions options;
|
||||||
|
|
||||||
|
// Results recorded by the regex optimization passes.
struct {
|
||||||
|
// Holds the literal when the whole pattern was recognised as a plain string;
// empty (no value) otherwise.
Optional<DeprecatedString> pure_substring_search;
|
||||||
|
} optimization_data {};
|
||||||
};
|
};
|
||||||
|
|
||||||
explicit Parser(Lexer& lexer)
|
explicit Parser(Lexer& lexer)
|
||||||
|
|
Loading…
Reference in a new issue