diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp index 8ae12c2ba4e..6047112fa6a 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.cpp +++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp @@ -900,9 +900,8 @@ Vector OpCode_Compare::flat_compares() const result.append({ compare_type, ref }); } else if (compare_type == CharacterCompareType::String) { auto& length = m_bytecode->at(offset++); - if (length > 0) - result.append({ compare_type, m_bytecode->at(offset) }); - StringBuilder str_builder; + for (size_t k = 0; k < length; ++k) + result.append({ CharacterCompareType::Char, m_bytecode->at(offset + k) }); offset += length; } else if (compare_type == CharacterCompareType::CharClass) { auto character_class = m_bytecode->at(offset++); diff --git a/Userland/Libraries/LibRegex/RegexMatch.h b/Userland/Libraries/LibRegex/RegexMatch.h index 2abef98f507..a09eb25aaf3 100644 --- a/Userland/Libraries/LibRegex/RegexMatch.h +++ b/Userland/Libraries/LibRegex/RegexMatch.h @@ -190,6 +190,11 @@ public: explicit RegexStringView(DeprecatedString&&) = delete; + bool is_string_view() const /* Lets callers test the stored view before calling string_view(); Matcher::execute uses it to gate the substring fast path. NOTE(review): the template argument of has() is missing in this copy of the patch - presumably StringView, confirm. */ + { + return m_view.has(); + } + StringView string_view() const { return m_view.get(); diff --git a/Userland/Libraries/LibRegex/RegexMatcher.cpp b/Userland/Libraries/LibRegex/RegexMatcher.cpp index 4afe01fa03c..b91b31f7260 100644 --- a/Userland/Libraries/LibRegex/RegexMatcher.cpp +++ b/Userland/Libraries/LibRegex/RegexMatcher.cpp @@ -419,6 +419,26 @@ private: template bool Matcher::execute(MatchInput const& input, MatchState& state, size_t& operations) const { + if (m_pattern->parser_result.optimization_data.pure_substring_search.has_value() && input.view.is_string_view()) { + // Yay, we can do a simple substring search! The optimizer stored the whole pattern as one literal in pure_substring_search, so that literal is compared against the input at the current string position instead of running the bytecode. 
+ auto& needle = m_pattern->parser_result.optimization_data.pure_substring_search.value(); + if (needle.length() + state.string_position > input.view.length()) + return false; + + auto haystack = input.view.string_view().substring_view(state.string_position); + if (input.regex_options.has_flag_set(AllFlags::Insensitive)) { + if (!haystack.substring_view(0, needle.length()).equals_ignoring_ascii_case(needle)) + return false; + } else { + if (!haystack.starts_with(needle)) + return false; + } + + state.string_position += needle.length(); + state.string_position_in_code_units += needle.length(); + return true; + } + BumpAllocatedLinkedList states_to_try_next; #if REGEX_DEBUG size_t recursion_level = 0; diff --git a/Userland/Libraries/LibRegex/RegexMatcher.h b/Userland/Libraries/LibRegex/RegexMatcher.h index 4e8786d0bb2..7f1ef652c32 100644 --- a/Userland/Libraries/LibRegex/RegexMatcher.h +++ b/Userland/Libraries/LibRegex/RegexMatcher.h @@ -231,6 +231,7 @@ public: private: void run_optimization_passes(); void attempt_rewrite_loops_as_atomic_groups(BasicBlockList const&); + bool attempt_rewrite_entire_match_as_substring_search(BasicBlockList const&); }; // free standing functions for match, search and has_match diff --git a/Userland/Libraries/LibRegex/RegexOptimizer.cpp b/Userland/Libraries/LibRegex/RegexOptimizer.cpp index d3e3689a939..e765902464e 100644 --- a/Userland/Libraries/LibRegex/RegexOptimizer.cpp +++ b/Userland/Libraries/LibRegex/RegexOptimizer.cpp @@ -25,9 +25,13 @@ void Regex::run_optimization_passes() { parser_result.bytecode.flatten(); + auto blocks = split_basic_blocks(parser_result.bytecode); + if (attempt_rewrite_entire_match_as_substring_search(blocks)) + return; + // Rewrite fork loops as atomic groups // e.g. 
a*b -> (ATOMIC a*)b - attempt_rewrite_loops_as_atomic_groups(split_basic_blocks(parser_result.bytecode)); + attempt_rewrite_loops_as_atomic_groups(blocks); parser_result.bytecode.flatten(); } @@ -520,6 +524,51 @@ static AtomicRewritePreconditionResult block_satisfies_atomic_rewrite_preconditi return AtomicRewritePreconditionResult::SatisfiedWithEmptyHeader; } +template +bool Regex::attempt_rewrite_entire_match_as_substring_search(BasicBlockList const& basic_blocks) +{ + // If there are no jumps (at most one basic block), we can probably rewrite this as a substring search (Compare { string = str }). On success the literal is stored in parser_result.optimization_data.pure_substring_search and true is returned; returning false leaves the pattern to the remaining optimization passes. + if (basic_blocks.size() > 1) + return false; + + if (basic_blocks.is_empty()) { + parser_result.optimization_data.pure_substring_search = ""sv; + return true; // Empty regex, sure: it is recorded as an empty literal. + } + + auto& bytecode = parser_result.bytecode; + + auto is_unicode = parser_result.options.has_flag_set(AllFlags::Unicode); + + // We have a single basic block; check that it consists only of Compare opcodes whose flattened entries are all Char (flat_compares() expands string compares into one Char entry per element) and collect those characters into the literal. Any other opcode or compare type aborts the rewrite. + StringBuilder final_string; + MatchState state; + while (state.instruction_position < bytecode.size()) { + auto& opcode = bytecode.get_opcode(state); + switch (opcode.opcode_id()) { + case OpCodeId::Compare: { + auto& compare = static_cast(opcode); + for (auto& flat_compare : compare.flat_compares()) { + if (flat_compare.type != CharacterCompareType::Char) + return false; + + if (is_unicode || flat_compare.value <= 0x7f) // NOTE(review): append_code_point presumably writes the code point's UTF-8 encoding, while the else branch appears to store the value as a single raw byte (cast template arguments are missing in this copy) - confirm against StringBuilder. + final_string.append_code_point(flat_compare.value); + else + final_string.append(bit_cast(static_cast(flat_compare.value))); + } + break; + } + default: + return false; + } + state.instruction_position += opcode.size(); + } + + parser_result.optimization_data.pure_substring_search = final_string.to_deprecated_string(); + return true; +} + template void Regex::attempt_rewrite_loops_as_atomic_groups(BasicBlockList const& basic_blocks) { diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h index 938b6888741..d2923484ed6 100644 --- 
a/Userland/Libraries/LibRegex/RegexParser.h +++ b/Userland/Libraries/LibRegex/RegexParser.h @@ -55,6 +55,10 @@ public: Token error_token; Vector capture_groups; AllOptions options; + + struct { + Optional pure_substring_search; + } optimization_data {}; }; explicit Parser(Lexer& lexer)