Ver Fonte

LibRegex: Add the literal string search optimisation

This switches to a simple string comparison at the current match position
when the regex pattern is strictly a string literal.
Technically this optimisation can also be made on bounded literal
patterns like /[abc]def/ or /abc|def/, but those are
significantly more complex to implement due to our bytecode-only
approach.
Ali Mohammad Pur há 1 ano atrás
pai
commit
18f4b6c670

+ 2 - 3
Userland/Libraries/LibRegex/RegexByteCode.cpp

@@ -900,9 +900,8 @@ Vector<CompareTypeAndValuePair> OpCode_Compare::flat_compares() const
             result.append({ compare_type, ref });
             result.append({ compare_type, ref });
         } else if (compare_type == CharacterCompareType::String) {
         } else if (compare_type == CharacterCompareType::String) {
             auto& length = m_bytecode->at(offset++);
             auto& length = m_bytecode->at(offset++);
-            if (length > 0)
-                result.append({ compare_type, m_bytecode->at(offset) });
-            StringBuilder str_builder;
+            for (size_t k = 0; k < length; ++k)
+                result.append({ CharacterCompareType::Char, m_bytecode->at(offset + k) });
             offset += length;
             offset += length;
         } else if (compare_type == CharacterCompareType::CharClass) {
         } else if (compare_type == CharacterCompareType::CharClass) {
             auto character_class = m_bytecode->at(offset++);
             auto character_class = m_bytecode->at(offset++);

+ 5 - 0
Userland/Libraries/LibRegex/RegexMatch.h

@@ -190,6 +190,11 @@ public:
 
 
     explicit RegexStringView(DeprecatedString&&) = delete;
     explicit RegexStringView(DeprecatedString&&) = delete;
 
 
+    bool is_string_view() const
+    {
+        return m_view.has<StringView>();
+    }
+
     StringView string_view() const
     StringView string_view() const
     {
     {
         return m_view.get<StringView>();
         return m_view.get<StringView>();

+ 20 - 0
Userland/Libraries/LibRegex/RegexMatcher.cpp

@@ -419,6 +419,26 @@ private:
 template<class Parser>
 template<class Parser>
 bool Matcher<Parser>::execute(MatchInput const& input, MatchState& state, size_t& operations) const
 bool Matcher<Parser>::execute(MatchInput const& input, MatchState& state, size_t& operations) const
 {
 {
+    if (m_pattern->parser_result.optimization_data.pure_substring_search.has_value() && input.view.is_string_view()) {
+        // Yay, we can do a simple substring search!
+        auto& needle = m_pattern->parser_result.optimization_data.pure_substring_search.value();
+        if (needle.length() + state.string_position > input.view.length())
+            return false;
+
+        auto haystack = input.view.string_view().substring_view(state.string_position);
+        if (input.regex_options.has_flag_set(AllFlags::Insensitive)) {
+            if (!haystack.substring_view(0, needle.length()).equals_ignoring_ascii_case(needle))
+                return false;
+        } else {
+            if (!haystack.starts_with(needle))
+                return false;
+        }
+
+        state.string_position += needle.length();
+        state.string_position_in_code_units += needle.length();
+        return true;
+    }
+
     BumpAllocatedLinkedList<MatchState> states_to_try_next;
     BumpAllocatedLinkedList<MatchState> states_to_try_next;
 #if REGEX_DEBUG
 #if REGEX_DEBUG
     size_t recursion_level = 0;
     size_t recursion_level = 0;

+ 1 - 0
Userland/Libraries/LibRegex/RegexMatcher.h

@@ -231,6 +231,7 @@ public:
 private:
 private:
     void run_optimization_passes();
     void run_optimization_passes();
     void attempt_rewrite_loops_as_atomic_groups(BasicBlockList const&);
     void attempt_rewrite_loops_as_atomic_groups(BasicBlockList const&);
+    bool attempt_rewrite_entire_match_as_substring_search(BasicBlockList const&);
 };
 };
 
 
 // free standing functions for match, search and has_match
 // free standing functions for match, search and has_match

+ 50 - 1
Userland/Libraries/LibRegex/RegexOptimizer.cpp

@@ -25,9 +25,13 @@ void Regex<Parser>::run_optimization_passes()
 {
 {
     parser_result.bytecode.flatten();
     parser_result.bytecode.flatten();
 
 
+    auto blocks = split_basic_blocks(parser_result.bytecode);
+    if (attempt_rewrite_entire_match_as_substring_search(blocks))
+        return;
+
     // Rewrite fork loops as atomic groups
     // Rewrite fork loops as atomic groups
     // e.g. a*b -> (ATOMIC a*)b
     // e.g. a*b -> (ATOMIC a*)b
-    attempt_rewrite_loops_as_atomic_groups(split_basic_blocks(parser_result.bytecode));
+    attempt_rewrite_loops_as_atomic_groups(blocks);
 
 
     parser_result.bytecode.flatten();
     parser_result.bytecode.flatten();
 }
 }
@@ -520,6 +524,51 @@ static AtomicRewritePreconditionResult block_satisfies_atomic_rewrite_preconditi
     return AtomicRewritePreconditionResult::SatisfiedWithEmptyHeader;
     return AtomicRewritePreconditionResult::SatisfiedWithEmptyHeader;
 }
 }
 
 
+template<typename Parser>
+bool Regex<Parser>::attempt_rewrite_entire_match_as_substring_search(BasicBlockList const& basic_blocks)
+{
+    // If there are no jumps, we can probably rewrite this as a substring search (Compare { string = str }).
+    if (basic_blocks.size() > 1)
+        return false;
+
+    if (basic_blocks.is_empty()) {
+        parser_result.optimization_data.pure_substring_search = ""sv;
+        return true; // Empty regex, sure.
+    }
+
+    auto& bytecode = parser_result.bytecode;
+
+    auto is_unicode = parser_result.options.has_flag_set(AllFlags::Unicode);
+
+    // We have a single basic block, let's see if it's a series of character or string compares.
+    StringBuilder final_string;
+    MatchState state;
+    while (state.instruction_position < bytecode.size()) {
+        auto& opcode = bytecode.get_opcode(state);
+        switch (opcode.opcode_id()) {
+        case OpCodeId::Compare: {
+            auto& compare = static_cast<OpCode_Compare const&>(opcode);
+            for (auto& flat_compare : compare.flat_compares()) {
+                if (flat_compare.type != CharacterCompareType::Char)
+                    return false;
+
+                if (is_unicode || flat_compare.value <= 0x7f)
+                    final_string.append_code_point(flat_compare.value);
+                else
+                    final_string.append(bit_cast<char>(static_cast<u8>(flat_compare.value)));
+            }
+            break;
+        }
+        default:
+            return false;
+        }
+        state.instruction_position += opcode.size();
+    }
+
+    parser_result.optimization_data.pure_substring_search = final_string.to_deprecated_string();
+    return true;
+}
+
 template<typename Parser>
 template<typename Parser>
 void Regex<Parser>::attempt_rewrite_loops_as_atomic_groups(BasicBlockList const& basic_blocks)
 void Regex<Parser>::attempt_rewrite_loops_as_atomic_groups(BasicBlockList const& basic_blocks)
 {
 {

+ 4 - 0
Userland/Libraries/LibRegex/RegexParser.h

@@ -55,6 +55,10 @@ public:
         Token error_token;
         Token error_token;
         Vector<DeprecatedFlyString> capture_groups;
         Vector<DeprecatedFlyString> capture_groups;
         AllOptions options;
         AllOptions options;
+
+        struct {
+            Optional<DeprecatedString> pure_substring_search;
+        } optimization_data {};
     };
     };
 
 
     explicit Parser(Lexer& lexer)
     explicit Parser(Lexer& lexer)