Jelajahi Sumber

LibRegex: Only search start of line if pattern begins with ^

Gingeh 10 bulan lalu
induk
melakukan
de588a97c0

+ 24 - 0
Tests/LibRegex/Regex.cpp

@@ -998,6 +998,15 @@ BENCHMARK_CASE(fork_performance)
     EXPECT_EQ(result.success, true);
 }
 
+BENCHMARK_CASE(anchor_performance) // Benchmarks repeated matching of a pattern that begins with the ^ anchor.
+{
+    Regex<ECMA262> re("^b"); // The pattern starts with the start-of-line anchor.
+    for (auto i = 0; i < 100'000; i++) {
+        auto result = re.match(g_lots_of_a_s); // NOTE(review): g_lots_of_a_s is defined outside this hunk; presumably a long run of a characters - confirm.
+        EXPECT_EQ(result.success, false); // Every iteration is expected to report no match.
+    }
+}
+
 TEST_CASE(optimizer_atomic_groups)
 {
     Array tests {
@@ -1078,6 +1087,21 @@ TEST_CASE(optimizer_alternation)
     }
 }
 
+TEST_CASE(start_anchor) // Checks where a PosixBasic pattern with a leading ^ is expected to match.
+{
+    // Ensure that a circumflex at the start only matches the start of the line.
+    {
+        Regex<PosixBasic> re("^abc");
+        EXPECT_EQ(re.match("123abcdef"sv, PosixFlags::Global).success, false); // abc occurs, but not at the start of the input
+        EXPECT_EQ(re.match("abc123"sv, PosixFlags::Global).success, true); // abc is at the very start of the input
+        EXPECT_EQ(re.match("123^abcdef"sv, PosixFlags::Global).success, false); // a literal ^ inside the input is expected not to match
+        EXPECT_EQ(re.match("^abc123"sv, PosixFlags::Global).success, false); // input begins with a literal ^ rather than with abc

+        // Multiple lines: with the Multiline flag, abc directly after a newline is expected to match.
+        EXPECT_EQ(re.match("123\nabc"sv, PosixFlags::Multiline).success, true);
+    }
+}
+
 TEST_CASE(posix_basic_dollar_is_end_anchor)
 {
     // Ensure that a dollar sign at the end only matches the end of the line.

+ 2 - 1
Userland/Libraries/LibRegex/RegexMatcher.cpp

@@ -184,6 +184,7 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona
         continue_search = false;
 
     auto single_match_only = input.regex_options.has_flag_set(AllFlags::SingleMatch);
+    auto only_start_of_line = m_pattern->parser_result.optimization_data.only_start_of_line && !input.regex_options.has_flag_set(AllFlags::Multiline);
 
     for (auto const& view : views) {
         if (lines_to_skip != 0) {
@@ -294,7 +295,7 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona
                 break;
             }
 
-            if (!continue_search)
+            if (!continue_search || only_start_of_line)
                 break;
         }
 

+ 6 - 0
Userland/Libraries/LibRegex/RegexOptimizer.cpp

@@ -36,6 +36,12 @@ void Regex<Parser>::run_optimization_passes()
     // e.g. a*b -> (ATOMIC a*)b
     attempt_rewrite_loops_as_atomic_groups(blocks);
 
+    // FIXME: There are a few more conditions under which this can be true (e.g. within an arbitrarily nested capture group).
+    MatchState state;
+    auto& opcode = parser_result.bytecode.get_opcode(state);
+    if (opcode.opcode_id() == OpCodeId::CheckBegin)
+        parser_result.optimization_data.only_start_of_line = true;
+
     parser_result.bytecode.flatten();
 }
 

+ 1 - 0
Userland/Libraries/LibRegex/RegexParser.h

@@ -58,6 +58,7 @@ public:
 
         struct {
             Optional<ByteString> pure_substring_search;
+            bool only_start_of_line = false;
         } optimization_data {};
     };