Browse Source

LibRegex: Implement an ECMA262 Regex quirk with negative lookarounds

This implements the quirk defined by "Note 3" in section "Canonicalize"
(https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch).

Crosses off another quirk from #6042.
Ali Mohammad Pur 3 years ago
parent
commit
c11be92e23

+ 1 - 0
Tests/LibRegex/Regex.cpp

@@ -682,6 +682,7 @@ TEST_CASE(ECMA262_match)
         { "[\\0]"sv, "\0"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
         { "[\\01]"sv, "\1"sv, true, ECMAScriptFlags::BrowserExtended },
         { "(\0|a)"sv, "a"sv, true }, // #9686, Should allow null bytes in pattern
+        { "(.*?)a(?!(a+)b\\2c)\\2(.*)"sv, "baaabaac"sv, true }, // #6042, Groups inside lookarounds may be referenced outside, but their contents appear empty if the pattern in the lookaround fails.
     };
     // clang-format on
 

+ 13 - 2
Userland/Libraries/LibRegex/RegexMatcher.cpp

@@ -486,7 +486,14 @@ Optional<bool> Matcher<Parser>::execute(MatchInput const& input, MatchState& sta
             return true;
         case ExecutionResult::Failed:
             if (!states_to_try_next.is_empty()) {
-                state = states_to_try_next.take_last();
+                auto next_state = states_to_try_next.take_last();
+                // Note: ECMA262 quirk: Note 3, https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch
+                //       capture groups defined in lookarounds "leak" outside the regex,
+                //       but their contents are empty if the lookaround fails.
+                //       This is done by manually clearing the groups where needed, and leaking their contents here.
+                if constexpr (IsSame<Parser, ECMA262>)
+                    swap(next_state.capture_group_matches, state.capture_group_matches);
+                state = move(next_state);
                 continue;
             }
             return false;
@@ -496,7 +503,11 @@ Optional<bool> Matcher<Parser>::execute(MatchInput const& input, MatchState& sta
                     return {};
                 return false;
             }
-            state = states_to_try_next.take_last();
+            auto next_state = states_to_try_next.take_last();
+            // See note above about an ECMA262 quirk.
+            if constexpr (IsSame<Parser, ECMA262>)
+                swap(next_state.capture_group_matches, state.capture_group_matches);
+            state = move(next_state);
             ++recursion_level;
             continue;
         }

+ 26 - 14
Userland/Libraries/LibRegex/RegexParser.cpp

@@ -10,6 +10,7 @@
 #include <AK/AnyOf.h>
 #include <AK/CharacterTypes.h>
 #include <AK/GenericLexer.h>
+#include <AK/ScopeGuard.h>
 #include <AK/String.h>
 #include <AK/StringBuilder.h>
 #include <AK/StringUtils.h>
@@ -1065,9 +1066,17 @@ bool ECMA262Parser::parse_assertion(ByteCode& stack, [[maybe_unused]] size_t& ma
             return true;
         }
         if (should_parse_forward_assertion && try_skip("!")) {
+            enter_capture_group_scope();
+            ScopeGuard quit_scope {
+                [this] {
+                    exit_capture_group_scope();
+                }
+            };
             if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named))
                 return false;
             stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookAhead);
+            clear_all_capture_groups_in_scope(stack);
+
             return true;
         }
         if (m_should_use_browser_extended_grammar) {
@@ -1086,9 +1095,16 @@ bool ECMA262Parser::parse_assertion(ByteCode& stack, [[maybe_unused]] size_t& ma
             return true;
         }
         if (try_skip("<!")) {
+            enter_capture_group_scope();
+            ScopeGuard quit_scope {
+                [this] {
+                    exit_capture_group_scope();
+                }
+            };
             if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named))
                 return false;
             stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookBehind, length_dummy);
+            clear_all_capture_groups_in_scope(stack);
             return true;
         }
 
@@ -1124,10 +1140,17 @@ bool ECMA262Parser::parse_quantifiable_assertion(ByteCode& stack, size_t&, bool
         return true;
     }
     if (try_skip("!")) {
+        enter_capture_group_scope();
+        ScopeGuard quit_scope {
+            [this] {
+                exit_capture_group_scope();
+            }
+        };
         if (!parse_inner_disjunction(assertion_stack, match_length_minimum, false, named))
             return false;
 
         stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookAhead);
+        clear_all_capture_groups_in_scope(stack);
         return true;
     }
 
@@ -2189,20 +2212,9 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
 {
     consume(TokenType::LeftParen, Error::InvalidPattern);
 
-    auto enter_capture_group_scope = [&] {
-        m_capture_groups_in_scope.empend();
-    };
-    auto exit_capture_group_scope = [&] {
-        auto last = m_capture_groups_in_scope.take_last();
-        m_capture_groups_in_scope.last().extend(move(last));
-    };
     auto register_capture_group_in_current_scope = [&](auto identifier) {
         m_capture_groups_in_scope.last().empend(identifier);
     };
-    auto clear_all_capture_groups_in_scope = [&] {
-        for (auto& index : m_capture_groups_in_scope.last())
-            stack.insert_bytecode_clear_capture_group(index);
-    };
 
     if (match(TokenType::Questionmark)) {
         // Non-capturing group or group with specifier.
@@ -2216,7 +2228,7 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
             enter_capture_group_scope();
             if (!parse_disjunction(noncapture_group_bytecode, length, unicode, named))
                 return set_error(Error::InvalidPattern);
-            clear_all_capture_groups_in_scope();
+            clear_all_capture_groups_in_scope(stack);
             exit_capture_group_scope();
 
             consume(TokenType::RightParen, Error::MismatchingParen);
@@ -2246,7 +2258,7 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
             enter_capture_group_scope();
             if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
                 return set_error(Error::InvalidPattern);
-            clear_all_capture_groups_in_scope();
+            clear_all_capture_groups_in_scope(stack);
             exit_capture_group_scope();
 
             register_capture_group_in_current_scope(group_index);
@@ -2277,7 +2289,7 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
     if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
         return set_error(Error::InvalidPattern);
 
-    clear_all_capture_groups_in_scope();
+    clear_all_capture_groups_in_scope(stack);
     exit_capture_group_scope();
 
     register_capture_group_in_current_scope(group_index);

+ 14 - 0
Userland/Libraries/LibRegex/RegexParser.h

@@ -254,6 +254,20 @@ private:
 
     size_t ensure_total_number_of_capturing_parenthesis();
 
+    void enter_capture_group_scope() { m_capture_groups_in_scope.empend(); }
+
+    void exit_capture_group_scope()
+    {
+        auto last = m_capture_groups_in_scope.take_last();
+        m_capture_groups_in_scope.last().extend(move(last));
+    }
+
+    void clear_all_capture_groups_in_scope(ByteCode& stack)
+    {
+        for (auto& index : m_capture_groups_in_scope.last())
+            stack.insert_bytecode_clear_capture_group(index);
+    };
+
     // ECMA-262's flavour of regex is a bit weird in that it allows backrefs to reference "future" captures, and such backrefs
     // always match the empty string. So we have to know how many capturing parenthesis there are, but we don't want to always
     // parse it twice, so we'll just do so when it's actually needed.