Pārlūkot izejas kodu

LibRegex: Clear previous capture group contents in ECMA262 mode

ECMA262 requires that the capture groups only contain the values from
the last iteration, e.g. `((c)(a)?(b))+` should _not_ contain 'a' in the
`(a)?` capture group after matching "cabcb".
Ali Mohammad Pur 4 gadi atpakaļ
vecāks
revīzija
c8b2199251

+ 25 - 0
Userland/Libraries/LibRegex/RegexByteCode.cpp

@@ -132,6 +132,12 @@ void ByteCode::ensure_opcodes_initialized()
         case OpCodeId::CheckBegin:
             s_opcodes[i] = make<OpCode_CheckBegin>();
             break;
+        case OpCodeId::ClearCaptureGroup:
+            s_opcodes[i] = make<OpCode_ClearCaptureGroup>();
+            break;
+        case OpCodeId::ClearNamedCaptureGroup:
+            s_opcodes[i] = make<OpCode_ClearNamedCaptureGroup>();
+            break;
         case OpCodeId::SaveLeftCaptureGroup:
             s_opcodes[i] = make<OpCode_SaveLeftCaptureGroup>();
             break;
@@ -288,6 +294,16 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(const MatchInput& input,
     return ExecutionResult::Failed_ExecuteLowPrioForks;
 }
 
+ALWAYS_INLINE ExecutionResult OpCode_ClearCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
+{
+    // Reset the slot for capture group id() in the current match; out-of-range indices are ignored.
+    auto& matches = state.capture_group_matches;
+    auto const slot = id();
+    if (input.match_index < matches.size() && slot < matches[input.match_index].size())
+        matches[input.match_index][slot] = {};
+    return ExecutionResult::Continue;
+}
+
 ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
 {
     if (input.match_index >= state.capture_group_matches.size()) {
@@ -333,6 +349,15 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchI
     return ExecutionResult::Continue;
 }
 
+ALWAYS_INLINE ExecutionResult OpCode_ClearNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
+{
+    // Drop the entry stored under name() for the current match. The bounds check must use the
+    // named-group container that is indexed below, not the positional capture_group_matches.
+    if (input.match_index < state.named_capture_group_matches.size())
+        state.named_capture_group_matches[input.match_index].remove(name());
+    return ExecutionResult::Continue;
+}
+
 ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
 {
     if (input.match_index >= state.named_capture_group_matches.size()) {

+ 37 - 0
Userland/Libraries/LibRegex/RegexByteCode.h

@@ -39,6 +39,8 @@ using ByteCodeValueType = u64;
     __ENUMERATE_OPCODE(Save)                       \
     __ENUMERATE_OPCODE(Restore)                    \
     __ENUMERATE_OPCODE(GoBack)                     \
+    __ENUMERATE_OPCODE(ClearCaptureGroup)          \
+    __ENUMERATE_OPCODE(ClearNamedCaptureGroup)     \
     __ENUMERATE_OPCODE(Exit)
 
 // clang-format off
@@ -174,6 +176,19 @@ public:
         extend(move(bytecode));
     }
 
+    void insert_bytecode_clear_capture_group(size_t index)
+    {
+        // Layout: [ClearCaptureGroup opcode][group index] -- two values in total.
+        auto const opcode = static_cast<ByteCodeValueType>(OpCodeId::ClearCaptureGroup);
+        empend(opcode);
+        empend(index);
+    }
+
+    void insert_bytecode_clear_named_capture_group(StringView name)
+    {
+        // Layout: [ClearNamedCaptureGroup opcode][address of name's characters][name length].
+        // NOTE(review): only the address of the characters is stored, not a copy -- presumably the
+        // caller must keep the backing string alive for as long as this bytecode is used; confirm.
+        auto const* characters = name.characters_without_null_termination();
+        empend(static_cast<ByteCodeValueType>(OpCodeId::ClearNamedCaptureGroup));
+        empend(reinterpret_cast<ByteCodeValueType>(characters));
+        empend(name.length());
+    }
+
     void insert_bytecode_compare_string(StringView view)
     {
         ByteCode bytecode;
@@ -626,6 +641,28 @@ public:
     const String arguments_string() const override { return String::formatted("kind={} ({})", (long unsigned int)argument(0), boundary_check_type_name(type())); }
 };
 
+// Opcode that resets one numbered capture group; it carries a single argument, the group index.
+class OpCode_ClearCaptureGroup final : public OpCode {
+public:
+    // Index of the capture group to clear, read from argument 0.
+    ALWAYS_INLINE size_t id() const { return argument(0); }
+
+    // Reports 2 as this opcode's size (the emitter writes the opcode plus one index value).
+    ALWAYS_INLINE size_t size() const override { return 2; }
+    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ClearCaptureGroup; }
+
+    ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
+
+    // Human-readable rendering of the argument.
+    const String arguments_string() const override
+    {
+        return String::formatted("id={}", id());
+    }
+};
+
+// Opcode that removes one named capture group; it carries two arguments describing the name.
+class OpCode_ClearNamedCaptureGroup final : public OpCode {
+public:
+    // Argument 1 holds the name's length; argument 0 holds the address of its characters.
+    ALWAYS_INLINE size_t length() const { return argument(1); }
+    // NOTE(review): the view wraps a raw address taken from the bytecode -- presumably the
+    // pointed-to characters must still be alive whenever this is called; confirm against the emitter.
+    ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; }
+
+    // Reports 3 as this opcode's size (the emitter writes the opcode, the address and the length).
+    ALWAYS_INLINE size_t size() const override { return 3; }
+    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ClearNamedCaptureGroup; }
+
+    ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
+
+    // Human-readable rendering of both arguments.
+    const String arguments_string() const override { return String::formatted("name={}, length={}", name(), length()); }
+};
+
 class OpCode_SaveLeftCaptureGroup final : public OpCode {
 public:
     ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;

+ 39 - 1
Userland/Libraries/LibRegex/RegexParser.cpp

@@ -1877,6 +1877,28 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
 {
     consume(TokenType::LeftParen, Error::InvalidPattern);
 
+    auto enter_capture_group_scope = [&] {
+        m_capture_groups_in_scope.empend();
+    };
+    auto exit_capture_group_scope = [&] {
+        auto last = m_capture_groups_in_scope.take_last();
+        m_capture_groups_in_scope.last().extend(move(last));
+    };
+    auto register_capture_group_in_current_scope = [&](auto identifier) {
+        m_capture_groups_in_scope.last().empend(identifier);
+    };
+    auto clear_all_capture_groups_in_scope = [&] {
+        for (auto& entry : m_capture_groups_in_scope.last()) {
+            entry.visit(
+                [&](size_t index) {
+                    stack.insert_bytecode_clear_capture_group(index);
+                },
+                [&](String const& name) {
+                    stack.insert_bytecode_clear_named_capture_group(name);
+                });
+        }
+    };
+
     if (match(TokenType::Questionmark)) {
         // Non-capturing group or group with specifier.
         consume();
@@ -1885,8 +1907,12 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
             consume();
             ByteCode noncapture_group_bytecode;
             size_t length = 0;
+
+            enter_capture_group_scope();
             if (!parse_disjunction(noncapture_group_bytecode, length, unicode, named))
                 return set_error(Error::InvalidPattern);
+            clear_all_capture_groups_in_scope();
+            exit_capture_group_scope();
 
             consume(TokenType::RightParen, Error::MismatchingParen);
 
@@ -1907,8 +1933,14 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
 
             ByteCode capture_group_bytecode;
             size_t length = 0;
+            enter_capture_group_scope();
             if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
                 return set_error(Error::InvalidPattern);
+            clear_all_capture_groups_in_scope();
+            exit_capture_group_scope();
+
+            register_capture_group_in_current_scope(name);
+            register_capture_group_in_current_scope(group_index);
 
             consume(TokenType::RightParen, Error::MismatchingParen);
 
@@ -1930,7 +1962,7 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
     }
 
     auto group_index = ++m_parser_state.capture_groups_count;
-    stack.insert_bytecode_group_capture_left(group_index);
+    enter_capture_group_scope();
 
     ByteCode capture_group_bytecode;
     size_t length = 0;
@@ -1938,6 +1970,12 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
     if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
         return set_error(Error::InvalidPattern);
 
+    clear_all_capture_groups_in_scope();
+    exit_capture_group_scope();
+
+    register_capture_group_in_current_scope(group_index);
+
+    stack.insert_bytecode_group_capture_left(group_index);
     stack.extend(move(capture_group_bytecode));
 
     m_parser_state.capture_group_minimum_lengths.set(group_index, length);

+ 8 - 0
Userland/Libraries/LibRegex/RegexParser.h

@@ -190,12 +190,14 @@ public:
     explicit ECMA262Parser(Lexer& lexer)
         : Parser(lexer)
     {
+        m_capture_groups_in_scope.empend();
     }
 
     ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
         : Parser(lexer, regex_options.value_or({}))
     {
         m_should_use_browser_extended_grammar = regex_options.has_value() && regex_options->has_flag_set(ECMAScriptFlags::BrowserExtended);
+        m_capture_groups_in_scope.empend();
     }
 
     ~ECMA262Parser() = default;
@@ -242,6 +244,12 @@ private:
 
     // Keep the Annex B. behaviour behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag.
     bool m_should_use_browser_extended_grammar { false };
+
+    // ECMA-262 basically requires that we clear the inner captures of a capture group before trying to match it,
+    // by requiring that (...)+ only contain the matches for the last iteration.
+    // To do that, we have to keep track of which capture groups are "in scope", so we can clear them as needed.
+    using CaptureGroup = Variant<size_t, String>;
+    Vector<Vector<CaptureGroup>> m_capture_groups_in_scope;
 };
 
 using PosixExtended = PosixExtendedParser;