Explorar o código

LibRegex: Correctly handle failing in the middle of explicit repeats

- Make sure that all the Repeat ops are reset (otherwise the operation
  would not be correct when going over the Repeat op a second time)
- Make sure that all matches that are allowed to fail are backed by a
  fork, otherwise the last failing fork would not have anywhere to
  return to.
Fixes #9707.
Ali Mohammad Pur hai 3 anos
pai
achega
dd82c2e9b4

+ 12 - 0
Userland/Libraries/LibRegex/RegexByteCode.cpp

@@ -189,6 +189,9 @@ void ByteCode::ensure_opcodes_initialized()
         case OpCodeId::Repeat:
             s_opcodes[i] = make<OpCode_Repeat>();
             break;
+        case OpCodeId::ResetRepeat:
+            s_opcodes[i] = make<OpCode_ResetRepeat>();
+            break;
         }
     }
     s_opcodes_initialized = true;
@@ -883,4 +886,13 @@ ALWAYS_INLINE ExecutionResult OpCode_Repeat::execute(MatchInput const&, MatchSta
     return ExecutionResult::Continue;
 }
 
+ALWAYS_INLINE ExecutionResult OpCode_ResetRepeat::execute(MatchInput const&, MatchState& state) const
+{
+    if (id() >= state.repetition_marks.size())
+        state.repetition_marks.resize(id() + 1);
+
+    state.repetition_marks.at(id()) = 0;
+    return ExecutionResult::Continue;
+}
+
 }

+ 38 - 18
Userland/Libraries/LibRegex/RegexByteCode.h

@@ -41,6 +41,7 @@ using ByteCodeValueType = u64;
     __ENUMERATE_OPCODE(GoBack)                     \
     __ENUMERATE_OPCODE(ClearCaptureGroup)          \
     __ENUMERATE_OPCODE(Repeat)                     \
+    __ENUMERATE_OPCODE(ResetRepeat)                \
     __ENUMERATE_OPCODE(Exit)
 
 // clang-format off
@@ -333,40 +334,46 @@ public:
     }
 
     template<typename T>
-    static void transform_bytecode_repetition_min_max(ByteCode& bytecode_to_repeat, T minimum, Optional<T> maximum, size_t repetition_mark_id, bool greedy = true) requires(IsIntegral<T>)
+    static void transform_bytecode_repetition_min_max(ByteCode& bytecode_to_repeat, T minimum, Optional<T> maximum, size_t min_repetition_mark_id, size_t max_repetition_mark_id, bool greedy = true) requires(IsIntegral<T>)
     {
         ByteCode new_bytecode;
-        new_bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum, repetition_mark_id);
+        new_bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum, min_repetition_mark_id);
 
         if (maximum.has_value()) {
             // (REPEAT REGEXP MIN)
-            // LABEL _MAX_LOOP        |
-            // FORK END               |
-            // REGEXP                 |
-            // REPEAT _MAX_LOOP MAX-1 | if max > min
-            // REGEXP                 |
-            // FORK END               |
-            // LABEL END              |
+            // LABEL _MAX_LOOP            |
+            // FORK END                   |
+            // REGEXP                     |
+            // REPEAT _MAX_LOOP MAX-MIN   | if max > min
+            // FORK END                   |
+            // REGEXP                     |
+            // LABEL END                  |
+            // RESET _MAX_LOOP            |
             auto jump_kind = static_cast<ByteCodeValueType>(greedy ? OpCodeId::ForkStay : OpCodeId::ForkJump);
             if (maximum.value() > minimum) {
                 new_bytecode.empend(jump_kind);
                 new_bytecode.empend((ByteCodeValueType)0); // Placeholder for the jump target.
                 auto pre_loop_fork_jump_index = new_bytecode.size();
+                new_bytecode.extend(bytecode_to_repeat);
                 auto repetitions = maximum.value() - minimum;
-                dbgln("max {}, min {}, reps {}", *maximum, minimum, repetitions);
+                auto fork_jump_address = new_bytecode.size();
                 if (repetitions > 1) {
-                    new_bytecode.extend(bytecode_to_repeat);
                     new_bytecode.empend((ByteCodeValueType)OpCodeId::Repeat);
                     new_bytecode.empend(bytecode_to_repeat.size() + 2);
                     new_bytecode.empend(static_cast<ByteCodeValueType>(repetitions - 1));
-                    new_bytecode.empend(repetition_mark_id);
+                    new_bytecode.empend(max_repetition_mark_id);
+                    new_bytecode.empend(jump_kind);
+                    new_bytecode.empend((ByteCodeValueType)0); // Placeholder for the jump target.
+                    auto post_loop_fork_jump_index = new_bytecode.size();
+                    new_bytecode.extend(bytecode_to_repeat);
+                    fork_jump_address = new_bytecode.size();
+
+                    new_bytecode[post_loop_fork_jump_index - 1] = (ByteCodeValueType)(fork_jump_address - post_loop_fork_jump_index);
+
+                    new_bytecode.empend((ByteCodeValueType)OpCodeId::ResetRepeat);
+                    new_bytecode.empend((ByteCodeValueType)max_repetition_mark_id);
                 }
-                new_bytecode.extend(bytecode_to_repeat);
-                new_bytecode.empend(jump_kind);
-                new_bytecode.empend((ByteCodeValueType)0); // Placeholder for the jump target.
-                auto post_loop_fork_jump_index = new_bytecode.size();
-                new_bytecode[pre_loop_fork_jump_index - 1] = (ByteCodeValueType)(new_bytecode.size() - pre_loop_fork_jump_index);
-                new_bytecode[post_loop_fork_jump_index - 1] = (ByteCodeValueType)(new_bytecode.size() - post_loop_fork_jump_index);
+                new_bytecode[pre_loop_fork_jump_index - 1] = (ByteCodeValueType)(fork_jump_address - pre_loop_fork_jump_index);
             }
         } else {
             // no maximum value set, repeat finding if possible
@@ -724,6 +731,19 @@ public:
     }
 };
 
+class OpCode_ResetRepeat : public OpCode {
+public:
+    ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
+    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ResetRepeat; }
+    ALWAYS_INLINE size_t size() const override { return 2; }
+    ALWAYS_INLINE size_t id() const { return argument(0); }
+    String const arguments_string() const override
+    {
+        auto reps = id() < state().repetition_marks.size() ? state().repetition_marks.at(id()) : 0;
+        return String::formatted("id={} rep={}", id(), reps + 1);
+    }
+};
+
 template<typename T>
 bool is(OpCode const&);
 

+ 9 - 6
Userland/Libraries/LibRegex/RegexParser.cpp

@@ -446,8 +446,9 @@ bool PosixBasicParser::parse_simple_re(ByteCode& bytecode, size_t& match_length_
         if (min_limit > s_maximum_repetition_count || (max_limit.has_value() && *max_limit > s_maximum_repetition_count))
             return set_error(Error::InvalidBraceContent);
 
-        auto repetition_mark_id = m_parser_state.repetition_mark_count++;
-        ByteCode::transform_bytecode_repetition_min_max(simple_re_bytecode, min_limit, max_limit, repetition_mark_id, true);
+        auto min_repetition_mark_id = m_parser_state.repetition_mark_count++;
+        auto max_repetition_mark_id = m_parser_state.repetition_mark_count++;
+        ByteCode::transform_bytecode_repetition_min_max(simple_re_bytecode, min_limit, max_limit, min_repetition_mark_id, max_repetition_mark_id, true);
         match_length_minimum += re_match_length_minimum * min_limit;
     } else {
         match_length_minimum += re_match_length_minimum;
@@ -620,8 +621,9 @@ ALWAYS_INLINE bool PosixExtendedParser::parse_repetition_symbol(ByteCode& byteco
             maybe_maximum = value.value();
         }
 
-        auto repetition_mark_id = m_parser_state.repetition_mark_count++;
-        ByteCode::transform_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maybe_maximum, repetition_mark_id);
+        auto min_repetition_mark_id = m_parser_state.repetition_mark_count++;
+        auto max_repetition_mark_id = m_parser_state.repetition_mark_count++;
+        ByteCode::transform_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maybe_maximum, min_repetition_mark_id, max_repetition_mark_id);
 
         consume(TokenType::RightCurly, Error::MismatchingBrace);
         return !has_error();
@@ -1219,8 +1221,9 @@ bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minim
         match_length_minimum = 0;
         break;
     case Repetition::Explicit: {
-        auto repetition_mark_id = m_parser_state.repetition_mark_count++;
-        ByteCode::transform_bytecode_repetition_min_max(stack, repeat_min.value(), repeat_max, repetition_mark_id, !ungreedy);
+        auto min_repetition_mark_id = m_parser_state.repetition_mark_count++;
+        auto max_repetition_mark_id = m_parser_state.repetition_mark_count++;
+        ByteCode::transform_bytecode_repetition_min_max(stack, repeat_min.value(), repeat_max, min_repetition_mark_id, max_repetition_mark_id, !ungreedy);
         match_length_minimum *= repeat_min.value();
         break;
     }