LibRegex: Keep track of instruction positions for backwards tree jumps

Author: Ali Mohammad Pur, 2023-08-05 17:32:47 +03:30 (committed by Andreas Kling)
parent 64c06c345e
commit e689422564
Notes: sideshowbarker 2024-07-17 06:33:00 +09:00
2 changed files with 51 additions and 2 deletions


@@ -1049,6 +1049,8 @@ TEST_CASE(optimizer_alternation)
         Tuple { "a|"sv, "a"sv, 1u },
         Tuple { "a|a|a|a|a|a|a|a|a|b"sv, "a"sv, 1u },
         Tuple { "ab|ac|ad|bc"sv, "bc"sv, 2u },
+        // Should not crash on backwards jumps introduced by '.*'.
+        Tuple { "\\bDroid\\b.*Build|XT912|XT928|XT926|XT915|XT919|XT925|XT1021|\\bMoto E\\b|XT1068|XT1092|XT1052"sv, "XT1068"sv, 6u },
     };
     for (auto& test : tests) {
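The hunk above adds a regression test: the `.*` in the first alternative compiles to a bytecode loop that ends in a jump with a negative offset, which the alternation optimizer below previously had no way to re-target while merging alternatives. The third field of each Tuple appears to be the expected match length ("bc" is 2, "XT1068" is 6). A standalone reproduction might look like the following sketch; it assumes LibRegex's public Regex<ECMA262> API and AK's "sv" string-view literal, and is illustrative rather than part of the commit:

    #include <LibRegex/Regex.h>

    // Minimal sketch: one branch of the alternation contains ".*", which
    // lowers to a loop with a backwards jump, while the other branches share
    // prefixes that the optimizer's trie wants to merge.
    static bool backwards_jump_alternation_matches()
    {
        Regex<ECMA262> re("\\bDroid\\b.*Build|XT912|XT1068"sv);
        auto result = re.match("XT1068"sv);
        return result.success; // expected: true, and no crash in the optimizer
    }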


@@ -863,6 +863,8 @@ void Optimizer::append_alternation(ByteCode& target, Span<ByteCode> alternatives
     Vector<HashMap<size_t, Vector<JumpEdge>>> incoming_jump_edges_for_each_alternative;
     incoming_jump_edges_for_each_alternative.resize(alternatives.size());
+    auto has_any_backwards_jump = false;
+
     MatchState state;
     for (size_t i = 0; i < alternatives.size(); ++i) {
@@ -882,24 +884,31 @@ void Optimizer::append_alternation(ByteCode& target, Span<ByteCode> alternatives
             switch (opcode.opcode_id()) {
             case OpCodeId::Jump:
                 incoming_jump_edges.ensure(static_cast<OpCode_Jump const&>(opcode).offset() + state.instruction_position).append({ opcode_bytes });
+                has_any_backwards_jump |= static_cast<OpCode_Jump const&>(opcode).offset() < 0;
                 break;
             case OpCodeId::JumpNonEmpty:
                 incoming_jump_edges.ensure(static_cast<OpCode_JumpNonEmpty const&>(opcode).offset() + state.instruction_position).append({ opcode_bytes });
+                has_any_backwards_jump |= static_cast<OpCode_JumpNonEmpty const&>(opcode).offset() < 0;
                 break;
             case OpCodeId::ForkJump:
                 incoming_jump_edges.ensure(static_cast<OpCode_ForkJump const&>(opcode).offset() + state.instruction_position).append({ opcode_bytes });
+                has_any_backwards_jump |= static_cast<OpCode_ForkJump const&>(opcode).offset() < 0;
                 break;
             case OpCodeId::ForkStay:
                 incoming_jump_edges.ensure(static_cast<OpCode_ForkStay const&>(opcode).offset() + state.instruction_position).append({ opcode_bytes });
+                has_any_backwards_jump |= static_cast<OpCode_ForkStay const&>(opcode).offset() < 0;
                 break;
             case OpCodeId::ForkReplaceJump:
                 incoming_jump_edges.ensure(static_cast<OpCode_ForkReplaceJump const&>(opcode).offset() + state.instruction_position).append({ opcode_bytes });
+                has_any_backwards_jump |= static_cast<OpCode_ForkReplaceJump const&>(opcode).offset() < 0;
                 break;
             case OpCodeId::ForkReplaceStay:
                 incoming_jump_edges.ensure(static_cast<OpCode_ForkReplaceStay const&>(opcode).offset() + state.instruction_position).append({ opcode_bytes });
+                has_any_backwards_jump |= static_cast<OpCode_ForkReplaceStay const&>(opcode).offset() < 0;
                 break;
             case OpCodeId::Repeat:
                 incoming_jump_edges.ensure(static_cast<OpCode_Repeat const&>(opcode).offset() + state.instruction_position).append({ opcode_bytes });
+                has_any_backwards_jump = true;
                 break;
             default:
                 break;
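This hunk is the detection pass. Each jump-family opcode carries a relative offset; the existing code already records an incoming edge keyed by that offset plus the current instruction position, and the new lines additionally raise has_any_backwards_jump whenever the offset is negative. Repeat always loops back over the body it repeats, so it sets the flag unconditionally. The same idea in a self-contained sketch, with a stand-in Insn type instead of LibRegex's real opcode classes:

    #include <cstddef>
    #include <vector>

    // Stand-in instruction: LibRegex opcodes are richer, but for this pass
    // only the relative jump offset matters.
    struct Insn {
        enum class Kind { Compare, Jump, ForkJump, ForkStay, Repeat } kind;
        std::ptrdiff_t offset { 0 }; // negative => targets an earlier instruction
    };

    // Mirror of the new detection: flag any jump-family opcode with a
    // negative offset; Repeat always jumps backwards.
    static bool has_backwards_jump(std::vector<Insn> const& code)
    {
        for (auto const& insn : code) {
            switch (insn.kind) {
            case Insn::Kind::Jump:
            case Insn::Kind::ForkJump:
            case Insn::Kind::ForkStay:
                if (insn.offset < 0)
                    return true;
                break;
            case Insn::Kind::Repeat:
                return true;
            default:
                break;
            }
        }
        return false;
    }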
@@ -1067,6 +1076,16 @@ void Optimizer::append_alternation(ByteCode& target, Span<ByteCode> alternatives
     Queue<Tree*> nodes_to_visit;
     nodes_to_visit.enqueue(&trie);
+    HashMap<size_t, NonnullOwnPtr<RedBlackTree<u64, u64>>> instruction_positions;
+    if (has_any_backwards_jump)
+        MUST(instruction_positions.try_ensure_capacity(alternatives.size()));
+
+    auto ip_mapping_for_alternative = [&](size_t i) -> RedBlackTree<u64, u64>& {
+        return *instruction_positions.ensure(i, [] {
+            return make<RedBlackTree<u64, u64>>();
+        });
+    };
+
     // each node:
     // node.re
     // forkjump child1
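The new block above sets up the bookkeeping the flag pays for: one ordered map per alternative (AK's RedBlackTree<u64, u64>) from an instruction's position in its original alternative to its eventual position in the merged bytecode, allocated lazily via ensure() and only when a backwards jump actually exists. A rough equivalent using standard containers, with std::map standing in for the red-black tree:

    #include <cstddef>
    #include <cstdint>
    #include <map>
    #include <vector>

    // Maps an instruction's position in the original alternative to its
    // position in the merged bytecode.
    using IpMapping = std::map<std::uint64_t, std::uint64_t>;

    struct IpBookkeeping {
        std::vector<IpMapping> per_alternative;

        // Like the ensure() call above: create the mapping on first use.
        IpMapping& for_alternative(std::size_t i)
        {
            if (per_alternative.size() <= i)
                per_alternative.resize(i + 1);
            return per_alternative[i];
        }
    };

An ordered map is used (rather than a plain hash map) so lookups of recorded positions stay cheap and iteration order is deterministic; the sketch keeps only that property, not AK's exact interface.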
@@ -1089,6 +1108,11 @@ void Optimizer::append_alternation(ByteCode& target, Span<ByteCode> alternatives
         state.instruction_position = target.size();
         target.append(insn_bytes);
+        if (has_any_backwards_jump) {
+            for (auto& ip : node->metadata_value())
+                ip_mapping_for_alternative(ip.alternative_index).insert(ip.instruction_position, state.instruction_position);
+        }
+
         auto& opcode = target.get_opcode(state);
         ssize_t jump_offset;
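Here, every time a trie node's instruction bytes are appended to the merged program, the node's metadata, a list of QualifiedIPs (alternative index plus position in that alternative's original bytecode), is walked so each pair can be mapped to the instruction's final position. A self-contained sketch of that recording step; QualifiedIp below is a stand-in for, not a copy of, LibRegex's QualifiedIP:

    #include <cstddef>
    #include <cstdint>
    #include <map>
    #include <vector>

    // Which alternative an instruction came from, and where it sat in that
    // alternative's original bytecode.
    struct QualifiedIp {
        std::size_t alternative_index;
        std::uint64_t instruction_position;
    };

    // After appending a node's instruction at `emitted_at`, remember where
    // every (alternative, original IP) pair landed. Assumes `mappings` was
    // pre-sized to the number of alternatives.
    static void record_emission(
        std::vector<std::map<std::uint64_t, std::uint64_t>>& mappings,
        std::vector<QualifiedIp> const& node_metadata,
        std::uint64_t emitted_at)
    {
        for (auto const& ip : node_metadata)
            mappings[ip.alternative_index][ip.instruction_position] = emitted_at;
    }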
@@ -1124,8 +1148,31 @@ void Optimizer::append_alternation(ByteCode& target, Span<ByteCode> alternatives
         if (is_jump) {
             VERIFY(node->has_metadata());
-            auto& ip = node->metadata_value().first();
-            patch_locations.append({ QualifiedIP { ip.alternative_index, ip.instruction_position + jump_offset + opcode.size() }, patch_location });
+            QualifiedIP ip = node->metadata_value().first();
+            auto intended_jump_ip = ip.instruction_position + jump_offset + opcode.size();
+            if (jump_offset < 0 && intended_jump_ip > 0) {
+                VERIFY(has_any_backwards_jump);
+                // We should've already seen this instruction, so we can just patch it in.
+                auto& ip_mapping = ip_mapping_for_alternative(ip.alternative_index);
+                auto target_ip = ip_mapping.find(intended_jump_ip);
+                if (!target_ip) {
+                    RegexDebug dbg;
+                    size_t x = 0;
+                    for (auto& entry : alternatives) {
+                        warnln("----------- {} ----------", x++);
+                        dbg.print_bytecode(entry);
+                    }
+                    dbgln("Regex Tree / Unknown backwards jump: {}@{} -> {}",
+                        ip.instruction_position,
+                        ip.alternative_index,
+                        intended_jump_ip);
+                    VERIFY_NOT_REACHED();
+                }
+                target[patch_location] = static_cast<ByteCodeValueType>(*target_ip - patch_location - 1);
+            } else {
+                patch_locations.append({ QualifiedIP { ip.alternative_index, intended_jump_ip }, patch_location });
+            }
         }
     }
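The final hunk is where the mapping is consumed. A forwards jump keeps the old behaviour: its target has not been emitted yet, so a patch location is queued for later. A backwards jump's target must already have been emitted and recorded, so it is patched immediately; the operand becomes the distance from the patch slot to the recorded position, with the trailing "- 1" because, as in the patch expression above, offsets are taken relative to the position just past the operand. If the lookup fails, the code dumps each alternative's bytecode and aborts, since that would mean a jump to an instruction the walk never saw. A sketch of the immediate resolution under the same offset convention, with int64_t standing in for ByteCodeValueType:

    #include <cstddef>
    #include <cstdint>
    #include <map>
    #include <optional>
    #include <vector>

    using Value = std::int64_t; // stand-in for LibRegex's ByteCodeValueType

    // Look up where the intended original instruction landed in the merged
    // bytecode; the real code prints all alternatives and VERIFYs on failure.
    static std::optional<std::uint64_t> lookup_target(
        std::map<std::uint64_t, std::uint64_t> const& ip_mapping,
        std::uint64_t intended_jump_ip)
    {
        auto it = ip_mapping.find(intended_jump_ip);
        if (it == ip_mapping.end())
            return std::nullopt;
        return it->second;
    }

    // Write the relative offset into the operand slot, so that execution
    // resumes at patch_location + 1 + offset == target_ip, matching
    //   target[patch_location] = *target_ip - patch_location - 1;
    static void patch_backwards_jump(std::vector<Value>& program,
        std::size_t patch_location, std::uint64_t target_ip)
    {
        program[patch_location] = static_cast<Value>(target_ip)
            - static_cast<Value>(patch_location) - 1;
    }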