Compare commits

...

3 commits

Author: Ali Mohammad Pur
SHA1: 0dd8dec490
Message: Merge e383f52d04 into d6bcd3fb0b
Date: 2024-11-20 20:01:46 -05:00

Author: Ali Mohammad Pur
SHA1: e383f52d04
Message: LibRegex: Pick the right target for OpCode_Repeat

    Repeat's 'offset' field is a bit odd in that it is treated as a negative
    offset, causing a backwards jump when positive; the optimizer didn't
    correctly model this behaviour, which caused crashes and misopts when
    dealing with Repeats.

    This commit fixes that behaviour.

Date: 2024-11-18 16:35:11 +01:00

Author: Ali Mohammad Pur
SHA1: f36d87eefa
Message: LibRegex: Print OpCode_Repeat's offset as ssize_t
Date: 2024-11-18 16:35:11 +01:00
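
The commit message for e383f52d04 above describes the quirk at the root of the bug: OpCode_Repeat stores its jump distance as a positive number that the interpreter subtracts from the current position, so a positive offset means a backwards jump. A minimal standalone sketch of that convention (hypothetical names and fields, not code from this patch):

#include <cstddef>

// Hypothetical stand-in for the OpCode_Repeat fields involved.
struct RepeatSketch {
    size_t position; // bytecode index of the Repeat opcode itself
    size_t offset;   // stored as a positive magnitude
};

// The interpreter subtracts the offset, so a positive value sends execution
// backwards to the start of the repeated body; an optimizer that models the
// offset as a forward jump picks the wrong target.
size_t repeat_jump_target(RepeatSketch const& repeat)
{
    return repeat.position - repeat.offset;
}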
3 changed files with 25 additions and 12 deletions

@@ -791,7 +791,13 @@ public:
     ByteString arguments_string() const override
     {
         auto reps = id() < state().repetition_marks.size() ? state().repetition_marks.at(id()) : 0;
-        return ByteString::formatted("offset={} count={} id={} rep={}, sp: {}", offset(), count() + 1, id(), reps + 1, state().string_position);
+        return ByteString::formatted("offset={} [&{}] count={} id={} rep={}, sp: {}",
+            static_cast<ssize_t>(offset()),
+            state().instruction_position - offset(),
+            count() + 1,
+            id(),
+            reps + 1,
+            state().string_position);
     }
 };
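
For illustration, here is roughly what the new debug line conveys, with hypothetical values and plain printf standing in for AK's ByteString::formatted; the bracketed "[&N]" value is the absolute bytecode position the Repeat jumps back to:

#include <cstdio>

int main()
{
    // Hypothetical values, chosen only to show the format.
    unsigned long instruction_position = 20, count = 1, id = 0, reps = 0, string_position = 5;
    long offset = 4; // positive, meaning a backwards jump

    std::printf("offset=%ld [&%lu] count=%lu id=%lu rep=%lu, sp: %lu\n",
        offset, instruction_position - offset, count + 1, id, reps + 1, string_position);
    // Prints: offset=4 [&16] count=2 id=0 rep=1, sp: 5
    return 0;
}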

@@ -28,6 +28,7 @@ namespace Detail {
 struct Block {
     size_t start;
     size_t end;
+    StringView comment { "N/A"sv };
 };
 }

@@ -59,18 +59,18 @@ typename Regex<Parser>::BasicBlockList Regex<Parser>::split_basic_blocks(ByteCod
         auto& op = static_cast<T const&>(opcode);
         ssize_t jump_offset = op.size() + op.offset();
         if (jump_offset >= 0) {
-            block_boundaries.append({ end_of_last_block, state.instruction_position });
+            block_boundaries.append({ end_of_last_block, state.instruction_position, "Jump ahead"sv });
             end_of_last_block = state.instruction_position + opcode.size();
         } else {
             // This op jumps back, see if that's within this "block".
             if (jump_offset + state.instruction_position > end_of_last_block) {
                 // Split the block!
-                block_boundaries.append({ end_of_last_block, jump_offset + state.instruction_position });
-                block_boundaries.append({ jump_offset + state.instruction_position, state.instruction_position });
+                block_boundaries.append({ end_of_last_block, jump_offset + state.instruction_position, "Jump back 1"sv });
+                block_boundaries.append({ jump_offset + state.instruction_position, state.instruction_position, "Jump back 2"sv });
                 end_of_last_block = state.instruction_position + opcode.size();
             } else {
                 // Nope, it's just a jump to another block
-                block_boundaries.append({ end_of_last_block, state.instruction_position });
+                block_boundaries.append({ end_of_last_block, state.instruction_position, "Jump"sv });
                 end_of_last_block = state.instruction_position + opcode.size();
             }
         }
@@ -92,15 +92,16 @@ typename Regex<Parser>::BasicBlockList Regex<Parser>::split_basic_blocks(ByteCod
             check_jump.template operator()<OpCode_ForkStay>(opcode);
             break;
         case OpCodeId::FailForks:
-            block_boundaries.append({ end_of_last_block, state.instruction_position });
+            block_boundaries.append({ end_of_last_block, state.instruction_position, "FailForks"sv });
             end_of_last_block = state.instruction_position + opcode.size();
             break;
         case OpCodeId::Repeat: {
             // Repeat produces two blocks, one containing its repeated expr, and one after that.
-            auto repeat_start = state.instruction_position - static_cast<OpCode_Repeat const&>(opcode).offset();
+            auto& repeat = static_cast<OpCode_Repeat const&>(opcode);
+            auto repeat_start = state.instruction_position - repeat.offset() - repeat.size();
             if (repeat_start > end_of_last_block)
-                block_boundaries.append({ end_of_last_block, repeat_start });
-            block_boundaries.append({ repeat_start, state.instruction_position });
+                block_boundaries.append({ end_of_last_block, repeat_start, "Repeat"sv });
+            block_boundaries.append({ repeat_start, state.instruction_position, "Repeat after"sv });
             end_of_last_block = state.instruction_position + opcode.size();
             break;
         }
@@ -116,7 +117,7 @@ typename Regex<Parser>::BasicBlockList Regex<Parser>::split_basic_blocks(ByteCod
     }
     if (end_of_last_block < bytecode_size)
-        block_boundaries.append({ end_of_last_block, bytecode_size });
+        block_boundaries.append({ end_of_last_block, bytecode_size, "End"sv });
     quick_sort(block_boundaries, [](auto& a, auto& b) { return a.start < b.start; });
@@ -664,7 +665,7 @@ void Regex<Parser>::attempt_rewrite_loops_as_atomic_groups(BasicBlockList const&
         RegexDebug dbg;
         dbg.print_bytecode(*this);
         for (auto const& block : basic_blocks)
-            dbgln("block from {} to {}", block.start, block.end);
+            dbgln("block from {} to {} (comment: {})", block.start, block.end, block.comment);
     }
     // A pattern such as:
@@ -1221,6 +1222,7 @@ void Optimizer::append_alternation(ByteCode& target, Span<ByteCode> alternatives
             ssize_t jump_offset;
             auto is_jump = true;
             auto patch_location = state.instruction_position + 1;
+            bool should_negate = false;
             switch (opcode.opcode_id()) {
             case OpCodeId::Jump:
@@ -1243,6 +1245,7 @@ void Optimizer::append_alternation(ByteCode& target, Span<ByteCode> alternatives
                 break;
             case OpCodeId::Repeat:
                 jump_offset = static_cast<ssize_t>(0) - static_cast<ssize_t>(static_cast<OpCode_Repeat const&>(opcode).offset()) - static_cast<ssize_t>(opcode.size());
+                should_negate = true;
                 break;
             default:
                 is_jump = false;
@@ -1272,7 +1275,10 @@ void Optimizer::append_alternation(ByteCode& target, Span<ByteCode> alternatives
                         intended_jump_ip);
                     VERIFY_NOT_REACHED();
                 }
-                target[patch_location] = static_cast<ByteCodeValueType>(*target_ip - patch_location - 1);
+                ssize_t target_value = *target_ip - patch_location - 1;
+                if (should_negate)
+                    target_value = -target_value + 2; // from -1 to +1.
+                target[patch_location] = static_cast<ByteCodeValueType>(target_value);
             } else {
                 patch_locations.append({ QualifiedIP { ip.alternative_index, intended_jump_ip }, patch_location });
             }
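
To make the last change concrete (a sketch with made-up numbers, not part of the patch): the other jump-style opcodes handled here store a forward-relative operand, computed as target minus patch location minus one, while Repeat expects a positive backwards distance, so the resolved value gets negated and adjusted before being written back.

#include <sys/types.h>

// Mirrors the patching arithmetic above; is_repeat selects the Repeat encoding.
ssize_t patched_operand(size_t target_ip, size_t patch_location, bool is_repeat)
{
    ssize_t value = static_cast<ssize_t>(target_ip) - static_cast<ssize_t>(patch_location) - 1;
    if (is_repeat)
        value = -value + 2; // from -1 to +1, as in the patch
    return value;
}

// Example with made-up positions: for an operand slot at index 30, a backwards
// target at index 10 yields -(10 - 30 - 1) + 2 = 23 under the Repeat encoding,
// while a forward target at index 40 yields 40 - 30 - 1 = 9 under the ordinary
// jump encoding.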