From e383f52d04bbe2a22099cf1e80e2c448712e170b Mon Sep 17 00:00:00 2001 From: Ali Mohammad Pur Date: Sun, 17 Nov 2024 23:34:45 +0100 Subject: [PATCH] LibRegex: Pick the right target for OpCode_Repeat Repeat's 'offset' field is a bit odd in that it is treated as a negative offset, causing a backwards jump when positive; the optimizer didn't correctly model this behaviour, which caused crashes and misopts when dealing with Repeats. This commit fixes that behaviour. --- Libraries/LibRegex/RegexMatcher.h | 1 + Libraries/LibRegex/RegexOptimizer.cpp | 28 ++++++++++++++++----------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/Libraries/LibRegex/RegexMatcher.h b/Libraries/LibRegex/RegexMatcher.h index 884bc93d521..666a4d67109 100644 --- a/Libraries/LibRegex/RegexMatcher.h +++ b/Libraries/LibRegex/RegexMatcher.h @@ -28,6 +28,7 @@ namespace Detail { struct Block { size_t start; size_t end; + StringView comment { "N/A"sv }; }; } diff --git a/Libraries/LibRegex/RegexOptimizer.cpp b/Libraries/LibRegex/RegexOptimizer.cpp index df8b1ac0b2c..b3d08d935c2 100644 --- a/Libraries/LibRegex/RegexOptimizer.cpp +++ b/Libraries/LibRegex/RegexOptimizer.cpp @@ -59,18 +59,18 @@ typename Regex::BasicBlockList Regex::split_basic_blocks(ByteCod auto& op = static_cast(opcode); ssize_t jump_offset = op.size() + op.offset(); if (jump_offset >= 0) { - block_boundaries.append({ end_of_last_block, state.instruction_position }); + block_boundaries.append({ end_of_last_block, state.instruction_position, "Jump ahead"sv }); end_of_last_block = state.instruction_position + opcode.size(); } else { // This op jumps back, see if that's within this "block". if (jump_offset + state.instruction_position > end_of_last_block) { // Split the block! - block_boundaries.append({ end_of_last_block, jump_offset + state.instruction_position }); - block_boundaries.append({ jump_offset + state.instruction_position, state.instruction_position }); + block_boundaries.append({ end_of_last_block, jump_offset + state.instruction_position, "Jump back 1"sv }); + block_boundaries.append({ jump_offset + state.instruction_position, state.instruction_position, "Jump back 2"sv }); end_of_last_block = state.instruction_position + opcode.size(); } else { // Nope, it's just a jump to another block - block_boundaries.append({ end_of_last_block, state.instruction_position }); + block_boundaries.append({ end_of_last_block, state.instruction_position, "Jump"sv }); end_of_last_block = state.instruction_position + opcode.size(); } } @@ -92,15 +92,16 @@ typename Regex::BasicBlockList Regex::split_basic_blocks(ByteCod check_jump.template operator()(opcode); break; case OpCodeId::FailForks: - block_boundaries.append({ end_of_last_block, state.instruction_position }); + block_boundaries.append({ end_of_last_block, state.instruction_position, "FailForks"sv }); end_of_last_block = state.instruction_position + opcode.size(); break; case OpCodeId::Repeat: { // Repeat produces two blocks, one containing its repeated expr, and one after that. - auto repeat_start = state.instruction_position - static_cast(opcode).offset(); + auto& repeat = static_cast(opcode); + auto repeat_start = state.instruction_position - repeat.offset() - repeat.size(); if (repeat_start > end_of_last_block) - block_boundaries.append({ end_of_last_block, repeat_start }); - block_boundaries.append({ repeat_start, state.instruction_position }); + block_boundaries.append({ end_of_last_block, repeat_start, "Repeat"sv }); + block_boundaries.append({ repeat_start, state.instruction_position, "Repeat after"sv }); end_of_last_block = state.instruction_position + opcode.size(); break; } @@ -116,7 +117,7 @@ typename Regex::BasicBlockList Regex::split_basic_blocks(ByteCod } if (end_of_last_block < bytecode_size) - block_boundaries.append({ end_of_last_block, bytecode_size }); + block_boundaries.append({ end_of_last_block, bytecode_size, "End"sv }); quick_sort(block_boundaries, [](auto& a, auto& b) { return a.start < b.start; }); @@ -664,7 +665,7 @@ void Regex::attempt_rewrite_loops_as_atomic_groups(BasicBlockList const& RegexDebug dbg; dbg.print_bytecode(*this); for (auto const& block : basic_blocks) - dbgln("block from {} to {}", block.start, block.end); + dbgln("block from {} to {} (comment: {})", block.start, block.end, block.comment); } // A pattern such as: @@ -1221,6 +1222,7 @@ void Optimizer::append_alternation(ByteCode& target, Span alternatives ssize_t jump_offset; auto is_jump = true; auto patch_location = state.instruction_position + 1; + bool should_negate = false; switch (opcode.opcode_id()) { case OpCodeId::Jump: @@ -1243,6 +1245,7 @@ void Optimizer::append_alternation(ByteCode& target, Span alternatives break; case OpCodeId::Repeat: jump_offset = static_cast(0) - static_cast(static_cast(opcode).offset()) - static_cast(opcode.size()); + should_negate = true; break; default: is_jump = false; @@ -1272,7 +1275,10 @@ void Optimizer::append_alternation(ByteCode& target, Span alternatives intended_jump_ip); VERIFY_NOT_REACHED(); } - target[patch_location] = static_cast(*target_ip - patch_location - 1); + ssize_t target_value = *target_ip - patch_location - 1; + if (should_negate) + target_value = -target_value + 2; // from -1 to +1. + target[patch_location] = static_cast(target_value); } else { patch_locations.append({ QualifiedIP { ip.alternative_index, intended_jump_ip }, patch_location }); }