- /*
- * Copyright (c) 2021, Ali Mohammad Pur <mpfard@serenityos.org>
- *
- * SPDX-License-Identifier: BSD-2-Clause
- */
- #include <AK/Debug.h>
- #include <AK/Function.h>
- #include <AK/Queue.h>
- #include <AK/QuickSort.h>
- #include <AK/RedBlackTree.h>
- #include <AK/Stack.h>
- #include <AK/Trie.h>
- #include <LibRegex/Regex.h>
- #include <LibRegex/RegexBytecodeStreamOptimizer.h>
- #include <LibUnicode/CharacterTypes.h>
- #if REGEX_DEBUG
- # include <AK/ScopeGuard.h>
- # include <AK/ScopeLogger.h>
- #endif
- namespace regex {
- using Detail::Block;
- template<typename Parser>
- void Regex<Parser>::run_optimization_passes()
- {
- parser_result.bytecode.flatten();
- auto blocks = split_basic_blocks(parser_result.bytecode);
- if (attempt_rewrite_entire_match_as_substring_search(blocks))
- return;
- // Rewrite fork loops as atomic groups
- // e.g. a*b -> (ATOMIC a*)b
- attempt_rewrite_loops_as_atomic_groups(blocks);
- parser_result.bytecode.flatten();
- }
- template<typename Parser>
- typename Regex<Parser>::BasicBlockList Regex<Parser>::split_basic_blocks(ByteCode const& bytecode)
- {
- BasicBlockList block_boundaries;
- size_t end_of_last_block = 0;
- auto bytecode_size = bytecode.size();
- MatchState state;
- state.instruction_position = 0;
- auto check_jump = [&]<typename T>(OpCode const& opcode) {
- auto& op = static_cast<T const&>(opcode);
- ssize_t jump_offset = op.size() + op.offset();
- if (jump_offset >= 0) {
- block_boundaries.append({ end_of_last_block, state.instruction_position });
- end_of_last_block = state.instruction_position + opcode.size();
- } else {
- // This op jumps back; see if that's within this "block".
- if (jump_offset + state.instruction_position > end_of_last_block) {
- // Split the block!
- block_boundaries.append({ end_of_last_block, jump_offset + state.instruction_position });
- block_boundaries.append({ jump_offset + state.instruction_position, state.instruction_position });
- end_of_last_block = state.instruction_position + opcode.size();
- } else {
- // Nope, it's just a jump to another block
- block_boundaries.append({ end_of_last_block, state.instruction_position });
- end_of_last_block = state.instruction_position + opcode.size();
- }
- }
- };
- for (;;) {
- auto& opcode = bytecode.get_opcode(state);
- switch (opcode.opcode_id()) {
- case OpCodeId::Jump:
- check_jump.template operator()<OpCode_Jump>(opcode);
- break;
- case OpCodeId::JumpNonEmpty:
- check_jump.template operator()<OpCode_JumpNonEmpty>(opcode);
- break;
- case OpCodeId::ForkJump:
- check_jump.template operator()<OpCode_ForkJump>(opcode);
- break;
- case OpCodeId::ForkStay:
- check_jump.template operator()<OpCode_ForkStay>(opcode);
- break;
- case OpCodeId::FailForks:
- block_boundaries.append({ end_of_last_block, state.instruction_position });
- end_of_last_block = state.instruction_position + opcode.size();
- break;
- case OpCodeId::Repeat: {
- // Repeat produces two blocks, one containing its repeated expr, and one after that.
- auto repeat_start = state.instruction_position - static_cast<OpCode_Repeat const&>(opcode).offset();
- if (repeat_start > end_of_last_block)
- block_boundaries.append({ end_of_last_block, repeat_start });
- block_boundaries.append({ repeat_start, state.instruction_position });
- end_of_last_block = state.instruction_position + opcode.size();
- break;
- }
- default:
- break;
- }
- auto next_ip = state.instruction_position + opcode.size();
- if (next_ip < bytecode_size)
- state.instruction_position = next_ip;
- else
- break;
- }
- if (end_of_last_block < bytecode_size)
- block_boundaries.append({ end_of_last_block, bytecode_size });
- quick_sort(block_boundaries, [](auto& a, auto& b) { return a.start < b.start; });
- return block_boundaries;
- }
- static bool has_overlap(Vector<CompareTypeAndValuePair> const& lhs, Vector<CompareTypeAndValuePair> const& rhs)
- {
- // We have to fully interpret the two sequences to determine if they overlap (that is, keep track of inversion state and what ranges they cover).
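- // For instance (illustrative values): an lhs of { Char 'd' } overlaps an rhs of { CharRange 'a'-'f' },
- // while an lhs of { Char 'x' } does not; inversions on either side flip the answer, which is why the
- // inversion state is tracked below.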
- bool inverse { false };
- bool temporary_inverse { false };
- bool reset_temporary_inverse { false };
- auto current_lhs_inversion_state = [&]() -> bool { return temporary_inverse ^ inverse; };
- RedBlackTree<u32, u32> lhs_ranges;
- RedBlackTree<u32, u32> lhs_negated_ranges;
- HashTable<CharClass> lhs_char_classes;
- HashTable<CharClass> lhs_negated_char_classes;
- auto has_any_unicode_property = false;
- HashTable<Unicode::GeneralCategory> lhs_unicode_general_categories;
- HashTable<Unicode::Property> lhs_unicode_properties;
- HashTable<Unicode::Script> lhs_unicode_scripts;
- HashTable<Unicode::Script> lhs_unicode_script_extensions;
- HashTable<Unicode::GeneralCategory> lhs_negated_unicode_general_categories;
- HashTable<Unicode::Property> lhs_negated_unicode_properties;
- HashTable<Unicode::Script> lhs_negated_unicode_scripts;
- HashTable<Unicode::Script> lhs_negated_unicode_script_extensions;
- auto any_unicode_property_matches = [&](u32 code_point) {
- if (any_of(lhs_negated_unicode_general_categories, [code_point](auto category) { return Unicode::code_point_has_general_category(code_point, category); }))
- return false;
- if (any_of(lhs_negated_unicode_properties, [code_point](auto property) { return Unicode::code_point_has_property(code_point, property); }))
- return false;
- if (any_of(lhs_negated_unicode_scripts, [code_point](auto script) { return Unicode::code_point_has_script(code_point, script); }))
- return false;
- if (any_of(lhs_negated_unicode_script_extensions, [code_point](auto script) { return Unicode::code_point_has_script_extension(code_point, script); }))
- return false;
- if (any_of(lhs_unicode_general_categories, [code_point](auto category) { return Unicode::code_point_has_general_category(code_point, category); }))
- return true;
- if (any_of(lhs_unicode_properties, [code_point](auto property) { return Unicode::code_point_has_property(code_point, property); }))
- return true;
- if (any_of(lhs_unicode_scripts, [code_point](auto script) { return Unicode::code_point_has_script(code_point, script); }))
- return true;
- if (any_of(lhs_unicode_script_extensions, [code_point](auto script) { return Unicode::code_point_has_script_extension(code_point, script); }))
- return true;
- return false;
- };
- auto range_contains = [&]<typename T>(T& value) -> bool {
- u32 start;
- u32 end;
- if constexpr (IsSame<T, CharRange>) {
- start = value.from;
- end = value.to;
- } else {
- start = value;
- end = value;
- }
- if (has_any_unicode_property) {
- // We have some properties, and a range is present
- // Instead of checking every single code point in the range, assume it's a match.
- return start != end || any_unicode_property_matches(start);
- }
- auto* max = lhs_ranges.find_smallest_not_below(start);
- return max && *max <= end;
- };
- auto char_class_contains = [&](CharClass const& value) -> bool {
- if (lhs_char_classes.contains(value))
- return true;
- if (lhs_negated_char_classes.contains(value))
- return false;
- // This char class might match something in the ranges we have, and checking that is far too expensive, so just bail out.
- return true;
- };
- for (auto const& pair : lhs) {
- if (reset_temporary_inverse) {
- reset_temporary_inverse = false;
- temporary_inverse = false;
- } else {
- reset_temporary_inverse = true;
- }
- switch (pair.type) {
- case CharacterCompareType::Inverse:
- inverse = !inverse;
- break;
- case CharacterCompareType::TemporaryInverse:
- temporary_inverse = true;
- reset_temporary_inverse = true;
- break;
- case CharacterCompareType::AnyChar:
- // Special case: if not inverted, AnyChar is always in the range.
- if (!current_lhs_inversion_state())
- return true;
- break;
- case CharacterCompareType::Char:
- if (!current_lhs_inversion_state())
- lhs_ranges.insert(pair.value, pair.value);
- else
- lhs_negated_ranges.insert(pair.value, pair.value);
- break;
- case CharacterCompareType::String:
- // FIXME: We just need to look at the last character of this string, but we only have the first character here.
- // Just bail out to avoid false positives.
- return true;
- case CharacterCompareType::CharClass:
- if (!current_lhs_inversion_state())
- lhs_char_classes.set(static_cast<CharClass>(pair.value));
- else
- lhs_negated_char_classes.set(static_cast<CharClass>(pair.value));
- break;
- case CharacterCompareType::CharRange: {
- auto range = CharRange(pair.value);
- if (!current_lhs_inversion_state())
- lhs_ranges.insert(range.from, range.to);
- else
- lhs_negated_ranges.insert(range.from, range.to);
- break;
- }
- case CharacterCompareType::LookupTable:
- // We've transformed this into a series of ranges in flat_compares(), so bail out if we see it.
- return true;
- case CharacterCompareType::Reference:
- // We've handled this before coming here.
- break;
- case CharacterCompareType::Property:
- has_any_unicode_property = true;
- if (!current_lhs_inversion_state())
- lhs_unicode_properties.set(static_cast<Unicode::Property>(pair.value));
- else
- lhs_negated_unicode_properties.set(static_cast<Unicode::Property>(pair.value));
- break;
- case CharacterCompareType::GeneralCategory:
- has_any_unicode_property = true;
- if (!current_lhs_inversion_state())
- lhs_unicode_general_categories.set(static_cast<Unicode::GeneralCategory>(pair.value));
- else
- lhs_negated_unicode_general_categories.set(static_cast<Unicode::GeneralCategory>(pair.value));
- break;
- case CharacterCompareType::Script:
- has_any_unicode_property = true;
- if (!current_lhs_inversion_state())
- lhs_unicode_scripts.set(static_cast<Unicode::Script>(pair.value));
- else
- lhs_negated_unicode_scripts.set(static_cast<Unicode::Script>(pair.value));
- break;
- case CharacterCompareType::ScriptExtension:
- has_any_unicode_property = true;
- if (!current_lhs_inversion_state())
- lhs_unicode_script_extensions.set(static_cast<Unicode::Script>(pair.value));
- else
- lhs_negated_unicode_script_extensions.set(static_cast<Unicode::Script>(pair.value));
- break;
- case CharacterCompareType::And:
- case CharacterCompareType::Or:
- case CharacterCompareType::EndAndOr:
- // FIXME: These are too difficult to handle, so bail out.
- return true;
- case CharacterCompareType::Undefined:
- case CharacterCompareType::RangeExpressionDummy:
- // These do not occur in valid bytecode.
- VERIFY_NOT_REACHED();
- }
- }
- if constexpr (REGEX_DEBUG) {
- dbgln("lhs ranges:");
- for (auto it = lhs_ranges.begin(); it != lhs_ranges.end(); ++it)
- dbgln(" {}..{}", it.key(), *it);
- dbgln("lhs negated ranges:");
- for (auto it = lhs_negated_ranges.begin(); it != lhs_negated_ranges.end(); ++it)
- dbgln(" {}..{}", it.key(), *it);
- }
- for (auto const& pair : rhs) {
- if (reset_temporary_inverse) {
- reset_temporary_inverse = false;
- temporary_inverse = false;
- } else {
- reset_temporary_inverse = true;
- }
- dbgln_if(REGEX_DEBUG, "check {} ({})...", character_compare_type_name(pair.type), pair.value);
- switch (pair.type) {
- case CharacterCompareType::Inverse:
- inverse = !inverse;
- break;
- case CharacterCompareType::TemporaryInverse:
- temporary_inverse = true;
- reset_temporary_inverse = true;
- break;
- case CharacterCompareType::AnyChar:
- // Special case: if not inverted, AnyChar is always in the range.
- if (!current_lhs_inversion_state())
- return true;
- break;
- case CharacterCompareType::Char:
- if (current_lhs_inversion_state() ^ range_contains(pair.value))
- return true;
- break;
- case CharacterCompareType::String:
- // FIXME: We just need to look at the last character of this string, but we only have the first character here.
- // Just bail out to avoid false positives.
- return true;
- case CharacterCompareType::CharClass:
- if (current_lhs_inversion_state() ^ char_class_contains(static_cast<CharClass>(pair.value)))
- return true;
- break;
- case CharacterCompareType::CharRange: {
- auto range = CharRange(pair.value);
- if (current_lhs_inversion_state() ^ range_contains(range))
- return true;
- break;
- }
- case CharacterCompareType::LookupTable:
- // We've transformed this into a series of ranges in flat_compares(), so bail out if we see it.
- return true;
- case CharacterCompareType::Reference:
- // We've handled this before coming here.
- break;
- case CharacterCompareType::Property:
- // The only reasonable scenario where we can check these properties without spending too much time is if:
- // - the ranges are empty
- // - the char classes are empty
- // - the unicode properties are empty or contain only this property
- if (!lhs_ranges.is_empty() || !lhs_negated_ranges.is_empty() || !lhs_char_classes.is_empty() || !lhs_negated_char_classes.is_empty())
- return true;
- if (has_any_unicode_property && !lhs_unicode_properties.is_empty() && !lhs_negated_unicode_properties.is_empty()) {
- if (current_lhs_inversion_state() ^ lhs_unicode_properties.contains(static_cast<Unicode::Property>(pair.value)))
- return true;
- if (false == (current_lhs_inversion_state() ^ lhs_negated_unicode_properties.contains(static_cast<Unicode::Property>(pair.value))))
- return true;
- }
- break;
- case CharacterCompareType::GeneralCategory:
- if (!lhs_ranges.is_empty() || !lhs_negated_ranges.is_empty() || !lhs_char_classes.is_empty() || !lhs_negated_char_classes.is_empty())
- return true;
- if (has_any_unicode_property && !lhs_unicode_general_categories.is_empty() && !lhs_negated_unicode_general_categories.is_empty()) {
- if (current_lhs_inversion_state() ^ lhs_unicode_general_categories.contains(static_cast<Unicode::GeneralCategory>(pair.value)))
- return true;
- if (false == (current_lhs_inversion_state() ^ lhs_negated_unicode_general_categories.contains(static_cast<Unicode::GeneralCategory>(pair.value))))
- return true;
- }
- break;
- case CharacterCompareType::Script:
- if (!lhs_ranges.is_empty() || !lhs_negated_ranges.is_empty() || !lhs_char_classes.is_empty() || !lhs_negated_char_classes.is_empty())
- return true;
- if (has_any_unicode_property && !lhs_unicode_scripts.is_empty() && !lhs_negated_unicode_scripts.is_empty()) {
- if (current_lhs_inversion_state() ^ lhs_unicode_scripts.contains(static_cast<Unicode::Script>(pair.value)))
- return true;
- if (false == (current_lhs_inversion_state() ^ lhs_negated_unicode_scripts.contains(static_cast<Unicode::Script>(pair.value))))
- return true;
- }
- break;
- case CharacterCompareType::ScriptExtension:
- if (!lhs_ranges.is_empty() || !lhs_negated_ranges.is_empty() || !lhs_char_classes.is_empty() || !lhs_negated_char_classes.is_empty())
- return true;
- if (has_any_unicode_property && !lhs_unicode_script_extensions.is_empty() && !lhs_negated_unicode_script_extensions.is_empty()) {
- if (current_lhs_inversion_state() ^ lhs_unicode_script_extensions.contains(static_cast<Unicode::Script>(pair.value)))
- return true;
- if (false == (current_lhs_inversion_state() ^ lhs_negated_unicode_script_extensions.contains(static_cast<Unicode::Script>(pair.value))))
- return true;
- }
- break;
- case CharacterCompareType::And:
- case CharacterCompareType::Or:
- case CharacterCompareType::EndAndOr:
- // FIXME: These are too difficult to handle, so bail out.
- return true;
- case CharacterCompareType::Undefined:
- case CharacterCompareType::RangeExpressionDummy:
- // These do not occur in valid bytecode.
- VERIFY_NOT_REACHED();
- }
- }
- return false;
- }
- enum class AtomicRewritePreconditionResult {
- SatisfiedWithProperHeader,
- SatisfiedWithEmptyHeader,
- NotSatisfied,
- };
- static AtomicRewritePreconditionResult block_satisfies_atomic_rewrite_precondition(ByteCode const& bytecode, Block const& repeated_block, Block const& following_block)
- {
- Vector<Vector<CompareTypeAndValuePair>> repeated_values;
- HashTable<size_t> active_capture_groups;
- MatchState state;
- auto has_seen_actionable_opcode = false;
- for (state.instruction_position = repeated_block.start; state.instruction_position < repeated_block.end;) {
- auto& opcode = bytecode.get_opcode(state);
- switch (opcode.opcode_id()) {
- case OpCodeId::Compare: {
- has_seen_actionable_opcode = true;
- auto compares = static_cast<OpCode_Compare const&>(opcode).flat_compares();
- if (repeated_values.is_empty() && any_of(compares, [](auto& compare) { return compare.type == CharacterCompareType::AnyChar; }))
- return AtomicRewritePreconditionResult::NotSatisfied;
- repeated_values.append(move(compares));
- break;
- }
- case OpCodeId::CheckBegin:
- case OpCodeId::CheckEnd:
- has_seen_actionable_opcode = true;
- if (repeated_values.is_empty())
- return AtomicRewritePreconditionResult::SatisfiedWithProperHeader;
- break;
- case OpCodeId::CheckBoundary:
- // FIXME: What should we do with these? For now, let's fail.
- return AtomicRewritePreconditionResult::NotSatisfied;
- case OpCodeId::Restore:
- case OpCodeId::GoBack:
- return AtomicRewritePreconditionResult::NotSatisfied;
- case OpCodeId::SaveRightCaptureGroup:
- active_capture_groups.set(static_cast<OpCode_SaveRightCaptureGroup const&>(opcode).id());
- break;
- case OpCodeId::SaveLeftCaptureGroup:
- active_capture_groups.set(static_cast<OpCode_SaveLeftCaptureGroup const&>(opcode).id());
- break;
- case OpCodeId::ForkJump:
- case OpCodeId::ForkReplaceJump:
- case OpCodeId::JumpNonEmpty:
- // We could attempt to recursively resolve the follow set, but pretending that this just goes nowhere is faster.
- if (!has_seen_actionable_opcode)
- return AtomicRewritePreconditionResult::NotSatisfied;
- break;
- default:
- break;
- }
- state.instruction_position += opcode.size();
- }
- dbgln_if(REGEX_DEBUG, "Found {} entries in reference", repeated_values.size());
- dbgln_if(REGEX_DEBUG, "Found {} active capture groups", active_capture_groups.size());
- bool following_block_has_at_least_one_compare = false;
- // Find the first compare in the following block; it must NOT match any of the values in `repeated_values'.
- auto final_instruction = following_block.start;
- for (state.instruction_position = following_block.start; state.instruction_position < following_block.end;) {
- final_instruction = state.instruction_position;
- auto& opcode = bytecode.get_opcode(state);
- switch (opcode.opcode_id()) {
- // Note: These have to exist since we're effectively repeating the following block as well
- case OpCodeId::SaveRightCaptureGroup:
- active_capture_groups.set(static_cast<OpCode_SaveRightCaptureGroup const&>(opcode).id());
- break;
- case OpCodeId::SaveLeftCaptureGroup:
- active_capture_groups.set(static_cast<OpCode_SaveLeftCaptureGroup const&>(opcode).id());
- break;
- case OpCodeId::Compare: {
- following_block_has_at_least_one_compare = true;
- // We found a compare, let's see what it has.
- auto compares = static_cast<OpCode_Compare const&>(opcode).flat_compares();
- if (compares.is_empty())
- break;
- if (any_of(compares, [&](auto& compare) {
- return compare.type == CharacterCompareType::AnyChar
- || (compare.type == CharacterCompareType::Reference && active_capture_groups.contains(compare.value));
- }))
- return AtomicRewritePreconditionResult::NotSatisfied;
- if (any_of(repeated_values, [&](auto& repeated_value) { return has_overlap(compares, repeated_value); }))
- return AtomicRewritePreconditionResult::NotSatisfied;
- return AtomicRewritePreconditionResult::SatisfiedWithProperHeader;
- }
- case OpCodeId::CheckBegin:
- case OpCodeId::CheckEnd:
- return AtomicRewritePreconditionResult::SatisfiedWithProperHeader; // Nothing can match the end!
- case OpCodeId::CheckBoundary:
- // FIXME: What should we do with these? For now, consider them a failure.
- return AtomicRewritePreconditionResult::NotSatisfied;
- case OpCodeId::ForkJump:
- case OpCodeId::ForkReplaceJump:
- case OpCodeId::JumpNonEmpty:
- // See note in the previous switch, same cases.
- if (!following_block_has_at_least_one_compare)
- return AtomicRewritePreconditionResult::NotSatisfied;
- break;
- default:
- break;
- }
- state.instruction_position += opcode.size();
- }
- // If the following block falls through, we can't rewrite it.
- state.instruction_position = final_instruction;
- switch (bytecode.get_opcode(state).opcode_id()) {
- case OpCodeId::Jump:
- case OpCodeId::JumpNonEmpty:
- case OpCodeId::ForkJump:
- case OpCodeId::ForkReplaceJump:
- break;
- default:
- return AtomicRewritePreconditionResult::NotSatisfied;
- }
- if (following_block_has_at_least_one_compare)
- return AtomicRewritePreconditionResult::SatisfiedWithProperHeader;
- return AtomicRewritePreconditionResult::SatisfiedWithEmptyHeader;
- }
- template<typename Parser>
- bool Regex<Parser>::attempt_rewrite_entire_match_as_substring_search(BasicBlockList const& basic_blocks)
- {
- // If there are no jumps, we can probably rewrite this as a substring search (Compare { string = str }).
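- // For example, a literal-only pattern such as /hello/ (no forks, no jumps) can be matched by simply
- // searching for the string "hello" in the haystack instead of running the bytecode at all.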
- if (basic_blocks.size() > 1)
- return false;
- if (basic_blocks.is_empty()) {
- parser_result.optimization_data.pure_substring_search = ""sv;
- return true; // Empty regex, sure.
- }
- auto& bytecode = parser_result.bytecode;
- auto is_unicode = parser_result.options.has_flag_set(AllFlags::Unicode);
- // We have a single basic block, let's see if it's a series of character or string compares.
- StringBuilder final_string;
- MatchState state;
- while (state.instruction_position < bytecode.size()) {
- auto& opcode = bytecode.get_opcode(state);
- switch (opcode.opcode_id()) {
- case OpCodeId::Compare: {
- auto& compare = static_cast<OpCode_Compare const&>(opcode);
- for (auto& flat_compare : compare.flat_compares()) {
- if (flat_compare.type != CharacterCompareType::Char)
- return false;
- if (is_unicode || flat_compare.value <= 0x7f)
- final_string.append_code_point(flat_compare.value);
- else
- final_string.append(bit_cast<char>(static_cast<u8>(flat_compare.value)));
- }
- break;
- }
- default:
- return false;
- }
- state.instruction_position += opcode.size();
- }
- parser_result.optimization_data.pure_substring_search = final_string.to_byte_string();
- return true;
- }
- template<typename Parser>
- void Regex<Parser>::attempt_rewrite_loops_as_atomic_groups(BasicBlockList const& basic_blocks)
- {
- auto& bytecode = parser_result.bytecode;
- if constexpr (REGEX_DEBUG) {
- RegexDebug dbg;
- dbg.print_bytecode(*this);
- for (auto const& block : basic_blocks)
- dbgln("block from {} to {}", block.start, block.end);
- }
- // A pattern such as:
- // bb0 | RE0
- // | ForkX bb0
- // -------------------------
- // bb1 | RE1
- // can be rewritten as:
- // -------------------------
- // bb0 | RE0
- // | ForkReplaceX bb0
- // -------------------------
- // bb1 | RE1
- // provided that first(RE1) not-in end(RE0), which is to say
- // that RE1 cannot start with whatever RE0 has matched (ever).
- //
- // Alternatively, a second form of this pattern can also occur:
- // bb0 | *
- // | ForkX bb2
- // ------------------------
- // bb1 | RE0
- // | Jump bb0
- // ------------------------
- // bb2 | RE1
- // which can be transformed (with the same preconditions) to:
- // bb0 | *
- // | ForkReplaceX bb2
- // ------------------------
- // bb1 | RE0
- // | Jump bb0
- // ------------------------
- // bb2 | RE1
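- // As a concrete (illustrative) instance of the first form, a pattern like a+b produces roughly:
- // bb0 | Compare 'a'
- // | ForkX bb0
- // -------------------------
- // bb1 | Compare 'b'
- // Here first(RE1) = {'b'} cannot overlap anything RE0 = a+ has just matched, so the ForkX can safely
- // become a ForkReplaceX and backtracking into the loop is never attempted.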
- enum class AlternateForm {
- DirectLoopWithoutHeader, // Loop without a proper header; a block forking to itself, i.e. the first form.
- DirectLoopWithoutHeaderAndEmptyFollow, // Loop without a proper header; the first form, but with RE1 being empty.
- DirectLoopWithHeader, // loop with proper header, i.e. the second form.
- };
- struct CandidateBlock {
- Block forking_block;
- Optional<Block> new_target_block;
- AlternateForm form;
- };
- Vector<CandidateBlock> candidate_blocks;
- auto is_an_eligible_jump = [](OpCode const& opcode, size_t ip, size_t block_start, AlternateForm alternate_form) {
- switch (opcode.opcode_id()) {
- case OpCodeId::JumpNonEmpty: {
- auto const& op = static_cast<OpCode_JumpNonEmpty const&>(opcode);
- auto form = op.form();
- if (form != OpCodeId::Jump && alternate_form == AlternateForm::DirectLoopWithHeader)
- return false;
- if (form != OpCodeId::ForkJump && form != OpCodeId::ForkStay && alternate_form == AlternateForm::DirectLoopWithoutHeader)
- return false;
- return op.offset() + ip + opcode.size() == block_start;
- }
- case OpCodeId::ForkJump:
- if (alternate_form == AlternateForm::DirectLoopWithHeader)
- return false;
- return static_cast<OpCode_ForkJump const&>(opcode).offset() + ip + opcode.size() == block_start;
- case OpCodeId::ForkStay:
- if (alternate_form == AlternateForm::DirectLoopWithHeader)
- return false;
- return static_cast<OpCode_ForkStay const&>(opcode).offset() + ip + opcode.size() == block_start;
- case OpCodeId::Jump:
- // Infinite loop does *not* produce forks.
- if (alternate_form == AlternateForm::DirectLoopWithoutHeader)
- return false;
- if (alternate_form == AlternateForm::DirectLoopWithHeader)
- return static_cast<OpCode_Jump const&>(opcode).offset() + ip + opcode.size() == block_start;
- VERIFY_NOT_REACHED();
- default:
- return false;
- }
- };
- for (size_t i = 0; i < basic_blocks.size(); ++i) {
- auto forking_block = basic_blocks[i];
- Optional<Block> fork_fallback_block;
- if (i + 1 < basic_blocks.size())
- fork_fallback_block = basic_blocks[i + 1];
- MatchState state;
- // Check if the last instruction in this block is a jump to the block itself:
- {
- state.instruction_position = forking_block.end;
- auto& opcode = bytecode.get_opcode(state);
- if (is_an_eligible_jump(opcode, state.instruction_position, forking_block.start, AlternateForm::DirectLoopWithoutHeader)) {
- // We've found RE0 (and RE1 is just the following block, if any); let's see if the precondition applies.
- // If RE1 is empty, there's no first(RE1), so this is an automatic pass.
- if (!fork_fallback_block.has_value()
- || (fork_fallback_block->end == fork_fallback_block->start && block_satisfies_atomic_rewrite_precondition(bytecode, forking_block, *fork_fallback_block) != AtomicRewritePreconditionResult::NotSatisfied)) {
- candidate_blocks.append({ forking_block, fork_fallback_block, AlternateForm::DirectLoopWithoutHeader });
- break;
- }
- auto precondition = block_satisfies_atomic_rewrite_precondition(bytecode, forking_block, *fork_fallback_block);
- if (precondition == AtomicRewritePreconditionResult::SatisfiedWithProperHeader) {
- candidate_blocks.append({ forking_block, fork_fallback_block, AlternateForm::DirectLoopWithoutHeader });
- break;
- }
- if (precondition == AtomicRewritePreconditionResult::SatisfiedWithEmptyHeader) {
- candidate_blocks.append({ forking_block, fork_fallback_block, AlternateForm::DirectLoopWithoutHeaderAndEmptyFollow });
- break;
- }
- }
- }
- // Check if the last instruction in the following block is a direct jump back to this block
- if (fork_fallback_block.has_value()) {
- state.instruction_position = fork_fallback_block->end;
- auto& opcode = bytecode.get_opcode(state);
- if (is_an_eligible_jump(opcode, state.instruction_position, forking_block.start, AlternateForm::DirectLoopWithHeader)) {
- // We've found bb1 and bb0, let's just make sure that bb0 forks to bb2.
- state.instruction_position = forking_block.end;
- auto& opcode = bytecode.get_opcode(state);
- if (opcode.opcode_id() == OpCodeId::ForkJump || opcode.opcode_id() == OpCodeId::ForkStay) {
- Optional<Block> block_following_fork_fallback;
- if (i + 2 < basic_blocks.size())
- block_following_fork_fallback = basic_blocks[i + 2];
- if (!block_following_fork_fallback.has_value()
- || block_satisfies_atomic_rewrite_precondition(bytecode, *fork_fallback_block, *block_following_fork_fallback) != AtomicRewritePreconditionResult::NotSatisfied) {
- candidate_blocks.append({ forking_block, {}, AlternateForm::DirectLoopWithHeader });
- break;
- }
- }
- }
- }
- }
- dbgln_if(REGEX_DEBUG, "Found {} candidate blocks", candidate_blocks.size());
- if (candidate_blocks.is_empty()) {
- dbgln_if(REGEX_DEBUG, "Failed to find anything for {}", pattern_value);
- return;
- }
- RedBlackTree<size_t, size_t> needed_patches;
- // Reverse the blocks, so we can patch the bytecode without disturbing the patches that come later.
- quick_sort(candidate_blocks, [](auto& a, auto& b) { return b.forking_block.start > a.forking_block.start; });
- for (auto& candidate : candidate_blocks) {
- // Note that both forms share a ForkReplace patch in forking_block.
- // Patch the ForkX in forking_block to be a ForkReplaceX instead.
- auto& opcode_id = bytecode[candidate.forking_block.end];
- if (opcode_id == (ByteCodeValueType)OpCodeId::ForkStay) {
- opcode_id = (ByteCodeValueType)OpCodeId::ForkReplaceStay;
- } else if (opcode_id == (ByteCodeValueType)OpCodeId::ForkJump) {
- opcode_id = (ByteCodeValueType)OpCodeId::ForkReplaceJump;
- } else if (opcode_id == (ByteCodeValueType)OpCodeId::JumpNonEmpty) {
- auto& jump_opcode_id = bytecode[candidate.forking_block.end + 3];
- if (jump_opcode_id == (ByteCodeValueType)OpCodeId::ForkStay)
- jump_opcode_id = (ByteCodeValueType)OpCodeId::ForkReplaceStay;
- else if (jump_opcode_id == (ByteCodeValueType)OpCodeId::ForkJump)
- jump_opcode_id = (ByteCodeValueType)OpCodeId::ForkReplaceJump;
- else
- VERIFY_NOT_REACHED();
- } else {
- VERIFY_NOT_REACHED();
- }
- }
- if (!needed_patches.is_empty()) {
- MatchState state;
- auto bytecode_size = bytecode.size();
- state.instruction_position = 0;
- struct Patch {
- ssize_t value;
- size_t offset;
- bool should_negate { false };
- };
- for (;;) {
- if (state.instruction_position >= bytecode_size)
- break;
- auto& opcode = bytecode.get_opcode(state);
- Stack<Patch, 2> patch_points;
- switch (opcode.opcode_id()) {
- case OpCodeId::Jump:
- patch_points.push({ static_cast<OpCode_Jump const&>(opcode).offset(), state.instruction_position + 1 });
- break;
- case OpCodeId::JumpNonEmpty:
- patch_points.push({ static_cast<OpCode_JumpNonEmpty const&>(opcode).offset(), state.instruction_position + 1 });
- patch_points.push({ static_cast<OpCode_JumpNonEmpty const&>(opcode).checkpoint(), state.instruction_position + 2 });
- break;
- case OpCodeId::ForkJump:
- patch_points.push({ static_cast<OpCode_ForkJump const&>(opcode).offset(), state.instruction_position + 1 });
- break;
- case OpCodeId::ForkStay:
- patch_points.push({ static_cast<OpCode_ForkStay const&>(opcode).offset(), state.instruction_position + 1 });
- break;
- case OpCodeId::Repeat:
- patch_points.push({ -(ssize_t) static_cast<OpCode_Repeat const&>(opcode).offset(), state.instruction_position + 1, true });
- break;
- default:
- break;
- }
- while (!patch_points.is_empty()) {
- auto& patch_point = patch_points.top();
- auto target_offset = patch_point.value + state.instruction_position + opcode.size();
- constexpr auto do_patch = [](auto& patch_it, auto& patch_point, auto& target_offset, auto& bytecode, auto ip) {
- if (patch_it.key() == ip)
- return;
- if (patch_point.value < 0 && target_offset <= patch_it.key() && ip > patch_it.key())
- bytecode[patch_point.offset] += (patch_point.should_negate ? 1 : -1) * (*patch_it);
- else if (patch_point.value > 0 && target_offset >= patch_it.key() && ip < patch_it.key())
- bytecode[patch_point.offset] += (patch_point.should_negate ? -1 : 1) * (*patch_it);
- };
- if (auto patch_it = needed_patches.find_largest_not_above_iterator(target_offset); !patch_it.is_end())
- do_patch(patch_it, patch_point, target_offset, bytecode, state.instruction_position);
- else if (auto patch_it = needed_patches.find_largest_not_above_iterator(state.instruction_position); !patch_it.is_end())
- do_patch(patch_it, patch_point, target_offset, bytecode, state.instruction_position);
- patch_points.pop();
- }
- state.instruction_position += opcode.size();
- }
- }
- if constexpr (REGEX_DEBUG) {
- warnln("Transformed to:");
- RegexDebug dbg;
- dbg.print_bytecode(*this);
- }
- }
- void Optimizer::append_alternation(ByteCode& target, ByteCode&& left, ByteCode&& right)
- {
- Array<ByteCode, 2> alternatives;
- alternatives[0] = move(left);
- alternatives[1] = move(right);
- append_alternation(target, alternatives);
- }
- template<typename K, typename V, typename KTraits>
- using OrderedHashMapForTrie = OrderedHashMap<K, V, KTraits>;
- void Optimizer::append_alternation(ByteCode& target, Span<ByteCode> alternatives)
- {
- if (alternatives.size() == 0)
- return;
- if (alternatives.size() == 1)
- return target.extend(move(alternatives[0]));
- if (all_of(alternatives, [](auto& x) { return x.is_empty(); }))
- return;
- for (auto& entry : alternatives)
- entry.flatten();
- #if REGEX_DEBUG
- ScopeLogger<true> log;
- warnln("Alternations:");
- RegexDebug dbg;
- for (auto& entry : alternatives) {
- warnln("----------");
- dbg.print_bytecode(entry);
- }
- ScopeGuard print_at_end {
- [&] {
- warnln("======================");
- RegexDebug dbg;
- dbg.print_bytecode(target);
- }
- };
- #endif
- // First, find incoming jump edges.
- // We need them for two reasons:
- // - We need to distinguish between insn-A-jumped-to-by-insn-B and insn-A-jumped-to-by-insn-C (as otherwise we'd break trie invariants)
- // - We need to know which jumps to patch when we're done
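- // (Put differently: identical instruction bytes in two alternatives may only share a trie node when
- // the jump instructions targeting them are identical as well, which is why the incoming edges are
- // folded into each node's key below.)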
- struct JumpEdge {
- Span<ByteCodeValueType const> jump_insn;
- };
- Vector<HashMap<size_t, Vector<JumpEdge>>> incoming_jump_edges_for_each_alternative;
- incoming_jump_edges_for_each_alternative.resize(alternatives.size());
- auto has_any_backwards_jump = false;
- MatchState state;
- for (size_t i = 0; i < alternatives.size(); ++i) {
- auto& alternative = alternatives[i];
- // Add a jump to the "end" of the block; this is implicit in the bytecode, but we need it to be explicit in the trie.
- // Jump{offset=0}
- alternative.append(static_cast<ByteCodeValueType>(OpCodeId::Jump));
- alternative.append(0);
- auto& incoming_jump_edges = incoming_jump_edges_for_each_alternative[i];
- auto alternative_bytes = alternative.spans<1>().singular_span();
- for (state.instruction_position = 0; state.instruction_position < alternative.size();) {
- auto& opcode = alternative.get_opcode(state);
- auto opcode_bytes = alternative_bytes.slice(state.instruction_position, opcode.size());
- switch (opcode.opcode_id()) {
- case OpCodeId::Jump:
- incoming_jump_edges.ensure(static_cast<OpCode_Jump const&>(opcode).offset() + state.instruction_position).append({ opcode_bytes });
- has_any_backwards_jump |= static_cast<OpCode_Jump const&>(opcode).offset() < 0;
- break;
- case OpCodeId::JumpNonEmpty:
- incoming_jump_edges.ensure(static_cast<OpCode_JumpNonEmpty const&>(opcode).offset() + state.instruction_position).append({ opcode_bytes });
- has_any_backwards_jump |= static_cast<OpCode_JumpNonEmpty const&>(opcode).offset() < 0;
- break;
- case OpCodeId::ForkJump:
- incoming_jump_edges.ensure(static_cast<OpCode_ForkJump const&>(opcode).offset() + state.instruction_position).append({ opcode_bytes });
- has_any_backwards_jump |= static_cast<OpCode_ForkJump const&>(opcode).offset() < 0;
- break;
- case OpCodeId::ForkStay:
- incoming_jump_edges.ensure(static_cast<OpCode_ForkStay const&>(opcode).offset() + state.instruction_position).append({ opcode_bytes });
- has_any_backwards_jump |= static_cast<OpCode_ForkStay const&>(opcode).offset() < 0;
- break;
- case OpCodeId::ForkReplaceJump:
- incoming_jump_edges.ensure(static_cast<OpCode_ForkReplaceJump const&>(opcode).offset() + state.instruction_position).append({ opcode_bytes });
- has_any_backwards_jump |= static_cast<OpCode_ForkReplaceJump const&>(opcode).offset() < 0;
- break;
- case OpCodeId::ForkReplaceStay:
- incoming_jump_edges.ensure(static_cast<OpCode_ForkReplaceStay const&>(opcode).offset() + state.instruction_position).append({ opcode_bytes });
- has_any_backwards_jump |= static_cast<OpCode_ForkReplaceStay const&>(opcode).offset() < 0;
- break;
- case OpCodeId::Repeat:
- incoming_jump_edges.ensure(state.instruction_position - static_cast<OpCode_Repeat const&>(opcode).offset()).append({ opcode_bytes });
- has_any_backwards_jump = true;
- break;
- default:
- break;
- }
- state.instruction_position += opcode.size();
- }
- }
- struct QualifiedIP {
- size_t alternative_index;
- size_t instruction_position;
- };
- using Tree = Trie<DisjointSpans<ByteCodeValueType const>, Vector<QualifiedIP>, Traits<DisjointSpans<ByteCodeValueType const>>, void, OrderedHashMapForTrie>;
- Tree trie { {} }; // Root node is empty, key{ instruction_bytes, dependent_instruction_bytes... } -> IP
- size_t common_hits = 0;
- size_t total_nodes = 0;
- size_t total_bytecode_entries_in_tree = 0;
- for (size_t i = 0; i < alternatives.size(); ++i) {
- auto& alternative = alternatives[i];
- auto& incoming_jump_edges = incoming_jump_edges_for_each_alternative[i];
- auto* active_node = &trie;
- auto alternative_span = alternative.spans<1>().singular_span();
- for (state.instruction_position = 0; state.instruction_position < alternative_span.size();) {
- total_nodes += 1;
- auto& opcode = alternative.get_opcode(state);
- auto opcode_bytes = alternative_span.slice(state.instruction_position, opcode.size());
- Vector<Span<ByteCodeValueType const>> node_key_bytes;
- node_key_bytes.append(opcode_bytes);
- if (auto edges = incoming_jump_edges.get(state.instruction_position); edges.has_value()) {
- for (auto& edge : *edges)
- node_key_bytes.append(edge.jump_insn);
- }
- active_node = static_cast<decltype(active_node)>(MUST(active_node->ensure_child(DisjointSpans<ByteCodeValueType const> { move(node_key_bytes) })));
- if (active_node->has_metadata()) {
- active_node->metadata_value().append({ i, state.instruction_position });
- common_hits += 1;
- } else {
- active_node->set_metadata(Vector<QualifiedIP> { QualifiedIP { i, state.instruction_position } });
- total_bytecode_entries_in_tree += opcode.size();
- }
- state.instruction_position += opcode.size();
- }
- }
- if constexpr (REGEX_DEBUG) {
- Function<void(decltype(trie)&, size_t)> print_tree = [&](decltype(trie)& node, size_t indent = 0) mutable {
- ByteString name = "(no ip)";
- ByteString insn;
- if (node.has_metadata()) {
- name = ByteString::formatted(
- "{}@{} ({} node{})",
- node.metadata_value().first().instruction_position,
- node.metadata_value().first().alternative_index,
- node.metadata_value().size(),
- node.metadata_value().size() == 1 ? "" : "s");
- MatchState state;
- state.instruction_position = node.metadata_value().first().instruction_position;
- auto& opcode = alternatives[node.metadata_value().first().alternative_index].get_opcode(state);
- insn = ByteString::formatted("{} {}", opcode.to_byte_string(), opcode.arguments_string());
- }
- dbgln("{:->{}}| {} -- {}", "", indent * 2, name, insn);
- for (auto& child : node.children())
- print_tree(static_cast<decltype(trie)&>(*child.value), indent + 1);
- };
- print_tree(trie, 0);
- }
- // This is really only worth it if we don't blow up the size by the 2-extra-instruction-per-node scheme; similarly, if no nodes are shared, we're better off not using a tree.
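- // Rough illustration with made-up numbers: 10 nodes with 4 common hits give a tree cost of
- // (10 - 4) * 2 = 12, while 3 alternatives give a chain cost of 10 + 3 * 2 = 16, so the tree layout
- // wins; with zero common hits the chain layout is always chosen.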
- auto tree_cost = (total_nodes - common_hits) * 2;
- auto chain_cost = total_nodes + alternatives.size() * 2;
- dbgln_if(REGEX_DEBUG, "Total nodes: {}, common hits: {} (tree cost = {}, chain cost = {})", total_nodes, common_hits, tree_cost, chain_cost);
- if (common_hits == 0 || tree_cost > chain_cost) {
- // It's better to lay these out as a normal sequence of instructions.
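- // Sketch of the chain layout: one ForkJump header per alternative after the first, followed by the
- // alternative bodies, each ending in a Jump whose offset lands on the shared end of the chain
- // (the loops below compute and patch those offsets).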
- auto patch_start = target.size();
- for (size_t i = 1; i < alternatives.size(); ++i) {
- target.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
- target.empend(0u); // To be filled later.
- }
- size_t size_to_jump = 0;
- bool seen_one_empty = false;
- for (size_t i = alternatives.size(); i > 0; --i) {
- auto& entry = alternatives[i - 1];
- if (entry.is_empty()) {
- if (seen_one_empty)
- continue;
- seen_one_empty = true;
- }
- auto is_first = i == 1;
- auto instruction_size = entry.size() + (is_first ? 0 : 2); // Jump; -> +2
- size_to_jump += instruction_size;
- if (!is_first)
- target[patch_start + (i - 2) * 2 + 1] = size_to_jump + (alternatives.size() - i) * 2;
- dbgln_if(REGEX_DEBUG, "{} size = {}, cum={}", i - 1, instruction_size, size_to_jump);
- }
- seen_one_empty = false;
- for (size_t i = alternatives.size(); i > 0; --i) {
- auto& chunk = alternatives[i - 1];
- if (chunk.is_empty()) {
- if (seen_one_empty)
- continue;
- seen_one_empty = true;
- }
- ByteCode* previous_chunk = nullptr;
- size_t j = i - 1;
- auto seen_one_empty_before = chunk.is_empty();
- while (j >= 1) {
- --j;
- auto& candidate_chunk = alternatives[j];
- if (candidate_chunk.is_empty()) {
- if (seen_one_empty_before)
- continue;
- }
- previous_chunk = &candidate_chunk;
- break;
- }
- size_to_jump -= chunk.size() + (previous_chunk ? 2 : 0);
- target.extend(move(chunk));
- target.empend(static_cast<ByteCodeValueType>(OpCodeId::Jump));
- target.empend(size_to_jump); // Jump to the _END label
- }
- } else {
- target.ensure_capacity(total_bytecode_entries_in_tree + common_hits * 6);
- auto node_is = [](Tree const* node, QualifiedIP ip) {
- if (!node->has_metadata())
- return false;
- for (auto& node_ip : node->metadata_value()) {
- if (node_ip.alternative_index == ip.alternative_index && node_ip.instruction_position == ip.instruction_position)
- return true;
- }
- return false;
- };
- struct Patch {
- QualifiedIP source_ip;
- size_t target_ip;
- bool done { false };
- };
- Vector<Patch> patch_locations;
- patch_locations.ensure_capacity(total_nodes);
- auto add_patch_point = [&](Tree const* node, size_t target_ip) {
- if (!node->has_metadata())
- return;
- auto& node_ip = node->metadata_value().first();
- patch_locations.append({ node_ip, target_ip });
- };
- Queue<Tree*> nodes_to_visit;
- nodes_to_visit.enqueue(&trie);
- HashMap<size_t, NonnullOwnPtr<RedBlackTree<u64, u64>>> instruction_positions;
- if (has_any_backwards_jump)
- MUST(instruction_positions.try_ensure_capacity(alternatives.size()));
- auto ip_mapping_for_alternative = [&](size_t i) -> RedBlackTree<u64, u64>& {
- return *instruction_positions.ensure(i, [] {
- return make<RedBlackTree<u64, u64>>();
- });
- };
- // Each node is emitted as:
- //   node.re
- //   ForkJump child1
- //   ForkJump child2
- //   ...
- while (!nodes_to_visit.is_empty()) {
- auto const* node = nodes_to_visit.dequeue();
- for (auto& patch : patch_locations) {
- if (!patch.done && node_is(node, patch.source_ip)) {
- auto value = static_cast<ByteCodeValueType>(target.size() - patch.target_ip - 1);
- target[patch.target_ip] = value;
- patch.done = true;
- }
- }
- if (!node->value().individual_spans().is_empty()) {
- auto insn_bytes = node->value().individual_spans().first();
- target.ensure_capacity(target.size() + insn_bytes.size());
- state.instruction_position = target.size();
- target.append(insn_bytes);
- if (has_any_backwards_jump) {
- for (auto& ip : node->metadata_value())
- ip_mapping_for_alternative(ip.alternative_index).insert(ip.instruction_position, state.instruction_position);
- }
- auto& opcode = target.get_opcode(state);
- ssize_t jump_offset;
- auto is_jump = true;
- auto patch_location = state.instruction_position + 1;
- switch (opcode.opcode_id()) {
- case OpCodeId::Jump:
- jump_offset = static_cast<OpCode_Jump const&>(opcode).offset();
- break;
- case OpCodeId::JumpNonEmpty:
- jump_offset = static_cast<OpCode_JumpNonEmpty const&>(opcode).offset();
- break;
- case OpCodeId::ForkJump:
- jump_offset = static_cast<OpCode_ForkJump const&>(opcode).offset();
- break;
- case OpCodeId::ForkStay:
- jump_offset = static_cast<OpCode_ForkStay const&>(opcode).offset();
- break;
- case OpCodeId::ForkReplaceJump:
- jump_offset = static_cast<OpCode_ForkReplaceJump const&>(opcode).offset();
- break;
- case OpCodeId::ForkReplaceStay:
- jump_offset = static_cast<OpCode_ForkReplaceStay const&>(opcode).offset();
- break;
- case OpCodeId::Repeat:
- jump_offset = static_cast<ssize_t>(0) - static_cast<ssize_t>(static_cast<OpCode_Repeat const&>(opcode).offset()) - static_cast<ssize_t>(opcode.size());
- break;
- default:
- is_jump = false;
- break;
- }
- if (is_jump) {
- VERIFY(node->has_metadata());
- QualifiedIP ip = node->metadata_value().first();
- auto intended_jump_ip = ip.instruction_position + jump_offset + opcode.size();
- if (jump_offset < 0) {
- VERIFY(has_any_backwards_jump);
- // We should've already seen this instruction, so we can just patch it in.
- auto& ip_mapping = ip_mapping_for_alternative(ip.alternative_index);
- auto target_ip = ip_mapping.find(intended_jump_ip);
- if (!target_ip) {
- RegexDebug dbg;
- size_t x = 0;
- for (auto& entry : alternatives) {
- warnln("----------- {} ----------", x++);
- dbg.print_bytecode(entry);
- }
- dbgln("Regex Tree / Unknown backwards jump: {}@{} -> {}",
- ip.instruction_position,
- ip.alternative_index,
- intended_jump_ip);
- VERIFY_NOT_REACHED();
- }
- target[patch_location] = static_cast<ByteCodeValueType>(*target_ip - patch_location - 1);
- } else {
- patch_locations.append({ QualifiedIP { ip.alternative_index, intended_jump_ip }, patch_location });
- }
- }
- }
- for (auto const& child : node->children()) {
- auto* child_node = static_cast<Tree*>(child.value.ptr());
- target.append(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
- add_patch_point(child_node, target.size());
- target.append(static_cast<ByteCodeValueType>(0));
- nodes_to_visit.enqueue(child_node);
- }
- }
- for (auto& patch : patch_locations) {
- if (patch.done)
- continue;
- auto& alternative = alternatives[patch.source_ip.alternative_index];
- if (patch.source_ip.instruction_position >= alternative.size()) {
- // This just wants to jump to the end of the alternative, which is fine.
- // Patch it to jump to the end of the target instead.
- target[patch.target_ip] = static_cast<ByteCodeValueType>(target.size() - patch.target_ip - 1);
- continue;
- }
- dbgln("Regex Tree / Unpatched jump: {}@{} -> {}@{}",
- patch.source_ip.instruction_position,
- patch.source_ip.alternative_index,
- patch.target_ip,
- target[patch.target_ip]);
- VERIFY_NOT_REACHED();
- }
- }
- }
- enum class LookupTableInsertionOutcome {
- Successful,
- ReplaceWithAnyChar,
- TemporaryInversionNeeded,
- PermanentInversionNeeded,
- FlushOnInsertion,
- FinishFlushOnInsertion,
- CannotPlaceInTable,
- };
- static LookupTableInsertionOutcome insert_into_lookup_table(RedBlackTree<ByteCodeValueType, CharRange>& table, CompareTypeAndValuePair pair)
- {
- switch (pair.type) {
- case CharacterCompareType::Inverse:
- return LookupTableInsertionOutcome::PermanentInversionNeeded;
- case CharacterCompareType::TemporaryInverse:
- return LookupTableInsertionOutcome::TemporaryInversionNeeded;
- case CharacterCompareType::AnyChar:
- return LookupTableInsertionOutcome::ReplaceWithAnyChar;
- case CharacterCompareType::CharClass:
- return LookupTableInsertionOutcome::CannotPlaceInTable;
- case CharacterCompareType::Char:
- table.insert(pair.value, { (u32)pair.value, (u32)pair.value });
- break;
- case CharacterCompareType::CharRange: {
- CharRange range { pair.value };
- table.insert(range.from, range);
- break;
- }
- case CharacterCompareType::EndAndOr:
- return LookupTableInsertionOutcome::FinishFlushOnInsertion;
- case CharacterCompareType::And:
- return LookupTableInsertionOutcome::FlushOnInsertion;
- case CharacterCompareType::Reference:
- case CharacterCompareType::Property:
- case CharacterCompareType::GeneralCategory:
- case CharacterCompareType::Script:
- case CharacterCompareType::ScriptExtension:
- case CharacterCompareType::Or:
- return LookupTableInsertionOutcome::CannotPlaceInTable;
- case CharacterCompareType::Undefined:
- case CharacterCompareType::RangeExpressionDummy:
- case CharacterCompareType::String:
- case CharacterCompareType::LookupTable:
- VERIFY_NOT_REACHED();
- }
- return LookupTableInsertionOutcome::Successful;
- }
- void Optimizer::append_character_class(ByteCode& target, Vector<CompareTypeAndValuePair>&& pairs)
- {
- ByteCode arguments;
- size_t argument_count = 0;
- if (pairs.size() <= 1) {
- for (auto& pair : pairs) {
- arguments.append(to_underlying(pair.type));
- if (pair.type != CharacterCompareType::AnyChar
- && pair.type != CharacterCompareType::TemporaryInverse
- && pair.type != CharacterCompareType::Inverse
- && pair.type != CharacterCompareType::And
- && pair.type != CharacterCompareType::Or
- && pair.type != CharacterCompareType::EndAndOr)
- arguments.append(pair.value);
- ++argument_count;
- }
- } else {
- RedBlackTree<ByteCodeValueType, CharRange> table;
- RedBlackTree<ByteCodeValueType, CharRange> inverted_table;
- auto* current_table = &table;
- auto* current_inverted_table = &inverted_table;
- bool invert_for_next_iteration = false;
- bool is_currently_inverted = false;
- auto flush_tables = [&] {
- auto append_table = [&](auto& table) {
- ++argument_count;
- arguments.append(to_underlying(CharacterCompareType::LookupTable));
- auto size_index = arguments.size();
- arguments.append(0);
- Optional<CharRange> active_range;
- size_t range_count = 0;
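- // Coalesce adjacent or overlapping ranges as we walk the table; e.g. (illustrative) 'a'-'d', 'e'-'g'
- // and 'x'-'z' end up as the two table entries 'a'-'g' and 'x'-'z'.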
- for (auto& range : table) {
- if (!active_range.has_value()) {
- active_range = range;
- continue;
- }
- if (range.from <= active_range->to + 1 && range.to + 1 >= active_range->from) {
- active_range = CharRange { min(range.from, active_range->from), max(range.to, active_range->to) };
- } else {
- ++range_count;
- arguments.append(active_range.release_value());
- active_range = range;
- }
- }
- if (active_range.has_value()) {
- ++range_count;
- arguments.append(active_range.release_value());
- }
- arguments[size_index] = range_count;
- };
- auto contains_regular_table = !table.is_empty();
- auto contains_inverted_table = !inverted_table.is_empty();
- if (contains_regular_table)
- append_table(table);
- if (contains_inverted_table) {
- ++argument_count;
- arguments.append(to_underlying(CharacterCompareType::TemporaryInverse));
- append_table(inverted_table);
- }
- table.clear();
- inverted_table.clear();
- };
- auto flush_on_every_insertion = false;
- for (auto& value : pairs) {
- auto should_invert_after_this_iteration = invert_for_next_iteration;
- invert_for_next_iteration = false;
- auto insertion_result = insert_into_lookup_table(*current_table, value);
- switch (insertion_result) {
- case LookupTableInsertionOutcome::Successful:
- if (flush_on_every_insertion)
- flush_tables();
- break;
- case LookupTableInsertionOutcome::ReplaceWithAnyChar: {
- table.clear();
- inverted_table.clear();
- arguments.append(to_underlying(CharacterCompareType::AnyChar));
- ++argument_count;
- break;
- }
- case LookupTableInsertionOutcome::TemporaryInversionNeeded:
- swap(current_table, current_inverted_table);
- invert_for_next_iteration = true;
- is_currently_inverted = !is_currently_inverted;
- break;
- case LookupTableInsertionOutcome::PermanentInversionNeeded:
- flush_tables();
- arguments.append(to_underlying(CharacterCompareType::Inverse));
- ++argument_count;
- break;
- case LookupTableInsertionOutcome::FlushOnInsertion:
- case LookupTableInsertionOutcome::FinishFlushOnInsertion:
- flush_tables();
- flush_on_every_insertion = insertion_result == LookupTableInsertionOutcome::FlushOnInsertion;
- [[fallthrough]];
- case LookupTableInsertionOutcome::CannotPlaceInTable:
- if (is_currently_inverted) {
- arguments.append(to_underlying(CharacterCompareType::TemporaryInverse));
- ++argument_count;
- }
- arguments.append(to_underlying(value.type));
- if (value.type != CharacterCompareType::AnyChar
- && value.type != CharacterCompareType::TemporaryInverse
- && value.type != CharacterCompareType::Inverse
- && value.type != CharacterCompareType::And
- && value.type != CharacterCompareType::Or
- && value.type != CharacterCompareType::EndAndOr)
- arguments.append(value.value);
- ++argument_count;
- break;
- }
- if (should_invert_after_this_iteration) {
- swap(current_table, current_inverted_table);
- is_currently_inverted = !is_currently_inverted;
- }
- }
- flush_tables();
- }
- target.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
- target.empend(argument_count); // number of arguments
- target.empend(arguments.size()); // size of arguments
- target.extend(move(arguments));
- }
- template void Regex<PosixBasicParser>::run_optimization_passes();
- template void Regex<PosixExtendedParser>::run_optimization_passes();
- template void Regex<ECMA262Parser>::run_optimization_passes();
- }