From 221c52c69683f053fd6421562f467260938628d5 Mon Sep 17 00:00:00 2001
From: Ali Mohammad Pur
Date: Fri, 28 Jul 2023 20:57:51 +0330
Subject: [PATCH] LibRegex: Avoid slicing a RegexStringView in non-unicode Compare ops

Getting a single code point is much faster than slicing into the string.

OpCode_Compare::execute() now reads arguments_count() once and skips the
disjunction-state handling when there is exactly one argument,
compare_string() hands strings of length one to compare_char(), and
compare_char() reads its input through the new
RegexStringView::code_unit_at() whenever the view is not in unicode mode.
code_unit_at() returns a value on every path of its StringView visitor,
whether or not plain char is signed.
---
 Userland/Libraries/LibRegex/RegexByteCode.cpp | 28 +++++++++++++------
 Userland/Libraries/LibRegex/RegexMatch.h      | 23 +++++++++++++++
 2 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp
index c109ce301ef..8ae12c2ba4e 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.cpp
+++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp
@@ -416,6 +416,9 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(MatchIn
 
 ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, MatchState& state) const
 {
+    auto argument_count = arguments_count();
+    auto has_single_argument = argument_count == 1;
+
     bool inverse { false };
     bool temporary_inverse { false };
     bool reset_temp_inverse { false };
@@ -443,7 +446,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
     state.string_position_before_match = state.string_position;
 
     size_t offset { state.instruction_position + 3 };
-    for (size_t i = 0; i < arguments_count(); ++i) {
+    for (size_t i = 0; i < argument_count; ++i) {
         if (state.string_position > string_position)
             break;
 
@@ -647,7 +650,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
             inverse_matched = true;
         }
 
-        if (new_disjunction_state.active) {
+        if (!has_single_argument && new_disjunction_state.active) {
             auto failed = (!had_zero_length_match && string_position == state.string_position) || state.string_position > input.view.length();
 
             if (!failed) {
@@ -665,11 +668,13 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
         }
     }
 
-    auto& new_disjunction_state = current_disjunction_state();
-    if (new_disjunction_state.active) {
-        if (!new_disjunction_state.fail) {
-            state.string_position = new_disjunction_state.last_accepted_position.value_or(new_disjunction_state.initial_position);
-            state.string_position_in_code_units = new_disjunction_state.last_accepted_code_unit_position.value_or(new_disjunction_state.initial_code_unit_position);
+    if (!has_single_argument) {
+        auto& new_disjunction_state = current_disjunction_state();
+        if (new_disjunction_state.active) {
+            if (!new_disjunction_state.fail) {
+                state.string_position = new_disjunction_state.last_accepted_position.value_or(new_disjunction_state.initial_position);
+                state.string_position_in_code_units = new_disjunction_state.last_accepted_code_unit_position.value_or(new_disjunction_state.initial_code_unit_position);
+            }
         }
     }
 
@@ -687,7 +692,8 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(MatchInput const& input, MatchSt
     if (state.string_position == input.view.length())
         return;
 
-    auto input_view = input.view.substring_view(state.string_position, 1)[0];
+    // FIXME: Figure out how to do this if unicode() without performing a substring split first.
+    auto input_view = input.view.unicode() ? input.view.substring_view(state.string_position, 1)[0] : input.view.code_unit_at(state.string_position_in_code_units);
     bool equal;
     if (input.regex_options & AllFlags::Insensitive)
         equal = to_ascii_lowercase(input_view) == to_ascii_lowercase(ch1); // FIXME: Implement case-insensitive matching for non-ascii characters
@@ -717,6 +723,12 @@ ALWAYS_INLINE bool OpCode_Compare::compare_string(MatchInput const& input, Match
         return true;
     }
 
+    if (str.length() == 1) {
+        auto inverse_matched = false;
+        compare_char(input, state, str[0], false, inverse_matched);
+        return !inverse_matched;
+    }
+
     auto subject = input.view.substring_view(state.string_position, str.length());
     bool equals;
     if (input.regex_options & AllFlags::Insensitive)
diff --git a/Userland/Libraries/LibRegex/RegexMatch.h b/Userland/Libraries/LibRegex/RegexMatch.h
index 3e0c219c77d..2abef98f507 100644
--- a/Userland/Libraries/LibRegex/RegexMatch.h
+++ b/Userland/Libraries/LibRegex/RegexMatch.h
@@ -434,6 +434,29 @@ public:
             });
     }
 
+    u32 code_unit_at(size_t code_unit_index) const
+    {
+        if (unicode()) // Unicode views forward the index to operator[] unchanged.
+            return operator[](code_unit_index);
+
+        return m_view.visit(
+            [&](StringView view) -> u32 {
+                auto ch = view[code_unit_index];
+                if constexpr (IsSigned<char>) { // A negative char is reported as 256 + ch rather than sign-extended.
+                    if (ch < 0)
+                        return 256u + ch;
+                }
+                return ch; // Outside the if constexpr so the lambda also returns when char is unsigned.
+            },
+            [&](Utf32View const& view) -> u32 { return view[code_unit_index]; },
+            [&](Utf16View const& view) -> u32 { return view.code_unit_at(code_unit_index); },
+            [&](Utf8View const& view) -> u32 {
+                auto it = view.iterator_at_byte_offset(code_unit_index);
+                VERIFY(it != view.end());
+                return *it;
+            });
+    }
+
     size_t code_unit_offset_of(size_t code_point_index) const
     {
         return m_view.visit(