LibRegex: Avoid slicing a RegexStringView in non-unicode Compare ops
Getting a single code point is much faster than slicing into the string.
This commit is contained in:
parent
11abca421a
commit
221c52c696
Notes:
sideshowbarker
2024-07-17 08:55:54 +09:00
Author: https://github.com/alimpfard Commit: https://github.com/SerenityOS/serenity/commit/221c52c696 Pull-request: https://github.com/SerenityOS/serenity/pull/20240 Reviewed-by: https://github.com/awesomekling Reviewed-by: https://github.com/trflynn89
2 changed files with 43 additions and 8 deletions
|
@ -416,6 +416,9 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(MatchIn
|
|||
|
||||
ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, MatchState& state) const
|
||||
{
|
||||
auto argument_count = arguments_count();
|
||||
auto has_single_argument = argument_count == 1;
|
||||
|
||||
bool inverse { false };
|
||||
bool temporary_inverse { false };
|
||||
bool reset_temp_inverse { false };
|
||||
|
@ -443,7 +446,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
|||
state.string_position_before_match = state.string_position;
|
||||
|
||||
size_t offset { state.instruction_position + 3 };
|
||||
for (size_t i = 0; i < arguments_count(); ++i) {
|
||||
for (size_t i = 0; i < argument_count; ++i) {
|
||||
if (state.string_position > string_position)
|
||||
break;
|
||||
|
||||
|
@ -647,7 +650,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
|||
inverse_matched = true;
|
||||
}
|
||||
|
||||
if (new_disjunction_state.active) {
|
||||
if (!has_single_argument && new_disjunction_state.active) {
|
||||
auto failed = (!had_zero_length_match && string_position == state.string_position) || state.string_position > input.view.length();
|
||||
|
||||
if (!failed) {
|
||||
|
@ -665,11 +668,13 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
|||
}
|
||||
}
|
||||
|
||||
auto& new_disjunction_state = current_disjunction_state();
|
||||
if (new_disjunction_state.active) {
|
||||
if (!new_disjunction_state.fail) {
|
||||
state.string_position = new_disjunction_state.last_accepted_position.value_or(new_disjunction_state.initial_position);
|
||||
state.string_position_in_code_units = new_disjunction_state.last_accepted_code_unit_position.value_or(new_disjunction_state.initial_code_unit_position);
|
||||
if (!has_single_argument) {
|
||||
auto& new_disjunction_state = current_disjunction_state();
|
||||
if (new_disjunction_state.active) {
|
||||
if (!new_disjunction_state.fail) {
|
||||
state.string_position = new_disjunction_state.last_accepted_position.value_or(new_disjunction_state.initial_position);
|
||||
state.string_position_in_code_units = new_disjunction_state.last_accepted_code_unit_position.value_or(new_disjunction_state.initial_code_unit_position);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -687,7 +692,8 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(MatchInput const& input, MatchSt
|
|||
if (state.string_position == input.view.length())
|
||||
return;
|
||||
|
||||
auto input_view = input.view.substring_view(state.string_position, 1)[0];
|
||||
// FIXME: Figure out how to do this if unicode() without performing a substring split first.
|
||||
auto input_view = input.view.unicode() ? input.view.substring_view(state.string_position, 1)[0] : input.view.code_unit_at(state.string_position_in_code_units);
|
||||
bool equal;
|
||||
if (input.regex_options & AllFlags::Insensitive)
|
||||
equal = to_ascii_lowercase(input_view) == to_ascii_lowercase(ch1); // FIXME: Implement case-insensitive matching for non-ascii characters
|
||||
|
@ -717,6 +723,12 @@ ALWAYS_INLINE bool OpCode_Compare::compare_string(MatchInput const& input, Match
|
|||
return true;
|
||||
}
|
||||
|
||||
if (str.length() == 1) {
|
||||
auto inverse_matched = false;
|
||||
compare_char(input, state, str[0], false, inverse_matched);
|
||||
return !inverse_matched;
|
||||
}
|
||||
|
||||
auto subject = input.view.substring_view(state.string_position, str.length());
|
||||
bool equals;
|
||||
if (input.regex_options & AllFlags::Insensitive)
|
||||
|
|
|
@ -434,6 +434,29 @@ public:
|
|||
});
|
||||
}
|
||||
|
||||
u32 code_unit_at(size_t code_unit_index) const
|
||||
{
|
||||
if (unicode())
|
||||
return operator[](code_unit_index);
|
||||
|
||||
return m_view.visit(
|
||||
[&](StringView view) -> u32 {
|
||||
auto ch = view[code_unit_index];
|
||||
if constexpr (IsSigned<char>) {
|
||||
if (ch < 0)
|
||||
return 256u + ch;
|
||||
return ch;
|
||||
}
|
||||
},
|
||||
[&](Utf32View const& view) -> u32 { return view[code_unit_index]; },
|
||||
[&](Utf16View const& view) -> u32 { return view.code_unit_at(code_unit_index); },
|
||||
[&](Utf8View const& view) -> u32 {
|
||||
auto it = view.iterator_at_byte_offset(code_unit_index);
|
||||
VERIFY(it != view.end());
|
||||
return *it;
|
||||
});
|
||||
}
|
||||
|
||||
size_t code_unit_offset_of(size_t code_point_index) const
|
||||
{
|
||||
return m_view.visit(
|
||||
|
|
Loading…
Add table
Reference in a new issue