LibRegex: Avoid slicing a RegexStringView in non-unicode Compare ops

Getting a single code point is much faster than slicing into the string.
This commit is contained in:
Ali Mohammad Pur 2023-07-28 20:57:51 +03:30 committed by Andreas Kling
parent 11abca421a
commit 221c52c696
Notes: sideshowbarker 2024-07-17 08:55:54 +09:00
2 changed files with 43 additions and 8 deletions

View file

@ -416,6 +416,9 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(MatchIn
ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, MatchState& state) const
{
auto argument_count = arguments_count();
auto has_single_argument = argument_count == 1;
bool inverse { false };
bool temporary_inverse { false };
bool reset_temp_inverse { false };
@ -443,7 +446,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
state.string_position_before_match = state.string_position;
size_t offset { state.instruction_position + 3 };
for (size_t i = 0; i < arguments_count(); ++i) {
for (size_t i = 0; i < argument_count; ++i) {
if (state.string_position > string_position)
break;
@ -647,7 +650,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
inverse_matched = true;
}
if (new_disjunction_state.active) {
if (!has_single_argument && new_disjunction_state.active) {
auto failed = (!had_zero_length_match && string_position == state.string_position) || state.string_position > input.view.length();
if (!failed) {
@ -665,11 +668,13 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
}
}
auto& new_disjunction_state = current_disjunction_state();
if (new_disjunction_state.active) {
if (!new_disjunction_state.fail) {
state.string_position = new_disjunction_state.last_accepted_position.value_or(new_disjunction_state.initial_position);
state.string_position_in_code_units = new_disjunction_state.last_accepted_code_unit_position.value_or(new_disjunction_state.initial_code_unit_position);
if (!has_single_argument) {
auto& new_disjunction_state = current_disjunction_state();
if (new_disjunction_state.active) {
if (!new_disjunction_state.fail) {
state.string_position = new_disjunction_state.last_accepted_position.value_or(new_disjunction_state.initial_position);
state.string_position_in_code_units = new_disjunction_state.last_accepted_code_unit_position.value_or(new_disjunction_state.initial_code_unit_position);
}
}
}
@ -687,7 +692,8 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(MatchInput const& input, MatchSt
if (state.string_position == input.view.length())
return;
auto input_view = input.view.substring_view(state.string_position, 1)[0];
// FIXME: Figure out how to do this if unicode() without performing a substring split first.
auto input_view = input.view.unicode() ? input.view.substring_view(state.string_position, 1)[0] : input.view.code_unit_at(state.string_position_in_code_units);
bool equal;
if (input.regex_options & AllFlags::Insensitive)
equal = to_ascii_lowercase(input_view) == to_ascii_lowercase(ch1); // FIXME: Implement case-insensitive matching for non-ascii characters
@ -717,6 +723,12 @@ ALWAYS_INLINE bool OpCode_Compare::compare_string(MatchInput const& input, Match
return true;
}
if (str.length() == 1) {
auto inverse_matched = false;
compare_char(input, state, str[0], false, inverse_matched);
return !inverse_matched;
}
auto subject = input.view.substring_view(state.string_position, str.length());
bool equals;
if (input.regex_options & AllFlags::Insensitive)

View file

@ -434,6 +434,29 @@ public:
});
}
u32 code_unit_at(size_t code_unit_index) const
{
if (unicode())
return operator[](code_unit_index);
return m_view.visit(
[&](StringView view) -> u32 {
auto ch = view[code_unit_index];
if constexpr (IsSigned<char>) {
if (ch < 0)
return 256u + ch;
return ch;
}
},
[&](Utf32View const& view) -> u32 { return view[code_unit_index]; },
[&](Utf16View const& view) -> u32 { return view.code_unit_at(code_unit_index); },
[&](Utf8View const& view) -> u32 {
auto it = view.iterator_at_byte_offset(code_unit_index);
VERIFY(it != view.end());
return *it;
});
}
size_t code_unit_offset_of(size_t code_point_index) const
{
return m_view.visit(