From f364fcec5da902ca8ae84dc0bca0f23533cebaa4 Mon Sep 17 00:00:00 2001 From: Ali Mohammad Pur Date: Sun, 18 Jul 2021 05:07:01 +0430 Subject: [PATCH] LibRegex+Everywhere: Make LibRegex more unicode-aware This commit makes LibRegex (mostly) capable of operating on any of the three main string views: - StringView for raw strings - Utf8View for utf-8 encoded strings - Utf32View for raw unicode strings As a result, regexps with unicode strings should be able to properly handle utf-8 and not stop in the middle of a code point. A future commit will update LibJS to use the correct type of string depending on the flags. --- Tests/LibRegex/Regex.cpp | 3 +- .../Cpp/CppComprehensionEngine.cpp | 4 +- Userland/Libraries/LibDebug/DebugSession.cpp | 2 +- Userland/Libraries/LibRegex/RegexByteCode.cpp | 155 ++++----- Userland/Libraries/LibRegex/RegexByteCode.h | 2 +- Userland/Libraries/LibRegex/RegexMatch.h | 307 ++++++++++++------ Userland/Libraries/LibRegex/RegexMatcher.cpp | 42 +-- Userland/Utilities/expr.cpp | 2 +- 8 files changed, 310 insertions(+), 207 deletions(-) diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index 7e0269622c5..1643acd37b9 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -249,7 +249,7 @@ TEST_CASE(char_utf8) Regex re("😀"); RegexResult result; - EXPECT_EQ((result = match("Привет, мир! 😀 γειά σου κόσμος 😀 こんにちは世界", re, PosixFlags::Global)).success, true); + EXPECT_EQ((result = match(Utf8View { "Привет, мир! 😀 γειά σου κόσμος 😀 こんにちは世界" }, re, PosixFlags::Global)).success, true); EXPECT_EQ(result.count, 2u); } @@ -312,7 +312,6 @@ TEST_CASE(match_all_character_class) EXPECT_EQ(result.matches.at(0).view, "W"); EXPECT_EQ(result.matches.at(1).view, "i"); EXPECT_EQ(result.matches.at(2).view, "n"); - EXPECT(&result.matches.at(0).view.characters_without_null_termination()[0] != &str.view().characters_without_null_termination()[1]); } TEST_CASE(match_character_class_with_assertion) diff --git a/Userland/DevTools/HackStudio/LanguageServers/Cpp/CppComprehensionEngine.cpp b/Userland/DevTools/HackStudio/LanguageServers/Cpp/CppComprehensionEngine.cpp index e00cc853c0f..20d43636b8f 100644 --- a/Userland/DevTools/HackStudio/LanguageServers/Cpp/CppComprehensionEngine.cpp +++ b/Userland/DevTools/HackStudio/LanguageServers/Cpp/CppComprehensionEngine.cpp @@ -359,7 +359,7 @@ String CppComprehensionEngine::document_path_from_include_path(const StringView& if (!library_include.search(include_path, result)) return {}; - auto path = result.capture_group_matches.at(0).at(0).view.u8view(); + auto path = result.capture_group_matches.at(0).at(0).view.string_view(); return String::formatted("/usr/include/{}", path); }; @@ -368,7 +368,7 @@ String CppComprehensionEngine::document_path_from_include_path(const StringView& if (!user_defined_include.search(include_path, result)) return {}; - return result.capture_group_matches.at(0).at(0).view.u8view(); + return result.capture_group_matches.at(0).at(0).view.string_view(); }; auto result = document_path_for_library_include(include_path); diff --git a/Userland/Libraries/LibDebug/DebugSession.cpp b/Userland/Libraries/LibDebug/DebugSession.cpp index f7ab3a67a83..fccd487e053 100644 --- a/Userland/Libraries/LibDebug/DebugSession.cpp +++ b/Userland/Libraries/LibDebug/DebugSession.cpp @@ -422,7 +422,7 @@ void DebugSession::update_loaded_libs() auto rc = re.search(vm_name, result); if (!rc) return {}; - auto lib_name = result.capture_group_matches.at(0).at(0).view.u8view().to_string(); + auto lib_name = result.capture_group_matches.at(0).at(0).view.string_view().to_string(); if (lib_name.starts_with("/")) return lib_name; return String::formatted("/usr/lib/{}", lib_name); diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp index bede259630a..259aaf26afb 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.cpp +++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp @@ -288,30 +288,33 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(const MatchInput& input, return ExecutionResult::Failed_ExecuteLowPrioForks; } -ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const +ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const { - if (input.match_index >= output.capture_group_matches.size()) { - output.capture_group_matches.ensure_capacity(input.match_index); - auto capacity = output.capture_group_matches.capacity(); - for (size_t i = output.capture_group_matches.size(); i <= capacity; ++i) - output.capture_group_matches.empend(); + if (input.match_index >= state.capture_group_matches.size()) { + state.capture_group_matches.ensure_capacity(input.match_index); + auto capacity = state.capture_group_matches.capacity(); + for (size_t i = state.capture_group_matches.size(); i <= capacity; ++i) + state.capture_group_matches.empend(); } - if (id() >= output.capture_group_matches.at(input.match_index).size()) { - output.capture_group_matches.at(input.match_index).ensure_capacity(id()); - auto capacity = output.capture_group_matches.at(input.match_index).capacity(); - for (size_t i = output.capture_group_matches.at(input.match_index).size(); i <= capacity; ++i) - output.capture_group_matches.at(input.match_index).empend(); + if (id() >= state.capture_group_matches.at(input.match_index).size()) { + state.capture_group_matches.at(input.match_index).ensure_capacity(id()); + auto capacity = state.capture_group_matches.at(input.match_index).capacity(); + for (size_t i = state.capture_group_matches.at(input.match_index).size(); i <= capacity; ++i) + state.capture_group_matches.at(input.match_index).empend(); } - output.capture_group_matches.at(input.match_index).at(id()).left_column = state.string_position; + state.capture_group_matches.at(input.match_index).at(id()).left_column = state.string_position; return ExecutionResult::Continue; } -ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const +ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const { - auto& match = output.capture_group_matches.at(input.match_index).at(id()); + auto& match = state.capture_group_matches.at(input.match_index).at(id()); auto start_position = match.left_column; + if (state.string_position < start_position) + return ExecutionResult::Failed_ExecuteLowPrioForks; + auto length = state.string_position - start_position; if (start_position < match.column) @@ -330,27 +333,27 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchI return ExecutionResult::Continue; } -ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const +ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const { - if (input.match_index >= output.named_capture_group_matches.size()) { - output.named_capture_group_matches.ensure_capacity(input.match_index); - auto capacity = output.named_capture_group_matches.capacity(); - for (size_t i = output.named_capture_group_matches.size(); i <= capacity; ++i) - output.named_capture_group_matches.empend(); + if (input.match_index >= state.named_capture_group_matches.size()) { + state.named_capture_group_matches.ensure_capacity(input.match_index); + auto capacity = state.named_capture_group_matches.capacity(); + for (size_t i = state.named_capture_group_matches.size(); i <= capacity; ++i) + state.named_capture_group_matches.empend(); } - output.named_capture_group_matches.at(input.match_index).ensure(name()).column = state.string_position; + state.named_capture_group_matches.at(input.match_index).ensure(name()).column = state.string_position; return ExecutionResult::Continue; } -ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const +ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const { StringView capture_group_name = name(); - if (output.named_capture_group_matches.at(input.match_index).contains(capture_group_name)) { - auto start_position = output.named_capture_group_matches.at(input.match_index).ensure(capture_group_name).column; + if (state.named_capture_group_matches.at(input.match_index).contains(capture_group_name)) { + auto start_position = state.named_capture_group_matches.at(input.match_index).ensure(capture_group_name).column; auto length = state.string_position - start_position; - auto& map = output.named_capture_group_matches.at(input.match_index); + auto& map = state.named_capture_group_matches.at(input.match_index); if constexpr (REGEX_DEBUG) { VERIFY(start_position + length <= input.view.length()); @@ -371,7 +374,7 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(const M return ExecutionResult::Continue; } -ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const +ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, MatchState& state, MatchOutput&) const { bool inverse { false }; bool temporary_inverse { false }; @@ -414,14 +417,14 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M u32 ch = m_bytecode->at(offset++); // We want to compare a string that is longer or equal in length to the available string - if (input.view.length() - state.string_position < 1) + if (input.view.length() <= state.string_position) return ExecutionResult::Failed_ExecuteLowPrioForks; compare_char(input, state, ch, current_inversion_state(), inverse_matched); } else if (compare_type == CharacterCompareType::AnyChar) { // We want to compare a string that is definitely longer than the available string - if (input.view.length() - state.string_position < 1) + if (input.view.length() <= state.string_position) return ExecutionResult::Failed_ExecuteLowPrioForks; VERIFY(!current_inversion_state()); @@ -431,20 +434,25 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M VERIFY(!current_inversion_state()); const auto& length = m_bytecode->at(offset++); - StringBuilder str_builder; - for (size_t i = 0; i < length; ++i) - str_builder.append(m_bytecode->at(offset++)); // We want to compare a string that is definitely longer than the available string - if (input.view.length() - state.string_position < length) + if (input.view.length() < state.string_position + length) return ExecutionResult::Failed_ExecuteLowPrioForks; - if (!compare_string(input, state, str_builder.string_view().characters_without_null_termination(), length, had_zero_length_match)) + Optional str; + Vector data; + data.ensure_capacity(length); + for (size_t i = offset; i < offset + length; ++i) + data.unchecked_append(m_bytecode->at(i)); + + auto view = input.view.construct_as_same(data, str); + offset += length; + if (!compare_string(input, state, view, had_zero_length_match)) return ExecutionResult::Failed_ExecuteLowPrioForks; } else if (compare_type == CharacterCompareType::CharClass) { - if (input.view.length() - state.string_position < 1) + if (input.view.length() <= state.string_position) return ExecutionResult::Failed_ExecuteLowPrioForks; auto character_class = (CharClass)m_bytecode->at(offset++); @@ -453,6 +461,9 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched); } else if (compare_type == CharacterCompareType::CharRange) { + if (input.view.length() <= state.string_position) + return ExecutionResult::Failed_ExecuteLowPrioForks; + auto value = (CharRange)m_bytecode->at(offset++); auto from = value.from; @@ -463,17 +474,17 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M } else if (compare_type == CharacterCompareType::Reference) { auto reference_number = (size_t)m_bytecode->at(offset++); - auto& groups = output.capture_group_matches.at(input.match_index); + auto& groups = state.capture_group_matches.at(input.match_index); if (groups.size() <= reference_number) return ExecutionResult::Failed_ExecuteLowPrioForks; auto str = groups.at(reference_number).view; // We want to compare a string that is definitely longer than the available string - if (input.view.length() - state.string_position < str.length()) + if (input.view.length() < state.string_position + str.length()) return ExecutionResult::Failed_ExecuteLowPrioForks; - if (!compare_string(input, state, str.characters_without_null_termination(), str.length(), had_zero_length_match)) + if (!compare_string(input, state, str, had_zero_length_match)) return ExecutionResult::Failed_ExecuteLowPrioForks; } else if (compare_type == CharacterCompareType::NamedReference) { @@ -481,17 +492,17 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M auto length = (size_t)m_bytecode->at(offset++); StringView name { ptr, length }; - auto group = output.named_capture_group_matches.at(input.match_index).get(name); + auto group = state.named_capture_group_matches.at(input.match_index).get(name); if (!group.has_value()) return ExecutionResult::Failed_ExecuteLowPrioForks; auto str = group.value().view; // We want to compare a string that is definitely longer than the available string - if (input.view.length() - state.string_position < str.length()) + if (input.view.length() < state.string_position + str.length()) return ExecutionResult::Failed_ExecuteLowPrioForks; - if (!compare_string(input, state, str.characters_without_null_termination(), str.length(), had_zero_length_match)) + if (!compare_string(input, state, str, had_zero_length_match)) return ExecutionResult::Failed_ExecuteLowPrioForks; } else { @@ -512,14 +523,19 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M ALWAYS_INLINE void OpCode_Compare::compare_char(const MatchInput& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched) { - u32 ch2 = input.view[state.string_position]; + if (state.string_position == input.view.length()) + return; - if (input.regex_options & AllFlags::Insensitive) { - ch1 = to_ascii_lowercase(ch1); - ch2 = to_ascii_lowercase(ch2); - } + auto input_view = input.view.substring_view(state.string_position, 1); + Optional str; + auto compare_view = input_view.construct_as_same({ &ch1, 1 }, str); + bool equal; + if (input.regex_options & AllFlags::Insensitive) + equal = input_view.equals_ignoring_case(compare_view); + else + equal = input_view.equals(compare_view); - if (ch1 == ch2) { + if (equal) { if (inverse) inverse_matched = true; else @@ -527,41 +543,32 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(const MatchInput& input, MatchSt } } -ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length, bool& had_zero_length_match) +ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, MatchState& state, RegexStringView const& str, bool& had_zero_length_match) { - if (length == 0) { + if (state.string_position + str.length() > input.view.length()) { + if (str.is_empty()) { + had_zero_length_match = true; + return true; + } + return false; + } + + if (str.length() == 0) { had_zero_length_match = true; return true; } - if (input.view.is_u8_view()) { - auto str_view1 = StringView(str, length); - auto str_view2 = StringView(&input.view.u8view()[state.string_position], length); + auto subject = input.view.substring_view(state.string_position, str.length()); + bool equals; + if (input.regex_options & AllFlags::Insensitive) + equals = subject.equals_ignoring_case(str); + else + equals = subject.equals(str); - bool string_equals; - if (input.regex_options & AllFlags::Insensitive) - string_equals = str_view1.equals_ignoring_case(str_view2); - else - string_equals = str_view1 == str_view2; + if (equals) + state.string_position += str.length(); - if (string_equals) { - state.string_position += length; - return true; - } - } else { - bool equals; - if (input.regex_options & AllFlags::Insensitive) - TODO(); - else - equals = __builtin_memcmp(str, &input.view.u32view().code_points()[state.string_position], length) == 0; - - if (equals) { - state.string_position += length; - return true; - } - } - - return false; + return equals; } ALWAYS_INLINE void OpCode_Compare::compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched) diff --git a/Userland/Libraries/LibRegex/RegexByteCode.h b/Userland/Libraries/LibRegex/RegexByteCode.h index d53ab64462e..ba1d3214a20 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.h +++ b/Userland/Libraries/LibRegex/RegexByteCode.h @@ -682,7 +682,7 @@ public: private: ALWAYS_INLINE static void compare_char(const MatchInput& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched); - ALWAYS_INLINE static bool compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length, bool& had_zero_length_match); + ALWAYS_INLINE static bool compare_string(const MatchInput& input, MatchState& state, RegexStringView const& str, bool& had_zero_length_match); ALWAYS_INLINE static void compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched); ALWAYS_INLINE static void compare_character_range(const MatchInput& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched); }; diff --git a/Userland/Libraries/LibRegex/RegexMatch.h b/Userland/Libraries/LibRegex/RegexMatch.h index 285b5e406ff..39b22ab5657 100644 --- a/Userland/Libraries/LibRegex/RegexMatch.h +++ b/Userland/Libraries/LibRegex/RegexMatch.h @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include namespace regex { @@ -22,124 +24,172 @@ namespace regex { class RegexStringView { public: RegexStringView(const char* chars) - : m_u8view(chars) + : m_view(StringView { chars }) { } RegexStringView(const String& string) - : m_u8view(string) + : m_view(string.view()) { } RegexStringView(const StringView view) - : m_u8view(view) - { - } - RegexStringView(const Utf32View view) - : m_u32view(view) + : m_view(view) { } - bool is_u8_view() const { return m_u8view.has_value(); } - bool is_u32_view() const { return m_u32view.has_value(); } - - const StringView& u8view() const + RegexStringView(Utf32View view) + : m_view(view) { - VERIFY(m_u8view.has_value()); - return m_u8view.value(); - }; + } - const Utf32View& u32view() const + RegexStringView(Utf8View view) + : m_view(view) { - VERIFY(m_u32view.has_value()); - return m_u32view.value(); - }; + } + + const StringView& string_view() const + { + return m_view.get(); + } + + const Utf32View& u32_view() const + { + return m_view.get(); + } + + const Utf8View& u8_view() const + { + return m_view.get(); + } bool is_empty() const { - if (is_u8_view()) - return m_u8view.value().is_empty(); - else - return m_u32view.value().is_empty(); + return m_view.visit([](auto& view) { return view.is_empty(); }); } bool is_null() const { - if (is_u8_view()) - return m_u8view.value().is_null(); - else - return m_u32view.value().code_points() == nullptr; + return m_view.visit([](auto& view) { return view.is_null(); }); } size_t length() const { - if (is_u8_view()) - return m_u8view.value().length(); - else - return m_u32view.value().length(); + return m_view.visit([](auto& view) { return view.length(); }); + } + + RegexStringView construct_as_same(Span data, Optional& optional_string_storage) const + { + return m_view.visit( + [&](T const&) { + StringBuilder builder; + for (auto ch : data) + builder.append(ch); // Note: The type conversion is intentional. + optional_string_storage = builder.build(); + return RegexStringView { T { *optional_string_storage } }; + }, + [&](Utf32View) { + return RegexStringView { Utf32View { data.data(), data.size() } }; + }); } Vector lines() const { - if (is_u8_view()) { - auto views = u8view().lines(false); - Vector new_views; - for (auto& view : views) - new_views.append(move(view)); - return new_views; - } - - Vector views; - auto view = u32view(); - u32 newline = '\n'; - while (!view.is_empty()) { - auto position = AK::memmem_optional(view.code_points(), view.length() * sizeof(u32), &newline, sizeof(u32)); - if (!position.has_value()) - break; - auto offset = position.value() / sizeof(u32); - views.append(view.substring_view(0, offset)); - view = view.substring_view(offset + 1, view.length() - offset - 1); - } - if (!view.is_empty()) - views.append(view); - return views; + return m_view.visit( + [](StringView view) { + auto views = view.lines(false); + Vector new_views; + for (auto& view : views) + new_views.empend(view); + return new_views; + }, + [](Utf32View view) { + Vector views; + u32 newline = '\n'; + while (!view.is_empty()) { + auto position = AK::memmem_optional(view.code_points(), view.length() * sizeof(u32), &newline, sizeof(u32)); + if (!position.has_value()) + break; + auto offset = position.value() / sizeof(u32); + views.empend(view.substring_view(0, offset)); + view = view.substring_view(offset + 1, view.length() - offset - 1); + } + if (!view.is_empty()) + views.empend(view); + return views; + }, + [](Utf8View& view) { + Vector views; + auto it = view.begin(); + auto previous_newline_position_it = it; + for (;;) { + if (*it == '\n') { + auto previous_offset = view.byte_offset_of(previous_newline_position_it); + auto new_offset = view.byte_offset_of(it); + auto slice = view.substring_view(previous_offset, new_offset - previous_offset); + views.empend(slice); + ++it; + previous_newline_position_it = it; + } + if (it.done()) + break; + ++it; + } + if (it != previous_newline_position_it) { + auto previous_offset = view.byte_offset_of(previous_newline_position_it); + auto new_offset = view.byte_offset_of(it); + auto slice = view.substring_view(previous_offset, new_offset - previous_offset); + views.empend(slice); + } + return views; + }); } RegexStringView substring_view(size_t offset, size_t length) const { - if (is_u8_view()) { - return u8view().substring_view(offset, length); - } - return u32view().substring_view(offset, length); + return m_view.visit( + [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; }, + [&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; }); } String to_string() const { - if (is_u8_view()) { - return u8view().to_string(); - } - - StringBuilder builder; - builder.append(u32view()); - return builder.to_string(); + return m_view.visit( + [](StringView view) { return view.to_string(); }, + [](auto& view) { + StringBuilder builder; + for (auto it = view.begin(); it != view.end(); ++it) + builder.append_code_point(*it); + return builder.to_string(); + }); } u32 operator[](size_t index) const { - if (is_u8_view()) { - i8 ch = u8view()[index]; - u8 value = *reinterpret_cast(&ch); - return static_cast(value); - } - return u32view().code_points()[index]; + return m_view.visit( + [&](StringView view) -> u32 { + auto ch = view[index]; + if (ch < 0) + return 256u + ch; + return ch; + }, + [&](auto view) -> u32 { return view[index]; }, + [&](Utf8View& view) -> u32 { + size_t i = index; + for (auto it = view.begin(); it != view.end(); ++it, --i) { + if (i == 0) + return *it; + } + VERIFY_NOT_REACHED(); + }); } bool operator==(const char* cstring) const { - if (is_u8_view()) - return u8view() == cstring; - - return to_string() == cstring; + return m_view.visit( + [&](Utf32View) { return to_string() == cstring; }, + [&](Utf8View const& view) { return view.as_string() == cstring; }, + [&](StringView view) { return view == cstring; }); } bool operator!=(const char* cstring) const @@ -149,18 +199,18 @@ public: bool operator==(const String& string) const { - if (is_u8_view()) - return u8view() == string; - - return to_string() == string; + return m_view.visit( + [&](Utf32View) { return to_string() == string; }, + [&](Utf8View const& view) { return view.as_string() == string; }, + [&](StringView view) { return view == string; }); } - bool operator==(const StringView& other) const + bool operator==(const StringView& string) const { - if (is_u8_view()) - return u8view() == other; - - return false; + return m_view.visit( + [&](Utf32View) { return to_string() == string; }, + [&](Utf8View const& view) { return view.as_string() == string; }, + [&](StringView view) { return view == string; }); } bool operator!=(const StringView& other) const @@ -170,13 +220,12 @@ public: bool operator==(const Utf32View& other) const { - if (is_u32_view()) { - StringBuilder builder; - builder.append(other); - return to_string() == builder.to_string(); - } - - return false; + return m_view.visit( + [&](Utf32View view) { + return view.length() == other.length() && __builtin_memcmp(view.code_points(), other.code_points(), view.length() * sizeof(u32)) == 0; + }, + [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); }, + [&](StringView view) { return view == RegexStringView { other }.to_string(); }); } bool operator!=(const Utf32View& other) const @@ -184,34 +233,78 @@ public: return !(*this == other); } - const char* characters_without_null_termination() const + bool operator==(const Utf8View& other) const { - if (is_u8_view()) - return u8view().characters_without_null_termination(); + return m_view.visit( + [&](Utf32View) { + return to_string() == other.as_string(); + }, + [&](Utf8View const& view) { return view.as_string() == other.as_string(); }, + [&](StringView view) { return other.as_string() == view; }); + } - return to_string().characters(); // FIXME: it contains the null termination, does that actually matter? + bool operator!=(const Utf8View& other) const + { + return !(*this == other); + } + + bool equals(const RegexStringView& other) const + { + return other.m_view.visit([&](auto const& view) { return operator==(view); }); + } + + bool equals_ignoring_case(const RegexStringView& other) const + { + // FIXME: Implement equals_ignoring_case() for unicode. + return m_view.visit( + [&](StringView view) { + return other.m_view.visit( + [&](StringView other_view) { return view.equals_ignoring_case(other_view); }, + [](auto&) -> bool { TODO(); }); + }, + [](auto&) -> bool { TODO(); }); } bool starts_with(const StringView& str) const { - if (is_u32_view()) - return false; - return u8view().starts_with(str); + return m_view.visit( + [&](Utf32View) -> bool { + TODO(); + }, + [&](Utf8View const& view) { return view.as_string().starts_with(str); }, + [&](StringView view) { return view.starts_with(str); }); } bool starts_with(const Utf32View& str) const { - if (is_u8_view()) - return false; - - StringBuilder builder; - builder.append(str); - return to_string().starts_with(builder.to_string()); + return m_view.visit( + [&](Utf32View view) -> bool { + if (str.length() > view.length()) + return false; + if (str.length() == view.length()) + return operator==(str); + for (size_t i = 0; i < str.length(); ++i) { + if (str.at(i) != view.at(i)) + return false; + } + return true; + }, + [&](Utf8View const& view) { + auto it = view.begin(); + for (auto code_point : str) { + if (it.done()) + return false; + if (code_point != *it) + return false; + ++it; + } + return true; + }, + [&](StringView) -> bool { TODO(); }); } private: - Optional m_u8view; - Optional m_u32view; + Variant m_view; }; class Match final { @@ -271,6 +364,9 @@ struct MatchState { size_t string_position { 0 }; size_t instruction_position { 0 }; size_t fork_at_position { 0 }; + Vector matches; + Vector> capture_group_matches; + Vector> named_capture_group_matches; }; struct MatchOutput { @@ -288,6 +384,7 @@ template<> struct AK::Formatter : Formatter { void format(FormatBuilder& builder, const regex::RegexStringView& value) { - return Formatter::format(builder, { value.characters_without_null_termination(), value.length() }); + auto string = value.to_string(); + return Formatter::format(builder, string); } }; diff --git a/Userland/Libraries/LibRegex/RegexMatcher.cpp b/Userland/Libraries/LibRegex/RegexMatcher.cpp index d08b636f53c..fff1487c73d 100644 --- a/Userland/Libraries/LibRegex/RegexMatcher.cpp +++ b/Userland/Libraries/LibRegex/RegexMatcher.cpp @@ -99,34 +99,34 @@ RegexResult Matcher::match(const Vector views, Optional } if (c_match_preallocation_count) { - output.matches.ensure_capacity(c_match_preallocation_count); - output.capture_group_matches.ensure_capacity(c_match_preallocation_count); - output.named_capture_group_matches.ensure_capacity(c_match_preallocation_count); + state.matches.ensure_capacity(c_match_preallocation_count); + state.capture_group_matches.ensure_capacity(c_match_preallocation_count); + state.named_capture_group_matches.ensure_capacity(c_match_preallocation_count); auto& capture_groups_count = m_pattern.parser_result.capture_groups_count; auto& named_capture_groups_count = m_pattern.parser_result.named_capture_groups_count; for (size_t j = 0; j < c_match_preallocation_count; ++j) { - output.matches.empend(); - output.capture_group_matches.unchecked_append({}); - output.capture_group_matches.at(j).ensure_capacity(capture_groups_count); + state.matches.empend(); + state.capture_group_matches.unchecked_append({}); + state.capture_group_matches.at(j).ensure_capacity(capture_groups_count); for (size_t k = 0; k < capture_groups_count; ++k) - output.capture_group_matches.at(j).unchecked_append({}); + state.capture_group_matches.at(j).unchecked_append({}); - output.named_capture_group_matches.unchecked_append({}); - output.named_capture_group_matches.at(j).ensure_capacity(named_capture_groups_count); + state.named_capture_group_matches.unchecked_append({}); + state.named_capture_group_matches.at(j).ensure_capacity(named_capture_groups_count); } } - auto append_match = [](auto& input, auto& state, auto& output, auto& start_position) { - if (output.matches.size() == input.match_index) - output.matches.empend(); + auto append_match = [](auto& input, auto& state, auto& start_position) { + if (state.matches.size() == input.match_index) + state.matches.empend(); VERIFY(start_position + state.string_position - start_position <= input.view.length()); if (input.regex_options.has_flag_set(AllFlags::StringCopyMatches)) { - output.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position).to_string(), input.line, start_position, input.global_offset + start_position }; + state.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position).to_string(), input.line, start_position, input.global_offset + start_position }; } else { // let the view point to the original string ... - output.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position), input.line, start_position, input.global_offset + start_position }; + state.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position), input.line, start_position, input.global_offset + start_position }; } }; @@ -171,7 +171,7 @@ RegexResult Matcher::match(const Vector views, Optional output = move(temp_output); if (!match_count) { // Nothing was *actually* matched, so append an empty match. - append_match(input, state, output, view_index); + append_match(input, state, view_index); ++match_count; } } @@ -219,21 +219,21 @@ RegexResult Matcher::match(const Vector views, Optional ++match_count; if (continue_search) { - append_match(input, state, output, view_index); + append_match(input, state, view_index); bool has_zero_length = state.string_position == view_index; view_index = state.string_position - (has_zero_length ? 0 : 1); continue; } else if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) { - append_match(input, state, output, view_index); + append_match(input, state, view_index); break; } else if (state.string_position < view_length) { return { false, 0, {}, {}, {}, output.operations }; } - append_match(input, state, output, view_index); + append_match(input, state, view_index); break; } @@ -253,7 +253,7 @@ RegexResult Matcher::match(const Vector views, Optional MatchOutput output_copy; if (match_count) { - output_copy.capture_group_matches = output.capture_group_matches; + output_copy.capture_group_matches = state.capture_group_matches; // Make sure there are as many capture matches as there are actual matches. if (output_copy.capture_group_matches.size() < match_count) output_copy.capture_group_matches.resize(match_count); @@ -264,12 +264,12 @@ RegexResult Matcher::match(const Vector views, Optional matches.template remove_all_matching([](auto& match) { return match.view.is_null(); }); } - output_copy.named_capture_group_matches = output.named_capture_group_matches; + output_copy.named_capture_group_matches = state.named_capture_group_matches; // Make sure there are as many capture matches as there are actual matches. if (output_copy.named_capture_group_matches.size() < match_count) output_copy.named_capture_group_matches.resize(match_count); - output_copy.matches = output.matches; + output_copy.matches = state.matches; } else { output_copy.capture_group_matches.clear_with_capacity(); output_copy.named_capture_group_matches.clear_with_capacity(); diff --git a/Userland/Utilities/expr.cpp b/Userland/Utilities/expr.cpp index 38348d65a6c..5a6f12afe91 100644 --- a/Userland/Utilities/expr.cpp +++ b/Userland/Utilities/expr.cpp @@ -417,7 +417,7 @@ private: StringBuilder result; for (auto& e : match.capture_group_matches[0]) - result.append(e.view.u8view()); + result.append(e.view.string_view()); return result.build(); }