浏览代码

LibRegex+Everywhere: Make LibRegex more Unicode-aware

This commit makes LibRegex (mostly) capable of operating on any of
the three main string views:
- StringView for raw strings
- Utf8View for UTF-8 encoded strings
- Utf32View for strings of raw Unicode code points

As a result, regexes matched against Unicode strings should handle
UTF-8 properly and no longer stop in the middle of a code point.
A future commit will update LibJS to use the correct type of string
depending on the flags.
Ali Mohammad Pur 4 年之前
父节点
当前提交
f364fcec5d

+ 1 - 2
Tests/LibRegex/Regex.cpp

@@ -249,7 +249,7 @@ TEST_CASE(char_utf8)
     Regex<PosixExtended> re("😀");
     RegexResult result;
 
-    EXPECT_EQ((result = match("Привет, мир! 😀 γειά σου κόσμος 😀 こんにちは世界", re, PosixFlags::Global)).success, true);
+    EXPECT_EQ((result = match(Utf8View { "Привет, мир! 😀 γειά σου κόσμος 😀 こんにちは世界" }, re, PosixFlags::Global)).success, true);
     EXPECT_EQ(result.count, 2u);
 }
 
@@ -312,7 +312,6 @@ TEST_CASE(match_all_character_class)
     EXPECT_EQ(result.matches.at(0).view, "W");
     EXPECT_EQ(result.matches.at(1).view, "i");
     EXPECT_EQ(result.matches.at(2).view, "n");
-    EXPECT(&result.matches.at(0).view.characters_without_null_termination()[0] != &str.view().characters_without_null_termination()[1]);
 }
 
 TEST_CASE(match_character_class_with_assertion)

+ 2 - 2
Userland/DevTools/HackStudio/LanguageServers/Cpp/CppComprehensionEngine.cpp

@@ -359,7 +359,7 @@ String CppComprehensionEngine::document_path_from_include_path(const StringView&
         if (!library_include.search(include_path, result))
             return {};
 
-        auto path = result.capture_group_matches.at(0).at(0).view.u8view();
+        auto path = result.capture_group_matches.at(0).at(0).view.string_view();
         return String::formatted("/usr/include/{}", path);
     };
 
@@ -368,7 +368,7 @@ String CppComprehensionEngine::document_path_from_include_path(const StringView&
         if (!user_defined_include.search(include_path, result))
             return {};
 
-        return result.capture_group_matches.at(0).at(0).view.u8view();
+        return result.capture_group_matches.at(0).at(0).view.string_view();
     };
 
     auto result = document_path_for_library_include(include_path);

+ 1 - 1
Userland/Libraries/LibDebug/DebugSession.cpp

@@ -422,7 +422,7 @@ void DebugSession::update_loaded_libs()
         auto rc = re.search(vm_name, result);
         if (!rc)
             return {};
-        auto lib_name = result.capture_group_matches.at(0).at(0).view.u8view().to_string();
+        auto lib_name = result.capture_group_matches.at(0).at(0).view.string_view().to_string();
         if (lib_name.starts_with("/"))
             return lib_name;
         return String::formatted("/usr/lib/{}", lib_name);

+ 81 - 74
Userland/Libraries/LibRegex/RegexByteCode.cpp

@@ -288,30 +288,33 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(const MatchInput& input,
     return ExecutionResult::Failed_ExecuteLowPrioForks;
 }
 
-ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
+ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
 {
-    if (input.match_index >= output.capture_group_matches.size()) {
-        output.capture_group_matches.ensure_capacity(input.match_index);
-        auto capacity = output.capture_group_matches.capacity();
-        for (size_t i = output.capture_group_matches.size(); i <= capacity; ++i)
-            output.capture_group_matches.empend();
+    if (input.match_index >= state.capture_group_matches.size()) {
+        state.capture_group_matches.ensure_capacity(input.match_index);
+        auto capacity = state.capture_group_matches.capacity();
+        for (size_t i = state.capture_group_matches.size(); i <= capacity; ++i)
+            state.capture_group_matches.empend();
     }
 
-    if (id() >= output.capture_group_matches.at(input.match_index).size()) {
-        output.capture_group_matches.at(input.match_index).ensure_capacity(id());
-        auto capacity = output.capture_group_matches.at(input.match_index).capacity();
-        for (size_t i = output.capture_group_matches.at(input.match_index).size(); i <= capacity; ++i)
-            output.capture_group_matches.at(input.match_index).empend();
+    if (id() >= state.capture_group_matches.at(input.match_index).size()) {
+        state.capture_group_matches.at(input.match_index).ensure_capacity(id());
+        auto capacity = state.capture_group_matches.at(input.match_index).capacity();
+        for (size_t i = state.capture_group_matches.at(input.match_index).size(); i <= capacity; ++i)
+            state.capture_group_matches.at(input.match_index).empend();
     }
 
-    output.capture_group_matches.at(input.match_index).at(id()).left_column = state.string_position;
+    state.capture_group_matches.at(input.match_index).at(id()).left_column = state.string_position;
     return ExecutionResult::Continue;
 }
 
-ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
+ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
 {
-    auto& match = output.capture_group_matches.at(input.match_index).at(id());
+    auto& match = state.capture_group_matches.at(input.match_index).at(id());
     auto start_position = match.left_column;
+    if (state.string_position < start_position)
+        return ExecutionResult::Failed_ExecuteLowPrioForks;
+
     auto length = state.string_position - start_position;
 
     if (start_position < match.column)
@@ -330,27 +333,27 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchI
     return ExecutionResult::Continue;
 }
 
-ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
+ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
 {
-    if (input.match_index >= output.named_capture_group_matches.size()) {
-        output.named_capture_group_matches.ensure_capacity(input.match_index);
-        auto capacity = output.named_capture_group_matches.capacity();
-        for (size_t i = output.named_capture_group_matches.size(); i <= capacity; ++i)
-            output.named_capture_group_matches.empend();
+    if (input.match_index >= state.named_capture_group_matches.size()) {
+        state.named_capture_group_matches.ensure_capacity(input.match_index);
+        auto capacity = state.named_capture_group_matches.capacity();
+        for (size_t i = state.named_capture_group_matches.size(); i <= capacity; ++i)
+            state.named_capture_group_matches.empend();
     }
-    output.named_capture_group_matches.at(input.match_index).ensure(name()).column = state.string_position;
+    state.named_capture_group_matches.at(input.match_index).ensure(name()).column = state.string_position;
     return ExecutionResult::Continue;
 }
 
-ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
+ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
 {
     StringView capture_group_name = name();
 
-    if (output.named_capture_group_matches.at(input.match_index).contains(capture_group_name)) {
-        auto start_position = output.named_capture_group_matches.at(input.match_index).ensure(capture_group_name).column;
+    if (state.named_capture_group_matches.at(input.match_index).contains(capture_group_name)) {
+        auto start_position = state.named_capture_group_matches.at(input.match_index).ensure(capture_group_name).column;
         auto length = state.string_position - start_position;
 
-        auto& map = output.named_capture_group_matches.at(input.match_index);
+        auto& map = state.named_capture_group_matches.at(input.match_index);
 
         if constexpr (REGEX_DEBUG) {
             VERIFY(start_position + length <= input.view.length());
@@ -371,7 +374,7 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(const M
     return ExecutionResult::Continue;
 }
 
-ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
+ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
 {
     bool inverse { false };
     bool temporary_inverse { false };
@@ -414,14 +417,14 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
             u32 ch = m_bytecode->at(offset++);
 
             // We want to compare a string that is longer or equal in length to the available string
-            if (input.view.length() - state.string_position < 1)
+            if (input.view.length() <= state.string_position)
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
 
             compare_char(input, state, ch, current_inversion_state(), inverse_matched);
 
         } else if (compare_type == CharacterCompareType::AnyChar) {
             // We want to compare a string that is definitely longer than the available string
-            if (input.view.length() - state.string_position < 1)
+            if (input.view.length() <= state.string_position)
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
 
             VERIFY(!current_inversion_state());
@@ -431,20 +434,25 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
             VERIFY(!current_inversion_state());
 
             const auto& length = m_bytecode->at(offset++);
-            StringBuilder str_builder;
-            for (size_t i = 0; i < length; ++i)
-                str_builder.append(m_bytecode->at(offset++));
 
             // We want to compare a string that is definitely longer than the available string
-            if (input.view.length() - state.string_position < length)
+            if (input.view.length() < state.string_position + length)
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
 
-            if (!compare_string(input, state, str_builder.string_view().characters_without_null_termination(), length, had_zero_length_match))
+            Optional<String> str;
+            Vector<u32> data;
+            data.ensure_capacity(length);
+            for (size_t i = offset; i < offset + length; ++i)
+                data.unchecked_append(m_bytecode->at(i));
+
+            auto view = input.view.construct_as_same(data, str);
+            offset += length;
+            if (!compare_string(input, state, view, had_zero_length_match))
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
 
         } else if (compare_type == CharacterCompareType::CharClass) {
 
-            if (input.view.length() - state.string_position < 1)
+            if (input.view.length() <= state.string_position)
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
 
             auto character_class = (CharClass)m_bytecode->at(offset++);
@@ -453,6 +461,9 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
             compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched);
 
         } else if (compare_type == CharacterCompareType::CharRange) {
+            if (input.view.length() <= state.string_position)
+                return ExecutionResult::Failed_ExecuteLowPrioForks;
+
             auto value = (CharRange)m_bytecode->at(offset++);
 
             auto from = value.from;
@@ -463,17 +474,17 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
 
         } else if (compare_type == CharacterCompareType::Reference) {
             auto reference_number = (size_t)m_bytecode->at(offset++);
-            auto& groups = output.capture_group_matches.at(input.match_index);
+            auto& groups = state.capture_group_matches.at(input.match_index);
             if (groups.size() <= reference_number)
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
 
             auto str = groups.at(reference_number).view;
 
             // We want to compare a string that is definitely longer than the available string
-            if (input.view.length() - state.string_position < str.length())
+            if (input.view.length() < state.string_position + str.length())
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
 
-            if (!compare_string(input, state, str.characters_without_null_termination(), str.length(), had_zero_length_match))
+            if (!compare_string(input, state, str, had_zero_length_match))
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
 
         } else if (compare_type == CharacterCompareType::NamedReference) {
@@ -481,17 +492,17 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
             auto length = (size_t)m_bytecode->at(offset++);
             StringView name { ptr, length };
 
-            auto group = output.named_capture_group_matches.at(input.match_index).get(name);
+            auto group = state.named_capture_group_matches.at(input.match_index).get(name);
             if (!group.has_value())
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
 
             auto str = group.value().view;
 
             // We want to compare a string that is definitely longer than the available string
-            if (input.view.length() - state.string_position < str.length())
+            if (input.view.length() < state.string_position + str.length())
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
 
-            if (!compare_string(input, state, str.characters_without_null_termination(), str.length(), had_zero_length_match))
+            if (!compare_string(input, state, str, had_zero_length_match))
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
 
         } else {
@@ -512,14 +523,19 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
 
 ALWAYS_INLINE void OpCode_Compare::compare_char(const MatchInput& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched)
 {
-    u32 ch2 = input.view[state.string_position];
+    if (state.string_position == input.view.length())
+        return;
 
-    if (input.regex_options & AllFlags::Insensitive) {
-        ch1 = to_ascii_lowercase(ch1);
-        ch2 = to_ascii_lowercase(ch2);
-    }
+    auto input_view = input.view.substring_view(state.string_position, 1);
+    Optional<String> str;
+    auto compare_view = input_view.construct_as_same({ &ch1, 1 }, str);
+    bool equal;
+    if (input.regex_options & AllFlags::Insensitive)
+        equal = input_view.equals_ignoring_case(compare_view);
+    else
+        equal = input_view.equals(compare_view);
 
-    if (ch1 == ch2) {
+    if (equal) {
         if (inverse)
             inverse_matched = true;
         else
@@ -527,41 +543,32 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(const MatchInput& input, MatchSt
     }
 }
 
-ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length, bool& had_zero_length_match)
+ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, MatchState& state, RegexStringView const& str, bool& had_zero_length_match)
 {
-    if (length == 0) {
+    if (state.string_position + str.length() > input.view.length()) {
+        if (str.is_empty()) {
+            had_zero_length_match = true;
+            return true;
+        }
+        return false;
+    }
+
+    if (str.length() == 0) {
         had_zero_length_match = true;
         return true;
     }
 
-    if (input.view.is_u8_view()) {
-        auto str_view1 = StringView(str, length);
-        auto str_view2 = StringView(&input.view.u8view()[state.string_position], length);
-
-        bool string_equals;
-        if (input.regex_options & AllFlags::Insensitive)
-            string_equals = str_view1.equals_ignoring_case(str_view2);
-        else
-            string_equals = str_view1 == str_view2;
-
-        if (string_equals) {
-            state.string_position += length;
-            return true;
-        }
-    } else {
-        bool equals;
-        if (input.regex_options & AllFlags::Insensitive)
-            TODO();
-        else
-            equals = __builtin_memcmp(str, &input.view.u32view().code_points()[state.string_position], length) == 0;
+    auto subject = input.view.substring_view(state.string_position, str.length());
+    bool equals;
+    if (input.regex_options & AllFlags::Insensitive)
+        equals = subject.equals_ignoring_case(str);
+    else
+        equals = subject.equals(str);
 
-        if (equals) {
-            state.string_position += length;
-            return true;
-        }
-    }
+    if (equals)
+        state.string_position += str.length();
 
-    return false;
+    return equals;
 }
 
 ALWAYS_INLINE void OpCode_Compare::compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched)

+ 1 - 1
Userland/Libraries/LibRegex/RegexByteCode.h

@@ -682,7 +682,7 @@ public:
 
 private:
     ALWAYS_INLINE static void compare_char(const MatchInput& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched);
-    ALWAYS_INLINE static bool compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length, bool& had_zero_length_match);
+    ALWAYS_INLINE static bool compare_string(const MatchInput& input, MatchState& state, RegexStringView const& str, bool& had_zero_length_match);
     ALWAYS_INLINE static void compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched);
     ALWAYS_INLINE static void compare_character_range(const MatchInput& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched);
 };

+ 199 - 102
Userland/Libraries/LibRegex/RegexMatch.h

@@ -15,6 +15,8 @@
 #include <AK/StringBuilder.h>
 #include <AK/StringView.h>
 #include <AK/Utf32View.h>
+#include <AK/Utf8View.h>
+#include <AK/Variant.h>
 #include <AK/Vector.h>
 
 namespace regex {
@@ -22,124 +24,172 @@ namespace regex {
 class RegexStringView {
 public:
     RegexStringView(const char* chars)
-        : m_u8view(chars)
+        : m_view(StringView { chars })
     {
     }
 
     RegexStringView(const String& string)
-        : m_u8view(string)
+        : m_view(string.view())
     {
     }
 
     RegexStringView(const StringView view)
-        : m_u8view(view)
+        : m_view(view)
     {
     }
-    RegexStringView(const Utf32View view)
-        : m_u32view(view)
+
+    RegexStringView(Utf32View view)
+        : m_view(view)
+    {
+    }
+
+    RegexStringView(Utf8View view)
+        : m_view(view)
     {
     }
 
-    bool is_u8_view() const { return m_u8view.has_value(); }
-    bool is_u32_view() const { return m_u32view.has_value(); }
+    const StringView& string_view() const
+    {
+        return m_view.get<StringView>();
+    }
 
-    const StringView& u8view() const
+    const Utf32View& u32_view() const
     {
-        VERIFY(m_u8view.has_value());
-        return m_u8view.value();
-    };
+        return m_view.get<Utf32View>();
+    }
 
-    const Utf32View& u32view() const
+    const Utf8View& u8_view() const
     {
-        VERIFY(m_u32view.has_value());
-        return m_u32view.value();
-    };
+        return m_view.get<Utf8View>();
+    }
 
     bool is_empty() const
     {
-        if (is_u8_view())
-            return m_u8view.value().is_empty();
-        else
-            return m_u32view.value().is_empty();
+        return m_view.visit([](auto& view) { return view.is_empty(); });
     }
 
     bool is_null() const
     {
-        if (is_u8_view())
-            return m_u8view.value().is_null();
-        else
-            return m_u32view.value().code_points() == nullptr;
+        return m_view.visit([](auto& view) { return view.is_null(); });
     }
 
     size_t length() const
     {
-        if (is_u8_view())
-            return m_u8view.value().length();
-        else
-            return m_u32view.value().length();
+        return m_view.visit([](auto& view) { return view.length(); });
+    }
+
+    RegexStringView construct_as_same(Span<u32> data, Optional<String>& optional_string_storage) const
+    {
+        return m_view.visit(
+            [&]<typename T>(T const&) {
+                StringBuilder builder;
+                for (auto ch : data)
+                    builder.append(ch); // Note: The type conversion is intentional.
+                optional_string_storage = builder.build();
+                return RegexStringView { T { *optional_string_storage } };
+            },
+            [&](Utf32View) {
+                return RegexStringView { Utf32View { data.data(), data.size() } };
+            });
     }
 
     Vector<RegexStringView> lines() const
     {
-        if (is_u8_view()) {
-            auto views = u8view().lines(false);
-            Vector<RegexStringView> new_views;
-            for (auto& view : views)
-                new_views.append(move(view));
-            return new_views;
-        }
-
-        Vector<RegexStringView> views;
-        auto view = u32view();
-        u32 newline = '\n';
-        while (!view.is_empty()) {
-            auto position = AK::memmem_optional(view.code_points(), view.length() * sizeof(u32), &newline, sizeof(u32));
-            if (!position.has_value())
-                break;
-            auto offset = position.value() / sizeof(u32);
-            views.append(view.substring_view(0, offset));
-            view = view.substring_view(offset + 1, view.length() - offset - 1);
-        }
-        if (!view.is_empty())
-            views.append(view);
-        return views;
+        return m_view.visit(
+            [](StringView view) {
+                auto views = view.lines(false);
+                Vector<RegexStringView> new_views;
+                for (auto& view : views)
+                    new_views.empend(view);
+                return new_views;
+            },
+            [](Utf32View view) {
+                Vector<RegexStringView> views;
+                u32 newline = '\n';
+                while (!view.is_empty()) {
+                    auto position = AK::memmem_optional(view.code_points(), view.length() * sizeof(u32), &newline, sizeof(u32));
+                    if (!position.has_value())
+                        break;
+                    auto offset = position.value() / sizeof(u32);
+                    views.empend(view.substring_view(0, offset));
+                    view = view.substring_view(offset + 1, view.length() - offset - 1);
+                }
+                if (!view.is_empty())
+                    views.empend(view);
+                return views;
+            },
+            [](Utf8View& view) {
+                Vector<RegexStringView> views;
+                auto it = view.begin();
+                auto previous_newline_position_it = it;
+                for (;;) {
+                    if (*it == '\n') {
+                        auto previous_offset = view.byte_offset_of(previous_newline_position_it);
+                        auto new_offset = view.byte_offset_of(it);
+                        auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
+                        views.empend(slice);
+                        ++it;
+                        previous_newline_position_it = it;
+                    }
+                    if (it.done())
+                        break;
+                    ++it;
+                }
+                if (it != previous_newline_position_it) {
+                    auto previous_offset = view.byte_offset_of(previous_newline_position_it);
+                    auto new_offset = view.byte_offset_of(it);
+                    auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
+                    views.empend(slice);
+                }
+                return views;
+            });
     }
 
     RegexStringView substring_view(size_t offset, size_t length) const
     {
-        if (is_u8_view()) {
-            return u8view().substring_view(offset, length);
-        }
-        return u32view().substring_view(offset, length);
+        return m_view.visit(
+            [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; },
+            [&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; });
     }
 
     String to_string() const
     {
-        if (is_u8_view()) {
-            return u8view().to_string();
-        }
-
-        StringBuilder builder;
-        builder.append(u32view());
-        return builder.to_string();
+        return m_view.visit(
+            [](StringView view) { return view.to_string(); },
+            [](auto& view) {
+                StringBuilder builder;
+                for (auto it = view.begin(); it != view.end(); ++it)
+                    builder.append_code_point(*it);
+                return builder.to_string();
+            });
     }
 
     u32 operator[](size_t index) const
     {
-        if (is_u8_view()) {
-            i8 ch = u8view()[index];
-            u8 value = *reinterpret_cast<u8*>(&ch);
-            return static_cast<u32>(value);
-        }
-        return u32view().code_points()[index];
+        return m_view.visit(
+            [&](StringView view) -> u32 {
+                auto ch = view[index];
+                if (ch < 0)
+                    return 256u + ch;
+                return ch;
+            },
+            [&](auto view) -> u32 { return view[index]; },
+            [&](Utf8View& view) -> u32 {
+                size_t i = index;
+                for (auto it = view.begin(); it != view.end(); ++it, --i) {
+                    if (i == 0)
+                        return *it;
+                }
+                VERIFY_NOT_REACHED();
+            });
     }
 
     bool operator==(const char* cstring) const
     {
-        if (is_u8_view())
-            return u8view() == cstring;
-
-        return to_string() == cstring;
+        return m_view.visit(
+            [&](Utf32View) { return to_string() == cstring; },
+            [&](Utf8View const& view) { return view.as_string() == cstring; },
+            [&](StringView view) { return view == cstring; });
     }
 
     bool operator!=(const char* cstring) const
@@ -149,18 +199,18 @@ public:
 
     bool operator==(const String& string) const
     {
-        if (is_u8_view())
-            return u8view() == string;
-
-        return to_string() == string;
+        return m_view.visit(
+            [&](Utf32View) { return to_string() == string; },
+            [&](Utf8View const& view) { return view.as_string() == string; },
+            [&](StringView view) { return view == string; });
     }
 
-    bool operator==(const StringView& other) const
+    bool operator==(const StringView& string) const
     {
-        if (is_u8_view())
-            return u8view() == other;
-
-        return false;
+        return m_view.visit(
+            [&](Utf32View) { return to_string() == string; },
+            [&](Utf8View const& view) { return view.as_string() == string; },
+            [&](StringView view) { return view == string; });
     }
 
     bool operator!=(const StringView& other) const
@@ -170,13 +220,12 @@ public:
 
     bool operator==(const Utf32View& other) const
     {
-        if (is_u32_view()) {
-            StringBuilder builder;
-            builder.append(other);
-            return to_string() == builder.to_string();
-        }
-
-        return false;
+        return m_view.visit(
+            [&](Utf32View view) {
+                return view.length() == other.length() && __builtin_memcmp(view.code_points(), other.code_points(), view.length() * sizeof(u32)) == 0;
+            },
+            [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); },
+            [&](StringView view) { return view == RegexStringView { other }.to_string(); });
     }
 
     bool operator!=(const Utf32View& other) const
@@ -184,34 +233,78 @@ public:
         return !(*this == other);
     }
 
-    const char* characters_without_null_termination() const
+    bool operator==(const Utf8View& other) const
+    {
+        return m_view.visit(
+            [&](Utf32View) {
+                return to_string() == other.as_string();
+            },
+            [&](Utf8View const& view) { return view.as_string() == other.as_string(); },
+            [&](StringView view) { return other.as_string() == view; });
+    }
+
+    bool operator!=(const Utf8View& other) const
+    {
+        return !(*this == other);
+    }
+
+    bool equals(const RegexStringView& other) const
     {
-        if (is_u8_view())
-            return u8view().characters_without_null_termination();
+        return other.m_view.visit([&](auto const& view) { return operator==(view); });
+    }
 
-        return to_string().characters(); // FIXME: it contains the null termination, does that actually matter?
+    bool equals_ignoring_case(const RegexStringView& other) const
+    {
+        // FIXME: Implement equals_ignoring_case() for unicode.
+        return m_view.visit(
+            [&](StringView view) {
+                return other.m_view.visit(
+                    [&](StringView other_view) { return view.equals_ignoring_case(other_view); },
+                    [](auto&) -> bool { TODO(); });
+            },
+            [](auto&) -> bool { TODO(); });
     }
 
     bool starts_with(const StringView& str) const
     {
-        if (is_u32_view())
-            return false;
-        return u8view().starts_with(str);
+        return m_view.visit(
+            [&](Utf32View) -> bool {
+                TODO();
+            },
+            [&](Utf8View const& view) { return view.as_string().starts_with(str); },
+            [&](StringView view) { return view.starts_with(str); });
     }
 
     bool starts_with(const Utf32View& str) const
     {
-        if (is_u8_view())
-            return false;
-
-        StringBuilder builder;
-        builder.append(str);
-        return to_string().starts_with(builder.to_string());
+        return m_view.visit(
+            [&](Utf32View view) -> bool {
+                if (str.length() > view.length())
+                    return false;
+                if (str.length() == view.length())
+                    return operator==(str);
+                for (size_t i = 0; i < str.length(); ++i) {
+                    if (str.at(i) != view.at(i))
+                        return false;
+                }
+                return true;
+            },
+            [&](Utf8View const& view) {
+                auto it = view.begin();
+                for (auto code_point : str) {
+                    if (it.done())
+                        return false;
+                    if (code_point != *it)
+                        return false;
+                    ++it;
+                }
+                return true;
+            },
+            [&](StringView) -> bool { TODO(); });
     }
 
 private:
-    Optional<StringView> m_u8view;
-    Optional<Utf32View> m_u32view;
+    Variant<StringView, Utf8View, Utf32View> m_view;
 };
 
 class Match final {
@@ -271,6 +364,9 @@ struct MatchState {
     size_t string_position { 0 };
     size_t instruction_position { 0 };
     size_t fork_at_position { 0 };
+    Vector<Match> matches;
+    Vector<Vector<Match>> capture_group_matches;
+    Vector<HashMap<String, Match>> named_capture_group_matches;
 };
 
 struct MatchOutput {
@@ -288,6 +384,7 @@ template<>
 // Formats a regex::RegexStringView by delegating to Formatter<StringView>.
 struct AK::Formatter<regex::RegexStringView> : Formatter<StringView> {
     void format(FormatBuilder& builder, const regex::RegexStringView& value)
     {
-        return Formatter<StringView>::format(builder, { value.characters_without_null_termination(), value.length() });
+        auto string = value.to_string(); // format the result of to_string() instead of a raw pointer/length pair
+        return Formatter<StringView>::format(builder, string);
     }
 };

+ 21 - 21
Userland/Libraries/LibRegex/RegexMatcher.cpp

@@ -99,34 +99,34 @@ RegexResult Matcher<Parser>::match(const Vector<RegexStringView> views, Optional
     }
 
     if (c_match_preallocation_count) {
-        output.matches.ensure_capacity(c_match_preallocation_count);
-        output.capture_group_matches.ensure_capacity(c_match_preallocation_count);
-        output.named_capture_group_matches.ensure_capacity(c_match_preallocation_count);
+        state.matches.ensure_capacity(c_match_preallocation_count);
+        state.capture_group_matches.ensure_capacity(c_match_preallocation_count);
+        state.named_capture_group_matches.ensure_capacity(c_match_preallocation_count);
 
         auto& capture_groups_count = m_pattern.parser_result.capture_groups_count;
         auto& named_capture_groups_count = m_pattern.parser_result.named_capture_groups_count;
 
         for (size_t j = 0; j < c_match_preallocation_count; ++j) {
-            output.matches.empend();
-            output.capture_group_matches.unchecked_append({});
-            output.capture_group_matches.at(j).ensure_capacity(capture_groups_count);
+            state.matches.empend();
+            state.capture_group_matches.unchecked_append({});
+            state.capture_group_matches.at(j).ensure_capacity(capture_groups_count);
             for (size_t k = 0; k < capture_groups_count; ++k)
-                output.capture_group_matches.at(j).unchecked_append({});
+                state.capture_group_matches.at(j).unchecked_append({});
 
-            output.named_capture_group_matches.unchecked_append({});
-            output.named_capture_group_matches.at(j).ensure_capacity(named_capture_groups_count);
+            state.named_capture_group_matches.unchecked_append({});
+            state.named_capture_group_matches.at(j).ensure_capacity(named_capture_groups_count);
         }
     }
 
-    auto append_match = [](auto& input, auto& state, auto& output, auto& start_position) {
-        if (output.matches.size() == input.match_index)
-            output.matches.empend();
+    auto append_match = [](auto& input, auto& state, auto& start_position) { // now writes into state.matches; the `output` parameter is gone
+        if (state.matches.size() == input.match_index) // grow by one entry so index input.match_index can be assigned below
+            state.matches.empend();
 
         VERIFY(start_position + state.string_position - start_position <= input.view.length()); // algebraically: state.string_position <= input.view.length()
         if (input.regex_options.has_flag_set(AllFlags::StringCopyMatches)) { // store the to_string() result of the matched span
-            output.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position).to_string(), input.line, start_position, input.global_offset + start_position };
+            state.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position).to_string(), input.line, start_position, input.global_offset + start_position };
         } else { // let the view point to the original string ...
-            output.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position), input.line, start_position, input.global_offset + start_position };
+            state.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position), input.line, start_position, input.global_offset + start_position };
         }
     };
 
@@ -171,7 +171,7 @@ RegexResult Matcher<Parser>::match(const Vector<RegexStringView> views, Optional
                     output = move(temp_output);
                     if (!match_count) {
                         // Nothing was *actually* matched, so append an empty match.
-                        append_match(input, state, output, view_index);
+                        append_match(input, state, view_index);
                         ++match_count;
                     }
                 }
@@ -219,21 +219,21 @@ RegexResult Matcher<Parser>::match(const Vector<RegexStringView> views, Optional
                 ++match_count;
 
                 if (continue_search) {
-                    append_match(input, state, output, view_index);
+                    append_match(input, state, view_index);
 
                     bool has_zero_length = state.string_position == view_index;
                     view_index = state.string_position - (has_zero_length ? 0 : 1);
                     continue;
 
                 } else if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) {
-                    append_match(input, state, output, view_index);
+                    append_match(input, state, view_index);
                     break;
 
                 } else if (state.string_position < view_length) {
                     return { false, 0, {}, {}, {}, output.operations };
                 }
 
-                append_match(input, state, output, view_index);
+                append_match(input, state, view_index);
                 break;
             }
 
@@ -253,7 +253,7 @@ RegexResult Matcher<Parser>::match(const Vector<RegexStringView> views, Optional
 
     MatchOutput output_copy;
     if (match_count) {
-        output_copy.capture_group_matches = output.capture_group_matches;
+        output_copy.capture_group_matches = state.capture_group_matches;
         // Make sure there are as many capture matches as there are actual matches.
         if (output_copy.capture_group_matches.size() < match_count)
             output_copy.capture_group_matches.resize(match_count);
@@ -264,12 +264,12 @@ RegexResult Matcher<Parser>::match(const Vector<RegexStringView> views, Optional
                 matches.template remove_all_matching([](auto& match) { return match.view.is_null(); });
         }
 
-        output_copy.named_capture_group_matches = output.named_capture_group_matches;
+        output_copy.named_capture_group_matches = state.named_capture_group_matches;
         // Make sure there are as many capture matches as there are actual matches.
         if (output_copy.named_capture_group_matches.size() < match_count)
             output_copy.named_capture_group_matches.resize(match_count);
 
-        output_copy.matches = output.matches;
+        output_copy.matches = state.matches;
     } else {
         output_copy.capture_group_matches.clear_with_capacity();
         output_copy.named_capture_group_matches.clear_with_capacity();

+ 1 - 1
Userland/Utilities/expr.cpp

@@ -417,7 +417,7 @@ private:
 
                 StringBuilder result;
                 for (auto& e : match.capture_group_matches[0])
-                    result.append(e.view.u8view());
+                    result.append(e.view.string_view());
 
                 return result.build();
             }