Procházet zdrojové kódy

LibRegex: Track string position in both code units and code points

In non-Unicode mode, the existing MatchState::string_position is tracked
in code units; in Unicode mode, it is tracked in code points.

In order for some RegexStringView operations to be performant, it is
useful for the MatchState to have a field to always track the position
in code units. This will allow RegexStringView methods (e.g. operator[])
to perform lookups based on code unit offsets, rather than needing to
iterate over the entire string to find a code point offset.
Timothy Flynn před 4 roky
rodič
revize
27d555bab0

+ 57 - 29
Userland/Libraries/LibRegex/RegexByteCode.cpp

@@ -88,6 +88,36 @@ static char const* character_class_name(CharClass ch_class)
     }
 }
 
+static void advance_string_position(MatchState& state, RegexStringView const& view, Optional<u32> code_point = {}) // Steps the match position forward by one character, updating both state.string_position and state.string_position_in_code_units.
+{
+    ++state.string_position; // This counter always moves by exactly one, in both modes.
+
+    if (view.unicode()) {
+        if (!code_point.has_value() && (state.string_position_in_code_units < view.length_in_code_units())) // Caller did not pass the code point: read it at the current code unit offset, but only while that offset is in bounds.
+            code_point = view[state.string_position_in_code_units];
+        if (code_point.has_value()) // Still empty only when nothing was passed and the offset is at/past the end; the code unit position is then left unchanged.
+            state.string_position_in_code_units += view.length_of_code_point(*code_point);
+    } else {
+        ++state.string_position_in_code_units; // Non-Unicode views advance one code unit per step.
+    }
+}
+
+static void save_string_position(MatchInput const& input, MatchState const& state) // Pushes the current position pair onto the two saved-position stacks held by input.
+{
+    input.saved_positions.append(state.string_position);
+    input.saved_code_unit_positions.append(state.string_position_in_code_units); // Appended together with the line above so the two stacks stay the same depth.
+}
+
+static bool restore_string_position(MatchInput const& input, MatchState& state) // Pops the most recently saved position pair back into state; returns false if nothing was saved.
+{
+    if (input.saved_positions.is_empty()) // Only saved_positions is checked; save_string_position() appends to both vectors together.
+        return false;
+
+    state.string_position = input.saved_positions.take_last();
+    state.string_position_in_code_units = input.saved_code_unit_positions.take_last();
+    return true;
+}
+
 OwnPtr<OpCode> ByteCode::s_opcodes[(size_t)OpCodeId::Last + 1];
 bool ByteCode::s_opcodes_initialized { false };
 
@@ -188,16 +218,14 @@ ALWAYS_INLINE ExecutionResult OpCode_Exit::execute(MatchInput const& input, Matc
 
 ALWAYS_INLINE ExecutionResult OpCode_Save::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
 {
-    input.saved_positions.append(state.string_position);
+    save_string_position(input, state); // Records the code unit position alongside string_position, which the removed line did not.
     return ExecutionResult::Continue;
 }
 
 ALWAYS_INLINE ExecutionResult OpCode_Restore::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
 {
-    if (input.saved_positions.is_empty())
+    if (!restore_string_position(input, state)) // The helper returns false when no position has been saved; otherwise it restores both position counters.
         return ExecutionResult::Failed;
-
-    state.string_position = input.saved_positions.take_last();
     return ExecutionResult::Continue;
 }
 
@@ -254,7 +282,7 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckBoundary::execute(MatchInput const& in
     auto isword = [](auto ch) { return is_ascii_alphanumeric(ch) || ch == '_'; };
     auto is_word_boundary = [&] {
         if (state.string_position == input.view.length()) {
-            if (state.string_position > 0 && isword(input.view[state.string_position - 1]))
+            if (state.string_position > 0 && isword(input.view[state.string_position_in_code_units - 1]))
                 return true;
             return false;
         }
@@ -266,7 +294,7 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckBoundary::execute(MatchInput const& in
             return false;
         }
 
-        return !!(isword(input.view[state.string_position]) ^ isword(input.view[state.string_position - 1]));
+        return !!(isword(input.view[state.string_position_in_code_units]) ^ isword(input.view[state.string_position_in_code_units - 1]));
     };
     switch (type()) {
     case BoundaryCheckType::Word: {
@@ -455,7 +483,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
 
             VERIFY(!current_inversion_state());
-            ++state.string_position;
+            advance_string_position(state, input.view);
 
         } else if (compare_type == CharacterCompareType::String) {
             VERIFY(!current_inversion_state());
@@ -484,7 +512,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
 
             auto character_class = (CharClass)m_bytecode->at(offset++);
-            auto ch = input.view[state.string_position];
+            auto ch = input.view[state.string_position_in_code_units];
 
             compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched);
 
@@ -496,7 +524,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
 
             auto from = value.from;
             auto to = value.to;
-            auto ch = input.view[state.string_position];
+            auto ch = input.view[state.string_position_in_code_units];
 
             compare_character_range(input, state, from, to, ch, current_inversion_state(), inverse_matched);
 
@@ -549,7 +577,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
     }
 
     if (current_inversion_state() && !inverse_matched)
-        ++state.string_position;
+        advance_string_position(state, input.view);
 
     if ((!had_zero_length_match && string_position == state.string_position) || state.string_position > input.view.length())
         return ExecutionResult::Failed_ExecuteLowPrioForks;
@@ -576,7 +604,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(MatchInput const& input, MatchSt
         if (inverse)
             inverse_matched = true;
         else
-            ++state.string_position;
+            advance_string_position(state, input.view, ch1);
     }
 }
 
@@ -616,19 +644,19 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
             if (inverse)
                 inverse_matched = true;
             else
-                ++state.string_position;
+                advance_string_position(state, input.view, ch);
         }
         break;
     case CharClass::Alpha:
         if (is_ascii_alpha(ch))
-            ++state.string_position;
+            advance_string_position(state, input.view, ch);
         break;
     case CharClass::Blank:
         if (is_ascii_blank(ch)) {
             if (inverse)
                 inverse_matched = true;
             else
-                ++state.string_position;
+                advance_string_position(state, input.view, ch);
         }
         break;
     case CharClass::Cntrl:
@@ -636,7 +664,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
             if (inverse)
                 inverse_matched = true;
             else
-                ++state.string_position;
+                advance_string_position(state, input.view, ch);
         }
         break;
     case CharClass::Digit:
@@ -644,7 +672,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
             if (inverse)
                 inverse_matched = true;
             else
-                ++state.string_position;
+                advance_string_position(state, input.view, ch);
         }
         break;
     case CharClass::Graph:
@@ -652,7 +680,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
             if (inverse)
                 inverse_matched = true;
             else
-                ++state.string_position;
+                advance_string_position(state, input.view, ch);
         }
         break;
     case CharClass::Lower:
@@ -660,7 +688,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
             if (inverse)
                 inverse_matched = true;
             else
-                ++state.string_position;
+                advance_string_position(state, input.view, ch);
         }
         break;
     case CharClass::Print:
@@ -668,7 +696,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
             if (inverse)
                 inverse_matched = true;
             else
-                ++state.string_position;
+                advance_string_position(state, input.view, ch);
         }
         break;
     case CharClass::Punct:
@@ -676,7 +704,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
             if (inverse)
                 inverse_matched = true;
             else
-                ++state.string_position;
+                advance_string_position(state, input.view, ch);
         }
         break;
     case CharClass::Space:
@@ -684,7 +712,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
             if (inverse)
                 inverse_matched = true;
             else
-                ++state.string_position;
+                advance_string_position(state, input.view, ch);
         }
         break;
     case CharClass::Upper:
@@ -692,7 +720,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
             if (inverse)
                 inverse_matched = true;
             else
-                ++state.string_position;
+                advance_string_position(state, input.view, ch);
         }
         break;
     case CharClass::Word:
@@ -700,7 +728,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
             if (inverse)
                 inverse_matched = true;
             else
-                ++state.string_position;
+                advance_string_position(state, input.view, ch);
         }
         break;
     case CharClass::Xdigit:
@@ -708,7 +736,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
             if (inverse)
                 inverse_matched = true;
             else
-                ++state.string_position;
+                advance_string_position(state, input.view, ch);
         }
         break;
     }
@@ -726,7 +754,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_range(MatchInput const& inp
         if (inverse)
             inverse_matched = true;
         else
-            ++state.string_position;
+            advance_string_position(state, input.view, ch);
     }
 }
 
@@ -735,14 +763,14 @@ ALWAYS_INLINE void OpCode_Compare::compare_property(MatchInput const& input, Mat
     if (state.string_position == input.view.length())
         return;
 
-    u32 code_point = input.view[state.string_position];
+    u32 code_point = input.view[state.string_position_in_code_units];
     bool equal = Unicode::code_point_has_property(code_point, property);
 
     if (equal) {
         if (inverse)
             inverse_matched = true;
         else
-            ++state.string_position;
+            advance_string_position(state, input.view, code_point);
     }
 }
 
@@ -751,14 +779,14 @@ ALWAYS_INLINE void OpCode_Compare::compare_general_category(MatchInput const& in
     if (state.string_position == input.view.length())
         return;
 
-    u32 code_point = input.view[state.string_position];
+    u32 code_point = input.view[state.string_position_in_code_units];
     bool equal = Unicode::code_point_has_general_category(code_point, general_category);
 
     if (equal) {
         if (inverse)
             inverse_matched = true;
         else
-            ++state.string_position;
+            advance_string_position(state, input.view, code_point);
     }
 }
 

+ 34 - 11
Userland/Libraries/LibRegex/RegexMatch.h

@@ -95,12 +95,37 @@ public:
                 [](auto const& view) { return view.length(); });
         }
 
+        return length_in_code_units();
+    }
+
+    size_t length_in_code_units() const // Returns the view's length in code units, dispatching on the underlying view type.
+    {
         return m_view.visit(
             [](Utf16View const& view) { return view.length_in_code_units(); },
             [](Utf8View const& view) { return view.byte_length(); }, // For UTF-8 the byte length is used as the code unit count.
             [](auto const& view) { return view.length(); }); // Any other view type reports its own length().
     }
 
+    size_t length_of_code_point(u32 code_point) const // Returns how many code units the given code point occupies for this view's underlying type.
+    {
+        return m_view.visit(
+            [](Utf32View const&) { return 1; }, // UTF-32: every code point is one code unit.
+            [&](Utf16View const&) {
+                if (code_point < 0x10000) // Below U+10000: a single UTF-16 code unit.
+                    return 1;
+                return 2; // U+10000 and above: two UTF-16 code units.
+            },
+            [&](auto const&) { // Every remaining view type is sized with the UTF-8 thresholds (1 to 4 code units).
+                if (code_point <= 0x7f)
+                    return 1;
+                else if (code_point <= 0x07ff)
+                    return 2;
+                else if (code_point <= 0xffff)
+                    return 3;
+                return 4;
+            });
+    }
+
     RegexStringView typed_null_view()
     {
         auto view = m_view.visit(
@@ -230,6 +255,7 @@ public:
             });
     }
 
+    // Note: index must always be a code unit offset (not a code point offset) into the view.
     u32 operator[](size_t index) const
     {
         return m_view.visit(
@@ -239,17 +265,12 @@ public:
                     return 256u + ch;
                 return ch;
             },
-            [&](Utf32View& view) -> u32 { return view[index]; },
-            [&](Utf16View& view) -> u32 { return view.code_point_at(index); },
-            [&](auto& view) -> u32 {
-                // FIXME: Iterating to the code point is inefficient, particularly for very large
-                // strings. Implement something like code_point_at to Utf8View.
-                size_t i = index;
-                for (auto it = view.begin(); it != view.end(); ++it, --i) {
-                    if (i == 0)
-                        return *it;
-                }
-                VERIFY_NOT_REACHED();
+            [&](Utf32View const& view) -> u32 { return view[index]; },
+            [&](Utf16View const& view) -> u32 { return view.code_point_at(index); },
+            [&](Utf8View const& view) -> u32 {
+                auto it = view.iterator_at_byte_offset(index);
+                VERIFY(it != view.end());
+                return *it;
             });
     }
 
@@ -462,11 +483,13 @@ struct MatchInput {
 
     mutable size_t fail_counter { 0 };
     mutable Vector<size_t> saved_positions;
+    mutable Vector<size_t> saved_code_unit_positions;
 };
 
 struct MatchState {
     size_t string_position_before_match { 0 };
     size_t string_position { 0 };
+    size_t string_position_in_code_units { 0 };
     size_t instruction_position { 0 };
     size_t fork_at_position { 0 };
     Vector<Match> matches;

+ 4 - 1
Userland/Libraries/LibRegex/RegexMatcher.cpp

@@ -198,6 +198,7 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona
         auto view_length = view.length();
         size_t view_index = m_pattern->start_offset;
         state.string_position = view_index;
+        state.string_position_in_code_units = view_index;
         bool succeeded = false;
 
         if (view_index == view_length && m_pattern->parser_result.match_length_minimum == 0) {
@@ -210,6 +211,7 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona
             input.match_index = match_count;
 
             state.string_position = view_index;
+            state.string_position_in_code_units = view_index;
             state.instruction_position = 0;
 
             auto success = execute(input, state, temp_output);
@@ -241,6 +243,7 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona
             input.match_index = match_count;
 
             state.string_position = view_index;
+            state.string_position_in_code_units = view_index;
             state.instruction_position = 0;
 
             auto success = execute(input, state, output);
@@ -388,7 +391,7 @@ private:
         Node* previous { nullptr };
     };
 
-    UniformBumpAllocator<Node, true> m_allocator;
+    UniformBumpAllocator<Node, true, 8 * MiB> m_allocator;
     Node* m_first { nullptr };
     Node* m_last { nullptr };
 };