LibRegex: Use GenericLexer to consume escaped code points

Author: https://github.com/trflynn89 Commit: https://github.com/SerenityOS/serenity/commit/6131c0485e4 Pull-request: https://github.com/SerenityOS/serenity/pull/9499 Reviewed-by: https://github.com/alimpfard Reviewed-by: https://github.com/davidot ✅
2024-11-22 07:30:19 +00:00 · 2021-08-18 14:43:11 -04:00 · 2021-08-18 14:43:11 -04:00 · 6131c0485e · 2024-07-18 05:28:22 +09:00
commit 6131c0485e
parent 5ff9596678
2 changed files with 29 additions and 61 deletions
--- a/Userland/Libraries/LibRegex/RegexParser.cpp
+++ b/Userland/Libraries/LibRegex/RegexParser.cpp
@@ -92,6 +92,29 @@ ALWAYS_INLINE bool Parser::consume(String const& str)
    return true;
 }

+ALWAYS_INLINE Optional<u32> Parser::consume_escaped_code_point(bool unicode) // Decodes the rest of a '\u' escape; the visible callers invoke this right after try_skip("u").
+{ // Returns the decoded code point, a literal 'u' for the non-Unicode fallbacks, or an empty Optional after flagging Error::InvalidPattern.
+    if (match(TokenType::LeftCurly) && !unicode) {
+        // In non-Unicode mode, '\u{' is not a code point escape: report a literal 'u' so the '{' that follows is parsed as a repetition symbol (repeating the 'u').
+        return static_cast<u32>('u');
+    }
+
+    m_parser_state.lexer.retreat(2 + !done()); // Rewind the lexer to just before '\u'; when not done(), rewind one more char to account for the extra character that has already been consumed.
+
+    if (auto code_point_or_error = m_parser_state.lexer.consume_escaped_code_point(unicode); !code_point_or_error.is_error()) {
+        m_parser_state.current_token = m_parser_state.lexer.next(); // Refresh the current token from the lexer's new position.
+        return code_point_or_error.value();
+    }
+
+    if (!unicode) {
+        // '\u' is allowed in non-unicode mode, just matches 'u'. NOTE(review): this function does not re-sync the lexer or current_token after the rewind on this path; confirm the lexer restores its position when decoding fails.
+        return static_cast<u32>('u');
+    }
+
+    set_error(Error::InvalidPattern); // Unicode mode: a '\u' that does not decode to a code point is a pattern error.
+    return {};
+}
+
 ALWAYS_INLINE bool Parser::try_skip(StringView str)
 {
    if (str.starts_with(m_parser_state.current_token.value()))
@@ -1489,64 +1512,13 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
    }

    if (try_skip("u")) {
-        if (match(TokenType::LeftCurly)) {
-            if (!unicode) {
-                // In non-Unicode mode, this should be parsed as a repetition symbol (repeating the 'u').
-                match_length_minimum += 1;
-                stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'u' } });
-                return true;
-            }
-
-            consume();
-
-            auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 6);
-            if (code_point.has_value() && is_unicode(*code_point) && match(TokenType::RightCurly)) {
-                consume();
-                match_length_minimum += 1;
-                stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } });
-                return true;
-            }
-
-            set_error(Error::InvalidPattern);
-            return false;
-        }
-
-        if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 4, 4); code_point.has_value()) {
-            // In Unicode mode, we need to combine surrogate pairs into a single code point. But we also need to be
-            // rather forgiving if the surrogate pairs are invalid. So if a second code unit follows this code unit,
-            // but doesn't form a valid surrogate pair, insert bytecode for both code units individually.
-            Optional<u32> low_surrogate;
-            if (unicode && Utf16View::is_high_surrogate(*code_point) && try_skip("\\u")) {
-                low_surrogate = read_digits(ReadDigitsInitialZeroState::Allow, true, 4);
-                if (!low_surrogate.has_value()) {
-                    set_error(Error::InvalidPattern);
-                    return false;
-                }
-
-                if (Utf16View::is_low_surrogate(*low_surrogate)) {
-                    *code_point = Utf16View::decode_surrogate_pair(*code_point, *low_surrogate);
-                    low_surrogate.clear();
-                }
-            }
-
+        if (auto code_point = consume_escaped_code_point(unicode); code_point.has_value()) {
            match_length_minimum += 1;
            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } });
-
-            if (low_surrogate.has_value()) {
-                match_length_minimum += 1;
-                stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)low_surrogate.value() } });
-            }
-
            return true;
-        } else if (!unicode) {
-            // '\u' is allowed in non-unicode mode, just matches 'u'.
-            match_length_minimum += 1;
-            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'u' } });
-            return true;
-        } else {
-            set_error(Error::InvalidPattern);
-            return false;
        }
+
+        return false;
    }

    // IdentityEscape
@@ -1847,16 +1819,11 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
            }

            if (try_skip("u")) {
-                if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 4, 4); code_point.has_value()) {
+                if (auto code_point = consume_escaped_code_point(unicode); code_point.has_value()) {
                    // FIXME: While code point ranges are supported, code point matches as "Char" are not!
                    return { CharClassRangeElement { .code_point = code_point.value(), .is_character_class = false } };
-                } else if (!unicode) {
-                    // '\u' is allowed in non-unicode mode, just matches 'u'.
-                    return { CharClassRangeElement { .code_point = 'u', .is_character_class = false } };
-                } else {
-                    set_error(Error::InvalidPattern);
-                    return {};
                }
+                return {};
            }

            // IdentityEscape
--- a/Userland/Libraries/LibRegex/RegexParser.h
+++ b/Userland/Libraries/LibRegex/RegexParser.h
@@ -80,6 +80,7 @@ protected:
    ALWAYS_INLINE Token consume();
    ALWAYS_INLINE Token consume(TokenType type, Error error);
    ALWAYS_INLINE bool consume(String const&);
+    ALWAYS_INLINE Optional<u32> consume_escaped_code_point(bool unicode);
    ALWAYS_INLINE bool try_skip(StringView);
    ALWAYS_INLINE bool lookahead_any(StringView);
    ALWAYS_INLINE char skip();