From 6131c0485e433646bc8f6495e3a5127ef75d90c9 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Wed, 18 Aug 2021 14:43:11 -0400 Subject: [PATCH] LibRegex: Use GenericLexer to consume escaped code points --- Userland/Libraries/LibRegex/RegexParser.cpp | 89 +++++++-------------- Userland/Libraries/LibRegex/RegexParser.h | 1 + 2 files changed, 29 insertions(+), 61 deletions(-) diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index 0e85f3a2c67..3617f0ec5c2 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -92,6 +92,29 @@ ALWAYS_INLINE bool Parser::consume(String const& str) return true; } +ALWAYS_INLINE Optional Parser::consume_escaped_code_point(bool unicode) +{ + if (match(TokenType::LeftCurly) && !unicode) { + // In non-Unicode mode, this should be parsed as a repetition symbol (repeating the 'u'). + return static_cast('u'); + } + + m_parser_state.lexer.retreat(2 + !done()); // Go back to just before '\u' (+1 char, because we will have consumed an extra character) + + if (auto code_point_or_error = m_parser_state.lexer.consume_escaped_code_point(unicode); !code_point_or_error.is_error()) { + m_parser_state.current_token = m_parser_state.lexer.next(); + return code_point_or_error.value(); + } + + if (!unicode) { + // '\u' is allowed in non-unicode mode, just matches 'u'. + return static_cast('u'); + } + + set_error(Error::InvalidPattern); + return {}; +} + ALWAYS_INLINE bool Parser::try_skip(StringView str) { if (str.starts_with(m_parser_state.current_token.value())) @@ -1489,64 +1512,13 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini } if (try_skip("u")) { - if (match(TokenType::LeftCurly)) { - if (!unicode) { - // In non-Unicode mode, this should be parsed as a repetition symbol (repeating the 'u'). - match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'u' } }); - return true; - } - - consume(); - - auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 6); - if (code_point.has_value() && is_unicode(*code_point) && match(TokenType::RightCurly)) { - consume(); - match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } }); - return true; - } - - set_error(Error::InvalidPattern); - return false; - } - - if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 4, 4); code_point.has_value()) { - // In Unicode mode, we need to combine surrogate pairs into a single code point. But we also need to be - // rather forgiving if the surrogate pairs are invalid. So if a second code unit follows this code unit, - // but doesn't form a valid surrogate pair, insert bytecode for both code units individually. - Optional low_surrogate; - if (unicode && Utf16View::is_high_surrogate(*code_point) && try_skip("\\u")) { - low_surrogate = read_digits(ReadDigitsInitialZeroState::Allow, true, 4); - if (!low_surrogate.has_value()) { - set_error(Error::InvalidPattern); - return false; - } - - if (Utf16View::is_low_surrogate(*low_surrogate)) { - *code_point = Utf16View::decode_surrogate_pair(*code_point, *low_surrogate); - low_surrogate.clear(); - } - } - + if (auto code_point = consume_escaped_code_point(unicode); code_point.has_value()) { match_length_minimum += 1; stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } }); - - if (low_surrogate.has_value()) { - match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)low_surrogate.value() } }); - } - return true; - } else if (!unicode) { - // '\u' is allowed in non-unicode mode, just matches 'u'. - match_length_minimum += 1; - stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'u' } }); - return true; - } else { - set_error(Error::InvalidPattern); - return false; } + + return false; } // IdentityEscape @@ -1847,16 +1819,11 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector& } if (try_skip("u")) { - if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 4, 4); code_point.has_value()) { + if (auto code_point = consume_escaped_code_point(unicode); code_point.has_value()) { // FIXME: While code point ranges are supported, code point matches as "Char" are not! return { CharClassRangeElement { .code_point = code_point.value(), .is_character_class = false } }; - } else if (!unicode) { - // '\u' is allowed in non-unicode mode, just matches 'u'. - return { CharClassRangeElement { .code_point = 'u', .is_character_class = false } }; - } else { - set_error(Error::InvalidPattern); - return {}; } + return {}; } // IdentityEscape diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h index d9825bcd788..ee3fd648aa7 100644 --- a/Userland/Libraries/LibRegex/RegexParser.h +++ b/Userland/Libraries/LibRegex/RegexParser.h @@ -80,6 +80,7 @@ protected: ALWAYS_INLINE Token consume(); ALWAYS_INLINE Token consume(TokenType type, Error error); ALWAYS_INLINE bool consume(String const&); + ALWAYS_INLINE Optional consume_escaped_code_point(bool unicode); ALWAYS_INLINE bool try_skip(StringView); ALWAYS_INLINE bool lookahead_any(StringView); ALWAYS_INLINE char skip();