From df14d11a11ba256dc8e7fcee4bffcdbc9934166f Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Tue, 10 Aug 2021 16:35:45 -0400 Subject: [PATCH] LibRegex: Disallow invalid interval qualifiers in Unicode mode Fixes all remaining 'built-ins/RegExp/property-escapes' test262 tests. --- Tests/LibRegex/Regex.cpp | 3 + Userland/Libraries/LibRegex/RegexParser.cpp | 104 +++++++++++--------- Userland/Libraries/LibRegex/RegexParser.h | 1 + 3 files changed, 63 insertions(+), 45 deletions(-) diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index dc7bad6b330..820d5401fa8 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -522,6 +522,9 @@ TEST_CASE(ECMA262_parse) { "\\p{hello friends}", regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode }, { "\\p{Prepended_Concatenation_Mark}", regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode }, { "\\p{ASCII}", regex::Error::NoError, ECMAScriptFlags::Unicode }, + { "\\\\p{1}", regex::Error::NoError, ECMAScriptFlags::Unicode }, + { "\\\\p{AsCiI}", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, + { "\\\\p{ASCII}", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, }; for (auto& test : tests) { diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index 6fe6519ddeb..8fa19de4b8c 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -1121,7 +1121,7 @@ Optional ECMA262Parser::read_digits(ECMA262Parser::ReadDigitsInitialZe return str.to_uint(); } -bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minimum, bool, bool) +bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool) { enum class Repetition { OneOrMore, @@ -1144,52 +1144,13 @@ bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minim consume(); repetition_mark = Repetition::Optional; } else if 
(match(TokenType::LeftCurly)) { - consume(); - auto chars_consumed = 1; repetition_mark = Repetition::Explicit; - - auto low_bound_string = read_digits_as_string(); - chars_consumed += low_bound_string.length(); - - auto low_bound = low_bound_string.to_uint(); - - if (!low_bound.has_value()) { - if (!m_should_use_browser_extended_grammar && done()) - return set_error(Error::MismatchingBrace); - - back(chars_consumed + !done()); - return true; - } - - repeat_min = low_bound.value(); - - if (match(TokenType::Comma)) { - consume(); - ++chars_consumed; - auto high_bound_string = read_digits_as_string(); - auto high_bound = high_bound_string.to_uint(); - if (high_bound.has_value()) { - repeat_max = high_bound.value(); - chars_consumed += high_bound_string.length(); + if (!parse_interval_quantifier(repeat_min, repeat_max)) { + if (unicode) { + // Invalid interval quantifiers are disallowed in Unicode mode - they must be escaped with '\{'. + set_error(Error::InvalidPattern); } - } else { - repeat_max = repeat_min; - } - - if (!match(TokenType::RightCurly)) { - if (!m_should_use_browser_extended_grammar && done()) - return set_error(Error::MismatchingBrace); - - back(chars_consumed + !done()); - return true; - } - - consume(); - ++chars_consumed; - - if (repeat_max.has_value()) { - if (repeat_min.value() > repeat_max.value()) - set_error(Error::InvalidBraceContent); + return !has_error(); } } else { return true; @@ -1223,6 +1184,59 @@ bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minim return true; } +bool ECMA262Parser::parse_interval_quantifier(Optional& repeat_min, Optional& repeat_max) +{ + VERIFY(match(TokenType::LeftCurly)); + consume(); + auto chars_consumed = 1; + + auto low_bound_string = read_digits_as_string(); + chars_consumed += low_bound_string.length(); + + auto low_bound = low_bound_string.to_uint(); + + if (!low_bound.has_value()) { + if (!m_should_use_browser_extended_grammar && done()) + return 
set_error(Error::MismatchingBrace); + + back(chars_consumed + !done()); + return false; + } + + repeat_min = low_bound.value(); + + if (match(TokenType::Comma)) { + consume(); + ++chars_consumed; + auto high_bound_string = read_digits_as_string(); + auto high_bound = high_bound_string.to_uint(); + if (high_bound.has_value()) { + repeat_max = high_bound.value(); + chars_consumed += high_bound_string.length(); + } + } else { + repeat_max = repeat_min; + } + + if (!match(TokenType::RightCurly)) { + if (!m_should_use_browser_extended_grammar && done()) + return set_error(Error::MismatchingBrace); + + back(chars_consumed + !done()); + return false; + } + + consume(); + ++chars_consumed; + + if (repeat_max.has_value()) { + if (repeat_min.value() > repeat_max.value()) + set_error(Error::InvalidBraceContent); + } + + return true; +} + bool ECMA262Parser::parse_atom(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named) { if (match(TokenType::EscapeSequence)) { diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h index ec5cb12f1cb..117ea4cd8c1 100644 --- a/Userland/Libraries/LibRegex/RegexParser.h +++ b/Userland/Libraries/LibRegex/RegexParser.h @@ -228,6 +228,7 @@ private: bool parse_assertion(ByteCode&, size_t&, bool unicode, bool named); bool parse_atom(ByteCode&, size_t&, bool unicode, bool named); bool parse_quantifier(ByteCode&, size_t&, bool unicode, bool named); + bool parse_interval_quantifier(Optional& repeat_min, Optional& repeat_max); bool parse_atom_escape(ByteCode&, size_t&, bool unicode, bool named); bool parse_character_class(ByteCode&, size_t&, bool unicode, bool named); bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);