diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index a4731e62d34..a1f6f084d6c 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -510,6 +510,11 @@ TEST_CASE(ECMA262_parse) { "\\uxxxx", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, { "\\ud83d", regex::Error::NoError, ECMAScriptFlags::Unicode }, { "\\ud83d\\uxxxx", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, + { "\\u{0}", regex::Error::NoError, ECMAScriptFlags::Unicode }, + { "\\u{10ffff}", regex::Error::NoError, ECMAScriptFlags::Unicode }, + { "\\u{10ffff", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, + { "\\u{10ffffx", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, + { "\\u{110000}", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode }, }; for (auto& test : tests) { @@ -605,6 +610,7 @@ TEST_CASE(ECMA262_unicode_match) { "\\ude00", "😀", false, ECMAScriptFlags::Unicode }, { "\\ud83d\\ude00", "😀", true }, { "\\ud83d\\ude00", "😀", true, ECMAScriptFlags::Unicode }, + { "\\u{1f600}", "😀", true, ECMAScriptFlags::Unicode }, { "\\ud83d\\ud83d", "\xed\xa0\xbd\xed\xa0\xbd", true }, { "\\ud83d\\ud83d", "\xed\xa0\xbd\xed\xa0\xbd", true, ECMAScriptFlags::Unicode }, }; diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index 07885173b66..becff1810f9 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -7,6 +7,7 @@ #include "RegexParser.h" #include "RegexDebug.h" +#include #include #include #include @@ -1440,6 +1441,26 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini } if (try_skip("u")) { + if (match(TokenType::LeftCurly)) { + consume(); + + if (!unicode) { + // FIXME: In non-Unicode mode, this should be parsed as a repetition symbol (repeating the 'u'). + TODO(); + } + + auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 6); + if (code_point.has_value() && is_unicode(*code_point) && match(TokenType::RightCurly)) { + consume(); + match_length_minimum += 1; + stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } }); + return true; + } + + set_error(Error::InvalidPattern); + return false; + } + if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 4); code_point.has_value()) { // In Unicode mode, we need to combine surrogate pairs into a single code point. But we also need to be // rather forgiving if the surrogate pairs are invalid. So if a second code unit follows this code unit,