mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-12-04 05:20:30 +00:00
LibRegex: Parse capture group names according to the ECMA262 spec
This commit is contained in:
parent
733a70671b
commit
154ed3994c
Notes:
sideshowbarker
2024-07-17 22:31:05 +09:00
Author: https://github.com/davidot Commit: https://github.com/SerenityOS/serenity/commit/154ed3994c8 Pull-request: https://github.com/SerenityOS/serenity/pull/11316 Reviewed-by: https://github.com/alimpfard Reviewed-by: https://github.com/linusg ✅
2 changed files with 96 additions and 10 deletions
|
@ -593,6 +593,12 @@ TEST_CASE(ECMA262_parse)
|
|||
{ "a{9007199254740992,9007199254740992}"sv, regex::Error::InvalidBraceContent },
|
||||
{ "(?<a>a)(?<a>b)"sv, regex::Error::DuplicateNamedCapture },
|
||||
{ "(?<a>a)(?<b>b)(?<a>c)"sv, regex::Error::DuplicateNamedCapture },
|
||||
{ "(?<1a>a)"sv, regex::Error::InvalidNameForCaptureGroup },
|
||||
{ "(?<\\a>a)"sv, regex::Error::InvalidNameForCaptureGroup },
|
||||
{ "(?<\ta>a)"sv, regex::Error::InvalidNameForCaptureGroup },
|
||||
{ "(?<$$_$$>a)"sv },
|
||||
{ "(?<ÿ>a)"sv },
|
||||
{ "(?<𝓑𝓻𝓸𝔀𝓷>a)"sv },
|
||||
};
|
||||
|
||||
for (auto& test : tests) {
|
||||
|
|
|
@ -2036,29 +2036,109 @@ bool ECMA262Parser::parse_unicode_property_escape(PropertyEscape& property, bool
|
|||
|
||||
FlyString ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
|
||||
{
|
||||
static auto id_start_category = Unicode::property_from_string("ID_Start"sv);
|
||||
static auto id_continue_category = Unicode::property_from_string("ID_Continue"sv);
|
||||
static constexpr const u32 REPLACEMENT_CHARACTER = 0xFFFD;
|
||||
constexpr const u32 ZERO_WIDTH_NON_JOINER { 0x200C };
|
||||
constexpr const u32 ZERO_WIDTH_JOINER { 0x200D };
|
||||
|
||||
if (take_starting_angle_bracket && !consume("<"))
|
||||
return {};
|
||||
|
||||
StringBuilder builder;
|
||||
while (match(TokenType::Char) || match(TokenType::Dollar) || match(TokenType::LeftCurly) || match(TokenType::RightCurly)) {
|
||||
auto c = m_parser_state.current_token.value();
|
||||
if (c == ">")
|
||||
break;
|
||||
|
||||
if (try_skip("\\u"sv)) {
|
||||
if (auto code_point = consume_escaped_code_point(true); code_point.has_value()) {
|
||||
builder.append_code_point(*code_point);
|
||||
auto consume_code_point = [&] {
|
||||
Utf8View utf_8_view { m_parser_state.lexer.source().substring_view(m_parser_state.lexer.tell() - 1) };
|
||||
if (utf_8_view.is_empty())
|
||||
return REPLACEMENT_CHARACTER;
|
||||
u32 code_point = *utf_8_view.begin();
|
||||
auto characters = utf_8_view.byte_offset_of(1);
|
||||
|
||||
while (characters-- > 0)
|
||||
consume();
|
||||
|
||||
return code_point;
|
||||
};
|
||||
|
||||
{
|
||||
// The first character is limited to: https://tc39.es/ecma262/#prod-RegExpIdentifierStart
|
||||
// RegExpIdentifierStart[UnicodeMode] ::
|
||||
// IdentifierStartChar
|
||||
// \ RegExpUnicodeEscapeSequence[+UnicodeMode]
|
||||
// [~UnicodeMode] UnicodeLeadSurrogate UnicodeTrailSurrogate
|
||||
|
||||
auto code_point = consume_code_point();
|
||||
|
||||
if (code_point == '\\' && match('u')) {
|
||||
consume();
|
||||
|
||||
if (auto maybe_code_point = consume_escaped_code_point(true); maybe_code_point.has_value()) {
|
||||
code_point = *maybe_code_point;
|
||||
} else {
|
||||
set_error(Error::InvalidNameForCaptureGroup);
|
||||
return {};
|
||||
}
|
||||
} else {
|
||||
builder.append(consume().value());
|
||||
}
|
||||
|
||||
if (is_ascii(code_point)) {
|
||||
// The only ID_Start code points in the ASCII range are the letters; '$' and '_' are additionally allowed by IdentifierStartChar.
|
||||
if (!is_ascii_alpha(code_point) && code_point != '$' && code_point != '_') {
|
||||
set_error(Error::InvalidNameForCaptureGroup);
|
||||
return {};
|
||||
}
|
||||
} else if (id_start_category.has_value() && !Unicode::code_point_has_property(code_point, *id_start_category)) {
|
||||
set_error(Error::InvalidNameForCaptureGroup);
|
||||
return {};
|
||||
}
|
||||
builder.append_code_point(code_point);
|
||||
}
|
||||
|
||||
bool hit_end = false;
|
||||
|
||||
// Any following characters are limited to:
|
||||
// RegExpIdentifierPart[UnicodeMode] ::
|
||||
// IdentifierPartChar
|
||||
// \ RegExpUnicodeEscapeSequence[+UnicodeMode]
|
||||
// [~UnicodeMode] UnicodeLeadSurrogate UnicodeTrailSurrogate
|
||||
|
||||
while (match(TokenType::Char) || match(TokenType::Dollar) || match(TokenType::LeftCurly) || match(TokenType::RightCurly)) {
|
||||
auto code_point = consume_code_point();
|
||||
|
||||
if (code_point == '>') {
|
||||
hit_end = true;
|
||||
break;
|
||||
}
|
||||
|
||||
if (code_point == '\\') {
|
||||
if (!try_skip("u")) {
|
||||
set_error(Error::InvalidNameForCaptureGroup);
|
||||
return {};
|
||||
}
|
||||
if (auto maybe_code_point = consume_escaped_code_point(true); maybe_code_point.has_value()) {
|
||||
code_point = *maybe_code_point;
|
||||
} else {
|
||||
set_error(Error::InvalidNameForCaptureGroup);
|
||||
return {};
|
||||
}
|
||||
}
|
||||
|
||||
if (is_ascii(code_point)) {
|
||||
// The only ID_Continue code points in the ASCII range are the letters, digits and '_'; '$' is additionally allowed by IdentifierPartChar.
|
||||
if (!is_ascii_alphanumeric(code_point) && code_point != '$' && code_point != '_') {
|
||||
set_error(Error::InvalidNameForCaptureGroup);
|
||||
return {};
|
||||
}
|
||||
} else if (code_point != ZERO_WIDTH_JOINER && code_point != ZERO_WIDTH_NON_JOINER) {
|
||||
if (id_continue_category.has_value() && !Unicode::code_point_has_property(code_point, *id_continue_category)) {
|
||||
set_error(Error::InvalidNameForCaptureGroup);
|
||||
return {};
|
||||
}
|
||||
}
|
||||
builder.append_code_point(code_point);
|
||||
}
|
||||
|
||||
FlyString name = builder.build();
|
||||
if (!consume(">") || name.is_empty())
|
||||
if (!hit_end || name.is_empty())
|
||||
set_error(Error::InvalidNameForCaptureGroup);
|
||||
|
||||
return name;
|
||||
|
|
Loading…
Reference in a new issue