Quellcode durchsuchen

LibRegex: Parse capture group names according to the ECMA262 spec

davidot vor 3 Jahren
Ursprung
Commit
154ed3994c
2 geänderte Dateien mit 94 neuen und 8 gelöschten Zeilen
  1. 6 0
      Tests/LibRegex/Regex.cpp
  2. 88 8
      Userland/Libraries/LibRegex/RegexParser.cpp

+ 6 - 0
Tests/LibRegex/Regex.cpp

@@ -593,6 +593,12 @@ TEST_CASE(ECMA262_parse)
         { "a{9007199254740992,9007199254740992}"sv, regex::Error::InvalidBraceContent },
         { "(?<a>a)(?<a>b)"sv, regex::Error::DuplicateNamedCapture },
         { "(?<a>a)(?<b>b)(?<a>c)"sv, regex::Error::DuplicateNamedCapture },
+        { "(?<1a>a)"sv, regex::Error::InvalidNameForCaptureGroup },
+        { "(?<\\a>a)"sv, regex::Error::InvalidNameForCaptureGroup },
+        { "(?<\ta>a)"sv, regex::Error::InvalidNameForCaptureGroup },
+        { "(?<$$_$$>a)"sv },
+        { "(?<ÿ>a)"sv },
+        { "(?<𝓑𝓻𝓸𝔀𝓷>a)"sv },
     };
 
     for (auto& test : tests) {

+ 88 - 8
Userland/Libraries/LibRegex/RegexParser.cpp

@@ -2036,29 +2036,109 @@ bool ECMA262Parser::parse_unicode_property_escape(PropertyEscape& property, bool
 
 FlyString ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
 {
+    static auto id_start_category = Unicode::property_from_string("ID_Start"sv);
+    static auto id_continue_category = Unicode::property_from_string("ID_Continue"sv);
+    static constexpr const u32 REPLACEMENT_CHARACTER = 0xFFFD;
+    constexpr const u32 ZERO_WIDTH_NON_JOINER { 0x200C };
+    constexpr const u32 ZERO_WIDTH_JOINER { 0x200D };
+
     if (take_starting_angle_bracket && !consume("<"))
         return {};
 
     StringBuilder builder;
+
+    auto consume_code_point = [&] {
+        Utf8View utf_8_view { m_parser_state.lexer.source().substring_view(m_parser_state.lexer.tell() - 1) };
+        if (utf_8_view.is_empty())
+            return REPLACEMENT_CHARACTER;
+        u32 code_point = *utf_8_view.begin();
+        auto characters = utf_8_view.byte_offset_of(1);
+
+        while (characters-- > 0)
+            consume();
+
+        return code_point;
+    };
+
+    {
+        // The first character is limited to: https://tc39.es/ecma262/#prod-RegExpIdentifierStart
+        //  RegExpIdentifierStart[UnicodeMode] ::
+        //      IdentifierStartChar
+        //      \ RegExpUnicodeEscapeSequence[+UnicodeMode]
+        //      [~UnicodeMode] UnicodeLeadSurrogate UnicodeTrailSurrogate
+
+        auto code_point = consume_code_point();
+
+        if (code_point == '\\' && match('u')) {
+            consume();
+
+            if (auto maybe_code_point = consume_escaped_code_point(true); maybe_code_point.has_value()) {
+                code_point = *maybe_code_point;
+            } else {
+                set_error(Error::InvalidNameForCaptureGroup);
+                return {};
+            }
+        }
+
+        if (is_ascii(code_point)) {
+            // The only valid ID_Start unicode characters in ascii are the letters.
+            if (!is_ascii_alpha(code_point) && code_point != '$' && code_point != '_') {
+                set_error(Error::InvalidNameForCaptureGroup);
+                return {};
+            }
+        } else if (id_start_category.has_value() && !Unicode::code_point_has_property(code_point, *id_start_category)) {
+            set_error(Error::InvalidNameForCaptureGroup);
+            return {};
+        }
+        builder.append_code_point(code_point);
+    }
+
+    bool hit_end = false;
+
+    // Any following characters are limited to:
+    //  RegExpIdentifierPart[UnicodeMode] ::
+    //      IdentifierPartChar
+    //      \ RegExpUnicodeEscapeSequence[+UnicodeMode]
+    //      [~UnicodeMode] UnicodeLeadSurrogate UnicodeTrailSurrogate
+
     while (match(TokenType::Char) || match(TokenType::Dollar) || match(TokenType::LeftCurly) || match(TokenType::RightCurly)) {
-        auto c = m_parser_state.current_token.value();
-        if (c == ">")
+        auto code_point = consume_code_point();
+
+        if (code_point == '>') {
+            hit_end = true;
             break;
+        }
 
-        if (try_skip("\\u"sv)) {
-            if (auto code_point = consume_escaped_code_point(true); code_point.has_value()) {
-                builder.append_code_point(*code_point);
+        if (code_point == '\\') {
+            if (!try_skip("u")) {
+                set_error(Error::InvalidNameForCaptureGroup);
+                return {};
+            }
+            if (auto maybe_code_point = consume_escaped_code_point(true); maybe_code_point.has_value()) {
+                code_point = *maybe_code_point;
             } else {
                 set_error(Error::InvalidNameForCaptureGroup);
                 return {};
             }
-        } else {
-            builder.append(consume().value());
         }
+
+        if (is_ascii(code_point)) {
+            // The only valid ID_Continue unicode characters in ascii are the letters and numbers.
+            if (!is_ascii_alphanumeric(code_point) && code_point != '$' && code_point != '_') {
+                set_error(Error::InvalidNameForCaptureGroup);
+                return {};
+            }
+        } else if (code_point != ZERO_WIDTH_JOINER && code_point != ZERO_WIDTH_NON_JOINER) {
+            if (id_continue_category.has_value() && !Unicode::code_point_has_property(code_point, *id_continue_category)) {
+                set_error(Error::InvalidNameForCaptureGroup);
+                return {};
+            }
+        }
+        builder.append_code_point(code_point);
     }
 
     FlyString name = builder.build();
-    if (!consume(">") || name.is_empty())
+    if (!hit_end || name.is_empty())
         set_error(Error::InvalidNameForCaptureGroup);
 
     return name;