LibRegex: Add basic support for Unicode escapes in ECMA262Parser

This parses Unicode escapes (and matches them only for UTF-8 strings).
Author: https://github.com/alimpfard Commit: https://github.com/SerenityOS/serenity/commit/765d2977bcc Pull-request: https://github.com/SerenityOS/serenity/pull/4333
2024-11-21 23:20:20 +00:00 · 2020-12-06 17:04:28 +03:30 · 2020-12-06 17:04:28 +03:30 · 765d2977bc · 2024-07-19 01:01:54 +09:00
commit 765d2977bc
parent 86811683b0
2 changed files with 31 additions and 4 deletions
--- a/Libraries/LibRegex/RegexParser.cpp
+++ b/Libraries/LibRegex/RegexParser.cpp
@ -1066,8 +1066,24 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
    }

    if (try_skip("u")) {
-        // FIXME: Implement this path, unicode escape sequence.
-        TODO();
+        if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 4); code_point.has_value()) {
+            // FIXME: The minimum length depends on the mode - should be utf8-length in u8 mode.
+            match_length_minimum += 1;
+            StringBuilder builder;
+            builder.append_code_point(code_point.value());
+            // FIXME: This isn't actually correct for ECMAScript.
+            auto u8_encoded = builder.string_view();
+            stack.insert_bytecode_compare_string(u8_encoded);
+            return true;
+        } else if (!unicode) {
+            // '\u' is allowed in non-unicode mode, just matches 'u'.
+            match_length_minimum += 1;
+            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'u' } });
+            return true;
+        } else {
+            set_error(Error::InvalidPattern);
+            return false;
+        }
    }

    // IdentityEscape
@ -1261,8 +1277,16 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
            }

            if (try_skip("u")) {
-                // FIXME: Implement this path, unicode escape sequence.
-                TODO();
+                if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 4); code_point.has_value()) {
+                    // FIXME: While codepoint ranges are supported, codepoint matches as "Char" are not!
+                    return { { .code_point = code_point.value(), .is_character_class = false } };
+                } else if (!unicode) {
+                    // '\u' is allowed in non-unicode mode, just matches 'u'.
+                    return { { .code_point = 'u', .is_character_class = false } };
+                } else {
+                    set_error(Error::InvalidPattern);
+                    return {};
+                }
            }

            if (unicode) {
--- a/Libraries/LibRegex/Tests/Regex.cpp
+++ b/Libraries/LibRegex/Tests/Regex.cpp
@ -477,6 +477,7 @@ TEST_CASE(ECMA262_parse)
    struct _test {
        const char* pattern;
        regex::Error expected_error { regex::Error::NoError };
+        regex::ECMAScriptFlags flags {};
    };

    constexpr _test tests[] {
@ -497,6 +498,8 @@ TEST_CASE(ECMA262_parse)
        { "\\x" },                                         // Even invalid escapes are allowed if ~unicode.
        { "\\", regex::Error::InvalidTrailingEscape },
        { "(?", regex::Error::InvalidCaptureGroup },
+        { "\\u1234", regex::Error::NoError, regex::ECMAScriptFlags::Unicode },
+        { "[\\u1234]", regex::Error::NoError, regex::ECMAScriptFlags::Unicode },
    };

    for (auto& test : tests) {