mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-21 23:20:20 +00:00
LibRegex: Add basic support for unicode escapes in ECMA262Parser
This parses Unicode escapes (and matches them only for UTF-8 strings).
This commit is contained in:
parent
86811683b0
commit
765d2977bc
Notes:
sideshowbarker
2024-07-19 01:01:54 +09:00
Author: https://github.com/alimpfard Commit: https://github.com/SerenityOS/serenity/commit/765d2977bcc Pull-request: https://github.com/SerenityOS/serenity/pull/4333
2 changed files with 31 additions and 4 deletions
|
@ -1066,8 +1066,24 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
|
|||
}
|
||||
|
||||
if (try_skip("u")) {
|
||||
// FIXME: Implement this path, unicode escape sequence.
|
||||
TODO();
|
||||
if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 4); code_point.has_value()) {
|
||||
// FIXME: The minimum length depends on the mode - should be utf8-length in u8 mode.
|
||||
match_length_minimum += 1;
|
||||
StringBuilder builder;
|
||||
builder.append_code_point(code_point.value());
|
||||
// FIXME: This isn't actually correct for ECMAScript.
|
||||
auto u8_encoded = builder.string_view();
|
||||
stack.insert_bytecode_compare_string(u8_encoded);
|
||||
return true;
|
||||
} else if (!unicode) {
|
||||
// '\u' is allowed in non-unicode mode, just matches 'u'.
|
||||
match_length_minimum += 1;
|
||||
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'u' } });
|
||||
return true;
|
||||
} else {
|
||||
set_error(Error::InvalidPattern);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// IdentityEscape
|
||||
|
@ -1261,8 +1277,16 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
|
|||
}
|
||||
|
||||
if (try_skip("u")) {
|
||||
// FIXME: Implement this path, unicode escape sequence.
|
||||
TODO();
|
||||
if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 4); code_point.has_value()) {
|
||||
// FIXME: While codepoint ranges are supported, codepoint matches as "Char" are not!
|
||||
return { { .code_point = code_point.value(), .is_character_class = false } };
|
||||
} else if (!unicode) {
|
||||
// '\u' is allowed in non-unicode mode, just matches 'u'.
|
||||
return { { .code_point = 'u', .is_character_class = false } };
|
||||
} else {
|
||||
set_error(Error::InvalidPattern);
|
||||
return {};
|
||||
}
|
||||
}
|
||||
|
||||
if (unicode) {
|
||||
|
|
|
@ -477,6 +477,7 @@ TEST_CASE(ECMA262_parse)
|
|||
struct _test {
|
||||
const char* pattern;
|
||||
regex::Error expected_error { regex::Error::NoError };
|
||||
regex::ECMAScriptFlags flags {};
|
||||
};
|
||||
|
||||
constexpr _test tests[] {
|
||||
|
@ -497,6 +498,8 @@ TEST_CASE(ECMA262_parse)
|
|||
{ "\\x" }, // Even invalid escapes are allowed if ~unicode.
|
||||
{ "\\", regex::Error::InvalidTrailingEscape },
|
||||
{ "(?", regex::Error::InvalidCaptureGroup },
|
||||
{ "\\u1234", regex::Error::NoError, regex::ECMAScriptFlags::Unicode },
|
||||
{ "[\\u1234]", regex::Error::NoError, regex::ECMAScriptFlags::Unicode },
|
||||
};
|
||||
|
||||
for (auto& test : tests) {
|
||||
|
|
Loading…
Reference in a new issue