mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-12-02 04:20:28 +00:00
LibJS: Hook up the 'v' (unicodeSets) RegExp flag
This commit is contained in:
parent
598dc74a76
commit
f4b26b0cea
Notes:
sideshowbarker
2024-07-17 17:06:59 +09:00
Author: https://github.com/alimpfard Commit: https://github.com/SerenityOS/serenity/commit/f4b26b0cea Pull-request: https://github.com/SerenityOS/serenity/pull/14592 Reviewed-by: https://github.com/linusg ✅
8 changed files with 80 additions and 30 deletions
|
@ -124,13 +124,14 @@
|
|||
__JS_ENUMERATE(toPrimitive, to_primitive) \
|
||||
__JS_ENUMERATE(toStringTag, to_string_tag)
|
||||
|
||||
#define JS_ENUMERATE_REGEXP_FLAGS \
|
||||
__JS_ENUMERATE(hasIndices, has_indices, d) \
|
||||
__JS_ENUMERATE(global, global, g) \
|
||||
__JS_ENUMERATE(ignoreCase, ignore_case, i) \
|
||||
__JS_ENUMERATE(multiline, multiline, m) \
|
||||
__JS_ENUMERATE(dotAll, dot_all, s) \
|
||||
__JS_ENUMERATE(unicode, unicode, u) \
|
||||
// X-macro listing every RegExp flag as a (flagName, flag_name, flag_char) tuple:
// the JS property name, the snake_case C++ identifier, and the flag's character.
// Expansion sites define __JS_ENUMERATE(flagName, flag_name, flag_char) first.
// NOTE(review): unicodeSets ('v') is listed before unicode ('u'). If expansion order
// feeds the RegExp.prototype.flags getter, verify it against the current ECMA-262
// ordering of that getter (presumably 'u' before 'v') - TODO confirm.
#define JS_ENUMERATE_REGEXP_FLAGS \
|
||||
__JS_ENUMERATE(hasIndices, has_indices, d) \
|
||||
__JS_ENUMERATE(global, global, g) \
|
||||
__JS_ENUMERATE(ignoreCase, ignore_case, i) \
|
||||
__JS_ENUMERATE(multiline, multiline, m) \
|
||||
__JS_ENUMERATE(dotAll, dot_all, s) \
|
||||
__JS_ENUMERATE(unicodeSets, unicode_sets, v) \
|
||||
__JS_ENUMERATE(unicode, unicode, u) \
|
||||
__JS_ENUMERATE(sticky, sticky, y)
|
||||
|
||||
namespace JS {
|
||||
|
|
|
@ -1522,7 +1522,14 @@ NonnullRefPtr<RegExpLiteral> Parser::parse_regexp_literal()
|
|||
parsed_flags = parsed_flags_or_error.release_value();
|
||||
}
|
||||
|
||||
auto parsed_pattern = parse_regex_pattern(pattern, parsed_flags.has_flag_set(ECMAScriptFlags::Unicode));
|
||||
String parsed_pattern;
|
||||
auto parsed_pattern_result = parse_regex_pattern(pattern, parsed_flags.has_flag_set(ECMAScriptFlags::Unicode), parsed_flags.has_flag_set(ECMAScriptFlags::UnicodeSets));
|
||||
if (parsed_pattern_result.is_error()) {
|
||||
syntax_error(parsed_pattern_result.release_error().error, rule_start.position());
|
||||
parsed_pattern = String::empty();
|
||||
} else {
|
||||
parsed_pattern = parsed_pattern_result.release_value();
|
||||
}
|
||||
auto parsed_regex = Regex<ECMA262>::parse_pattern(parsed_pattern, parsed_flags);
|
||||
|
||||
if (parsed_regex.error != regex::Error::NoError)
|
||||
|
|
|
@ -526,11 +526,12 @@ namespace JS {
|
|||
P(undefined) \
|
||||
P(unescape) \
|
||||
P(unicode) \
|
||||
P(unicodeSets) \
|
||||
P(unit) \
|
||||
P(unitDisplay) \
|
||||
P(until) \
|
||||
P(unregister) \
|
||||
P(unshift) \
|
||||
P(until) \
|
||||
P(usage) \
|
||||
P(useGrouping) \
|
||||
P(value) \
|
||||
|
|
|
@ -209,6 +209,7 @@
|
|||
M(RegExpCompileError, "RegExp compile error: {}") \
|
||||
M(RegExpObjectBadFlag, "Invalid RegExp flag '{}'") \
|
||||
M(RegExpObjectRepeatedFlag, "Repeated RegExp flag '{}'") \
|
||||
M(RegExpObjectIncompatibleFlags, "RegExp flag '{}' is incompatible with flag '{}'") \
|
||||
M(RestrictedFunctionPropertiesAccess, "Restricted function properties like 'callee', 'caller' and 'arguments' may " \
|
||||
"not be accessed in strict mode") \
|
||||
M(RestrictedGlobalProperty, "Cannot declare global property '{}'") \
|
||||
|
|
|
@ -16,7 +16,7 @@ namespace JS {
|
|||
|
||||
Result<regex::RegexOptions<ECMAScriptFlags>, String> regex_flags_from_string(StringView flags)
|
||||
{
|
||||
bool d = false, g = false, i = false, m = false, s = false, u = false, y = false;
|
||||
bool d = false, g = false, i = false, m = false, s = false, u = false, y = false, v = false;
|
||||
auto options = RegExpObject::default_flags;
|
||||
|
||||
for (auto ch : flags) {
|
||||
|
@ -68,6 +68,12 @@ Result<regex::RegexOptions<ECMAScriptFlags>, String> regex_flags_from_string(Str
|
|||
options |= (regex::ECMAScriptFlags)regex::AllFlags::Internal_Stateful;
|
||||
options |= regex::ECMAScriptFlags::Sticky;
|
||||
break;
|
||||
case 'v':
|
||||
if (v)
|
||||
return String::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), ch);
|
||||
v = true;
|
||||
options |= regex::ECMAScriptFlags::UnicodeSets;
|
||||
break;
|
||||
default:
|
||||
return String::formatted(ErrorType::RegExpObjectBadFlag.message(), ch);
|
||||
}
|
||||
|
@ -76,8 +82,11 @@ Result<regex::RegexOptions<ECMAScriptFlags>, String> regex_flags_from_string(Str
|
|||
return options;
|
||||
}
|
||||
|
||||
String parse_regex_pattern(StringView pattern, bool unicode)
|
||||
ErrorOr<String, ParseRegexPatternError> parse_regex_pattern(StringView pattern, bool unicode, bool unicode_sets)
|
||||
{
|
||||
if (unicode && unicode_sets)
|
||||
return ParseRegexPatternError { String::formatted(ErrorType::RegExpObjectIncompatibleFlags.message(), 'u', 'v') };
|
||||
|
||||
auto utf16_pattern = AK::utf8_to_utf16(pattern);
|
||||
Utf16View utf16_pattern_view { utf16_pattern };
|
||||
StringBuilder builder;
|
||||
|
@ -85,7 +94,7 @@ String parse_regex_pattern(StringView pattern, bool unicode)
|
|||
// If the Unicode flag is set, append each code point to the pattern. Otherwise, append each
|
||||
// code unit. But unlike the spec, multi-byte code units must be escaped for LibRegex to parse.
|
||||
for (size_t i = 0; i < utf16_pattern_view.length_in_code_units();) {
|
||||
if (unicode) {
|
||||
if (unicode || unicode_sets) {
|
||||
auto code_point = code_point_at(utf16_pattern_view, i);
|
||||
builder.append_code_point(code_point.code_point);
|
||||
i += code_point.code_unit_count;
|
||||
|
@ -104,6 +113,15 @@ String parse_regex_pattern(StringView pattern, bool unicode)
|
|||
return builder.build();
|
||||
}
|
||||
|
||||
// Throwing overload of parse_regex_pattern(): forwards to the
// (pattern, unicode, unicode_sets) overload and converts a reported error into a
// JS SyntaxError thrown through `vm`; on success the parsed pattern string is returned.
ThrowCompletionOr<String> parse_regex_pattern(StringView pattern, VM& vm, GlobalObject& global_object, bool unicode, bool unicode_sets)
|
||||
{
|
||||
auto result = parse_regex_pattern(pattern, unicode, unicode_sets);
|
||||
if (result.is_error())
|
||||
// The error's message string becomes the SyntaxError message.
return vm.throw_completion<JS::SyntaxError>(global_object, result.release_error().error);
|
||||
|
||||
return result.release_value();
|
||||
}
|
||||
|
||||
RegExpObject* RegExpObject::create(GlobalObject& global_object)
|
||||
{
|
||||
return global_object.heap().allocate<RegExpObject>(global_object, *global_object.regexp_prototype());
|
||||
|
@ -156,7 +174,8 @@ ThrowCompletionOr<RegExpObject*> RegExpObject::regexp_initialize(GlobalObject& g
|
|||
} else {
|
||||
original_pattern = TRY(pattern.to_string(global_object));
|
||||
bool unicode = f.find('u').has_value();
|
||||
parsed_pattern = parse_regex_pattern(original_pattern, unicode);
|
||||
bool unicode_sets = f.find('v').has_value();
|
||||
parsed_pattern = TRY(parse_regex_pattern(original_pattern, vm, global_object, unicode, unicode_sets));
|
||||
}
|
||||
|
||||
auto parsed_flags_or_error = regex_flags_from_string(f);
|
||||
|
@ -181,7 +200,7 @@ String RegExpObject::escape_regexp_pattern() const
|
|||
{
|
||||
if (m_pattern.is_empty())
|
||||
return "(?:)";
|
||||
// FIXME: Check u flag and escape accordingly
|
||||
// FIXME: Check the 'u' and 'v' flags and escape accordingly
|
||||
return m_pattern.replace("\n"sv, "\\n"sv, ReplaceMode::All).replace("\r"sv, "\\r"sv, ReplaceMode::All).replace(LINE_SEPARATOR_STRING, "\\u2028"sv, ReplaceMode::All).replace(PARAGRAPH_SEPARATOR_STRING, "\\u2029"sv, ReplaceMode::All).replace("/"sv, "\\/"sv, ReplaceMode::All);
|
||||
}
|
||||
|
||||
|
|
|
@ -17,7 +17,11 @@ namespace JS {
|
|||
ThrowCompletionOr<RegExpObject*> regexp_create(GlobalObject&, Value pattern, Value flags);
|
||||
|
||||
Result<regex::RegexOptions<ECMAScriptFlags>, String> regex_flags_from_string(StringView flags);
|
||||
String parse_regex_pattern(StringView pattern, bool unicode);
|
||||
// Error type carried by the ErrorOr-returning parse_regex_pattern() overload.
struct ParseRegexPatternError {
|
||||
// Formatted error message; callers pass it on to their own error reporting
// (a parser syntax error or a thrown JS SyntaxError).
String error;
|
||||
};
|
||||
ErrorOr<String, ParseRegexPatternError> parse_regex_pattern(StringView pattern, bool unicode, bool unicode_sets);
|
||||
ThrowCompletionOr<String> parse_regex_pattern(StringView pattern, VM& vm, GlobalObject& global_object, bool unicode, bool unicode_sets);
|
||||
|
||||
class RegExpObject : public Object {
|
||||
JS_OBJECT(RegExpObject, Object);
|
||||
|
|
|
@ -410,6 +410,7 @@ size_t advance_string_index(Utf16View const& string, size_t index, bool unicode)
|
|||
// 22.2.5.10 get RegExp.prototype.multiline, https://tc39.es/ecma262/#sec-get-regexp.prototype.multiline
|
||||
// 22.2.5.15 get RegExp.prototype.sticky, https://tc39.es/ecma262/#sec-get-regexp.prototype.sticky
|
||||
// 22.2.5.18 get RegExp.prototype.unicode, https://tc39.es/ecma262/#sec-get-regexp.prototype.unicode
|
||||
// 22.2.5.18 get RegExp.prototype.unicodeSets, https://arai-a.github.io/ecma262-compare/?pr=2418&id=sec-get-regexp.prototype.unicodeSets
|
||||
#define __JS_ENUMERATE(flagName, flag_name, flag_char) \
|
||||
JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::flag_name) \
|
||||
{ \
|
||||
|
@ -467,10 +468,12 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::flags)
|
|||
// 11. If multiline is true, append the code unit 0x006D (LATIN SMALL LETTER M) as the last code unit of result.
|
||||
// 12. Let dotAll be ToBoolean(? Get(R, "dotAll")).
|
||||
// 13. If dotAll is true, append the code unit 0x0073 (LATIN SMALL LETTER S) as the last code unit of result.
|
||||
// 14. Let unicode be ToBoolean(? Get(R, "unicode")).
|
||||
// 15. If unicode is true, append the code unit 0x0075 (LATIN SMALL LETTER U) as the last code unit of result.
|
||||
// 16. Let sticky be ToBoolean(? Get(R, "sticky")).
|
||||
// 17. If sticky is true, append the code unit 0x0079 (LATIN SMALL LETTER Y) as the last code unit of result.
|
||||
// 14. Let unicodeSets be ! ToBoolean(? Get(R, "unicodeSets")).
|
||||
// 15. If unicodeSets is true, append the code unit 0x0076 (LATIN SMALL LETTER V) as the last code unit of result.
|
||||
// 16. Let unicode be ToBoolean(? Get(R, "unicode")).
|
||||
// 17. If unicode is true, append the code unit 0x0075 (LATIN SMALL LETTER U) as the last code unit of result.
|
||||
// 18. Let sticky be ToBoolean(? Get(R, "sticky")).
|
||||
// 19. If sticky is true, append the code unit 0x0079 (LATIN SMALL LETTER Y) as the last code unit of result.
|
||||
#define __JS_ENUMERATE(flagName, flag_name, flag_char) \
|
||||
auto flag_##flag_name = TRY(regexp_object->get(vm.names.flagName)); \
|
||||
if (flag_##flag_name.to_boolean()) \
|
||||
|
@ -483,6 +486,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::flags)
|
|||
}
|
||||
|
||||
// 22.2.5.8 RegExp.prototype [ @@match ] ( string ), https://tc39.es/ecma262/#sec-regexp.prototype-@@match
|
||||
// With changes from https://arai-a.github.io/ecma262-compare/?pr=2418&id=sec-regexp.prototype-%2540%2540match
|
||||
JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_match)
|
||||
{
|
||||
// 1. Let rx be the this value.
|
||||
|
@ -504,19 +508,23 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_match)
|
|||
// 6. Else,
|
||||
// a. Assert: global is true.
|
||||
|
||||
// b. Let fullUnicode be ToBoolean(? Get(rx, "unicode")).
|
||||
bool full_unicode = TRY(regexp_object->get(vm.names.unicode)).to_boolean();
|
||||
// b. Let fullUnicode be ToBoolean(? Get(rx, "unicodeSets")).
|
||||
bool full_unicode = TRY(regexp_object->get(vm.names.unicodeSets)).to_boolean();
|
||||
|
||||
// c. Perform ? Set(rx, "lastIndex", +0𝔽, true).
|
||||
// c. If fullUnicode is false, set fullUnicode to ! ToBoolean(? Get(rx, "unicode")).
|
||||
if (!full_unicode)
|
||||
full_unicode = TRY(regexp_object->get(vm.names.unicode)).to_boolean();
|
||||
|
||||
// d. Perform ? Set(rx, "lastIndex", +0𝔽, true).
|
||||
TRY(regexp_object->set(vm.names.lastIndex, Value(0), Object::ShouldThrowExceptions::Yes));
|
||||
|
||||
// d. Let A be ! ArrayCreate(0).
|
||||
// e. Let A be ! ArrayCreate(0).
|
||||
auto* array = MUST(Array::create(global_object, 0));
|
||||
|
||||
// e. Let n be 0.
|
||||
// f. Let n be 0.
|
||||
size_t n = 0;
|
||||
|
||||
// f. Repeat,
|
||||
// g. Repeat,
|
||||
while (true) {
|
||||
// i. Let result be ? RegExpExec(rx, S).
|
||||
auto result = TRY(regexp_exec(global_object, *regexp_object, string));
|
||||
|
@ -552,6 +560,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_match)
|
|||
}
|
||||
|
||||
// 22.2.5.9 RegExp.prototype [ @@matchAll ] ( string ), https://tc39.es/ecma262/#sec-regexp-prototype-matchall
|
||||
// With changes from https://arai-a.github.io/ecma262-compare/?pr=2418&id=sec-regexp-prototype-matchall
|
||||
JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_match_all)
|
||||
{
|
||||
// 1. Let R be the this value.
|
||||
|
@ -576,7 +585,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_match_all)
|
|||
|
||||
// 11. If flags contains "u", let fullUnicode be true.
|
||||
// 12. Else, let fullUnicode be false.
|
||||
bool full_unicode = flags.contains('u');
|
||||
bool full_unicode = flags.contains('u') || flags.contains('v');
|
||||
|
||||
// 6. Let matcher be ? Construct(C, « R, flags »).
|
||||
auto* matcher = TRY(construct(global_object, *constructor, regexp_object, js_string(vm, move(flags))));
|
||||
|
@ -593,6 +602,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_match_all)
|
|||
}
|
||||
|
||||
// 22.2.5.11 RegExp.prototype [ @@replace ] ( string, replaceValue ), https://tc39.es/ecma262/#sec-regexp.prototype-@@replace
|
||||
// With changes from https://arai-a.github.io/ecma262-compare/?pr=2418&id=sec-regexp.prototype-@@replace
|
||||
JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
|
||||
{
|
||||
auto string_value = vm.argument(0);
|
||||
|
@ -621,10 +631,14 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
|
|||
|
||||
// 8. If global is true, then
|
||||
if (global) {
|
||||
// a. Let fullUnicode be ToBoolean(? Get(rx, "unicode")).
|
||||
full_unicode = TRY(regexp_object->get(vm.names.unicode)).to_boolean();
|
||||
// a. Let fullUnicode be ToBoolean(? Get(rx, "unicodeSets")).
|
||||
full_unicode = TRY(regexp_object->get(vm.names.unicodeSets)).to_boolean();
|
||||
|
||||
// b. Perform ? Set(rx, "lastIndex", +0𝔽, true).
|
||||
// b. If fullUnicode is false, set fullUnicode to ! ToBoolean(? Get(rx, "unicode")).
|
||||
if (!full_unicode)
|
||||
full_unicode = TRY(regexp_object->get(vm.names.unicode)).to_boolean();
|
||||
|
||||
// c. Perform ? Set(rx, "lastIndex", +0𝔽, true).
|
||||
TRY(regexp_object->set(vm.names.lastIndex, Value(0), Object::ShouldThrowExceptions::Yes));
|
||||
}
|
||||
|
||||
|
@ -863,7 +877,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_split)
|
|||
|
||||
// 6. If flags contains "u", let unicodeMatching be true.
|
||||
// 7. Else, let unicodeMatching be false.
|
||||
bool unicode_matching = flags.find('u').has_value();
|
||||
bool unicode_matching = flags.contains('u') || flags.contains('v');
|
||||
|
||||
// 8. If flags contains "y", let newFlags be flags.
|
||||
// 9. Else, let newFlags be the string-concatenation of flags and "y".
|
||||
|
|
|
@ -5,8 +5,11 @@ test("basic functionality", () => {
|
|||
expect(/foo/i.flags).toBe("i");
|
||||
expect(/foo/m.flags).toBe("m");
|
||||
expect(/foo/s.flags).toBe("s");
|
||||
expect(/foo/v.flags).toBe("v");
|
||||
expect(/foo/u.flags).toBe("u");
|
||||
expect(/foo/y.flags).toBe("y");
|
||||
// prettier-ignore
|
||||
expect(/foo/dsgimyu.flags).toBe("dgimsuy");
|
||||
// prettier-ignore
|
||||
expect(/foo/dgimsvy.flags).toBe("dgimsvy");
|
||||
});
|
||||
|
|
Loading…
Reference in a new issue