ladybird/Userland/Libraries/LibJS/Runtime/RegExpObject.cpp
Ali Mohammad Pur 5e1499d104 Everywhere: Rename {Deprecated => Byte}String
This commit un-deprecates DeprecatedString, and repurposes it as a byte
string.
As the null state has already been removed, there are no other
particularly hairy blockers in repurposing this type as a byte string
(what it _really_ is).

This commit is auto-generated:
  $ xs=$(ack -l \bDeprecatedString\b\|deprecated_string AK Userland \
    Meta Ports Ladybird Tests Kernel)
  $ perl -pie 's/\bDeprecatedString\b/ByteString/g;
    s/deprecated_string/byte_string/g' $xs
  $ clang-format --style=file -i \
    $(git diff --name-only | grep \.cpp\|\.h)
  $ gn format $(git ls-files '*.gn' '*.gni')
2023-12-17 18:25:10 +03:30

350 lines
15 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Copyright (c) 2020, Matthew Olsson <mattco@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Function.h>
#include <LibJS/Runtime/AbstractOperations.h>
#include <LibJS/Runtime/GlobalObject.h>
#include <LibJS/Runtime/PrimitiveString.h>
#include <LibJS/Runtime/RegExpConstructor.h>
#include <LibJS/Runtime/RegExpObject.h>
#include <LibJS/Runtime/StringPrototype.h>
#include <LibJS/Runtime/Value.h>
#include <LibJS/Token.h>
namespace JS {
JS_DEFINE_ALLOCATOR(RegExpObject);
// Translates a RegExp flags string (for example "gi") into LibRegex option bits.
// Starts from RegExpObject::default_flags and processes the flags left to right.
// Returns an error message (instead of options) when a flag occurs more than once or is not one of
// 'd', 'g', 'i', 'm', 's', 'u', 'y', 'v'.
// NOTE(review): 'g' sets Global and 'y' clears it, so the resulting option set depends on the order in
// which those two flags appear in the string — confirm this is intended.
Result<regex::RegexOptions<ECMAScriptFlags>, ByteString> regex_flags_from_string(StringView flags)
{
    // All "flag given twice" errors share one message format.
    auto repeated_flag_error = [](char flag) {
        return ByteString::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), flag);
    };

    bool seen_d = false;
    bool seen_g = false;
    bool seen_i = false;
    bool seen_m = false;
    bool seen_s = false;
    bool seen_u = false;
    bool seen_y = false;
    bool seen_v = false;

    auto options = RegExpObject::default_flags;

    for (auto flag : flags) {
        switch (flag) {
        case 'd':
            // 'd' is only tracked for duplicate detection; it does not change the LibRegex options here.
            if (seen_d)
                return repeated_flag_error(flag);
            seen_d = true;
            break;
        case 'g':
            if (seen_g)
                return repeated_flag_error(flag);
            seen_g = true;
            options |= regex::ECMAScriptFlags::Global;
            break;
        case 'i':
            if (seen_i)
                return repeated_flag_error(flag);
            seen_i = true;
            options |= regex::ECMAScriptFlags::Insensitive;
            break;
        case 'm':
            if (seen_m)
                return repeated_flag_error(flag);
            seen_m = true;
            options |= regex::ECMAScriptFlags::Multiline;
            break;
        case 's':
            if (seen_s)
                return repeated_flag_error(flag);
            seen_s = true;
            options |= regex::ECMAScriptFlags::SingleLine;
            break;
        case 'u':
            if (seen_u)
                return repeated_flag_error(flag);
            seen_u = true;
            options |= regex::ECMAScriptFlags::Unicode;
            break;
        case 'y':
            if (seen_y)
                return repeated_flag_error(flag);
            seen_y = true;
            // Sticky is the odd one out: it clears Global rather than implying it.
            // Sticky matching still needs to be stateful, so Internal_Stateful is turned on explicitly
            // alongside the Sticky bit itself.
            options.reset_flag(regex::ECMAScriptFlags::Global);
            options |= static_cast<regex::ECMAScriptFlags>(regex::AllFlags::Internal_Stateful);
            options |= regex::ECMAScriptFlags::Sticky;
            break;
        case 'v':
            if (seen_v)
                return repeated_flag_error(flag);
            seen_v = true;
            options |= regex::ECMAScriptFlags::UnicodeSets;
            break;
        default:
            return ByteString::formatted(ErrorType::RegExpObjectBadFlag.message(), flag);
        }
    }

    return options;
}
// 22.2.3.4 Static Semantics: ParsePattern ( patternText, u, v ), https://tc39.es/ecma262/#sec-parsepattern
// Rewrites the UTF-8 `pattern` into the byte string that is handed to LibRegex.
// Returns a ParseRegexPatternError when both `unicode` and `unicode_sets` are set, or when the
// UTF-8 to UTF-16 conversion fails.
ErrorOr<ByteString, ParseRegexPatternError> parse_regex_pattern(StringView pattern, bool unicode, bool unicode_sets)
{
    // The 'u' and 'v' flags cannot be combined.
    if (unicode && unicode_sets)
        return ParseRegexPatternError { ByteString::formatted(ErrorType::RegExpObjectIncompatibleFlags.message(), 'u', 'v') };

    auto utf16_or_error = AK::utf8_to_utf16(pattern);
    if (utf16_or_error.is_error())
        return ParseRegexPatternError { "Out of memory"sv };

    auto utf16_data = utf16_or_error.release_value();
    Utf16View utf16_view { utf16_data };
    auto const code_unit_count = utf16_view.length_in_code_units();

    StringBuilder builder;

    // With either Unicode flag set, the pattern is copied over one whole code point at a time.
    if (unicode || unicode_sets) {
        size_t index = 0;
        while (index < code_unit_count) {
            auto decoded = code_point_at(utf16_view, index);
            builder.append_code_point(decoded.code_point);
            index += decoded.code_unit_count;
        }
        return builder.to_byte_string();
    }

    // Without a Unicode flag, the pattern is copied over one code unit at a time. Unlike the spec,
    // code units above 0x7f have to be written as \uhhhh escapes for LibRegex to parse them.
    bool pending_backslash = false;
    for (size_t index = 0; index < code_unit_count; ++index) {
        u16 const code_unit = utf16_view.code_unit_at(index);

        if (code_unit <= 0x7f) {
            builder.append_code_point(code_unit);
        } else {
            // If the pattern already put an (invalid) unescaped backslash in front of this code unit, reuse it
            // as the backslash of the \uhhhh escape. Adding another one would produce <\\uhhhh>, which matches
            // the literal text "\uhhhh" rather than the intended code unit.
            if (!pending_backslash)
                builder.append('\\');
            builder.appendff("u{:04x}", code_unit);
        }

        // A backslash is only "pending" when it is not itself escaped by a preceding backslash.
        pending_backslash = (code_unit == '\\') && !pending_backslash;
    }

    return builder.to_byte_string();
}
// 22.2.3.4 Static Semantics: ParsePattern ( patternText, u, v ), https://tc39.es/ecma262/#sec-parsepattern
// VM-aware wrapper around the overload above: a parse error is rethrown as a JS SyntaxError.
ThrowCompletionOr<ByteString> parse_regex_pattern(VM& vm, StringView pattern, bool unicode, bool unicode_sets)
{
    auto parsed_or_error = parse_regex_pattern(pattern, unicode, unicode_sets);
    if (!parsed_or_error.is_error())
        return parsed_or_error.release_value();

    return vm.throw_completion<JS::SyntaxError>(parsed_or_error.release_error().error);
}
// Allocates a RegExpObject on the realm's heap, using the realm's intrinsic RegExp prototype.
// No pattern, flags or compiled regex are supplied.
NonnullGCPtr<RegExpObject> RegExpObject::create(Realm& realm)
{
    auto prototype = realm.intrinsics().regexp_prototype();
    return realm.heap().allocate<RegExpObject>(realm, prototype);
}
// Allocates a RegExpObject on the realm's heap from an already-compiled regex together with its
// source pattern and flags, using the realm's intrinsic RegExp prototype.
NonnullGCPtr<RegExpObject> RegExpObject::create(Realm& realm, Regex<ECMA262> regex, ByteString pattern, ByteString flags)
{
    auto prototype = realm.intrinsics().regexp_prototype();
    return realm.heap().allocate<RegExpObject>(realm, move(regex), move(pattern), move(flags), prototype);
}
// Constructs a RegExpObject with the given prototype.
// This overload does not set the pattern, flags or compiled regex members.
RegExpObject::RegExpObject(Object& prototype)
: Object(ConstructWithPrototypeTag::Tag, prototype)
{
}
// Constructs a RegExpObject with the given prototype from an already-compiled regex plus its source
// pattern and flags; all three are moved into the object.
RegExpObject::RegExpObject(Regex<ECMA262> regex, ByteString pattern, ByteString flags, Object& prototype)
: Object(ConstructWithPrototypeTag::Tag, prototype)
, m_pattern(move(pattern))
, m_flags(move(flags))
, m_regex(move(regex))
{
// The caller must hand over a regex whose parser reported no error.
VERIFY(m_regex->parser_result.error == regex::Error::NoError);
}
// Runs the base-class initialization, then defines the own "lastIndex" property.
void RegExpObject::initialize(Realm& realm)
{
    Base::initialize(realm);

    // "lastIndex" starts at 0 and carries only the Writable attribute.
    define_direct_property(vm().names.lastIndex, Value(0), Attribute::Writable);
}
// 22.2.3.3 RegExpInitialize ( obj, pattern, flags ), https://tc39.es/ecma262/#sec-regexpinitialize
// Converts `pattern_value` and `flags_value` to strings, validates the flags, compiles the pattern and,
// on success, stores the original pattern, the original flags and the compiled regex on this object
// before resetting "lastIndex" to 0.
// Throws a SyntaxError for invalid or repeated flags, for the combination of the 'u' and 'v' flags, and
// for a pattern that fails to compile. This object is left unmodified in all of those cases.
ThrowCompletionOr<NonnullGCPtr<RegExpObject>> RegExpObject::regexp_initialize(VM& vm, Value pattern_value, Value flags_value)
{
    // 1. If pattern is undefined, let P be the empty String.
    // 2. Else, let P be ? ToString(pattern).
    auto pattern = pattern_value.is_undefined()
        ? ByteString::empty()
        : TRY(pattern_value.to_byte_string(vm));

    // 3. If flags is undefined, let F be the empty String.
    // 4. Else, let F be ? ToString(flags).
    auto flags = flags_value.is_undefined()
        ? ByteString::empty()
        : TRY(flags_value.to_byte_string(vm));

    // 5. If F contains any code unit other than "d", "g", "i", "m", "s", "u", "v", or "y", or if F contains any code unit more than once, throw a SyntaxError exception.
    // 6. If F contains "i", let i be true; else let i be false.
    // 7. If F contains "m", let m be true; else let m be false.
    // 8. If F contains "s", let s be true; else let s be false.
    // 9. If F contains "u", let u be true; else let u be false.
    // 10. If F contains "v", let v be true; else let v be false.
    auto parsed_flags_or_error = regex_flags_from_string(flags);
    if (parsed_flags_or_error.is_error())
        return vm.throw_completion<SyntaxError>(parsed_flags_or_error.release_error());
    auto parsed_flags = parsed_flags_or_error.release_value();

    bool unicode = parsed_flags.has_flag_set(regex::ECMAScriptFlags::Unicode);
    bool unicode_sets = parsed_flags.has_flag_set(regex::ECMAScriptFlags::UnicodeSets);

    // ParsePattern rejects the combination of 'u' and 'v' regardless of the pattern text. The pattern is only
    // handed to parse_regex_pattern() below when it is non-empty, so perform the check here to make sure an
    // empty pattern with both flags is rejected as well.
    if (unicode && unicode_sets)
        return vm.throw_completion<SyntaxError>(ByteString::formatted(ErrorType::RegExpObjectIncompatibleFlags.message(), 'u', 'v'));

    auto parsed_pattern = ByteString::empty();
    if (!pattern.is_empty()) {
        // 11. If u is true or v is true, then
        // a. Let patternText be StringToCodePoints(P).
        // 12. Else,
        // a. Let patternText be the result of interpreting each of P's 16-bit elements as a Unicode BMP code point. UTF-16 decoding is not applied to the elements.
        // 13. Let parseResult be ParsePattern(patternText, u, v).
        parsed_pattern = TRY(parse_regex_pattern(vm, pattern, unicode, unicode_sets));
    }

    // 14. If parseResult is a non-empty List of SyntaxError objects, throw a SyntaxError exception.
    Regex<ECMA262> regex(move(parsed_pattern), parsed_flags);
    if (regex.parser_result.error != regex::Error::NoError)
        return vm.throw_completion<SyntaxError>(ErrorType::RegExpCompileError, regex.error_string());

    // 15. Assert: parseResult is a Pattern Parse Node.
    VERIFY(regex.parser_result.error == regex::Error::NoError);

    // 16. Set obj.[[OriginalSource]] to P.
    m_pattern = move(pattern);

    // 17. Set obj.[[OriginalFlags]] to F.
    m_flags = move(flags);

    // 18. Let capturingGroupsCount be CountLeftCapturingParensWithin(parseResult).
    // 19. Let rer be the RegExp Record { [[IgnoreCase]]: i, [[Multiline]]: m, [[DotAll]]: s, [[Unicode]]: u, [[CapturingGroupsCount]]: capturingGroupsCount }.
    // 20. Set obj.[[RegExpRecord]] to rer.
    // 21. Set obj.[[RegExpMatcher]] to CompilePattern of parseResult with argument rer.
    m_regex = move(regex);

    // 22. Perform ? Set(obj, "lastIndex", +0𝔽, true).
    TRY(set(vm.names.lastIndex, Value(0), Object::ShouldThrowExceptions::Yes));

    // 23. Return obj.
    return NonnullGCPtr { *this };
}
// 22.2.6.13.1 EscapeRegExpPattern ( P, F ), https://tc39.es/ecma262/#sec-escaperegexppattern
// Returns the stored pattern with '/' and line terminators escaped, so that it can be placed between
// two slashes. An empty pattern yields "(?:)".
ByteString RegExpObject::escape_regexp_pattern() const
{
    // 1. Let S be a String in the form of a Pattern[~UnicodeMode] (Pattern[+UnicodeMode] if F contains "u") equivalent
    //    to P interpreted as UTF-16 encoded Unicode code points (6.1.4), in which certain code points are escaped as
    //    described below. S may or may not be identical to P; however, the Abstract Closure that would result from
    //    evaluating S as a Pattern[~UnicodeMode] (Pattern[+UnicodeMode] if F contains "u") must behave identically to
    //    the Abstract Closure given by the constructed object's [[RegExpMatcher]] internal slot. Multiple calls to
    //    this abstract operation using the same values for P and F must produce identical results.
    // 2. The code points / or any LineTerminator occurring in the pattern shall be escaped in S as necessary to ensure
    //    that the string-concatenation of "/", S, "/", and F can be parsed (in an appropriate lexical context) as a
    //    RegularExpressionLiteral that behaves identically to the constructed regular expression. For example, if P is
    //    "/", then S could be "\/" or "\u002F", among other possibilities, but not "/", because /// followed by F
    //    would be parsed as a SingleLineComment rather than a RegularExpressionLiteral. If P is the empty String, this
    //    specification can be met by letting S be "(?:)".
    // 3. Return S.
    if (m_pattern.is_empty())
        return "(?:)";

    // FIXME: Check the 'u' and 'v' flags and escape accordingly
    StringBuilder builder;

    // Appends the escape sequence for `code_point` if it is a line terminator and reports whether it did so.
    auto append_line_terminator_escape = [&builder](u32 code_point) {
        switch (code_point) {
        case '\n':
            builder.append("\\n"sv);
            return true;
        case '\r':
            builder.append("\\r"sv);
            return true;
        case LINE_SEPARATOR:
            builder.append("\\u2028"sv);
            return true;
        case PARAGRAPH_SEPARATOR:
            builder.append("\\u2029"sv);
            return true;
        default:
            return false;
        }
    };

    auto pattern = Utf8View { m_pattern };
    auto escaped = false;
    for (auto code_point : pattern) {
        if (escaped) {
            escaped = false;

            // A backslash followed by a raw line terminator cannot appear inside a RegularExpressionLiteral.
            // Drop the backslash and emit the terminator's escape sequence instead, which matches the same
            // code point.
            if (append_line_terminator_escape(code_point))
                continue;

            // Any other escaped code point is copied through together with its backslash.
            builder.append_code_point('\\');
            builder.append_code_point(code_point);
            continue;
        }

        if (code_point == '\\') {
            // Defer the backslash until we know what it escapes.
            escaped = true;
            continue;
        }

        if (code_point == '/') {
            builder.append("\\/"sv);
            continue;
        }

        if (!append_line_terminator_escape(code_point))
            builder.append_code_point(code_point);
    }

    return builder.to_byte_string();
}
// 22.2.3.1 RegExpCreate ( P, F ), https://tc39.es/ecma262/#sec-regexpcreate
// Allocates a RegExp object with the current realm's %RegExp% constructor as newTarget and
// initializes it from `pattern` and `flags`.
ThrowCompletionOr<NonnullGCPtr<RegExpObject>> regexp_create(VM& vm, Value pattern, Value flags)
{
    auto constructor = vm.current_realm()->intrinsics().regexp_constructor();

    // 1. Let obj be ! RegExpAlloc(%RegExp%).
    auto object = MUST(regexp_alloc(vm, constructor));

    // 2. Return ? RegExpInitialize(obj, P, F).
    return TRY(object->regexp_initialize(vm, pattern, flags));
}
// 22.2.3.2 RegExpAlloc ( newTarget ), https://tc39.es/ecma262/#sec-regexpalloc
// 22.2.3.2 RegExpAlloc ( newTarget ), https://github.com/tc39/proposal-regexp-legacy-features#regexpalloc--newtarget-
// Creates an uninitialized RegExp object for `new_target`, records the current realm on it, enables
// legacy features only when `new_target` is the current realm's %RegExp% constructor, and defines
// its "lastIndex" property.
ThrowCompletionOr<NonnullGCPtr<RegExpObject>> regexp_alloc(VM& vm, FunctionObject& new_target)
{
    // 1. Let obj be ? OrdinaryCreateFromConstructor(newTarget, "%RegExp.prototype%", « [[OriginalSource]], [[OriginalFlags]], [[RegExpRecord]], [[RegExpMatcher]] »).
    auto object = TRY(ordinary_create_from_constructor<RegExpObject>(vm, new_target, &Intrinsics::regexp_prototype));

    // 2. Let thisRealm be the current Realm Record.
    auto& current_realm = *vm.current_realm();

    // 3. Set the value of obj's [[Realm]] internal slot to thisRealm.
    object->set_realm(current_realm);

    // 4. If SameValue(newTarget, thisRealm.[[Intrinsics]].[[%RegExp%]]) is true, then
    //     i. Set the value of obj's [[LegacyFeaturesEnabled]] internal slot to true.
    // 5. Else,
    //     i. Set the value of obj's [[LegacyFeaturesEnabled]] internal slot to false.
    bool const new_target_is_intrinsic_regexp = same_value(&new_target, current_realm.intrinsics().regexp_constructor());
    object->set_legacy_features_enabled(new_target_is_intrinsic_regexp);

    // 6. Perform ! DefinePropertyOrThrow(obj, "lastIndex", PropertyDescriptor { [[Writable]]: true, [[Enumerable]]: false, [[Configurable]]: false }).
    MUST(object->define_property_or_throw(vm.names.lastIndex, PropertyDescriptor { .writable = true, .enumerable = false, .configurable = false }));

    // 7. Return obj.
    return object;
}
}