
RegExpInitialize specifies how the pattern string should be created before passing it to [[RegExpMatcher]]. Rather than passing it as-is, the string should be converted to code points and back to a "List" (if the Unicode flag is present), or as a "List" of UTF-16 code units. Further. the spec requires that we keep both the original pattern string and this parsed string in the RegExp object. The caveat is that the LibRegex parser further requires any multi-byte code units to be escaped (as "\unnnn"). Otherwise, the code unit is recognized as individual UTF-8 bytes.
180 lines
6.9 KiB
C++
180 lines
6.9 KiB
C++
/*
|
|
* Copyright (c) 2020, Matthew Olsson <mattco@serenityos.org>
|
|
*
|
|
* SPDX-License-Identifier: BSD-2-Clause
|
|
*/
|
|
|
|
#include <AK/Function.h>
|
|
#include <LibJS/Heap/Heap.h>
|
|
#include <LibJS/Runtime/GlobalObject.h>
|
|
#include <LibJS/Runtime/PrimitiveString.h>
|
|
#include <LibJS/Runtime/RegExpObject.h>
|
|
#include <LibJS/Runtime/StringPrototype.h>
|
|
#include <LibJS/Runtime/Value.h>
|
|
|
|
namespace JS {
|
|
|
|
static Flags options_from(GlobalObject& global_object, const String& flags)
|
|
{
|
|
auto& vm = global_object.vm();
|
|
bool d = false, g = false, i = false, m = false, s = false, u = false, y = false;
|
|
Flags options {
|
|
// JS regexps are all 'global' by default as per our definition, but the "global" flag enables "stateful".
|
|
// FIXME: Enable 'BrowserExtended' only if in a browser context.
|
|
.effective_flags = { (regex::ECMAScriptFlags)regex::AllFlags::Global | (regex::ECMAScriptFlags)regex::AllFlags::SkipTrimEmptyMatches | regex::ECMAScriptFlags::BrowserExtended },
|
|
.declared_flags = {},
|
|
};
|
|
|
|
for (auto ch : flags) {
|
|
switch (ch) {
|
|
case 'd':
|
|
if (d)
|
|
vm.throw_exception<SyntaxError>(global_object, ErrorType::RegExpObjectRepeatedFlag, ch);
|
|
d = true;
|
|
break;
|
|
case 'g':
|
|
if (g)
|
|
vm.throw_exception<SyntaxError>(global_object, ErrorType::RegExpObjectRepeatedFlag, ch);
|
|
g = true;
|
|
options.effective_flags |= regex::ECMAScriptFlags::Global;
|
|
options.declared_flags |= regex::ECMAScriptFlags::Global;
|
|
break;
|
|
case 'i':
|
|
if (i)
|
|
vm.throw_exception<SyntaxError>(global_object, ErrorType::RegExpObjectRepeatedFlag, ch);
|
|
i = true;
|
|
options.effective_flags |= regex::ECMAScriptFlags::Insensitive;
|
|
options.declared_flags |= regex::ECMAScriptFlags::Insensitive;
|
|
break;
|
|
case 'm':
|
|
if (m)
|
|
vm.throw_exception<SyntaxError>(global_object, ErrorType::RegExpObjectRepeatedFlag, ch);
|
|
m = true;
|
|
options.effective_flags |= regex::ECMAScriptFlags::Multiline;
|
|
options.declared_flags |= regex::ECMAScriptFlags::Multiline;
|
|
break;
|
|
case 's':
|
|
if (s)
|
|
vm.throw_exception<SyntaxError>(global_object, ErrorType::RegExpObjectRepeatedFlag, ch);
|
|
s = true;
|
|
options.effective_flags |= regex::ECMAScriptFlags::SingleLine;
|
|
options.declared_flags |= regex::ECMAScriptFlags::SingleLine;
|
|
break;
|
|
case 'u':
|
|
if (u)
|
|
vm.throw_exception<SyntaxError>(global_object, ErrorType::RegExpObjectRepeatedFlag, ch);
|
|
u = true;
|
|
options.effective_flags |= regex::ECMAScriptFlags::Unicode;
|
|
options.declared_flags |= regex::ECMAScriptFlags::Unicode;
|
|
break;
|
|
case 'y':
|
|
if (y)
|
|
vm.throw_exception<SyntaxError>(global_object, ErrorType::RegExpObjectRepeatedFlag, ch);
|
|
y = true;
|
|
// Now for the more interesting flag, 'sticky' actually unsets 'global', part of which is the default.
|
|
options.effective_flags.reset_flag(regex::ECMAScriptFlags::Global);
|
|
// "What's the difference between sticky and global, then", that's simple.
|
|
// all the other flags imply 'global', and the "global" flag implies 'stateful';
|
|
// however, the "sticky" flag does *not* imply 'global', only 'stateful'.
|
|
options.effective_flags |= (regex::ECMAScriptFlags)regex::AllFlags::Internal_Stateful;
|
|
options.effective_flags |= regex::ECMAScriptFlags::Sticky;
|
|
options.declared_flags |= regex::ECMAScriptFlags::Sticky;
|
|
break;
|
|
default:
|
|
vm.throw_exception<SyntaxError>(global_object, ErrorType::RegExpObjectBadFlag, ch);
|
|
return options;
|
|
}
|
|
}
|
|
|
|
return options;
|
|
}
|
|
|
|
RegExpObject* RegExpObject::create(GlobalObject& global_object, String original_pattern, String parsed_pattern, String flags)
|
|
{
|
|
return global_object.heap().allocate<RegExpObject>(global_object, move(original_pattern), move(parsed_pattern), move(flags), *global_object.regexp_prototype());
|
|
}
|
|
|
|
RegExpObject::RegExpObject(String original_pattern, String parsed_pattern, String flags, Object& prototype)
|
|
: Object(prototype)
|
|
, m_original_pattern(move(original_pattern))
|
|
, m_parsed_pattern(move(parsed_pattern))
|
|
, m_flags(move(flags))
|
|
, m_active_flags(options_from(global_object(), m_flags))
|
|
, m_regex(m_parsed_pattern, m_active_flags.effective_flags)
|
|
{
|
|
if (m_regex.parser_result.error != regex::Error::NoError) {
|
|
vm().throw_exception<SyntaxError>(global_object(), ErrorType::RegExpCompileError, m_regex.error_string());
|
|
}
|
|
}
|
|
|
|
RegExpObject::~RegExpObject()
|
|
{
|
|
}
|
|
|
|
void RegExpObject::initialize(GlobalObject& global_object)
|
|
{
|
|
auto& vm = this->vm();
|
|
Object::initialize(global_object);
|
|
define_direct_property(vm.names.lastIndex, {}, Attribute::Writable);
|
|
}
|
|
|
|
// 22.2.3.2.4 RegExpCreate ( P, F ), https://tc39.es/ecma262/#sec-regexpcreate
|
|
RegExpObject* regexp_create(GlobalObject& global_object, Value pattern, Value flags)
|
|
{
|
|
auto& vm = global_object.vm();
|
|
|
|
String f;
|
|
if (flags.is_undefined()) {
|
|
f = String::empty();
|
|
} else {
|
|
f = flags.to_string(global_object);
|
|
if (vm.exception())
|
|
return {};
|
|
}
|
|
|
|
String original_pattern;
|
|
String parsed_pattern;
|
|
|
|
if (pattern.is_undefined()) {
|
|
original_pattern = String::empty();
|
|
parsed_pattern = String::empty();
|
|
} else {
|
|
auto utf16_pattern = pattern.to_utf16_string(global_object);
|
|
if (vm.exception())
|
|
return {};
|
|
|
|
Utf16View utf16_pattern_view { utf16_pattern };
|
|
bool unicode = f.find('u').has_value();
|
|
StringBuilder builder;
|
|
|
|
// If the Unicode flag is set, append each code point to the pattern. Otherwise, append each
|
|
// code unit. But unlike the spec, multi-byte code units must be escaped for LibRegex to parse.
|
|
for (size_t i = 0; i < utf16_pattern_view.length_in_code_units();) {
|
|
if (unicode) {
|
|
auto code_point = code_point_at(utf16_pattern_view, i);
|
|
builder.append_code_point(code_point.code_point);
|
|
i += code_point.code_unit_count;
|
|
continue;
|
|
}
|
|
|
|
u16 code_unit = utf16_pattern_view.code_unit_at(i);
|
|
++i;
|
|
|
|
if (code_unit > 0x7f)
|
|
builder.appendff("\\u{:04x}", code_unit);
|
|
else
|
|
builder.append_code_point(code_unit);
|
|
}
|
|
|
|
original_pattern = utf16_pattern_view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes);
|
|
parsed_pattern = builder.build();
|
|
}
|
|
|
|
auto* object = RegExpObject::create(global_object, move(original_pattern), move(parsed_pattern), move(f));
|
|
object->set(vm.names.lastIndex, Value(0), Object::ShouldThrowExceptions::Yes);
|
|
if (vm.exception())
|
|
return {};
|
|
return object;
|
|
}
|
|
|
|
}
|