RegExpObject.cpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377
  1. /*
  2. * Copyright (c) 2020, Matthew Olsson <mattco@serenityos.org>
  3. * Copyright (c) 2024, Andreas Kling <andreas@ladybird.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include <AK/Function.h>
  8. #include <LibJS/Runtime/AbstractOperations.h>
  9. #include <LibJS/Runtime/GlobalObject.h>
  10. #include <LibJS/Runtime/PrimitiveString.h>
  11. #include <LibJS/Runtime/RegExpConstructor.h>
  12. #include <LibJS/Runtime/RegExpObject.h>
  13. #include <LibJS/Runtime/StringPrototype.h>
  14. #include <LibJS/Runtime/Value.h>
  15. #include <LibJS/Token.h>
  16. namespace JS {
  17. JS_DEFINE_ALLOCATOR(RegExpObject);
  18. Result<regex::RegexOptions<ECMAScriptFlags>, ByteString> regex_flags_from_string(StringView flags)
  19. {
  20. bool d = false, g = false, i = false, m = false, s = false, u = false, y = false, v = false;
  21. auto options = RegExpObject::default_flags;
  22. for (auto ch : flags) {
  23. switch (ch) {
  24. case 'd':
  25. if (d)
  26. return ByteString::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), ch);
  27. d = true;
  28. break;
  29. case 'g':
  30. if (g)
  31. return ByteString::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), ch);
  32. g = true;
  33. options |= regex::ECMAScriptFlags::Global;
  34. break;
  35. case 'i':
  36. if (i)
  37. return ByteString::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), ch);
  38. i = true;
  39. options |= regex::ECMAScriptFlags::Insensitive;
  40. break;
  41. case 'm':
  42. if (m)
  43. return ByteString::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), ch);
  44. m = true;
  45. options |= regex::ECMAScriptFlags::Multiline;
  46. break;
  47. case 's':
  48. if (s)
  49. return ByteString::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), ch);
  50. s = true;
  51. options |= regex::ECMAScriptFlags::SingleLine;
  52. break;
  53. case 'u':
  54. if (u)
  55. return ByteString::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), ch);
  56. u = true;
  57. options |= regex::ECMAScriptFlags::Unicode;
  58. break;
  59. case 'y':
  60. if (y)
  61. return ByteString::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), ch);
  62. y = true;
  63. // Now for the more interesting flag, 'sticky' actually unsets 'global', part of which is the default.
  64. options.reset_flag(regex::ECMAScriptFlags::Global);
  65. // "What's the difference between sticky and global, then", that's simple.
  66. // all the other flags imply 'global', and the "global" flag implies 'stateful';
  67. // however, the "sticky" flag does *not* imply 'global', only 'stateful'.
  68. options |= (regex::ECMAScriptFlags)regex::AllFlags::Internal_Stateful;
  69. options |= regex::ECMAScriptFlags::Sticky;
  70. break;
  71. case 'v':
  72. if (v)
  73. return ByteString::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), ch);
  74. v = true;
  75. options |= regex::ECMAScriptFlags::UnicodeSets;
  76. break;
  77. default:
  78. return ByteString::formatted(ErrorType::RegExpObjectBadFlag.message(), ch);
  79. }
  80. }
  81. return options;
  82. }
  83. // 22.2.3.4 Static Semantics: ParsePattern ( patternText, u, v ), https://tc39.es/ecma262/#sec-parsepattern
  84. ErrorOr<ByteString, ParseRegexPatternError> parse_regex_pattern(StringView pattern, bool unicode, bool unicode_sets)
  85. {
  86. if (unicode && unicode_sets)
  87. return ParseRegexPatternError { ByteString::formatted(ErrorType::RegExpObjectIncompatibleFlags.message(), 'u', 'v') };
  88. auto utf16_pattern_result = AK::utf8_to_utf16(pattern);
  89. if (utf16_pattern_result.is_error())
  90. return ParseRegexPatternError { "Out of memory"sv };
  91. auto utf16_pattern = utf16_pattern_result.release_value();
  92. Utf16View utf16_pattern_view { utf16_pattern };
  93. StringBuilder builder;
  94. // If the Unicode flag is set, append each code point to the pattern. Otherwise, append each
  95. // code unit. But unlike the spec, multi-byte code units must be escaped for LibRegex to parse.
  96. auto previous_code_unit_was_backslash = false;
  97. for (size_t i = 0; i < utf16_pattern_view.length_in_code_units();) {
  98. if (unicode || unicode_sets) {
  99. auto code_point = code_point_at(utf16_pattern_view, i);
  100. builder.append_code_point(code_point.code_point);
  101. i += code_point.code_unit_count;
  102. continue;
  103. }
  104. u16 code_unit = utf16_pattern_view.code_unit_at(i);
  105. ++i;
  106. if (code_unit > 0x7f) {
  107. // Incorrectly escaping this code unit will result in a wildly different regex than intended
  108. // as we're converting <c> to <\uhhhh>, which would turn into <\\uhhhh> if (incorrectly) escaped again,
  109. // leading to a matcher for the literal string "\uhhhh" instead of the intended code unit <c>.
  110. // As such, we're going to remove the (invalid) backslash and pretend it never existed.
  111. if (!previous_code_unit_was_backslash)
  112. builder.append('\\');
  113. builder.appendff("u{:04x}", code_unit);
  114. } else {
  115. builder.append_code_point(code_unit);
  116. }
  117. if (code_unit == '\\')
  118. previous_code_unit_was_backslash = !previous_code_unit_was_backslash;
  119. else
  120. previous_code_unit_was_backslash = false;
  121. }
  122. return builder.to_byte_string();
  123. }
  124. // 22.2.3.4 Static Semantics: ParsePattern ( patternText, u, v ), https://tc39.es/ecma262/#sec-parsepattern
  125. ThrowCompletionOr<ByteString> parse_regex_pattern(VM& vm, StringView pattern, bool unicode, bool unicode_sets)
  126. {
  127. auto result = parse_regex_pattern(pattern, unicode, unicode_sets);
  128. if (result.is_error())
  129. return vm.throw_completion<JS::SyntaxError>(result.release_error().error);
  130. return result.release_value();
  131. }
  132. NonnullGCPtr<RegExpObject> RegExpObject::create(Realm& realm)
  133. {
  134. return realm.create<RegExpObject>(realm.intrinsics().regexp_prototype());
  135. }
  136. NonnullGCPtr<RegExpObject> RegExpObject::create(Realm& realm, Regex<ECMA262> regex, ByteString pattern, ByteString flags)
  137. {
  138. return realm.create<RegExpObject>(move(regex), move(pattern), move(flags), realm.intrinsics().regexp_prototype());
  139. }
  140. RegExpObject::RegExpObject(Object& prototype)
  141. : Object(ConstructWithPrototypeTag::Tag, prototype)
  142. {
  143. }
  144. static RegExpObject::Flags to_flag_bits(StringView flags)
  145. {
  146. RegExpObject::Flags flag_bits = static_cast<RegExpObject::Flags>(0);
  147. for (auto ch : flags) {
  148. switch (ch) {
  149. #define __JS_ENUMERATE(FlagName, flagName, flag_name, flag_char) \
  150. case #flag_char[0]: \
  151. flag_bits |= RegExpObject::Flags::FlagName; \
  152. break;
  153. JS_ENUMERATE_REGEXP_FLAGS
  154. #undef __JS_ENUMERATE
  155. default:
  156. break;
  157. }
  158. }
  159. return flag_bits;
  160. }
  161. RegExpObject::RegExpObject(Regex<ECMA262> regex, ByteString pattern, ByteString flags, Object& prototype)
  162. : Object(ConstructWithPrototypeTag::Tag, prototype)
  163. , m_pattern(move(pattern))
  164. , m_flags(move(flags))
  165. , m_flag_bits(to_flag_bits(m_flags))
  166. , m_regex(move(regex))
  167. {
  168. VERIFY(m_regex->parser_result.error == regex::Error::NoError);
  169. }
  170. void RegExpObject::initialize(Realm& realm)
  171. {
  172. auto& vm = this->vm();
  173. Base::initialize(realm);
  174. define_direct_property(vm.names.lastIndex, Value(0), Attribute::Writable);
  175. }
  176. // 22.2.3.3 RegExpInitialize ( obj, pattern, flags ), https://tc39.es/ecma262/#sec-regexpinitialize
  177. ThrowCompletionOr<NonnullGCPtr<RegExpObject>> RegExpObject::regexp_initialize(VM& vm, Value pattern_value, Value flags_value)
  178. {
  179. // 1. If pattern is undefined, let P be the empty String.
  180. // 2. Else, let P be ? ToString(pattern).
  181. auto pattern = pattern_value.is_undefined()
  182. ? ByteString::empty()
  183. : TRY(pattern_value.to_byte_string(vm));
  184. // 3. If flags is undefined, let F be the empty String.
  185. // 4. Else, let F be ? ToString(flags).
  186. auto flags = flags_value.is_undefined()
  187. ? ByteString::empty()
  188. : TRY(flags_value.to_byte_string(vm));
  189. // 5. If F contains any code unit other than "d", "g", "i", "m", "s", "u", "v", or "y", or if F contains any code unit more than once, throw a SyntaxError exception.
  190. // 6. If F contains "i", let i be true; else let i be false.
  191. // 7. If F contains "m", let m be true; else let m be false.
  192. // 8. If F contains "s", let s be true; else let s be false.
  193. // 9. If F contains "u", let u be true; else let u be false.
  194. // 10. If F contains "v", let v be true; else let v be false.
  195. auto parsed_flags_or_error = regex_flags_from_string(flags);
  196. if (parsed_flags_or_error.is_error())
  197. return vm.throw_completion<SyntaxError>(parsed_flags_or_error.release_error());
  198. auto parsed_flags = parsed_flags_or_error.release_value();
  199. auto parsed_pattern = ByteString::empty();
  200. if (!pattern.is_empty()) {
  201. bool unicode = parsed_flags.has_flag_set(regex::ECMAScriptFlags::Unicode);
  202. bool unicode_sets = parsed_flags.has_flag_set(regex::ECMAScriptFlags::UnicodeSets);
  203. // 11. If u is true or v is true, then
  204. // a. Let patternText be StringToCodePoints(P).
  205. // 12. Else,
  206. // a. Let patternText be the result of interpreting each of P's 16-bit elements as a Unicode BMP code point. UTF-16 decoding is not applied to the elements.
  207. // 13. Let parseResult be ParsePattern(patternText, u, v).
  208. parsed_pattern = TRY(parse_regex_pattern(vm, pattern, unicode, unicode_sets));
  209. }
  210. // 14. If parseResult is a non-empty List of SyntaxError objects, throw a SyntaxError exception.
  211. Regex<ECMA262> regex(move(parsed_pattern), parsed_flags);
  212. if (regex.parser_result.error != regex::Error::NoError)
  213. return vm.throw_completion<SyntaxError>(ErrorType::RegExpCompileError, regex.error_string());
  214. // 15. Assert: parseResult is a Pattern Parse Node.
  215. VERIFY(regex.parser_result.error == regex::Error::NoError);
  216. // 16. Set obj.[[OriginalSource]] to P.
  217. m_pattern = move(pattern);
  218. // 17. Set obj.[[OriginalFlags]] to F.
  219. m_flag_bits = to_flag_bits(flags);
  220. m_flags = move(flags);
  221. // 18. Let capturingGroupsCount be CountLeftCapturingParensWithin(parseResult).
  222. // 19. Let rer be the RegExp Record { [[IgnoreCase]]: i, [[Multiline]]: m, [[DotAll]]: s, [[Unicode]]: u, [[CapturingGroupsCount]]: capturingGroupsCount }.
  223. // 20. Set obj.[[RegExpRecord]] to rer.
  224. // 21. Set obj.[[RegExpMatcher]] to CompilePattern of parseResult with argument rer.
  225. m_regex = move(regex);
  226. // 22. Perform ? Set(obj, "lastIndex", +0𝔽, true).
  227. TRY(set(vm.names.lastIndex, Value(0), Object::ShouldThrowExceptions::Yes));
  228. // 23. Return obj.
  229. return NonnullGCPtr { *this };
  230. }
  231. // 22.2.6.13.1 EscapeRegExpPattern ( P, F ), https://tc39.es/ecma262/#sec-escaperegexppattern
  232. ByteString RegExpObject::escape_regexp_pattern() const
  233. {
  234. // 1. Let S be a String in the form of a Pattern[~UnicodeMode] (Pattern[+UnicodeMode] if F contains "u") equivalent
  235. // to P interpreted as UTF-16 encoded Unicode code points (6.1.4), in which certain code points are escaped as
  236. // described below. S may or may not be identical to P; however, the Abstract Closure that would result from
  237. // evaluating S as a Pattern[~UnicodeMode] (Pattern[+UnicodeMode] if F contains "u") must behave identically to
  238. // the Abstract Closure given by the constructed object's [[RegExpMatcher]] internal slot. Multiple calls to
  239. // this abstract operation using the same values for P and F must produce identical results.
  240. // 2. The code points / or any LineTerminator occurring in the pattern shall be escaped in S as necessary to ensure
  241. // that the string-concatenation of "/", S, "/", and F can be parsed (in an appropriate lexical context) as a
  242. // RegularExpressionLiteral that behaves identically to the constructed regular expression. For example, if P is
  243. // "/", then S could be "\/" or "\u002F", among other possibilities, but not "/", because /// followed by F
  244. // would be parsed as a SingleLineComment rather than a RegularExpressionLiteral. If P is the empty String, this
  245. // specification can be met by letting S be "(?:)".
  246. // 3. Return S.
  247. if (m_pattern.is_empty())
  248. return "(?:)";
  249. // FIXME: Check the 'u' and 'v' flags and escape accordingly
  250. StringBuilder builder;
  251. auto pattern = Utf8View { m_pattern };
  252. auto escaped = false;
  253. for (auto code_point : pattern) {
  254. if (escaped) {
  255. escaped = false;
  256. builder.append_code_point('\\');
  257. builder.append_code_point(code_point);
  258. continue;
  259. }
  260. if (code_point == '\\') {
  261. escaped = true;
  262. continue;
  263. }
  264. switch (code_point) {
  265. case '/':
  266. builder.append("\\/"sv);
  267. break;
  268. case '\n':
  269. builder.append("\\n"sv);
  270. break;
  271. case '\r':
  272. builder.append("\\r"sv);
  273. break;
  274. case LINE_SEPARATOR:
  275. builder.append("\\u2028"sv);
  276. break;
  277. case PARAGRAPH_SEPARATOR:
  278. builder.append("\\u2029"sv);
  279. break;
  280. default:
  281. builder.append_code_point(code_point);
  282. break;
  283. }
  284. }
  285. return builder.to_byte_string();
  286. }
  287. void RegExpObject::visit_edges(JS::Cell::Visitor& visitor)
  288. {
  289. Base::visit_edges(visitor);
  290. visitor.visit(m_realm);
  291. }
  292. // 22.2.3.1 RegExpCreate ( P, F ), https://tc39.es/ecma262/#sec-regexpcreate
  293. ThrowCompletionOr<NonnullGCPtr<RegExpObject>> regexp_create(VM& vm, Value pattern, Value flags)
  294. {
  295. auto& realm = *vm.current_realm();
  296. // 1. Let obj be ! RegExpAlloc(%RegExp%).
  297. auto regexp_object = MUST(regexp_alloc(vm, realm.intrinsics().regexp_constructor()));
  298. // 2. Return ? RegExpInitialize(obj, P, F).
  299. return TRY(regexp_object->regexp_initialize(vm, pattern, flags));
  300. }
  301. // 22.2.3.2 RegExpAlloc ( newTarget ), https://tc39.es/ecma262/#sec-regexpalloc
  302. // 22.2.3.2 RegExpAlloc ( newTarget ), https://github.com/tc39/proposal-regexp-legacy-features#regexpalloc--newtarget-
  303. ThrowCompletionOr<NonnullGCPtr<RegExpObject>> regexp_alloc(VM& vm, FunctionObject& new_target)
  304. {
  305. // 1. Let obj be ? OrdinaryCreateFromConstructor(newTarget, "%RegExp.prototype%", « [[OriginalSource]], [[OriginalFlags]], [[RegExpRecord]], [[RegExpMatcher]] »).
  306. auto regexp_object = TRY(ordinary_create_from_constructor<RegExpObject>(vm, new_target, &Intrinsics::regexp_prototype));
  307. // 2. Let thisRealm be the current Realm Record.
  308. auto& this_realm = *vm.current_realm();
  309. // 3. Set the value of obj’s [[Realm]] internal slot to thisRealm.
  310. regexp_object->set_realm(this_realm);
  311. // 4. If SameValue(newTarget, thisRealm.[[Intrinsics]].[[%RegExp%]]) is true, then
  312. if (same_value(&new_target, this_realm.intrinsics().regexp_constructor())) {
  313. // i. Set the value of obj’s [[LegacyFeaturesEnabled]] internal slot to true.
  314. regexp_object->set_legacy_features_enabled(true);
  315. }
  316. // 5. Else,
  317. else {
  318. // i. Set the value of obj’s [[LegacyFeaturesEnabled]] internal slot to false.
  319. regexp_object->set_legacy_features_enabled(false);
  320. }
  321. // 6. Perform ! DefinePropertyOrThrow(obj, "lastIndex", PropertyDescriptor { [[Writable]]: true, [[Enumerable]]: false, [[Configurable]]: false }).
  322. MUST(regexp_object->define_property_or_throw(vm.names.lastIndex, PropertyDescriptor { .writable = true, .enumerable = false, .configurable = false }));
  323. // 7. Return obj.
  324. return regexp_object;
  325. }
  326. }