Regex.cpp 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <LibTest/TestCase.h> // import first, to prevent warning of VERIFY* redefinition
  7. #include <AK/StringBuilder.h>
  8. #include <AK/Tuple.h>
  9. #include <LibRegex/Regex.h>
  10. #include <LibRegex/RegexDebug.h>
  11. #include <LibRegex/RegexMatcher.h>
  12. #include <stdio.h>
// Returns the given options unchanged.
// Tests call this with flag expressions (see regex_options_ecmascript), so the
// argument is converted to ECMAScriptOptions at the call site.
static ECMAScriptOptions match_test_api_options(const ECMAScriptOptions options)
{
    return options;
}
// Returns the given options unchanged.
// Tests call this with flag expressions (see regex_options_posix), so the
// argument is converted to PosixOptions at the call site.
static PosixOptions match_test_api_options(const PosixOptions options)
{
    return options;
}
  21. template<typename... Flags>
  22. static constexpr ECMAScriptFlags combine_flags(Flags&&... flags)
  23. requires((IsSame<Flags, ECMAScriptFlags> && ...))
  24. {
  25. return static_cast<ECMAScriptFlags>((static_cast<regex::FlagsUnderlyingType>(flags) | ...));
  26. }
  27. TEST_CASE(regex_options_ecmascript)
  28. {
  29. ECMAScriptOptions eo;
  30. eo |= ECMAScriptFlags::Global;
  31. EXPECT(eo & ECMAScriptFlags::Global);
  32. EXPECT(!(eo & ECMAScriptFlags::Insensitive));
  33. eo = match_test_api_options(ECMAScriptFlags::Global | ECMAScriptFlags::Insensitive | ECMAScriptFlags::Sticky);
  34. EXPECT(eo & ECMAScriptFlags::Global);
  35. EXPECT(eo & ECMAScriptFlags::Insensitive);
  36. EXPECT(eo & ECMAScriptFlags::Sticky);
  37. EXPECT(!(eo & ECMAScriptFlags::Unicode));
  38. EXPECT(!(eo & ECMAScriptFlags::Multiline));
  39. EXPECT(!(eo & ECMAScriptFlags::SingleLine));
  40. eo &= ECMAScriptFlags::Insensitive;
  41. EXPECT(!(eo & ECMAScriptFlags::Global));
  42. EXPECT(eo & ECMAScriptFlags::Insensitive);
  43. EXPECT(!(eo & ECMAScriptFlags::Multiline));
  44. eo &= ECMAScriptFlags::Sticky;
  45. EXPECT(!(eo & ECMAScriptFlags::Global));
  46. EXPECT(!(eo & ECMAScriptFlags::Insensitive));
  47. EXPECT(!(eo & ECMAScriptFlags::Multiline));
  48. EXPECT(!(eo & ECMAScriptFlags::Sticky));
  49. eo = ~ECMAScriptFlags::Insensitive;
  50. EXPECT(eo & ECMAScriptFlags::Global);
  51. EXPECT(!(eo & ECMAScriptFlags::Insensitive));
  52. EXPECT(eo & ECMAScriptFlags::Multiline);
  53. EXPECT(eo & ECMAScriptFlags::Sticky);
  54. }
  55. TEST_CASE(regex_options_posix)
  56. {
  57. PosixOptions eo;
  58. eo |= PosixFlags::Global;
  59. EXPECT(eo & PosixFlags::Global);
  60. EXPECT(!(eo & PosixFlags::Insensitive));
  61. eo = match_test_api_options(PosixFlags::Global | PosixFlags::Insensitive | PosixFlags::MatchNotBeginOfLine);
  62. EXPECT(eo & PosixFlags::Global);
  63. EXPECT(eo & PosixFlags::Insensitive);
  64. EXPECT(eo & PosixFlags::MatchNotBeginOfLine);
  65. EXPECT(!(eo & PosixFlags::Unicode));
  66. EXPECT(!(eo & PosixFlags::Multiline));
  67. eo &= PosixFlags::Insensitive;
  68. EXPECT(!(eo & PosixFlags::Global));
  69. EXPECT(eo & PosixFlags::Insensitive);
  70. EXPECT(!(eo & PosixFlags::Multiline));
  71. eo &= PosixFlags::MatchNotBeginOfLine;
  72. EXPECT(!(eo & PosixFlags::Global));
  73. EXPECT(!(eo & PosixFlags::Insensitive));
  74. EXPECT(!(eo & PosixFlags::Multiline));
  75. eo = ~PosixFlags::Insensitive;
  76. EXPECT(eo & PosixFlags::Global);
  77. EXPECT(!(eo & PosixFlags::Insensitive));
  78. EXPECT(eo & PosixFlags::Multiline);
  79. }
  80. TEST_CASE(regex_lexer)
  81. {
  82. Lexer l("/[.*+?^${}()|[\\]\\\\]/g"sv);
  83. EXPECT(l.next().type() == regex::TokenType::Slash);
  84. EXPECT(l.next().type() == regex::TokenType::LeftBracket);
  85. EXPECT(l.next().type() == regex::TokenType::Period);
  86. EXPECT(l.next().type() == regex::TokenType::Asterisk);
  87. EXPECT(l.next().type() == regex::TokenType::Plus);
  88. EXPECT(l.next().type() == regex::TokenType::Questionmark);
  89. EXPECT(l.next().type() == regex::TokenType::Circumflex);
  90. EXPECT(l.next().type() == regex::TokenType::Dollar);
  91. EXPECT(l.next().type() == regex::TokenType::LeftCurly);
  92. EXPECT(l.next().type() == regex::TokenType::RightCurly);
  93. EXPECT(l.next().type() == regex::TokenType::LeftParen);
  94. EXPECT(l.next().type() == regex::TokenType::RightParen);
  95. EXPECT(l.next().type() == regex::TokenType::Pipe);
  96. EXPECT(l.next().type() == regex::TokenType::LeftBracket);
  97. EXPECT(l.next().type() == regex::TokenType::EscapeSequence);
  98. EXPECT(l.next().type() == regex::TokenType::EscapeSequence);
  99. EXPECT(l.next().type() == regex::TokenType::RightBracket);
  100. EXPECT(l.next().type() == regex::TokenType::Slash);
  101. EXPECT(l.next().type() == regex::TokenType::Char);
  102. }
  103. TEST_CASE(parser_error_parens)
  104. {
  105. DeprecatedString pattern = "test()test";
  106. Lexer l(pattern);
  107. PosixExtendedParser p(l);
  108. p.parse();
  109. EXPECT(p.has_error());
  110. EXPECT(p.error() == regex::Error::EmptySubExpression);
  111. }
  112. TEST_CASE(parser_error_special_characters_used_at_wrong_place)
  113. {
  114. DeprecatedString pattern;
  115. Vector<char, 5> chars = { '*', '+', '?', '{' };
  116. StringBuilder b;
  117. Lexer l;
  118. PosixExtended p(l);
  119. for (auto& ch : chars) {
  120. // First in ere
  121. b.clear();
  122. b.append(ch);
  123. pattern = b.build();
  124. l.set_source(pattern);
  125. p.parse();
  126. EXPECT(p.has_error());
  127. EXPECT(p.error() == regex::Error::InvalidRepetitionMarker);
  128. // After vertical line
  129. b.clear();
  130. b.append("a|"sv);
  131. b.append(ch);
  132. pattern = b.build();
  133. l.set_source(pattern);
  134. p.parse();
  135. EXPECT(p.has_error());
  136. EXPECT(p.error() == regex::Error::InvalidRepetitionMarker);
  137. // After circumflex
  138. b.clear();
  139. b.append('^');
  140. b.append(ch);
  141. pattern = b.build();
  142. l.set_source(pattern);
  143. p.parse();
  144. EXPECT(p.has_error());
  145. EXPECT(p.error() == regex::Error::InvalidRepetitionMarker);
  146. // After dollar
  147. b.clear();
  148. b.append('$');
  149. b.append(ch);
  150. pattern = b.build();
  151. l.set_source(pattern);
  152. p.parse();
  153. EXPECT(p.has_error());
  154. EXPECT(p.error() == regex::Error::InvalidRepetitionMarker);
  155. // After left parens
  156. b.clear();
  157. b.append('(');
  158. b.append(ch);
  159. b.append(')');
  160. pattern = b.build();
  161. l.set_source(pattern);
  162. p.parse();
  163. EXPECT(p.has_error());
  164. EXPECT(p.error() == regex::Error::InvalidRepetitionMarker);
  165. }
  166. }
  167. TEST_CASE(parser_error_vertical_line_used_at_wrong_place)
  168. {
  169. Lexer l;
  170. PosixExtended p(l);
  171. // First in ere
  172. l.set_source("|asdf"sv);
  173. p.parse();
  174. EXPECT(p.has_error());
  175. EXPECT(p.error() == regex::Error::EmptySubExpression);
  176. // Last in ere
  177. l.set_source("asdf|"sv);
  178. p.parse();
  179. EXPECT(p.has_error());
  180. EXPECT(p.error() == regex::Error::EmptySubExpression);
  181. // After left parens
  182. l.set_source("(|asdf)"sv);
  183. p.parse();
  184. EXPECT(p.has_error());
  185. EXPECT(p.error() == regex::Error::EmptySubExpression);
  186. // Proceed right parens
  187. l.set_source("(asdf)|"sv);
  188. p.parse();
  189. EXPECT(p.has_error());
  190. EXPECT(p.error() == regex::Error::EmptySubExpression);
  191. }
  192. TEST_CASE(catch_all_first)
  193. {
  194. Regex<PosixExtended> re("^.*$");
  195. RegexResult m;
  196. re.match("Hello World"sv, m);
  197. EXPECT(m.count == 1);
  198. EXPECT(re.match("Hello World"sv, m));
  199. }
  200. TEST_CASE(catch_all)
  201. {
  202. Regex<PosixExtended> re("^.*$", PosixFlags::Global);
  203. EXPECT(re.has_match("Hello World"sv));
  204. EXPECT(re.match("Hello World"sv).success);
  205. EXPECT(re.match("Hello World"sv).count == 1);
  206. EXPECT(has_match("Hello World"sv, re));
  207. auto res = match("Hello World"sv, re);
  208. EXPECT(res.success);
  209. EXPECT(res.count == 1);
  210. EXPECT(res.matches.size() == 1);
  211. EXPECT(res.matches.first().view == "Hello World");
  212. }
  213. TEST_CASE(catch_all_again)
  214. {
  215. Regex<PosixExtended> re("^.*$", PosixFlags::Extra);
  216. EXPECT_EQ(has_match("Hello World"sv, re), true);
  217. }
  218. TEST_CASE(char_utf8)
  219. {
  220. Regex<PosixExtended> re("😀");
  221. RegexResult result;
  222. EXPECT_EQ((result = match(Utf8View { "Привет, мир! 😀 γειά σου κόσμος 😀 こんにちは世界"sv }, re, PosixFlags::Global)).success, true);
  223. EXPECT_EQ(result.count, 2u);
  224. }
  225. TEST_CASE(catch_all_newline)
  226. {
  227. Regex<PosixExtended> re("^.*$", PosixFlags::Multiline | PosixFlags::StringCopyMatches);
  228. RegexResult result;
  229. auto lambda = [&result, &re]() {
  230. DeprecatedString aaa = "Hello World\nTest\n1234\n";
  231. result = match(aaa, re);
  232. EXPECT_EQ(result.success, true);
  233. };
  234. lambda();
  235. EXPECT_EQ(result.count, 3u);
  236. EXPECT_EQ(result.matches.at(0).view, "Hello World");
  237. EXPECT_EQ(result.matches.at(1).view, "Test");
  238. EXPECT_EQ(result.matches.at(2).view, "1234");
  239. }
  240. TEST_CASE(catch_all_newline_view)
  241. {
  242. Regex<PosixExtended> re("^.*$", PosixFlags::Multiline);
  243. RegexResult result;
  244. DeprecatedString aaa = "Hello World\nTest\n1234\n";
  245. result = match(aaa, re);
  246. EXPECT_EQ(result.success, true);
  247. EXPECT_EQ(result.count, 3u);
  248. DeprecatedString str = "Hello World";
  249. EXPECT_EQ(result.matches.at(0).view, str.view());
  250. EXPECT_EQ(result.matches.at(1).view, "Test");
  251. EXPECT_EQ(result.matches.at(2).view, "1234");
  252. }
  253. TEST_CASE(catch_all_newline_2)
  254. {
  255. Regex<PosixExtended> re("^.*$");
  256. RegexResult result;
  257. result = match("Hello World\nTest\n1234\n"sv, re, PosixFlags::Multiline | PosixFlags::StringCopyMatches);
  258. EXPECT_EQ(result.success, true);
  259. EXPECT_EQ(result.count, 3u);
  260. EXPECT_EQ(result.matches.at(0).view, "Hello World");
  261. EXPECT_EQ(result.matches.at(1).view, "Test");
  262. EXPECT_EQ(result.matches.at(2).view, "1234");
  263. result = match("Hello World\nTest\n1234\n"sv, re);
  264. EXPECT_EQ(result.success, true);
  265. EXPECT_EQ(result.count, 1u);
  266. EXPECT_EQ(result.matches.at(0).view, "Hello World\nTest\n1234\n");
  267. }
  268. TEST_CASE(match_all_character_class)
  269. {
  270. Regex<PosixExtended> re("[[:alpha:]]");
  271. DeprecatedString str = "[Window]\nOpacity=255\nAudibleBeep=0\n";
  272. RegexResult result = match(str, re, PosixFlags::Global | PosixFlags::StringCopyMatches);
  273. EXPECT_EQ(result.success, true);
  274. EXPECT_EQ(result.count, 24u);
  275. EXPECT_EQ(result.matches.at(0).view, "W");
  276. EXPECT_EQ(result.matches.at(1).view, "i");
  277. EXPECT_EQ(result.matches.at(2).view, "n");
  278. }
  279. TEST_CASE(match_character_class_with_assertion)
  280. {
  281. Regex<PosixExtended> re("[[:alpha:]]+$");
  282. DeprecatedString str = "abcdef";
  283. RegexResult result = match(str, re);
  284. EXPECT_EQ(result.success, true);
  285. EXPECT_EQ(result.count, 1u);
  286. }
  287. TEST_CASE(example_for_git_commit)
  288. {
  289. Regex<PosixExtended> re("^.*$");
  290. auto result = re.match("Well, hello friends!\nHello World!"sv);
  291. EXPECT(result.success);
  292. EXPECT(result.count == 1);
  293. EXPECT(result.matches.at(0).view.starts_with("Well"sv));
  294. EXPECT(result.matches.at(0).view.length() == 33);
  295. EXPECT(re.has_match("Well,...."sv));
  296. result = re.match("Well, hello friends!\nHello World!"sv, PosixFlags::Multiline);
  297. EXPECT(result.success);
  298. EXPECT(result.count == 2);
  299. EXPECT(result.matches.at(0).view == "Well, hello friends!");
  300. EXPECT(result.matches.at(1).view == "Hello World!");
  301. }
  302. TEST_CASE(email_address)
  303. {
  304. Regex<PosixExtended> re("^[A-Z0-9a-z._%+-]{1,64}@([A-Za-z0-9-]{1,63}\\.){1,125}[A-Za-z]{2,63}$");
  305. EXPECT(re.has_match("hello.world@domain.tld"sv));
  306. EXPECT(re.has_match("this.is.a.very_long_email_address@world.wide.web"sv));
  307. }
  308. TEST_CASE(ini_file_entries)
  309. {
  310. Regex<PosixExtended> re("[[:alpha:]]*=([[:digit:]]*)|\\[(.*)\\]");
  311. RegexResult result;
  312. if constexpr (REGEX_DEBUG) {
  313. RegexDebug regex_dbg(stderr);
  314. regex_dbg.print_raw_bytecode(re);
  315. regex_dbg.print_header();
  316. regex_dbg.print_bytecode(re);
  317. }
  318. DeprecatedString haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
  319. EXPECT_EQ(re.search(haystack.view(), result, PosixFlags::Multiline), true);
  320. EXPECT_EQ(result.count, 3u);
  321. if constexpr (REGEX_DEBUG) {
  322. for (auto& v : result.matches)
  323. fprintf(stderr, "%s\n", v.view.to_deprecated_string().characters());
  324. }
  325. EXPECT_EQ(result.matches.at(0).view, "[Window]");
  326. EXPECT_EQ(result.capture_group_matches.at(0).at(0).view, "Window");
  327. EXPECT_EQ(result.matches.at(1).view, "Opacity=255");
  328. EXPECT_EQ(result.matches.at(1).line, 1u);
  329. EXPECT_EQ(result.matches.at(1).column, 0u);
  330. EXPECT_EQ(result.capture_group_matches.at(1).at(0).view, "255");
  331. EXPECT_EQ(result.capture_group_matches.at(1).at(0).line, 1u);
  332. EXPECT_EQ(result.capture_group_matches.at(1).at(0).column, 8u);
  333. EXPECT_EQ(result.matches.at(2).view, "AudibleBeep=0");
  334. EXPECT_EQ(result.capture_group_matches.at(2).at(0).view, "0");
  335. EXPECT_EQ(result.capture_group_matches.at(2).at(0).line, 2u);
  336. EXPECT_EQ(result.capture_group_matches.at(2).at(0).column, 12u);
  337. }
  338. TEST_CASE(ini_file_entries2)
  339. {
  340. Regex<PosixExtended> re("[[:alpha:]]*=([[:digit:]]*)");
  341. RegexResult result;
  342. DeprecatedString haystack = "ViewMode=Icon";
  343. EXPECT_EQ(re.match(haystack.view(), result), false);
  344. EXPECT_EQ(result.count, 0u);
  345. EXPECT_EQ(re.search(haystack.view(), result), true);
  346. EXPECT_EQ(result.count, 1u);
  347. }
  348. TEST_CASE(named_capture_group)
  349. {
  350. Regex<PosixExtended> re("[[:alpha:]]*=(?<Test>[[:digit:]]*)");
  351. RegexResult result;
  352. if constexpr (REGEX_DEBUG) {
  353. RegexDebug regex_dbg(stderr);
  354. regex_dbg.print_raw_bytecode(re);
  355. regex_dbg.print_header();
  356. regex_dbg.print_bytecode(re);
  357. }
  358. DeprecatedString haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
  359. EXPECT_EQ(re.search(haystack, result, PosixFlags::Multiline), true);
  360. EXPECT_EQ(result.count, 2u);
  361. EXPECT_EQ(result.matches.at(0).view, "Opacity=255");
  362. EXPECT_EQ(result.capture_group_matches.at(0).at(0).view, "255");
  363. EXPECT_EQ(result.capture_group_matches.at(0).at(0).capture_group_name, "Test");
  364. EXPECT_EQ(result.matches.at(1).view, "AudibleBeep=0");
  365. EXPECT_EQ(result.capture_group_matches.at(1).at(0).view, "0");
  366. EXPECT_EQ(result.capture_group_matches.at(1).at(0).capture_group_name, "Test");
  367. }
  368. TEST_CASE(ecma262_named_capture_group_with_dollar_sign)
  369. {
  370. Regex<ECMA262> re("[a-zA-Z]*=(?<$Test$>[0-9]*)");
  371. RegexResult result;
  372. if constexpr (REGEX_DEBUG) {
  373. RegexDebug regex_dbg(stderr);
  374. regex_dbg.print_raw_bytecode(re);
  375. regex_dbg.print_header();
  376. regex_dbg.print_bytecode(re);
  377. }
  378. DeprecatedString haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
  379. EXPECT_EQ(re.search(haystack, result, ECMAScriptFlags::Multiline), true);
  380. EXPECT_EQ(result.count, 2u);
  381. EXPECT_EQ(result.matches.at(0).view, "Opacity=255");
  382. EXPECT_EQ(result.capture_group_matches.at(0).at(0).view, "255");
  383. EXPECT_EQ(result.capture_group_matches.at(0).at(0).capture_group_name, "$Test$");
  384. EXPECT_EQ(result.matches.at(1).view, "AudibleBeep=0");
  385. EXPECT_EQ(result.capture_group_matches.at(1).at(0).view, "0");
  386. EXPECT_EQ(result.capture_group_matches.at(1).at(0).capture_group_name, "$Test$");
  387. }
  388. TEST_CASE(a_star)
  389. {
  390. Regex<PosixExtended> re("a*");
  391. RegexResult result;
  392. if constexpr (REGEX_DEBUG) {
  393. RegexDebug regex_dbg(stderr);
  394. regex_dbg.print_raw_bytecode(re);
  395. regex_dbg.print_header();
  396. regex_dbg.print_bytecode(re);
  397. }
  398. DeprecatedString haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
  399. EXPECT_EQ(re.search(haystack.view(), result, PosixFlags::Multiline), true);
  400. EXPECT_EQ(result.count, 32u);
  401. if (result.count == 32u) {
  402. EXPECT_EQ(result.matches.at(0).view.length(), 0u);
  403. EXPECT_EQ(result.matches.at(10).view.length(), 1u);
  404. EXPECT_EQ(result.matches.at(10).view, "a");
  405. EXPECT_EQ(result.matches.at(31).view.length(), 0u);
  406. }
  407. }
  408. TEST_CASE(simple_period_end_benchmark)
  409. {
  410. Regex<PosixExtended> re("hello.$");
  411. RegexResult m;
  412. EXPECT_EQ(re.search("Hello1"sv, m), false);
  413. EXPECT_EQ(re.search("hello1hello1"sv, m), true);
  414. EXPECT_EQ(re.search("hello2hell"sv, m), false);
  415. EXPECT_EQ(re.search("hello?"sv, m), true);
  416. }
  417. TEST_CASE(posix_extended_nested_capture_group)
  418. {
  419. Regex<PosixExtended> re("(h(e(?<llo>llo)))"); // group 0 -> "hello", group 1 -> "ello", group 2/"llo" -> "llo"
  420. auto result = re.match("hello"sv);
  421. EXPECT(result.success);
  422. EXPECT_EQ(result.capture_group_matches.size(), 1u);
  423. EXPECT_EQ(result.capture_group_matches[0].size(), 3u);
  424. EXPECT_EQ(result.capture_group_matches[0][0].view, "hello"sv);
  425. EXPECT_EQ(result.capture_group_matches[0][1].view, "ello"sv);
  426. EXPECT_EQ(result.capture_group_matches[0][2].view, "llo"sv);
  427. }
// Pattern made of "a|" repeated 100000 times. ECMA262_parse uses it to check
// that parsing a very long chain of disjunctions does not overflow the stack.
auto parse_test_case_long_disjunction_chain = DeprecatedString::repeated("a|"sv, 100000);
// Table-driven parser test: every pattern is compiled as ECMA262 with the
// listed flags, and the error reported by the parser is compared with the
// expected one.
TEST_CASE(ECMA262_parse)
{
    struct _test {
        // Pattern source handed to the Regex constructor.
        StringView pattern;
        // Error the parser is expected to report; NoError unless stated otherwise.
        regex::Error expected_error { regex::Error::NoError };
        // Flags the pattern is compiled with; none unless stated otherwise.
        regex::ECMAScriptFlags flags {};
    };
    _test const tests[] {
        { "^hello.$"sv },
        { "^(hello.)$"sv },
        { "^h{0,1}ello.$"sv },
        { "^hello\\W$"sv },
        { "^hell\\w.$"sv },
        { "^hell\\x6f1$"sv }, // ^hello1$
        { "^hel(?:l\\w).$"sv },
        { "^hel(?<LO>l\\w).$"sv },
        { "^[-a-zA-Z\\w\\s]+$"sv },
        { "\\bhello\\B"sv },
        { "^[\\w+/_-]+[=]{0,2}$"sv }, // #4189
        { "^(?:[^<]*(<[\\w\\W]+>)[^>]*$|#([\\w\\-]*)$)"sv }, // #4189
        { "\\/"sv }, // #4189
        { ",/=-:"sv }, // #4243
        { "\\x"sv }, // Even invalid escapes are allowed if ~unicode.
        { "\\x1"sv }, // Even invalid escapes are allowed if ~unicode.
        { "\\x1"sv, regex::Error::InvalidPattern, regex::ECMAScriptFlags::Unicode },
        { "\\x11"sv },
        { "\\x11"sv, regex::Error::NoError, regex::ECMAScriptFlags::Unicode },
        { "\\"sv, regex::Error::InvalidTrailingEscape },
        { "(?"sv, regex::Error::InvalidCaptureGroup },
        { "\\u1234"sv, regex::Error::NoError, regex::ECMAScriptFlags::Unicode },
        { "[\\u1234]"sv, regex::Error::NoError, regex::ECMAScriptFlags::Unicode },
        { "\\u1"sv, regex::Error::InvalidPattern, regex::ECMAScriptFlags::Unicode },
        { "[\\u1]"sv, regex::Error::InvalidPattern, regex::ECMAScriptFlags::Unicode },
        { ",(?"sv, regex::Error::InvalidCaptureGroup }, // #4583
        { "{1}"sv, regex::Error::InvalidPattern },
        { "{1,2}"sv, regex::Error::InvalidPattern },
        { "\\uxxxx"sv, regex::Error::NoError },
        { "\\uxxxx"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
        { "\\ud83d"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
        { "\\ud83d\\uxxxx"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
        { "\\u{0}"sv },
        { "\\u{0}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
        { "\\u{10ffff}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
        { "\\u{10ffff"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
        { "\\u{10ffffx"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
        { "\\u{110000}"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
        { "\\p"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
        { "\\p{"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
        { "\\p{}"sv, regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
        { "\\p{AsCiI}"sv, regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
        { "\\p{hello friends}"sv, regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
        { "\\p{Prepended_Concatenation_Mark}"sv, regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
        { "\\p{ASCII}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
        { "\\\\p{1}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
        { "\\\\p{AsCiI}"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
        { "\\\\p{ASCII}"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
        { "\\c"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
        { "\\c"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
        { "[\\c]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
        { "[\\c]"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
        { "\\c`"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
        { "\\c`"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
        { "[\\c`]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
        { "[\\c`]"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
        { "\\A"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
        { "\\A"sv, regex::Error::InvalidCharacterClass, ECMAScriptFlags::Unicode },
        { "[\\A]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
        { "[\\A]"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
        { "\\0"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
        { "\\0"sv, regex::Error::NoError, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
        { "\\00"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
        { "\\00"sv, regex::Error::InvalidCharacterClass, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
        { "[\\0]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
        { "[\\0]"sv, regex::Error::NoError, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
        { "[\\00]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
        { "[\\00]"sv, regex::Error::InvalidPattern, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
        { "\\^\\$\\\\\\.\\*\\+\\?\\(\\)\\[\\]\\{\\}\\|\\/"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
        { "[\\^\\$\\\\\\.\\*\\+\\?\\(\\)\\[\\]\\{\\}\\|\\/]"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
        { "]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
        { "]"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
        { "\\]"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
        { "}"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
        { "}"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
        { "\\}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
        { "a{9007199254740991}"sv }, // 2^53 - 1
        { "a{9007199254740991,}"sv },
        { "a{9007199254740991,9007199254740991}"sv },
        { "a{9007199254740992}"sv, regex::Error::InvalidBraceContent },
        { "a{9007199254740992,}"sv, regex::Error::InvalidBraceContent },
        { "a{9007199254740991,9007199254740992}"sv, regex::Error::InvalidBraceContent },
        { "a{9007199254740992,9007199254740991}"sv, regex::Error::InvalidBraceContent },
        { "a{9007199254740992,9007199254740992}"sv, regex::Error::InvalidBraceContent },
        { "(?<a>a)(?<a>b)"sv, regex::Error::DuplicateNamedCapture },
        { "(?<a>a)(?<b>b)(?<a>c)"sv, regex::Error::DuplicateNamedCapture },
        { "(?<1a>a)"sv, regex::Error::InvalidNameForCaptureGroup },
        { "(?<\\a>a)"sv, regex::Error::InvalidNameForCaptureGroup },
        { "(?<\ta>a)"sv, regex::Error::InvalidNameForCaptureGroup },
        { "(?<$$_$$>a)"sv },
        { "(?<ÿ>a)"sv },
        { "(?<𝓑𝓻𝓸𝔀𝓷>a)"sv },
        { "((?=lg)?[vl]k\\-?\\d{3}) bui| 3\\.[-\\w; ]{10}lg?-([06cv9]{3,4})"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, // #12373, quantifiable assertions.
        { parse_test_case_long_disjunction_chain.view() }, // A whole lot of disjunctions, should not overflow the stack.
        { "(\"|')(?:(?!\\2)[^\\\\\\r\\n]|\\\\.)*\\2"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, // LegacyOctalEscapeSequence should not consume too many chars (and should not crash)
    };
    // Compile every pattern and compare the parser's error with the expectation.
    for (auto& test : tests) {
        Regex<ECMA262> re(test.pattern, test.flags);
        EXPECT_EQ(re.parser_result.error, test.expected_error);
        // Dump the generated bytecode when regex debugging is compiled in.
        if constexpr (REGEX_DEBUG) {
            dbgln("\n");
            RegexDebug regex_dbg(stderr);
            regex_dbg.print_raw_bytecode(re);
            regex_dbg.print_header();
            regex_dbg.print_bytecode(re);
            dbgln("\n");
        }
    }
}
// Table-driven matcher test: every pattern is compiled as ECMA262 with the
// listed options, must parse without error, and is then matched against the
// subject; the outcome is compared with the expected one.
TEST_CASE(ECMA262_match)
{
    struct _test {
        // Pattern source handed to the Regex constructor.
        StringView pattern;
        // Input the compiled pattern is matched against.
        StringView subject;
        // Expected value of match(subject).success; true unless stated otherwise.
        bool matches { true };
        // Flags the pattern is compiled with; none unless stated otherwise.
        ECMAScriptFlags options {};
    };
    // clang-format off
    constexpr _test tests[] {
        { "^hello.$"sv, "hello1"sv },
        { "^(hello.)$"sv, "hello1"sv },
        { "^h{0,1}ello.$"sv, "ello1"sv },
        { "^hello\\W$"sv, "hello!"sv },
        { "^hell\\w.$"sv, "hellx!"sv },
        { "^hell\\x6f1$"sv, "hello1"sv },
        { "^hel(?<LO>l.)1$"sv, "hello1"sv },
        { "^hel(?<LO>l.)1*\\k<LO>.$"sv, "hello1lo1"sv },
        { "^[-a-z1-3\\s]+$"sv, "hell2 o1"sv },
        { "^[\\0-\\x1f]$"sv, "\n"sv },
        { .pattern = "\\bhello\\B"sv, .subject = "hello1"sv, .options = ECMAScriptFlags::Global },
        { "\\b.*\\b"sv, "hello1"sv },
        { "[^\\D\\S]{2}"sv, "1 "sv },
        { "bar(?=f.)foo"sv, "barfoo"sv },
        { "bar(?=foo)bar"sv, "barbar"sv, false },
        { "bar(?!foo)bar"sv, "barbar"sv, true },
        { "bar(?!bar)bar"sv, "barbar"sv, false },
        { "bar.*(?<=foo)"sv, "barbar"sv, false },
        { "bar.*(?<!foo)"sv, "barbar"sv, true },
        { "((...)X)+"sv, "fooXbarXbazX"sv, true },
        { "(?:)"sv, ""sv, true },
        { "\\^"sv, "^"sv },
        { "\\^\\$\\\\\\.\\*\\+\\?\\(\\)\\[\\]\\{\\}\\|\\/"sv, "^$\\.*+?()[]{}|/"sv, true, ECMAScriptFlags::Unicode },
        { "[\\^\\$\\\\\\.\\*\\+\\?\\(\\)\\[\\]\\{\\}\\|\\/]{15}"sv, "^$\\.*+?()[]{}|/"sv, true, ECMAScriptFlags::Unicode },
        { "(a{2}){3}"sv, "aaaaaa"sv },
        { "(a{2}){3}"sv, "aaaabaa"sv, false },
        { "(a{2}){4}"sv, "aaaaaaaa"sv },
        { "(a{2}){4}"sv, "aaaaaabaa"sv, false },
        { "(a{3}){2}"sv, "aaaaaa"sv },
        { "(a{3}){2}"sv, "aaaabaa"sv, false },
        { "(a{4}){2}"sv, "aaaaaaaa"sv },
        { "(a{4}){2}"sv, "aaaaaabaa"sv, false },
        { "\\u{4}"sv, "uuuu"sv },
        { "(?<=.{3})f"sv, "abcdef"sv, true, (ECMAScriptFlags)regex::AllFlags::Global },
        { "(?<=.{3})f"sv, "abc😀ef"sv, true, (ECMAScriptFlags)regex::AllFlags::Global },
        // ECMA262, B.1.4. Regular Expression Pattern extensions for browsers
        { "{"sv, "{"sv, true, ECMAScriptFlags::BrowserExtended },
        { "\\5"sv, "\5"sv, true, ECMAScriptFlags::BrowserExtended },
        { "\\05"sv, "\5"sv, true, ECMAScriptFlags::BrowserExtended },
        { "\\455"sv, "\45""5"sv, true, ECMAScriptFlags::BrowserExtended },
        { "\\314"sv, "\314"sv, true, ECMAScriptFlags::BrowserExtended },
        { "\\c"sv, "\\c"sv, true, ECMAScriptFlags::BrowserExtended },
        { "\\cf"sv, "\06"sv, true, ECMAScriptFlags::BrowserExtended },
        { "\\c1"sv, "\\c1"sv, true, ECMAScriptFlags::BrowserExtended },
        { "[\\c1]"sv, "\x11"sv, true, ECMAScriptFlags::BrowserExtended },
        { "[\\w-\\d]"sv, "-"sv, true, ECMAScriptFlags::BrowserExtended },
        { "^(?:^^\\.?|[!+-]|!=|!==|#|%|%=|&|&&|&&=|&=|\\(|\\*|\\*=|\\+=|,|-=|->|\\/|\\/=|:|::|;|<|<<|<<=|<=|=|==|===|>|>=|>>|>>=|>>>|>>>=|[?@[^]|\\^=|\\^\\^|\\^\\^=|{|\\||\\|=|\\|\\||\\|\\|=|~|break|case|continue|delete|do|else|finally|instanceof|return|throw|try|typeof)\\s*(\\/(?=[^*/])(?:[^/[\\\\]|\\\\[\\S\\s]|\\[(?:[^\\\\\\]]|\\\\[\\S\\s])*(?:]|$))+\\/)"sv,
            "return /xx/"sv, true, ECMAScriptFlags::BrowserExtended
        }, // #5517, appears to be matching JS expressions that involve regular expressions...
        { "a{2,}"sv, "aaaa"sv }, // #5518
        { "\\0"sv, "\0"sv, true, ECMAScriptFlags::BrowserExtended },
        { "\\0"sv, "\0"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
        { "\\01"sv, "\1"sv, true, ECMAScriptFlags::BrowserExtended },
        { "[\\0]"sv, "\0"sv, true, ECMAScriptFlags::BrowserExtended },
        { "[\\0]"sv, "\0"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
        { "[\\01]"sv, "\1"sv, true, ECMAScriptFlags::BrowserExtended },
        { "(\0|a)"sv, "a"sv, true }, // #9686, Should allow null bytes in pattern
        { "(.*?)a(?!(a+)b\\2c)\\2(.*)"sv, "baaabaac"sv, true }, // #6042, Groups inside lookarounds may be referenced outside, but their contents appear empty if the pattern in the lookaround fails.
        { "a|$"sv, "x"sv, true, (ECMAScriptFlags)regex::AllFlags::Global }, // #11940, Global (not the 'g' flag) regexps should attempt to match the zero-length end of the string too.
        { "foo\nbar"sv, "foo\nbar"sv, true }, // #12126, ECMA262 regexp should match literal newlines without the 's' flag.
        { "foo[^]bar"sv, "foo\nbar"sv, true }, // #12126, ECMA262 regexp should match newline with [^].
        { "^[_A-Z]+$"sv, "_aA"sv, true, ECMAScriptFlags::Insensitive }, // Insensitive lookup table: characters in a range do not necessarily lie in the same range after being converted to lowercase.
        { "^[a-sy-z]$"sv, "b"sv, true, ECMAScriptFlags::Insensitive },
        { "^[a-sy-z]$"sv, "y"sv, true, ECMAScriptFlags::Insensitive },
        { "^[a-sy-z]$"sv, "u"sv, false, ECMAScriptFlags::Insensitive },
    };
    // clang-format on
    for (auto& test : tests) {
        Regex<ECMA262> re(test.pattern, test.options);
        // Dump the generated bytecode when regex debugging is compiled in.
        if constexpr (REGEX_DEBUG) {
            dbgln("\n");
            RegexDebug regex_dbg(stderr);
            regex_dbg.print_raw_bytecode(re);
            regex_dbg.print_header();
            regex_dbg.print_bytecode(re);
            dbgln("\n");
        }
        // Every pattern in this table must parse cleanly; only the match outcome varies.
        EXPECT_EQ(re.parser_result.error, regex::Error::NoError);
        EXPECT_EQ(re.match(test.subject).success, test.matches);
    }
}
  637. TEST_CASE(ECMA262_unicode_match)
  638. {
  639. constexpr auto space_and_line_terminator_code_points = Array { 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000, 0xFEFF };
  640. StringBuilder builder;
  641. for (u32 code_point : space_and_line_terminator_code_points)
  642. builder.append_code_point(code_point);
  643. auto space_and_line_terminators = builder.build();
  644. struct _test {
  645. StringView pattern;
  646. StringView subject;
  647. bool matches { true };
  648. ECMAScriptFlags options {};
  649. };
  650. _test tests[] {
  651. { "\xf0\x9d\x8c\x86"sv, "abcdef"sv, false, ECMAScriptFlags::Unicode },
  652. { "[\xf0\x9d\x8c\x86]"sv, "abcdef"sv, false, ECMAScriptFlags::Unicode },
  653. { "\\ud83d"sv, "😀"sv, true },
  654. { "\\ud83d"sv, "😀"sv, false, ECMAScriptFlags::Unicode },
  655. { "\\ude00"sv, "😀"sv, true },
  656. { "\\ude00"sv, "😀"sv, false, ECMAScriptFlags::Unicode },
  657. { "\\ud83d\\ude00"sv, "😀"sv, true },
  658. { "\\ud83d\\ude00"sv, "😀"sv, true, ECMAScriptFlags::Unicode },
  659. { "\\u{1f600}"sv, "😀"sv, true, ECMAScriptFlags::Unicode },
  660. { "\\ud83d\\ud83d"sv, "\xed\xa0\xbd\xed\xa0\xbd"sv, true },
  661. { "\\ud83d\\ud83d"sv, "\xed\xa0\xbd\xed\xa0\xbd"sv, true, ECMAScriptFlags::Unicode },
  662. { "(?<=.{3})f"sv, "abcdef"sv, true, ECMAScriptFlags::Unicode },
  663. { "(?<=.{3})f"sv, "abc😀ef"sv, true, ECMAScriptFlags::Unicode },
  664. { "(?<𝓑𝓻𝓸𝔀𝓷>brown)"sv, "brown"sv, true, ECMAScriptFlags::Unicode },
  665. { "(?<\\u{1d4d1}\\u{1d4fb}\\u{1d4f8}\\u{1d500}\\u{1d4f7}>brown)"sv, "brown"sv, true, ECMAScriptFlags::Unicode },
  666. { "(?<\\ud835\\udcd1\\ud835\\udcfb\\ud835\\udcf8\\ud835\\udd00\\ud835\\udcf7>brown)"sv, "brown"sv, true, ECMAScriptFlags::Unicode },
  667. { "^\\s+$"sv, space_and_line_terminators },
  668. { "^\\s+$"sv, space_and_line_terminators, true, ECMAScriptFlags::Unicode },
  669. };
  670. for (auto& test : tests) {
  671. Regex<ECMA262> re(test.pattern, (ECMAScriptFlags)regex::AllFlags::Global | test.options);
  672. auto subject = AK::utf8_to_utf16(test.subject);
  673. Utf16View view { subject };
  674. if constexpr (REGEX_DEBUG) {
  675. dbgln("\n");
  676. RegexDebug regex_dbg(stderr);
  677. regex_dbg.print_raw_bytecode(re);
  678. regex_dbg.print_header();
  679. regex_dbg.print_bytecode(re);
  680. dbgln("\n");
  681. }
  682. EXPECT_EQ(re.parser_result.error, regex::Error::NoError);
  683. EXPECT_EQ(re.match(view).success, test.matches);
  684. }
  685. }
  686. TEST_CASE(ECMA262_unicode_sets_match)
  687. {
  688. struct _test {
  689. StringView pattern;
  690. StringView subject;
  691. bool matches { true };
  692. ECMAScriptFlags options {};
  693. };
  694. constexpr _test tests[] {
  695. { "[\\w--x]"sv, "x"sv, false },
  696. { "[\\w&&x]"sv, "y"sv, false },
  697. { "[\\w--x]"sv, "y"sv, true },
  698. { "[\\w&&x]"sv, "x"sv, true },
  699. { "[[0-9\\w]--x--6]"sv, "6"sv, false },
  700. { "[[0-9\\w]--x--6]"sv, "x"sv, false },
  701. { "[[0-9\\w]--x--6]"sv, "y"sv, true },
  702. { "[[0-9\\w]--x--6]"sv, "9"sv, true },
  703. { "[\\w&&\\d]"sv, "a"sv, false },
  704. { "[\\w&&\\d]"sv, "4"sv, true },
  705. };
  706. for (auto& test : tests) {
  707. Regex<ECMA262> re(test.pattern, (ECMAScriptFlags)regex::AllFlags::UnicodeSets | test.options);
  708. if constexpr (REGEX_DEBUG) {
  709. dbgln("\n");
  710. RegexDebug regex_dbg(stderr);
  711. regex_dbg.print_raw_bytecode(re);
  712. regex_dbg.print_header();
  713. regex_dbg.print_bytecode(re);
  714. dbgln("\n");
  715. }
  716. EXPECT_EQ(re.parser_result.error, regex::Error::NoError);
  717. auto result = re.match(test.subject).success;
  718. EXPECT_EQ(result, test.matches);
  719. }
  720. }
  721. TEST_CASE(ECMA262_property_match)
  722. {
  723. struct _test {
  724. StringView pattern;
  725. StringView subject;
  726. bool matches { true };
  727. ECMAScriptFlags options {};
  728. };
  729. constexpr _test tests[] {
  730. { "\\p{ASCII}"sv, "a"sv, false },
  731. { "\\p{ASCII}"sv, "p{ASCII}"sv, true },
  732. { "\\p{ASCII}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
  733. { "\\p{ASCII}"sv, "😀"sv, false, ECMAScriptFlags::Unicode },
  734. { "\\P{ASCII}"sv, "a"sv, false, ECMAScriptFlags::Unicode },
  735. { "\\P{ASCII}"sv, "😀"sv, true, ECMAScriptFlags::Unicode },
  736. { "\\p{ASCII_Hex_Digit}"sv, "1"sv, true, ECMAScriptFlags::Unicode },
  737. { "\\p{ASCII_Hex_Digit}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
  738. { "\\p{ASCII_Hex_Digit}"sv, "x"sv, false, ECMAScriptFlags::Unicode },
  739. { "\\P{ASCII_Hex_Digit}"sv, "1"sv, false, ECMAScriptFlags::Unicode },
  740. { "\\P{ASCII_Hex_Digit}"sv, "a"sv, false, ECMAScriptFlags::Unicode },
  741. { "\\P{ASCII_Hex_Digit}"sv, "x"sv, true, ECMAScriptFlags::Unicode },
  742. { "\\p{Any}"sv, "\xcd\xb8"sv, true, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point.
  743. { "\\P{Any}"sv, "\xcd\xb8"sv, false, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point.
  744. { "\\p{Assigned}"sv, "\xcd\xb8"sv, false, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point.
  745. { "\\P{Assigned}"sv, "\xcd\xb8"sv, true, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point.
  746. { "\\p{Lu}"sv, "a"sv, false, ECMAScriptFlags::Unicode },
  747. { "\\p{Lu}"sv, "A"sv, true, ECMAScriptFlags::Unicode },
  748. { "\\p{Lu}"sv, "9"sv, false, ECMAScriptFlags::Unicode },
  749. { "\\p{Cased_Letter}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
  750. { "\\p{Cased_Letter}"sv, "A"sv, true, ECMAScriptFlags::Unicode },
  751. { "\\p{Cased_Letter}"sv, "9"sv, false, ECMAScriptFlags::Unicode },
  752. { "\\P{Cased_Letter}"sv, "a"sv, false, ECMAScriptFlags::Unicode },
  753. { "\\P{Cased_Letter}"sv, "A"sv, false, ECMAScriptFlags::Unicode },
  754. { "\\P{Cased_Letter}"sv, "9"sv, true, ECMAScriptFlags::Unicode },
  755. { "\\p{General_Category=Cased_Letter}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
  756. { "\\p{General_Category=Cased_Letter}"sv, "A"sv, true, ECMAScriptFlags::Unicode },
  757. { "\\p{General_Category=Cased_Letter}"sv, "9"sv, false, ECMAScriptFlags::Unicode },
  758. { "\\p{gc=Cased_Letter}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
  759. { "\\p{gc=Cased_Letter}"sv, "A"sv, true, ECMAScriptFlags::Unicode },
  760. { "\\p{gc=Cased_Letter}"sv, "9"sv, false, ECMAScriptFlags::Unicode },
  761. { "\\p{Script=Latin}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
  762. { "\\p{Script=Latin}"sv, "A"sv, true, ECMAScriptFlags::Unicode },
  763. { "\\p{Script=Latin}"sv, "9"sv, false, ECMAScriptFlags::Unicode },
  764. { "\\p{sc=Latin}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
  765. { "\\p{sc=Latin}"sv, "A"sv, true, ECMAScriptFlags::Unicode },
  766. { "\\p{sc=Latin}"sv, "9"sv, false, ECMAScriptFlags::Unicode },
  767. { "\\p{Script_Extensions=Deva}"sv, "a"sv, false, ECMAScriptFlags::Unicode },
  768. { "\\p{Script_Extensions=Beng}"sv, "\xe1\xb3\x95"sv, true, ECMAScriptFlags::Unicode }, // U+01CD5
  769. { "\\p{Script_Extensions=Deva}"sv, "\xe1\xb3\x95"sv, true, ECMAScriptFlags::Unicode }, // U+01CD5
  770. { "\\p{scx=Deva}"sv, "a"sv, false, ECMAScriptFlags::Unicode },
  771. { "\\p{scx=Beng}"sv, "\xe1\xb3\x95"sv, true, ECMAScriptFlags::Unicode }, // U+01CD5
  772. { "\\p{scx=Deva}"sv, "\xe1\xb3\x95"sv, true, ECMAScriptFlags::Unicode }, // U+01CD5
  773. };
  774. for (auto& test : tests) {
  775. Regex<ECMA262> re(test.pattern, (ECMAScriptFlags)regex::AllFlags::Global | regex::ECMAScriptFlags::BrowserExtended | test.options);
  776. auto subject = AK::utf8_to_utf16(test.subject);
  777. Utf16View view { subject };
  778. if constexpr (REGEX_DEBUG) {
  779. dbgln("\n");
  780. RegexDebug regex_dbg(stderr);
  781. regex_dbg.print_raw_bytecode(re);
  782. regex_dbg.print_header();
  783. regex_dbg.print_bytecode(re);
  784. dbgln("\n");
  785. }
  786. EXPECT_EQ(re.parser_result.error, regex::Error::NoError);
  787. EXPECT_EQ(re.match(view).success, test.matches);
  788. }
  789. }
  790. TEST_CASE(replace)
  791. {
  792. struct _test {
  793. StringView pattern;
  794. StringView replacement;
  795. StringView subject;
  796. StringView expected;
  797. ECMAScriptFlags options {};
  798. };
  799. constexpr _test tests[] {
  800. { "foo(.+)"sv, "aaa"sv, "test"sv, "test"sv },
  801. { "foo(.+)"sv, "test\\1"sv, "foobar"sv, "testbar"sv },
  802. { "foo(.+)"sv, "\\2\\1"sv, "foobar"sv, "\\2bar"sv },
  803. { "foo(.+)"sv, "\\\\\\1"sv, "foobar"sv, "\\bar"sv },
  804. { "foo(.)"sv, "a\\1"sv, "fooxfooy"sv, "axay"sv, ECMAScriptFlags::Multiline },
  805. };
  806. for (auto& test : tests) {
  807. Regex<ECMA262> re(test.pattern, test.options);
  808. if constexpr (REGEX_DEBUG) {
  809. dbgln("\n");
  810. RegexDebug regex_dbg(stderr);
  811. regex_dbg.print_raw_bytecode(re);
  812. regex_dbg.print_header();
  813. regex_dbg.print_bytecode(re);
  814. dbgln("\n");
  815. }
  816. EXPECT_EQ(re.parser_result.error, regex::Error::NoError);
  817. EXPECT_EQ(re.replace(test.subject, test.replacement), test.expected);
  818. }
  819. }
  820. TEST_CASE(case_insensitive_match)
  821. {
  822. Regex<PosixExtended> re("cd", PosixFlags::Insensitive | PosixFlags::Global);
  823. auto result = re.match("AEKFCD"sv);
  824. EXPECT_EQ(result.success, true);
  825. if (result.success) {
  826. EXPECT_EQ(result.matches.at(0).column, 4ul);
  827. }
  828. }
  829. TEST_CASE(extremely_long_fork_chain)
  830. {
  831. Regex<ECMA262> re("(?:aa)*");
  832. auto result = re.match(DeprecatedString::repeated('a', 1000));
  833. EXPECT_EQ(result.success, true);
  834. }
  835. TEST_CASE(theoretically_infinite_loop)
  836. {
  837. Array patterns {
  838. "(a*)*"sv, // Infinitely matching empty substrings, the outer loop should short-circuit.
  839. "(a*?)*"sv, // Infinitely matching empty substrings, the outer loop should short-circuit.
  840. "(a*)*?"sv, // Should match exactly nothing.
  841. "(?:)*?"sv, // Should not generate an infinite fork loop.
  842. };
  843. for (auto& pattern : patterns) {
  844. Regex<ECMA262> re(pattern);
  845. auto result = re.match(""sv);
  846. EXPECT_EQ(result.success, true);
  847. }
  848. }
  849. static auto g_lots_of_a_s = DeprecatedString::repeated('a', 10'000'000);
  850. BENCHMARK_CASE(fork_performance)
  851. {
  852. Regex<ECMA262> re("(?:aa)*");
  853. auto result = re.match(g_lots_of_a_s);
  854. EXPECT_EQ(result.success, true);
  855. }
  856. TEST_CASE(optimizer_atomic_groups)
  857. {
  858. Array tests {
  859. // Fork -> ForkReplace
  860. Tuple { "a*b"sv, "aaaaa"sv, false },
  861. Tuple { "a+b"sv, "aaaaa"sv, false },
  862. Tuple { "\\\\(\\d+)"sv, "\\\\"sv, false }, // Rewrite bug turning a+ to a*, see #10952.
  863. Tuple { "[a-z.]+\\."sv, "..."sv, true }, // Rewrite bug, incorrect interpretation of Compare.
  864. Tuple { "[.-]+\\."sv, ".-."sv, true },
  865. // Alternative fuse
  866. Tuple { "(abcfoo|abcbar|abcbaz).*x"sv, "abcbarx"sv, true },
  867. Tuple { "(a|a)"sv, "a"sv, true },
  868. Tuple { "(a|)"sv, ""sv, true }, // Ensure that empty alternatives are not outright removed
  869. Tuple { "a{2,3}|a{5,8}"sv, "abc"sv, false }, // Optimizer should not mess up the instruction stream by ignoring inter-insn dependencies, see #11247.
  870. Tuple { "^(a{2,3}|a{5,8})$"sv, "aaaa"sv, false }, // Optimizer should not mess up the instruction stream by ignoring inter-insn dependencies, see #11247.
  871. // Optimizer should not chop off *half* of an instruction when fusing instructions.
  872. Tuple { "cubic-bezier\\(\\s*(-?\\d+\\.?\\d*|-?\\.\\d+)\\s*,\\s*(-?\\d+\\.?\\d*|-?\\.\\d+)\\s*,\\s*(-?\\d+\\.?\\d*|-?\\.\\d+)\\s*,\\s*(-?\\d+\\.?\\d*|-?\\.\\d+)\\s*\\)"sv, "cubic-bezier(.05, 0, 0, 1)"sv, true },
  873. // ForkReplace shouldn't be applied where it would change the semantics
  874. Tuple { "(1+)\\1"sv, "11"sv, true },
  875. Tuple { "(1+)1"sv, "11"sv, true },
  876. Tuple { "(1+)0"sv, "10"sv, true },
  877. // Rewrite should not skip over first required iteration of <x>+.
  878. Tuple { "a+"sv, ""sv, false },
  879. };
  880. for (auto& test : tests) {
  881. Regex<ECMA262> re(test.get<0>());
  882. auto result = re.match(test.get<1>());
  883. EXPECT_EQ(result.success, test.get<2>());
  884. }
  885. }
  886. TEST_CASE(optimizer_char_class_lut)
  887. {
  888. Regex<ECMA262> re(R"([\f\n\r\t\v\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000\ufeff]+$)");
  889. if constexpr (REGEX_DEBUG) {
  890. dbgln("\n");
  891. RegexDebug regex_dbg(stderr);
  892. regex_dbg.print_raw_bytecode(re);
  893. regex_dbg.print_header();
  894. regex_dbg.print_bytecode(re);
  895. dbgln("\n");
  896. }
  897. // This will go through _all_ alternatives in the character class, and then fail.
  898. for (size_t i = 0; i < 1'000'000; ++i)
  899. EXPECT_EQ(re.match("1635488940000"sv).success, false);
  900. }
  901. TEST_CASE(optimizer_alternation)
  902. {
  903. Array tests {
  904. // Pattern, Subject, Expected length
  905. Tuple { "a|"sv, "a"sv, 1u },
  906. };
  907. for (auto& test : tests) {
  908. Regex<ECMA262> re(test.get<0>());
  909. auto result = re.match(test.get<1>());
  910. EXPECT(result.success);
  911. EXPECT_EQ(result.matches.first().view.length(), test.get<2>());
  912. }
  913. }
  914. TEST_CASE(posix_basic_dollar_is_end_anchor)
  915. {
  916. // Ensure that a dollar sign at the end only matches the end of the line.
  917. {
  918. Regex<PosixBasic> re("abc$");
  919. EXPECT_EQ(re.match("123abcdef"sv, PosixFlags::Global).success, false);
  920. EXPECT_EQ(re.match("123abc"sv, PosixFlags::Global).success, true);
  921. EXPECT_EQ(re.match("123abc$def"sv, PosixFlags::Global).success, false);
  922. EXPECT_EQ(re.match("123abc$"sv, PosixFlags::Global).success, false);
  923. }
  924. }
  925. TEST_CASE(posix_basic_dollar_is_literal)
  926. {
  927. // Ensure that a dollar sign in the middle is treated as a literal.
  928. {
  929. Regex<PosixBasic> re("abc$d");
  930. EXPECT_EQ(re.match("123abcdef"sv, PosixFlags::Global).success, false);
  931. EXPECT_EQ(re.match("123abc"sv, PosixFlags::Global).success, false);
  932. EXPECT_EQ(re.match("123abc$def"sv, PosixFlags::Global).success, true);
  933. EXPECT_EQ(re.match("123abc$"sv, PosixFlags::Global).success, false);
  934. }
  935. // Ensure that a dollar sign is always treated as a literal if escaped, even if at the end of the pattern.
  936. {
  937. Regex<PosixBasic> re("abc\\$");
  938. EXPECT_EQ(re.match("123abcdef"sv, PosixFlags::Global).success, false);
  939. EXPECT_EQ(re.match("123abc"sv, PosixFlags::Global).success, false);
  940. EXPECT_EQ(re.match("123abc$def"sv, PosixFlags::Global).success, true);
  941. EXPECT_EQ(re.match("123abc$"sv, PosixFlags::Global).success, true);
  942. }
  943. }
  944. TEST_CASE(negative_lookahead)
  945. {
  946. {
  947. // Negative lookahead with more than 2 forks difference between lookahead init and finish.
  948. auto options = ECMAScriptOptions { ECMAScriptFlags::Global };
  949. options.reset_flag((ECMAScriptFlags)regex::AllFlags::Internal_Stateful);
  950. Regex<ECMA262> re(":(?!\\^\\)|1)", options);
  951. EXPECT_EQ(re.match(":^)"sv).success, false);
  952. EXPECT_EQ(re.match(":1"sv).success, false);
  953. EXPECT_EQ(re.match(":foobar"sv).success, true);
  954. }
  955. {
  956. // Correctly count forks with nested groups and optimised loops
  957. Regex<ECMA262> re("^((?:[^\\n]|\\n(?! *\\n))+)(?:\\n *)+\\n");
  958. EXPECT_EQ(re.match("foo\n\n"sv).success, true);
  959. EXPECT_EQ(re.match("foo\n"sv).success, false);
  960. }
  961. }
  962. TEST_CASE(single_match_flag)
  963. {
  964. {
  965. // Ensure that only a single match is produced and nothing past that.
  966. Regex<ECMA262> re("[\\u0008-\\uffff]"sv, ECMAScriptFlags::Global | (ECMAScriptFlags)regex::AllFlags::SingleMatch);
  967. auto result = re.match("ABC"sv);
  968. EXPECT_EQ(result.success, true);
  969. EXPECT_EQ(result.matches.size(), 1u);
  970. EXPECT_EQ(result.matches.first().view.to_deprecated_string(), "A"sv);
  971. }
  972. }
  973. TEST_CASE(inversion_state_in_char_class)
  974. {
  975. {
  976. // #13755, /[\S\s]/.exec("hello") should be [ "h" ], not null.
  977. Regex<ECMA262> re("[\\S\\s]", ECMAScriptFlags::Global | (ECMAScriptFlags)regex::AllFlags::SingleMatch);
  978. auto result = re.match("hello"sv);
  979. EXPECT_EQ(result.success, true);
  980. EXPECT_EQ(result.matches.size(), 1u);
  981. EXPECT_EQ(result.matches.first().view.to_deprecated_string(), "h"sv);
  982. }
  983. {
  984. Regex<ECMA262> re("^(?:([^\\s!\"#%-,\\./;->@\\[-\\^`\\{-~]+(?=([=~}\\s/.)|]))))"sv, ECMAScriptFlags::Global);
  985. auto result = re.match("slideNumbers}}"sv);
  986. EXPECT_EQ(result.success, true);
  987. EXPECT_EQ(result.matches.size(), 1u);
  988. EXPECT_EQ(result.matches.first().view.to_deprecated_string(), "slideNumbers"sv);
  989. EXPECT_EQ(result.capture_group_matches.first()[0].view.to_deprecated_string(), "slideNumbers"sv);
  990. EXPECT_EQ(result.capture_group_matches.first()[1].view.to_deprecated_string(), "}"sv);
  991. }
  992. }