// Regex.cpp (42 KB)
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include "LibRegex/RegexMatcher.h"
  7. #include <LibTest/TestCase.h> // import first, to prevent warning of VERIFY* redefinition
  8. #include <AK/StringBuilder.h>
  9. #include <AK/Tuple.h>
  10. #include <LibRegex/Regex.h>
  11. #include <LibRegex/RegexDebug.h>
  12. #include <stdio.h>
  13. static ECMAScriptOptions match_test_api_options(const ECMAScriptOptions options)
  14. {
  15. return options;
  16. }
  17. static PosixOptions match_test_api_options(const PosixOptions options)
  18. {
  19. return options;
  20. }
  21. template<typename... Flags>
  22. static constexpr ECMAScriptFlags combine_flags(Flags&&... flags) requires((IsSame<Flags, ECMAScriptFlags> && ...))
  23. {
  24. return static_cast<ECMAScriptFlags>((static_cast<regex::FlagsUnderlyingType>(flags) | ...));
  25. }
  26. TEST_CASE(regex_options_ecmascript)
  27. {
  28. ECMAScriptOptions eo;
  29. eo |= ECMAScriptFlags::Global;
  30. EXPECT(eo & ECMAScriptFlags::Global);
  31. EXPECT(!(eo & ECMAScriptFlags::Insensitive));
  32. eo = match_test_api_options(ECMAScriptFlags::Global | ECMAScriptFlags::Insensitive | ECMAScriptFlags::Sticky);
  33. EXPECT(eo & ECMAScriptFlags::Global);
  34. EXPECT(eo & ECMAScriptFlags::Insensitive);
  35. EXPECT(eo & ECMAScriptFlags::Sticky);
  36. EXPECT(!(eo & ECMAScriptFlags::Unicode));
  37. EXPECT(!(eo & ECMAScriptFlags::Multiline));
  38. EXPECT(!(eo & ECMAScriptFlags::SingleLine));
  39. eo &= ECMAScriptFlags::Insensitive;
  40. EXPECT(!(eo & ECMAScriptFlags::Global));
  41. EXPECT(eo & ECMAScriptFlags::Insensitive);
  42. EXPECT(!(eo & ECMAScriptFlags::Multiline));
  43. eo &= ECMAScriptFlags::Sticky;
  44. EXPECT(!(eo & ECMAScriptFlags::Global));
  45. EXPECT(!(eo & ECMAScriptFlags::Insensitive));
  46. EXPECT(!(eo & ECMAScriptFlags::Multiline));
  47. EXPECT(!(eo & ECMAScriptFlags::Sticky));
  48. eo = ~ECMAScriptFlags::Insensitive;
  49. EXPECT(eo & ECMAScriptFlags::Global);
  50. EXPECT(!(eo & ECMAScriptFlags::Insensitive));
  51. EXPECT(eo & ECMAScriptFlags::Multiline);
  52. EXPECT(eo & ECMAScriptFlags::Sticky);
  53. }
  54. TEST_CASE(regex_options_posix)
  55. {
  56. PosixOptions eo;
  57. eo |= PosixFlags::Global;
  58. EXPECT(eo & PosixFlags::Global);
  59. EXPECT(!(eo & PosixFlags::Insensitive));
  60. eo = match_test_api_options(PosixFlags::Global | PosixFlags::Insensitive | PosixFlags::MatchNotBeginOfLine);
  61. EXPECT(eo & PosixFlags::Global);
  62. EXPECT(eo & PosixFlags::Insensitive);
  63. EXPECT(eo & PosixFlags::MatchNotBeginOfLine);
  64. EXPECT(!(eo & PosixFlags::Unicode));
  65. EXPECT(!(eo & PosixFlags::Multiline));
  66. eo &= PosixFlags::Insensitive;
  67. EXPECT(!(eo & PosixFlags::Global));
  68. EXPECT(eo & PosixFlags::Insensitive);
  69. EXPECT(!(eo & PosixFlags::Multiline));
  70. eo &= PosixFlags::MatchNotBeginOfLine;
  71. EXPECT(!(eo & PosixFlags::Global));
  72. EXPECT(!(eo & PosixFlags::Insensitive));
  73. EXPECT(!(eo & PosixFlags::Multiline));
  74. eo = ~PosixFlags::Insensitive;
  75. EXPECT(eo & PosixFlags::Global);
  76. EXPECT(!(eo & PosixFlags::Insensitive));
  77. EXPECT(eo & PosixFlags::Multiline);
  78. }
  79. TEST_CASE(regex_lexer)
  80. {
  81. Lexer l("/[.*+?^${}()|[\\]\\\\]/g");
  82. EXPECT(l.next().type() == regex::TokenType::Slash);
  83. EXPECT(l.next().type() == regex::TokenType::LeftBracket);
  84. EXPECT(l.next().type() == regex::TokenType::Period);
  85. EXPECT(l.next().type() == regex::TokenType::Asterisk);
  86. EXPECT(l.next().type() == regex::TokenType::Plus);
  87. EXPECT(l.next().type() == regex::TokenType::Questionmark);
  88. EXPECT(l.next().type() == regex::TokenType::Circumflex);
  89. EXPECT(l.next().type() == regex::TokenType::Dollar);
  90. EXPECT(l.next().type() == regex::TokenType::LeftCurly);
  91. EXPECT(l.next().type() == regex::TokenType::RightCurly);
  92. EXPECT(l.next().type() == regex::TokenType::LeftParen);
  93. EXPECT(l.next().type() == regex::TokenType::RightParen);
  94. EXPECT(l.next().type() == regex::TokenType::Pipe);
  95. EXPECT(l.next().type() == regex::TokenType::LeftBracket);
  96. EXPECT(l.next().type() == regex::TokenType::EscapeSequence);
  97. EXPECT(l.next().type() == regex::TokenType::EscapeSequence);
  98. EXPECT(l.next().type() == regex::TokenType::RightBracket);
  99. EXPECT(l.next().type() == regex::TokenType::Slash);
  100. EXPECT(l.next().type() == regex::TokenType::Char);
  101. }
  102. TEST_CASE(parser_error_parens)
  103. {
  104. String pattern = "test()test";
  105. Lexer l(pattern);
  106. PosixExtendedParser p(l);
  107. p.parse();
  108. EXPECT(p.has_error());
  109. EXPECT(p.error() == regex::Error::EmptySubExpression);
  110. }
  111. TEST_CASE(parser_error_special_characters_used_at_wrong_place)
  112. {
  113. String pattern;
  114. Vector<char, 5> chars = { '*', '+', '?', '{' };
  115. StringBuilder b;
  116. Lexer l;
  117. PosixExtended p(l);
  118. for (auto& ch : chars) {
  119. // First in ere
  120. b.clear();
  121. b.append(ch);
  122. pattern = b.build();
  123. l.set_source(pattern);
  124. p.parse();
  125. EXPECT(p.has_error());
  126. EXPECT(p.error() == regex::Error::InvalidRepetitionMarker);
  127. // After vertical line
  128. b.clear();
  129. b.append("a|");
  130. b.append(ch);
  131. pattern = b.build();
  132. l.set_source(pattern);
  133. p.parse();
  134. EXPECT(p.has_error());
  135. EXPECT(p.error() == regex::Error::InvalidRepetitionMarker);
  136. // After circumflex
  137. b.clear();
  138. b.append("^");
  139. b.append(ch);
  140. pattern = b.build();
  141. l.set_source(pattern);
  142. p.parse();
  143. EXPECT(p.has_error());
  144. EXPECT(p.error() == regex::Error::InvalidRepetitionMarker);
  145. // After dollar
  146. b.clear();
  147. b.append("$");
  148. b.append(ch);
  149. pattern = b.build();
  150. l.set_source(pattern);
  151. p.parse();
  152. EXPECT(p.has_error());
  153. EXPECT(p.error() == regex::Error::InvalidRepetitionMarker);
  154. // After left parens
  155. b.clear();
  156. b.append("(");
  157. b.append(ch);
  158. b.append(")");
  159. pattern = b.build();
  160. l.set_source(pattern);
  161. p.parse();
  162. EXPECT(p.has_error());
  163. EXPECT(p.error() == regex::Error::InvalidRepetitionMarker);
  164. }
  165. }
  166. TEST_CASE(parser_error_vertical_line_used_at_wrong_place)
  167. {
  168. Lexer l;
  169. PosixExtended p(l);
  170. // First in ere
  171. l.set_source("|asdf");
  172. p.parse();
  173. EXPECT(p.has_error());
  174. EXPECT(p.error() == regex::Error::EmptySubExpression);
  175. // Last in ere
  176. l.set_source("asdf|");
  177. p.parse();
  178. EXPECT(p.has_error());
  179. EXPECT(p.error() == regex::Error::EmptySubExpression);
  180. // After left parens
  181. l.set_source("(|asdf)");
  182. p.parse();
  183. EXPECT(p.has_error());
  184. EXPECT(p.error() == regex::Error::EmptySubExpression);
  185. // Proceed right parens
  186. l.set_source("(asdf)|");
  187. p.parse();
  188. EXPECT(p.has_error());
  189. EXPECT(p.error() == regex::Error::EmptySubExpression);
  190. }
  191. TEST_CASE(catch_all_first)
  192. {
  193. Regex<PosixExtended> re("^.*$");
  194. RegexResult m;
  195. re.match("Hello World", m);
  196. EXPECT(m.count == 1);
  197. EXPECT(re.match("Hello World", m));
  198. }
  199. TEST_CASE(catch_all)
  200. {
  201. Regex<PosixExtended> re("^.*$", PosixFlags::Global);
  202. EXPECT(re.has_match("Hello World"));
  203. EXPECT(re.match("Hello World").success);
  204. EXPECT(re.match("Hello World").count == 1);
  205. EXPECT(has_match("Hello World", re));
  206. auto res = match("Hello World", re);
  207. EXPECT(res.success);
  208. EXPECT(res.count == 1);
  209. EXPECT(res.matches.size() == 1);
  210. EXPECT(res.matches.first().view == "Hello World");
  211. }
  212. TEST_CASE(catch_all_again)
  213. {
  214. Regex<PosixExtended> re("^.*$", PosixFlags::Extra);
  215. EXPECT_EQ(has_match("Hello World", re), true);
  216. }
  217. TEST_CASE(char_utf8)
  218. {
  219. Regex<PosixExtended> re("😀");
  220. RegexResult result;
  221. EXPECT_EQ((result = match(Utf8View { "Привет, мир! 😀 γειά σου κόσμος 😀 こんにちは世界"sv }, re, PosixFlags::Global)).success, true);
  222. EXPECT_EQ(result.count, 2u);
  223. }
  224. TEST_CASE(catch_all_newline)
  225. {
  226. Regex<PosixExtended> re("^.*$", PosixFlags::Multiline | PosixFlags::StringCopyMatches);
  227. RegexResult result;
  228. auto lambda = [&result, &re]() {
  229. String aaa = "Hello World\nTest\n1234\n";
  230. result = match(aaa, re);
  231. EXPECT_EQ(result.success, true);
  232. };
  233. lambda();
  234. EXPECT_EQ(result.count, 3u);
  235. EXPECT_EQ(result.matches.at(0).view, "Hello World");
  236. EXPECT_EQ(result.matches.at(1).view, "Test");
  237. EXPECT_EQ(result.matches.at(2).view, "1234");
  238. }
  239. TEST_CASE(catch_all_newline_view)
  240. {
  241. Regex<PosixExtended> re("^.*$", PosixFlags::Multiline);
  242. RegexResult result;
  243. String aaa = "Hello World\nTest\n1234\n";
  244. result = match(aaa, re);
  245. EXPECT_EQ(result.success, true);
  246. EXPECT_EQ(result.count, 3u);
  247. String str = "Hello World";
  248. EXPECT_EQ(result.matches.at(0).view, str.view());
  249. EXPECT_EQ(result.matches.at(1).view, "Test");
  250. EXPECT_EQ(result.matches.at(2).view, "1234");
  251. }
  252. TEST_CASE(catch_all_newline_2)
  253. {
  254. Regex<PosixExtended> re("^.*$");
  255. RegexResult result;
  256. result = match("Hello World\nTest\n1234\n", re, PosixFlags::Multiline | PosixFlags::StringCopyMatches);
  257. EXPECT_EQ(result.success, true);
  258. EXPECT_EQ(result.count, 3u);
  259. EXPECT_EQ(result.matches.at(0).view, "Hello World");
  260. EXPECT_EQ(result.matches.at(1).view, "Test");
  261. EXPECT_EQ(result.matches.at(2).view, "1234");
  262. result = match("Hello World\nTest\n1234\n", re);
  263. EXPECT_EQ(result.success, true);
  264. EXPECT_EQ(result.count, 1u);
  265. EXPECT_EQ(result.matches.at(0).view, "Hello World\nTest\n1234\n");
  266. }
  267. TEST_CASE(match_all_character_class)
  268. {
  269. Regex<PosixExtended> re("[[:alpha:]]");
  270. String str = "[Window]\nOpacity=255\nAudibleBeep=0\n";
  271. RegexResult result = match(str, re, PosixFlags::Global | PosixFlags::StringCopyMatches);
  272. EXPECT_EQ(result.success, true);
  273. EXPECT_EQ(result.count, 24u);
  274. EXPECT_EQ(result.matches.at(0).view, "W");
  275. EXPECT_EQ(result.matches.at(1).view, "i");
  276. EXPECT_EQ(result.matches.at(2).view, "n");
  277. }
  278. TEST_CASE(match_character_class_with_assertion)
  279. {
  280. Regex<PosixExtended> re("[[:alpha:]]+$");
  281. String str = "abcdef";
  282. RegexResult result = match(str, re);
  283. EXPECT_EQ(result.success, true);
  284. EXPECT_EQ(result.count, 1u);
  285. }
  286. TEST_CASE(example_for_git_commit)
  287. {
  288. Regex<PosixExtended> re("^.*$");
  289. auto result = re.match("Well, hello friends!\nHello World!");
  290. EXPECT(result.success);
  291. EXPECT(result.count == 1);
  292. EXPECT(result.matches.at(0).view.starts_with("Well"));
  293. EXPECT(result.matches.at(0).view.length() == 33);
  294. EXPECT(re.has_match("Well,...."));
  295. result = re.match("Well, hello friends!\nHello World!", PosixFlags::Multiline);
  296. EXPECT(result.success);
  297. EXPECT(result.count == 2);
  298. EXPECT(result.matches.at(0).view == "Well, hello friends!");
  299. EXPECT(result.matches.at(1).view == "Hello World!");
  300. }
  301. TEST_CASE(email_address)
  302. {
  303. Regex<PosixExtended> re("^[A-Z0-9a-z._%+-]{1,64}@([A-Za-z0-9-]{1,63}\\.){1,125}[A-Za-z]{2,63}$");
  304. EXPECT(re.has_match("hello.world@domain.tld"));
  305. EXPECT(re.has_match("this.is.a.very_long_email_address@world.wide.web"));
  306. }
  307. TEST_CASE(ini_file_entries)
  308. {
  309. Regex<PosixExtended> re("[[:alpha:]]*=([[:digit:]]*)|\\[(.*)\\]");
  310. RegexResult result;
  311. if constexpr (REGEX_DEBUG) {
  312. RegexDebug regex_dbg(stderr);
  313. regex_dbg.print_raw_bytecode(re);
  314. regex_dbg.print_header();
  315. regex_dbg.print_bytecode(re);
  316. }
  317. String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
  318. EXPECT_EQ(re.search(haystack.view(), result, PosixFlags::Multiline), true);
  319. EXPECT_EQ(result.count, 3u);
  320. if constexpr (REGEX_DEBUG) {
  321. for (auto& v : result.matches)
  322. fprintf(stderr, "%s\n", v.view.to_string().characters());
  323. }
  324. EXPECT_EQ(result.matches.at(0).view, "[Window]");
  325. EXPECT_EQ(result.capture_group_matches.at(0).at(0).view, "Window");
  326. EXPECT_EQ(result.matches.at(1).view, "Opacity=255");
  327. EXPECT_EQ(result.matches.at(1).line, 1u);
  328. EXPECT_EQ(result.matches.at(1).column, 0u);
  329. EXPECT_EQ(result.capture_group_matches.at(1).at(0).view, "255");
  330. EXPECT_EQ(result.capture_group_matches.at(1).at(0).line, 1u);
  331. EXPECT_EQ(result.capture_group_matches.at(1).at(0).column, 8u);
  332. EXPECT_EQ(result.matches.at(2).view, "AudibleBeep=0");
  333. EXPECT_EQ(result.capture_group_matches.at(2).at(0).view, "0");
  334. EXPECT_EQ(result.capture_group_matches.at(2).at(0).line, 2u);
  335. EXPECT_EQ(result.capture_group_matches.at(2).at(0).column, 12u);
  336. }
  337. TEST_CASE(ini_file_entries2)
  338. {
  339. Regex<PosixExtended> re("[[:alpha:]]*=([[:digit:]]*)");
  340. RegexResult result;
  341. String haystack = "ViewMode=Icon";
  342. EXPECT_EQ(re.match(haystack.view(), result), false);
  343. EXPECT_EQ(result.count, 0u);
  344. EXPECT_EQ(re.search(haystack.view(), result), true);
  345. EXPECT_EQ(result.count, 1u);
  346. }
  347. TEST_CASE(named_capture_group)
  348. {
  349. Regex<PosixExtended> re("[[:alpha:]]*=(?<Test>[[:digit:]]*)");
  350. RegexResult result;
  351. if constexpr (REGEX_DEBUG) {
  352. RegexDebug regex_dbg(stderr);
  353. regex_dbg.print_raw_bytecode(re);
  354. regex_dbg.print_header();
  355. regex_dbg.print_bytecode(re);
  356. }
  357. String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
  358. EXPECT_EQ(re.search(haystack, result, PosixFlags::Multiline), true);
  359. EXPECT_EQ(result.count, 2u);
  360. EXPECT_EQ(result.matches.at(0).view, "Opacity=255");
  361. EXPECT_EQ(result.capture_group_matches.at(0).at(0).view, "255");
  362. EXPECT_EQ(result.capture_group_matches.at(0).at(0).capture_group_name, "Test");
  363. EXPECT_EQ(result.matches.at(1).view, "AudibleBeep=0");
  364. EXPECT_EQ(result.capture_group_matches.at(1).at(0).view, "0");
  365. EXPECT_EQ(result.capture_group_matches.at(1).at(0).capture_group_name, "Test");
  366. }
  367. TEST_CASE(ecma262_named_capture_group_with_dollar_sign)
  368. {
  369. Regex<ECMA262> re("[a-zA-Z]*=(?<$Test$>[0-9]*)");
  370. RegexResult result;
  371. if constexpr (REGEX_DEBUG) {
  372. RegexDebug regex_dbg(stderr);
  373. regex_dbg.print_raw_bytecode(re);
  374. regex_dbg.print_header();
  375. regex_dbg.print_bytecode(re);
  376. }
  377. String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
  378. EXPECT_EQ(re.search(haystack, result, ECMAScriptFlags::Multiline), true);
  379. EXPECT_EQ(result.count, 2u);
  380. EXPECT_EQ(result.matches.at(0).view, "Opacity=255");
  381. EXPECT_EQ(result.capture_group_matches.at(0).at(0).view, "255");
  382. EXPECT_EQ(result.capture_group_matches.at(0).at(0).capture_group_name, "$Test$");
  383. EXPECT_EQ(result.matches.at(1).view, "AudibleBeep=0");
  384. EXPECT_EQ(result.capture_group_matches.at(1).at(0).view, "0");
  385. EXPECT_EQ(result.capture_group_matches.at(1).at(0).capture_group_name, "$Test$");
  386. }
  387. TEST_CASE(a_star)
  388. {
  389. Regex<PosixExtended> re("a*");
  390. RegexResult result;
  391. if constexpr (REGEX_DEBUG) {
  392. RegexDebug regex_dbg(stderr);
  393. regex_dbg.print_raw_bytecode(re);
  394. regex_dbg.print_header();
  395. regex_dbg.print_bytecode(re);
  396. }
  397. String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
  398. EXPECT_EQ(re.search(haystack.view(), result, PosixFlags::Multiline), true);
  399. EXPECT_EQ(result.count, 32u);
  400. if (result.count == 32u) {
  401. EXPECT_EQ(result.matches.at(0).view.length(), 0u);
  402. EXPECT_EQ(result.matches.at(10).view.length(), 1u);
  403. EXPECT_EQ(result.matches.at(10).view, "a");
  404. EXPECT_EQ(result.matches.at(31).view.length(), 0u);
  405. }
  406. }
  407. TEST_CASE(simple_period_end_benchmark)
  408. {
  409. Regex<PosixExtended> re("hello.$");
  410. RegexResult m;
  411. EXPECT_EQ(re.search("Hello1", m), false);
  412. EXPECT_EQ(re.search("hello1hello1", m), true);
  413. EXPECT_EQ(re.search("hello2hell", m), false);
  414. EXPECT_EQ(re.search("hello?", m), true);
  415. }
  416. TEST_CASE(posix_extended_nested_capture_group)
  417. {
  418. Regex<PosixExtended> re("(h(e(?<llo>llo)))"); // group 0 -> "hello", group 1 -> "ello", group 2/"llo" -> "llo"
  419. auto result = re.match("hello");
  420. EXPECT(result.success);
  421. EXPECT_EQ(result.capture_group_matches.size(), 1u);
  422. EXPECT_EQ(result.capture_group_matches[0].size(), 3u);
  423. EXPECT_EQ(result.capture_group_matches[0][0].view, "hello"sv);
  424. EXPECT_EQ(result.capture_group_matches[0][1].view, "ello"sv);
  425. EXPECT_EQ(result.capture_group_matches[0][2].view, "llo"sv);
  426. }
  427. auto parse_test_case_long_disjunction_chain = String::repeated("a|"sv, 100000);
  428. TEST_CASE(ECMA262_parse)
  429. {
  430. struct _test {
  431. StringView pattern;
  432. regex::Error expected_error { regex::Error::NoError };
  433. regex::ECMAScriptFlags flags {};
  434. };
  435. _test const tests[] {
  436. { "^hello.$"sv },
  437. { "^(hello.)$"sv },
  438. { "^h{0,1}ello.$"sv },
  439. { "^hello\\W$"sv },
  440. { "^hell\\w.$"sv },
  441. { "^hell\\x6f1$"sv }, // ^hello1$
  442. { "^hel(?:l\\w).$"sv },
  443. { "^hel(?<LO>l\\w).$"sv },
  444. { "^[-a-zA-Z\\w\\s]+$"sv },
  445. { "\\bhello\\B"sv },
  446. { "^[\\w+/_-]+[=]{0,2}$"sv }, // #4189
  447. { "^(?:[^<]*(<[\\w\\W]+>)[^>]*$|#([\\w\\-]*)$)"sv }, // #4189
  448. { "\\/"sv }, // #4189
  449. { ",/=-:"sv }, // #4243
  450. { "\\x"sv }, // Even invalid escapes are allowed if ~unicode.
  451. { "\\x1"sv }, // Even invalid escapes are allowed if ~unicode.
  452. { "\\x1"sv, regex::Error::InvalidPattern, regex::ECMAScriptFlags::Unicode },
  453. { "\\x11"sv },
  454. { "\\x11"sv, regex::Error::NoError, regex::ECMAScriptFlags::Unicode },
  455. { "\\"sv, regex::Error::InvalidTrailingEscape },
  456. { "(?"sv, regex::Error::InvalidCaptureGroup },
  457. { "\\u1234"sv, regex::Error::NoError, regex::ECMAScriptFlags::Unicode },
  458. { "[\\u1234]"sv, regex::Error::NoError, regex::ECMAScriptFlags::Unicode },
  459. { "\\u1"sv, regex::Error::InvalidPattern, regex::ECMAScriptFlags::Unicode },
  460. { "[\\u1]"sv, regex::Error::InvalidPattern, regex::ECMAScriptFlags::Unicode },
  461. { ",(?"sv, regex::Error::InvalidCaptureGroup }, // #4583
  462. { "{1}"sv, regex::Error::InvalidPattern },
  463. { "{1,2}"sv, regex::Error::InvalidPattern },
  464. { "\\uxxxx"sv, regex::Error::NoError },
  465. { "\\uxxxx"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  466. { "\\ud83d"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
  467. { "\\ud83d\\uxxxx"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  468. { "\\u{0}"sv },
  469. { "\\u{0}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
  470. { "\\u{10ffff}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
  471. { "\\u{10ffff"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  472. { "\\u{10ffffx"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  473. { "\\u{110000}"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  474. { "\\p"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  475. { "\\p{"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  476. { "\\p{}"sv, regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
  477. { "\\p{AsCiI}"sv, regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
  478. { "\\p{hello friends}"sv, regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
  479. { "\\p{Prepended_Concatenation_Mark}"sv, regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
  480. { "\\p{ASCII}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
  481. { "\\\\p{1}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
  482. { "\\\\p{AsCiI}"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  483. { "\\\\p{ASCII}"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  484. { "\\c"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
  485. { "\\c"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  486. { "[\\c]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
  487. { "[\\c]"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  488. { "\\c`"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
  489. { "\\c`"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  490. { "[\\c`]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
  491. { "[\\c`]"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  492. { "\\A"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
  493. { "\\A"sv, regex::Error::InvalidCharacterClass, ECMAScriptFlags::Unicode },
  494. { "[\\A]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
  495. { "[\\A]"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  496. { "\\0"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
  497. { "\\0"sv, regex::Error::NoError, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
  498. { "\\00"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
  499. { "\\00"sv, regex::Error::InvalidCharacterClass, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
  500. { "[\\0]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
  501. { "[\\0]"sv, regex::Error::NoError, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
  502. { "[\\00]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
  503. { "[\\00]"sv, regex::Error::InvalidPattern, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
  504. { "\\^\\$\\\\\\.\\*\\+\\?\\(\\)\\[\\]\\{\\}\\|\\/"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
  505. { "[\\^\\$\\\\\\.\\*\\+\\?\\(\\)\\[\\]\\{\\}\\|\\/]"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
  506. { "]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
  507. { "]"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  508. { "\\]"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
  509. { "}"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
  510. { "}"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  511. { "\\}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
  512. { "a{9007199254740991}"sv }, // 2^53 - 1
  513. { "a{9007199254740991,}"sv },
  514. { "a{9007199254740991,9007199254740991}"sv },
  515. { "a{9007199254740992}"sv, regex::Error::InvalidBraceContent },
  516. { "a{9007199254740992,}"sv, regex::Error::InvalidBraceContent },
  517. { "a{9007199254740991,9007199254740992}"sv, regex::Error::InvalidBraceContent },
  518. { "a{9007199254740992,9007199254740991}"sv, regex::Error::InvalidBraceContent },
  519. { "a{9007199254740992,9007199254740992}"sv, regex::Error::InvalidBraceContent },
  520. { "(?<a>a)(?<a>b)"sv, regex::Error::DuplicateNamedCapture },
  521. { "(?<a>a)(?<b>b)(?<a>c)"sv, regex::Error::DuplicateNamedCapture },
  522. { "(?<1a>a)"sv, regex::Error::InvalidNameForCaptureGroup },
  523. { "(?<\\a>a)"sv, regex::Error::InvalidNameForCaptureGroup },
  524. { "(?<\ta>a)"sv, regex::Error::InvalidNameForCaptureGroup },
  525. { "(?<$$_$$>a)"sv },
  526. { "(?<ÿ>a)"sv },
  527. { "(?<𝓑𝓻𝓸𝔀𝓷>a)"sv },
  528. { "((?=lg)?[vl]k\\-?\\d{3}) bui| 3\\.[-\\w; ]{10}lg?-([06cv9]{3,4})"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended }, // #12373, quantifiable assertions.
  529. { parse_test_case_long_disjunction_chain.view() }, // A whole lot of disjunctions, should not overflow the stack.
  530. };
  531. for (auto& test : tests) {
  532. Regex<ECMA262> re(test.pattern, test.flags);
  533. EXPECT_EQ(re.parser_result.error, test.expected_error);
  534. if constexpr (REGEX_DEBUG) {
  535. dbgln("\n");
  536. RegexDebug regex_dbg(stderr);
  537. regex_dbg.print_raw_bytecode(re);
  538. regex_dbg.print_header();
  539. regex_dbg.print_bytecode(re);
  540. dbgln("\n");
  541. }
  542. }
  543. }
  544. TEST_CASE(ECMA262_match)
  545. {
  546. struct _test {
  547. StringView pattern;
  548. StringView subject;
  549. bool matches { true };
  550. ECMAScriptFlags options {};
  551. };
  552. // clang-format off
  553. constexpr _test tests[] {
  554. { "^hello.$"sv, "hello1"sv },
  555. { "^(hello.)$"sv, "hello1"sv },
  556. { "^h{0,1}ello.$"sv, "ello1"sv },
  557. { "^hello\\W$"sv, "hello!"sv },
  558. { "^hell\\w.$"sv, "hellx!"sv },
  559. { "^hell\\x6f1$"sv, "hello1"sv },
  560. { "^hel(?<LO>l.)1$"sv, "hello1"sv },
  561. { "^hel(?<LO>l.)1*\\k<LO>.$"sv, "hello1lo1"sv },
  562. { "^[-a-z1-3\\s]+$"sv, "hell2 o1"sv },
  563. { "^[\\0-\\x1f]$"sv, "\n"sv },
  564. { .pattern = "\\bhello\\B"sv, .subject = "hello1"sv, .options = ECMAScriptFlags::Global },
  565. { "\\b.*\\b"sv, "hello1"sv },
  566. { "[^\\D\\S]{2}"sv, "1 "sv },
  567. { "bar(?=f.)foo"sv, "barfoo"sv },
  568. { "bar(?=foo)bar"sv, "barbar"sv, false },
  569. { "bar(?!foo)bar"sv, "barbar"sv, true },
  570. { "bar(?!bar)bar"sv, "barbar"sv, false },
  571. { "bar.*(?<=foo)"sv, "barbar"sv, false },
  572. { "bar.*(?<!foo)"sv, "barbar"sv, true },
  573. { "((...)X)+"sv, "fooXbarXbazX"sv, true },
  574. { "(?:)"sv, ""sv, true },
  575. { "\\^"sv, "^"sv },
  576. { "\\^\\$\\\\\\.\\*\\+\\?\\(\\)\\[\\]\\{\\}\\|\\/"sv, "^$\\.*+?()[]{}|/"sv, true, ECMAScriptFlags::Unicode },
  577. { "[\\^\\$\\\\\\.\\*\\+\\?\\(\\)\\[\\]\\{\\}\\|\\/]{15}"sv, "^$\\.*+?()[]{}|/"sv, true, ECMAScriptFlags::Unicode },
  578. { "(a{2}){3}"sv, "aaaaaa"sv },
  579. { "(a{2}){3}"sv, "aaaabaa"sv, false },
  580. { "(a{2}){4}"sv, "aaaaaaaa"sv },
  581. { "(a{2}){4}"sv, "aaaaaabaa"sv, false },
  582. { "(a{3}){2}"sv, "aaaaaa"sv },
  583. { "(a{3}){2}"sv, "aaaabaa"sv, false },
  584. { "(a{4}){2}"sv, "aaaaaaaa"sv },
  585. { "(a{4}){2}"sv, "aaaaaabaa"sv, false },
  586. { "\\u{4}"sv, "uuuu" },
  587. { "(?<=.{3})f"sv, "abcdef"sv, true, (ECMAScriptFlags)regex::AllFlags::Global },
  588. { "(?<=.{3})f"sv, "abc😀ef"sv, true, (ECMAScriptFlags)regex::AllFlags::Global },
  589. // ECMA262, B.1.4. Regular Expression Pattern extensions for browsers
  590. { "{"sv, "{"sv, true, ECMAScriptFlags::BrowserExtended },
  591. { "\\5"sv, "\5"sv, true, ECMAScriptFlags::BrowserExtended },
  592. { "\\05"sv, "\5"sv, true, ECMAScriptFlags::BrowserExtended },
  593. { "\\455"sv, "\45""5"sv, true, ECMAScriptFlags::BrowserExtended },
  594. { "\\314"sv, "\314"sv, true, ECMAScriptFlags::BrowserExtended },
  595. { "\\c"sv, "\\c"sv, true, ECMAScriptFlags::BrowserExtended },
  596. { "\\cf"sv, "\06"sv, true, ECMAScriptFlags::BrowserExtended },
  597. { "\\c1"sv, "\\c1"sv, true, ECMAScriptFlags::BrowserExtended },
  598. { "[\\c1]"sv, "\x11"sv, true, ECMAScriptFlags::BrowserExtended },
  599. { "[\\w-\\d]"sv, "-"sv, true, ECMAScriptFlags::BrowserExtended },
  600. { "^(?:^^\\.?|[!+-]|!=|!==|#|%|%=|&|&&|&&=|&=|\\(|\\*|\\*=|\\+=|,|-=|->|\\/|\\/=|:|::|;|<|<<|<<=|<=|=|==|===|>|>=|>>|>>=|>>>|>>>=|[?@[^]|\\^=|\\^\\^|\\^\\^=|{|\\||\\|=|\\|\\||\\|\\|=|~|break|case|continue|delete|do|else|finally|instanceof|return|throw|try|typeof)\\s*(\\/(?=[^*/])(?:[^/[\\\\]|\\\\[\\S\\s]|\\[(?:[^\\\\\\]]|\\\\[\\S\\s])*(?:]|$))+\\/)"sv,
  601. "return /xx/"sv, true, ECMAScriptFlags::BrowserExtended
  602. }, // #5517, appears to be matching JS expressions that involve regular expressions...
  603. { "a{2,}"sv, "aaaa"sv }, // #5518
  604. { "\\0"sv, "\0"sv, true, ECMAScriptFlags::BrowserExtended },
  605. { "\\0"sv, "\0"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
  606. { "\\01"sv, "\1"sv, true, ECMAScriptFlags::BrowserExtended },
  607. { "[\\0]"sv, "\0"sv, true, ECMAScriptFlags::BrowserExtended },
  608. { "[\\0]"sv, "\0"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
  609. { "[\\01]"sv, "\1"sv, true, ECMAScriptFlags::BrowserExtended },
  610. { "(\0|a)"sv, "a"sv, true }, // #9686, Should allow null bytes in pattern
  611. { "(.*?)a(?!(a+)b\\2c)\\2(.*)"sv, "baaabaac"sv, true }, // #6042, Groups inside lookarounds may be referenced outside, but their contents appear empty if the pattern in the lookaround fails.
  612. { "a|$"sv, "x"sv, true, (ECMAScriptFlags)regex::AllFlags::Global }, // #11940, Global (not the 'g' flag) regexps should attempt to match the zero-length end of the string too.
  613. { "foo\nbar"sv, "foo\nbar"sv, true }, // #12126, ECMA262 regexp should match literal newlines without the 's' flag.
  614. { "foo[^]bar"sv, "foo\nbar"sv, true }, // #12126, ECMA262 regexp should match newline with [^].
  615. };
  616. // clang-format on
  617. for (auto& test : tests) {
  618. Regex<ECMA262> re(test.pattern, test.options);
  619. if constexpr (REGEX_DEBUG) {
  620. dbgln("\n");
  621. RegexDebug regex_dbg(stderr);
  622. regex_dbg.print_raw_bytecode(re);
  623. regex_dbg.print_header();
  624. regex_dbg.print_bytecode(re);
  625. dbgln("\n");
  626. }
  627. EXPECT_EQ(re.parser_result.error, regex::Error::NoError);
  628. EXPECT_EQ(re.match(test.subject).success, test.matches);
  629. }
  630. }
  631. TEST_CASE(ECMA262_unicode_match)
  632. {
  633. constexpr auto space_and_line_terminator_code_points = Array { 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000, 0xFEFF };
  634. StringBuilder builder;
  635. for (u32 code_point : space_and_line_terminator_code_points)
  636. builder.append_code_point(code_point);
  637. auto space_and_line_terminators = builder.build();
  638. struct _test {
  639. StringView pattern;
  640. StringView subject;
  641. bool matches { true };
  642. ECMAScriptFlags options {};
  643. };
  644. _test tests[] {
  645. { "\xf0\x9d\x8c\x86"sv, "abcdef"sv, false, ECMAScriptFlags::Unicode },
  646. { "[\xf0\x9d\x8c\x86]"sv, "abcdef"sv, false, ECMAScriptFlags::Unicode },
  647. { "\\ud83d"sv, "😀"sv, true },
  648. { "\\ud83d"sv, "😀"sv, false, ECMAScriptFlags::Unicode },
  649. { "\\ude00"sv, "😀"sv, true },
  650. { "\\ude00"sv, "😀"sv, false, ECMAScriptFlags::Unicode },
  651. { "\\ud83d\\ude00"sv, "😀"sv, true },
  652. { "\\ud83d\\ude00"sv, "😀"sv, true, ECMAScriptFlags::Unicode },
  653. { "\\u{1f600}"sv, "😀"sv, true, ECMAScriptFlags::Unicode },
  654. { "\\ud83d\\ud83d"sv, "\xed\xa0\xbd\xed\xa0\xbd"sv, true },
  655. { "\\ud83d\\ud83d"sv, "\xed\xa0\xbd\xed\xa0\xbd"sv, true, ECMAScriptFlags::Unicode },
  656. { "(?<=.{3})f"sv, "abcdef"sv, true, ECMAScriptFlags::Unicode },
  657. { "(?<=.{3})f"sv, "abc😀ef"sv, true, ECMAScriptFlags::Unicode },
  658. { "(?<𝓑𝓻𝓸𝔀𝓷>brown)"sv, "brown"sv, true, ECMAScriptFlags::Unicode },
  659. { "(?<\\u{1d4d1}\\u{1d4fb}\\u{1d4f8}\\u{1d500}\\u{1d4f7}>brown)"sv, "brown"sv, true, ECMAScriptFlags::Unicode },
  660. { "(?<\\ud835\\udcd1\\ud835\\udcfb\\ud835\\udcf8\\ud835\\udd00\\ud835\\udcf7>brown)"sv, "brown"sv, true, ECMAScriptFlags::Unicode },
  661. { "^\\s+$"sv, space_and_line_terminators },
  662. { "^\\s+$"sv, space_and_line_terminators, true, ECMAScriptFlags::Unicode },
  663. };
  664. for (auto& test : tests) {
  665. Regex<ECMA262> re(test.pattern, (ECMAScriptFlags)regex::AllFlags::Global | test.options);
  666. auto subject = AK::utf8_to_utf16(test.subject);
  667. Utf16View view { subject };
  668. if constexpr (REGEX_DEBUG) {
  669. dbgln("\n");
  670. RegexDebug regex_dbg(stderr);
  671. regex_dbg.print_raw_bytecode(re);
  672. regex_dbg.print_header();
  673. regex_dbg.print_bytecode(re);
  674. dbgln("\n");
  675. }
  676. EXPECT_EQ(re.parser_result.error, regex::Error::NoError);
  677. EXPECT_EQ(re.match(view).success, test.matches);
  678. }
  679. }
  680. TEST_CASE(ECMA262_property_match)
  681. {
  682. struct _test {
  683. StringView pattern;
  684. StringView subject;
  685. bool matches { true };
  686. ECMAScriptFlags options {};
  687. };
  688. constexpr _test tests[] {
  689. { "\\p{ASCII}"sv, "a"sv, false },
  690. { "\\p{ASCII}"sv, "p{ASCII}"sv, true },
  691. { "\\p{ASCII}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
  692. { "\\p{ASCII}"sv, "😀"sv, false, ECMAScriptFlags::Unicode },
  693. { "\\P{ASCII}"sv, "a"sv, false, ECMAScriptFlags::Unicode },
  694. { "\\P{ASCII}"sv, "😀"sv, true, ECMAScriptFlags::Unicode },
  695. { "\\p{ASCII_Hex_Digit}"sv, "1"sv, true, ECMAScriptFlags::Unicode },
  696. { "\\p{ASCII_Hex_Digit}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
  697. { "\\p{ASCII_Hex_Digit}"sv, "x"sv, false, ECMAScriptFlags::Unicode },
  698. { "\\P{ASCII_Hex_Digit}"sv, "1"sv, false, ECMAScriptFlags::Unicode },
  699. { "\\P{ASCII_Hex_Digit}"sv, "a"sv, false, ECMAScriptFlags::Unicode },
  700. { "\\P{ASCII_Hex_Digit}"sv, "x"sv, true, ECMAScriptFlags::Unicode },
  701. { "\\p{Any}"sv, "\xcd\xb8"sv, true, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point.
  702. { "\\P{Any}"sv, "\xcd\xb8"sv, false, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point.
  703. { "\\p{Assigned}"sv, "\xcd\xb8"sv, false, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point.
  704. { "\\P{Assigned}"sv, "\xcd\xb8"sv, true, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point.
  705. { "\\p{Lu}"sv, "a"sv, false, ECMAScriptFlags::Unicode },
  706. { "\\p{Lu}"sv, "A"sv, true, ECMAScriptFlags::Unicode },
  707. { "\\p{Lu}"sv, "9"sv, false, ECMAScriptFlags::Unicode },
  708. { "\\p{Cased_Letter}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
  709. { "\\p{Cased_Letter}"sv, "A"sv, true, ECMAScriptFlags::Unicode },
  710. { "\\p{Cased_Letter}"sv, "9"sv, false, ECMAScriptFlags::Unicode },
  711. { "\\P{Cased_Letter}"sv, "a"sv, false, ECMAScriptFlags::Unicode },
  712. { "\\P{Cased_Letter}"sv, "A"sv, false, ECMAScriptFlags::Unicode },
  713. { "\\P{Cased_Letter}"sv, "9"sv, true, ECMAScriptFlags::Unicode },
  714. { "\\p{General_Category=Cased_Letter}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
  715. { "\\p{General_Category=Cased_Letter}"sv, "A"sv, true, ECMAScriptFlags::Unicode },
  716. { "\\p{General_Category=Cased_Letter}"sv, "9"sv, false, ECMAScriptFlags::Unicode },
  717. { "\\p{gc=Cased_Letter}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
  718. { "\\p{gc=Cased_Letter}"sv, "A"sv, true, ECMAScriptFlags::Unicode },
  719. { "\\p{gc=Cased_Letter}"sv, "9"sv, false, ECMAScriptFlags::Unicode },
  720. { "\\p{Script=Latin}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
  721. { "\\p{Script=Latin}"sv, "A"sv, true, ECMAScriptFlags::Unicode },
  722. { "\\p{Script=Latin}"sv, "9"sv, false, ECMAScriptFlags::Unicode },
  723. { "\\p{sc=Latin}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
  724. { "\\p{sc=Latin}"sv, "A"sv, true, ECMAScriptFlags::Unicode },
  725. { "\\p{sc=Latin}"sv, "9"sv, false, ECMAScriptFlags::Unicode },
  726. { "\\p{Script_Extensions=Deva}"sv, "a"sv, false, ECMAScriptFlags::Unicode },
  727. { "\\p{Script_Extensions=Beng}"sv, "\xe1\xb3\x95"sv, true, ECMAScriptFlags::Unicode }, // U+01CD5
  728. { "\\p{Script_Extensions=Deva}"sv, "\xe1\xb3\x95"sv, true, ECMAScriptFlags::Unicode }, // U+01CD5
  729. { "\\p{scx=Deva}"sv, "a"sv, false, ECMAScriptFlags::Unicode },
  730. { "\\p{scx=Beng}"sv, "\xe1\xb3\x95"sv, true, ECMAScriptFlags::Unicode }, // U+01CD5
  731. { "\\p{scx=Deva}"sv, "\xe1\xb3\x95"sv, true, ECMAScriptFlags::Unicode }, // U+01CD5
  732. };
  733. for (auto& test : tests) {
  734. Regex<ECMA262> re(test.pattern, (ECMAScriptFlags)regex::AllFlags::Global | regex::ECMAScriptFlags::BrowserExtended | test.options);
  735. auto subject = AK::utf8_to_utf16(test.subject);
  736. Utf16View view { subject };
  737. if constexpr (REGEX_DEBUG) {
  738. dbgln("\n");
  739. RegexDebug regex_dbg(stderr);
  740. regex_dbg.print_raw_bytecode(re);
  741. regex_dbg.print_header();
  742. regex_dbg.print_bytecode(re);
  743. dbgln("\n");
  744. }
  745. EXPECT_EQ(re.parser_result.error, regex::Error::NoError);
  746. EXPECT_EQ(re.match(view).success, test.matches);
  747. }
  748. }
  749. TEST_CASE(replace)
  750. {
  751. struct _test {
  752. StringView pattern;
  753. StringView replacement;
  754. StringView subject;
  755. StringView expected;
  756. ECMAScriptFlags options {};
  757. };
  758. constexpr _test tests[] {
  759. { "foo(.+)"sv, "aaa"sv, "test"sv, "test"sv },
  760. { "foo(.+)"sv, "test\\1"sv, "foobar"sv, "testbar"sv },
  761. { "foo(.+)"sv, "\\2\\1"sv, "foobar"sv, "\\2bar"sv },
  762. { "foo(.+)"sv, "\\\\\\1"sv, "foobar"sv, "\\bar"sv },
  763. { "foo(.)"sv, "a\\1"sv, "fooxfooy"sv, "axay"sv, ECMAScriptFlags::Multiline },
  764. };
  765. for (auto& test : tests) {
  766. Regex<ECMA262> re(test.pattern, test.options);
  767. if constexpr (REGEX_DEBUG) {
  768. dbgln("\n");
  769. RegexDebug regex_dbg(stderr);
  770. regex_dbg.print_raw_bytecode(re);
  771. regex_dbg.print_header();
  772. regex_dbg.print_bytecode(re);
  773. dbgln("\n");
  774. }
  775. EXPECT_EQ(re.parser_result.error, regex::Error::NoError);
  776. EXPECT_EQ(re.replace(test.subject, test.replacement), test.expected);
  777. }
  778. }
  779. TEST_CASE(case_insensitive_match)
  780. {
  781. Regex<PosixExtended> re("cd", PosixFlags::Insensitive | PosixFlags::Global);
  782. auto result = re.match("AEKFCD");
  783. EXPECT_EQ(result.success, true);
  784. if (result.success) {
  785. EXPECT_EQ(result.matches.at(0).column, 4ul);
  786. }
  787. }
  788. TEST_CASE(extremely_long_fork_chain)
  789. {
  790. Regex<ECMA262> re("(?:aa)*");
  791. auto result = re.match(String::repeated('a', 1000));
  792. EXPECT_EQ(result.success, true);
  793. }
  794. TEST_CASE(theoretically_infinite_loop)
  795. {
  796. Array patterns {
  797. "(a*)*"sv, // Infinitely matching empty substrings, the outer loop should short-circuit.
  798. "(a*?)*"sv, // Infinitely matching empty substrings, the outer loop should short-circuit.
  799. "(a*)*?"sv, // Should match exactly nothing.
  800. "(?:)*?"sv, // Should not generate an infinite fork loop.
  801. };
  802. for (auto& pattern : patterns) {
  803. Regex<ECMA262> re(pattern);
  804. auto result = re.match("");
  805. EXPECT_EQ(result.success, true);
  806. }
  807. }
  808. static auto g_lots_of_a_s = String::repeated('a', 10'000'000);
  809. BENCHMARK_CASE(fork_performance)
  810. {
  811. Regex<ECMA262> re("(?:aa)*");
  812. auto result = re.match(g_lots_of_a_s);
  813. EXPECT_EQ(result.success, true);
  814. }
  815. TEST_CASE(optimizer_atomic_groups)
  816. {
  817. Array tests {
  818. // Fork -> ForkReplace
  819. Tuple { "a*b"sv, "aaaaa"sv, false },
  820. Tuple { "a+b"sv, "aaaaa"sv, false },
  821. Tuple { "\\\\(\\d+)"sv, "\\\\"sv, false }, // Rewrite bug turning a+ to a*, see #10952.
  822. // Alternative fuse
  823. Tuple { "(abcfoo|abcbar|abcbaz).*x"sv, "abcbarx"sv, true },
  824. Tuple { "(a|a)"sv, "a"sv, true },
  825. Tuple { "(a|)"sv, ""sv, true }, // Ensure that empty alternatives are not outright removed
  826. Tuple { "a{2,3}|a{5,8}"sv, "abc"sv, false }, // Optimizer should not mess up the instruction stream by ignoring inter-insn dependencies, see #11247.
  827. Tuple { "^(a{2,3}|a{5,8})$"sv, "aaaa"sv, false }, // Optimizer should not mess up the instruction stream by ignoring inter-insn dependencies, see #11247.
  828. // Optimizer should not chop off *half* of an instruction when fusing instructions.
  829. Tuple { "cubic-bezier\\(\\s*(-?\\d+\\.?\\d*|-?\\.\\d+)\\s*,\\s*(-?\\d+\\.?\\d*|-?\\.\\d+)\\s*,\\s*(-?\\d+\\.?\\d*|-?\\.\\d+)\\s*,\\s*(-?\\d+\\.?\\d*|-?\\.\\d+)\\s*\\)"sv, "cubic-bezier(.05, 0, 0, 1)"sv, true },
  830. // ForkReplace shouldn't be applied where it would change the semantics
  831. Tuple { "(1+)\\1"sv, "11"sv, true },
  832. Tuple { "(1+)1"sv, "11"sv, true },
  833. Tuple { "(1+)0"sv, "10"sv, true },
  834. // Rewrite should not skip over first required iteration of <x>+.
  835. Tuple { "a+"sv, ""sv, false },
  836. };
  837. for (auto& test : tests) {
  838. Regex<ECMA262> re(test.get<0>());
  839. auto result = re.match(test.get<1>());
  840. EXPECT_EQ(result.success, test.get<2>());
  841. }
  842. }
  843. TEST_CASE(optimizer_char_class_lut)
  844. {
  845. Regex<ECMA262> re(R"([\f\n\r\t\v\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000\ufeff]+$)");
  846. if constexpr (REGEX_DEBUG) {
  847. dbgln("\n");
  848. RegexDebug regex_dbg(stderr);
  849. regex_dbg.print_raw_bytecode(re);
  850. regex_dbg.print_header();
  851. regex_dbg.print_bytecode(re);
  852. dbgln("\n");
  853. }
  854. // This will go through _all_ alternatives in the character class, and then fail.
  855. for (size_t i = 0; i < 1'000'000; ++i)
  856. EXPECT_EQ(re.match("1635488940000"sv).success, false);
  857. }
  858. TEST_CASE(optimizer_alternation)
  859. {
  860. Array tests {
  861. // Pattern, Subject, Expected length
  862. Tuple { "a|"sv, "a"sv, 1u },
  863. };
  864. for (auto& test : tests) {
  865. Regex<ECMA262> re(test.get<0>());
  866. auto result = re.match(test.get<1>());
  867. EXPECT(result.success);
  868. EXPECT_EQ(result.matches.first().view.length(), test.get<2>());
  869. }
  870. }
  871. TEST_CASE(posix_basic_dollar_is_end_anchor)
  872. {
  873. // Ensure that a dollar sign at the end only matches the end of the line.
  874. {
  875. Regex<PosixBasic> re("abc$");
  876. EXPECT_EQ(re.match("123abcdef", PosixFlags::Global).success, false);
  877. EXPECT_EQ(re.match("123abc", PosixFlags::Global).success, true);
  878. EXPECT_EQ(re.match("123abc$def", PosixFlags::Global).success, false);
  879. EXPECT_EQ(re.match("123abc$", PosixFlags::Global).success, false);
  880. }
  881. }
  882. TEST_CASE(posix_basic_dollar_is_literal)
  883. {
  884. // Ensure that a dollar sign in the middle is treated as a literal.
  885. {
  886. Regex<PosixBasic> re("abc$d");
  887. EXPECT_EQ(re.match("123abcdef", PosixFlags::Global).success, false);
  888. EXPECT_EQ(re.match("123abc", PosixFlags::Global).success, false);
  889. EXPECT_EQ(re.match("123abc$def", PosixFlags::Global).success, true);
  890. EXPECT_EQ(re.match("123abc$", PosixFlags::Global).success, false);
  891. }
  892. // Ensure that a dollar sign is always treated as a literal if escaped, even if at the end of the pattern.
  893. {
  894. Regex<PosixBasic> re("abc\\$");
  895. EXPECT_EQ(re.match("123abcdef", PosixFlags::Global).success, false);
  896. EXPECT_EQ(re.match("123abc", PosixFlags::Global).success, false);
  897. EXPECT_EQ(re.match("123abc$def", PosixFlags::Global).success, true);
  898. EXPECT_EQ(re.match("123abc$", PosixFlags::Global).success, true);
  899. }
  900. }
  901. TEST_CASE(negative_lookahead)
  902. {
  903. {
  904. // Negative lookahead with more than 2 forks difference between lookahead init and finish.
  905. auto options = ECMAScriptOptions { ECMAScriptFlags::Global };
  906. options.reset_flag((ECMAScriptFlags)regex::AllFlags::Internal_Stateful);
  907. Regex<ECMA262> re(":(?!\\^\\)|1)", options);
  908. EXPECT_EQ(re.match(":^)").success, false);
  909. EXPECT_EQ(re.match(":1").success, false);
  910. EXPECT_EQ(re.match(":foobar").success, true);
  911. }
  912. }
  913. TEST_CASE(single_match_flag)
  914. {
  915. {
  916. // Ensure that only a single match is produced and nothing past that.
  917. Regex<ECMA262> re("[\\u0008-\\uffff]"sv, ECMAScriptFlags::Global | (ECMAScriptFlags)regex::AllFlags::SingleMatch);
  918. auto result = re.match("ABC");
  919. EXPECT_EQ(result.success, true);
  920. EXPECT_EQ(result.matches.size(), 1u);
  921. EXPECT_EQ(result.matches.first().view.to_string(), "A"sv);
  922. }
  923. }