// Regex.cpp (30 KB)
/*
 * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */
  6. #include "LibRegex/RegexMatcher.h"
  7. #include <LibTest/TestCase.h> // import first, to prevent warning of VERIFY* redefinition
  8. #include <AK/StringBuilder.h>
  9. #include <LibRegex/Regex.h>
  10. #include <LibRegex/RegexDebug.h>
  11. #include <stdio.h>
  12. static ECMAScriptOptions match_test_api_options(const ECMAScriptOptions options)
  13. {
  14. return options;
  15. }
  16. static PosixOptions match_test_api_options(const PosixOptions options)
  17. {
  18. return options;
  19. }
  20. TEST_CASE(regex_options_ecmascript)
  21. {
  22. ECMAScriptOptions eo;
  23. eo |= ECMAScriptFlags::Global;
  24. EXPECT(eo & ECMAScriptFlags::Global);
  25. EXPECT(!(eo & ECMAScriptFlags::Insensitive));
  26. eo = match_test_api_options(ECMAScriptFlags::Global | ECMAScriptFlags::Insensitive | ECMAScriptFlags::Sticky);
  27. EXPECT(eo & ECMAScriptFlags::Global);
  28. EXPECT(eo & ECMAScriptFlags::Insensitive);
  29. EXPECT(eo & ECMAScriptFlags::Sticky);
  30. EXPECT(!(eo & ECMAScriptFlags::Unicode));
  31. EXPECT(!(eo & ECMAScriptFlags::Multiline));
  32. EXPECT(!(eo & ECMAScriptFlags::SingleLine));
  33. eo &= ECMAScriptFlags::Insensitive;
  34. EXPECT(!(eo & ECMAScriptFlags::Global));
  35. EXPECT(eo & ECMAScriptFlags::Insensitive);
  36. EXPECT(!(eo & ECMAScriptFlags::Multiline));
  37. eo &= ECMAScriptFlags::Sticky;
  38. EXPECT(!(eo & ECMAScriptFlags::Global));
  39. EXPECT(!(eo & ECMAScriptFlags::Insensitive));
  40. EXPECT(!(eo & ECMAScriptFlags::Multiline));
  41. EXPECT(!(eo & ECMAScriptFlags::Sticky));
  42. eo = ~ECMAScriptFlags::Insensitive;
  43. EXPECT(eo & ECMAScriptFlags::Global);
  44. EXPECT(!(eo & ECMAScriptFlags::Insensitive));
  45. EXPECT(eo & ECMAScriptFlags::Multiline);
  46. EXPECT(eo & ECMAScriptFlags::Sticky);
  47. }
  48. TEST_CASE(regex_options_posix)
  49. {
  50. PosixOptions eo;
  51. eo |= PosixFlags::Global;
  52. EXPECT(eo & PosixFlags::Global);
  53. EXPECT(!(eo & PosixFlags::Insensitive));
  54. eo = match_test_api_options(PosixFlags::Global | PosixFlags::Insensitive | PosixFlags::MatchNotBeginOfLine);
  55. EXPECT(eo & PosixFlags::Global);
  56. EXPECT(eo & PosixFlags::Insensitive);
  57. EXPECT(eo & PosixFlags::MatchNotBeginOfLine);
  58. EXPECT(!(eo & PosixFlags::Unicode));
  59. EXPECT(!(eo & PosixFlags::Multiline));
  60. eo &= PosixFlags::Insensitive;
  61. EXPECT(!(eo & PosixFlags::Global));
  62. EXPECT(eo & PosixFlags::Insensitive);
  63. EXPECT(!(eo & PosixFlags::Multiline));
  64. eo &= PosixFlags::MatchNotBeginOfLine;
  65. EXPECT(!(eo & PosixFlags::Global));
  66. EXPECT(!(eo & PosixFlags::Insensitive));
  67. EXPECT(!(eo & PosixFlags::Multiline));
  68. eo = ~PosixFlags::Insensitive;
  69. EXPECT(eo & PosixFlags::Global);
  70. EXPECT(!(eo & PosixFlags::Insensitive));
  71. EXPECT(eo & PosixFlags::Multiline);
  72. }
  73. TEST_CASE(regex_lexer)
  74. {
  75. Lexer l("/[.*+?^${}()|[\\]\\\\]/g");
  76. EXPECT(l.next().type() == regex::TokenType::Slash);
  77. EXPECT(l.next().type() == regex::TokenType::LeftBracket);
  78. EXPECT(l.next().type() == regex::TokenType::Period);
  79. EXPECT(l.next().type() == regex::TokenType::Asterisk);
  80. EXPECT(l.next().type() == regex::TokenType::Plus);
  81. EXPECT(l.next().type() == regex::TokenType::Questionmark);
  82. EXPECT(l.next().type() == regex::TokenType::Circumflex);
  83. EXPECT(l.next().type() == regex::TokenType::Dollar);
  84. EXPECT(l.next().type() == regex::TokenType::LeftCurly);
  85. EXPECT(l.next().type() == regex::TokenType::RightCurly);
  86. EXPECT(l.next().type() == regex::TokenType::LeftParen);
  87. EXPECT(l.next().type() == regex::TokenType::RightParen);
  88. EXPECT(l.next().type() == regex::TokenType::Pipe);
  89. EXPECT(l.next().type() == regex::TokenType::LeftBracket);
  90. EXPECT(l.next().type() == regex::TokenType::EscapeSequence);
  91. EXPECT(l.next().type() == regex::TokenType::EscapeSequence);
  92. EXPECT(l.next().type() == regex::TokenType::RightBracket);
  93. EXPECT(l.next().type() == regex::TokenType::Slash);
  94. EXPECT(l.next().type() == regex::TokenType::Char);
  95. }
  96. TEST_CASE(parser_error_parens)
  97. {
  98. String pattern = "test()test";
  99. Lexer l(pattern);
  100. PosixExtendedParser p(l);
  101. p.parse();
  102. EXPECT(p.has_error());
  103. EXPECT(p.error() == Error::EmptySubExpression);
  104. }
  105. TEST_CASE(parser_error_special_characters_used_at_wrong_place)
  106. {
  107. String pattern;
  108. Vector<char, 5> chars = { '*', '+', '?', '{' };
  109. StringBuilder b;
  110. Lexer l;
  111. PosixExtended p(l);
  112. for (auto& ch : chars) {
  113. // First in ere
  114. b.clear();
  115. b.append(ch);
  116. pattern = b.build();
  117. l.set_source(pattern);
  118. p.parse();
  119. EXPECT(p.has_error());
  120. EXPECT(p.error() == Error::InvalidRepetitionMarker);
  121. // After vertical line
  122. b.clear();
  123. b.append("a|");
  124. b.append(ch);
  125. pattern = b.build();
  126. l.set_source(pattern);
  127. p.parse();
  128. EXPECT(p.has_error());
  129. EXPECT(p.error() == Error::InvalidRepetitionMarker);
  130. // After circumflex
  131. b.clear();
  132. b.append("^");
  133. b.append(ch);
  134. pattern = b.build();
  135. l.set_source(pattern);
  136. p.parse();
  137. EXPECT(p.has_error());
  138. EXPECT(p.error() == Error::InvalidRepetitionMarker);
  139. // After dollar
  140. b.clear();
  141. b.append("$");
  142. b.append(ch);
  143. pattern = b.build();
  144. l.set_source(pattern);
  145. p.parse();
  146. EXPECT(p.has_error());
  147. EXPECT(p.error() == Error::InvalidRepetitionMarker);
  148. // After left parens
  149. b.clear();
  150. b.append("(");
  151. b.append(ch);
  152. b.append(")");
  153. pattern = b.build();
  154. l.set_source(pattern);
  155. p.parse();
  156. EXPECT(p.has_error());
  157. EXPECT(p.error() == Error::InvalidRepetitionMarker);
  158. }
  159. }
  160. TEST_CASE(parser_error_vertical_line_used_at_wrong_place)
  161. {
  162. Lexer l;
  163. PosixExtended p(l);
  164. // First in ere
  165. l.set_source("|asdf");
  166. p.parse();
  167. EXPECT(p.has_error());
  168. EXPECT(p.error() == Error::EmptySubExpression);
  169. // Last in ere
  170. l.set_source("asdf|");
  171. p.parse();
  172. EXPECT(p.has_error());
  173. EXPECT(p.error() == Error::EmptySubExpression);
  174. // After left parens
  175. l.set_source("(|asdf)");
  176. p.parse();
  177. EXPECT(p.has_error());
  178. EXPECT(p.error() == Error::EmptySubExpression);
  179. // Proceed right parens
  180. l.set_source("(asdf)|");
  181. p.parse();
  182. EXPECT(p.has_error());
  183. EXPECT(p.error() == Error::EmptySubExpression);
  184. }
  185. TEST_CASE(catch_all_first)
  186. {
  187. Regex<PosixExtended> re("^.*$");
  188. RegexResult m;
  189. re.match("Hello World", m);
  190. EXPECT(m.count == 1);
  191. EXPECT(re.match("Hello World", m));
  192. }
  193. TEST_CASE(catch_all)
  194. {
  195. Regex<PosixExtended> re("^.*$", PosixFlags::Global);
  196. EXPECT(re.has_match("Hello World"));
  197. EXPECT(re.match("Hello World").success);
  198. EXPECT(re.match("Hello World").count == 1);
  199. EXPECT(has_match("Hello World", re));
  200. auto res = match("Hello World", re);
  201. EXPECT(res.success);
  202. EXPECT(res.count == 1);
  203. EXPECT(res.matches.size() == 1);
  204. EXPECT(res.matches.first().view == "Hello World");
  205. }
  206. TEST_CASE(catch_all_again)
  207. {
  208. Regex<PosixExtended> re("^.*$", PosixFlags::Extra);
  209. EXPECT_EQ(has_match("Hello World", re), true);
  210. }
  211. TEST_CASE(char_utf8)
  212. {
  213. Regex<PosixExtended> re("😀");
  214. RegexResult result;
  215. EXPECT_EQ((result = match(Utf8View { "Привет, мир! 😀 γειά σου κόσμος 😀 こんにちは世界" }, re, PosixFlags::Global)).success, true);
  216. EXPECT_EQ(result.count, 2u);
  217. }
  218. TEST_CASE(catch_all_newline)
  219. {
  220. Regex<PosixExtended> re("^.*$", PosixFlags::Multiline | PosixFlags::StringCopyMatches);
  221. RegexResult result;
  222. auto lambda = [&result, &re]() {
  223. String aaa = "Hello World\nTest\n1234\n";
  224. result = match(aaa, re);
  225. EXPECT_EQ(result.success, true);
  226. };
  227. lambda();
  228. EXPECT_EQ(result.count, 3u);
  229. EXPECT_EQ(result.matches.at(0).view, "Hello World");
  230. EXPECT_EQ(result.matches.at(1).view, "Test");
  231. EXPECT_EQ(result.matches.at(2).view, "1234");
  232. }
  233. TEST_CASE(catch_all_newline_view)
  234. {
  235. Regex<PosixExtended> re("^.*$", PosixFlags::Multiline);
  236. RegexResult result;
  237. String aaa = "Hello World\nTest\n1234\n";
  238. result = match(aaa, re);
  239. EXPECT_EQ(result.success, true);
  240. EXPECT_EQ(result.count, 3u);
  241. String str = "Hello World";
  242. EXPECT_EQ(result.matches.at(0).view, str.view());
  243. EXPECT_EQ(result.matches.at(1).view, "Test");
  244. EXPECT_EQ(result.matches.at(2).view, "1234");
  245. }
  246. TEST_CASE(catch_all_newline_2)
  247. {
  248. Regex<PosixExtended> re("^.*$");
  249. RegexResult result;
  250. result = match("Hello World\nTest\n1234\n", re, PosixFlags::Multiline | PosixFlags::StringCopyMatches);
  251. EXPECT_EQ(result.success, true);
  252. EXPECT_EQ(result.count, 3u);
  253. EXPECT_EQ(result.matches.at(0).view, "Hello World");
  254. EXPECT_EQ(result.matches.at(1).view, "Test");
  255. EXPECT_EQ(result.matches.at(2).view, "1234");
  256. result = match("Hello World\nTest\n1234\n", re);
  257. EXPECT_EQ(result.success, true);
  258. EXPECT_EQ(result.count, 1u);
  259. EXPECT_EQ(result.matches.at(0).view, "Hello World\nTest\n1234\n");
  260. }
  261. TEST_CASE(match_all_character_class)
  262. {
  263. Regex<PosixExtended> re("[[:alpha:]]");
  264. String str = "[Window]\nOpacity=255\nAudibleBeep=0\n";
  265. RegexResult result = match(str, re, PosixFlags::Global | PosixFlags::StringCopyMatches);
  266. EXPECT_EQ(result.success, true);
  267. EXPECT_EQ(result.count, 24u);
  268. EXPECT_EQ(result.matches.at(0).view, "W");
  269. EXPECT_EQ(result.matches.at(1).view, "i");
  270. EXPECT_EQ(result.matches.at(2).view, "n");
  271. }
  272. TEST_CASE(match_character_class_with_assertion)
  273. {
  274. Regex<PosixExtended> re("[[:alpha:]]+$");
  275. String str = "abcdef";
  276. RegexResult result = match(str, re);
  277. EXPECT_EQ(result.success, true);
  278. EXPECT_EQ(result.count, 1u);
  279. }
  280. TEST_CASE(example_for_git_commit)
  281. {
  282. Regex<PosixExtended> re("^.*$");
  283. auto result = re.match("Well, hello friends!\nHello World!");
  284. EXPECT(result.success);
  285. EXPECT(result.count == 1);
  286. EXPECT(result.matches.at(0).view.starts_with("Well"));
  287. EXPECT(result.matches.at(0).view.length() == 33);
  288. EXPECT(re.has_match("Well,...."));
  289. result = re.match("Well, hello friends!\nHello World!", PosixFlags::Multiline);
  290. EXPECT(result.success);
  291. EXPECT(result.count == 2);
  292. EXPECT(result.matches.at(0).view == "Well, hello friends!");
  293. EXPECT(result.matches.at(1).view == "Hello World!");
  294. }
  295. TEST_CASE(email_address)
  296. {
  297. Regex<PosixExtended> re("^[A-Z0-9a-z._%+-]{1,64}@([A-Za-z0-9-]{1,63}\\.){1,125}[A-Za-z]{2,63}$");
  298. EXPECT(re.has_match("hello.world@domain.tld"));
  299. EXPECT(re.has_match("this.is.a.very_long_email_address@world.wide.web"));
  300. }
  301. TEST_CASE(ini_file_entries)
  302. {
  303. Regex<PosixExtended> re("[[:alpha:]]*=([[:digit:]]*)|\\[(.*)\\]");
  304. RegexResult result;
  305. if constexpr (REGEX_DEBUG) {
  306. RegexDebug regex_dbg(stderr);
  307. regex_dbg.print_raw_bytecode(re);
  308. regex_dbg.print_header();
  309. regex_dbg.print_bytecode(re);
  310. }
  311. String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
  312. EXPECT_EQ(re.search(haystack.view(), result, PosixFlags::Multiline), true);
  313. EXPECT_EQ(result.count, 3u);
  314. if constexpr (REGEX_DEBUG) {
  315. for (auto& v : result.matches)
  316. fprintf(stderr, "%s\n", v.view.to_string().characters());
  317. }
  318. EXPECT_EQ(result.matches.at(0).view, "[Window]");
  319. EXPECT_EQ(result.capture_group_matches.at(0).at(0).view, "Window");
  320. EXPECT_EQ(result.matches.at(1).view, "Opacity=255");
  321. EXPECT_EQ(result.matches.at(1).line, 1u);
  322. EXPECT_EQ(result.matches.at(1).column, 0u);
  323. EXPECT_EQ(result.capture_group_matches.at(1).at(0).view, "255");
  324. EXPECT_EQ(result.capture_group_matches.at(1).at(0).line, 1u);
  325. EXPECT_EQ(result.capture_group_matches.at(1).at(0).column, 8u);
  326. EXPECT_EQ(result.matches.at(2).view, "AudibleBeep=0");
  327. EXPECT_EQ(result.capture_group_matches.at(2).at(0).view, "0");
  328. EXPECT_EQ(result.capture_group_matches.at(2).at(0).line, 2u);
  329. EXPECT_EQ(result.capture_group_matches.at(2).at(0).column, 12u);
  330. }
  331. TEST_CASE(ini_file_entries2)
  332. {
  333. Regex<PosixExtended> re("[[:alpha:]]*=([[:digit:]]*)");
  334. RegexResult result;
  335. String haystack = "ViewMode=Icon";
  336. EXPECT_EQ(re.match(haystack.view(), result), false);
  337. EXPECT_EQ(result.count, 0u);
  338. EXPECT_EQ(re.search(haystack.view(), result), true);
  339. EXPECT_EQ(result.count, 1u);
  340. }
  341. TEST_CASE(named_capture_group)
  342. {
  343. Regex<PosixExtended> re("[[:alpha:]]*=(?<Test>[[:digit:]]*)");
  344. RegexResult result;
  345. if constexpr (REGEX_DEBUG) {
  346. RegexDebug regex_dbg(stderr);
  347. regex_dbg.print_raw_bytecode(re);
  348. regex_dbg.print_header();
  349. regex_dbg.print_bytecode(re);
  350. }
  351. String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
  352. EXPECT_EQ(re.search(haystack, result, PosixFlags::Multiline), true);
  353. EXPECT_EQ(result.count, 2u);
  354. EXPECT_EQ(result.matches.at(0).view, "Opacity=255");
  355. EXPECT_EQ(result.named_capture_group_matches.at(0).ensure("Test").view, "255");
  356. EXPECT_EQ(result.matches.at(1).view, "AudibleBeep=0");
  357. EXPECT_EQ(result.named_capture_group_matches.at(1).ensure("Test").view, "0");
  358. }
  359. TEST_CASE(ecma262_named_capture_group_with_dollar_sign)
  360. {
  361. Regex<ECMA262> re("[a-zA-Z]*=(?<$Test$>[0-9]*)");
  362. RegexResult result;
  363. if constexpr (REGEX_DEBUG) {
  364. RegexDebug regex_dbg(stderr);
  365. regex_dbg.print_raw_bytecode(re);
  366. regex_dbg.print_header();
  367. regex_dbg.print_bytecode(re);
  368. }
  369. String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
  370. EXPECT_EQ(re.search(haystack, result, ECMAScriptFlags::Multiline), true);
  371. EXPECT_EQ(result.count, 2u);
  372. EXPECT_EQ(result.matches.at(0).view, "Opacity=255");
  373. EXPECT_EQ(result.named_capture_group_matches.at(0).ensure("$Test$").view, "255");
  374. EXPECT_EQ(result.matches.at(1).view, "AudibleBeep=0");
  375. EXPECT_EQ(result.named_capture_group_matches.at(1).ensure("$Test$").view, "0");
  376. }
  377. TEST_CASE(a_star)
  378. {
  379. Regex<PosixExtended> re("a*");
  380. RegexResult result;
  381. if constexpr (REGEX_DEBUG) {
  382. RegexDebug regex_dbg(stderr);
  383. regex_dbg.print_raw_bytecode(re);
  384. regex_dbg.print_header();
  385. regex_dbg.print_bytecode(re);
  386. }
  387. String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
  388. EXPECT_EQ(re.search(haystack.view(), result, PosixFlags::Multiline), true);
  389. EXPECT_EQ(result.count, 32u);
  390. if (result.count == 32u) {
  391. EXPECT_EQ(result.matches.at(0).view.length(), 0u);
  392. EXPECT_EQ(result.matches.at(10).view.length(), 1u);
  393. EXPECT_EQ(result.matches.at(10).view, "a");
  394. EXPECT_EQ(result.matches.at(31).view.length(), 0u);
  395. }
  396. }
  397. TEST_CASE(simple_period_end_benchmark)
  398. {
  399. Regex<PosixExtended> re("hello.$");
  400. RegexResult m;
  401. EXPECT_EQ(re.search("Hello1", m), false);
  402. EXPECT_EQ(re.search("hello1hello1", m), true);
  403. EXPECT_EQ(re.search("hello2hell", m), false);
  404. EXPECT_EQ(re.search("hello?", m), true);
  405. }
  406. TEST_CASE(ECMA262_parse)
  407. {
  408. struct _test {
  409. StringView pattern;
  410. regex::Error expected_error { regex::Error::NoError };
  411. regex::ECMAScriptFlags flags {};
  412. };
  413. constexpr _test tests[] {
  414. { "^hello.$"sv },
  415. { "^(hello.)$"sv },
  416. { "^h{0,1}ello.$"sv },
  417. { "^hello\\W$"sv },
  418. { "^hell\\w.$"sv },
  419. { "^hell\\x6f1$"sv }, // ^hello1$
  420. { "^hel(?:l\\w).$"sv },
  421. { "^hel(?<LO>l\\w).$"sv },
  422. { "^[-a-zA-Z\\w\\s]+$"sv },
  423. { "\\bhello\\B"sv },
  424. { "^[\\w+/_-]+[=]{0,2}$"sv }, // #4189
  425. { "^(?:[^<]*(<[\\w\\W]+>)[^>]*$|#([\\w\\-]*)$)"sv }, // #4189
  426. { "\\/"sv }, // #4189
  427. { ",/=-:"sv }, // #4243
  428. { "\\x"sv }, // Even invalid escapes are allowed if ~unicode.
  429. { "\\x1"sv }, // Even invalid escapes are allowed if ~unicode.
  430. { "\\x1"sv, regex::Error::InvalidPattern, regex::ECMAScriptFlags::Unicode },
  431. { "\\x11"sv },
  432. { "\\x11"sv, regex::Error::NoError, regex::ECMAScriptFlags::Unicode },
  433. { "\\"sv, regex::Error::InvalidTrailingEscape },
  434. { "(?"sv, regex::Error::InvalidCaptureGroup },
  435. { "\\u1234"sv, regex::Error::NoError, regex::ECMAScriptFlags::Unicode },
  436. { "[\\u1234]"sv, regex::Error::NoError, regex::ECMAScriptFlags::Unicode },
  437. { "\\u1"sv, regex::Error::InvalidPattern, regex::ECMAScriptFlags::Unicode },
  438. { "[\\u1]"sv, regex::Error::InvalidPattern, regex::ECMAScriptFlags::Unicode },
  439. { ",(?"sv, regex::Error::InvalidCaptureGroup }, // #4583
  440. { "{1}"sv, regex::Error::InvalidPattern },
  441. { "{1,2}"sv, regex::Error::InvalidPattern },
  442. { "\\uxxxx"sv, regex::Error::NoError },
  443. { "\\uxxxx"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  444. { "\\ud83d"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
  445. { "\\ud83d\\uxxxx"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  446. { "\\u{0}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
  447. { "\\u{10ffff}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
  448. { "\\u{10ffff"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  449. { "\\u{10ffffx"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  450. { "\\u{110000}"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  451. { "\\p"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  452. { "\\p{"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  453. { "\\p{}"sv, regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
  454. { "\\p{AsCiI}"sv, regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
  455. { "\\p{hello friends}"sv, regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
  456. { "\\p{Prepended_Concatenation_Mark}"sv, regex::Error::InvalidNameForProperty, ECMAScriptFlags::Unicode },
  457. { "\\p{ASCII}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
  458. { "\\\\p{1}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
  459. { "\\\\p{AsCiI}"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  460. { "\\\\p{ASCII}"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  461. { "\\c"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
  462. { "\\c"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  463. { "[\\c]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
  464. { "[\\c]"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  465. { "\\c`"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
  466. { "\\c`"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  467. { "[\\c`]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
  468. { "[\\c`]"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  469. { "\\A"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
  470. { "\\A"sv, regex::Error::InvalidCharacterClass, ECMAScriptFlags::Unicode },
  471. { "[\\A]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
  472. { "[\\A]"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
  473. };
  474. for (auto& test : tests) {
  475. Regex<ECMA262> re(test.pattern, test.flags);
  476. EXPECT_EQ(re.parser_result.error, test.expected_error);
  477. if constexpr (REGEX_DEBUG) {
  478. dbgln("\n");
  479. RegexDebug regex_dbg(stderr);
  480. regex_dbg.print_raw_bytecode(re);
  481. regex_dbg.print_header();
  482. regex_dbg.print_bytecode(re);
  483. dbgln("\n");
  484. }
  485. }
  486. }
  487. TEST_CASE(ECMA262_match)
  488. {
  489. struct _test {
  490. StringView pattern;
  491. StringView subject;
  492. bool matches { true };
  493. ECMAScriptFlags options {};
  494. };
  495. // clang-format off
  496. constexpr _test tests[] {
  497. { "^hello.$"sv, "hello1"sv },
  498. { "^(hello.)$"sv, "hello1"sv },
  499. { "^h{0,1}ello.$"sv, "ello1"sv },
  500. { "^hello\\W$"sv, "hello!"sv },
  501. { "^hell\\w.$"sv, "hellx!"sv },
  502. { "^hell\\x6f1$"sv, "hello1"sv },
  503. { "^hel(?<LO>l.)1$"sv, "hello1"sv },
  504. { "^hel(?<LO>l.)1*\\k<LO>.$"sv, "hello1lo1"sv },
  505. { "^[-a-z1-3\\s]+$"sv, "hell2 o1"sv },
  506. { "^[\\0-\\x1f]$"sv, "\n"sv },
  507. { .pattern = "\\bhello\\B"sv, .subject = "hello1"sv, .options = ECMAScriptFlags::Global },
  508. { "\\b.*\\b"sv, "hello1"sv },
  509. { "[^\\D\\S]{2}"sv, "1 "sv },
  510. { "bar(?=f.)foo"sv, "barfoo"sv },
  511. { "bar(?=foo)bar"sv, "barbar"sv, false },
  512. { "bar(?!foo)bar"sv, "barbar"sv, true },
  513. { "bar(?!bar)bar"sv, "barbar"sv, false },
  514. { "bar.*(?<=foo)"sv, "barbar"sv, false },
  515. { "bar.*(?<!foo)"sv, "barbar"sv, true },
  516. { "((...)X)+"sv, "fooXbarXbazX"sv, true },
  517. { "(?:)"sv, ""sv, true },
  518. { "\\^"sv, "^"sv },
  519. // ECMA262, B.1.4. Regular Expression Pattern extensions for browsers
  520. { "{"sv, "{"sv, true, ECMAScriptFlags::BrowserExtended },
  521. { "\\5"sv, "\5"sv, true, ECMAScriptFlags::BrowserExtended },
  522. { "\\05"sv, "\5"sv, true, ECMAScriptFlags::BrowserExtended },
  523. { "\\455"sv, "\45""5"sv, true, ECMAScriptFlags::BrowserExtended },
  524. { "\\314"sv, "\314"sv, true, ECMAScriptFlags::BrowserExtended },
  525. { "\\c"sv, "\\c"sv, true, ECMAScriptFlags::BrowserExtended },
  526. { "\\cf"sv, "\06"sv, true, ECMAScriptFlags::BrowserExtended },
  527. { "\\c1"sv, "\\c1"sv, true, ECMAScriptFlags::BrowserExtended },
  528. { "[\\c1]"sv, "\x11"sv, true, ECMAScriptFlags::BrowserExtended },
  529. { "[\\w-\\d]"sv, "-"sv, true, ECMAScriptFlags::BrowserExtended },
  530. { "^(?:^^\\.?|[!+-]|!=|!==|#|%|%=|&|&&|&&=|&=|\\(|\\*|\\*=|\\+=|,|-=|->|\\/|\\/=|:|::|;|<|<<|<<=|<=|=|==|===|>|>=|>>|>>=|>>>|>>>=|[?@[^]|\\^=|\\^\\^|\\^\\^=|{|\\||\\|=|\\|\\||\\|\\|=|~|break|case|continue|delete|do|else|finally|instanceof|return|throw|try|typeof)\\s*(\\/(?=[^*/])(?:[^/[\\\\]|\\\\[\\S\\s]|\\[(?:[^\\\\\\]]|\\\\[\\S\\s])*(?:]|$))+\\/)"sv,
  531. "return /xx/"sv, true, ECMAScriptFlags::BrowserExtended
  532. }, // #5517, appears to be matching JS expressions that involve regular expressions...
  533. { "a{2,}"sv, "aaaa"sv }, // #5518
  534. };
  535. // clang-format on
  536. for (auto& test : tests) {
  537. Regex<ECMA262> re(test.pattern, test.options);
  538. if constexpr (REGEX_DEBUG) {
  539. dbgln("\n");
  540. RegexDebug regex_dbg(stderr);
  541. regex_dbg.print_raw_bytecode(re);
  542. regex_dbg.print_header();
  543. regex_dbg.print_bytecode(re);
  544. dbgln("\n");
  545. }
  546. EXPECT_EQ(re.parser_result.error, Error::NoError);
  547. EXPECT_EQ(re.match(test.subject).success, test.matches);
  548. }
  549. }
  550. TEST_CASE(ECMA262_unicode_match)
  551. {
  552. struct _test {
  553. StringView pattern;
  554. StringView subject;
  555. bool matches { true };
  556. ECMAScriptFlags options {};
  557. };
  558. _test tests[] {
  559. { "\\ud83d"sv, "😀"sv, true },
  560. { "\\ud83d"sv, "😀"sv, false, ECMAScriptFlags::Unicode },
  561. { "\\ude00"sv, "😀"sv, true },
  562. { "\\ude00"sv, "😀"sv, false, ECMAScriptFlags::Unicode },
  563. { "\\ud83d\\ude00"sv, "😀"sv, true },
  564. { "\\ud83d\\ude00"sv, "😀"sv, true, ECMAScriptFlags::Unicode },
  565. { "\\u{1f600}"sv, "😀"sv, true, ECMAScriptFlags::Unicode },
  566. { "\\ud83d\\ud83d"sv, "\xed\xa0\xbd\xed\xa0\xbd"sv, true },
  567. { "\\ud83d\\ud83d"sv, "\xed\xa0\xbd\xed\xa0\xbd"sv, true, ECMAScriptFlags::Unicode },
  568. };
  569. for (auto& test : tests) {
  570. Regex<ECMA262> re(test.pattern, (ECMAScriptFlags)regex::AllFlags::Global | test.options);
  571. auto subject = AK::utf8_to_utf16(test.subject);
  572. Utf16View view { subject };
  573. if constexpr (REGEX_DEBUG) {
  574. dbgln("\n");
  575. RegexDebug regex_dbg(stderr);
  576. regex_dbg.print_raw_bytecode(re);
  577. regex_dbg.print_header();
  578. regex_dbg.print_bytecode(re);
  579. dbgln("\n");
  580. }
  581. EXPECT_EQ(re.parser_result.error, Error::NoError);
  582. EXPECT_EQ(re.match(view).success, test.matches);
  583. }
  584. }
  585. TEST_CASE(ECMA262_property_match)
  586. {
  587. struct _test {
  588. StringView pattern;
  589. StringView subject;
  590. bool matches { true };
  591. ECMAScriptFlags options {};
  592. };
  593. constexpr _test tests[] {
  594. { "\\p{ASCII}"sv, "a"sv, false },
  595. { "\\p{ASCII}"sv, "p{ASCII}"sv, true },
  596. { "\\p{ASCII}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
  597. { "\\p{ASCII}"sv, "😀"sv, false, ECMAScriptFlags::Unicode },
  598. { "\\P{ASCII}"sv, "a"sv, false, ECMAScriptFlags::Unicode },
  599. { "\\P{ASCII}"sv, "😀"sv, true, ECMAScriptFlags::Unicode },
  600. { "\\p{ASCII_Hex_Digit}"sv, "1"sv, true, ECMAScriptFlags::Unicode },
  601. { "\\p{ASCII_Hex_Digit}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
  602. { "\\p{ASCII_Hex_Digit}"sv, "x"sv, false, ECMAScriptFlags::Unicode },
  603. { "\\P{ASCII_Hex_Digit}"sv, "1"sv, false, ECMAScriptFlags::Unicode },
  604. { "\\P{ASCII_Hex_Digit}"sv, "a"sv, false, ECMAScriptFlags::Unicode },
  605. { "\\P{ASCII_Hex_Digit}"sv, "x"sv, true, ECMAScriptFlags::Unicode },
  606. { "\\p{Any}"sv, "\xcd\xb8"sv, true, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point.
  607. { "\\P{Any}"sv, "\xcd\xb8"sv, false, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point.
  608. { "\\p{Assigned}"sv, "\xcd\xb8"sv, false, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point.
  609. { "\\P{Assigned}"sv, "\xcd\xb8"sv, true, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point.
  610. { "\\p{Lu}"sv, "a"sv, false, ECMAScriptFlags::Unicode },
  611. { "\\p{Lu}"sv, "A"sv, true, ECMAScriptFlags::Unicode },
  612. { "\\p{Lu}"sv, "9"sv, false, ECMAScriptFlags::Unicode },
  613. { "\\p{Cased_Letter}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
  614. { "\\p{Cased_Letter}"sv, "A"sv, true, ECMAScriptFlags::Unicode },
  615. { "\\p{Cased_Letter}"sv, "9"sv, false, ECMAScriptFlags::Unicode },
  616. { "\\P{Cased_Letter}"sv, "a"sv, false, ECMAScriptFlags::Unicode },
  617. { "\\P{Cased_Letter}"sv, "A"sv, false, ECMAScriptFlags::Unicode },
  618. { "\\P{Cased_Letter}"sv, "9"sv, true, ECMAScriptFlags::Unicode },
  619. { "\\p{General_Category=Cased_Letter}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
  620. { "\\p{General_Category=Cased_Letter}"sv, "A"sv, true, ECMAScriptFlags::Unicode },
  621. { "\\p{General_Category=Cased_Letter}"sv, "9"sv, false, ECMAScriptFlags::Unicode },
  622. { "\\p{gc=Cased_Letter}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
  623. { "\\p{gc=Cased_Letter}"sv, "A"sv, true, ECMAScriptFlags::Unicode },
  624. { "\\p{gc=Cased_Letter}"sv, "9"sv, false, ECMAScriptFlags::Unicode },
  625. { "\\p{Script=Latin}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
  626. { "\\p{Script=Latin}"sv, "A"sv, true, ECMAScriptFlags::Unicode },
  627. { "\\p{Script=Latin}"sv, "9"sv, false, ECMAScriptFlags::Unicode },
  628. { "\\p{sc=Latin}"sv, "a"sv, true, ECMAScriptFlags::Unicode },
  629. { "\\p{sc=Latin}"sv, "A"sv, true, ECMAScriptFlags::Unicode },
  630. { "\\p{sc=Latin}"sv, "9"sv, false, ECMAScriptFlags::Unicode },
  631. { "\\p{Script_Extensions=Deva}"sv, "a"sv, false, ECMAScriptFlags::Unicode },
  632. { "\\p{Script_Extensions=Beng}"sv, "\xe1\xb3\x95"sv, true, ECMAScriptFlags::Unicode }, // U+01CD5
  633. { "\\p{Script_Extensions=Deva}"sv, "\xe1\xb3\x95"sv, true, ECMAScriptFlags::Unicode }, // U+01CD5
  634. { "\\p{scx=Deva}"sv, "a"sv, false, ECMAScriptFlags::Unicode },
  635. { "\\p{scx=Beng}"sv, "\xe1\xb3\x95"sv, true, ECMAScriptFlags::Unicode }, // U+01CD5
  636. { "\\p{scx=Deva}"sv, "\xe1\xb3\x95"sv, true, ECMAScriptFlags::Unicode }, // U+01CD5
  637. };
  638. for (auto& test : tests) {
  639. Regex<ECMA262> re(test.pattern, (ECMAScriptFlags)regex::AllFlags::Global | regex::ECMAScriptFlags::BrowserExtended | test.options);
  640. auto subject = AK::utf8_to_utf16(test.subject);
  641. Utf16View view { subject };
  642. if constexpr (REGEX_DEBUG) {
  643. dbgln("\n");
  644. RegexDebug regex_dbg(stderr);
  645. regex_dbg.print_raw_bytecode(re);
  646. regex_dbg.print_header();
  647. regex_dbg.print_bytecode(re);
  648. dbgln("\n");
  649. }
  650. EXPECT_EQ(re.parser_result.error, Error::NoError);
  651. EXPECT_EQ(re.match(view).success, test.matches);
  652. }
  653. }
  654. TEST_CASE(replace)
  655. {
  656. struct _test {
  657. StringView pattern;
  658. StringView replacement;
  659. StringView subject;
  660. StringView expected;
  661. ECMAScriptFlags options {};
  662. };
  663. constexpr _test tests[] {
  664. { "foo(.+)"sv, "aaa"sv, "test"sv, "test"sv },
  665. { "foo(.+)"sv, "test\\1"sv, "foobar"sv, "testbar"sv },
  666. { "foo(.+)"sv, "\\2\\1"sv, "foobar"sv, "\\2bar"sv },
  667. { "foo(.+)"sv, "\\\\\\1"sv, "foobar"sv, "\\bar"sv },
  668. { "foo(.)"sv, "a\\1"sv, "fooxfooy"sv, "axay"sv, ECMAScriptFlags::Multiline },
  669. };
  670. for (auto& test : tests) {
  671. Regex<ECMA262> re(test.pattern, test.options);
  672. if constexpr (REGEX_DEBUG) {
  673. dbgln("\n");
  674. RegexDebug regex_dbg(stderr);
  675. regex_dbg.print_raw_bytecode(re);
  676. regex_dbg.print_header();
  677. regex_dbg.print_bytecode(re);
  678. dbgln("\n");
  679. }
  680. EXPECT_EQ(re.parser_result.error, Error::NoError);
  681. EXPECT_EQ(re.replace(test.subject, test.replacement), test.expected);
  682. }
  683. }
  684. TEST_CASE(case_insensitive_match)
  685. {
  686. Regex<PosixExtended> re("cd", PosixFlags::Insensitive | PosixFlags::Global);
  687. auto result = re.match("AEKFCD");
  688. EXPECT_EQ(result.success, true);
  689. if (result.success) {
  690. EXPECT_EQ(result.matches.at(0).column, 4ul);
  691. }
  692. }
  693. TEST_CASE(extremely_long_fork_chain)
  694. {
  695. Regex<ECMA262> re("(?:aa)*");
  696. auto result = re.match(String::repeated('a', 100'000));
  697. EXPECT_EQ(result.success, true);
  698. }
  699. static auto g_lots_of_a_s = String::repeated('a', 10'000'000);
  700. BENCHMARK_CASE(fork_performance)
  701. {
  702. Regex<ECMA262> re("(?:aa)*");
  703. auto result = re.match(g_lots_of_a_s);
  704. EXPECT_EQ(result.success, true);
  705. }