RegexMatch.h 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #pragma once
  7. #include "Forward.h"
  8. #include "RegexOptions.h"
  9. #include <AK/FlyString.h>
  10. #include <AK/HashMap.h>
  11. #include <AK/MemMem.h>
  12. #include <AK/String.h>
  13. #include <AK/StringBuilder.h>
  14. #include <AK/StringView.h>
  15. #include <AK/Utf16View.h>
  16. #include <AK/Utf32View.h>
  17. #include <AK/Utf8View.h>
  18. #include <AK/Variant.h>
  19. #include <AK/Vector.h>
  20. namespace regex {
  21. class RegexStringView {
  22. public:
  23. RegexStringView(char const* chars)
  24. : m_view(StringView { chars })
  25. {
  26. }
  27. RegexStringView(String const& string)
  28. : m_view(string.view())
  29. {
  30. }
  31. RegexStringView(StringView const view)
  32. : m_view(view)
  33. {
  34. }
  35. RegexStringView(Utf32View view)
  36. : m_view(view)
  37. {
  38. }
  39. RegexStringView(Utf16View view)
  40. : m_view(view)
  41. {
  42. }
  43. RegexStringView(Utf8View view)
  44. : m_view(view)
  45. {
  46. }
  47. explicit RegexStringView(String&&) = delete;
  48. StringView string_view() const
  49. {
  50. return m_view.get<StringView>();
  51. }
  52. Utf32View const& u32_view() const
  53. {
  54. return m_view.get<Utf32View>();
  55. }
  56. Utf16View const& u16_view() const
  57. {
  58. return m_view.get<Utf16View>();
  59. }
  60. Utf8View const& u8_view() const
  61. {
  62. return m_view.get<Utf8View>();
  63. }
  64. bool unicode() const { return m_unicode; }
  65. void set_unicode(bool unicode) { m_unicode = unicode; }
  66. bool is_empty() const
  67. {
  68. return m_view.visit([](auto& view) { return view.is_empty(); });
  69. }
  70. bool is_null() const
  71. {
  72. return m_view.visit([](auto& view) { return view.is_null(); });
  73. }
  74. size_t length() const
  75. {
  76. if (unicode()) {
  77. return m_view.visit(
  78. [](Utf16View const& view) { return view.length_in_code_points(); },
  79. [](auto const& view) { return view.length(); });
  80. }
  81. return length_in_code_units();
  82. }
  83. size_t length_in_code_units() const
  84. {
  85. return m_view.visit(
  86. [](Utf16View const& view) { return view.length_in_code_units(); },
  87. [](Utf8View const& view) { return view.byte_length(); },
  88. [](auto const& view) { return view.length(); });
  89. }
  90. size_t length_of_code_point(u32 code_point) const
  91. {
  92. return m_view.visit(
  93. [](Utf32View const&) { return 1; },
  94. [&](Utf16View const&) {
  95. if (code_point < 0x10000)
  96. return 1;
  97. return 2;
  98. },
  99. [&](auto const&) {
  100. if (code_point <= 0x7f)
  101. return 1;
  102. if (code_point <= 0x07ff)
  103. return 2;
  104. if (code_point <= 0xffff)
  105. return 3;
  106. return 4;
  107. });
  108. }
  109. RegexStringView typed_null_view()
  110. {
  111. auto view = m_view.visit(
  112. [&]<typename T>(T const&) {
  113. return RegexStringView { T {} };
  114. });
  115. view.set_unicode(unicode());
  116. return view;
  117. }
  118. RegexStringView construct_as_same(Span<u32> data, Optional<String>& optional_string_storage, Vector<u16, 1>& optional_utf16_storage) const
  119. {
  120. auto view = m_view.visit(
  121. [&]<typename T>(T const&) {
  122. StringBuilder builder;
  123. for (auto ch : data)
  124. builder.append(ch); // Note: The type conversion is intentional.
  125. optional_string_storage = builder.build();
  126. return RegexStringView { T { *optional_string_storage } };
  127. },
  128. [&](Utf32View) {
  129. return RegexStringView { Utf32View { data.data(), data.size() } };
  130. },
  131. [&](Utf16View) {
  132. optional_utf16_storage = AK::utf32_to_utf16(Utf32View { data.data(), data.size() });
  133. return RegexStringView { Utf16View { optional_utf16_storage } };
  134. });
  135. view.set_unicode(unicode());
  136. return view;
  137. }
  138. Vector<RegexStringView> lines() const
  139. {
  140. return m_view.visit(
  141. [](StringView view) {
  142. auto views = view.lines(false);
  143. Vector<RegexStringView> new_views;
  144. for (auto& view : views)
  145. new_views.empend(view);
  146. return new_views;
  147. },
  148. [](Utf32View view) {
  149. if (view.is_empty())
  150. return Vector<RegexStringView> { view };
  151. Vector<RegexStringView> views;
  152. u32 newline = '\n';
  153. while (!view.is_empty()) {
  154. auto position = AK::memmem_optional(view.code_points(), view.length() * sizeof(u32), &newline, sizeof(u32));
  155. if (!position.has_value())
  156. break;
  157. auto offset = position.value() / sizeof(u32);
  158. views.empend(view.substring_view(0, offset));
  159. view = view.substring_view(offset + 1, view.length() - offset - 1);
  160. }
  161. if (!view.is_empty())
  162. views.empend(view);
  163. return views;
  164. },
  165. [](Utf16View view) {
  166. if (view.is_empty())
  167. return Vector<RegexStringView> { view };
  168. Vector<RegexStringView> views;
  169. u16 newline = '\n';
  170. while (!view.is_empty()) {
  171. auto position = AK::memmem_optional(view.data(), view.length_in_code_units() * sizeof(u16), &newline, sizeof(u16));
  172. if (!position.has_value())
  173. break;
  174. auto offset = position.value() / sizeof(u16);
  175. views.empend(view.substring_view(0, offset));
  176. view = view.substring_view(offset + 1, view.length_in_code_units() - offset - 1);
  177. }
  178. if (!view.is_empty())
  179. views.empend(view);
  180. return views;
  181. },
  182. [](Utf8View const& view) {
  183. if (view.is_empty())
  184. return Vector<RegexStringView> { view };
  185. Vector<RegexStringView> views;
  186. auto it = view.begin();
  187. auto previous_newline_position_it = it;
  188. for (;;) {
  189. if (*it == '\n') {
  190. auto previous_offset = view.byte_offset_of(previous_newline_position_it);
  191. auto new_offset = view.byte_offset_of(it);
  192. auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
  193. views.empend(slice);
  194. ++it;
  195. previous_newline_position_it = it;
  196. }
  197. if (it.done())
  198. break;
  199. ++it;
  200. }
  201. if (it != previous_newline_position_it) {
  202. auto previous_offset = view.byte_offset_of(previous_newline_position_it);
  203. auto new_offset = view.byte_offset_of(it);
  204. auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
  205. views.empend(slice);
  206. }
  207. return views;
  208. });
  209. }
  210. RegexStringView substring_view(size_t offset, size_t length) const
  211. {
  212. if (unicode()) {
  213. auto view = m_view.visit(
  214. [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; },
  215. [&](Utf16View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; },
  216. [&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; });
  217. view.set_unicode(unicode());
  218. return view;
  219. }
  220. auto view = m_view.visit([&](auto view) { return RegexStringView { view.substring_view(offset, length) }; });
  221. view.set_unicode(unicode());
  222. return view;
  223. }
  224. String to_string() const
  225. {
  226. return m_view.visit(
  227. [](StringView view) { return view.to_string(); },
  228. [](Utf16View view) { return view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes); },
  229. [](auto& view) {
  230. StringBuilder builder;
  231. for (auto it = view.begin(); it != view.end(); ++it)
  232. builder.append_code_point(*it);
  233. return builder.to_string();
  234. });
  235. }
  236. // Note: index must always be the code unit offset to return.
  237. u32 operator[](size_t index) const
  238. {
  239. return m_view.visit(
  240. [&](StringView view) -> u32 {
  241. auto ch = view[index];
  242. if (ch < 0)
  243. return 256u + ch;
  244. return ch;
  245. },
  246. [&](Utf32View const& view) -> u32 { return view[index]; },
  247. [&](Utf16View const& view) -> u32 { return view.code_point_at(index); },
  248. [&](Utf8View const& view) -> u32 {
  249. auto it = view.iterator_at_byte_offset(index);
  250. VERIFY(it != view.end());
  251. return *it;
  252. });
  253. }
  254. size_t code_unit_offset_of(size_t code_point_index) const
  255. {
  256. return m_view.visit(
  257. [&](StringView view) -> u32 {
  258. Utf8View utf8_view { view };
  259. return utf8_view.byte_offset_of(code_point_index);
  260. },
  261. [&](Utf32View const&) -> u32 { return code_point_index; },
  262. [&](Utf16View const& view) -> u32 {
  263. return view.code_unit_offset_of(code_point_index);
  264. },
  265. [&](Utf8View const& view) -> u32 {
  266. return view.byte_offset_of(code_point_index);
  267. });
  268. }
  269. bool operator==(char const* cstring) const
  270. {
  271. return m_view.visit(
  272. [&](Utf32View) { return to_string() == cstring; },
  273. [&](Utf16View) { return to_string() == cstring; },
  274. [&](Utf8View const& view) { return view.as_string() == cstring; },
  275. [&](StringView view) { return view == cstring; });
  276. }
  277. bool operator!=(char const* cstring) const
  278. {
  279. return !(*this == cstring);
  280. }
  281. bool operator==(String const& string) const
  282. {
  283. return m_view.visit(
  284. [&](Utf32View) { return to_string() == string; },
  285. [&](Utf16View) { return to_string() == string; },
  286. [&](Utf8View const& view) { return view.as_string() == string; },
  287. [&](StringView view) { return view == string; });
  288. }
  289. bool operator==(StringView string) const
  290. {
  291. return m_view.visit(
  292. [&](Utf32View) { return to_string() == string; },
  293. [&](Utf16View) { return to_string() == string; },
  294. [&](Utf8View const& view) { return view.as_string() == string; },
  295. [&](StringView view) { return view == string; });
  296. }
  297. bool operator!=(StringView other) const
  298. {
  299. return !(*this == other);
  300. }
  301. bool operator==(Utf32View const& other) const
  302. {
  303. return m_view.visit(
  304. [&](Utf32View view) {
  305. return view.length() == other.length() && __builtin_memcmp(view.code_points(), other.code_points(), view.length() * sizeof(u32)) == 0;
  306. },
  307. [&](Utf16View) { return to_string() == RegexStringView { other }.to_string(); },
  308. [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); },
  309. [&](StringView view) { return view == RegexStringView { other }.to_string(); });
  310. }
  311. bool operator!=(Utf32View const& other) const
  312. {
  313. return !(*this == other);
  314. }
  315. bool operator==(Utf16View const& other) const
  316. {
  317. return m_view.visit(
  318. [&](Utf32View) { return to_string() == RegexStringView { other }.to_string(); },
  319. [&](Utf16View const& view) { return view == other; },
  320. [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); },
  321. [&](StringView view) { return view == RegexStringView { other }.to_string(); });
  322. }
  323. bool operator!=(Utf16View const& other) const
  324. {
  325. return !(*this == other);
  326. }
  327. bool operator==(Utf8View const& other) const
  328. {
  329. return m_view.visit(
  330. [&](Utf32View) { return to_string() == other.as_string(); },
  331. [&](Utf16View) { return to_string() == other.as_string(); },
  332. [&](Utf8View const& view) { return view.as_string() == other.as_string(); },
  333. [&](StringView view) { return other.as_string() == view; });
  334. }
  335. bool operator!=(Utf8View const& other) const
  336. {
  337. return !(*this == other);
  338. }
  339. bool equals(RegexStringView other) const
  340. {
  341. return other.m_view.visit([this](auto const& view) { return operator==(view); });
  342. }
  343. bool equals_ignoring_case(RegexStringView other) const
  344. {
  345. // FIXME: Implement equals_ignoring_case() for unicode.
  346. return m_view.visit(
  347. [&](StringView view) {
  348. return other.m_view.visit(
  349. [&](StringView other_view) { return view.equals_ignoring_case(other_view); },
  350. [](auto&) -> bool { TODO(); });
  351. },
  352. [&](Utf16View view) {
  353. return other.m_view.visit(
  354. [&](Utf16View other_view) { return view.equals_ignoring_case(other_view); },
  355. [](auto&) -> bool { TODO(); });
  356. },
  357. [](auto&) -> bool { TODO(); });
  358. }
  359. bool starts_with(StringView str) const
  360. {
  361. return m_view.visit(
  362. [&](Utf32View) -> bool {
  363. TODO();
  364. },
  365. [&](Utf16View) -> bool {
  366. TODO();
  367. },
  368. [&](Utf8View const& view) { return view.as_string().starts_with(str); },
  369. [&](StringView view) { return view.starts_with(str); });
  370. }
  371. bool starts_with(Utf32View const& str) const
  372. {
  373. return m_view.visit(
  374. [&](Utf32View view) -> bool {
  375. if (str.length() > view.length())
  376. return false;
  377. if (str.length() == view.length())
  378. return operator==(str);
  379. for (size_t i = 0; i < str.length(); ++i) {
  380. if (str.at(i) != view.at(i))
  381. return false;
  382. }
  383. return true;
  384. },
  385. [&](Utf16View) -> bool { TODO(); },
  386. [&](Utf8View const& view) {
  387. auto it = view.begin();
  388. for (auto code_point : str) {
  389. if (it.done())
  390. return false;
  391. if (code_point != *it)
  392. return false;
  393. ++it;
  394. }
  395. return true;
  396. },
  397. [&](StringView) -> bool { TODO(); });
  398. }
  399. private:
  400. Variant<StringView, Utf8View, Utf16View, Utf32View> m_view;
  401. bool m_unicode { false };
  402. };
  403. class Match final {
  404. private:
  405. Optional<FlyString> string;
  406. public:
  407. Match() = default;
  408. ~Match() = default;
  409. Match(RegexStringView view_, size_t const line_, size_t const column_, size_t const global_offset_)
  410. : view(view_)
  411. , line(line_)
  412. , column(column_)
  413. , global_offset(global_offset_)
  414. , left_column(column_)
  415. {
  416. }
  417. Match(String string_, size_t const line_, size_t const column_, size_t const global_offset_)
  418. : string(move(string_))
  419. , view(string.value().view())
  420. , line(line_)
  421. , column(column_)
  422. , global_offset(global_offset_)
  423. {
  424. }
  425. Match(RegexStringView const view_, StringView capture_group_name_, size_t const line_, size_t const column_, size_t const global_offset_)
  426. : view(view_)
  427. , capture_group_name(capture_group_name_)
  428. , line(line_)
  429. , column(column_)
  430. , global_offset(global_offset_)
  431. , left_column(column_)
  432. {
  433. }
  434. void reset()
  435. {
  436. view = view.typed_null_view();
  437. capture_group_name.clear();
  438. line = 0;
  439. column = 0;
  440. global_offset = 0;
  441. left_column = 0;
  442. }
  443. RegexStringView view { nullptr };
  444. Optional<FlyString> capture_group_name {};
  445. size_t line { 0 };
  446. size_t column { 0 };
  447. size_t global_offset { 0 };
  448. // ugly, as not usable by user, but needed to prevent to create extra vectors that are
  449. // able to store the column when the left paren has been found
  450. size_t left_column { 0 };
  451. };
  452. struct MatchInput {
  453. RegexStringView view { nullptr };
  454. AllOptions regex_options {};
  455. size_t start_offset { 0 }; // For Stateful matches, saved and restored from Regex::start_offset.
  456. size_t match_index { 0 };
  457. size_t line { 0 };
  458. size_t column { 0 };
  459. size_t global_offset { 0 }; // For multiline matching, knowing the offset from start could be important
  460. mutable size_t fail_counter { 0 };
  461. mutable Vector<size_t> saved_positions;
  462. mutable Vector<size_t> saved_code_unit_positions;
  463. mutable Vector<size_t> saved_forks_since_last_save;
  464. mutable HashMap<u64, u64> checkpoints;
  465. mutable Optional<size_t> fork_to_replace;
  466. };
  467. struct MatchState {
  468. size_t string_position_before_match { 0 };
  469. size_t string_position { 0 };
  470. size_t string_position_in_code_units { 0 };
  471. size_t instruction_position { 0 };
  472. size_t fork_at_position { 0 };
  473. size_t forks_since_last_save { 0 };
  474. Optional<size_t> initiating_fork;
  475. Vector<Match> matches;
  476. Vector<Vector<Match>> capture_group_matches;
  477. Vector<u64> repetition_marks;
  478. };
  479. }
  480. using regex::RegexStringView;
  481. template<>
  482. struct AK::Formatter<regex::RegexStringView> : Formatter<StringView> {
  483. ErrorOr<void> format(FormatBuilder& builder, regex::RegexStringView value)
  484. {
  485. auto string = value.to_string();
  486. return Formatter<StringView>::format(builder, string);
  487. }
  488. };