RegexMatch.h 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #pragma once
  7. #include "Forward.h"
  8. #include "RegexOptions.h"
  9. #include <AK/FlyString.h>
  10. #include <AK/HashMap.h>
  11. #include <AK/MemMem.h>
  12. #include <AK/RedBlackTree.h>
  13. #include <AK/String.h>
  14. #include <AK/StringBuilder.h>
  15. #include <AK/StringView.h>
  16. #include <AK/Utf16View.h>
  17. #include <AK/Utf32View.h>
  18. #include <AK/Utf8View.h>
  19. #include <AK/Variant.h>
  20. #include <AK/Vector.h>
  21. namespace regex {
  22. template<typename T>
  23. class COWVector {
  24. struct Detail : RefCounted<Detail> {
  25. Vector<T> m_members;
  26. };
  27. public:
  28. COWVector()
  29. : m_detail(make_ref_counted<Detail>())
  30. {
  31. }
  32. COWVector(COWVector const&) = default;
  33. COWVector(COWVector&&) = default;
  34. COWVector& operator=(COWVector const&) = default;
  35. COWVector& operator=(COWVector&&) = default;
  36. Vector<T> release() &&
  37. {
  38. if (m_detail->ref_count() == 1)
  39. return exchange(m_detail->m_members, Vector<T>());
  40. return m_detail->m_members;
  41. }
  42. void append(T const& value)
  43. {
  44. return append(T { value });
  45. }
  46. void append(T&& value)
  47. {
  48. copy();
  49. m_detail->m_members.append(move(value));
  50. }
  51. void resize(size_t size)
  52. {
  53. copy();
  54. m_detail->m_members.resize(size);
  55. }
  56. void ensure_capacity(size_t capacity)
  57. {
  58. if (m_detail->m_members.capacity() >= capacity)
  59. return;
  60. copy();
  61. m_detail->m_members.ensure_capacity(capacity);
  62. }
  63. template<typename... Args>
  64. void empend(Args&&... args)
  65. {
  66. copy();
  67. m_detail->m_members.empend(forward<Args>(args)...);
  68. }
  69. void clear()
  70. {
  71. if (m_detail->ref_count() > 1)
  72. m_detail = make_ref_counted<Detail>();
  73. else
  74. m_detail->m_members.clear();
  75. }
  76. T& at(size_t index)
  77. {
  78. // We're handing out a mutable reference, so make sure we own the data exclusively.
  79. copy();
  80. return m_detail->m_members.at(index);
  81. }
  82. T const& at(size_t index) const
  83. {
  84. return m_detail->m_members.at(index);
  85. }
  86. T& operator[](size_t index)
  87. {
  88. // We're handing out a mutable reference, so make sure we own the data exclusively.
  89. copy();
  90. return m_detail->m_members[index];
  91. }
  92. T const& operator[](size_t index) const
  93. {
  94. return m_detail->m_members[index];
  95. }
  96. size_t capacity() const
  97. {
  98. return m_detail->m_members.capacity();
  99. }
  100. size_t size() const
  101. {
  102. return m_detail->m_members.size();
  103. }
  104. bool is_empty() const
  105. {
  106. return m_detail->m_members.is_empty();
  107. }
  108. T const& first() const
  109. {
  110. return m_detail->m_members.first();
  111. }
  112. T const& last() const
  113. {
  114. return m_detail->m_members.last();
  115. }
  116. private:
  117. void copy()
  118. {
  119. if (m_detail->ref_count() <= 1)
  120. return;
  121. auto new_detail = make_ref_counted<Detail>();
  122. new_detail->m_members = m_detail->m_members;
  123. m_detail = new_detail;
  124. }
  125. NonnullRefPtr<Detail> m_detail;
  126. };
  127. class RegexStringView {
  128. public:
  129. RegexStringView() = default;
  130. RegexStringView(String const& string)
  131. : m_view(string.view())
  132. {
  133. }
  134. RegexStringView(StringView const view)
  135. : m_view(view)
  136. {
  137. }
  138. RegexStringView(Utf32View view)
  139. : m_view(view)
  140. {
  141. }
  142. RegexStringView(Utf16View view)
  143. : m_view(view)
  144. {
  145. }
  146. RegexStringView(Utf8View view)
  147. : m_view(view)
  148. {
  149. }
  150. explicit RegexStringView(String&&) = delete;
  151. StringView string_view() const
  152. {
  153. return m_view.get<StringView>();
  154. }
  155. Utf32View const& u32_view() const
  156. {
  157. return m_view.get<Utf32View>();
  158. }
  159. Utf16View const& u16_view() const
  160. {
  161. return m_view.get<Utf16View>();
  162. }
  163. Utf8View const& u8_view() const
  164. {
  165. return m_view.get<Utf8View>();
  166. }
  167. bool unicode() const { return m_unicode; }
  168. void set_unicode(bool unicode) { m_unicode = unicode; }
  169. bool is_empty() const
  170. {
  171. return m_view.visit([](auto& view) { return view.is_empty(); });
  172. }
  173. bool is_null() const
  174. {
  175. return m_view.visit([](auto& view) { return view.is_null(); });
  176. }
  177. size_t length() const
  178. {
  179. if (unicode()) {
  180. return m_view.visit(
  181. [](Utf16View const& view) { return view.length_in_code_points(); },
  182. [](auto const& view) { return view.length(); });
  183. }
  184. return length_in_code_units();
  185. }
  186. size_t length_in_code_units() const
  187. {
  188. return m_view.visit(
  189. [](Utf16View const& view) { return view.length_in_code_units(); },
  190. [](Utf8View const& view) { return view.byte_length(); },
  191. [](auto const& view) { return view.length(); });
  192. }
  193. size_t length_of_code_point(u32 code_point) const
  194. {
  195. return m_view.visit(
  196. [](Utf32View const&) { return 1; },
  197. [&](Utf16View const&) {
  198. if (code_point < 0x10000)
  199. return 1;
  200. return 2;
  201. },
  202. [&](auto const&) {
  203. if (code_point <= 0x7f)
  204. return 1;
  205. if (code_point <= 0x07ff)
  206. return 2;
  207. if (code_point <= 0xffff)
  208. return 3;
  209. return 4;
  210. });
  211. }
  212. RegexStringView typed_null_view()
  213. {
  214. auto view = m_view.visit(
  215. [&]<typename T>(T const&) {
  216. return RegexStringView { T {} };
  217. });
  218. view.set_unicode(unicode());
  219. return view;
  220. }
  221. RegexStringView construct_as_same(Span<u32> data, Optional<String>& optional_string_storage, Vector<u16, 1>& optional_utf16_storage) const
  222. {
  223. auto view = m_view.visit(
  224. [&]<typename T>(T const&) {
  225. StringBuilder builder;
  226. for (auto ch : data)
  227. builder.append(ch); // Note: The type conversion is intentional.
  228. optional_string_storage = builder.build();
  229. return RegexStringView { T { *optional_string_storage } };
  230. },
  231. [&](Utf32View) {
  232. return RegexStringView { Utf32View { data.data(), data.size() } };
  233. },
  234. [&](Utf16View) {
  235. optional_utf16_storage = AK::utf32_to_utf16(Utf32View { data.data(), data.size() });
  236. return RegexStringView { Utf16View { optional_utf16_storage } };
  237. });
  238. view.set_unicode(unicode());
  239. return view;
  240. }
  241. Vector<RegexStringView> lines() const
  242. {
  243. return m_view.visit(
  244. [](StringView view) {
  245. auto views = view.lines(false);
  246. Vector<RegexStringView> new_views;
  247. for (auto& view : views)
  248. new_views.empend(view);
  249. return new_views;
  250. },
  251. [](Utf32View view) {
  252. if (view.is_empty())
  253. return Vector<RegexStringView> { view };
  254. Vector<RegexStringView> views;
  255. u32 newline = '\n';
  256. while (!view.is_empty()) {
  257. auto position = AK::memmem_optional(view.code_points(), view.length() * sizeof(u32), &newline, sizeof(u32));
  258. if (!position.has_value())
  259. break;
  260. auto offset = position.value() / sizeof(u32);
  261. views.empend(view.substring_view(0, offset));
  262. view = view.substring_view(offset + 1, view.length() - offset - 1);
  263. }
  264. if (!view.is_empty())
  265. views.empend(view);
  266. return views;
  267. },
  268. [](Utf16View view) {
  269. if (view.is_empty())
  270. return Vector<RegexStringView> { view };
  271. Vector<RegexStringView> views;
  272. u16 newline = '\n';
  273. while (!view.is_empty()) {
  274. auto position = AK::memmem_optional(view.data(), view.length_in_code_units() * sizeof(u16), &newline, sizeof(u16));
  275. if (!position.has_value())
  276. break;
  277. auto offset = position.value() / sizeof(u16);
  278. views.empend(view.substring_view(0, offset));
  279. view = view.substring_view(offset + 1, view.length_in_code_units() - offset - 1);
  280. }
  281. if (!view.is_empty())
  282. views.empend(view);
  283. return views;
  284. },
  285. [](Utf8View const& view) {
  286. if (view.is_empty())
  287. return Vector<RegexStringView> { view };
  288. Vector<RegexStringView> views;
  289. auto it = view.begin();
  290. auto previous_newline_position_it = it;
  291. for (;;) {
  292. if (*it == '\n') {
  293. auto previous_offset = view.byte_offset_of(previous_newline_position_it);
  294. auto new_offset = view.byte_offset_of(it);
  295. auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
  296. views.empend(slice);
  297. ++it;
  298. previous_newline_position_it = it;
  299. }
  300. if (it.done())
  301. break;
  302. ++it;
  303. }
  304. if (it != previous_newline_position_it) {
  305. auto previous_offset = view.byte_offset_of(previous_newline_position_it);
  306. auto new_offset = view.byte_offset_of(it);
  307. auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
  308. views.empend(slice);
  309. }
  310. return views;
  311. });
  312. }
  313. RegexStringView substring_view(size_t offset, size_t length) const
  314. {
  315. if (unicode()) {
  316. auto view = m_view.visit(
  317. [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; },
  318. [&](Utf16View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; },
  319. [&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; });
  320. view.set_unicode(unicode());
  321. return view;
  322. }
  323. auto view = m_view.visit([&](auto view) { return RegexStringView { view.substring_view(offset, length) }; });
  324. view.set_unicode(unicode());
  325. return view;
  326. }
  327. String to_string() const
  328. {
  329. return m_view.visit(
  330. [](StringView view) { return view.to_string(); },
  331. [](Utf16View view) { return view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes); },
  332. [](auto& view) {
  333. StringBuilder builder;
  334. for (auto it = view.begin(); it != view.end(); ++it)
  335. builder.append_code_point(*it);
  336. return builder.to_string();
  337. });
  338. }
  339. // Note: index must always be the code unit offset to return.
  340. u32 operator[](size_t index) const
  341. {
  342. return m_view.visit(
  343. [&](StringView view) -> u32 {
  344. auto ch = view[index];
  345. if constexpr (IsSigned<char>) {
  346. if (ch < 0)
  347. return 256u + ch;
  348. return ch;
  349. }
  350. },
  351. [&](Utf32View const& view) -> u32 { return view[index]; },
  352. [&](Utf16View const& view) -> u32 { return view.code_point_at(index); },
  353. [&](Utf8View const& view) -> u32 {
  354. auto it = view.iterator_at_byte_offset(index);
  355. VERIFY(it != view.end());
  356. return *it;
  357. });
  358. }
  359. size_t code_unit_offset_of(size_t code_point_index) const
  360. {
  361. return m_view.visit(
  362. [&](StringView view) -> u32 {
  363. Utf8View utf8_view { view };
  364. return utf8_view.byte_offset_of(code_point_index);
  365. },
  366. [&](Utf32View const&) -> u32 { return code_point_index; },
  367. [&](Utf16View const& view) -> u32 {
  368. return view.code_unit_offset_of(code_point_index);
  369. },
  370. [&](Utf8View const& view) -> u32 {
  371. return view.byte_offset_of(code_point_index);
  372. });
  373. }
  374. bool operator==(char const* cstring) const
  375. {
  376. return m_view.visit(
  377. [&](Utf32View) { return to_string() == cstring; },
  378. [&](Utf16View) { return to_string() == cstring; },
  379. [&](Utf8View const& view) { return view.as_string() == cstring; },
  380. [&](StringView view) { return view == cstring; });
  381. }
  382. bool operator==(String const& string) const
  383. {
  384. return m_view.visit(
  385. [&](Utf32View) { return to_string() == string; },
  386. [&](Utf16View) { return to_string() == string; },
  387. [&](Utf8View const& view) { return view.as_string() == string; },
  388. [&](StringView view) { return view == string; });
  389. }
  390. bool operator==(StringView string) const
  391. {
  392. return m_view.visit(
  393. [&](Utf32View) { return to_string() == string; },
  394. [&](Utf16View) { return to_string() == string; },
  395. [&](Utf8View const& view) { return view.as_string() == string; },
  396. [&](StringView view) { return view == string; });
  397. }
  398. bool operator==(Utf32View const& other) const
  399. {
  400. return m_view.visit(
  401. [&](Utf32View view) {
  402. return view.length() == other.length() && __builtin_memcmp(view.code_points(), other.code_points(), view.length() * sizeof(u32)) == 0;
  403. },
  404. [&](Utf16View) { return to_string() == RegexStringView { other }.to_string(); },
  405. [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); },
  406. [&](StringView view) { return view == RegexStringView { other }.to_string(); });
  407. }
  408. bool operator==(Utf16View const& other) const
  409. {
  410. return m_view.visit(
  411. [&](Utf32View) { return to_string() == RegexStringView { other }.to_string(); },
  412. [&](Utf16View const& view) { return view == other; },
  413. [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); },
  414. [&](StringView view) { return view == RegexStringView { other }.to_string(); });
  415. }
  416. bool operator==(Utf8View const& other) const
  417. {
  418. return m_view.visit(
  419. [&](Utf32View) { return to_string() == other.as_string(); },
  420. [&](Utf16View) { return to_string() == other.as_string(); },
  421. [&](Utf8View const& view) { return view.as_string() == other.as_string(); },
  422. [&](StringView view) { return other.as_string() == view; });
  423. }
  424. bool equals(RegexStringView other) const
  425. {
  426. return other.m_view.visit([this](auto const& view) { return operator==(view); });
  427. }
  428. bool equals_ignoring_case(RegexStringView other) const
  429. {
  430. // FIXME: Implement equals_ignoring_case() for unicode.
  431. return m_view.visit(
  432. [&](StringView view) {
  433. return other.m_view.visit(
  434. [&](StringView other_view) { return view.equals_ignoring_case(other_view); },
  435. [](auto&) -> bool { TODO(); });
  436. },
  437. [&](Utf16View view) {
  438. return other.m_view.visit(
  439. [&](Utf16View other_view) { return view.equals_ignoring_case(other_view); },
  440. [](auto&) -> bool { TODO(); });
  441. },
  442. [](auto&) -> bool { TODO(); });
  443. }
  444. bool starts_with(StringView str) const
  445. {
  446. return m_view.visit(
  447. [&](Utf32View) -> bool {
  448. TODO();
  449. },
  450. [&](Utf16View) -> bool {
  451. TODO();
  452. },
  453. [&](Utf8View const& view) { return view.as_string().starts_with(str); },
  454. [&](StringView view) { return view.starts_with(str); });
  455. }
  456. bool starts_with(Utf32View const& str) const
  457. {
  458. return m_view.visit(
  459. [&](Utf32View view) -> bool {
  460. if (str.length() > view.length())
  461. return false;
  462. if (str.length() == view.length())
  463. return operator==(str);
  464. for (size_t i = 0; i < str.length(); ++i) {
  465. if (str.at(i) != view.at(i))
  466. return false;
  467. }
  468. return true;
  469. },
  470. [&](Utf16View) -> bool { TODO(); },
  471. [&](Utf8View const& view) {
  472. auto it = view.begin();
  473. for (auto code_point : str) {
  474. if (it.done())
  475. return false;
  476. if (code_point != *it)
  477. return false;
  478. ++it;
  479. }
  480. return true;
  481. },
  482. [&](StringView) -> bool { TODO(); });
  483. }
  484. private:
  485. Variant<StringView, Utf8View, Utf16View, Utf32View> m_view { StringView {} };
  486. bool m_unicode { false };
  487. };
  488. class Match final {
  489. private:
  490. Optional<FlyString> string;
  491. public:
  492. Match() = default;
  493. ~Match() = default;
  494. Match(RegexStringView view_, size_t const line_, size_t const column_, size_t const global_offset_)
  495. : view(view_)
  496. , line(line_)
  497. , column(column_)
  498. , global_offset(global_offset_)
  499. , left_column(column_)
  500. {
  501. }
  502. Match(String string_, size_t const line_, size_t const column_, size_t const global_offset_)
  503. : string(move(string_))
  504. , view(string.value().view())
  505. , line(line_)
  506. , column(column_)
  507. , global_offset(global_offset_)
  508. {
  509. }
  510. Match(RegexStringView const view_, StringView capture_group_name_, size_t const line_, size_t const column_, size_t const global_offset_)
  511. : view(view_)
  512. , capture_group_name(capture_group_name_)
  513. , line(line_)
  514. , column(column_)
  515. , global_offset(global_offset_)
  516. , left_column(column_)
  517. {
  518. }
  519. void reset()
  520. {
  521. view = view.typed_null_view();
  522. capture_group_name.clear();
  523. line = 0;
  524. column = 0;
  525. global_offset = 0;
  526. left_column = 0;
  527. }
  528. RegexStringView view {};
  529. Optional<FlyString> capture_group_name {};
  530. size_t line { 0 };
  531. size_t column { 0 };
  532. size_t global_offset { 0 };
  533. // ugly, as not usable by user, but needed to prevent to create extra vectors that are
  534. // able to store the column when the left paren has been found
  535. size_t left_column { 0 };
  536. };
  537. struct MatchInput {
  538. RegexStringView view {};
  539. AllOptions regex_options {};
  540. size_t start_offset { 0 }; // For Stateful matches, saved and restored from Regex::start_offset.
  541. size_t match_index { 0 };
  542. size_t line { 0 };
  543. size_t column { 0 };
  544. size_t global_offset { 0 }; // For multiline matching, knowing the offset from start could be important
  545. mutable size_t fail_counter { 0 };
  546. mutable Vector<size_t> saved_positions;
  547. mutable Vector<size_t> saved_code_unit_positions;
  548. mutable Vector<size_t> saved_forks_since_last_save;
  549. mutable HashMap<u64, u64> checkpoints;
  550. mutable Optional<size_t> fork_to_replace;
  551. };
  552. struct MatchState {
  553. size_t string_position_before_match { 0 };
  554. size_t string_position { 0 };
  555. size_t string_position_in_code_units { 0 };
  556. size_t instruction_position { 0 };
  557. size_t fork_at_position { 0 };
  558. size_t forks_since_last_save { 0 };
  559. Optional<size_t> initiating_fork;
  560. COWVector<Match> matches;
  561. COWVector<Vector<Match>> capture_group_matches;
  562. COWVector<u64> repetition_marks;
  563. };
  564. }
  565. using regex::RegexStringView;
  566. template<>
  567. struct AK::Formatter<regex::RegexStringView> : Formatter<StringView> {
  568. ErrorOr<void> format(FormatBuilder& builder, regex::RegexStringView value)
  569. {
  570. auto string = value.to_string();
  571. return Formatter<StringView>::format(builder, string);
  572. }
  573. };