RegexMatch.h 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #pragma once
  7. #include "Forward.h"
  8. #include "RegexOptions.h"
  9. #include <AK/Error.h>
  10. #include <AK/ByteString.h>
  11. #include <AK/DeprecatedFlyString.h>
  12. #include <AK/HashMap.h>
  13. #include <AK/MemMem.h>
  14. #include <AK/RedBlackTree.h>
  15. #include <AK/StringBuilder.h>
  16. #include <AK/StringView.h>
  17. #include <AK/Utf16View.h>
  18. #include <AK/Utf32View.h>
  19. #include <AK/Utf8View.h>
  20. #include <AK/Variant.h>
  21. #include <AK/Vector.h>
  22. namespace regex {
  23. template<typename T>
  24. class COWVector {
  25. struct Detail : RefCounted<Detail> {
  26. Vector<T> m_members;
  27. };
  28. public:
  29. COWVector()
  30. : m_detail(make_ref_counted<Detail>())
  31. {
  32. }
  33. COWVector(COWVector const&) = default;
  34. COWVector(COWVector&&) = default;
  35. COWVector& operator=(COWVector const&) = default;
  36. COWVector& operator=(COWVector&&) = default;
  37. Vector<T> release() &&
  38. {
  39. if (m_detail->ref_count() == 1)
  40. return exchange(m_detail->m_members, Vector<T>());
  41. return m_detail->m_members;
  42. }
  43. void append(T const& value)
  44. {
  45. return append(T { value });
  46. }
  47. void append(T&& value)
  48. {
  49. copy();
  50. m_detail->m_members.append(move(value));
  51. }
  52. void resize(size_t size)
  53. {
  54. copy();
  55. m_detail->m_members.resize(size);
  56. }
  57. void ensure_capacity(size_t capacity)
  58. {
  59. if (m_detail->m_members.capacity() >= capacity)
  60. return;
  61. copy();
  62. m_detail->m_members.ensure_capacity(capacity);
  63. }
  64. template<typename... Args>
  65. void empend(Args&&... args)
  66. {
  67. copy();
  68. m_detail->m_members.empend(forward<Args>(args)...);
  69. }
  70. void clear()
  71. {
  72. if (m_detail->ref_count() > 1)
  73. m_detail = make_ref_counted<Detail>();
  74. else
  75. m_detail->m_members.clear();
  76. }
  77. T& at(size_t index)
  78. {
  79. // We're handing out a mutable reference, so make sure we own the data exclusively.
  80. copy();
  81. return m_detail->m_members.at(index);
  82. }
  83. T const& at(size_t index) const
  84. {
  85. return m_detail->m_members.at(index);
  86. }
  87. T& operator[](size_t index)
  88. {
  89. // We're handing out a mutable reference, so make sure we own the data exclusively.
  90. copy();
  91. return m_detail->m_members[index];
  92. }
  93. T const& operator[](size_t index) const
  94. {
  95. return m_detail->m_members[index];
  96. }
  97. size_t capacity() const
  98. {
  99. return m_detail->m_members.capacity();
  100. }
  101. size_t size() const
  102. {
  103. return m_detail->m_members.size();
  104. }
  105. bool is_empty() const
  106. {
  107. return m_detail->m_members.is_empty();
  108. }
  109. T const& first() const
  110. {
  111. return m_detail->m_members.first();
  112. }
  113. T const& last() const
  114. {
  115. return m_detail->m_members.last();
  116. }
  117. private:
  118. void copy()
  119. {
  120. if (m_detail->ref_count() <= 1)
  121. return;
  122. auto new_detail = make_ref_counted<Detail>();
  123. new_detail->m_members = m_detail->m_members;
  124. m_detail = new_detail;
  125. }
  126. NonnullRefPtr<Detail> m_detail;
  127. };
  128. class RegexStringView {
  129. public:
  130. RegexStringView() = default;
  131. RegexStringView(ByteString const& string)
  132. : m_view(string.view())
  133. {
  134. }
  135. RegexStringView(String const& string)
  136. : m_view(string.bytes_as_string_view())
  137. {
  138. }
  139. RegexStringView(StringView const view)
  140. : m_view(view)
  141. {
  142. }
  143. RegexStringView(Utf32View view)
  144. : m_view(view)
  145. {
  146. }
  147. RegexStringView(Utf16View view)
  148. : m_view(view)
  149. {
  150. }
  151. RegexStringView(Utf8View view)
  152. : m_view(view)
  153. {
  154. }
  155. explicit RegexStringView(ByteString&&) = delete;
  156. bool is_string_view() const
  157. {
  158. return m_view.has<StringView>();
  159. }
  160. StringView string_view() const
  161. {
  162. return m_view.get<StringView>();
  163. }
  164. Utf32View const& u32_view() const
  165. {
  166. return m_view.get<Utf32View>();
  167. }
  168. Utf16View const& u16_view() const
  169. {
  170. return m_view.get<Utf16View>();
  171. }
  172. Utf8View const& u8_view() const
  173. {
  174. return m_view.get<Utf8View>();
  175. }
  176. bool unicode() const { return m_unicode; }
  177. void set_unicode(bool unicode) { m_unicode = unicode; }
  178. bool is_empty() const
  179. {
  180. return m_view.visit([](auto& view) { return view.is_empty(); });
  181. }
  182. bool is_null() const
  183. {
  184. return m_view.visit([](auto& view) { return view.is_null(); });
  185. }
  186. size_t length() const
  187. {
  188. if (unicode()) {
  189. return m_view.visit(
  190. [](Utf16View const& view) { return view.length_in_code_points(); },
  191. [](auto const& view) { return view.length(); });
  192. }
  193. return length_in_code_units();
  194. }
  195. size_t length_in_code_units() const
  196. {
  197. return m_view.visit(
  198. [](Utf16View const& view) { return view.length_in_code_units(); },
  199. [](Utf8View const& view) { return view.byte_length(); },
  200. [](auto const& view) { return view.length(); });
  201. }
  202. size_t length_of_code_point(u32 code_point) const
  203. {
  204. return m_view.visit(
  205. [](Utf32View const&) { return 1; },
  206. [&](Utf16View const&) {
  207. if (code_point < 0x10000)
  208. return 1;
  209. return 2;
  210. },
  211. [&](auto const&) {
  212. if (code_point <= 0x7f)
  213. return 1;
  214. if (code_point <= 0x07ff)
  215. return 2;
  216. if (code_point <= 0xffff)
  217. return 3;
  218. return 4;
  219. });
  220. }
  221. RegexStringView typed_null_view()
  222. {
  223. auto view = m_view.visit(
  224. [&]<typename T>(T const&) {
  225. return RegexStringView { T {} };
  226. });
  227. view.set_unicode(unicode());
  228. return view;
  229. }
  230. RegexStringView construct_as_same(Span<u32> data, Optional<ByteString>& optional_string_storage, Utf16Data& optional_utf16_storage) const
  231. {
  232. auto view = m_view.visit(
  233. [&]<typename T>(T const&) {
  234. StringBuilder builder;
  235. for (auto ch : data)
  236. builder.append(ch); // Note: The type conversion is intentional.
  237. optional_string_storage = builder.to_byte_string();
  238. return RegexStringView { T { *optional_string_storage } };
  239. },
  240. [&](Utf32View) {
  241. return RegexStringView { Utf32View { data.data(), data.size() } };
  242. },
  243. [&](Utf16View) {
  244. optional_utf16_storage = AK::utf32_to_utf16(Utf32View { data.data(), data.size() }).release_value_but_fixme_should_propagate_errors();
  245. return RegexStringView { Utf16View { optional_utf16_storage } };
  246. });
  247. view.set_unicode(unicode());
  248. return view;
  249. }
  250. Vector<RegexStringView> lines() const
  251. {
  252. return m_view.visit(
  253. [](StringView view) {
  254. auto views = view.lines(false);
  255. Vector<RegexStringView> new_views;
  256. for (auto& view : views)
  257. new_views.empend(view);
  258. return new_views;
  259. },
  260. [](Utf32View view) {
  261. if (view.is_empty())
  262. return Vector<RegexStringView> { view };
  263. Vector<RegexStringView> views;
  264. u32 newline = '\n';
  265. while (!view.is_empty()) {
  266. auto position = AK::memmem_optional(view.code_points(), view.length() * sizeof(u32), &newline, sizeof(u32));
  267. if (!position.has_value())
  268. break;
  269. auto offset = position.value() / sizeof(u32);
  270. views.empend(view.substring_view(0, offset));
  271. view = view.substring_view(offset + 1, view.length() - offset - 1);
  272. }
  273. if (!view.is_empty())
  274. views.empend(view);
  275. return views;
  276. },
  277. [](Utf16View view) {
  278. if (view.is_empty())
  279. return Vector<RegexStringView> { view };
  280. Vector<RegexStringView> views;
  281. u16 newline = '\n';
  282. while (!view.is_empty()) {
  283. auto position = AK::memmem_optional(view.data(), view.length_in_code_units() * sizeof(u16), &newline, sizeof(u16));
  284. if (!position.has_value())
  285. break;
  286. auto offset = position.value() / sizeof(u16);
  287. views.empend(view.substring_view(0, offset));
  288. view = view.substring_view(offset + 1, view.length_in_code_units() - offset - 1);
  289. }
  290. if (!view.is_empty())
  291. views.empend(view);
  292. return views;
  293. },
  294. [](Utf8View const& view) {
  295. if (view.is_empty())
  296. return Vector<RegexStringView> { view };
  297. Vector<RegexStringView> views;
  298. auto it = view.begin();
  299. auto previous_newline_position_it = it;
  300. for (;;) {
  301. if (*it == '\n') {
  302. auto previous_offset = view.byte_offset_of(previous_newline_position_it);
  303. auto new_offset = view.byte_offset_of(it);
  304. auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
  305. views.empend(slice);
  306. ++it;
  307. previous_newline_position_it = it;
  308. }
  309. if (it.done())
  310. break;
  311. ++it;
  312. }
  313. if (it != previous_newline_position_it) {
  314. auto previous_offset = view.byte_offset_of(previous_newline_position_it);
  315. auto new_offset = view.byte_offset_of(it);
  316. auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
  317. views.empend(slice);
  318. }
  319. return views;
  320. });
  321. }
  322. RegexStringView substring_view(size_t offset, size_t length) const
  323. {
  324. if (unicode()) {
  325. auto view = m_view.visit(
  326. [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; },
  327. [&](Utf16View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; },
  328. [&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; });
  329. view.set_unicode(unicode());
  330. return view;
  331. }
  332. auto view = m_view.visit([&](auto view) { return RegexStringView { view.substring_view(offset, length) }; });
  333. view.set_unicode(unicode());
  334. return view;
  335. }
  336. ByteString to_byte_string() const
  337. {
  338. return m_view.visit(
  339. [](StringView view) { return view.to_byte_string(); },
  340. [](Utf16View view) { return view.to_byte_string(Utf16View::AllowInvalidCodeUnits::Yes).release_value_but_fixme_should_propagate_errors(); },
  341. [](auto& view) {
  342. StringBuilder builder;
  343. for (auto it = view.begin(); it != view.end(); ++it)
  344. builder.append_code_point(*it);
  345. return builder.to_byte_string();
  346. });
  347. }
  348. ErrorOr<String> to_string() const
  349. {
  350. return m_view.visit(
  351. [](StringView view) { return String::from_utf8(view); },
  352. [](Utf16View view) { return view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes); },
  353. [](auto& view) -> ErrorOr<String> {
  354. StringBuilder builder;
  355. for (auto it = view.begin(); it != view.end(); ++it)
  356. TRY(builder.try_append_code_point(*it));
  357. return builder.to_string();
  358. });
  359. }
  360. // Note: index must always be the code unit offset to return.
  361. u32 operator[](size_t index) const
  362. {
  363. return m_view.visit(
  364. [&](StringView view) -> u32 {
  365. auto ch = view[index];
  366. if constexpr (IsSigned<char>) {
  367. if (ch < 0)
  368. return 256u + ch;
  369. return ch;
  370. }
  371. },
  372. [&](Utf32View const& view) -> u32 { return view[index]; },
  373. [&](Utf16View const& view) -> u32 { return view.code_point_at(index); },
  374. [&](Utf8View const& view) -> u32 {
  375. auto it = view.iterator_at_byte_offset(index);
  376. VERIFY(it != view.end());
  377. return *it;
  378. });
  379. }
  380. u32 code_unit_at(size_t code_unit_index) const
  381. {
  382. if (unicode())
  383. return operator[](code_unit_index);
  384. return m_view.visit(
  385. [&](StringView view) -> u32 {
  386. auto ch = view[code_unit_index];
  387. if constexpr (IsSigned<char>) {
  388. if (ch < 0)
  389. return 256u + ch;
  390. return ch;
  391. }
  392. },
  393. [&](Utf32View const& view) -> u32 { return view[code_unit_index]; },
  394. [&](Utf16View const& view) -> u32 { return view.code_unit_at(code_unit_index); },
  395. [&](Utf8View const& view) -> u32 {
  396. auto it = view.iterator_at_byte_offset(code_unit_index);
  397. VERIFY(it != view.end());
  398. return *it;
  399. });
  400. }
  401. size_t code_unit_offset_of(size_t code_point_index) const
  402. {
  403. return m_view.visit(
  404. [&](StringView view) -> u32 {
  405. Utf8View utf8_view { view };
  406. return utf8_view.byte_offset_of(code_point_index);
  407. },
  408. [&](Utf32View const&) -> u32 { return code_point_index; },
  409. [&](Utf16View const& view) -> u32 {
  410. return view.code_unit_offset_of(code_point_index);
  411. },
  412. [&](Utf8View const& view) -> u32 {
  413. return view.byte_offset_of(code_point_index);
  414. });
  415. }
  416. bool operator==(char const* cstring) const
  417. {
  418. return m_view.visit(
  419. [&](Utf32View) { return to_byte_string() == cstring; },
  420. [&](Utf16View) { return to_byte_string() == cstring; },
  421. [&](Utf8View const& view) { return view.as_string() == cstring; },
  422. [&](StringView view) { return view == cstring; });
  423. }
  424. bool operator==(ByteString const& string) const
  425. {
  426. return m_view.visit(
  427. [&](Utf32View) { return to_byte_string() == string; },
  428. [&](Utf16View) { return to_byte_string() == string; },
  429. [&](Utf8View const& view) { return view.as_string() == string; },
  430. [&](StringView view) { return view == string; });
  431. }
  432. bool operator==(StringView string) const
  433. {
  434. return m_view.visit(
  435. [&](Utf32View) { return to_byte_string() == string; },
  436. [&](Utf16View) { return to_byte_string() == string; },
  437. [&](Utf8View const& view) { return view.as_string() == string; },
  438. [&](StringView view) { return view == string; });
  439. }
  440. bool operator==(Utf32View const& other) const
  441. {
  442. return m_view.visit(
  443. [&](Utf32View view) {
  444. return view.length() == other.length() && __builtin_memcmp(view.code_points(), other.code_points(), view.length() * sizeof(u32)) == 0;
  445. },
  446. [&](Utf16View) { return to_byte_string() == RegexStringView { other }.to_byte_string(); },
  447. [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_byte_string(); },
  448. [&](StringView view) { return view == RegexStringView { other }.to_byte_string(); });
  449. }
  450. bool operator==(Utf16View const& other) const
  451. {
  452. return m_view.visit(
  453. [&](Utf32View) { return to_byte_string() == RegexStringView { other }.to_byte_string(); },
  454. [&](Utf16View const& view) { return view == other; },
  455. [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_byte_string(); },
  456. [&](StringView view) { return view == RegexStringView { other }.to_byte_string(); });
  457. }
  458. bool operator==(Utf8View const& other) const
  459. {
  460. return m_view.visit(
  461. [&](Utf32View) { return to_byte_string() == other.as_string(); },
  462. [&](Utf16View) { return to_byte_string() == other.as_string(); },
  463. [&](Utf8View const& view) { return view.as_string() == other.as_string(); },
  464. [&](StringView view) { return other.as_string() == view; });
  465. }
  466. bool equals(RegexStringView other) const
  467. {
  468. return other.m_view.visit([this](auto const& view) { return operator==(view); });
  469. }
  470. bool equals_ignoring_case(RegexStringView other) const
  471. {
  472. // FIXME: Implement equals_ignoring_case() for unicode.
  473. return m_view.visit(
  474. [&](StringView view) {
  475. return other.m_view.visit(
  476. [&](StringView other_view) { return view.equals_ignoring_ascii_case(other_view); },
  477. [](auto&) -> bool { TODO(); });
  478. },
  479. [&](Utf16View view) {
  480. return other.m_view.visit(
  481. [&](Utf16View other_view) { return view.equals_ignoring_case(other_view); },
  482. [](auto&) -> bool { TODO(); });
  483. },
  484. [](auto&) -> bool { TODO(); });
  485. }
  486. bool starts_with(StringView str) const
  487. {
  488. return m_view.visit(
  489. [&](Utf32View) -> bool {
  490. TODO();
  491. },
  492. [&](Utf16View) -> bool {
  493. TODO();
  494. },
  495. [&](Utf8View const& view) { return view.as_string().starts_with(str); },
  496. [&](StringView view) { return view.starts_with(str); });
  497. }
  498. bool starts_with(Utf32View const& str) const
  499. {
  500. return m_view.visit(
  501. [&](Utf32View view) -> bool {
  502. if (str.length() > view.length())
  503. return false;
  504. if (str.length() == view.length())
  505. return operator==(str);
  506. for (size_t i = 0; i < str.length(); ++i) {
  507. if (str.at(i) != view.at(i))
  508. return false;
  509. }
  510. return true;
  511. },
  512. [&](Utf16View) -> bool { TODO(); },
  513. [&](Utf8View const& view) {
  514. auto it = view.begin();
  515. for (auto code_point : str) {
  516. if (it.done())
  517. return false;
  518. if (code_point != *it)
  519. return false;
  520. ++it;
  521. }
  522. return true;
  523. },
  524. [&](StringView) -> bool { TODO(); });
  525. }
  526. private:
  527. Variant<StringView, Utf8View, Utf16View, Utf32View> m_view { StringView {} };
  528. bool m_unicode { false };
  529. };
  530. class Match final {
  531. private:
  532. Optional<DeprecatedFlyString> string;
  533. public:
  534. Match() = default;
  535. ~Match() = default;
  536. Match(RegexStringView view_, size_t const line_, size_t const column_, size_t const global_offset_)
  537. : view(view_)
  538. , line(line_)
  539. , column(column_)
  540. , global_offset(global_offset_)
  541. , left_column(column_)
  542. {
  543. }
  544. Match(ByteString string_, size_t const line_, size_t const column_, size_t const global_offset_)
  545. : string(move(string_))
  546. , view(string.value().view())
  547. , line(line_)
  548. , column(column_)
  549. , global_offset(global_offset_)
  550. {
  551. }
  552. Match(RegexStringView const view_, StringView capture_group_name_, size_t const line_, size_t const column_, size_t const global_offset_)
  553. : view(view_)
  554. , capture_group_name(capture_group_name_)
  555. , line(line_)
  556. , column(column_)
  557. , global_offset(global_offset_)
  558. , left_column(column_)
  559. {
  560. }
  561. void reset()
  562. {
  563. view = view.typed_null_view();
  564. capture_group_name.clear();
  565. line = 0;
  566. column = 0;
  567. global_offset = 0;
  568. left_column = 0;
  569. }
  570. RegexStringView view {};
  571. Optional<DeprecatedFlyString> capture_group_name {};
  572. size_t line { 0 };
  573. size_t column { 0 };
  574. size_t global_offset { 0 };
  575. // ugly, as not usable by user, but needed to prevent to create extra vectors that are
  576. // able to store the column when the left paren has been found
  577. size_t left_column { 0 };
  578. };
  579. struct MatchInput {
  580. RegexStringView view {};
  581. AllOptions regex_options {};
  582. size_t start_offset { 0 }; // For Stateful matches, saved and restored from Regex::start_offset.
  583. size_t match_index { 0 };
  584. size_t line { 0 };
  585. size_t column { 0 };
  586. size_t global_offset { 0 }; // For multiline matching, knowing the offset from start could be important
  587. mutable size_t fail_counter { 0 };
  588. mutable Vector<size_t> saved_positions;
  589. mutable Vector<size_t> saved_code_unit_positions;
  590. mutable Vector<size_t> saved_forks_since_last_save;
  591. mutable Vector<u64, 64> checkpoints;
  592. mutable Optional<size_t> fork_to_replace;
  593. };
  594. struct MatchState {
  595. size_t string_position_before_match { 0 };
  596. size_t string_position { 0 };
  597. size_t string_position_in_code_units { 0 };
  598. size_t instruction_position { 0 };
  599. size_t fork_at_position { 0 };
  600. size_t forks_since_last_save { 0 };
  601. Optional<size_t> initiating_fork;
  602. COWVector<Match> matches;
  603. COWVector<Vector<Match>> capture_group_matches;
  604. COWVector<u64> repetition_marks;
  605. };
  606. }
  607. using regex::RegexStringView;
  608. template<>
  609. struct AK::Formatter<regex::RegexStringView> : Formatter<StringView> {
  610. ErrorOr<void> format(FormatBuilder& builder, regex::RegexStringView value)
  611. {
  612. auto string = value.to_byte_string();
  613. return Formatter<StringView>::format(builder, string);
  614. }
  615. };