RegexMatch.h 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #pragma once
  7. #include "Forward.h"
  8. #include "RegexOptions.h"
  9. #include <AK/Error.h>
  10. #include <AK/ByteString.h>
  11. #include <AK/COWVector.h>
  12. #include <AK/DeprecatedFlyString.h>
  13. #include <AK/HashMap.h>
  14. #include <AK/MemMem.h>
  15. #include <AK/RedBlackTree.h>
  16. #include <AK/StringBuilder.h>
  17. #include <AK/StringView.h>
  18. #include <AK/Utf16View.h>
  19. #include <AK/Utf32View.h>
  20. #include <AK/Utf8View.h>
  21. #include <AK/Variant.h>
  22. #include <AK/Vector.h>
  23. namespace regex {
  24. class RegexStringView {
  25. public:
  26. RegexStringView() = default;
  27. RegexStringView(ByteString const& string)
  28. : m_view(string.view())
  29. {
  30. }
  31. RegexStringView(String const& string)
  32. : m_view(string.bytes_as_string_view())
  33. {
  34. }
  35. RegexStringView(StringView const view)
  36. : m_view(view)
  37. {
  38. }
  39. RegexStringView(Utf32View view)
  40. : m_view(view)
  41. {
  42. }
  43. RegexStringView(Utf16View view)
  44. : m_view(view)
  45. {
  46. }
  47. RegexStringView(Utf8View view)
  48. : m_view(view)
  49. {
  50. }
  51. explicit RegexStringView(ByteString&&) = delete;
  52. bool is_string_view() const
  53. {
  54. return m_view.has<StringView>();
  55. }
  56. StringView string_view() const
  57. {
  58. return m_view.get<StringView>();
  59. }
  60. Utf32View const& u32_view() const
  61. {
  62. return m_view.get<Utf32View>();
  63. }
  64. Utf16View const& u16_view() const
  65. {
  66. return m_view.get<Utf16View>();
  67. }
  68. Utf8View const& u8_view() const
  69. {
  70. return m_view.get<Utf8View>();
  71. }
  72. bool unicode() const { return m_unicode; }
  73. void set_unicode(bool unicode) { m_unicode = unicode; }
  74. bool is_empty() const
  75. {
  76. return m_view.visit([](auto& view) { return view.is_empty(); });
  77. }
  78. bool is_null() const
  79. {
  80. return m_view.visit([](auto& view) { return view.is_null(); });
  81. }
  82. size_t length() const
  83. {
  84. if (unicode()) {
  85. return m_view.visit(
  86. [](Utf16View const& view) { return view.length_in_code_points(); },
  87. [](auto const& view) { return view.length(); });
  88. }
  89. return length_in_code_units();
  90. }
  91. size_t length_in_code_units() const
  92. {
  93. return m_view.visit(
  94. [](Utf16View const& view) { return view.length_in_code_units(); },
  95. [](Utf8View const& view) { return view.byte_length(); },
  96. [](auto const& view) { return view.length(); });
  97. }
  98. size_t length_of_code_point(u32 code_point) const
  99. {
  100. return m_view.visit(
  101. [](Utf32View const&) { return 1; },
  102. [&](Utf16View const&) {
  103. if (code_point < 0x10000)
  104. return 1;
  105. return 2;
  106. },
  107. [&](auto const&) {
  108. if (code_point <= 0x7f)
  109. return 1;
  110. if (code_point <= 0x07ff)
  111. return 2;
  112. if (code_point <= 0xffff)
  113. return 3;
  114. return 4;
  115. });
  116. }
  117. RegexStringView typed_null_view()
  118. {
  119. auto view = m_view.visit(
  120. [&]<typename T>(T const&) {
  121. return RegexStringView { T {} };
  122. });
  123. view.set_unicode(unicode());
  124. return view;
  125. }
  126. RegexStringView construct_as_same(Span<u32> data, Optional<ByteString>& optional_string_storage, Utf16Data& optional_utf16_storage) const
  127. {
  128. auto view = m_view.visit(
  129. [&]<typename T>(T const&) {
  130. StringBuilder builder;
  131. for (auto ch : data)
  132. builder.append(ch); // Note: The type conversion is intentional.
  133. optional_string_storage = builder.to_byte_string();
  134. return RegexStringView { T { *optional_string_storage } };
  135. },
  136. [&](Utf32View) {
  137. return RegexStringView { Utf32View { data.data(), data.size() } };
  138. },
  139. [&](Utf16View) {
  140. optional_utf16_storage = AK::utf32_to_utf16(Utf32View { data.data(), data.size() }).release_value_but_fixme_should_propagate_errors();
  141. return RegexStringView { Utf16View { optional_utf16_storage } };
  142. });
  143. view.set_unicode(unicode());
  144. return view;
  145. }
  146. Vector<RegexStringView> lines() const
  147. {
  148. return m_view.visit(
  149. [](StringView view) {
  150. auto views = view.lines(StringView::ConsiderCarriageReturn::No);
  151. Vector<RegexStringView> new_views;
  152. for (auto& view : views)
  153. new_views.empend(view);
  154. return new_views;
  155. },
  156. [](Utf32View view) {
  157. if (view.is_empty())
  158. return Vector<RegexStringView> { view };
  159. Vector<RegexStringView> views;
  160. u32 newline = '\n';
  161. while (!view.is_empty()) {
  162. auto position = AK::memmem_optional(view.code_points(), view.length() * sizeof(u32), &newline, sizeof(u32));
  163. if (!position.has_value())
  164. break;
  165. auto offset = position.value() / sizeof(u32);
  166. views.empend(view.substring_view(0, offset));
  167. view = view.substring_view(offset + 1, view.length() - offset - 1);
  168. }
  169. if (!view.is_empty())
  170. views.empend(view);
  171. return views;
  172. },
  173. [](Utf16View view) {
  174. if (view.is_empty())
  175. return Vector<RegexStringView> { view };
  176. Vector<RegexStringView> views;
  177. u16 newline = '\n';
  178. while (!view.is_empty()) {
  179. auto position = AK::memmem_optional(view.data(), view.length_in_code_units() * sizeof(u16), &newline, sizeof(u16));
  180. if (!position.has_value())
  181. break;
  182. auto offset = position.value() / sizeof(u16);
  183. views.empend(view.substring_view(0, offset));
  184. view = view.substring_view(offset + 1, view.length_in_code_units() - offset - 1);
  185. }
  186. if (!view.is_empty())
  187. views.empend(view);
  188. return views;
  189. },
  190. [](Utf8View const& view) {
  191. if (view.is_empty())
  192. return Vector<RegexStringView> { view };
  193. Vector<RegexStringView> views;
  194. auto it = view.begin();
  195. auto previous_newline_position_it = it;
  196. for (;;) {
  197. if (*it == '\n') {
  198. auto previous_offset = view.byte_offset_of(previous_newline_position_it);
  199. auto new_offset = view.byte_offset_of(it);
  200. auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
  201. views.empend(slice);
  202. ++it;
  203. previous_newline_position_it = it;
  204. }
  205. if (it.done())
  206. break;
  207. ++it;
  208. }
  209. if (it != previous_newline_position_it) {
  210. auto previous_offset = view.byte_offset_of(previous_newline_position_it);
  211. auto new_offset = view.byte_offset_of(it);
  212. auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
  213. views.empend(slice);
  214. }
  215. return views;
  216. });
  217. }
  218. RegexStringView substring_view(size_t offset, size_t length) const
  219. {
  220. if (unicode()) {
  221. auto view = m_view.visit(
  222. [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; },
  223. [&](Utf16View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; },
  224. [&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; });
  225. view.set_unicode(unicode());
  226. return view;
  227. }
  228. auto view = m_view.visit([&](auto view) { return RegexStringView { view.substring_view(offset, length) }; });
  229. view.set_unicode(unicode());
  230. return view;
  231. }
  232. ByteString to_byte_string() const
  233. {
  234. return m_view.visit(
  235. [](StringView view) { return view.to_byte_string(); },
  236. [](Utf16View view) { return view.to_byte_string(Utf16View::AllowInvalidCodeUnits::Yes).release_value_but_fixme_should_propagate_errors(); },
  237. [](auto& view) {
  238. StringBuilder builder;
  239. for (auto it = view.begin(); it != view.end(); ++it)
  240. builder.append_code_point(*it);
  241. return builder.to_byte_string();
  242. });
  243. }
  244. ErrorOr<String> to_string() const
  245. {
  246. return m_view.visit(
  247. [](StringView view) { return String::from_utf8(view); },
  248. [](Utf16View view) { return view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes); },
  249. [](auto& view) -> ErrorOr<String> {
  250. StringBuilder builder;
  251. for (auto it = view.begin(); it != view.end(); ++it)
  252. TRY(builder.try_append_code_point(*it));
  253. return builder.to_string();
  254. });
  255. }
  256. // Note: index must always be the code unit offset to return.
  257. u32 operator[](size_t index) const
  258. {
  259. return m_view.visit(
  260. [&](StringView view) -> u32 {
  261. auto ch = view[index];
  262. if constexpr (IsSigned<char>) {
  263. if (ch < 0)
  264. return 256u + ch;
  265. return ch;
  266. }
  267. },
  268. [&](Utf32View const& view) -> u32 { return view[index]; },
  269. [&](Utf16View const& view) -> u32 { return view.code_point_at(index); },
  270. [&](Utf8View const& view) -> u32 {
  271. auto it = view.iterator_at_byte_offset(index);
  272. VERIFY(it != view.end());
  273. return *it;
  274. });
  275. }
  276. u32 code_unit_at(size_t code_unit_index) const
  277. {
  278. if (unicode())
  279. return operator[](code_unit_index);
  280. return m_view.visit(
  281. [&](StringView view) -> u32 {
  282. auto ch = view[code_unit_index];
  283. if constexpr (IsSigned<char>) {
  284. if (ch < 0)
  285. return 256u + ch;
  286. return ch;
  287. }
  288. },
  289. [&](Utf32View const& view) -> u32 { return view[code_unit_index]; },
  290. [&](Utf16View const& view) -> u32 { return view.code_unit_at(code_unit_index); },
  291. [&](Utf8View const& view) -> u32 {
  292. auto it = view.iterator_at_byte_offset(code_unit_index);
  293. VERIFY(it != view.end());
  294. return *it;
  295. });
  296. }
  297. size_t code_unit_offset_of(size_t code_point_index) const
  298. {
  299. return m_view.visit(
  300. [&](StringView view) -> u32 {
  301. Utf8View utf8_view { view };
  302. return utf8_view.byte_offset_of(code_point_index);
  303. },
  304. [&](Utf32View const&) -> u32 { return code_point_index; },
  305. [&](Utf16View const& view) -> u32 {
  306. return view.code_unit_offset_of(code_point_index);
  307. },
  308. [&](Utf8View const& view) -> u32 {
  309. return view.byte_offset_of(code_point_index);
  310. });
  311. }
  312. bool operator==(char const* cstring) const
  313. {
  314. return m_view.visit(
  315. [&](Utf32View) { return to_byte_string() == cstring; },
  316. [&](Utf16View) { return to_byte_string() == cstring; },
  317. [&](Utf8View const& view) { return view.as_string() == cstring; },
  318. [&](StringView view) { return view == cstring; });
  319. }
  320. bool operator==(ByteString const& string) const
  321. {
  322. return m_view.visit(
  323. [&](Utf32View) { return to_byte_string() == string; },
  324. [&](Utf16View) { return to_byte_string() == string; },
  325. [&](Utf8View const& view) { return view.as_string() == string; },
  326. [&](StringView view) { return view == string; });
  327. }
  328. bool operator==(StringView string) const
  329. {
  330. return m_view.visit(
  331. [&](Utf32View) { return to_byte_string() == string; },
  332. [&](Utf16View) { return to_byte_string() == string; },
  333. [&](Utf8View const& view) { return view.as_string() == string; },
  334. [&](StringView view) { return view == string; });
  335. }
  336. bool operator==(Utf32View const& other) const
  337. {
  338. return m_view.visit(
  339. [&](Utf32View view) {
  340. return view.length() == other.length() && __builtin_memcmp(view.code_points(), other.code_points(), view.length() * sizeof(u32)) == 0;
  341. },
  342. [&](Utf16View) { return to_byte_string() == RegexStringView { other }.to_byte_string(); },
  343. [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_byte_string(); },
  344. [&](StringView view) { return view == RegexStringView { other }.to_byte_string(); });
  345. }
  346. bool operator==(Utf16View const& other) const
  347. {
  348. return m_view.visit(
  349. [&](Utf32View) { return to_byte_string() == RegexStringView { other }.to_byte_string(); },
  350. [&](Utf16View const& view) { return view == other; },
  351. [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_byte_string(); },
  352. [&](StringView view) { return view == RegexStringView { other }.to_byte_string(); });
  353. }
  354. bool operator==(Utf8View const& other) const
  355. {
  356. return m_view.visit(
  357. [&](Utf32View) { return to_byte_string() == other.as_string(); },
  358. [&](Utf16View) { return to_byte_string() == other.as_string(); },
  359. [&](Utf8View const& view) { return view.as_string() == other.as_string(); },
  360. [&](StringView view) { return other.as_string() == view; });
  361. }
  362. bool equals(RegexStringView other) const
  363. {
  364. return other.m_view.visit([this](auto const& view) { return operator==(view); });
  365. }
  366. bool equals_ignoring_case(RegexStringView other) const
  367. {
  368. // FIXME: Implement equals_ignoring_case() for unicode.
  369. return m_view.visit(
  370. [&](StringView view) {
  371. return other.m_view.visit(
  372. [&](StringView other_view) { return view.equals_ignoring_ascii_case(other_view); },
  373. [](auto&) -> bool { TODO(); });
  374. },
  375. [&](Utf16View view) {
  376. return other.m_view.visit(
  377. [&](Utf16View other_view) { return view.equals_ignoring_case(other_view); },
  378. [](auto&) -> bool { TODO(); });
  379. },
  380. [](auto&) -> bool { TODO(); });
  381. }
  382. bool starts_with(StringView str) const
  383. {
  384. return m_view.visit(
  385. [&](Utf32View) -> bool {
  386. TODO();
  387. },
  388. [&](Utf16View) -> bool {
  389. TODO();
  390. },
  391. [&](Utf8View const& view) { return view.as_string().starts_with(str); },
  392. [&](StringView view) { return view.starts_with(str); });
  393. }
  394. bool starts_with(Utf32View const& str) const
  395. {
  396. return m_view.visit(
  397. [&](Utf32View view) -> bool {
  398. if (str.length() > view.length())
  399. return false;
  400. if (str.length() == view.length())
  401. return operator==(str);
  402. for (size_t i = 0; i < str.length(); ++i) {
  403. if (str.at(i) != view.at(i))
  404. return false;
  405. }
  406. return true;
  407. },
  408. [&](Utf16View) -> bool { TODO(); },
  409. [&](Utf8View const& view) {
  410. auto it = view.begin();
  411. for (auto code_point : str) {
  412. if (it.done())
  413. return false;
  414. if (code_point != *it)
  415. return false;
  416. ++it;
  417. }
  418. return true;
  419. },
  420. [&](StringView) -> bool { TODO(); });
  421. }
  422. private:
  423. Variant<StringView, Utf8View, Utf16View, Utf32View> m_view { StringView {} };
  424. bool m_unicode { false };
  425. };
  426. class Match final {
  427. private:
  428. Optional<DeprecatedFlyString> string;
  429. public:
  430. Match() = default;
  431. ~Match() = default;
  432. Match(RegexStringView view_, size_t const line_, size_t const column_, size_t const global_offset_)
  433. : view(view_)
  434. , line(line_)
  435. , column(column_)
  436. , global_offset(global_offset_)
  437. , left_column(column_)
  438. {
  439. }
  440. Match(ByteString string_, size_t const line_, size_t const column_, size_t const global_offset_)
  441. : string(move(string_))
  442. , view(string.value().view())
  443. , line(line_)
  444. , column(column_)
  445. , global_offset(global_offset_)
  446. {
  447. }
  448. Match(RegexStringView const view_, StringView capture_group_name_, size_t const line_, size_t const column_, size_t const global_offset_)
  449. : view(view_)
  450. , capture_group_name(capture_group_name_)
  451. , line(line_)
  452. , column(column_)
  453. , global_offset(global_offset_)
  454. , left_column(column_)
  455. {
  456. }
  457. void reset()
  458. {
  459. view = view.typed_null_view();
  460. capture_group_name.clear();
  461. line = 0;
  462. column = 0;
  463. global_offset = 0;
  464. left_column = 0;
  465. }
  466. RegexStringView view {};
  467. Optional<DeprecatedFlyString> capture_group_name {};
  468. size_t line { 0 };
  469. size_t column { 0 };
  470. size_t global_offset { 0 };
  471. // ugly, as not usable by user, but needed to prevent to create extra vectors that are
  472. // able to store the column when the left paren has been found
  473. size_t left_column { 0 };
  474. };
  475. struct MatchInput {
  476. RegexStringView view {};
  477. AllOptions regex_options {};
  478. size_t start_offset { 0 }; // For Stateful matches, saved and restored from Regex::start_offset.
  479. size_t match_index { 0 };
  480. size_t line { 0 };
  481. size_t column { 0 };
  482. size_t global_offset { 0 }; // For multiline matching, knowing the offset from start could be important
  483. mutable size_t fail_counter { 0 };
  484. mutable Vector<size_t> saved_positions;
  485. mutable Vector<size_t> saved_code_unit_positions;
  486. mutable Vector<size_t> saved_forks_since_last_save;
  487. mutable Vector<u64, 64> checkpoints;
  488. mutable Optional<size_t> fork_to_replace;
  489. };
  490. struct MatchState {
  491. size_t string_position_before_match { 0 };
  492. size_t string_position { 0 };
  493. size_t string_position_in_code_units { 0 };
  494. size_t instruction_position { 0 };
  495. size_t fork_at_position { 0 };
  496. size_t forks_since_last_save { 0 };
  497. Optional<size_t> initiating_fork;
  498. COWVector<Match> matches;
  499. COWVector<Vector<Match>> capture_group_matches;
  500. COWVector<u64> repetition_marks;
  501. };
  502. }
  503. using regex::RegexStringView;
  504. template<>
  505. struct AK::Formatter<regex::RegexStringView> : Formatter<StringView> {
  506. ErrorOr<void> format(FormatBuilder& builder, regex::RegexStringView value)
  507. {
  508. auto string = value.to_byte_string();
  509. return Formatter<StringView>::format(builder, string);
  510. }
  511. };