RegexMatch.h 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #pragma once
  7. #include "Forward.h"
  8. #include "RegexOptions.h"
  9. #include <AK/FlyString.h>
  10. #include <AK/HashMap.h>
  11. #include <AK/MemMem.h>
  12. #include <AK/String.h>
  13. #include <AK/StringBuilder.h>
  14. #include <AK/StringView.h>
  15. #include <AK/Utf16View.h>
  16. #include <AK/Utf32View.h>
  17. #include <AK/Utf8View.h>
  18. #include <AK/Variant.h>
  19. #include <AK/Vector.h>
  20. namespace regex {
  21. class RegexStringView {
  22. public:
  23. RegexStringView() = default;
  24. RegexStringView(String const& string)
  25. : m_view(string.view())
  26. {
  27. }
  28. RegexStringView(StringView const view)
  29. : m_view(view)
  30. {
  31. }
  32. RegexStringView(Utf32View view)
  33. : m_view(view)
  34. {
  35. }
  36. RegexStringView(Utf16View view)
  37. : m_view(view)
  38. {
  39. }
  40. RegexStringView(Utf8View view)
  41. : m_view(view)
  42. {
  43. }
  44. explicit RegexStringView(String&&) = delete;
  45. StringView string_view() const
  46. {
  47. return m_view.get<StringView>();
  48. }
  49. Utf32View const& u32_view() const
  50. {
  51. return m_view.get<Utf32View>();
  52. }
  53. Utf16View const& u16_view() const
  54. {
  55. return m_view.get<Utf16View>();
  56. }
  57. Utf8View const& u8_view() const
  58. {
  59. return m_view.get<Utf8View>();
  60. }
  61. bool unicode() const { return m_unicode; }
  62. void set_unicode(bool unicode) { m_unicode = unicode; }
  63. bool is_empty() const
  64. {
  65. return m_view.visit([](auto& view) { return view.is_empty(); });
  66. }
  67. bool is_null() const
  68. {
  69. return m_view.visit([](auto& view) { return view.is_null(); });
  70. }
  71. size_t length() const
  72. {
  73. if (unicode()) {
  74. return m_view.visit(
  75. [](Utf16View const& view) { return view.length_in_code_points(); },
  76. [](auto const& view) { return view.length(); });
  77. }
  78. return length_in_code_units();
  79. }
  80. size_t length_in_code_units() const
  81. {
  82. return m_view.visit(
  83. [](Utf16View const& view) { return view.length_in_code_units(); },
  84. [](Utf8View const& view) { return view.byte_length(); },
  85. [](auto const& view) { return view.length(); });
  86. }
  87. size_t length_of_code_point(u32 code_point) const
  88. {
  89. return m_view.visit(
  90. [](Utf32View const&) { return 1; },
  91. [&](Utf16View const&) {
  92. if (code_point < 0x10000)
  93. return 1;
  94. return 2;
  95. },
  96. [&](auto const&) {
  97. if (code_point <= 0x7f)
  98. return 1;
  99. if (code_point <= 0x07ff)
  100. return 2;
  101. if (code_point <= 0xffff)
  102. return 3;
  103. return 4;
  104. });
  105. }
  106. RegexStringView typed_null_view()
  107. {
  108. auto view = m_view.visit(
  109. [&]<typename T>(T const&) {
  110. return RegexStringView { T {} };
  111. });
  112. view.set_unicode(unicode());
  113. return view;
  114. }
  115. RegexStringView construct_as_same(Span<u32> data, Optional<String>& optional_string_storage, Vector<u16, 1>& optional_utf16_storage) const
  116. {
  117. auto view = m_view.visit(
  118. [&]<typename T>(T const&) {
  119. StringBuilder builder;
  120. for (auto ch : data)
  121. builder.append(ch); // Note: The type conversion is intentional.
  122. optional_string_storage = builder.build();
  123. return RegexStringView { T { *optional_string_storage } };
  124. },
  125. [&](Utf32View) {
  126. return RegexStringView { Utf32View { data.data(), data.size() } };
  127. },
  128. [&](Utf16View) {
  129. optional_utf16_storage = AK::utf32_to_utf16(Utf32View { data.data(), data.size() });
  130. return RegexStringView { Utf16View { optional_utf16_storage } };
  131. });
  132. view.set_unicode(unicode());
  133. return view;
  134. }
  135. Vector<RegexStringView> lines() const
  136. {
  137. return m_view.visit(
  138. [](StringView view) {
  139. auto views = view.lines(false);
  140. Vector<RegexStringView> new_views;
  141. for (auto& view : views)
  142. new_views.empend(view);
  143. return new_views;
  144. },
  145. [](Utf32View view) {
  146. if (view.is_empty())
  147. return Vector<RegexStringView> { view };
  148. Vector<RegexStringView> views;
  149. u32 newline = '\n';
  150. while (!view.is_empty()) {
  151. auto position = AK::memmem_optional(view.code_points(), view.length() * sizeof(u32), &newline, sizeof(u32));
  152. if (!position.has_value())
  153. break;
  154. auto offset = position.value() / sizeof(u32);
  155. views.empend(view.substring_view(0, offset));
  156. view = view.substring_view(offset + 1, view.length() - offset - 1);
  157. }
  158. if (!view.is_empty())
  159. views.empend(view);
  160. return views;
  161. },
  162. [](Utf16View view) {
  163. if (view.is_empty())
  164. return Vector<RegexStringView> { view };
  165. Vector<RegexStringView> views;
  166. u16 newline = '\n';
  167. while (!view.is_empty()) {
  168. auto position = AK::memmem_optional(view.data(), view.length_in_code_units() * sizeof(u16), &newline, sizeof(u16));
  169. if (!position.has_value())
  170. break;
  171. auto offset = position.value() / sizeof(u16);
  172. views.empend(view.substring_view(0, offset));
  173. view = view.substring_view(offset + 1, view.length_in_code_units() - offset - 1);
  174. }
  175. if (!view.is_empty())
  176. views.empend(view);
  177. return views;
  178. },
  179. [](Utf8View const& view) {
  180. if (view.is_empty())
  181. return Vector<RegexStringView> { view };
  182. Vector<RegexStringView> views;
  183. auto it = view.begin();
  184. auto previous_newline_position_it = it;
  185. for (;;) {
  186. if (*it == '\n') {
  187. auto previous_offset = view.byte_offset_of(previous_newline_position_it);
  188. auto new_offset = view.byte_offset_of(it);
  189. auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
  190. views.empend(slice);
  191. ++it;
  192. previous_newline_position_it = it;
  193. }
  194. if (it.done())
  195. break;
  196. ++it;
  197. }
  198. if (it != previous_newline_position_it) {
  199. auto previous_offset = view.byte_offset_of(previous_newline_position_it);
  200. auto new_offset = view.byte_offset_of(it);
  201. auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
  202. views.empend(slice);
  203. }
  204. return views;
  205. });
  206. }
  207. RegexStringView substring_view(size_t offset, size_t length) const
  208. {
  209. if (unicode()) {
  210. auto view = m_view.visit(
  211. [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; },
  212. [&](Utf16View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; },
  213. [&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; });
  214. view.set_unicode(unicode());
  215. return view;
  216. }
  217. auto view = m_view.visit([&](auto view) { return RegexStringView { view.substring_view(offset, length) }; });
  218. view.set_unicode(unicode());
  219. return view;
  220. }
  221. String to_string() const
  222. {
  223. return m_view.visit(
  224. [](StringView view) { return view.to_string(); },
  225. [](Utf16View view) { return view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes); },
  226. [](auto& view) {
  227. StringBuilder builder;
  228. for (auto it = view.begin(); it != view.end(); ++it)
  229. builder.append_code_point(*it);
  230. return builder.to_string();
  231. });
  232. }
  233. // Note: index must always be the code unit offset to return.
  234. u32 operator[](size_t index) const
  235. {
  236. return m_view.visit(
  237. [&](StringView view) -> u32 {
  238. auto ch = view[index];
  239. if constexpr (IsSigned<char>) {
  240. if (ch < 0)
  241. return 256u + ch;
  242. return ch;
  243. }
  244. },
  245. [&](Utf32View const& view) -> u32 { return view[index]; },
  246. [&](Utf16View const& view) -> u32 { return view.code_point_at(index); },
  247. [&](Utf8View const& view) -> u32 {
  248. auto it = view.iterator_at_byte_offset(index);
  249. VERIFY(it != view.end());
  250. return *it;
  251. });
  252. }
  253. size_t code_unit_offset_of(size_t code_point_index) const
  254. {
  255. return m_view.visit(
  256. [&](StringView view) -> u32 {
  257. Utf8View utf8_view { view };
  258. return utf8_view.byte_offset_of(code_point_index);
  259. },
  260. [&](Utf32View const&) -> u32 { return code_point_index; },
  261. [&](Utf16View const& view) -> u32 {
  262. return view.code_unit_offset_of(code_point_index);
  263. },
  264. [&](Utf8View const& view) -> u32 {
  265. return view.byte_offset_of(code_point_index);
  266. });
  267. }
  268. bool operator==(char const* cstring) const
  269. {
  270. return m_view.visit(
  271. [&](Utf32View) { return to_string() == cstring; },
  272. [&](Utf16View) { return to_string() == cstring; },
  273. [&](Utf8View const& view) { return view.as_string() == cstring; },
  274. [&](StringView view) { return view == cstring; });
  275. }
  276. bool operator!=(char const* cstring) const
  277. {
  278. return !(*this == cstring);
  279. }
  280. bool operator==(String const& string) const
  281. {
  282. return m_view.visit(
  283. [&](Utf32View) { return to_string() == string; },
  284. [&](Utf16View) { return to_string() == string; },
  285. [&](Utf8View const& view) { return view.as_string() == string; },
  286. [&](StringView view) { return view == string; });
  287. }
  288. bool operator==(StringView string) const
  289. {
  290. return m_view.visit(
  291. [&](Utf32View) { return to_string() == string; },
  292. [&](Utf16View) { return to_string() == string; },
  293. [&](Utf8View const& view) { return view.as_string() == string; },
  294. [&](StringView view) { return view == string; });
  295. }
  296. bool operator!=(StringView other) const
  297. {
  298. return !(*this == other);
  299. }
  300. bool operator==(Utf32View const& other) const
  301. {
  302. return m_view.visit(
  303. [&](Utf32View view) {
  304. return view.length() == other.length() && __builtin_memcmp(view.code_points(), other.code_points(), view.length() * sizeof(u32)) == 0;
  305. },
  306. [&](Utf16View) { return to_string() == RegexStringView { other }.to_string(); },
  307. [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); },
  308. [&](StringView view) { return view == RegexStringView { other }.to_string(); });
  309. }
  310. bool operator!=(Utf32View const& other) const
  311. {
  312. return !(*this == other);
  313. }
  314. bool operator==(Utf16View const& other) const
  315. {
  316. return m_view.visit(
  317. [&](Utf32View) { return to_string() == RegexStringView { other }.to_string(); },
  318. [&](Utf16View const& view) { return view == other; },
  319. [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); },
  320. [&](StringView view) { return view == RegexStringView { other }.to_string(); });
  321. }
  322. bool operator!=(Utf16View const& other) const
  323. {
  324. return !(*this == other);
  325. }
  326. bool operator==(Utf8View const& other) const
  327. {
  328. return m_view.visit(
  329. [&](Utf32View) { return to_string() == other.as_string(); },
  330. [&](Utf16View) { return to_string() == other.as_string(); },
  331. [&](Utf8View const& view) { return view.as_string() == other.as_string(); },
  332. [&](StringView view) { return other.as_string() == view; });
  333. }
  334. bool operator!=(Utf8View const& other) const
  335. {
  336. return !(*this == other);
  337. }
  338. bool equals(RegexStringView other) const
  339. {
  340. return other.m_view.visit([this](auto const& view) { return operator==(view); });
  341. }
  342. bool equals_ignoring_case(RegexStringView other) const
  343. {
  344. // FIXME: Implement equals_ignoring_case() for unicode.
  345. return m_view.visit(
  346. [&](StringView view) {
  347. return other.m_view.visit(
  348. [&](StringView other_view) { return view.equals_ignoring_case(other_view); },
  349. [](auto&) -> bool { TODO(); });
  350. },
  351. [&](Utf16View view) {
  352. return other.m_view.visit(
  353. [&](Utf16View other_view) { return view.equals_ignoring_case(other_view); },
  354. [](auto&) -> bool { TODO(); });
  355. },
  356. [](auto&) -> bool { TODO(); });
  357. }
  358. bool starts_with(StringView str) const
  359. {
  360. return m_view.visit(
  361. [&](Utf32View) -> bool {
  362. TODO();
  363. },
  364. [&](Utf16View) -> bool {
  365. TODO();
  366. },
  367. [&](Utf8View const& view) { return view.as_string().starts_with(str); },
  368. [&](StringView view) { return view.starts_with(str); });
  369. }
  370. bool starts_with(Utf32View const& str) const
  371. {
  372. return m_view.visit(
  373. [&](Utf32View view) -> bool {
  374. if (str.length() > view.length())
  375. return false;
  376. if (str.length() == view.length())
  377. return operator==(str);
  378. for (size_t i = 0; i < str.length(); ++i) {
  379. if (str.at(i) != view.at(i))
  380. return false;
  381. }
  382. return true;
  383. },
  384. [&](Utf16View) -> bool { TODO(); },
  385. [&](Utf8View const& view) {
  386. auto it = view.begin();
  387. for (auto code_point : str) {
  388. if (it.done())
  389. return false;
  390. if (code_point != *it)
  391. return false;
  392. ++it;
  393. }
  394. return true;
  395. },
  396. [&](StringView) -> bool { TODO(); });
  397. }
  398. private:
  399. Variant<StringView, Utf8View, Utf16View, Utf32View> m_view { StringView {} };
  400. bool m_unicode { false };
  401. };
  402. class Match final {
  403. private:
  404. Optional<FlyString> string;
  405. public:
  406. Match() = default;
  407. ~Match() = default;
  408. Match(RegexStringView view_, size_t const line_, size_t const column_, size_t const global_offset_)
  409. : view(view_)
  410. , line(line_)
  411. , column(column_)
  412. , global_offset(global_offset_)
  413. , left_column(column_)
  414. {
  415. }
  416. Match(String string_, size_t const line_, size_t const column_, size_t const global_offset_)
  417. : string(move(string_))
  418. , view(string.value().view())
  419. , line(line_)
  420. , column(column_)
  421. , global_offset(global_offset_)
  422. {
  423. }
  424. Match(RegexStringView const view_, StringView capture_group_name_, size_t const line_, size_t const column_, size_t const global_offset_)
  425. : view(view_)
  426. , capture_group_name(capture_group_name_)
  427. , line(line_)
  428. , column(column_)
  429. , global_offset(global_offset_)
  430. , left_column(column_)
  431. {
  432. }
  433. void reset()
  434. {
  435. view = view.typed_null_view();
  436. capture_group_name.clear();
  437. line = 0;
  438. column = 0;
  439. global_offset = 0;
  440. left_column = 0;
  441. }
  442. RegexStringView view {};
  443. Optional<FlyString> capture_group_name {};
  444. size_t line { 0 };
  445. size_t column { 0 };
  446. size_t global_offset { 0 };
  447. // ugly, as not usable by user, but needed to prevent to create extra vectors that are
  448. // able to store the column when the left paren has been found
  449. size_t left_column { 0 };
  450. };
  451. struct MatchInput {
  452. RegexStringView view {};
  453. AllOptions regex_options {};
  454. size_t start_offset { 0 }; // For Stateful matches, saved and restored from Regex::start_offset.
  455. size_t match_index { 0 };
  456. size_t line { 0 };
  457. size_t column { 0 };
  458. size_t global_offset { 0 }; // For multiline matching, knowing the offset from start could be important
  459. mutable size_t fail_counter { 0 };
  460. mutable Vector<size_t> saved_positions;
  461. mutable Vector<size_t> saved_code_unit_positions;
  462. mutable Vector<size_t> saved_forks_since_last_save;
  463. mutable HashMap<u64, u64> checkpoints;
  464. mutable Optional<size_t> fork_to_replace;
  465. };
  466. struct MatchState {
  467. size_t string_position_before_match { 0 };
  468. size_t string_position { 0 };
  469. size_t string_position_in_code_units { 0 };
  470. size_t instruction_position { 0 };
  471. size_t fork_at_position { 0 };
  472. size_t forks_since_last_save { 0 };
  473. Optional<size_t> initiating_fork;
  474. Vector<Match> matches;
  475. Vector<Vector<Match>> capture_group_matches;
  476. Vector<u64> repetition_marks;
  477. };
  478. }
  479. using regex::RegexStringView;
  480. template<>
  481. struct AK::Formatter<regex::RegexStringView> : Formatter<StringView> {
  482. ErrorOr<void> format(FormatBuilder& builder, regex::RegexStringView value)
  483. {
  484. auto string = value.to_string();
  485. return Formatter<StringView>::format(builder, string);
  486. }
  487. };