RegexMatch.h 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #pragma once
  7. #include "RegexOptions.h"
  8. #include <AK/FlyString.h>
  9. #include <AK/HashMap.h>
  10. #include <AK/MemMem.h>
  11. #include <AK/String.h>
  12. #include <AK/StringBuilder.h>
  13. #include <AK/StringView.h>
  14. #include <AK/Utf16View.h>
  15. #include <AK/Utf32View.h>
  16. #include <AK/Utf8View.h>
  17. #include <AK/Variant.h>
  18. #include <AK/Vector.h>
  19. namespace regex {
  20. class RegexStringView {
  21. public:
  22. RegexStringView(char const* chars)
  23. : m_view(StringView { chars })
  24. {
  25. }
  26. RegexStringView(String const& string)
  27. : m_view(string.view())
  28. {
  29. }
  30. RegexStringView(StringView const view)
  31. : m_view(view)
  32. {
  33. }
  34. RegexStringView(Utf32View view)
  35. : m_view(view)
  36. {
  37. }
  38. RegexStringView(Utf16View view)
  39. : m_view(view)
  40. {
  41. }
  42. RegexStringView(Utf8View view)
  43. : m_view(view)
  44. {
  45. }
  46. explicit RegexStringView(String&&) = delete;
  47. StringView const& string_view() const
  48. {
  49. return m_view.get<StringView>();
  50. }
  51. Utf32View const& u32_view() const
  52. {
  53. return m_view.get<Utf32View>();
  54. }
  55. Utf16View const& u16_view() const
  56. {
  57. return m_view.get<Utf16View>();
  58. }
  59. Utf8View const& u8_view() const
  60. {
  61. return m_view.get<Utf8View>();
  62. }
  63. bool unicode() const { return m_unicode; }
  64. void set_unicode(bool unicode) { m_unicode = unicode; }
  65. bool is_empty() const
  66. {
  67. return m_view.visit([](auto& view) { return view.is_empty(); });
  68. }
  69. bool is_null() const
  70. {
  71. return m_view.visit([](auto& view) { return view.is_null(); });
  72. }
  73. size_t length() const
  74. {
  75. if (unicode()) {
  76. return m_view.visit(
  77. [](Utf16View const& view) { return view.length_in_code_points(); },
  78. [](auto const& view) { return view.length(); });
  79. }
  80. return length_in_code_units();
  81. }
  82. size_t length_in_code_units() const
  83. {
  84. return m_view.visit(
  85. [](Utf16View const& view) { return view.length_in_code_units(); },
  86. [](Utf8View const& view) { return view.byte_length(); },
  87. [](auto const& view) { return view.length(); });
  88. }
  89. size_t length_of_code_point(u32 code_point) const
  90. {
  91. return m_view.visit(
  92. [](Utf32View const&) { return 1; },
  93. [&](Utf16View const&) {
  94. if (code_point < 0x10000)
  95. return 1;
  96. return 2;
  97. },
  98. [&](auto const&) {
  99. if (code_point <= 0x7f)
  100. return 1;
  101. else if (code_point <= 0x07ff)
  102. return 2;
  103. else if (code_point <= 0xffff)
  104. return 3;
  105. return 4;
  106. });
  107. }
  108. RegexStringView typed_null_view()
  109. {
  110. auto view = m_view.visit(
  111. [&]<typename T>(T const&) {
  112. return RegexStringView { T {} };
  113. });
  114. view.set_unicode(unicode());
  115. return view;
  116. }
  117. RegexStringView construct_as_same(Span<u32> data, Optional<String>& optional_string_storage, Vector<u16>& optional_utf16_storage) const
  118. {
  119. auto view = m_view.visit(
  120. [&]<typename T>(T const&) {
  121. StringBuilder builder;
  122. for (auto ch : data)
  123. builder.append(ch); // Note: The type conversion is intentional.
  124. optional_string_storage = builder.build();
  125. return RegexStringView { T { *optional_string_storage } };
  126. },
  127. [&](Utf32View) {
  128. return RegexStringView { Utf32View { data.data(), data.size() } };
  129. },
  130. [&](Utf16View) {
  131. optional_utf16_storage = AK::utf32_to_utf16(Utf32View { data.data(), data.size() });
  132. return RegexStringView { Utf16View { optional_utf16_storage } };
  133. });
  134. view.set_unicode(unicode());
  135. return view;
  136. }
  137. Vector<RegexStringView> lines() const
  138. {
  139. return m_view.visit(
  140. [](StringView view) {
  141. auto views = view.lines(false);
  142. Vector<RegexStringView> new_views;
  143. for (auto& view : views)
  144. new_views.empend(view);
  145. return new_views;
  146. },
  147. [](Utf32View view) {
  148. Vector<RegexStringView> views;
  149. u32 newline = '\n';
  150. while (!view.is_empty()) {
  151. auto position = AK::memmem_optional(view.code_points(), view.length() * sizeof(u32), &newline, sizeof(u32));
  152. if (!position.has_value())
  153. break;
  154. auto offset = position.value() / sizeof(u32);
  155. views.empend(view.substring_view(0, offset));
  156. view = view.substring_view(offset + 1, view.length() - offset - 1);
  157. }
  158. if (!view.is_empty())
  159. views.empend(view);
  160. return views;
  161. },
  162. [](Utf16View view) {
  163. Vector<RegexStringView> views;
  164. u16 newline = '\n';
  165. while (!view.is_empty()) {
  166. auto position = AK::memmem_optional(view.data(), view.length_in_code_units() * sizeof(u16), &newline, sizeof(u16));
  167. if (!position.has_value())
  168. break;
  169. auto offset = position.value() / sizeof(u16);
  170. views.empend(view.substring_view(0, offset));
  171. view = view.substring_view(offset + 1, view.length_in_code_units() - offset - 1);
  172. }
  173. if (!view.is_empty())
  174. views.empend(view);
  175. return views;
  176. },
  177. [](Utf8View& view) {
  178. Vector<RegexStringView> views;
  179. auto it = view.begin();
  180. auto previous_newline_position_it = it;
  181. for (;;) {
  182. if (*it == '\n') {
  183. auto previous_offset = view.byte_offset_of(previous_newline_position_it);
  184. auto new_offset = view.byte_offset_of(it);
  185. auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
  186. views.empend(slice);
  187. ++it;
  188. previous_newline_position_it = it;
  189. }
  190. if (it.done())
  191. break;
  192. ++it;
  193. }
  194. if (it != previous_newline_position_it) {
  195. auto previous_offset = view.byte_offset_of(previous_newline_position_it);
  196. auto new_offset = view.byte_offset_of(it);
  197. auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
  198. views.empend(slice);
  199. }
  200. return views;
  201. });
  202. }
  203. RegexStringView substring_view(size_t offset, size_t length) const
  204. {
  205. if (unicode()) {
  206. auto view = m_view.visit(
  207. [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; },
  208. [&](Utf16View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; },
  209. [&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; });
  210. view.set_unicode(unicode());
  211. return view;
  212. }
  213. auto view = m_view.visit([&](auto view) { return RegexStringView { view.substring_view(offset, length) }; });
  214. view.set_unicode(unicode());
  215. return view;
  216. }
  217. String to_string() const
  218. {
  219. return m_view.visit(
  220. [](StringView view) { return view.to_string(); },
  221. [](Utf16View view) { return view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes); },
  222. [](auto& view) {
  223. StringBuilder builder;
  224. for (auto it = view.begin(); it != view.end(); ++it)
  225. builder.append_code_point(*it);
  226. return builder.to_string();
  227. });
  228. }
  229. // Note: index must always be the code unit offset to return.
  230. u32 operator[](size_t index) const
  231. {
  232. return m_view.visit(
  233. [&](StringView view) -> u32 {
  234. auto ch = view[index];
  235. if (ch < 0)
  236. return 256u + ch;
  237. return ch;
  238. },
  239. [&](Utf32View const& view) -> u32 { return view[index]; },
  240. [&](Utf16View const& view) -> u32 { return view.code_point_at(index); },
  241. [&](Utf8View const& view) -> u32 {
  242. auto it = view.iterator_at_byte_offset(index);
  243. VERIFY(it != view.end());
  244. return *it;
  245. });
  246. }
  247. size_t code_unit_offset_of(size_t code_point_index) const
  248. {
  249. return m_view.visit(
  250. [&](StringView const& view) -> u32 {
  251. Utf8View utf8_view { view };
  252. return utf8_view.byte_offset_of(code_point_index);
  253. },
  254. [&](Utf32View const&) -> u32 { return code_point_index; },
  255. [&](Utf16View const& view) -> u32 {
  256. return view.code_unit_offset_of(code_point_index);
  257. },
  258. [&](Utf8View const& view) -> u32 {
  259. return view.byte_offset_of(code_point_index);
  260. });
  261. }
  262. bool operator==(char const* cstring) const
  263. {
  264. return m_view.visit(
  265. [&](Utf32View) { return to_string() == cstring; },
  266. [&](Utf16View) { return to_string() == cstring; },
  267. [&](Utf8View const& view) { return view.as_string() == cstring; },
  268. [&](StringView view) { return view == cstring; });
  269. }
  270. bool operator!=(char const* cstring) const
  271. {
  272. return !(*this == cstring);
  273. }
  274. bool operator==(String const& string) const
  275. {
  276. return m_view.visit(
  277. [&](Utf32View) { return to_string() == string; },
  278. [&](Utf16View) { return to_string() == string; },
  279. [&](Utf8View const& view) { return view.as_string() == string; },
  280. [&](StringView view) { return view == string; });
  281. }
  282. bool operator==(StringView const& string) const
  283. {
  284. return m_view.visit(
  285. [&](Utf32View) { return to_string() == string; },
  286. [&](Utf16View) { return to_string() == string; },
  287. [&](Utf8View const& view) { return view.as_string() == string; },
  288. [&](StringView view) { return view == string; });
  289. }
  290. bool operator!=(StringView const& other) const
  291. {
  292. return !(*this == other);
  293. }
  294. bool operator==(Utf32View const& other) const
  295. {
  296. return m_view.visit(
  297. [&](Utf32View view) {
  298. return view.length() == other.length() && __builtin_memcmp(view.code_points(), other.code_points(), view.length() * sizeof(u32)) == 0;
  299. },
  300. [&](Utf16View) { return to_string() == RegexStringView { other }.to_string(); },
  301. [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); },
  302. [&](StringView view) { return view == RegexStringView { other }.to_string(); });
  303. }
  304. bool operator!=(Utf32View const& other) const
  305. {
  306. return !(*this == other);
  307. }
  308. bool operator==(Utf16View const& other) const
  309. {
  310. return m_view.visit(
  311. [&](Utf32View) { return to_string() == RegexStringView { other }.to_string(); },
  312. [&](Utf16View const& view) { return view == other; },
  313. [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); },
  314. [&](StringView view) { return view == RegexStringView { other }.to_string(); });
  315. }
  316. bool operator!=(Utf16View const& other) const
  317. {
  318. return !(*this == other);
  319. }
  320. bool operator==(Utf8View const& other) const
  321. {
  322. return m_view.visit(
  323. [&](Utf32View) { return to_string() == other.as_string(); },
  324. [&](Utf16View) { return to_string() == other.as_string(); },
  325. [&](Utf8View const& view) { return view.as_string() == other.as_string(); },
  326. [&](StringView view) { return other.as_string() == view; });
  327. }
  328. bool operator!=(Utf8View const& other) const
  329. {
  330. return !(*this == other);
  331. }
  332. bool equals(RegexStringView const& other) const
  333. {
  334. return other.m_view.visit([&](auto const& view) { return operator==(view); });
  335. }
  336. bool equals_ignoring_case(RegexStringView const& other) const
  337. {
  338. // FIXME: Implement equals_ignoring_case() for unicode.
  339. return m_view.visit(
  340. [&](StringView view) {
  341. return other.m_view.visit(
  342. [&](StringView other_view) { return view.equals_ignoring_case(other_view); },
  343. [](auto&) -> bool { TODO(); });
  344. },
  345. [&](Utf16View view) {
  346. return other.m_view.visit(
  347. [&](Utf16View other_view) { return view.equals_ignoring_case(other_view); },
  348. [](auto&) -> bool { TODO(); });
  349. },
  350. [](auto&) -> bool { TODO(); });
  351. }
  352. bool starts_with(StringView const& str) const
  353. {
  354. return m_view.visit(
  355. [&](Utf32View) -> bool {
  356. TODO();
  357. },
  358. [&](Utf16View) -> bool {
  359. TODO();
  360. },
  361. [&](Utf8View const& view) { return view.as_string().starts_with(str); },
  362. [&](StringView view) { return view.starts_with(str); });
  363. }
  364. bool starts_with(Utf32View const& str) const
  365. {
  366. return m_view.visit(
  367. [&](Utf32View view) -> bool {
  368. if (str.length() > view.length())
  369. return false;
  370. if (str.length() == view.length())
  371. return operator==(str);
  372. for (size_t i = 0; i < str.length(); ++i) {
  373. if (str.at(i) != view.at(i))
  374. return false;
  375. }
  376. return true;
  377. },
  378. [&](Utf16View) -> bool { TODO(); },
  379. [&](Utf8View const& view) {
  380. auto it = view.begin();
  381. for (auto code_point : str) {
  382. if (it.done())
  383. return false;
  384. if (code_point != *it)
  385. return false;
  386. ++it;
  387. }
  388. return true;
  389. },
  390. [&](StringView) -> bool { TODO(); });
  391. }
  392. private:
  393. Variant<StringView, Utf8View, Utf16View, Utf32View> m_view;
  394. bool m_unicode { false };
  395. };
  396. class Match final {
  397. private:
  398. Optional<FlyString> string;
  399. public:
  400. Match() = default;
  401. ~Match() = default;
  402. Match(RegexStringView const view_, size_t const line_, size_t const column_, size_t const global_offset_)
  403. : view(view_)
  404. , line(line_)
  405. , column(column_)
  406. , global_offset(global_offset_)
  407. , left_column(column_)
  408. {
  409. }
  410. Match(String const string_, size_t const line_, size_t const column_, size_t const global_offset_)
  411. : string(move(string_))
  412. , view(string.value().view())
  413. , line(line_)
  414. , column(column_)
  415. , global_offset(global_offset_)
  416. {
  417. }
  418. Match(RegexStringView const view_, StringView capture_group_name_, size_t const line_, size_t const column_, size_t const global_offset_)
  419. : view(view_)
  420. , capture_group_name(capture_group_name_)
  421. , line(line_)
  422. , column(column_)
  423. , global_offset(global_offset_)
  424. , left_column(column_)
  425. {
  426. }
  427. void reset()
  428. {
  429. view = view.typed_null_view();
  430. capture_group_name.clear();
  431. line = 0;
  432. column = 0;
  433. global_offset = 0;
  434. left_column = 0;
  435. }
  436. RegexStringView view { nullptr };
  437. Optional<FlyString> capture_group_name {};
  438. size_t line { 0 };
  439. size_t column { 0 };
  440. size_t global_offset { 0 };
  441. // ugly, as not usable by user, but needed to prevent to create extra vectors that are
  442. // able to store the column when the left paren has been found
  443. size_t left_column { 0 };
  444. };
  445. struct MatchInput {
  446. RegexStringView view { nullptr };
  447. AllOptions regex_options {};
  448. size_t start_offset { 0 }; // For Stateful matches, saved and restored from Regex::start_offset.
  449. size_t match_index { 0 };
  450. size_t line { 0 };
  451. size_t column { 0 };
  452. size_t global_offset { 0 }; // For multiline matching, knowing the offset from start could be important
  453. mutable size_t fail_counter { 0 };
  454. mutable Vector<size_t> saved_positions;
  455. mutable Vector<size_t> saved_code_unit_positions;
  456. mutable HashMap<u64, u64> checkpoints;
  457. };
  458. struct MatchState {
  459. size_t string_position_before_match { 0 };
  460. size_t string_position { 0 };
  461. size_t string_position_in_code_units { 0 };
  462. size_t instruction_position { 0 };
  463. size_t fork_at_position { 0 };
  464. Vector<Match> matches;
  465. Vector<Vector<Match>> capture_group_matches;
  466. Vector<u64> repetition_marks;
  467. };
  468. }
  469. using regex::RegexStringView;
  470. template<>
  471. struct AK::Formatter<regex::RegexStringView> : Formatter<StringView> {
  472. void format(FormatBuilder& builder, regex::RegexStringView const& value)
  473. {
  474. auto string = value.to_string();
  475. return Formatter<StringView>::format(builder, string);
  476. }
  477. };