RegexMatch.h 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #pragma once
  7. #include "RegexOptions.h"
  8. #include <AK/FlyString.h>
  9. #include <AK/HashMap.h>
  10. #include <AK/MemMem.h>
  11. #include <AK/String.h>
  12. #include <AK/StringBuilder.h>
  13. #include <AK/StringView.h>
  14. #include <AK/Utf16View.h>
  15. #include <AK/Utf32View.h>
  16. #include <AK/Utf8View.h>
  17. #include <AK/Variant.h>
  18. #include <AK/Vector.h>
  19. namespace regex {
  20. class RegexStringView {
  21. public:
  22. RegexStringView(char const* chars)
  23. : m_view(StringView { chars })
  24. {
  25. }
  26. RegexStringView(String const& string)
  27. : m_view(string.view())
  28. {
  29. }
  30. RegexStringView(StringView const view)
  31. : m_view(view)
  32. {
  33. }
  34. RegexStringView(Utf32View view)
  35. : m_view(view)
  36. {
  37. }
  38. RegexStringView(Utf16View view)
  39. : m_view(view)
  40. {
  41. }
  42. RegexStringView(Utf8View view)
  43. : m_view(view)
  44. {
  45. }
  46. StringView const& string_view() const
  47. {
  48. return m_view.get<StringView>();
  49. }
  50. Utf32View const& u32_view() const
  51. {
  52. return m_view.get<Utf32View>();
  53. }
  54. Utf16View const& u16_view() const
  55. {
  56. return m_view.get<Utf16View>();
  57. }
  58. Utf8View const& u8_view() const
  59. {
  60. return m_view.get<Utf8View>();
  61. }
  62. bool unicode() const { return m_unicode; }
  63. void set_unicode(bool unicode) { m_unicode = unicode; }
  64. bool is_empty() const
  65. {
  66. return m_view.visit([](auto& view) { return view.is_empty(); });
  67. }
  68. bool is_null() const
  69. {
  70. return m_view.visit([](auto& view) { return view.is_null(); });
  71. }
  72. size_t length() const
  73. {
  74. if (unicode()) {
  75. return m_view.visit(
  76. [](Utf16View const& view) { return view.length_in_code_points(); },
  77. [](auto const& view) { return view.length(); });
  78. }
  79. return m_view.visit(
  80. [](Utf16View const& view) { return view.length_in_code_units(); },
  81. [](Utf8View const& view) { return view.byte_length(); },
  82. [](auto const& view) { return view.length(); });
  83. }
  84. RegexStringView construct_as_same(Span<u32> data, Optional<String>& optional_string_storage, Vector<u16>& optional_utf16_storage) const
  85. {
  86. auto view = m_view.visit(
  87. [&]<typename T>(T const&) {
  88. StringBuilder builder;
  89. for (auto ch : data)
  90. builder.append(ch); // Note: The type conversion is intentional.
  91. optional_string_storage = builder.build();
  92. return RegexStringView { T { *optional_string_storage } };
  93. },
  94. [&](Utf32View) {
  95. return RegexStringView { Utf32View { data.data(), data.size() } };
  96. },
  97. [&](Utf16View) {
  98. optional_utf16_storage = AK::utf32_to_utf16(Utf32View { data.data(), data.size() });
  99. return RegexStringView { Utf16View { optional_utf16_storage } };
  100. });
  101. view.set_unicode(unicode());
  102. return view;
  103. }
  104. Vector<RegexStringView> lines() const
  105. {
  106. return m_view.visit(
  107. [](StringView view) {
  108. auto views = view.lines(false);
  109. Vector<RegexStringView> new_views;
  110. for (auto& view : views)
  111. new_views.empend(view);
  112. return new_views;
  113. },
  114. [](Utf32View view) {
  115. Vector<RegexStringView> views;
  116. u32 newline = '\n';
  117. while (!view.is_empty()) {
  118. auto position = AK::memmem_optional(view.code_points(), view.length() * sizeof(u32), &newline, sizeof(u32));
  119. if (!position.has_value())
  120. break;
  121. auto offset = position.value() / sizeof(u32);
  122. views.empend(view.substring_view(0, offset));
  123. view = view.substring_view(offset + 1, view.length() - offset - 1);
  124. }
  125. if (!view.is_empty())
  126. views.empend(view);
  127. return views;
  128. },
  129. [](Utf16View view) {
  130. Vector<RegexStringView> views;
  131. u16 newline = '\n';
  132. while (!view.is_empty()) {
  133. auto position = AK::memmem_optional(view.data(), view.length_in_code_units() * sizeof(u16), &newline, sizeof(u16));
  134. if (!position.has_value())
  135. break;
  136. auto offset = position.value() / sizeof(u16);
  137. views.empend(view.substring_view(0, offset));
  138. view = view.substring_view(offset + 1, view.length_in_code_units() - offset - 1);
  139. }
  140. if (!view.is_empty())
  141. views.empend(view);
  142. return views;
  143. },
  144. [](Utf8View& view) {
  145. Vector<RegexStringView> views;
  146. auto it = view.begin();
  147. auto previous_newline_position_it = it;
  148. for (;;) {
  149. if (*it == '\n') {
  150. auto previous_offset = view.byte_offset_of(previous_newline_position_it);
  151. auto new_offset = view.byte_offset_of(it);
  152. auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
  153. views.empend(slice);
  154. ++it;
  155. previous_newline_position_it = it;
  156. }
  157. if (it.done())
  158. break;
  159. ++it;
  160. }
  161. if (it != previous_newline_position_it) {
  162. auto previous_offset = view.byte_offset_of(previous_newline_position_it);
  163. auto new_offset = view.byte_offset_of(it);
  164. auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
  165. views.empend(slice);
  166. }
  167. return views;
  168. });
  169. }
  170. RegexStringView substring_view(size_t offset, size_t length) const
  171. {
  172. if (unicode()) {
  173. auto view = m_view.visit(
  174. [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; },
  175. [&](Utf16View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; },
  176. [&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; });
  177. view.set_unicode(unicode());
  178. return view;
  179. }
  180. auto view = m_view.visit([&](auto view) { return RegexStringView { view.substring_view(offset, length) }; });
  181. view.set_unicode(unicode());
  182. return view;
  183. }
  184. String to_string() const
  185. {
  186. return m_view.visit(
  187. [](StringView view) { return view.to_string(); },
  188. [](Utf16View view) { return view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes); },
  189. [](auto& view) {
  190. StringBuilder builder;
  191. for (auto it = view.begin(); it != view.end(); ++it)
  192. builder.append_code_point(*it);
  193. return builder.to_string();
  194. });
  195. }
  196. u32 operator[](size_t index) const
  197. {
  198. return m_view.visit(
  199. [&](StringView view) -> u32 {
  200. auto ch = view[index];
  201. if (ch < 0)
  202. return 256u + ch;
  203. return ch;
  204. },
  205. [&](Utf32View& view) -> u32 { return view[index]; },
  206. [&](auto& view) -> u32 {
  207. size_t i = index;
  208. for (auto it = view.begin(); it != view.end(); ++it, --i) {
  209. if (i == 0)
  210. return *it;
  211. }
  212. VERIFY_NOT_REACHED();
  213. });
  214. }
  215. bool operator==(char const* cstring) const
  216. {
  217. return m_view.visit(
  218. [&](Utf32View) { return to_string() == cstring; },
  219. [&](Utf16View) { return to_string() == cstring; },
  220. [&](Utf8View const& view) { return view.as_string() == cstring; },
  221. [&](StringView view) { return view == cstring; });
  222. }
  223. bool operator!=(char const* cstring) const
  224. {
  225. return !(*this == cstring);
  226. }
  227. bool operator==(String const& string) const
  228. {
  229. return m_view.visit(
  230. [&](Utf32View) { return to_string() == string; },
  231. [&](Utf16View) { return to_string() == string; },
  232. [&](Utf8View const& view) { return view.as_string() == string; },
  233. [&](StringView view) { return view == string; });
  234. }
  235. bool operator==(StringView const& string) const
  236. {
  237. return m_view.visit(
  238. [&](Utf32View) { return to_string() == string; },
  239. [&](Utf16View) { return to_string() == string; },
  240. [&](Utf8View const& view) { return view.as_string() == string; },
  241. [&](StringView view) { return view == string; });
  242. }
  243. bool operator!=(StringView const& other) const
  244. {
  245. return !(*this == other);
  246. }
  247. bool operator==(Utf32View const& other) const
  248. {
  249. return m_view.visit(
  250. [&](Utf32View view) {
  251. return view.length() == other.length() && __builtin_memcmp(view.code_points(), other.code_points(), view.length() * sizeof(u32)) == 0;
  252. },
  253. [&](Utf16View) { return to_string() == RegexStringView { other }.to_string(); },
  254. [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); },
  255. [&](StringView view) { return view == RegexStringView { other }.to_string(); });
  256. }
  257. bool operator!=(Utf32View const& other) const
  258. {
  259. return !(*this == other);
  260. }
  261. bool operator==(Utf16View const& other) const
  262. {
  263. return m_view.visit(
  264. [&](Utf32View) { return to_string() == RegexStringView { other }.to_string(); },
  265. [&](Utf16View const& view) { return view == other; },
  266. [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); },
  267. [&](StringView view) { return view == RegexStringView { other }.to_string(); });
  268. }
  269. bool operator!=(Utf16View const& other) const
  270. {
  271. return !(*this == other);
  272. }
  273. bool operator==(Utf8View const& other) const
  274. {
  275. return m_view.visit(
  276. [&](Utf32View) { return to_string() == other.as_string(); },
  277. [&](Utf16View) { return to_string() == other.as_string(); },
  278. [&](Utf8View const& view) { return view.as_string() == other.as_string(); },
  279. [&](StringView view) { return other.as_string() == view; });
  280. }
  281. bool operator!=(Utf8View const& other) const
  282. {
  283. return !(*this == other);
  284. }
  285. bool equals(RegexStringView const& other) const
  286. {
  287. return other.m_view.visit([&](auto const& view) { return operator==(view); });
  288. }
  289. bool equals_ignoring_case(RegexStringView const& other) const
  290. {
  291. // FIXME: Implement equals_ignoring_case() for unicode.
  292. return m_view.visit(
  293. [&](StringView view) {
  294. return other.m_view.visit(
  295. [&](StringView other_view) { return view.equals_ignoring_case(other_view); },
  296. [](auto&) -> bool { TODO(); });
  297. },
  298. [](auto&) -> bool { TODO(); });
  299. }
  300. bool starts_with(StringView const& str) const
  301. {
  302. return m_view.visit(
  303. [&](Utf32View) -> bool {
  304. TODO();
  305. },
  306. [&](Utf16View) -> bool {
  307. TODO();
  308. },
  309. [&](Utf8View const& view) { return view.as_string().starts_with(str); },
  310. [&](StringView view) { return view.starts_with(str); });
  311. }
  312. bool starts_with(Utf32View const& str) const
  313. {
  314. return m_view.visit(
  315. [&](Utf32View view) -> bool {
  316. if (str.length() > view.length())
  317. return false;
  318. if (str.length() == view.length())
  319. return operator==(str);
  320. for (size_t i = 0; i < str.length(); ++i) {
  321. if (str.at(i) != view.at(i))
  322. return false;
  323. }
  324. return true;
  325. },
  326. [&](Utf16View) -> bool { TODO(); },
  327. [&](Utf8View const& view) {
  328. auto it = view.begin();
  329. for (auto code_point : str) {
  330. if (it.done())
  331. return false;
  332. if (code_point != *it)
  333. return false;
  334. ++it;
  335. }
  336. return true;
  337. },
  338. [&](StringView) -> bool { TODO(); });
  339. }
  340. private:
  341. Variant<StringView, Utf8View, Utf16View, Utf32View> m_view;
  342. bool m_unicode { false };
  343. };
  344. class Match final {
  345. private:
  346. Optional<FlyString> string;
  347. public:
  348. Match() = default;
  349. ~Match() = default;
  350. Match(RegexStringView const view_, size_t const line_, size_t const column_, size_t const global_offset_)
  351. : view(view_)
  352. , line(line_)
  353. , column(column_)
  354. , global_offset(global_offset_)
  355. , left_column(column_)
  356. {
  357. }
  358. Match(String const string_, size_t const line_, size_t const column_, size_t const global_offset_)
  359. : string(string_)
  360. , view(string.value().view())
  361. , line(line_)
  362. , column(column_)
  363. , global_offset(global_offset_)
  364. , left_column(column_)
  365. {
  366. }
  367. RegexStringView view { nullptr };
  368. size_t line { 0 };
  369. size_t column { 0 };
  370. size_t global_offset { 0 };
  371. // ugly, as not usable by user, but needed to prevent to create extra vectors that are
  372. // able to store the column when the left paren has been found
  373. size_t left_column { 0 };
  374. };
  375. struct MatchInput {
  376. RegexStringView view { nullptr };
  377. AllOptions regex_options {};
  378. size_t start_offset { 0 }; // For Stateful matches, saved and restored from Regex::start_offset.
  379. size_t match_index { 0 };
  380. size_t line { 0 };
  381. size_t column { 0 };
  382. size_t global_offset { 0 }; // For multiline matching, knowing the offset from start could be important
  383. mutable size_t fail_counter { 0 };
  384. mutable Vector<size_t> saved_positions;
  385. };
  386. struct MatchState {
  387. size_t string_position_before_match { 0 };
  388. size_t string_position { 0 };
  389. size_t instruction_position { 0 };
  390. size_t fork_at_position { 0 };
  391. Vector<Match> matches;
  392. Vector<Vector<Match>> capture_group_matches;
  393. Vector<HashMap<String, Match>> named_capture_group_matches;
  394. };
  395. struct MatchOutput {
  396. size_t operations;
  397. Vector<Match> matches;
  398. Vector<Vector<Match>> capture_group_matches;
  399. Vector<HashMap<String, Match>> named_capture_group_matches;
  400. };
  401. }
  402. using regex::RegexStringView;
  403. template<>
  404. struct AK::Formatter<regex::RegexStringView> : Formatter<StringView> {
  405. void format(FormatBuilder& builder, regex::RegexStringView const& value)
  406. {
  407. auto string = value.to_string();
  408. return Formatter<StringView>::format(builder, string);
  409. }
  410. };