// RegexMatch.h (6.6 KB)
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #pragma once
  7. #include "RegexOptions.h"
  8. #include "AK/FlyString.h"
  9. #include "AK/HashMap.h"
  10. #include "AK/String.h"
  11. #include "AK/StringBuilder.h"
  12. #include "AK/StringView.h"
  13. #include "AK/Utf32View.h"
  14. #include "AK/Vector.h"
  15. namespace regex {
  16. class RegexStringView {
  17. public:
  18. RegexStringView(const char* chars)
  19. : m_u8view(chars)
  20. {
  21. }
  22. RegexStringView(const String& string)
  23. : m_u8view(string)
  24. {
  25. }
  26. RegexStringView(const StringView view)
  27. : m_u8view(view)
  28. {
  29. }
  30. RegexStringView(const Utf32View view)
  31. : m_u32view(view)
  32. {
  33. }
  34. bool is_u8_view() const { return m_u8view.has_value(); }
  35. bool is_u32_view() const { return m_u32view.has_value(); }
  36. const StringView& u8view() const
  37. {
  38. VERIFY(m_u8view.has_value());
  39. return m_u8view.value();
  40. };
  41. const Utf32View& u32view() const
  42. {
  43. VERIFY(m_u32view.has_value());
  44. return m_u32view.value();
  45. };
  46. bool is_empty() const
  47. {
  48. if (is_u8_view())
  49. return m_u8view.value().is_empty();
  50. else
  51. return m_u32view.value().is_empty();
  52. }
  53. bool is_null() const
  54. {
  55. if (is_u8_view())
  56. return m_u8view.value().is_null();
  57. else
  58. return m_u32view.value().code_points() == nullptr;
  59. }
  60. size_t length() const
  61. {
  62. if (is_u8_view())
  63. return m_u8view.value().length();
  64. else
  65. return m_u32view.value().length();
  66. }
  67. Vector<RegexStringView> lines() const
  68. {
  69. if (is_u8_view()) {
  70. auto views = u8view().lines(false);
  71. Vector<RegexStringView> new_views;
  72. for (auto& view : views)
  73. new_views.append(move(view));
  74. return new_views;
  75. }
  76. // FIXME: line splitting for Utf32View needed
  77. Vector<RegexStringView> views;
  78. views.append(m_u32view.value());
  79. return views;
  80. }
  81. RegexStringView substring_view(size_t offset, size_t length) const
  82. {
  83. if (is_u8_view()) {
  84. return u8view().substring_view(offset, length);
  85. }
  86. return u32view().substring_view(offset, length);
  87. }
  88. String to_string() const
  89. {
  90. if (is_u8_view()) {
  91. return u8view().to_string();
  92. }
  93. StringBuilder builder;
  94. builder.append(u32view());
  95. return builder.to_string();
  96. }
  97. u32 operator[](size_t index) const
  98. {
  99. if (is_u8_view()) {
  100. i8 ch = u8view()[index];
  101. u8 value = *reinterpret_cast<u8*>(&ch);
  102. return static_cast<u32>(value);
  103. }
  104. return u32view().code_points()[index];
  105. }
  106. bool operator==(const char* cstring) const
  107. {
  108. if (is_u8_view())
  109. return u8view() == cstring;
  110. return to_string() == cstring;
  111. }
  112. bool operator!=(const char* cstring) const
  113. {
  114. return !(*this == cstring);
  115. }
  116. bool operator==(const String& string) const
  117. {
  118. if (is_u8_view())
  119. return u8view() == string;
  120. return to_string() == string;
  121. }
  122. bool operator==(const StringView& other) const
  123. {
  124. if (is_u8_view())
  125. return u8view() == other;
  126. return false;
  127. }
  128. bool operator!=(const StringView& other) const
  129. {
  130. return !(*this == other);
  131. }
  132. bool operator==(const Utf32View& other) const
  133. {
  134. if (is_u32_view()) {
  135. StringBuilder builder;
  136. builder.append(other);
  137. return to_string() == builder.to_string();
  138. }
  139. return false;
  140. }
  141. bool operator!=(const Utf32View& other) const
  142. {
  143. return !(*this == other);
  144. }
  145. const char* characters_without_null_termination() const
  146. {
  147. if (is_u8_view())
  148. return u8view().characters_without_null_termination();
  149. return to_string().characters(); // FIXME: it contains the null termination, does that actually matter?
  150. }
  151. bool starts_with(const StringView& str) const
  152. {
  153. if (is_u32_view())
  154. return false;
  155. return u8view().starts_with(str);
  156. }
  157. bool starts_with(const Utf32View& str) const
  158. {
  159. if (is_u8_view())
  160. return false;
  161. StringBuilder builder;
  162. builder.append(str);
  163. return to_string().starts_with(builder.to_string());
  164. }
  165. private:
  166. Optional<StringView> m_u8view;
  167. Optional<Utf32View> m_u32view;
  168. };
  169. class Match final {
  170. private:
  171. Optional<FlyString> string;
  172. public:
  173. Match() = default;
  174. ~Match() = default;
  175. Match(const RegexStringView view_, const size_t line_, const size_t column_, const size_t global_offset_)
  176. : view(view_)
  177. , line(line_)
  178. , column(column_)
  179. , global_offset(global_offset_)
  180. , left_column(column_)
  181. {
  182. }
  183. Match(const String string_, const size_t line_, const size_t column_, const size_t global_offset_)
  184. : string(string_)
  185. , view(string.value().view())
  186. , line(line_)
  187. , column(column_)
  188. , global_offset(global_offset_)
  189. , left_column(column_)
  190. {
  191. }
  192. RegexStringView view { nullptr };
  193. size_t line { 0 };
  194. size_t column { 0 };
  195. size_t global_offset { 0 };
  196. // ugly, as not usable by user, but needed to prevent to create extra vectors that are
  197. // able to store the column when the left paren has been found
  198. size_t left_column { 0 };
  199. };
  200. struct MatchInput {
  201. RegexStringView view { nullptr };
  202. AllOptions regex_options {};
  203. size_t start_offset { 0 }; // For Stateful matches, saved and restored from Regex::start_offset.
  204. size_t match_index { 0 };
  205. size_t line { 0 };
  206. size_t column { 0 };
  207. size_t global_offset { 0 }; // For multiline matching, knowing the offset from start could be important
  208. mutable size_t fail_counter { 0 };
  209. mutable Vector<size_t> saved_positions;
  210. };
  // Position bookkeeping for a match attempt; every field starts at zero.
  // NOTE(review): the per-field descriptions are inferred from the names -
  // confirm against the matcher implementation.
  struct MatchState {
      // Presumably the input position recorded before the current match began.
      size_t string_position_before_match { 0 };
      // Presumably the current position in the input.
      size_t string_position { 0 };
      // Presumably the index of the instruction being executed.
      size_t instruction_position { 0 };
      // Presumably the position at which a fork was requested.
      size_t fork_at_position { 0 };
  };
  217. struct MatchOutput {
  218. size_t operations;
  219. Vector<Match> matches;
  220. Vector<Vector<Match>> capture_group_matches;
  221. Vector<HashMap<String, Match>> named_capture_group_matches;
  222. };
  223. }
  224. using regex::RegexStringView;
  225. template<>
  226. struct AK::Formatter<regex::RegexStringView> : Formatter<StringView> {
  227. void format(FormatBuilder& builder, const regex::RegexStringView& value)
  228. {
  229. return Formatter<StringView>::format(builder, { value.characters_without_null_termination(), value.length() });
  230. }
  231. };