RegexMatcher.h 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #pragma once
  7. #include "RegexByteCode.h"
  8. #include "RegexMatch.h"
  9. #include "RegexOptions.h"
  10. #include "RegexParser.h"
  11. #include <AK/Forward.h>
  12. #include <AK/HashMap.h>
  13. #include <AK/NonnullOwnPtrVector.h>
  14. #include <AK/Types.h>
  15. #include <AK/Utf32View.h>
  16. #include <AK/Vector.h>
  17. #include <ctype.h>
  18. #include <stdio.h>
  19. namespace regex {
  20. static const constexpr size_t c_max_recursion = 5000;
  21. static const constexpr size_t c_match_preallocation_count = 0;
  22. struct RegexResult final {
  23. bool success { false };
  24. size_t count { 0 };
  25. Vector<Match> matches;
  26. Vector<Vector<Match>> capture_group_matches;
  27. Vector<HashMap<String, Match>> named_capture_group_matches;
  28. size_t n_operations { 0 };
  29. size_t n_capture_groups { 0 };
  30. size_t n_named_capture_groups { 0 };
  31. };
  32. template<class Parser>
  33. class Regex;
  34. template<class Parser>
  35. class Matcher final {
  36. public:
  37. Matcher(const Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  38. : m_pattern(pattern)
  39. , m_regex_options(regex_options.value_or({}))
  40. {
  41. }
  42. ~Matcher() = default;
  43. RegexResult match(const RegexStringView&, Optional<typename ParserTraits<Parser>::OptionsType> = {}) const;
  44. RegexResult match(const Vector<RegexStringView>, Optional<typename ParserTraits<Parser>::OptionsType> = {}) const;
  45. typename ParserTraits<Parser>::OptionsType options() const
  46. {
  47. return m_regex_options;
  48. }
  49. private:
  50. Optional<bool> execute(const MatchInput& input, MatchState& state, MatchOutput& output, size_t recursion_level) const;
  51. ALWAYS_INLINE Optional<bool> execute_low_prio_forks(const MatchInput& input, MatchState& original_state, MatchOutput& output, Vector<MatchState> states, size_t recursion_level) const;
  52. const Regex<Parser>& m_pattern;
  53. const typename ParserTraits<Parser>::OptionsType m_regex_options;
  54. };
  55. template<class Parser>
  56. class Regex final {
  57. public:
  58. String pattern_value;
  59. regex::Parser::Result parser_result;
  60. OwnPtr<Matcher<Parser>> matcher { nullptr };
  61. mutable size_t start_offset { 0 };
  62. explicit Regex(StringView pattern, typename ParserTraits<Parser>::OptionsType regex_options = {});
  63. ~Regex() = default;
  64. Regex(Regex&&) = default;
  65. Regex& operator=(Regex&&) = default;
  66. typename ParserTraits<Parser>::OptionsType options() const;
  67. void print_bytecode(FILE* f = stdout) const;
  68. String error_string(Optional<String> message = {}) const;
  69. RegexResult match(const RegexStringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  70. {
  71. if (!matcher || parser_result.error != Error::NoError)
  72. return {};
  73. return matcher->match(view, regex_options);
  74. }
  75. RegexResult match(const Vector<RegexStringView> views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  76. {
  77. if (!matcher || parser_result.error != Error::NoError)
  78. return {};
  79. return matcher->match(views, regex_options);
  80. }
  81. String replace(const RegexStringView view, const StringView& replacement_pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  82. {
  83. if (!matcher || parser_result.error != Error::NoError)
  84. return {};
  85. StringBuilder builder;
  86. size_t start_offset = 0;
  87. RegexResult result = matcher->match(view, regex_options);
  88. if (!result.success)
  89. return view.to_string();
  90. for (size_t i = 0; i < result.matches.size(); ++i) {
  91. auto& match = result.matches[i];
  92. builder.append(view.substring_view(start_offset, match.global_offset - start_offset).to_string());
  93. start_offset = match.global_offset + match.view.length();
  94. GenericLexer lexer(replacement_pattern);
  95. while (!lexer.is_eof()) {
  96. if (lexer.consume_specific('\\')) {
  97. if (lexer.consume_specific('\\')) {
  98. builder.append('\\');
  99. continue;
  100. }
  101. auto number = lexer.consume_while(isdigit);
  102. if (auto index = number.to_uint(); index.has_value() && result.n_capture_groups >= index.value()) {
  103. builder.append(result.capture_group_matches[i][index.value() - 1].view.to_string());
  104. } else {
  105. builder.appendff("\\{}", number);
  106. }
  107. } else {
  108. builder.append(lexer.consume_while([](auto ch) { return ch != '\\'; }));
  109. }
  110. }
  111. }
  112. builder.append(view.substring_view(start_offset, view.length() - start_offset).to_string());
  113. return builder.to_string();
  114. }
  115. // FIXME: replace(const Vector<RegexStringView>, ...)
  116. RegexResult search(const RegexStringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  117. {
  118. if (!matcher || parser_result.error != Error::NoError)
  119. return {};
  120. AllOptions options = (AllOptions)regex_options.value_or({});
  121. if ((options & AllFlags::MatchNotBeginOfLine) && (options & AllFlags::MatchNotEndOfLine)) {
  122. options.reset_flag(AllFlags::MatchNotEndOfLine);
  123. options.reset_flag(AllFlags::MatchNotBeginOfLine);
  124. }
  125. options.reset_flag(AllFlags::Internal_Stateful);
  126. options |= AllFlags::Global;
  127. return matcher->match(view, options);
  128. }
  129. RegexResult search(const Vector<RegexStringView> views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  130. {
  131. if (!matcher || parser_result.error != Error::NoError)
  132. return {};
  133. AllOptions options = (AllOptions)regex_options.value_or({});
  134. if ((options & AllFlags::MatchNotBeginOfLine) && (options & AllFlags::MatchNotEndOfLine)) {
  135. options.reset_flag(AllFlags::MatchNotEndOfLine);
  136. options.reset_flag(AllFlags::MatchNotBeginOfLine);
  137. }
  138. options.reset_flag(AllFlags::Internal_Stateful);
  139. options |= AllFlags::Global;
  140. return matcher->match(views, options);
  141. }
  142. bool match(const RegexStringView view, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  143. {
  144. m = match(view, regex_options);
  145. return m.success;
  146. }
  147. bool match(const Vector<RegexStringView> views, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  148. {
  149. m = match(views, regex_options);
  150. return m.success;
  151. }
  152. bool search(const RegexStringView view, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  153. {
  154. m = search(view, regex_options);
  155. return m.success;
  156. }
  157. bool search(const Vector<RegexStringView> views, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  158. {
  159. m = search(views, regex_options);
  160. return m.success;
  161. }
  162. bool has_match(const RegexStringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  163. {
  164. if (!matcher || parser_result.error != Error::NoError)
  165. return false;
  166. RegexResult result = matcher->match(view, AllOptions { regex_options.value_or({}) } | AllFlags::SkipSubExprResults);
  167. return result.success;
  168. }
  169. bool has_match(const Vector<RegexStringView> views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  170. {
  171. if (!matcher || parser_result.error != Error::NoError)
  172. return false;
  173. RegexResult result = matcher->match(views, AllOptions { regex_options.value_or({}) } | AllFlags::SkipSubExprResults);
  174. return result.success;
  175. }
  176. };
  177. // free standing functions for match, search and has_match
  178. template<class Parser>
  179. RegexResult match(const RegexStringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  180. {
  181. return pattern.match(view, regex_options);
  182. }
  183. template<class Parser>
  184. RegexResult match(const Vector<RegexStringView> view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  185. {
  186. return pattern.match(view, regex_options);
  187. }
  188. template<class Parser>
  189. bool match(const RegexStringView view, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  190. {
  191. return pattern.match(view, regex_options);
  192. }
  193. template<class Parser>
  194. bool match(const Vector<RegexStringView> view, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  195. {
  196. return pattern.match(view, regex_options);
  197. }
  198. template<class Parser>
  199. RegexResult search(const RegexStringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  200. {
  201. return pattern.search(view, regex_options);
  202. }
  203. template<class Parser>
  204. RegexResult search(const Vector<RegexStringView> views, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  205. {
  206. return pattern.search(views, regex_options);
  207. }
  208. template<class Parser>
  209. bool search(const RegexStringView view, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  210. {
  211. return pattern.search(view, regex_options);
  212. }
  213. template<class Parser>
  214. bool search(const Vector<RegexStringView> views, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  215. {
  216. return pattern.search(views, regex_options);
  217. }
  218. template<class Parser>
  219. bool has_match(const RegexStringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  220. {
  221. return pattern.has_match(view, regex_options);
  222. }
  223. template<class Parser>
  224. bool has_match(const Vector<RegexStringView> views, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  225. {
  226. return pattern.has_match(views, regex_options);
  227. }
  228. }
  229. using regex::has_match;
  230. using regex::match;
  231. using regex::Regex;
  232. using regex::RegexResult;