RegexMatcher.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #pragma once
  7. #include "RegexByteCode.h"
  8. #include "RegexMatch.h"
  9. #include "RegexOptions.h"
  10. #include "RegexParser.h"
  11. #include <AK/Forward.h>
  12. #include <AK/GenericLexer.h>
  13. #include <AK/HashMap.h>
  14. #include <AK/NonnullOwnPtrVector.h>
  15. #include <AK/Types.h>
  16. #include <AK/Utf32View.h>
  17. #include <AK/Vector.h>
  18. #include <ctype.h>
  19. #include <stdio.h>
  20. namespace regex {
  21. namespace Detail {
  22. struct Block {
  23. size_t start;
  24. size_t end;
  25. };
  26. }
  27. static constexpr const size_t c_max_recursion = 5000;
  28. static constexpr const size_t c_match_preallocation_count = 0;
  29. struct RegexResult final {
  30. bool success { false };
  31. size_t count { 0 };
  32. Vector<Match> matches;
  33. Vector<Vector<Match>> capture_group_matches;
  34. size_t n_operations { 0 };
  35. size_t n_capture_groups { 0 };
  36. size_t n_named_capture_groups { 0 };
  37. };
  38. template<class Parser>
  39. class Regex;
  40. template<class Parser>
  41. class Matcher final {
  42. public:
  43. Matcher(Regex<Parser> const* pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  44. : m_pattern(pattern)
  45. , m_regex_options(regex_options.value_or({}))
  46. {
  47. }
  48. ~Matcher() = default;
  49. RegexResult match(RegexStringView const&, Optional<typename ParserTraits<Parser>::OptionsType> = {}) const;
  50. RegexResult match(Vector<RegexStringView> const&, Optional<typename ParserTraits<Parser>::OptionsType> = {}) const;
  51. typename ParserTraits<Parser>::OptionsType options() const
  52. {
  53. return m_regex_options;
  54. }
  55. void reset_pattern(Badge<Regex<Parser>>, Regex<Parser> const* pattern)
  56. {
  57. m_pattern = pattern;
  58. }
  59. private:
  60. Optional<bool> execute(MatchInput const& input, MatchState& state, size_t& operations) const;
  61. Regex<Parser> const* m_pattern;
  62. typename ParserTraits<Parser>::OptionsType const m_regex_options;
  63. };
  64. template<class Parser>
  65. class Regex final {
  66. public:
  67. String pattern_value;
  68. regex::Parser::Result parser_result;
  69. OwnPtr<Matcher<Parser>> matcher { nullptr };
  70. mutable size_t start_offset { 0 };
  71. static regex::Parser::Result parse_pattern(StringView pattern, typename ParserTraits<Parser>::OptionsType regex_options = {});
  72. explicit Regex(String pattern, typename ParserTraits<Parser>::OptionsType regex_options = {});
  73. Regex(regex::Parser::Result parse_result, String pattern, typename ParserTraits<Parser>::OptionsType regex_options = {});
  74. ~Regex() = default;
  75. Regex(Regex&&);
  76. Regex& operator=(Regex&&);
  77. typename ParserTraits<Parser>::OptionsType options() const;
  78. void print_bytecode(FILE* f = stdout) const;
  79. String error_string(Optional<String> message = {}) const;
  80. RegexResult match(RegexStringView const& view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  81. {
  82. if (!matcher || parser_result.error != Error::NoError)
  83. return {};
  84. return matcher->match(view, regex_options);
  85. }
  86. RegexResult match(Vector<RegexStringView> const& views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  87. {
  88. if (!matcher || parser_result.error != Error::NoError)
  89. return {};
  90. return matcher->match(views, regex_options);
  91. }
  92. String replace(RegexStringView const& view, StringView const& replacement_pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  93. {
  94. if (!matcher || parser_result.error != Error::NoError)
  95. return {};
  96. StringBuilder builder;
  97. size_t start_offset = 0;
  98. RegexResult result = matcher->match(view, regex_options);
  99. if (!result.success)
  100. return view.to_string();
  101. for (size_t i = 0; i < result.matches.size(); ++i) {
  102. auto& match = result.matches[i];
  103. builder.append(view.substring_view(start_offset, match.global_offset - start_offset).to_string());
  104. start_offset = match.global_offset + match.view.length();
  105. GenericLexer lexer(replacement_pattern);
  106. while (!lexer.is_eof()) {
  107. if (lexer.consume_specific('\\')) {
  108. if (lexer.consume_specific('\\')) {
  109. builder.append('\\');
  110. continue;
  111. }
  112. auto number = lexer.consume_while(isdigit);
  113. if (auto index = number.to_uint(); index.has_value() && result.n_capture_groups >= index.value()) {
  114. builder.append(result.capture_group_matches[i][index.value() - 1].view.to_string());
  115. } else {
  116. builder.appendff("\\{}", number);
  117. }
  118. } else {
  119. builder.append(lexer.consume_while([](auto ch) { return ch != '\\'; }));
  120. }
  121. }
  122. }
  123. builder.append(view.substring_view(start_offset, view.length() - start_offset).to_string());
  124. return builder.to_string();
  125. }
  126. // FIXME: replace(Vector<RegexStringView> const , ...)
  127. RegexResult search(RegexStringView const& view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  128. {
  129. if (!matcher || parser_result.error != Error::NoError)
  130. return {};
  131. AllOptions options = (AllOptions)regex_options.value_or({});
  132. if ((options & AllFlags::MatchNotBeginOfLine) && (options & AllFlags::MatchNotEndOfLine)) {
  133. options.reset_flag(AllFlags::MatchNotEndOfLine);
  134. options.reset_flag(AllFlags::MatchNotBeginOfLine);
  135. }
  136. options.reset_flag(AllFlags::Internal_Stateful);
  137. options |= AllFlags::Global;
  138. return matcher->match(view, options);
  139. }
  140. RegexResult search(Vector<RegexStringView> const& views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  141. {
  142. if (!matcher || parser_result.error != Error::NoError)
  143. return {};
  144. AllOptions options = (AllOptions)regex_options.value_or({});
  145. if ((options & AllFlags::MatchNotBeginOfLine) && (options & AllFlags::MatchNotEndOfLine)) {
  146. options.reset_flag(AllFlags::MatchNotEndOfLine);
  147. options.reset_flag(AllFlags::MatchNotBeginOfLine);
  148. }
  149. options.reset_flag(AllFlags::Internal_Stateful);
  150. options |= AllFlags::Global;
  151. return matcher->match(views, options);
  152. }
  153. bool match(RegexStringView const& view, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  154. {
  155. m = match(view, regex_options);
  156. return m.success;
  157. }
  158. bool match(Vector<RegexStringView> const& views, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  159. {
  160. m = match(views, regex_options);
  161. return m.success;
  162. }
  163. bool search(RegexStringView const& view, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  164. {
  165. m = search(view, regex_options);
  166. return m.success;
  167. }
  168. bool search(Vector<RegexStringView> const& views, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  169. {
  170. m = search(views, regex_options);
  171. return m.success;
  172. }
  173. bool has_match(RegexStringView const& view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  174. {
  175. if (!matcher || parser_result.error != Error::NoError)
  176. return false;
  177. RegexResult result = matcher->match(view, AllOptions { regex_options.value_or({}) } | AllFlags::SkipSubExprResults);
  178. return result.success;
  179. }
  180. bool has_match(Vector<RegexStringView> const& views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  181. {
  182. if (!matcher || parser_result.error != Error::NoError)
  183. return false;
  184. RegexResult result = matcher->match(views, AllOptions { regex_options.value_or({}) } | AllFlags::SkipSubExprResults);
  185. return result.success;
  186. }
  187. private:
  188. void run_optimization_passes();
  189. using BasicBlockList = Vector<Detail::Block>;
  190. BasicBlockList split_basic_blocks();
  191. void attempt_rewrite_loops_as_atomic_groups(BasicBlockList const&);
  192. };
  193. // free standing functions for match, search and has_match
  194. template<class Parser>
  195. RegexResult match(RegexStringView const& view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  196. {
  197. return pattern.match(view, regex_options);
  198. }
  199. template<class Parser>
  200. RegexResult match(Vector<RegexStringView> const& view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  201. {
  202. return pattern.match(view, regex_options);
  203. }
  204. template<class Parser>
  205. bool match(RegexStringView const& view, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  206. {
  207. return pattern.match(view, regex_options);
  208. }
  209. template<class Parser>
  210. bool match(Vector<RegexStringView> const& view, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  211. {
  212. return pattern.match(view, regex_options);
  213. }
  214. template<class Parser>
  215. RegexResult search(RegexStringView const& view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  216. {
  217. return pattern.search(view, regex_options);
  218. }
  219. template<class Parser>
  220. RegexResult search(Vector<RegexStringView> const& views, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  221. {
  222. return pattern.search(views, regex_options);
  223. }
  224. template<class Parser>
  225. bool search(RegexStringView const& view, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  226. {
  227. return pattern.search(view, regex_options);
  228. }
  229. template<class Parser>
  230. bool search(Vector<RegexStringView> const& views, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  231. {
  232. return pattern.search(views, regex_options);
  233. }
  234. template<class Parser>
  235. bool has_match(RegexStringView const& view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  236. {
  237. return pattern.has_match(view, regex_options);
  238. }
  239. template<class Parser>
  240. bool has_match(Vector<RegexStringView> const& views, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  241. {
  242. return pattern.has_match(views, regex_options);
  243. }
  244. }
  245. using regex::has_match;
  246. using regex::match;
  247. using regex::Regex;
  248. using regex::RegexResult;