RegexMatcher.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #pragma once
  7. #include "RegexByteCode.h"
  8. #include "RegexMatch.h"
  9. #include "RegexOptions.h"
  10. #include "RegexParser.h"
  11. #include <AK/Forward.h>
  12. #include <AK/GenericLexer.h>
  13. #include <AK/HashMap.h>
  14. #include <AK/Types.h>
  15. #include <AK/Utf32View.h>
  16. #include <AK/Vector.h>
  17. #include <ctype.h>
  18. #include <stdio.h>
  19. namespace regex {
  20. namespace Detail {
  21. struct Block {
  22. size_t start;
  23. size_t end;
  24. };
  25. }
  26. static constexpr const size_t c_max_recursion = 5000;
  27. static constexpr const size_t c_match_preallocation_count = 0;
  28. struct RegexResult final {
  29. bool success { false };
  30. size_t count { 0 };
  31. Vector<Match> matches;
  32. Vector<Vector<Match>> capture_group_matches;
  33. size_t n_operations { 0 };
  34. size_t n_capture_groups { 0 };
  35. size_t n_named_capture_groups { 0 };
  36. };
  37. template<class Parser>
  38. class Regex;
  39. template<class Parser>
  40. class Matcher final {
  41. public:
  42. Matcher(Regex<Parser> const* pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  43. : m_pattern(pattern)
  44. , m_regex_options(regex_options.value_or({}))
  45. {
  46. }
  47. ~Matcher() = default;
  48. RegexResult match(RegexStringView, Optional<typename ParserTraits<Parser>::OptionsType> = {}) const;
  49. RegexResult match(Vector<RegexStringView> const&, Optional<typename ParserTraits<Parser>::OptionsType> = {}) const;
  50. typename ParserTraits<Parser>::OptionsType options() const
  51. {
  52. return m_regex_options;
  53. }
  54. void reset_pattern(Badge<Regex<Parser>>, Regex<Parser> const* pattern)
  55. {
  56. m_pattern = pattern;
  57. }
  58. private:
  59. bool execute(MatchInput const& input, MatchState& state, size_t& operations) const;
  60. Regex<Parser> const* m_pattern;
  61. typename ParserTraits<Parser>::OptionsType const m_regex_options;
  62. };
  63. template<class Parser>
  64. class Regex final {
  65. public:
  66. ByteString pattern_value;
  67. regex::Parser::Result parser_result;
  68. OwnPtr<Matcher<Parser>> matcher { nullptr };
  69. mutable size_t start_offset { 0 };
  70. static regex::Parser::Result parse_pattern(StringView pattern, typename ParserTraits<Parser>::OptionsType regex_options = {});
  71. explicit Regex(ByteString pattern, typename ParserTraits<Parser>::OptionsType regex_options = {});
  72. Regex(regex::Parser::Result parse_result, ByteString pattern, typename ParserTraits<Parser>::OptionsType regex_options = {});
  73. ~Regex() = default;
  74. Regex(Regex&&);
  75. Regex& operator=(Regex&&);
  76. typename ParserTraits<Parser>::OptionsType options() const;
  77. ByteString error_string(Optional<ByteString> message = {}) const;
  78. RegexResult match(RegexStringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  79. {
  80. if (!matcher || parser_result.error != Error::NoError)
  81. return {};
  82. return matcher->match(view, regex_options);
  83. }
  84. RegexResult match(Vector<RegexStringView> const& views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  85. {
  86. if (!matcher || parser_result.error != Error::NoError)
  87. return {};
  88. return matcher->match(views, regex_options);
  89. }
  90. ByteString replace(RegexStringView view, StringView replacement_pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  91. {
  92. if (!matcher || parser_result.error != Error::NoError)
  93. return {};
  94. StringBuilder builder;
  95. size_t start_offset = 0;
  96. RegexResult result = matcher->match(view, regex_options);
  97. if (!result.success)
  98. return view.to_byte_string();
  99. for (size_t i = 0; i < result.matches.size(); ++i) {
  100. auto& match = result.matches[i];
  101. builder.append(view.substring_view(start_offset, match.global_offset - start_offset).to_byte_string());
  102. start_offset = match.global_offset + match.view.length();
  103. GenericLexer lexer(replacement_pattern);
  104. while (!lexer.is_eof()) {
  105. if (lexer.consume_specific('\\')) {
  106. if (lexer.consume_specific('\\')) {
  107. builder.append('\\');
  108. continue;
  109. }
  110. auto number = lexer.consume_while(isdigit);
  111. if (auto index = number.to_number<unsigned>(); index.has_value() && result.n_capture_groups >= index.value()) {
  112. builder.append(result.capture_group_matches[i][index.value() - 1].view.to_byte_string());
  113. } else {
  114. builder.appendff("\\{}", number);
  115. }
  116. } else {
  117. builder.append(lexer.consume_while([](auto ch) { return ch != '\\'; }));
  118. }
  119. }
  120. }
  121. builder.append(view.substring_view(start_offset, view.length() - start_offset).to_byte_string());
  122. return builder.to_byte_string();
  123. }
  124. // FIXME: replace(Vector<RegexStringView> const , ...)
  125. RegexResult search(RegexStringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  126. {
  127. if (!matcher || parser_result.error != Error::NoError)
  128. return {};
  129. AllOptions options = (AllOptions)regex_options.value_or({});
  130. if ((options & AllFlags::MatchNotBeginOfLine) && (options & AllFlags::MatchNotEndOfLine)) {
  131. options.reset_flag(AllFlags::MatchNotEndOfLine);
  132. options.reset_flag(AllFlags::MatchNotBeginOfLine);
  133. }
  134. options.reset_flag(AllFlags::Internal_Stateful);
  135. options |= AllFlags::Global;
  136. return matcher->match(view, options);
  137. }
  138. RegexResult search(Vector<RegexStringView> const& views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  139. {
  140. if (!matcher || parser_result.error != Error::NoError)
  141. return {};
  142. AllOptions options = (AllOptions)regex_options.value_or({});
  143. if ((options & AllFlags::MatchNotBeginOfLine) && (options & AllFlags::MatchNotEndOfLine)) {
  144. options.reset_flag(AllFlags::MatchNotEndOfLine);
  145. options.reset_flag(AllFlags::MatchNotBeginOfLine);
  146. }
  147. options.reset_flag(AllFlags::Internal_Stateful);
  148. options |= AllFlags::Global;
  149. return matcher->match(views, options);
  150. }
  151. bool match(RegexStringView view, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  152. {
  153. m = match(view, regex_options);
  154. return m.success;
  155. }
  156. bool match(Vector<RegexStringView> const& views, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  157. {
  158. m = match(views, regex_options);
  159. return m.success;
  160. }
  161. bool search(RegexStringView view, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  162. {
  163. m = search(view, regex_options);
  164. return m.success;
  165. }
  166. bool search(Vector<RegexStringView> const& views, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  167. {
  168. m = search(views, regex_options);
  169. return m.success;
  170. }
  171. bool has_match(RegexStringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  172. {
  173. if (!matcher || parser_result.error != Error::NoError)
  174. return false;
  175. RegexResult result = matcher->match(view, AllOptions { regex_options.value_or({}) } | AllFlags::SkipSubExprResults);
  176. return result.success;
  177. }
  178. bool has_match(Vector<RegexStringView> const& views, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
  179. {
  180. if (!matcher || parser_result.error != Error::NoError)
  181. return false;
  182. RegexResult result = matcher->match(views, AllOptions { regex_options.value_or({}) } | AllFlags::SkipSubExprResults);
  183. return result.success;
  184. }
  185. using BasicBlockList = Vector<Detail::Block>;
  186. static BasicBlockList split_basic_blocks(ByteCode const&);
  187. private:
  188. void run_optimization_passes();
  189. void attempt_rewrite_loops_as_atomic_groups(BasicBlockList const&);
  190. bool attempt_rewrite_entire_match_as_substring_search(BasicBlockList const&);
  191. };
  192. // free standing functions for match, search and has_match
  193. template<class Parser>
  194. RegexResult match(RegexStringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  195. {
  196. return pattern.match(view, regex_options);
  197. }
  198. template<class Parser>
  199. RegexResult match(Vector<RegexStringView> const& view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  200. {
  201. return pattern.match(view, regex_options);
  202. }
  203. template<class Parser>
  204. bool match(RegexStringView view, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  205. {
  206. return pattern.match(view, regex_options);
  207. }
  208. template<class Parser>
  209. bool match(Vector<RegexStringView> const& view, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  210. {
  211. return pattern.match(view, regex_options);
  212. }
  213. template<class Parser>
  214. RegexResult search(RegexStringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  215. {
  216. return pattern.search(view, regex_options);
  217. }
  218. template<class Parser>
  219. RegexResult search(Vector<RegexStringView> const& views, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  220. {
  221. return pattern.search(views, regex_options);
  222. }
  223. template<class Parser>
  224. bool search(RegexStringView view, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  225. {
  226. return pattern.search(view, regex_options);
  227. }
  228. template<class Parser>
  229. bool search(Vector<RegexStringView> const& views, Regex<Parser>& pattern, RegexResult&, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  230. {
  231. return pattern.search(views, regex_options);
  232. }
  233. template<class Parser>
  234. bool has_match(RegexStringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  235. {
  236. return pattern.has_match(view, regex_options);
  237. }
  238. template<class Parser>
  239. bool has_match(Vector<RegexStringView> const& views, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
  240. {
  241. return pattern.has_match(views, regex_options);
  242. }
  243. }
  244. using regex::has_match;
  245. using regex::match;
  246. using regex::Regex;
  247. using regex::RegexResult;