RegexParser.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #pragma once
  7. #include "RegexByteCode.h"
  8. #include "RegexError.h"
  9. #include "RegexLexer.h"
  10. #include "RegexOptions.h"
  11. #include <AK/Forward.h>
  12. #include <AK/StringBuilder.h>
  13. #include <AK/Types.h>
  14. #include <AK/Vector.h>
  15. #include <LibUnicode/Forward.h>
  16. namespace regex {
  17. class PosixExtendedParser;
  18. class PosixBasicParser;
  19. class ECMA262Parser;
  20. template<typename T>
  21. struct GenericParserTraits {
  22. using OptionsType = T;
  23. };
  24. template<typename T>
  25. struct ParserTraits : public GenericParserTraits<T> {
  26. };
  27. template<>
  28. struct ParserTraits<PosixExtendedParser> : public GenericParserTraits<PosixOptions> {
  29. };
  30. template<>
  31. struct ParserTraits<PosixBasicParser> : public GenericParserTraits<PosixOptions> {
  32. };
  33. template<>
  34. struct ParserTraits<ECMA262Parser> : public GenericParserTraits<ECMAScriptOptions> {
  35. };
  36. class Parser {
  37. public:
  38. struct Result {
  39. ByteCode bytecode;
  40. size_t capture_groups_count;
  41. size_t named_capture_groups_count;
  42. size_t match_length_minimum;
  43. Error error;
  44. Token error_token;
  45. Vector<DeprecatedFlyString> capture_groups;
  46. AllOptions options;
  47. struct {
  48. Optional<ByteString> pure_substring_search;
  49. } optimization_data {};
  50. };
  51. explicit Parser(Lexer& lexer)
  52. : m_parser_state(lexer)
  53. {
  54. }
  55. Parser(Lexer& lexer, AllOptions regex_options)
  56. : m_parser_state(lexer, regex_options)
  57. {
  58. }
  59. virtual ~Parser() = default;
  60. Result parse(Optional<AllOptions> regex_options = {});
  61. bool has_error() const { return m_parser_state.error != Error::NoError; }
  62. Error error() const { return m_parser_state.error; }
  63. AllOptions options() const { return m_parser_state.regex_options; }
  64. protected:
  65. virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0;
  66. ALWAYS_INLINE bool match(TokenType type) const;
  67. ALWAYS_INLINE bool match(char ch) const;
  68. ALWAYS_INLINE bool match_ordinary_characters();
  69. ALWAYS_INLINE Token consume();
  70. ALWAYS_INLINE Token consume(TokenType type, Error error);
  71. ALWAYS_INLINE bool consume(ByteString const&);
  72. ALWAYS_INLINE Optional<u32> consume_escaped_code_point(bool unicode);
  73. ALWAYS_INLINE bool try_skip(StringView);
  74. ALWAYS_INLINE bool lookahead_any(StringView);
  75. ALWAYS_INLINE unsigned char skip();
  76. ALWAYS_INLINE void back(size_t = 1);
  77. ALWAYS_INLINE void reset();
  78. ALWAYS_INLINE bool done() const;
  79. ALWAYS_INLINE bool set_error(Error error);
  80. size_t tell() const { return m_parser_state.current_token.position(); }
  81. struct NamedCaptureGroup {
  82. size_t group_index { 0 };
  83. size_t minimum_length { 0 };
  84. };
  85. struct ParserState {
  86. Lexer& lexer;
  87. Token current_token;
  88. Error error = Error::NoError;
  89. Token error_token { TokenType::Eof, 0, {} };
  90. ByteCode bytecode;
  91. size_t capture_groups_count { 0 };
  92. size_t named_capture_groups_count { 0 };
  93. size_t match_length_minimum { 0 };
  94. size_t repetition_mark_count { 0 };
  95. AllOptions regex_options;
  96. HashMap<int, size_t> capture_group_minimum_lengths;
  97. HashMap<DeprecatedFlyString, NamedCaptureGroup> named_capture_groups;
  98. explicit ParserState(Lexer& lexer)
  99. : lexer(lexer)
  100. , current_token(lexer.next())
  101. {
  102. }
  103. explicit ParserState(Lexer& lexer, AllOptions regex_options)
  104. : lexer(lexer)
  105. , current_token(lexer.next())
  106. , regex_options(regex_options)
  107. {
  108. }
  109. };
  110. ParserState m_parser_state;
  111. };
  112. class AbstractPosixParser : public Parser {
  113. protected:
  114. explicit AbstractPosixParser(Lexer& lexer)
  115. : Parser(lexer)
  116. {
  117. }
  118. AbstractPosixParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
  119. : Parser(lexer, regex_options.value_or({}))
  120. {
  121. }
  122. ALWAYS_INLINE bool parse_bracket_expression(Vector<CompareTypeAndValuePair>&, size_t&);
  123. };
  124. class PosixBasicParser final : public AbstractPosixParser {
  125. public:
  126. explicit PosixBasicParser(Lexer& lexer)
  127. : AbstractPosixParser(lexer)
  128. {
  129. }
  130. PosixBasicParser(Lexer& lexer, Optional<typename ParserTraits<PosixBasicParser>::OptionsType> regex_options)
  131. : AbstractPosixParser(lexer, regex_options.value_or({}))
  132. {
  133. }
  134. ~PosixBasicParser() = default;
  135. private:
  136. bool parse_internal(ByteCode&, size_t&) override;
  137. bool parse_root(ByteCode&, size_t&);
  138. bool parse_re_expression(ByteCode&, size_t&);
  139. bool parse_simple_re(ByteCode&, size_t&);
  140. bool parse_nonduplicating_re(ByteCode&, size_t&);
  141. bool parse_one_char_or_collation_element(ByteCode&, size_t&);
  142. constexpr static size_t number_of_addressable_capture_groups = 9;
  143. size_t m_capture_group_minimum_lengths[number_of_addressable_capture_groups] { 0 };
  144. bool m_capture_group_seen[number_of_addressable_capture_groups] { false };
  145. size_t m_current_capture_group_depth { 0 };
  146. };
  147. class PosixExtendedParser final : public AbstractPosixParser {
  148. constexpr static auto default_options = static_cast<PosixFlags>(AllFlags::SingleLine) | static_cast<PosixFlags>(AllFlags::Internal_ConsiderNewline);
  149. public:
  150. explicit PosixExtendedParser(Lexer& lexer)
  151. : AbstractPosixParser(lexer, default_options)
  152. {
  153. }
  154. PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
  155. : AbstractPosixParser(lexer, regex_options.value_or({}) | default_options.value())
  156. {
  157. }
  158. ~PosixExtendedParser() = default;
  159. private:
  160. ALWAYS_INLINE bool match_repetition_symbol();
  161. bool parse_internal(ByteCode&, size_t&) override;
  162. bool parse_root(ByteCode&, size_t&);
  163. ALWAYS_INLINE bool parse_sub_expression(ByteCode&, size_t&);
  164. ALWAYS_INLINE bool parse_bracket_expression(ByteCode&, size_t&);
  165. ALWAYS_INLINE bool parse_repetition_symbol(ByteCode&, size_t&);
  166. };
  167. class ECMA262Parser final : public Parser {
  168. constexpr static ECMAScriptOptions default_options = static_cast<ECMAScriptFlags>(AllFlags::Internal_ConsiderNewline);
  169. public:
  170. explicit ECMA262Parser(Lexer& lexer)
  171. : Parser(lexer, default_options)
  172. {
  173. m_capture_groups_in_scope.empend();
  174. }
  175. ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
  176. : Parser(lexer, regex_options.value_or({}) | default_options.value())
  177. {
  178. m_should_use_browser_extended_grammar = regex_options.has_value() && regex_options->has_flag_set(ECMAScriptFlags::BrowserExtended);
  179. m_capture_groups_in_scope.empend();
  180. }
  181. ~ECMA262Parser() = default;
  182. private:
  183. bool parse_internal(ByteCode&, size_t&) override;
  184. struct ParseFlags {
  185. bool unicode { false };
  186. bool named { false };
  187. bool unicode_sets { false };
  188. };
  189. enum class ReadDigitsInitialZeroState {
  190. Allow,
  191. Disallow,
  192. };
  193. StringView read_digits_as_string(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1, int min_count = -1);
  194. Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1, int min_count = -1);
  195. DeprecatedFlyString read_capture_group_specifier(bool take_starting_angle_bracket = false);
  196. struct Script {
  197. Unicode::Script script {};
  198. bool is_extension { false };
  199. };
  200. using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory, Script, Empty>;
  201. Optional<PropertyEscape> read_unicode_property_escape();
  202. bool parse_pattern(ByteCode&, size_t&, ParseFlags);
  203. bool parse_disjunction(ByteCode&, size_t&, ParseFlags);
  204. bool parse_alternative(ByteCode&, size_t&, ParseFlags);
  205. bool parse_term(ByteCode&, size_t&, ParseFlags);
  206. bool parse_assertion(ByteCode&, size_t&, ParseFlags);
  207. bool parse_atom(ByteCode&, size_t&, ParseFlags);
  208. bool parse_quantifier(ByteCode&, size_t&, ParseFlags);
  209. bool parse_interval_quantifier(Optional<u64>& repeat_min, Optional<u64>& repeat_max);
  210. bool parse_atom_escape(ByteCode&, size_t&, ParseFlags);
  211. bool parse_character_class(ByteCode&, size_t&, ParseFlags);
  212. bool parse_capture_group(ByteCode&, size_t&, ParseFlags);
  213. Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
  214. bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, ParseFlags);
  215. bool parse_unicode_property_escape(PropertyEscape& property, bool& negated);
  216. bool parse_character_escape(Vector<CompareTypeAndValuePair>&, size_t&, ParseFlags);
  217. bool parse_class_set_expression(Vector<CompareTypeAndValuePair>&);
  218. bool parse_class_union(Vector<CompareTypeAndValuePair>&);
  219. bool parse_class_intersection(Vector<CompareTypeAndValuePair>&);
  220. bool parse_class_subtraction(Vector<CompareTypeAndValuePair>&);
  221. bool parse_class_set_range(Vector<CompareTypeAndValuePair>&);
  222. bool parse_class_set_operand(Vector<CompareTypeAndValuePair>&);
  223. bool parse_nested_class(Vector<CompareTypeAndValuePair>&);
  224. Optional<u32> parse_class_set_character();
  225. // Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers)
  226. bool parse_quantifiable_assertion(ByteCode&, size_t&, ParseFlags);
  227. bool parse_extended_atom(ByteCode&, size_t&, ParseFlags);
  228. bool parse_inner_disjunction(ByteCode& bytecode_stack, size_t& length, ParseFlags);
  229. bool parse_invalid_braced_quantifier(); // Note: This function either parses and *fails*, or doesn't parse anything and returns false.
  230. Optional<u8> parse_legacy_octal_escape();
  231. size_t ensure_total_number_of_capturing_parenthesis();
  232. void enter_capture_group_scope() { m_capture_groups_in_scope.empend(); }
  233. void exit_capture_group_scope()
  234. {
  235. auto last = m_capture_groups_in_scope.take_last();
  236. m_capture_groups_in_scope.last().extend(move(last));
  237. }
  238. void clear_all_capture_groups_in_scope(ByteCode& stack)
  239. {
  240. for (auto& index : m_capture_groups_in_scope.last())
  241. stack.insert_bytecode_clear_capture_group(index);
  242. }
  243. // ECMA-262's flavour of regex is a bit weird in that it allows backrefs to reference "future" captures, and such backrefs
  244. // always match the empty string. So we have to know how many capturing parenthesis there are, but we don't want to always
  245. // parse it twice, so we'll just do so when it's actually needed.
  246. // Most patterns should have no need to ever populate this field.
  247. Optional<size_t> m_total_number_of_capturing_parenthesis;
  248. // Keep the Annex B. behavior behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag.
  249. bool m_should_use_browser_extended_grammar { false };
  250. // ECMA-262 basically requires that we clear the inner captures of a capture group before trying to match it,
  251. // by requiring that (...)+ only contain the matches for the last iteration.
  252. // To do that, we have to keep track of which capture groups are "in scope", so we can clear them as needed.
  253. Vector<Vector<size_t>> m_capture_groups_in_scope;
  254. };
  255. using PosixExtended = PosixExtendedParser;
  256. using PosixBasic = PosixBasicParser;
  257. using ECMA262 = ECMA262Parser;
  258. }
  259. using regex::ECMA262;
  260. using regex::PosixBasic;
  261. using regex::PosixExtended;