RegexParser.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #pragma once
  7. #include "RegexByteCode.h"
  8. #include "RegexError.h"
  9. #include "RegexLexer.h"
  10. #include "RegexOptions.h"
  11. #include <AK/Forward.h>
  12. #include <AK/StringBuilder.h>
  13. #include <AK/Types.h>
  14. #include <AK/Vector.h>
  15. #include <LibUnicode/Forward.h>
  16. namespace regex {
  17. class PosixExtendedParser;
  18. class PosixBasicParser;
  19. class ECMA262Parser;
  20. template<typename T>
  21. struct GenericParserTraits {
  22. using OptionsType = T;
  23. };
  24. template<typename T>
  25. struct ParserTraits : public GenericParserTraits<T> {
  26. };
  27. template<>
  28. struct ParserTraits<PosixExtendedParser> : public GenericParserTraits<PosixOptions> {
  29. };
  30. template<>
  31. struct ParserTraits<PosixBasicParser> : public GenericParserTraits<PosixOptions> {
  32. };
  33. template<>
  34. struct ParserTraits<ECMA262Parser> : public GenericParserTraits<ECMAScriptOptions> {
  35. };
  36. class Parser {
  37. public:
  38. struct Result {
  39. ByteCode bytecode;
  40. size_t capture_groups_count;
  41. size_t named_capture_groups_count;
  42. size_t match_length_minimum;
  43. Error error;
  44. Token error_token;
  45. Vector<FlyString> capture_groups;
  46. AllOptions options;
  47. };
  48. explicit Parser(Lexer& lexer)
  49. : m_parser_state(lexer)
  50. {
  51. }
  52. Parser(Lexer& lexer, AllOptions regex_options)
  53. : m_parser_state(lexer, regex_options)
  54. {
  55. }
  56. virtual ~Parser() = default;
  57. Result parse(Optional<AllOptions> regex_options = {});
  58. bool has_error() const { return m_parser_state.error != Error::NoError; }
  59. Error error() const { return m_parser_state.error; }
  60. AllOptions options() const { return m_parser_state.regex_options; }
  61. protected:
  62. virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0;
  63. ALWAYS_INLINE bool match(TokenType type) const;
  64. ALWAYS_INLINE bool match(char ch) const;
  65. ALWAYS_INLINE bool match_ordinary_characters();
  66. ALWAYS_INLINE Token consume();
  67. ALWAYS_INLINE Token consume(TokenType type, Error error);
  68. ALWAYS_INLINE bool consume(String const&);
  69. ALWAYS_INLINE Optional<u32> consume_escaped_code_point(bool unicode);
  70. ALWAYS_INLINE bool try_skip(StringView);
  71. ALWAYS_INLINE bool lookahead_any(StringView);
  72. ALWAYS_INLINE unsigned char skip();
  73. ALWAYS_INLINE void back(size_t = 1);
  74. ALWAYS_INLINE void reset();
  75. ALWAYS_INLINE bool done() const;
  76. ALWAYS_INLINE bool set_error(Error error);
  77. struct NamedCaptureGroup {
  78. size_t group_index { 0 };
  79. size_t minimum_length { 0 };
  80. };
  81. struct ParserState {
  82. Lexer& lexer;
  83. Token current_token;
  84. Error error = Error::NoError;
  85. Token error_token { TokenType::Eof, 0, {} };
  86. ByteCode bytecode;
  87. size_t capture_groups_count { 0 };
  88. size_t named_capture_groups_count { 0 };
  89. size_t match_length_minimum { 0 };
  90. size_t repetition_mark_count { 0 };
  91. AllOptions regex_options;
  92. HashMap<int, size_t> capture_group_minimum_lengths;
  93. HashMap<FlyString, NamedCaptureGroup> named_capture_groups;
  94. explicit ParserState(Lexer& lexer)
  95. : lexer(lexer)
  96. , current_token(lexer.next())
  97. {
  98. }
  99. explicit ParserState(Lexer& lexer, AllOptions regex_options)
  100. : lexer(lexer)
  101. , current_token(lexer.next())
  102. , regex_options(regex_options)
  103. {
  104. }
  105. };
  106. ParserState m_parser_state;
  107. };
  108. class AbstractPosixParser : public Parser {
  109. protected:
  110. explicit AbstractPosixParser(Lexer& lexer)
  111. : Parser(lexer)
  112. {
  113. }
  114. AbstractPosixParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
  115. : Parser(lexer, regex_options.value_or({}))
  116. {
  117. }
  118. ALWAYS_INLINE bool parse_bracket_expression(Vector<CompareTypeAndValuePair>&, size_t&);
  119. };
  120. class PosixBasicParser final : public AbstractPosixParser {
  121. public:
  122. explicit PosixBasicParser(Lexer& lexer)
  123. : AbstractPosixParser(lexer)
  124. {
  125. }
  126. PosixBasicParser(Lexer& lexer, Optional<typename ParserTraits<PosixBasicParser>::OptionsType> regex_options)
  127. : AbstractPosixParser(lexer, regex_options.value_or({}))
  128. {
  129. }
  130. ~PosixBasicParser() = default;
  131. private:
  132. bool parse_internal(ByteCode&, size_t&) override;
  133. bool parse_root(ByteCode&, size_t&);
  134. bool parse_re_expression(ByteCode&, size_t&);
  135. bool parse_simple_re(ByteCode&, size_t&);
  136. bool parse_nonduplicating_re(ByteCode&, size_t&);
  137. bool parse_one_char_or_collation_element(ByteCode&, size_t&);
  138. constexpr static size_t number_of_addressable_capture_groups = 9;
  139. size_t m_capture_group_minimum_lengths[number_of_addressable_capture_groups] { 0 };
  140. bool m_capture_group_seen[number_of_addressable_capture_groups] { false };
  141. size_t m_current_capture_group_depth { 0 };
  142. };
  143. class PosixExtendedParser final : public AbstractPosixParser {
  144. constexpr static auto default_options = static_cast<PosixFlags>(AllFlags::SingleLine) | static_cast<PosixFlags>(AllFlags::Internal_ConsiderNewline);
  145. public:
  146. explicit PosixExtendedParser(Lexer& lexer)
  147. : AbstractPosixParser(lexer, default_options)
  148. {
  149. }
  150. PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
  151. : AbstractPosixParser(lexer, regex_options.value_or({}) | default_options.value())
  152. {
  153. }
  154. ~PosixExtendedParser() = default;
  155. private:
  156. ALWAYS_INLINE bool match_repetition_symbol();
  157. bool parse_internal(ByteCode&, size_t&) override;
  158. bool parse_root(ByteCode&, size_t&);
  159. ALWAYS_INLINE bool parse_sub_expression(ByteCode&, size_t&);
  160. ALWAYS_INLINE bool parse_bracket_expression(ByteCode&, size_t&);
  161. ALWAYS_INLINE bool parse_repetition_symbol(ByteCode&, size_t&);
  162. };
  163. class ECMA262Parser final : public Parser {
  164. constexpr static ECMAScriptOptions default_options = static_cast<ECMAScriptFlags>(AllFlags::Internal_ConsiderNewline);
  165. public:
  166. explicit ECMA262Parser(Lexer& lexer)
  167. : Parser(lexer, default_options)
  168. {
  169. m_capture_groups_in_scope.empend();
  170. }
  171. ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
  172. : Parser(lexer, regex_options.value_or({}) | default_options.value())
  173. {
  174. m_should_use_browser_extended_grammar = regex_options.has_value() && regex_options->has_flag_set(ECMAScriptFlags::BrowserExtended);
  175. m_capture_groups_in_scope.empend();
  176. }
  177. ~ECMA262Parser() = default;
  178. private:
  179. bool parse_internal(ByteCode&, size_t&) override;
  180. enum class ReadDigitsInitialZeroState {
  181. Allow,
  182. Disallow,
  183. };
  184. StringView read_digits_as_string(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1, int min_count = -1);
  185. Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1, int min_count = -1);
  186. FlyString read_capture_group_specifier(bool take_starting_angle_bracket = false);
  187. struct Script {
  188. Unicode::Script script {};
  189. bool is_extension { false };
  190. };
  191. using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory, Script, Empty>;
  192. Optional<PropertyEscape> read_unicode_property_escape();
  193. bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
  194. bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named);
  195. bool parse_alternative(ByteCode&, size_t&, bool unicode, bool named);
  196. bool parse_term(ByteCode&, size_t&, bool unicode, bool named);
  197. bool parse_assertion(ByteCode&, size_t&, bool unicode, bool named);
  198. bool parse_atom(ByteCode&, size_t&, bool unicode, bool named);
  199. bool parse_quantifier(ByteCode&, size_t&, bool unicode, bool named);
  200. bool parse_interval_quantifier(Optional<u64>& repeat_min, Optional<u64>& repeat_max);
  201. bool parse_atom_escape(ByteCode&, size_t&, bool unicode, bool named);
  202. bool parse_character_class(ByteCode&, size_t&, bool unicode, bool named);
  203. bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);
  204. Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
  205. bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, bool unicode);
  206. bool parse_unicode_property_escape(PropertyEscape& property, bool& negated);
  207. // Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers)
  208. bool parse_quantifiable_assertion(ByteCode&, size_t&, bool named);
  209. bool parse_extended_atom(ByteCode&, size_t&, bool named);
  210. bool parse_inner_disjunction(ByteCode& bytecode_stack, size_t& length, bool unicode, bool named);
  211. bool parse_invalid_braced_quantifier(); // Note: This function either parses and *fails*, or doesn't parse anything and returns false.
  212. bool parse_legacy_octal_escape_sequence(ByteCode& bytecode_stack, size_t& length);
  213. Optional<u8> parse_legacy_octal_escape();
  214. size_t ensure_total_number_of_capturing_parenthesis();
  215. void enter_capture_group_scope() { m_capture_groups_in_scope.empend(); }
  216. void exit_capture_group_scope()
  217. {
  218. auto last = m_capture_groups_in_scope.take_last();
  219. m_capture_groups_in_scope.last().extend(move(last));
  220. }
  221. void clear_all_capture_groups_in_scope(ByteCode& stack)
  222. {
  223. for (auto& index : m_capture_groups_in_scope.last())
  224. stack.insert_bytecode_clear_capture_group(index);
  225. };
  226. // ECMA-262's flavour of regex is a bit weird in that it allows backrefs to reference "future" captures, and such backrefs
  227. // always match the empty string. So we have to know how many capturing parenthesis there are, but we don't want to always
  228. // parse it twice, so we'll just do so when it's actually needed.
  229. // Most patterns should have no need to ever populate this field.
  230. Optional<size_t> m_total_number_of_capturing_parenthesis;
  231. // Keep the Annex B. behavior behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag.
  232. bool m_should_use_browser_extended_grammar { false };
  233. // ECMA-262 basically requires that we clear the inner captures of a capture group before trying to match it,
  234. // by requiring that (...)+ only contain the matches for the last iteration.
  235. // To do that, we have to keep track of which capture groups are "in scope", so we can clear them as needed.
  236. Vector<Vector<size_t>> m_capture_groups_in_scope;
  237. };
  238. using PosixExtended = PosixExtendedParser;
  239. using PosixBasic = PosixBasicParser;
  240. using ECMA262 = ECMA262Parser;
  241. }
  242. using regex::ECMA262;
  243. using regex::PosixBasic;
  244. using regex::PosixExtended;