RegexParser.h 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #pragma once
  7. #include "RegexByteCode.h"
  8. #include "RegexError.h"
  9. #include "RegexLexer.h"
  10. #include "RegexOptions.h"
  11. #include <AK/Forward.h>
  12. #include <AK/StringBuilder.h>
  13. #include <AK/Types.h>
  14. #include <AK/Vector.h>
  15. #include <LibUnicode/Forward.h>
  16. namespace regex {
  17. class PosixExtendedParser;
  18. class PosixBasicParser;
  19. class ECMA262Parser;
  20. template<typename T>
  21. struct GenericParserTraits {
  22. using OptionsType = T;
  23. };
  24. template<typename T>
  25. struct ParserTraits : public GenericParserTraits<T> {
  26. };
  27. template<>
  28. struct ParserTraits<PosixExtendedParser> : public GenericParserTraits<PosixOptions> {
  29. };
  30. template<>
  31. struct ParserTraits<PosixBasicParser> : public GenericParserTraits<PosixOptions> {
  32. };
  33. template<>
  34. struct ParserTraits<ECMA262Parser> : public GenericParserTraits<ECMAScriptOptions> {
  35. };
  36. class Parser {
  37. public:
  38. struct Result {
  39. ByteCode bytecode;
  40. size_t capture_groups_count;
  41. size_t named_capture_groups_count;
  42. size_t match_length_minimum;
  43. Error error;
  44. Token error_token;
  45. Vector<FlyString> capture_groups;
  46. };
  47. explicit Parser(Lexer& lexer)
  48. : m_parser_state(lexer)
  49. {
  50. }
  51. Parser(Lexer& lexer, AllOptions regex_options)
  52. : m_parser_state(lexer, regex_options)
  53. {
  54. }
  55. virtual ~Parser() = default;
  56. Result parse(Optional<AllOptions> regex_options = {});
  57. bool has_error() const { return m_parser_state.error != Error::NoError; }
  58. Error error() const { return m_parser_state.error; }
  59. protected:
  60. virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0;
  61. ALWAYS_INLINE bool match(TokenType type) const;
  62. ALWAYS_INLINE bool match(char ch) const;
  63. ALWAYS_INLINE bool match_ordinary_characters();
  64. ALWAYS_INLINE Token consume();
  65. ALWAYS_INLINE Token consume(TokenType type, Error error);
  66. ALWAYS_INLINE bool consume(String const&);
  67. ALWAYS_INLINE Optional<u32> consume_escaped_code_point(bool unicode);
  68. ALWAYS_INLINE bool try_skip(StringView);
  69. ALWAYS_INLINE bool lookahead_any(StringView);
  70. ALWAYS_INLINE unsigned char skip();
  71. ALWAYS_INLINE void back(size_t = 1);
  72. ALWAYS_INLINE void reset();
  73. ALWAYS_INLINE bool done() const;
  74. ALWAYS_INLINE bool set_error(Error error);
  75. struct NamedCaptureGroup {
  76. size_t group_index { 0 };
  77. size_t minimum_length { 0 };
  78. };
  79. struct ParserState {
  80. Lexer& lexer;
  81. Token current_token;
  82. Error error = Error::NoError;
  83. Token error_token { TokenType::Eof, 0, StringView(nullptr) };
  84. ByteCode bytecode;
  85. size_t capture_groups_count { 0 };
  86. size_t named_capture_groups_count { 0 };
  87. size_t match_length_minimum { 0 };
  88. size_t repetition_mark_count { 0 };
  89. AllOptions regex_options;
  90. HashMap<int, size_t> capture_group_minimum_lengths;
  91. HashMap<FlyString, NamedCaptureGroup> named_capture_groups;
  92. explicit ParserState(Lexer& lexer)
  93. : lexer(lexer)
  94. , current_token(lexer.next())
  95. {
  96. }
  97. explicit ParserState(Lexer& lexer, AllOptions regex_options)
  98. : lexer(lexer)
  99. , current_token(lexer.next())
  100. , regex_options(regex_options)
  101. {
  102. }
  103. };
  104. ParserState m_parser_state;
  105. };
  106. class AbstractPosixParser : public Parser {
  107. protected:
  108. explicit AbstractPosixParser(Lexer& lexer)
  109. : Parser(lexer)
  110. {
  111. }
  112. AbstractPosixParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
  113. : Parser(lexer, regex_options.value_or({}))
  114. {
  115. }
  116. ALWAYS_INLINE bool parse_bracket_expression(Vector<CompareTypeAndValuePair>&, size_t&);
  117. };
  118. class PosixBasicParser final : public AbstractPosixParser {
  119. public:
  120. explicit PosixBasicParser(Lexer& lexer)
  121. : AbstractPosixParser(lexer)
  122. {
  123. }
  124. PosixBasicParser(Lexer& lexer, Optional<typename ParserTraits<PosixBasicParser>::OptionsType> regex_options)
  125. : AbstractPosixParser(lexer, regex_options.value_or({}))
  126. {
  127. }
  128. ~PosixBasicParser() = default;
  129. private:
  130. bool parse_internal(ByteCode&, size_t&) override;
  131. bool parse_root(ByteCode&, size_t&);
  132. bool parse_re_expression(ByteCode&, size_t&);
  133. bool parse_simple_re(ByteCode&, size_t&);
  134. bool parse_nonduplicating_re(ByteCode&, size_t&);
  135. bool parse_one_char_or_collation_element(ByteCode&, size_t&);
  136. constexpr static size_t number_of_addressable_capture_groups = 9;
  137. size_t m_capture_group_minimum_lengths[number_of_addressable_capture_groups] { 0 };
  138. bool m_capture_group_seen[number_of_addressable_capture_groups] { false };
  139. size_t m_current_capture_group_depth { 0 };
  140. };
  141. class PosixExtendedParser final : public AbstractPosixParser {
  142. public:
  143. explicit PosixExtendedParser(Lexer& lexer)
  144. : AbstractPosixParser(lexer)
  145. {
  146. }
  147. PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
  148. : AbstractPosixParser(lexer, regex_options.value_or({}))
  149. {
  150. }
  151. ~PosixExtendedParser() = default;
  152. private:
  153. ALWAYS_INLINE bool match_repetition_symbol();
  154. bool parse_internal(ByteCode&, size_t&) override;
  155. bool parse_root(ByteCode&, size_t&);
  156. ALWAYS_INLINE bool parse_sub_expression(ByteCode&, size_t&);
  157. ALWAYS_INLINE bool parse_bracket_expression(ByteCode&, size_t&);
  158. ALWAYS_INLINE bool parse_repetition_symbol(ByteCode&, size_t&);
  159. };
  160. class ECMA262Parser final : public Parser {
  161. public:
  162. explicit ECMA262Parser(Lexer& lexer)
  163. : Parser(lexer)
  164. {
  165. m_capture_groups_in_scope.empend();
  166. }
  167. ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
  168. : Parser(lexer, regex_options.value_or({}))
  169. {
  170. m_should_use_browser_extended_grammar = regex_options.has_value() && regex_options->has_flag_set(ECMAScriptFlags::BrowserExtended);
  171. m_capture_groups_in_scope.empend();
  172. }
  173. ~ECMA262Parser() = default;
  174. private:
  175. bool parse_internal(ByteCode&, size_t&) override;
  176. enum class ReadDigitsInitialZeroState {
  177. Allow,
  178. Disallow,
  179. };
  180. StringView read_digits_as_string(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1, int min_count = -1);
  181. Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1, int min_count = -1);
  182. FlyString read_capture_group_specifier(bool take_starting_angle_bracket = false);
  183. struct Script {
  184. Unicode::Script script {};
  185. bool is_extension { false };
  186. };
  187. using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory, Script, Empty>;
  188. Optional<PropertyEscape> read_unicode_property_escape();
  189. bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
  190. bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named);
  191. bool parse_alternative(ByteCode&, size_t&, bool unicode, bool named);
  192. bool parse_term(ByteCode&, size_t&, bool unicode, bool named);
  193. bool parse_assertion(ByteCode&, size_t&, bool unicode, bool named);
  194. bool parse_atom(ByteCode&, size_t&, bool unicode, bool named);
  195. bool parse_quantifier(ByteCode&, size_t&, bool unicode, bool named);
  196. bool parse_interval_quantifier(Optional<u64>& repeat_min, Optional<u64>& repeat_max);
  197. bool parse_atom_escape(ByteCode&, size_t&, bool unicode, bool named);
  198. bool parse_character_class(ByteCode&, size_t&, bool unicode, bool named);
  199. bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);
  200. Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
  201. bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, bool unicode);
  202. bool parse_unicode_property_escape(PropertyEscape& property, bool& negated);
  203. // Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers)
  204. bool parse_quantifiable_assertion(ByteCode&, size_t&, bool named);
  205. bool parse_extended_atom(ByteCode&, size_t&, bool named);
  206. bool parse_inner_disjunction(ByteCode& bytecode_stack, size_t& length, bool unicode, bool named);
  207. bool parse_invalid_braced_quantifier(); // Note: This function either parses and *fails*, or doesn't parse anything and returns false.
  208. bool parse_legacy_octal_escape_sequence(ByteCode& bytecode_stack, size_t& length);
  209. Optional<u8> parse_legacy_octal_escape();
  210. size_t ensure_total_number_of_capturing_parenthesis();
  211. void enter_capture_group_scope() { m_capture_groups_in_scope.empend(); }
  212. void exit_capture_group_scope()
  213. {
  214. auto last = m_capture_groups_in_scope.take_last();
  215. m_capture_groups_in_scope.last().extend(move(last));
  216. }
  217. void clear_all_capture_groups_in_scope(ByteCode& stack)
  218. {
  219. for (auto& index : m_capture_groups_in_scope.last())
  220. stack.insert_bytecode_clear_capture_group(index);
  221. };
  222. // ECMA-262's flavour of regex is a bit weird in that it allows backrefs to reference "future" captures, and such backrefs
  223. // always match the empty string. So we have to know how many capturing parenthesis there are, but we don't want to always
  224. // parse it twice, so we'll just do so when it's actually needed.
  225. // Most patterns should have no need to ever populate this field.
  226. Optional<size_t> m_total_number_of_capturing_parenthesis;
  227. // Keep the Annex B. behavior behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag.
  228. bool m_should_use_browser_extended_grammar { false };
  229. // ECMA-262 basically requires that we clear the inner captures of a capture group before trying to match it,
  230. // by requiring that (...)+ only contain the matches for the last iteration.
  231. // To do that, we have to keep track of which capture groups are "in scope", so we can clear them as needed.
  232. Vector<Vector<size_t>> m_capture_groups_in_scope;
  233. };
  234. using PosixExtended = PosixExtendedParser;
  235. using PosixBasic = PosixBasicParser;
  236. using ECMA262 = ECMA262Parser;
  237. }
  238. using regex::ECMA262;
  239. using regex::PosixBasic;
  240. using regex::PosixExtended;