RegexParser.h 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #pragma once
  7. #include "RegexByteCode.h"
  8. #include "RegexError.h"
  9. #include "RegexLexer.h"
  10. #include "RegexOptions.h"
  11. #include <AK/Forward.h>
  12. #include <AK/StringBuilder.h>
  13. #include <AK/Types.h>
  14. #include <AK/Vector.h>
  15. namespace regex {
  16. class PosixExtendedParser;
  17. class PosixBasicParser;
  18. class ECMA262Parser;
  19. template<typename T>
  20. struct GenericParserTraits {
  21. using OptionsType = T;
  22. };
  23. template<typename T>
  24. struct ParserTraits : public GenericParserTraits<T> {
  25. };
  26. template<>
  27. struct ParserTraits<PosixExtendedParser> : public GenericParserTraits<PosixOptions> {
  28. };
  29. template<>
  30. struct ParserTraits<PosixBasicParser> : public GenericParserTraits<PosixOptions> {
  31. };
  32. template<>
  33. struct ParserTraits<ECMA262Parser> : public GenericParserTraits<ECMAScriptOptions> {
  34. };
  35. class Parser {
  36. public:
  37. struct Result {
  38. ByteCode bytecode;
  39. size_t capture_groups_count;
  40. size_t named_capture_groups_count;
  41. size_t match_length_minimum;
  42. Error error;
  43. Token error_token;
  44. };
  45. explicit Parser(Lexer& lexer)
  46. : m_parser_state(lexer)
  47. {
  48. }
  49. Parser(Lexer& lexer, AllOptions regex_options)
  50. : m_parser_state(lexer, regex_options)
  51. {
  52. }
  53. virtual ~Parser() = default;
  54. Result parse(Optional<AllOptions> regex_options = {});
  55. bool has_error() const { return m_parser_state.error != Error::NoError; }
  56. Error error() const { return m_parser_state.error; }
  57. protected:
  58. virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0;
  59. ALWAYS_INLINE bool match(TokenType type) const;
  60. ALWAYS_INLINE bool match(char ch) const;
  61. ALWAYS_INLINE bool match_ordinary_characters();
  62. ALWAYS_INLINE Token consume();
  63. ALWAYS_INLINE Token consume(TokenType type, Error error);
  64. ALWAYS_INLINE bool consume(const String&);
  65. ALWAYS_INLINE bool try_skip(StringView);
  66. ALWAYS_INLINE bool lookahead_any(StringView);
  67. ALWAYS_INLINE char skip();
  68. ALWAYS_INLINE void back(size_t = 1);
  69. ALWAYS_INLINE void reset();
  70. ALWAYS_INLINE bool done() const;
  71. ALWAYS_INLINE bool set_error(Error error);
  72. struct ParserState {
  73. Lexer& lexer;
  74. Token current_token;
  75. Error error = Error::NoError;
  76. Token error_token { TokenType::Eof, 0, StringView(nullptr) };
  77. ByteCode bytecode;
  78. size_t capture_groups_count { 0 };
  79. size_t named_capture_groups_count { 0 };
  80. size_t match_length_minimum { 0 };
  81. AllOptions regex_options;
  82. HashMap<int, size_t> capture_group_minimum_lengths;
  83. HashMap<FlyString, size_t> named_capture_group_minimum_lengths;
  84. HashMap<size_t, FlyString> named_capture_groups;
  85. explicit ParserState(Lexer& lexer)
  86. : lexer(lexer)
  87. , current_token(lexer.next())
  88. {
  89. }
  90. explicit ParserState(Lexer& lexer, AllOptions regex_options)
  91. : lexer(lexer)
  92. , current_token(lexer.next())
  93. , regex_options(regex_options)
  94. {
  95. }
  96. };
  97. ParserState m_parser_state;
  98. };
  99. class AbstractPosixParser : public Parser {
  100. protected:
  101. explicit AbstractPosixParser(Lexer& lexer)
  102. : Parser(lexer)
  103. {
  104. }
  105. AbstractPosixParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
  106. : Parser(lexer, regex_options.value_or({}))
  107. {
  108. }
  109. ALWAYS_INLINE bool parse_bracket_expression(Vector<CompareTypeAndValuePair>&, size_t&);
  110. };
  111. class PosixBasicParser final : public AbstractPosixParser {
  112. public:
  113. explicit PosixBasicParser(Lexer& lexer)
  114. : AbstractPosixParser(lexer)
  115. {
  116. }
  117. PosixBasicParser(Lexer& lexer, Optional<typename ParserTraits<PosixBasicParser>::OptionsType> regex_options)
  118. : AbstractPosixParser(lexer, regex_options.value_or({}))
  119. {
  120. }
  121. ~PosixBasicParser() = default;
  122. private:
  123. bool parse_internal(ByteCode&, size_t&) override;
  124. bool parse_root(ByteCode&, size_t&);
  125. bool parse_re_expression(ByteCode&, size_t&);
  126. bool parse_simple_re(ByteCode&, size_t&);
  127. bool parse_nonduplicating_re(ByteCode&, size_t&);
  128. bool parse_one_char_or_collation_element(ByteCode&, size_t&);
  129. constexpr static size_t number_of_addressable_capture_groups = 9;
  130. size_t m_capture_group_minimum_lengths[number_of_addressable_capture_groups] { 0 };
  131. bool m_capture_group_seen[number_of_addressable_capture_groups] { false };
  132. };
  133. class PosixExtendedParser final : public AbstractPosixParser {
  134. public:
  135. explicit PosixExtendedParser(Lexer& lexer)
  136. : AbstractPosixParser(lexer)
  137. {
  138. }
  139. PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
  140. : AbstractPosixParser(lexer, regex_options.value_or({}))
  141. {
  142. }
  143. ~PosixExtendedParser() = default;
  144. private:
  145. ALWAYS_INLINE bool match_repetition_symbol();
  146. bool parse_internal(ByteCode&, size_t&) override;
  147. bool parse_root(ByteCode&, size_t&);
  148. ALWAYS_INLINE bool parse_sub_expression(ByteCode&, size_t&);
  149. ALWAYS_INLINE bool parse_bracket_expression(ByteCode&, size_t&);
  150. ALWAYS_INLINE bool parse_repetition_symbol(ByteCode&, size_t&);
  151. };
  152. class ECMA262Parser final : public Parser {
  153. public:
  154. explicit ECMA262Parser(Lexer& lexer)
  155. : Parser(lexer)
  156. {
  157. }
  158. ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
  159. : Parser(lexer, regex_options.value_or({}))
  160. {
  161. m_should_use_browser_extended_grammar = regex_options.has_value() && regex_options->has_flag_set(ECMAScriptFlags::BrowserExtended);
  162. }
  163. ~ECMA262Parser() = default;
  164. private:
  165. bool parse_internal(ByteCode&, size_t&) override;
  166. enum class ReadDigitsInitialZeroState {
  167. Allow,
  168. Disallow,
  169. };
  170. StringView read_digits_as_string(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
  171. Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
  172. StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
  173. bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
  174. bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named);
  175. bool parse_alternative(ByteCode&, size_t&, bool unicode, bool named);
  176. bool parse_term(ByteCode&, size_t&, bool unicode, bool named);
  177. bool parse_assertion(ByteCode&, size_t&, bool unicode, bool named);
  178. bool parse_atom(ByteCode&, size_t&, bool unicode, bool named);
  179. bool parse_quantifier(ByteCode&, size_t&, bool unicode, bool named);
  180. bool parse_atom_escape(ByteCode&, size_t&, bool unicode, bool named);
  181. bool parse_character_class(ByteCode&, size_t&, bool unicode, bool named);
  182. bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);
  183. Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
  184. bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, bool unicode);
  185. // Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers)
  186. bool parse_quantifiable_assertion(ByteCode&, size_t&, bool named);
  187. bool parse_extended_atom(ByteCode&, size_t&, bool named);
  188. bool parse_inner_disjunction(ByteCode& bytecode_stack, size_t& length, bool unicode, bool named);
  189. bool parse_invalid_braced_quantifier(); // Note: This function either parses and *fails*, or doesn't parse anything and returns false.
  190. bool parse_legacy_octal_escape_sequence(ByteCode& bytecode_stack, size_t& length);
  191. Optional<u8> parse_legacy_octal_escape();
  192. size_t ensure_total_number_of_capturing_parenthesis();
  193. // ECMA-262's flavour of regex is a bit weird in that it allows backrefs to reference "future" captures, and such backrefs
  194. // always match the empty string. So we have to know how many capturing parenthesis there are, but we don't want to always
  195. // parse it twice, so we'll just do so when it's actually needed.
  196. // Most patterns should have no need to ever populate this field.
  197. Optional<size_t> m_total_number_of_capturing_parenthesis;
  198. // Keep the Annex B. behaviour behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag.
  199. bool m_should_use_browser_extended_grammar { false };
  200. };
  201. using PosixExtended = PosixExtendedParser;
  202. using PosixBasic = PosixBasicParser;
  203. using ECMA262 = ECMA262Parser;
  204. }
  205. using regex::ECMA262;
  206. using regex::PosixBasic;
  207. using regex::PosixExtended;