RegexParser.h 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #pragma once
  7. #include "RegexByteCode.h"
  8. #include "RegexError.h"
  9. #include "RegexLexer.h"
  10. #include "RegexOptions.h"
  11. #include <AK/Forward.h>
  12. #include <AK/StringBuilder.h>
  13. #include <AK/Types.h>
  14. #include <AK/Vector.h>
  15. #include <LibUnicode/Forward.h>
  16. namespace regex {
  17. class PosixExtendedParser;
  18. class PosixBasicParser;
  19. class ECMA262Parser;
  20. template<typename T>
  21. struct GenericParserTraits {
  22. using OptionsType = T;
  23. };
  24. template<typename T>
  25. struct ParserTraits : public GenericParserTraits<T> {
  26. };
  27. template<>
  28. struct ParserTraits<PosixExtendedParser> : public GenericParserTraits<PosixOptions> {
  29. };
  30. template<>
  31. struct ParserTraits<PosixBasicParser> : public GenericParserTraits<PosixOptions> {
  32. };
  33. template<>
  34. struct ParserTraits<ECMA262Parser> : public GenericParserTraits<ECMAScriptOptions> {
  35. };
  36. class Parser {
  37. public:
  38. struct Result {
  39. ByteCode bytecode;
  40. size_t capture_groups_count;
  41. size_t named_capture_groups_count;
  42. size_t match_length_minimum;
  43. Error error;
  44. Token error_token;
  45. };
  46. explicit Parser(Lexer& lexer)
  47. : m_parser_state(lexer)
  48. {
  49. }
  50. Parser(Lexer& lexer, AllOptions regex_options)
  51. : m_parser_state(lexer, regex_options)
  52. {
  53. }
  54. virtual ~Parser() = default;
  55. Result parse(Optional<AllOptions> regex_options = {});
  56. bool has_error() const { return m_parser_state.error != Error::NoError; }
  57. Error error() const { return m_parser_state.error; }
  58. protected:
  59. virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0;
  60. ALWAYS_INLINE bool match(TokenType type) const;
  61. ALWAYS_INLINE bool match(char ch) const;
  62. ALWAYS_INLINE bool match_ordinary_characters();
  63. ALWAYS_INLINE Token consume();
  64. ALWAYS_INLINE Token consume(TokenType type, Error error);
  65. ALWAYS_INLINE bool consume(String const&);
  66. ALWAYS_INLINE bool try_skip(StringView);
  67. ALWAYS_INLINE bool lookahead_any(StringView);
  68. ALWAYS_INLINE char skip();
  69. ALWAYS_INLINE void back(size_t = 1);
  70. ALWAYS_INLINE void reset();
  71. ALWAYS_INLINE bool done() const;
  72. ALWAYS_INLINE bool set_error(Error error);
  73. struct ParserState {
  74. Lexer& lexer;
  75. Token current_token;
  76. Error error = Error::NoError;
  77. Token error_token { TokenType::Eof, 0, StringView(nullptr) };
  78. ByteCode bytecode;
  79. size_t capture_groups_count { 0 };
  80. size_t named_capture_groups_count { 0 };
  81. size_t match_length_minimum { 0 };
  82. AllOptions regex_options;
  83. HashMap<int, size_t> capture_group_minimum_lengths;
  84. HashMap<FlyString, size_t> named_capture_group_minimum_lengths;
  85. HashMap<size_t, FlyString> named_capture_groups;
  86. explicit ParserState(Lexer& lexer)
  87. : lexer(lexer)
  88. , current_token(lexer.next())
  89. {
  90. }
  91. explicit ParserState(Lexer& lexer, AllOptions regex_options)
  92. : lexer(lexer)
  93. , current_token(lexer.next())
  94. , regex_options(regex_options)
  95. {
  96. }
  97. };
  98. ParserState m_parser_state;
  99. };
  100. class AbstractPosixParser : public Parser {
  101. protected:
  102. explicit AbstractPosixParser(Lexer& lexer)
  103. : Parser(lexer)
  104. {
  105. }
  106. AbstractPosixParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
  107. : Parser(lexer, regex_options.value_or({}))
  108. {
  109. }
  110. ALWAYS_INLINE bool parse_bracket_expression(Vector<CompareTypeAndValuePair>&, size_t&);
  111. };
  112. class PosixBasicParser final : public AbstractPosixParser {
  113. public:
  114. explicit PosixBasicParser(Lexer& lexer)
  115. : AbstractPosixParser(lexer)
  116. {
  117. }
  118. PosixBasicParser(Lexer& lexer, Optional<typename ParserTraits<PosixBasicParser>::OptionsType> regex_options)
  119. : AbstractPosixParser(lexer, regex_options.value_or({}))
  120. {
  121. }
  122. ~PosixBasicParser() = default;
  123. private:
  124. bool parse_internal(ByteCode&, size_t&) override;
  125. bool parse_root(ByteCode&, size_t&);
  126. bool parse_re_expression(ByteCode&, size_t&);
  127. bool parse_simple_re(ByteCode&, size_t&);
  128. bool parse_nonduplicating_re(ByteCode&, size_t&);
  129. bool parse_one_char_or_collation_element(ByteCode&, size_t&);
  130. constexpr static size_t number_of_addressable_capture_groups = 9;
  131. size_t m_capture_group_minimum_lengths[number_of_addressable_capture_groups] { 0 };
  132. bool m_capture_group_seen[number_of_addressable_capture_groups] { false };
  133. };
  134. class PosixExtendedParser final : public AbstractPosixParser {
  135. public:
  136. explicit PosixExtendedParser(Lexer& lexer)
  137. : AbstractPosixParser(lexer)
  138. {
  139. }
  140. PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
  141. : AbstractPosixParser(lexer, regex_options.value_or({}))
  142. {
  143. }
  144. ~PosixExtendedParser() = default;
  145. private:
  146. ALWAYS_INLINE bool match_repetition_symbol();
  147. bool parse_internal(ByteCode&, size_t&) override;
  148. bool parse_root(ByteCode&, size_t&);
  149. ALWAYS_INLINE bool parse_sub_expression(ByteCode&, size_t&);
  150. ALWAYS_INLINE bool parse_bracket_expression(ByteCode&, size_t&);
  151. ALWAYS_INLINE bool parse_repetition_symbol(ByteCode&, size_t&);
  152. };
  153. class ECMA262Parser final : public Parser {
  154. public:
  155. explicit ECMA262Parser(Lexer& lexer)
  156. : Parser(lexer)
  157. {
  158. m_capture_groups_in_scope.empend();
  159. }
  160. ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
  161. : Parser(lexer, regex_options.value_or({}))
  162. {
  163. m_should_use_browser_extended_grammar = regex_options.has_value() && regex_options->has_flag_set(ECMAScriptFlags::BrowserExtended);
  164. m_capture_groups_in_scope.empend();
  165. }
  166. ~ECMA262Parser() = default;
  167. private:
  168. bool parse_internal(ByteCode&, size_t&) override;
  169. enum class ReadDigitsInitialZeroState {
  170. Allow,
  171. Disallow,
  172. };
  173. StringView read_digits_as_string(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
  174. Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
  175. StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
  176. struct Script {
  177. Unicode::Script script {};
  178. bool is_extension { false };
  179. };
  180. using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory, Script>;
  181. Optional<PropertyEscape> read_unicode_property_escape();
  182. bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
  183. bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named);
  184. bool parse_alternative(ByteCode&, size_t&, bool unicode, bool named);
  185. bool parse_term(ByteCode&, size_t&, bool unicode, bool named);
  186. bool parse_assertion(ByteCode&, size_t&, bool unicode, bool named);
  187. bool parse_atom(ByteCode&, size_t&, bool unicode, bool named);
  188. bool parse_quantifier(ByteCode&, size_t&, bool unicode, bool named);
  189. bool parse_interval_quantifier(Optional<size_t>& repeat_min, Optional<size_t>& repeat_max);
  190. bool parse_atom_escape(ByteCode&, size_t&, bool unicode, bool named);
  191. bool parse_character_class(ByteCode&, size_t&, bool unicode, bool named);
  192. bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);
  193. Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
  194. bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, bool unicode);
  195. bool parse_unicode_property_escape(PropertyEscape& property, bool& negated);
  196. // Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers)
  197. bool parse_quantifiable_assertion(ByteCode&, size_t&, bool named);
  198. bool parse_extended_atom(ByteCode&, size_t&, bool named);
  199. bool parse_inner_disjunction(ByteCode& bytecode_stack, size_t& length, bool unicode, bool named);
  200. bool parse_invalid_braced_quantifier(); // Note: This function either parses and *fails*, or doesn't parse anything and returns false.
  201. bool parse_legacy_octal_escape_sequence(ByteCode& bytecode_stack, size_t& length);
  202. Optional<u8> parse_legacy_octal_escape();
  203. size_t ensure_total_number_of_capturing_parenthesis();
  204. // ECMA-262's flavour of regex is a bit weird in that it allows backrefs to reference "future" captures, and such backrefs
  205. // always match the empty string. So we have to know how many capturing parenthesis there are, but we don't want to always
  206. // parse it twice, so we'll just do so when it's actually needed.
  207. // Most patterns should have no need to ever populate this field.
  208. Optional<size_t> m_total_number_of_capturing_parenthesis;
  209. // Keep the Annex B. behaviour behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag.
  210. bool m_should_use_browser_extended_grammar { false };
  211. // ECMA-262 basically requires that we clear the inner captures of a capture group before trying to match it,
  212. // by requiring that (...)+ only contain the matches for the last iteration.
  213. // To do that, we have to keep track of which capture groups are "in scope", so we can clear them as needed.
  214. using CaptureGroup = Variant<size_t, String>;
  215. Vector<Vector<CaptureGroup>> m_capture_groups_in_scope;
  216. };
  217. using PosixExtended = PosixExtendedParser;
  218. using PosixBasic = PosixBasicParser;
  219. using ECMA262 = ECMA262Parser;
  220. }
  221. using regex::ECMA262;
  222. using regex::PosixBasic;
  223. using regex::PosixExtended;