RegexParser.h 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. *
  8. * 1. Redistributions of source code must retain the above copyright notice, this
  9. * list of conditions and the following disclaimer.
  10. *
  11. * 2. Redistributions in binary form must reproduce the above copyright notice,
  12. * this list of conditions and the following disclaimer in the documentation
  13. * and/or other materials provided with the distribution.
  14. *
  15. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  16. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  19. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  21. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  22. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  23. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #pragma once
  27. #include "RegexByteCode.h"
  28. #include "RegexError.h"
  29. #include "RegexLexer.h"
  30. #include "RegexOptions.h"
  31. #include <AK/Forward.h>
  32. #include <AK/StringBuilder.h>
  33. #include <AK/Types.h>
  34. #include <AK/Vector.h>
  35. namespace regex {
  36. class PosixExtendedParser;
  37. class ECMA262Parser;
  38. template<typename T>
  39. struct GenericParserTraits {
  40. using OptionsType = T;
  41. };
  42. template<typename T>
  43. struct ParserTraits : public GenericParserTraits<T> {
  44. };
  45. template<>
  46. struct ParserTraits<PosixExtendedParser> : public GenericParserTraits<PosixOptions> {
  47. };
  48. template<>
  49. struct ParserTraits<ECMA262Parser> : public GenericParserTraits<ECMAScriptOptions> {
  50. };
  51. class Parser {
  52. public:
  53. struct Result {
  54. ByteCode bytecode;
  55. size_t capture_groups_count;
  56. size_t named_capture_groups_count;
  57. size_t match_length_minimum;
  58. Error error;
  59. Token error_token;
  60. };
  61. explicit Parser(Lexer& lexer)
  62. : m_parser_state(lexer)
  63. {
  64. }
  65. Parser(Lexer& lexer, AllOptions regex_options)
  66. : m_parser_state(lexer, regex_options)
  67. {
  68. }
  69. virtual ~Parser() = default;
  70. Result parse(Optional<AllOptions> regex_options = {});
  71. bool has_error() const { return m_parser_state.error != Error::NoError; }
  72. Error error() const { return m_parser_state.error; }
  73. protected:
  74. virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0;
  75. ALWAYS_INLINE bool match(TokenType type) const;
  76. ALWAYS_INLINE bool match(char ch) const;
  77. ALWAYS_INLINE bool match_ordinary_characters();
  78. ALWAYS_INLINE Token consume();
  79. ALWAYS_INLINE Token consume(TokenType type, Error error);
  80. ALWAYS_INLINE bool consume(const String&);
  81. ALWAYS_INLINE bool try_skip(StringView);
  82. ALWAYS_INLINE bool lookahead_any(StringView);
  83. ALWAYS_INLINE char skip();
  84. ALWAYS_INLINE void back(size_t = 1);
  85. ALWAYS_INLINE void reset();
  86. ALWAYS_INLINE bool done() const;
  87. ALWAYS_INLINE bool set_error(Error error);
  88. struct ParserState {
  89. Lexer& lexer;
  90. Token current_token;
  91. Error error = Error::NoError;
  92. Token error_token { TokenType::Eof, 0, StringView(nullptr) };
  93. ByteCode bytecode;
  94. size_t capture_groups_count { 0 };
  95. size_t named_capture_groups_count { 0 };
  96. size_t match_length_minimum { 0 };
  97. AllOptions regex_options;
  98. HashMap<int, size_t> capture_group_minimum_lengths;
  99. HashMap<FlyString, size_t> named_capture_group_minimum_lengths;
  100. HashMap<size_t, FlyString> named_capture_groups;
  101. explicit ParserState(Lexer& lexer)
  102. : lexer(lexer)
  103. , current_token(lexer.next())
  104. {
  105. }
  106. explicit ParserState(Lexer& lexer, AllOptions regex_options)
  107. : lexer(lexer)
  108. , current_token(lexer.next())
  109. , regex_options(regex_options)
  110. {
  111. }
  112. };
  113. ParserState m_parser_state;
  114. };
  115. class PosixExtendedParser final : public Parser {
  116. public:
  117. explicit PosixExtendedParser(Lexer& lexer)
  118. : Parser(lexer)
  119. {
  120. }
  121. PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
  122. : Parser(lexer, regex_options.value_or({}))
  123. {
  124. }
  125. ~PosixExtendedParser() = default;
  126. private:
  127. ALWAYS_INLINE bool match_repetition_symbol();
  128. bool parse_internal(ByteCode&, size_t&) override;
  129. bool parse_root(ByteCode&, size_t&);
  130. ALWAYS_INLINE bool parse_sub_expression(ByteCode&, size_t&);
  131. ALWAYS_INLINE bool parse_bracket_expression(ByteCode&, size_t&);
  132. ALWAYS_INLINE bool parse_repetition_symbol(ByteCode&, size_t&);
  133. };
  134. class ECMA262Parser final : public Parser {
  135. public:
  136. explicit ECMA262Parser(Lexer& lexer)
  137. : Parser(lexer)
  138. {
  139. }
  140. ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
  141. : Parser(lexer, regex_options.value_or({}))
  142. {
  143. m_should_use_browser_extended_grammar = regex_options.has_value() && regex_options->has_flag_set(ECMAScriptFlags::BrowserExtended);
  144. }
  145. ~ECMA262Parser() = default;
  146. private:
  147. bool parse_internal(ByteCode&, size_t&) override;
  148. enum class ReadDigitsInitialZeroState {
  149. Allow,
  150. Disallow,
  151. };
  152. enum class ReadDigitFollowPolicy {
  153. Any,
  154. DisallowNonDigit,
  155. };
  156. StringView read_digits_as_string(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy follow_policy = ReadDigitFollowPolicy::Any, bool hex = false, int max_count = -1);
  157. Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy follow_policy = ReadDigitFollowPolicy::Any, bool hex = false, int max_count = -1);
  158. StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
  159. bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
  160. bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named);
  161. bool parse_alternative(ByteCode&, size_t&, bool unicode, bool named);
  162. bool parse_term(ByteCode&, size_t&, bool unicode, bool named);
  163. bool parse_assertion(ByteCode&, size_t&, bool unicode, bool named);
  164. bool parse_atom(ByteCode&, size_t&, bool unicode, bool named);
  165. bool parse_quantifier(ByteCode&, size_t&, bool unicode, bool named);
  166. bool parse_atom_escape(ByteCode&, size_t&, bool unicode, bool named);
  167. bool parse_character_class(ByteCode&, size_t&, bool unicode, bool named);
  168. bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);
  169. Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
  170. bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, bool unicode);
  171. // Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers)
  172. bool parse_quantifiable_assertion(ByteCode&, size_t&, bool named);
  173. bool parse_extended_atom(ByteCode&, size_t&, bool named);
  174. bool parse_inner_disjunction(ByteCode& bytecode_stack, size_t& length, bool unicode, bool named);
  175. bool parse_invalid_braced_quantifier(); // Note: This function either parses and *fails*, or doesn't parse anything and returns false.
  176. bool parse_legacy_octal_escape_sequence(ByteCode& bytecode_stack, size_t& length);
  177. Optional<u8> parse_legacy_octal_escape();
  178. size_t ensure_total_number_of_capturing_parenthesis();
  179. // ECMA-262's flavour of regex is a bit weird in that it allows backrefs to reference "future" captures, and such backrefs
  180. // always match the empty string. So we have to know how many capturing parenthesis there are, but we don't want to always
  181. // parse it twice, so we'll just do so when it's actually needed.
  182. // Most patterns should have no need to ever populate this field.
  183. Optional<size_t> m_total_number_of_capturing_parenthesis;
  184. // Keep the Annex B. behaviour behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag.
  185. bool m_should_use_browser_extended_grammar { false };
  186. };
  187. using PosixExtended = PosixExtendedParser;
  188. using ECMA262 = ECMA262Parser;
  189. }
  190. using regex::ECMA262;
  191. using regex::PosixExtended;