RegexParser.h 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. *
  8. * 1. Redistributions of source code must retain the above copyright notice, this
  9. * list of conditions and the following disclaimer.
  10. *
  11. * 2. Redistributions in binary form must reproduce the above copyright notice,
  12. * this list of conditions and the following disclaimer in the documentation
  13. * and/or other materials provided with the distribution.
  14. *
  15. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  16. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  19. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  21. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  22. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  23. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #pragma once
  27. #include "RegexByteCode.h"
  28. #include "RegexError.h"
  29. #include "RegexLexer.h"
  30. #include "RegexOptions.h"
  31. #include <AK/Forward.h>
  32. #include <AK/StringBuilder.h>
  33. #include <AK/Types.h>
  34. #include <AK/Vector.h>
  35. namespace regex {
  36. class PosixExtendedParser;
  37. class ECMA262Parser;
  38. template<typename T>
  39. struct GenericParserTraits {
  40. using OptionsType = T;
  41. };
  42. template<typename T>
  43. struct ParserTraits : public GenericParserTraits<T> {
  44. };
  45. template<>
  46. struct ParserTraits<PosixExtendedParser> : public GenericParserTraits<PosixOptions> {
  47. };
  48. template<>
  49. struct ParserTraits<ECMA262Parser> : public GenericParserTraits<ECMAScriptOptions> {
  50. };
  51. class Parser {
  52. public:
  53. struct Result {
  54. ByteCode bytecode;
  55. size_t capture_groups_count;
  56. size_t named_capture_groups_count;
  57. size_t match_length_minimum;
  58. Error error;
  59. Token error_token;
  60. };
  61. explicit Parser(Lexer& lexer)
  62. : m_parser_state(lexer)
  63. {
  64. }
  65. Parser(Lexer& lexer, AllOptions regex_options)
  66. : m_parser_state(lexer, regex_options)
  67. {
  68. }
  69. virtual ~Parser() = default;
  70. Result parse(Optional<AllOptions> regex_options = {});
  71. bool has_error() const { return m_parser_state.error != Error::NoError; }
  72. Error error() const { return m_parser_state.error; }
  73. protected:
  74. virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0;
  75. ALWAYS_INLINE bool match(TokenType type) const;
  76. ALWAYS_INLINE bool match(char ch) const;
  77. ALWAYS_INLINE bool match_ordinary_characters();
  78. ALWAYS_INLINE Token consume();
  79. ALWAYS_INLINE Token consume(TokenType type, Error error);
  80. ALWAYS_INLINE bool consume(const String&);
  81. ALWAYS_INLINE bool try_skip(StringView);
  82. ALWAYS_INLINE char skip();
  83. ALWAYS_INLINE void reset();
  84. ALWAYS_INLINE bool done() const;
  85. ALWAYS_INLINE bool set_error(Error error);
  86. struct ParserState {
  87. Lexer& lexer;
  88. Token current_token;
  89. Error error = Error::NoError;
  90. Token error_token { TokenType::Eof, 0, StringView(nullptr) };
  91. ByteCode bytecode;
  92. size_t capture_groups_count { 0 };
  93. size_t named_capture_groups_count { 0 };
  94. size_t match_length_minimum { 0 };
  95. AllOptions regex_options;
  96. HashMap<int, size_t> capture_group_minimum_lengths;
  97. HashMap<FlyString, size_t> named_capture_group_minimum_lengths;
  98. HashMap<size_t, FlyString> named_capture_groups;
  99. explicit ParserState(Lexer& lexer)
  100. : lexer(lexer)
  101. , current_token(lexer.next())
  102. {
  103. }
  104. explicit ParserState(Lexer& lexer, AllOptions regex_options)
  105. : lexer(lexer)
  106. , current_token(lexer.next())
  107. , regex_options(regex_options)
  108. {
  109. }
  110. };
  111. ParserState m_parser_state;
  112. };
  113. class PosixExtendedParser final : public Parser {
  114. public:
  115. explicit PosixExtendedParser(Lexer& lexer)
  116. : Parser(lexer)
  117. {
  118. }
  119. PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
  120. : Parser(lexer, regex_options.value_or({}))
  121. {
  122. }
  123. ~PosixExtendedParser() = default;
  124. private:
  125. ALWAYS_INLINE bool match_repetition_symbol();
  126. bool parse_internal(ByteCode&, size_t&) override;
  127. bool parse_root(ByteCode&, size_t&);
  128. ALWAYS_INLINE bool parse_sub_expression(ByteCode&, size_t&);
  129. ALWAYS_INLINE bool parse_bracket_expression(ByteCode&, size_t&);
  130. ALWAYS_INLINE bool parse_repetition_symbol(ByteCode&, size_t&);
  131. };
  132. class ECMA262Parser final : public Parser {
  133. public:
  134. explicit ECMA262Parser(Lexer& lexer)
  135. : Parser(lexer)
  136. {
  137. }
  138. ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
  139. : Parser(lexer, regex_options.value_or({}))
  140. {
  141. }
  142. ~ECMA262Parser() = default;
  143. private:
  144. bool parse_internal(ByteCode&, size_t&) override;
  145. enum class ReadDigitsInitialZeroState {
  146. Allow,
  147. Disallow,
  148. Require,
  149. };
  150. enum class ReadDigitFollowPolicy {
  151. Any,
  152. DisallowDigit,
  153. DisallowNonDigit,
  154. };
  155. Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy follow_policy = ReadDigitFollowPolicy::Any, bool hex = false, int max_count = -1);
  156. StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
  157. bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
  158. bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named);
  159. bool parse_alternative(ByteCode&, size_t&, bool unicode, bool named);
  160. bool parse_term(ByteCode&, size_t&, bool unicode, bool named);
  161. bool parse_assertion(ByteCode&, size_t&, bool unicode, bool named);
  162. bool parse_atom(ByteCode&, size_t&, bool unicode, bool named);
  163. bool parse_quantifier(ByteCode&, size_t&, bool unicode, bool named);
  164. bool parse_atom_escape(ByteCode&, size_t&, bool unicode, bool named);
  165. bool parse_character_class(ByteCode&, size_t&, bool unicode, bool named);
  166. bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);
  167. Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
  168. bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, bool unicode);
  169. };
  170. using PosixExtended = PosixExtendedParser;
  171. using ECMA262 = ECMA262Parser;
  172. }
  173. using regex::ECMA262;
  174. using regex::PosixExtended;