123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273 |
- /*
- * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
- *
- * SPDX-License-Identifier: BSD-2-Clause
- */
- #pragma once
- #include "RegexByteCode.h"
- #include "RegexError.h"
- #include "RegexLexer.h"
- #include "RegexOptions.h"
- #include <AK/Forward.h>
- #include <AK/StringBuilder.h>
- #include <AK/Types.h>
- #include <AK/Vector.h>
- #include <LibUnicode/Forward.h>
- namespace regex {
- class PosixExtendedParser;
- class PosixBasicParser;
- class ECMA262Parser;
- template<typename T>
- struct GenericParserTraits {
- using OptionsType = T;
- };
- template<typename T>
- struct ParserTraits : public GenericParserTraits<T> {
- };
- template<>
- struct ParserTraits<PosixExtendedParser> : public GenericParserTraits<PosixOptions> {
- };
- template<>
- struct ParserTraits<PosixBasicParser> : public GenericParserTraits<PosixOptions> {
- };
- template<>
- struct ParserTraits<ECMA262Parser> : public GenericParserTraits<ECMAScriptOptions> {
- };
- class Parser {
- public:
- struct Result {
- ByteCode bytecode;
- size_t capture_groups_count;
- size_t named_capture_groups_count;
- size_t match_length_minimum;
- Error error;
- Token error_token;
- };
- explicit Parser(Lexer& lexer)
- : m_parser_state(lexer)
- {
- }
- Parser(Lexer& lexer, AllOptions regex_options)
- : m_parser_state(lexer, regex_options)
- {
- }
- virtual ~Parser() = default;
- Result parse(Optional<AllOptions> regex_options = {});
- bool has_error() const { return m_parser_state.error != Error::NoError; }
- Error error() const { return m_parser_state.error; }
- protected:
- virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0;
- ALWAYS_INLINE bool match(TokenType type) const;
- ALWAYS_INLINE bool match(char ch) const;
- ALWAYS_INLINE bool match_ordinary_characters();
- ALWAYS_INLINE Token consume();
- ALWAYS_INLINE Token consume(TokenType type, Error error);
- ALWAYS_INLINE bool consume(String const&);
- ALWAYS_INLINE bool try_skip(StringView);
- ALWAYS_INLINE bool lookahead_any(StringView);
- ALWAYS_INLINE char skip();
- ALWAYS_INLINE void back(size_t = 1);
- ALWAYS_INLINE void reset();
- ALWAYS_INLINE bool done() const;
- ALWAYS_INLINE bool set_error(Error error);
- struct ParserState {
- Lexer& lexer;
- Token current_token;
- Error error = Error::NoError;
- Token error_token { TokenType::Eof, 0, StringView(nullptr) };
- ByteCode bytecode;
- size_t capture_groups_count { 0 };
- size_t named_capture_groups_count { 0 };
- size_t match_length_minimum { 0 };
- AllOptions regex_options;
- HashMap<int, size_t> capture_group_minimum_lengths;
- HashMap<FlyString, size_t> named_capture_group_minimum_lengths;
- HashMap<size_t, FlyString> named_capture_groups;
- explicit ParserState(Lexer& lexer)
- : lexer(lexer)
- , current_token(lexer.next())
- {
- }
- explicit ParserState(Lexer& lexer, AllOptions regex_options)
- : lexer(lexer)
- , current_token(lexer.next())
- , regex_options(regex_options)
- {
- }
- };
- ParserState m_parser_state;
- };
- class AbstractPosixParser : public Parser {
- protected:
- explicit AbstractPosixParser(Lexer& lexer)
- : Parser(lexer)
- {
- }
- AbstractPosixParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
- : Parser(lexer, regex_options.value_or({}))
- {
- }
- ALWAYS_INLINE bool parse_bracket_expression(Vector<CompareTypeAndValuePair>&, size_t&);
- };
- class PosixBasicParser final : public AbstractPosixParser {
- public:
- explicit PosixBasicParser(Lexer& lexer)
- : AbstractPosixParser(lexer)
- {
- }
- PosixBasicParser(Lexer& lexer, Optional<typename ParserTraits<PosixBasicParser>::OptionsType> regex_options)
- : AbstractPosixParser(lexer, regex_options.value_or({}))
- {
- }
- ~PosixBasicParser() = default;
- private:
- bool parse_internal(ByteCode&, size_t&) override;
- bool parse_root(ByteCode&, size_t&);
- bool parse_re_expression(ByteCode&, size_t&);
- bool parse_simple_re(ByteCode&, size_t&);
- bool parse_nonduplicating_re(ByteCode&, size_t&);
- bool parse_one_char_or_collation_element(ByteCode&, size_t&);
- constexpr static size_t number_of_addressable_capture_groups = 9;
- size_t m_capture_group_minimum_lengths[number_of_addressable_capture_groups] { 0 };
- bool m_capture_group_seen[number_of_addressable_capture_groups] { false };
- };
- class PosixExtendedParser final : public AbstractPosixParser {
- public:
- explicit PosixExtendedParser(Lexer& lexer)
- : AbstractPosixParser(lexer)
- {
- }
- PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
- : AbstractPosixParser(lexer, regex_options.value_or({}))
- {
- }
- ~PosixExtendedParser() = default;
- private:
- ALWAYS_INLINE bool match_repetition_symbol();
- bool parse_internal(ByteCode&, size_t&) override;
- bool parse_root(ByteCode&, size_t&);
- ALWAYS_INLINE bool parse_sub_expression(ByteCode&, size_t&);
- ALWAYS_INLINE bool parse_bracket_expression(ByteCode&, size_t&);
- ALWAYS_INLINE bool parse_repetition_symbol(ByteCode&, size_t&);
- };
- class ECMA262Parser final : public Parser {
- public:
- explicit ECMA262Parser(Lexer& lexer)
- : Parser(lexer)
- {
- m_capture_groups_in_scope.empend();
- }
- ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
- : Parser(lexer, regex_options.value_or({}))
- {
- m_should_use_browser_extended_grammar = regex_options.has_value() && regex_options->has_flag_set(ECMAScriptFlags::BrowserExtended);
- m_capture_groups_in_scope.empend();
- }
- ~ECMA262Parser() = default;
- private:
- bool parse_internal(ByteCode&, size_t&) override;
- enum class ReadDigitsInitialZeroState {
- Allow,
- Disallow,
- };
- StringView read_digits_as_string(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
- Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
- StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
- struct Script {
- Unicode::Script script {};
- bool is_extension { false };
- };
- using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory, Script>;
- Optional<PropertyEscape> read_unicode_property_escape();
- bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
- bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named);
- bool parse_alternative(ByteCode&, size_t&, bool unicode, bool named);
- bool parse_term(ByteCode&, size_t&, bool unicode, bool named);
- bool parse_assertion(ByteCode&, size_t&, bool unicode, bool named);
- bool parse_atom(ByteCode&, size_t&, bool unicode, bool named);
- bool parse_quantifier(ByteCode&, size_t&, bool unicode, bool named);
- bool parse_interval_quantifier(Optional<size_t>& repeat_min, Optional<size_t>& repeat_max);
- bool parse_atom_escape(ByteCode&, size_t&, bool unicode, bool named);
- bool parse_character_class(ByteCode&, size_t&, bool unicode, bool named);
- bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);
- Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
- bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, bool unicode);
- bool parse_unicode_property_escape(PropertyEscape& property, bool& negated);
- // Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers)
- bool parse_quantifiable_assertion(ByteCode&, size_t&, bool named);
- bool parse_extended_atom(ByteCode&, size_t&, bool named);
- bool parse_inner_disjunction(ByteCode& bytecode_stack, size_t& length, bool unicode, bool named);
- bool parse_invalid_braced_quantifier(); // Note: This function either parses and *fails*, or doesn't parse anything and returns false.
- bool parse_legacy_octal_escape_sequence(ByteCode& bytecode_stack, size_t& length);
- Optional<u8> parse_legacy_octal_escape();
- size_t ensure_total_number_of_capturing_parenthesis();
- // ECMA-262's flavour of regex is a bit weird in that it allows backrefs to reference "future" captures, and such backrefs
- // always match the empty string. So we have to know how many capturing parenthesis there are, but we don't want to always
- // parse it twice, so we'll just do so when it's actually needed.
- // Most patterns should have no need to ever populate this field.
- Optional<size_t> m_total_number_of_capturing_parenthesis;
- // Keep the Annex B. behaviour behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag.
- bool m_should_use_browser_extended_grammar { false };
- // ECMA-262 basically requires that we clear the inner captures of a capture group before trying to match it,
- // by requiring that (...)+ only contain the matches for the last iteration.
- // To do that, we have to keep track of which capture groups are "in scope", so we can clear them as needed.
- using CaptureGroup = Variant<size_t, String>;
- Vector<Vector<CaptureGroup>> m_capture_groups_in_scope;
- };
- using PosixExtended = PosixExtendedParser;
- using PosixBasic = PosixBasicParser;
- using ECMA262 = ECMA262Parser;
- }
- using regex::ECMA262;
- using regex::PosixBasic;
- using regex::PosixExtended;
|