LibRegex: Fix parsing identity escape sequences

Also fixes the propagation of default options: Parser::reset() previously cleared regex_options before parsing, so the defaults were discarded. Partially addresses #4189.
Author: https://github.com/alimpfard Commit: https://github.com/SerenityOS/serenity/commit/801750b95a6 Pull-request: https://github.com/SerenityOS/serenity/pull/4194 Issue: https://github.com/SerenityOS/serenity/issues/4186 Issue: https://github.com/SerenityOS/serenity/issues/4189
2024-11-21 23:20:20 +00:00 · 2020-11-28 10:46:30 +03:30 · 2020-11-28 10:46:30 +03:30 · 801750b95a · 2024-07-19 01:13:47 +09:00
commit 801750b95a
parent e83e7a03c2
1 changed file with 24 additions and 7 deletions
--- a/Libraries/LibRegex/RegexParser.cpp
+++ b/Libraries/LibRegex/RegexParser.cpp
@@ -115,7 +115,6 @@ ALWAYS_INLINE void Parser::reset()
    m_parser_state.current_token = m_parser_state.lexer.next();
    m_parser_state.error = Error::NoError;
    m_parser_state.error_token = { TokenType::Eof, 0, StringView(nullptr) };
-    m_parser_state.regex_options = {};
 }

 Parser::Result Parser::parse(Optional<AllOptions> regex_options)
@@ -621,7 +620,7 @@ bool PosixExtendedParser::parse_root(ByteCode& stack, size_t& match_length_minim

 bool ECMA262Parser::parse_internal(ByteCode& stack, size_t& match_length_minimum)
 {
-    if (m_parser_state.regex_options & AllFlags::Unicode) {
+    if (m_parser_state.regex_options.has_flag_set(AllFlags::Unicode)) {
        return parse_pattern(stack, match_length_minimum, true, true);
    } else {
        ByteCode new_stack;
@@ -918,6 +917,13 @@ bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minim

 bool ECMA262Parser::parse_atom(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
 {
+    if (match(TokenType::EscapeSequence)) {
+        // Also part of AtomEscape.
+        auto token = consume();
+        match_length_minimum += 1;
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token.value()[0] } });
+        return true;
+    }
    if (try_skip("\\")) {
        // AtomEscape.
        return parse_atom_escape(stack, match_length_minimum, unicode, named);
@@ -1035,11 +1041,20 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
    }

    // IdentityEscape
-    if (match(TokenType::EscapeSequence)) {
-        match_length_minimum += 1;
-        auto token = consume().value();
-        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token[token.length() - 1] } });
-        return true;
+    for (auto ch : StringView { "^$\\.*+?()[]{}|" }) {
+        if (try_skip({ &ch, 1 })) {
+            match_length_minimum += 1;
+            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)ch } });
+            return true;
+        }
+    }
+
+    if (unicode) {
+        if (try_skip("/")) {
+            match_length_minimum += 1;
+            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'/' } });
+            return true;
+        }
    }

    if (named && try_skip("k")) {
@@ -1171,6 +1186,8 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
                return { { .code_point = '\v', .is_character_class = false } };
            if (try_skip("b"))
                return { { .code_point = '\b', .is_character_class = false } };
+            if (try_skip("/"))
+                return { { .code_point = '/', .is_character_class = false } };

            // CharacterEscape > ControlLetter
            if (try_skip("c")) {