소스 검색

LibRegex: Allow unknown escapes in non-unicode mode (for ECMA262)

This makes regexps like `/\x/` work as normal.
Partially deals with #4189.
AnotherTest 4 년 전
부모
커밋
e2fa1b40c4
4개의 변경된 파일, 64개의 추가 그리고 4개의 삭제
  1. 7 0
      Libraries/LibRegex/RegexLexer.cpp
  2. 1 0
      Libraries/LibRegex/RegexLexer.h
  3. 55 4
      Libraries/LibRegex/RegexParser.cpp
  4. 1 0
      Libraries/LibRegex/RegexParser.h

+ 7 - 0
Libraries/LibRegex/RegexLexer.cpp

@@ -99,6 +99,13 @@ bool Lexer::try_skip(char c)
     return true;
 }
 
+char Lexer::skip() // Returns the character obtained from peek(), after calling consume().
+{
+    auto c = peek(); // Capture the character before consume() runs.
+    consume(); // NOTE(review): presumably advances the lexer by one character - confirm against consume().
+    return c;
+}
+
 Token Lexer::next()
 {
     size_t token_start_position;

+ 1 - 0
Libraries/LibRegex/RegexLexer.h

@@ -92,6 +92,7 @@ public:
     void back(size_t offset);
     void set_source(const StringView source) { m_source = source; }
     bool try_skip(char);
+    char skip();
 
     StringView slice_back(size_t offset) const { return m_source.substring_view(m_position - offset - 1, offset); }
 

+ 55 - 4
Libraries/LibRegex/RegexParser.cpp

@@ -108,6 +108,20 @@ ALWAYS_INLINE bool Parser::try_skip(StringView str)
     return true;
 }
 
+ALWAYS_INLINE char Parser::skip() // Returns one character taken from the current token, then loads the next token.
+{
+    char ch;
+    if (m_parser_state.current_token.value().length() == 1) { // Token text is exactly one character: use it directly.
+        ch = m_parser_state.current_token.value()[0];
+    } else { // Otherwise take a single character from the lexer instead of the whole token.
+        m_parser_state.lexer.back(m_parser_state.current_token.value().length()); // Rewind the lexer by the token's length.
+        ch = m_parser_state.lexer.skip(); // NOTE(review): presumably yields the token's first character - confirm back()'s offset semantics.
+    }
+
+    m_parser_state.current_token = m_parser_state.lexer.next(); // Replace the current token with the lexer's next one.
+    return ch;
+}
+
 ALWAYS_INLINE void Parser::reset()
 {
     m_parser_state.bytecode.clear();
@@ -1017,6 +1031,16 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
                 return true;
             }
         }
+
+        if (unicode) {
+            set_error(Error::InvalidPattern);
+            return false;
+        }
+
+        // Allow '\c' in non-unicode mode, just matches 'c'.
+        match_length_minimum += 1;
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'c' } });
+        return true;
     }
 
     // '\0'
@@ -1032,6 +1056,14 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
             match_length_minimum += 1;
             stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)hex_escape.value() } });
             return true;
+        } else if (!unicode) {
+            // '\x' is allowed in non-unicode mode, just matches 'x'.
+            match_length_minimum += 1;
+            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'x' } });
+            return true;
+        } else {
+            set_error(Error::InvalidPattern);
+            return false;
         }
     }
 
@@ -1088,6 +1120,14 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
     bool negate = false;
     auto ch = parse_character_class_escape(negate);
     if (!ch.has_value()) {
+        if (!unicode) {
+            // Allow all SourceCharacter's as escapes here.
+            auto token = consume();
+            match_length_minimum += 1;
+            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token.value()[0] } });
+            return true;
+        }
+
         set_error(Error::InvalidCharacterClass);
         return false;
     }
@@ -1203,8 +1243,15 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
 
             // HexEscape
             if (try_skip("x")) {
-                if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 2); hex_escape.has_value())
+                if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 2); hex_escape.has_value()) {
                     return { { .code_point = hex_escape.value(), .is_character_class = false } };
+                } else if (!unicode) {
+                    // '\x' is allowed in non-unicode mode, just matches 'x'.
+                    return { { .code_point = 'x', .is_character_class = false } };
+                } else {
+                    set_error(Error::InvalidPattern);
+                    return {};
+                }
             }
 
             if (try_skip("u")) {
@@ -1234,14 +1281,18 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
                 return { { .character_class = CharClass::Space, .is_negated = true, .is_character_class = true } };
             if (try_skip("W"))
                 return { { .character_class = CharClass::Word, .is_negated = true, .is_character_class = true } };
+
+            if (!unicode) {
+                // Any unrecognised escape is allowed in non-unicode mode.
+                return { { .code_point = (u32)skip(), .is_character_class = false } };
+            }
         }
 
         if (match(TokenType::RightBracket) || match(TokenType::HyphenMinus))
             return {};
 
-        auto token = consume(TokenType::Char, Error::InvalidCharacterClass);
-
-        return { { .code_point = (u32)token.value()[0], .is_character_class = false } };
+        // Allow any (other) SourceCharacter.
+        return { { .code_point = (u32)skip(), .is_character_class = false } };
     };
     auto read_class_atom = [&]() -> Optional<CharClassRangeElement> {
         if (match(TokenType::HyphenMinus)) {

+ 1 - 0
Libraries/LibRegex/RegexParser.h

@@ -94,6 +94,7 @@ protected:
     ALWAYS_INLINE Token consume(TokenType type, Error error);
     ALWAYS_INLINE bool consume(const String&);
     ALWAYS_INLINE bool try_skip(StringView);
+    ALWAYS_INLINE char skip();
     ALWAYS_INLINE void reset();
     ALWAYS_INLINE bool done() const;
     ALWAYS_INLINE bool set_error(Error error);