Browse Source

LibRegex: Support property escapes of Unicode General Categories

This changes LibRegex to parse the property escape as a Variant of
Unicode Property & General Category values. A byte code instruction is
added to perform matching based on General Category values.
Timothy Flynn, 4 years ago
parent
commit
1e10d6d7ce

+ 6 - 0
Tests/LibRegex/Regex.cpp

@@ -661,6 +661,12 @@ TEST_CASE(ECMA262_property_match)
         { "\\p{ASCII_Hex_Digit}", "x", false, ECMAScriptFlags::Unicode },
         { "\\p{Any}", "\xcd\xb8", true, ECMAScriptFlags::Unicode },       // U+0378, which is an unassigned code point.
         { "\\p{Assigned}", "\xcd\xb8", false, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point.
+        { "\\p{Lu}", "a", false, ECMAScriptFlags::Unicode },
+        { "\\p{Lu}", "A", true, ECMAScriptFlags::Unicode },
+        { "\\p{Lu}", "9", false, ECMAScriptFlags::Unicode },
+        { "\\p{Cased_Letter}", "a", true, ECMAScriptFlags::Unicode },
+        { "\\p{Cased_Letter}", "A", true, ECMAScriptFlags::Unicode },
+        { "\\p{Cased_Letter}", "9", false, ECMAScriptFlags::Unicode },
     };
 
     for (auto& test : tests) {

+ 20 - 0
Userland/Libraries/LibRegex/RegexByteCode.cpp

@@ -537,6 +537,10 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
             auto property = static_cast<Unicode::Property>(m_bytecode->at(offset++));
             compare_property(input, state, property, current_inversion_state(), inverse_matched);
 
+        } else if (compare_type == CharacterCompareType::GeneralCategory) {
+            auto general_category = static_cast<Unicode::GeneralCategory>(m_bytecode->at(offset++));
+            compare_general_category(input, state, general_category, current_inversion_state(), inverse_matched);
+
         } else {
             warnln("Undefined comparison: {}", (int)compare_type);
             VERIFY_NOT_REACHED();
@@ -742,6 +746,22 @@ ALWAYS_INLINE void OpCode_Compare::compare_property(MatchInput const& input, Mat
     }
 }
 
+ALWAYS_INLINE void OpCode_Compare::compare_general_category(MatchInput const& input, MatchState& state, Unicode::GeneralCategory general_category, bool inverse, bool& inverse_matched)
+{ // Tests the input element at the current match position against the given Unicode General Category.
+    if (state.string_position == input.view.length()) // Already at the end of the input: nothing to compare, state is left untouched.
+        return;
+
+    u32 code_point = input.view[state.string_position]; // Element of the input view at the current match position.
+    bool equal = Unicode::code_point_has_general_category(code_point, general_category);
+
+    if (equal) { // On a mismatch neither the position nor inverse_matched is changed.
+        if (inverse)
+            inverse_matched = true; // Inverted compare: only record the hit; the position is not advanced here.
+        else
+            ++state.string_position; // Plain compare: consume the matched element.
+    }
+}
+
 String const OpCode_Compare::arguments_string() const
 {
     return String::formatted("argc={}, args={} ", arguments_count(), arguments_size());

+ 2 - 0
Userland/Libraries/LibRegex/RegexByteCode.h

@@ -67,6 +67,7 @@ enum class OpCodeId : ByteCodeValueType {
     __ENUMERATE_CHARACTER_COMPARE_TYPE(Reference)        \
     __ENUMERATE_CHARACTER_COMPARE_TYPE(NamedReference)   \
     __ENUMERATE_CHARACTER_COMPARE_TYPE(Property)         \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory)  \
     __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy)
 
 enum class CharacterCompareType : ByteCodeValueType {
@@ -725,6 +726,7 @@ private:
     ALWAYS_INLINE static void compare_character_class(MatchInput const& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched);
     ALWAYS_INLINE static void compare_character_range(MatchInput const& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched);
     ALWAYS_INLINE static void compare_property(MatchInput const& input, MatchState& state, Unicode::Property property, bool inverse, bool& inverse_matched);
+    ALWAYS_INLINE static void compare_general_category(MatchInput const& input, MatchState& state, Unicode::GeneralCategory general_category, bool inverse, bool& inverse_matched);
 };
 
 template<typename T>

+ 45 - 17
Userland/Libraries/LibRegex/RegexParser.cpp

@@ -1542,13 +1542,19 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
     }
 
     if (unicode) {
-        Unicode::Property property {};
+        PropertyEscape property {};
         bool negated = false;
 
         if (parse_unicode_property_escape(property, negated)) {
             if (negated)
                 stack.insert_bytecode_compare_values({ { CharacterCompareType::Inverse, 0 } });
-            stack.insert_bytecode_compare_values({ { CharacterCompareType::Property, (ByteCodeValueType)(property) } });
+            property.visit(
+                [&](Unicode::Property property) {
+                    stack.insert_bytecode_compare_values({ { CharacterCompareType::Property, (ByteCodeValueType)(property) } });
+                },
+                [&](Unicode::GeneralCategory general_category) {
+                    stack.insert_bytecode_compare_values({ { CharacterCompareType::GeneralCategory, (ByteCodeValueType)(general_category) } });
+                });
             return true;
         }
     }
@@ -1695,11 +1701,13 @@ struct CharClassRangeElement {
         CharClass character_class;
         u32 code_point { 0 };
         Unicode::Property property;
+        Unicode::GeneralCategory general_category;
     };
 
     bool is_negated { false };
     bool is_character_class { false };
-    bool is_property_escape { false };
+    bool is_property { false };
+    bool is_general_category { false };
 };
 
 bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& ranges, bool unicode)
@@ -1784,10 +1792,17 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
                 if (try_skip("-"))
                     return { CharClassRangeElement { .code_point = '-', .is_character_class = false } };
 
-                Unicode::Property property {};
+                PropertyEscape property {};
                 bool negated = false;
-                if (parse_unicode_property_escape(property, negated))
-                    return { CharClassRangeElement { .property = property, .is_negated = negated, .is_character_class = true, .is_property_escape = true } };
+                if (parse_unicode_property_escape(property, negated)) {
+                    return property.visit(
+                        [&](Unicode::Property property) {
+                            return CharClassRangeElement { .property = property, .is_negated = negated, .is_character_class = true, .is_property = true };
+                        },
+                        [&](Unicode::GeneralCategory general_category) {
+                            return CharClassRangeElement { .general_category = general_category, .is_negated = negated, .is_character_class = true, .is_general_category = true };
+                        });
+                }
             }
 
             if (try_skip("d"))
@@ -1828,8 +1843,11 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
         if (atom.is_character_class) {
             if (atom.is_negated)
                 ranges.empend(CompareTypeAndValuePair { CharacterCompareType::TemporaryInverse, 0 });
-            if (atom.is_property_escape)
+
+            if (atom.is_property)
                 ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)(atom.property) });
+            else if (atom.is_general_category)
+                ranges.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)(atom.general_category) });
             else
                 ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)atom.character_class });
         } else {
@@ -1901,7 +1919,7 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
     return true;
 }
 
-bool ECMA262Parser::parse_unicode_property_escape(Unicode::Property& property, bool& negated)
+bool ECMA262Parser::parse_unicode_property_escape(PropertyEscape& property, bool& negated)
 {
     negated = false;
 
@@ -1918,13 +1936,19 @@ bool ECMA262Parser::parse_unicode_property_escape(Unicode::Property& property, b
         return false;
     }
 
-    if (!Unicode::is_ecma262_property(*parsed_property)) {
-        set_error(Error::InvalidNameForProperty);
-        return false;
-    }
+    property = move(*parsed_property);
 
-    property = *parsed_property;
-    return true;
+    return property.visit(
+        [this](Unicode::Property property) {
+            if (!Unicode::is_ecma262_property(property)) {
+                set_error(Error::InvalidNameForProperty);
+                return false;
+            }
+            return true;
+        },
+        [](Unicode::GeneralCategory) {
+            return true;
+        });
 }
 
 StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
@@ -1948,7 +1972,7 @@ StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_
     return name;
 }
 
-Optional<Unicode::Property> ECMA262Parser::read_unicode_property_escape()
+Optional<ECMA262Parser::PropertyEscape> ECMA262Parser::read_unicode_property_escape()
 {
     consume(TokenType::LeftCurly, Error::InvalidPattern);
 
@@ -1960,10 +1984,14 @@ Optional<Unicode::Property> ECMA262Parser::read_unicode_property_escape()
         offset += consume().value().length();
     }
 
+    StringView property_name { start_token.value().characters_without_null_termination(), offset };
     consume(TokenType::RightCurly, Error::InvalidPattern);
 
-    StringView property_name { start_token.value().characters_without_null_termination(), offset };
-    return Unicode::property_from_string(property_name);
+    if (auto property = Unicode::property_from_string(property_name); property.has_value())
+        return { *property };
+    if (auto general_category = Unicode::general_category_from_string(property_name); general_category.has_value())
+        return { *general_category };
+    return {};
 }
 
 bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)

+ 4 - 2
Userland/Libraries/LibRegex/RegexParser.h

@@ -213,7 +213,9 @@ private:
     StringView read_digits_as_string(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
     Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
     StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
-    Optional<Unicode::Property> read_unicode_property_escape();
+
+    using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory>;
+    Optional<PropertyEscape> read_unicode_property_escape();
 
     bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
     bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named);
@@ -227,7 +229,7 @@ private:
     bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);
     Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
     bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, bool unicode);
-    bool parse_unicode_property_escape(Unicode::Property& property, bool& negated);
+    bool parse_unicode_property_escape(PropertyEscape& property, bool& negated);
 
     // Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers)
     bool parse_quantifiable_assertion(ByteCode&, size_t&, bool named);