Преглед на файлове

LibRegex: Support property escapes of the Unicode script property

Note that unlike binary properties and general categories, scripts must
be specified in the non-binary (Script=Value) form.
Timothy Flynn преди 4 години
родител
ревизия
06088df729

+ 6 - 0
Tests/LibRegex/Regex.cpp

@@ -683,6 +683,12 @@ TEST_CASE(ECMA262_property_match)
         { "\\p{gc=Cased_Letter}", "a", true, ECMAScriptFlags::Unicode },
         { "\\p{gc=Cased_Letter}", "A", true, ECMAScriptFlags::Unicode },
         { "\\p{gc=Cased_Letter}", "9", false, ECMAScriptFlags::Unicode },
+        { "\\p{Script=Latin}", "a", true, ECMAScriptFlags::Unicode },
+        { "\\p{Script=Latin}", "A", true, ECMAScriptFlags::Unicode },
+        { "\\p{Script=Latin}", "9", false, ECMAScriptFlags::Unicode },
+        { "\\p{sc=Latin}", "a", true, ECMAScriptFlags::Unicode },
+        { "\\p{sc=Latin}", "A", true, ECMAScriptFlags::Unicode },
+        { "\\p{sc=Latin}", "9", false, ECMAScriptFlags::Unicode },
     };
 
     for (auto& test : tests) {

+ 20 - 0
Userland/Libraries/LibRegex/RegexByteCode.cpp

@@ -569,6 +569,10 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
             auto general_category = static_cast<Unicode::GeneralCategory>(m_bytecode->at(offset++));
             compare_general_category(input, state, general_category, current_inversion_state(), inverse_matched);
 
+        } else if (compare_type == CharacterCompareType::Script) {
+            auto script = static_cast<Unicode::Script>(m_bytecode->at(offset++));
+            compare_script(input, state, script, current_inversion_state(), inverse_matched);
+
         } else {
             warnln("Undefined comparison: {}", (int)compare_type);
             VERIFY_NOT_REACHED();
@@ -790,6 +794,22 @@ ALWAYS_INLINE void OpCode_Compare::compare_general_category(MatchInput const& in
     }
 }
 
+ALWAYS_INLINE void OpCode_Compare::compare_script(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched)
+{
+    if (state.string_position == input.view.length())
+        return;
+
+    u32 code_point = input.view[state.string_position_in_code_units];
+    bool equal = Unicode::code_point_has_script(code_point, script);
+
+    if (equal) {
+        if (inverse)
+            inverse_matched = true;
+        else
+            advance_string_position(state, input.view, code_point);
+    }
+}
+
 String const OpCode_Compare::arguments_string() const
 {
     return String::formatted("argc={}, args={} ", arguments_count(), arguments_size());

+ 2 - 0
Userland/Libraries/LibRegex/RegexByteCode.h

@@ -68,6 +68,7 @@ enum class OpCodeId : ByteCodeValueType {
     __ENUMERATE_CHARACTER_COMPARE_TYPE(NamedReference)   \
     __ENUMERATE_CHARACTER_COMPARE_TYPE(Property)         \
     __ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory)  \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(Script)           \
     __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy)
 
 enum class CharacterCompareType : ByteCodeValueType {
@@ -727,6 +728,7 @@ private:
     ALWAYS_INLINE static void compare_character_range(MatchInput const& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched);
     ALWAYS_INLINE static void compare_property(MatchInput const& input, MatchState& state, Unicode::Property property, bool inverse, bool& inverse_matched);
     ALWAYS_INLINE static void compare_general_category(MatchInput const& input, MatchState& state, Unicode::GeneralCategory general_category, bool inverse, bool& inverse_matched);
+    ALWAYS_INLINE static void compare_script(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched);
 };
 
 template<typename T>

+ 15 - 3
Userland/Libraries/LibRegex/RegexParser.cpp

@@ -1555,6 +1555,9 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
                 },
                 [&](Unicode::GeneralCategory general_category) {
                     compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category });
+                },
+                [&](Unicode::Script script) {
+                    compares.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)script });
                 });
             stack.insert_bytecode_compare_values(move(compares));
             match_length_minimum += 1;
@@ -1705,12 +1708,14 @@ struct CharClassRangeElement {
         u32 code_point { 0 };
         Unicode::Property property;
         Unicode::GeneralCategory general_category;
+        Unicode::Script script;
     };
 
     bool is_negated { false };
     bool is_character_class { false };
     bool is_property { false };
     bool is_general_category { false };
+    bool is_script { false };
 };
 
 bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& ranges, bool unicode)
@@ -1804,6 +1809,9 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
                         },
                         [&](Unicode::GeneralCategory general_category) {
                             return CharClassRangeElement { .general_category = general_category, .is_negated = negated, .is_character_class = true, .is_general_category = true };
+                        },
+                        [&](Unicode::Script script) {
+                            return CharClassRangeElement { .script = script, .is_negated = negated, .is_character_class = true, .is_script = true };
                         });
                 }
             }
@@ -1851,6 +1859,8 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
                 ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)(atom.property) });
             else if (atom.is_general_category)
                 ranges.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)(atom.general_category) });
+            else if (atom.is_script)
+                ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)(atom.script) });
             else
                 ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)atom.character_class });
         } else {
@@ -1949,9 +1959,8 @@ bool ECMA262Parser::parse_unicode_property_escape(PropertyEscape& property, bool
             }
             return true;
         },
-        [](Unicode::GeneralCategory) {
-            return true;
-        });
+        [](Unicode::GeneralCategory) { return true; },
+        [](Unicode::Script) { return true; });
 }
 
 StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
@@ -2015,6 +2024,9 @@ Optional<ECMA262Parser::PropertyEscape> ECMA262Parser::read_unicode_property_esc
     } else if ((property_type == "General_Category"sv) || (property_type == "gc"sv)) {
         if (auto general_category = Unicode::general_category_from_string(property_name); general_category.has_value())
             return { *general_category };
+    } else if ((property_type == "Script"sv) || (property_type == "sc"sv)) {
+        if (auto script = Unicode::script_from_string(property_name); script.has_value())
+            return { *script };
     }
 
     return {};

+ 1 - 1
Userland/Libraries/LibRegex/RegexParser.h

@@ -214,7 +214,7 @@ private:
     Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
     StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
 
-    using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory>;
+    using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory, Unicode::Script>;
     Optional<PropertyEscape> read_unicode_property_escape();
 
     bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);