Просмотр исходного кода

LibRegex: Correct And/Or and inversion interplay semantics

This commit also fixes an incorrect test case from very early on; our
behaviour now matches the ECMA262 spec in this case.

Fixes #21786.
Ali Mohammad Pur 1 год назад
Родитель
Коммит
e265d81277

+ 9 - 1
Tests/LibRegex/Regex.cpp

@@ -648,7 +648,7 @@ TEST_CASE(ECMA262_match)
         { "^[\\0-\\x1f]$"sv, "\n"sv },
         { .pattern = "\\bhello\\B"sv, .subject = "hello1"sv, .options = ECMAScriptFlags::Global },
         { "\\b.*\\b"sv, "hello1"sv },
-        { "[^\\D\\S]{2}"sv, "1 "sv },
+        { "[^\\D\\S]{2}"sv, "1 "sv, false },
         { "bar(?=f.)foo"sv, "barfoo"sv },
         { "bar(?=foo)bar"sv, "barbar"sv, false },
         { "bar(?!foo)bar"sv, "barbar"sv, true },
@@ -1174,6 +1174,14 @@ TEST_CASE(inversion_state_in_char_class)
         EXPECT_EQ(result.capture_group_matches.first()[0].view.to_byte_string(), "slideNumbers"sv);
         EXPECT_EQ(result.capture_group_matches.first()[1].view.to_byte_string(), "}"sv);
     }
+    {
+        // #21786, /[^\S\n]/.exec("\n") should be null, not [ "\n" ].
+        // This was a general confusion between the inversion state and the negation state (temp inverse).
+        Regex<ECMA262> re("[^\\S\\n]", ECMAScriptFlags::Global | (ECMAScriptFlags)regex::AllFlags::SingleMatch);
+
+        auto result = re.match("\n"sv);
+        EXPECT_EQ(result.success, false);
+    }
 }
 
 TEST_CASE(mismatching_brackets)

+ 10 - 4
Userland/Libraries/LibRegex/RegexByteCode.cpp

@@ -426,6 +426,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
         bool active { false };
         bool is_conjunction { false };
         bool fail { false };
+        bool inverse_matched { false };
         size_t initial_position;
         size_t initial_code_unit_position;
         Optional<size_t> last_accepted_position {};
@@ -623,8 +624,9 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
         case CharacterCompareType::And:
             disjunction_states.append({
                 .active = true,
-                .is_conjunction = false,
-                .fail = false,
+                .is_conjunction = current_inversion_state(),
+                .fail = current_inversion_state(),
+                .inverse_matched = current_inversion_state(),
                 .initial_position = state.string_position,
                 .initial_code_unit_position = state.string_position_in_code_units,
             });
@@ -632,8 +634,9 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
         case CharacterCompareType::Or:
             disjunction_states.append({
                 .active = true,
-                .is_conjunction = true,
-                .fail = true,
+                .is_conjunction = !current_inversion_state(),
+                .fail = !current_inversion_state(),
+                .inverse_matched = !current_inversion_state(),
                 .initial_position = state.string_position,
                 .initial_code_unit_position = state.string_position_in_code_units,
             });
@@ -644,6 +647,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
                 state.string_position = disjunction_state.last_accepted_position.value_or(disjunction_state.initial_position);
                 state.string_position_in_code_units = disjunction_state.last_accepted_code_unit_position.value_or(disjunction_state.initial_code_unit_position);
             }
+            inverse_matched = disjunction_state.inverse_matched || disjunction_state.fail;
             break;
         }
         default:
@@ -664,6 +668,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
             if (!failed) {
                 new_disjunction_state.last_accepted_position = state.string_position;
                 new_disjunction_state.last_accepted_code_unit_position = state.string_position_in_code_units;
+                new_disjunction_state.inverse_matched |= inverse_matched;
             }
 
             if (new_disjunction_state.is_conjunction)
@@ -673,6 +678,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
 
             state.string_position = new_disjunction_state.initial_position;
             state.string_position_in_code_units = new_disjunction_state.initial_code_unit_position;
+            inverse_matched = false;
         }
     }
 

+ 10 - 3
Userland/Libraries/LibRegex/RegexParser.cpp

@@ -1777,10 +1777,12 @@ bool ECMA262Parser::parse_character_class(ByteCode& stack, size_t& match_length_
 
     Vector<CompareTypeAndValuePair> compares;
 
+    auto uses_explicit_or_semantics = false;
     if (match(TokenType::Circumflex)) {
         // Negated charclass
         consume();
         compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
+        uses_explicit_or_semantics = true;
     }
 
     // ClassContents :: [empty]
@@ -1800,6 +1802,11 @@ bool ECMA262Parser::parse_character_class(ByteCode& stack, size_t& match_length_
     if (flags.unicode_sets && !parse_class_set_expression(compares))
         return false;
 
+    if (uses_explicit_or_semantics && compares.size() > 2) {
+        compares.insert(1, CompareTypeAndValuePair { CharacterCompareType::Or, 0 });
+        compares.empend(CompareTypeAndValuePair { CharacterCompareType::EndAndOr, 0 });
+    }
+
     match_length_minimum += 1;
     stack.insert_bytecode_compare_values(move(compares));
     return true;
@@ -2466,9 +2473,9 @@ DeprecatedFlyString ECMA262Parser::read_capture_group_specifier(bool take_starti
 {
     static auto id_start_category = Unicode::property_from_string("ID_Start"sv);
     static auto id_continue_category = Unicode::property_from_string("ID_Continue"sv);
-    static constexpr const u32 REPLACEMENT_CHARACTER = 0xFFFD;
-    constexpr const u32 ZERO_WIDTH_NON_JOINER { 0x200C };
-    constexpr const u32 ZERO_WIDTH_JOINER { 0x200D };
+    static constexpr u32 const REPLACEMENT_CHARACTER = 0xFFFD;
+    constexpr u32 const ZERO_WIDTH_NON_JOINER { 0x200C };
+    constexpr u32 const ZERO_WIDTH_JOINER { 0x200D };
 
     if (take_starting_angle_bracket && !consume("<"))
         return {};