LibRegex: Implement ECMA262 multiline matching without splitting lines

Because ECMA262 regexes allow `[^]` and literal newlines to match newlines in the input string, we shouldn't split the input string into lines; instead, we make the boundary checks and catch-all patterns capable of checking for these conditions specifically.
Author: https://github.com/alimpfard Commit: https://github.com/SerenityOS/serenity/commit/5fac41f733a Pull-request: https://github.com/SerenityOS/serenity/pull/12126 Reviewed-by: https://github.com/linusg ✅
2024-11-22 07:30:19 +00:00 · 2022-01-25 13:30:27 +03:30 · 2022-01-25 13:30:27 +03:30 · 5fac41f733 · 2024-07-17 20:13:41 +09:00
commit 5fac41f733
parent 98183ef572
7 changed files with 55 additions and 20 deletions
--- a/Tests/LibRegex/Regex.cpp
+++ b/Tests/LibRegex/Regex.cpp
@ -684,6 +684,8 @@ TEST_CASE(ECMA262_match)
        { "(\0|a)"sv, "a"sv, true }, // #9686, Should allow null bytes in pattern
        { "(.*?)a(?!(a+)b\\2c)\\2(.*)"sv, "baaabaac"sv, true }, // #6042, Groups inside lookarounds may be referenced outside, but their contents appear empty if the pattern in the lookaround fails.
        { "a|$"sv, "x"sv, true, (ECMAScriptFlags)regex::AllFlags::Global }, // #11940, Global (not the 'g' flag) regexps should attempt to match the zero-length end of the string too.
+        { "foo\nbar"sv, "foo\nbar"sv, true }, // #12126, ECMA262 regexp should match literal newlines without the 's' flag.
+        { "foo[^]bar"sv, "foo\nbar"sv, true }, // #12126, ECMA262 regexp should match newline with [^].
    };
    // clang-format on

--- a/Userland/Libraries/LibC/regex.h
+++ b/Userland/Libraries/LibC/regex.h
@ -83,6 +83,7 @@ enum __RegexAllFlags {
    __Regex_SkipTrimEmptyMatches = __Regex_Global << 13,     // Do not remove empty capture group results.
    __Regex_Internal_Stateful = __Regex_Global << 14,        // Internal flag; enables stateful matches.
    __Regex_Internal_BrowserExtended = __Regex_Global << 15, // Internal flag; enable browser-specific ECMA262 extensions.
+    __Regex_Internal_ConsiderNewline = __Regex_Global << 16, // Internal flag; allow matchers to consider newlines as line separators.
    __Regex_Last = __Regex_SkipTrimEmptyMatches
 };

@ -97,7 +98,6 @@ enum __RegexAllFlags {
 #define REG_NOTBOL __Regex_MatchNotBeginOfLine // The circumflex character (^), when taken as a special character, will not match the beginning of string.
 #define REG_NOTEOL __Regex_MatchNotEndOfLine   // The dollar sign ($), when taken as a special character, will not match the end of string.

-//static_assert (sizeof(FlagsUnderlyingType) * 8 >= regex::POSIXFlags::Last << 1), "flags type too small")
 #define REG_SEARCH __Regex_Last << 1

 int regcomp(regex_t*, const char*, int);
--- a/Userland/Libraries/LibRegex/RegexByteCode.cpp
+++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp
@ -273,12 +273,23 @@ ALWAYS_INLINE ExecutionResult OpCode_ForkReplaceStay::execute(MatchInput const&

 ALWAYS_INLINE ExecutionResult OpCode_CheckBegin::execute(MatchInput const& input, MatchState& state) const
 {
-    if (0 == state.string_position && (input.regex_options & AllFlags::MatchNotBeginOfLine))
+    auto is_at_line_boundary = [&] {
+        if (state.string_position == 0)
+            return true;
+
+        if (input.regex_options.has_flag_set(AllFlags::Multiline) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline)) {
+            auto input_view = input.view.substring_view(state.string_position - 1, 1)[0];
+            return input_view == '\n';
+        }
+
+        return false;
+    }();
+    if (is_at_line_boundary && (input.regex_options & AllFlags::MatchNotBeginOfLine))
        return ExecutionResult::Failed_ExecuteLowPrioForks;

-    if ((0 == state.string_position && !(input.regex_options & AllFlags::MatchNotBeginOfLine))
-        || (0 != state.string_position && (input.regex_options & AllFlags::MatchNotBeginOfLine))
-        || (0 == state.string_position && (input.regex_options & AllFlags::Global)))
+    if ((is_at_line_boundary && !(input.regex_options & AllFlags::MatchNotBeginOfLine))
+        || (!is_at_line_boundary && (input.regex_options & AllFlags::MatchNotBeginOfLine))
+        || (is_at_line_boundary && (input.regex_options & AllFlags::Global)))
        return ExecutionResult::Continue;

    return ExecutionResult::Failed_ExecuteLowPrioForks;
@ -315,11 +326,22 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckBoundary::execute(MatchInput const& in

 ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(MatchInput const& input, MatchState& state) const
 {
-    if (state.string_position == input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine))
+    auto is_at_line_boundary = [&] {
+        if (state.string_position == input.view.length())
+            return true;
+
+        if (input.regex_options.has_flag_set(AllFlags::Multiline) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline)) {
+            auto input_view = input.view.substring_view(state.string_position, 1)[0];
+            return input_view == '\n';
+        }
+
+        return false;
+    }();
+    if (is_at_line_boundary && (input.regex_options & AllFlags::MatchNotEndOfLine))
        return ExecutionResult::Failed_ExecuteLowPrioForks;

-    if ((state.string_position == input.view.length() && !(input.regex_options & AllFlags::MatchNotEndOfLine))
-        || (state.string_position != input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine || input.regex_options & AllFlags::MatchNotBeginOfLine)))
+    if ((is_at_line_boundary && !(input.regex_options & AllFlags::MatchNotEndOfLine))
+        || (!is_at_line_boundary && (input.regex_options & AllFlags::MatchNotEndOfLine || input.regex_options & AllFlags::MatchNotBeginOfLine)))
        return ExecutionResult::Continue;

    return ExecutionResult::Failed_ExecuteLowPrioForks;
@ -461,8 +483,9 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
            if (input.view.length() <= state.string_position)
                return ExecutionResult::Failed_ExecuteLowPrioForks;

-            VERIFY(!current_inversion_state());
-            advance_string_position(state, input.view);
+            auto input_view = input.view.substring_view(state.string_position, 1)[0];
+            if (input_view != '\n' || (input.regex_options.has_flag_set(AllFlags::SingleLine) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline)))
+                advance_string_position(state, input.view, input_view);

        } else if (compare_type == CharacterCompareType::String) {
            VERIFY(!current_inversion_state());
--- a/Userland/Libraries/LibRegex/RegexMatcher.cpp
+++ b/Userland/Libraries/LibRegex/RegexMatcher.cpp
@ -41,7 +41,7 @@ Regex<Parser>::Regex(String pattern, typename ParserTraits<Parser>::OptionsType

    run_optimization_passes();
    if (parser_result.error == regex::Error::NoError)
-        matcher = make<Matcher<Parser>>(this, regex_options);
+        matcher = make<Matcher<Parser>>(this, static_cast<decltype(regex_options.value())>(parser_result.options.value()));
 }

 template<class Parser>
@ -51,7 +51,7 @@ Regex<Parser>::Regex(regex::Parser::Result parse_result, String pattern, typenam
 {
    run_optimization_passes();
    if (parser_result.error == regex::Error::NoError)
-        matcher = make<Matcher<Parser>>(this, regex_options);
+        matcher = make<Matcher<Parser>>(this, regex_options | static_cast<decltype(regex_options.value())>(parse_result.options.value()));
 }

 template<class Parser>
@ -104,8 +104,10 @@ RegexResult Matcher<Parser>::match(RegexStringView view, Optional<typename Parse
 {
    AllOptions options = m_regex_options | regex_options.value_or({}).value();

-    if (options.has_flag_set(AllFlags::Multiline))
-        return match(view.lines(), regex_options); // FIXME: how do we know, which line ending a line has (1char or 2char)? This is needed to get the correct match offsets from start of string...
+    if constexpr (!IsSame<Parser, ECMA262>) {
+        if (options.has_flag_set(AllFlags::Multiline))
+            return match(view.lines(), regex_options); // FIXME: how do we know, which line ending a line has (1char or 2char)? This is needed to get the correct match offsets from start of string...
+    }

    Vector<RegexStringView> views;
    views.append(view);
--- a/Userland/Libraries/LibRegex/RegexOptions.h
+++ b/Userland/Libraries/LibRegex/RegexOptions.h
@ -16,7 +16,7 @@

 namespace regex {

-using FlagsUnderlyingType = u16;
+using FlagsUnderlyingType = u32;

 enum class AllFlags {
    Global = __Regex_Global,                                     // All matches (don't return after first match)
@ -35,6 +35,7 @@ enum class AllFlags {
    SkipTrimEmptyMatches = __Regex_SkipTrimEmptyMatches,         // Do not remove empty capture group results.
    Internal_Stateful = __Regex_Internal_Stateful,               // Make global matches match one result at a time, and further match() calls on the same instance continue where the previous one left off.
    Internal_BrowserExtended = __Regex_Internal_BrowserExtended, // Only for ECMA262, Enable the behaviors defined in section B.1.4. of the ECMA262 spec.
+    Internal_ConsiderNewline = __Regex_Internal_ConsiderNewline, // Only for ECMA262, Allow multiline matches to consider newlines as line boundaries.
    Last = Internal_BrowserExtended,
 };

--- a/Userland/Libraries/LibRegex/RegexParser.cpp
+++ b/Userland/Libraries/LibRegex/RegexParser.cpp
@ -194,7 +194,8 @@ Parser::Result Parser::parse(Optional<AllOptions> regex_options)
        move(m_parser_state.match_length_minimum),
        move(m_parser_state.error),
        move(m_parser_state.error_token),
-        m_parser_state.named_capture_groups.keys()
+        m_parser_state.named_capture_groups.keys(),
+        m_parser_state.regex_options,
    };
 }

--- a/Userland/Libraries/LibRegex/RegexParser.h
+++ b/Userland/Libraries/LibRegex/RegexParser.h
@ -54,6 +54,7 @@ public:
        Error error;
        Token error_token;
        Vector<FlyString> capture_groups;
+        AllOptions options;
    };

    explicit Parser(Lexer& lexer)
@ -71,6 +72,7 @@ public:
    Result parse(Optional<AllOptions> regex_options = {});
    bool has_error() const { return m_parser_state.error != Error::NoError; }
    Error error() const { return m_parser_state.error; }
+    AllOptions options() const { return m_parser_state.regex_options; }

 protected:
    virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0;
@ -170,14 +172,16 @@ private:
 };

 class PosixExtendedParser final : public AbstractPosixParser {
+    constexpr static auto default_options = static_cast<PosixFlags>(AllFlags::SingleLine) | static_cast<PosixFlags>(AllFlags::Internal_ConsiderNewline);
+
 public:
    explicit PosixExtendedParser(Lexer& lexer)
-        : AbstractPosixParser(lexer)
+        : AbstractPosixParser(lexer, default_options)
    {
    }

    PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
-        : AbstractPosixParser(lexer, regex_options.value_or({}))
+        : AbstractPosixParser(lexer, regex_options.value_or({}) | default_options.value())
    {
    }

@ -195,15 +199,17 @@ private:
 };

 class ECMA262Parser final : public Parser {
+    constexpr static ECMAScriptOptions default_options = static_cast<ECMAScriptFlags>(AllFlags::Internal_ConsiderNewline);
+
 public:
    explicit ECMA262Parser(Lexer& lexer)
-        : Parser(lexer)
+        : Parser(lexer, default_options)
    {
        m_capture_groups_in_scope.empend();
    }

    ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
-        : Parser(lexer, regex_options.value_or({}))
+        : Parser(lexer, regex_options.value_or({}) | default_options.value())
    {
        m_should_use_browser_extended_grammar = regex_options.has_value() && regex_options->has_flag_set(ECMAScriptFlags::BrowserExtended);
        m_capture_groups_in_scope.empend();