From 9509433e25455bf8ee98a9ea718c9b4d4c84c7f6 Mon Sep 17 00:00:00 2001
From: Timothy Flynn <trflynn89@pm.me>
Date: Thu, 12 Aug 2021 11:02:46 -0400
Subject: [PATCH] LibRegex: Implement and use a REPEAT operation for bytecode
 repetition

Currently, when we need to repeat an instruction N times, we simply add
that instruction N times in a for-loop. This doesn't scale well with
extremely large values of N, and ECMA-262 allows up to N = 2^53 - 1.

Instead, add a new REPEAT bytecode operation to defer this loop from the
parser to the runtime executor. This allows the parser to complete sans
any loops (for this instruction), and allows the executor to bail early
if the repeated bytecode fails.

Note: The templated ByteCode methods are to allow the Posix parsers to
continue using u32 because they are limited to N = 2^20.
---
 Tests/LibRegex/Regex.cpp                      | 16 +++++++
 Userland/Libraries/LibRegex/RegexByteCode.cpp | 22 +++++++++
 Userland/Libraries/LibRegex/RegexByteCode.h   | 45 ++++++++++++++++---
 Userland/Libraries/LibRegex/RegexMatch.h      |  1 +
 Userland/Libraries/LibRegex/RegexMatcher.cpp  |  2 +
 Userland/Libraries/LibRegex/RegexParser.cpp   | 30 ++++++++-----
 Userland/Libraries/LibRegex/RegexParser.h     |  3 +-
 7 files changed, 103 insertions(+), 16 deletions(-)
diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp
index ea8707ccf7f4e3ee70b6b69d0ff58592ea5b1058..bcbfa545f97e2e343858394d4dad0810d1230651 100644
--- a/Tests/LibRegex/Regex.cpp
+++ b/Tests/LibRegex/Regex.cpp
@@ -569,6 +569,14 @@ TEST_CASE(ECMA262_parse)
         { "}"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
         { "}"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
         { "\\}"sv, regex::Error::NoError, ECMAScriptFlags::Unicode },
+        { "a{9007199254740991}"sv }, // 2^53 - 1
+        { "a{9007199254740991,}"sv },
+        { "a{9007199254740991,9007199254740991}"sv },
+        { "a{9007199254740992}"sv, regex::Error::InvalidBraceContent },
+        { "a{9007199254740992,}"sv, regex::Error::InvalidBraceContent },
+        { "a{9007199254740991,9007199254740992}"sv, regex::Error::InvalidBraceContent },
+        { "a{9007199254740992,9007199254740991}"sv, regex::Error::InvalidBraceContent },
+        { "a{9007199254740992,9007199254740992}"sv, regex::Error::InvalidBraceContent },
     };
 
     for (auto& test : tests) {
@@ -619,6 +627,14 @@ TEST_CASE(ECMA262_match)
         { "\\^"sv, "^"sv },
         { "\\^\\$\\\\\\.\\*\\+\\?\\(\\)\\[\\]\\{\\}\\|\\/"sv, "^$\\.*+?()[]{}|/"sv, true, ECMAScriptFlags::Unicode },
         { "[\\^\\$\\\\\\.\\*\\+\\?\\(\\)\\[\\]\\{\\}\\|\\/]{15}"sv, "^$\\.*+?()[]{}|/"sv, true, ECMAScriptFlags::Unicode },
+        { "(a{2}){3}"sv, "aaaaaa"sv },
+        { "(a{2}){3}"sv, "aaaabaa"sv, false },
+        { "(a{2}){4}"sv, "aaaaaaaa"sv },
+        { "(a{2}){4}"sv, "aaaaaabaa"sv, false },
+        { "(a{3}){2}"sv, "aaaaaa"sv },
+        { "(a{3}){2}"sv, "aaaabaa"sv, false },
+        { "(a{4}){2}"sv, "aaaaaaaa"sv },
+        { "(a{4}){2}"sv, "aaaaaabaa"sv, false },
         // ECMA262, B.1.4. Regular Expression Pattern extensions for browsers
         { "{"sv, "{"sv, true, ECMAScriptFlags::BrowserExtended },
         { "\\5"sv, "\5"sv, true, ECMAScriptFlags::BrowserExtended },
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp
index 146a0bbd1dc5c21d0e615cf9120078f3cafd1c9b..0d411ba34c91ad257ee020f8b0041d1b722aa398 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.cpp
+++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp
@@ -175,6 +175,9 @@ void ByteCode::ensure_opcodes_initialized()
         case OpCodeId::SaveRightNamedCaptureGroup:
             s_opcodes[i] = make<OpCode_SaveRightNamedCaptureGroup>();
             break;
+        case OpCodeId::Repeat:
+            s_opcodes[i] = make<OpCode_Repeat>();
+            break;
         }
     }
     s_opcodes_initialized = true;
@@ -850,4 +853,23 @@ Vector<String> const OpCode_Compare::variable_arguments_to_string(Optional<Match
     }
     return result;
 }
+
+ALWAYS_INLINE ExecutionResult OpCode_Repeat::execute(MatchInput const&, MatchState& state) const
+{
+    VERIFY(count() > 0); // insert_bytecode_repetition_n emits REPEAT only for n > 1 and stores n - 1 as the count.
+    // Decide whether the bytecode preceding this instruction runs again, using a per-loop counter kept in the match state.
+    if (id() >= state.repetition_marks.size())
+        state.repetition_marks.resize(id() + 1); // Grow on demand so id() is a valid index.
+    auto& repetition_mark = state.repetition_marks.at(id());
+    // The mark counts rewinds already taken; on the last pass it is set back to zero instead of rewinding.
+    if (repetition_mark == count() - 1) {
+        repetition_mark = 0;
+    } else {
+        state.instruction_position -= offset() + size(); // NOTE(review): presumably the executor adds size() back after this returns, landing on the loop start -- confirm.
+        ++repetition_mark;
+    }
+
+    return ExecutionResult::Continue;
+}
+
 }
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.h b/Userland/Libraries/LibRegex/RegexByteCode.h
index 475ce61bd6a27eaba8e15f3976b1811663367ff4..926737fa8b377dcb11ca2e8967b3f3a0902d938f 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.h
+++ b/Userland/Libraries/LibRegex/RegexByteCode.h
@@ -40,6 +40,7 @@ using ByteCodeValueType = u64;
     __ENUMERATE_OPCODE(Restore)                    \
     __ENUMERATE_OPCODE(GoBack)                     \
     __ENUMERATE_OPCODE(ClearCaptureGroup)          \
+    __ENUMERATE_OPCODE(Repeat)                     \
     __ENUMERATE_OPCODE(Exit)
 
 // clang-format off
@@ -331,10 +332,11 @@ public:
         // LABEL _END = alterantive_bytecode.size
     }
 
-    static void transform_bytecode_repetition_min_max(ByteCode& bytecode_to_repeat, size_t minimum, Optional<size_t> maximum, bool greedy = true)
+    template<typename T>
+    static void transform_bytecode_repetition_min_max(ByteCode& bytecode_to_repeat, T minimum, Optional<T> maximum, size_t repetition_mark_id, bool greedy = true) requires(IsIntegral<T>)
     {
         ByteCode new_bytecode;
-        new_bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum);
+        new_bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum, repetition_mark_id);
 
         if (maximum.has_value()) {
             auto jump_kind = static_cast<ByteCodeValueType>(greedy ? OpCodeId::ForkStay : OpCodeId::ForkJump);
@@ -343,7 +345,7 @@ public:
                 new_bytecode.empend(jump_kind);
                 new_bytecode.empend(diff * (bytecode_to_repeat.size() + 2)); // Jump to the _END label
 
-                for (size_t i = 0; i < diff; ++i) {
+                for (T i = 0; i < diff; ++i) {
                     new_bytecode.extend(bytecode_to_repeat);
                     new_bytecode.empend(jump_kind);
                     new_bytecode.empend((diff - i - 1) * (bytecode_to_repeat.size() + 2)); // Jump to the _END label
@@ -359,10 +361,28 @@ public:
         bytecode_to_repeat = move(new_bytecode);
     }
 
-    void insert_bytecode_repetition_n(ByteCode& bytecode_to_repeat, size_t n)
+    template<typename T>
+    void insert_bytecode_repetition_n(ByteCode& bytecode_to_repeat, T n, size_t repetition_mark_id) requires(IsIntegral<T>)
     {
-        for (size_t i = 0; i < n; ++i)
+        // LABEL _LOOP
+        // REGEXP
+        // REPEAT _LOOP N-1
+        // REGEXP
+        if (n == 0) // Zero repetitions append nothing.
+            return;
+
+        // Note: this bytecode layout allows callers to repeat the last REGEXP instruction without the
+        // REPEAT instruction forcing another loop.
+        extend(bytecode_to_repeat);
+
+        if (n > 1) { // A single repetition needs no REPEAT instruction; the copy above is enough.
+            empend(static_cast<ByteCodeValueType>(OpCodeId::Repeat));
+            empend(bytecode_to_repeat.size()); // offset: length of the REGEXP copy that REPEAT rewinds over
+            empend(static_cast<ByteCodeValueType>(n - 1)); // count: the trailing REGEXP copy below supplies the n-th pass
+            empend(repetition_mark_id); // id: read by OpCode_Repeat to index MatchState::repetition_marks
+
             extend(bytecode_to_repeat);
+        }
     }
 
     static void transform_bytecode_repetition_min_one(ByteCode& bytecode_to_repeat, bool greedy)
@@ -672,6 +692,21 @@ private:
     ALWAYS_INLINE static void compare_script_extension(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched);
 };
 
+class OpCode_Repeat : public OpCode { // Bytecode instruction that re-runs the bytecode preceding it a fixed number of times.
+public:
+    ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
+    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Repeat; }
+    ALWAYS_INLINE size_t size() const override { return 4; } // opcode id + offset + count + id, as emitted by insert_bytecode_repetition_n
+    ALWAYS_INLINE size_t offset() const { return argument(0); } // size of the repeated bytecode that precedes this instruction
+    ALWAYS_INLINE u64 count() const { return argument(1); } // emitted as n - 1 by insert_bytecode_repetition_n
+    ALWAYS_INLINE size_t id() const { return argument(2); } // index into MatchState::repetition_marks
+    String const arguments_string() const override
+    {
+        auto reps = id() < state().repetition_marks.size() ? state().repetition_marks.at(id()) : 0; // 0 when no mark has been stored for this id yet
+        return String::formatted("offset={} count={} id={} rep={}, sp: {}", offset(), count() + 1, id(), reps + 1, state().string_position); // count() + 1 recovers the n given to the emitter
+    }
+};
+
 template<typename T>
 bool is(OpCode const&);
 
diff --git a/Userland/Libraries/LibRegex/RegexMatch.h b/Userland/Libraries/LibRegex/RegexMatch.h
index dc0b7bb52a3670d3957855c83c226c182b13ba52..0fdb27f1476eadfea95de3cfcffe01758b557e90 100644
--- a/Userland/Libraries/LibRegex/RegexMatch.h
+++ b/Userland/Libraries/LibRegex/RegexMatch.h
@@ -505,6 +505,7 @@ struct MatchState {
     size_t fork_at_position { 0 };
     Vector<Match> matches;
     Vector<Vector<Match>> capture_group_matches;
+    Vector<u64> repetition_marks;
 };
 
 }
diff --git a/Userland/Libraries/LibRegex/RegexMatcher.cpp b/Userland/Libraries/LibRegex/RegexMatcher.cpp
index 4fed9ec69197564d95c2097977894abe7966dc91..12f63c016ed5391aa0ea58194b892f8c9e345e6a 100644
--- a/Userland/Libraries/LibRegex/RegexMatcher.cpp
+++ b/Userland/Libraries/LibRegex/RegexMatcher.cpp
@@ -206,6 +206,7 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona
             state.string_position = view_index;
             state.string_position_in_code_units = view_index;
             state.instruction_position = 0;
+            state.repetition_marks.clear();
 
             auto success = execute(input, state, temp_operations);
             // This success is acceptable only if it doesn't read anything from the input (input length is 0).
@@ -238,6 +239,7 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona
             state.string_position = view_index;
             state.string_position_in_code_units = view_index;
             state.instruction_position = 0;
+            state.repetition_marks.clear();
 
             auto success = execute(input, state, operations);
             if (!success.has_value())
diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp
index db08d836fcc5fb71b40ece68f6f6623b4ac7e296..f18b0179ffb47fafb59f7397685455f53046eae1 100644
--- a/Userland/Libraries/LibRegex/RegexParser.cpp
+++ b/Userland/Libraries/LibRegex/RegexParser.cpp
@@ -17,6 +17,7 @@
 namespace regex {
 
 static constexpr size_t s_maximum_repetition_count = 1024 * 1024;
+static constexpr u64 s_ecma262_maximum_repetition_count = (1ull << 53) - 1;
 static constexpr auto s_alphabetic_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"sv;
 static constexpr auto s_decimal_characters = "0123456789"sv;
 
@@ -419,7 +420,8 @@ bool PosixBasicParser::parse_simple_re(ByteCode& bytecode, size_t& match_length_
         if (min_limit > s_maximum_repetition_count || (max_limit.has_value() && *max_limit > s_maximum_repetition_count))
             return set_error(Error::InvalidBraceContent);
 
-        ByteCode::transform_bytecode_repetition_min_max(simple_re_bytecode, min_limit, max_limit, true);
+        auto repetition_mark_id = m_parser_state.repetition_mark_count++;
+        ByteCode::transform_bytecode_repetition_min_max(simple_re_bytecode, min_limit, max_limit, repetition_mark_id, true);
         match_length_minimum += re_match_length_minimum * min_limit;
     } else {
         match_length_minimum += re_match_length_minimum;
@@ -564,15 +566,17 @@ ALWAYS_INLINE bool PosixExtendedParser::parse_repetition_symbol(ByteCode& byteco
         if (match(TokenType::Comma)) {
             consume();
         } else {
+            auto repetition_mark_id = m_parser_state.repetition_mark_count++;
+
             ByteCode bytecode;
-            bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum);
+            bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum, repetition_mark_id);
             bytecode_to_repeat = move(bytecode);
 
             consume(TokenType::RightCurly, Error::MismatchingBrace);
             return !has_error();
         }
 
-        Optional<size_t> maybe_maximum {};
+        Optional<u32> maybe_maximum {};
         number_builder.clear();
         while (match(TokenType::Char)) {
             number_builder.append(consume().value());
@@ -585,7 +589,8 @@ ALWAYS_INLINE bool PosixExtendedParser::parse_repetition_symbol(ByteCode& byteco
             maybe_maximum = value.value();
         }
 
-        ByteCode::transform_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maybe_maximum);
+        auto repetition_mark_id = m_parser_state.repetition_mark_count++;
+        ByteCode::transform_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maybe_maximum, repetition_mark_id);
 
         consume(TokenType::RightCurly, Error::MismatchingBrace);
         return !has_error();
@@ -1141,7 +1146,7 @@ bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minim
     } repetition_mark { Repetition::None };
 
     bool ungreedy = false;
-    Optional<size_t> repeat_min, repeat_max;
+    Optional<u64> repeat_min, repeat_max;
 
     if (match(TokenType::Asterisk)) {
         consume();
@@ -1182,10 +1187,12 @@ bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minim
         ByteCode::transform_bytecode_repetition_zero_or_one(stack, !ungreedy);
         match_length_minimum = 0;
         break;
-    case Repetition::Explicit:
-        ByteCode::transform_bytecode_repetition_min_max(stack, repeat_min.value(), repeat_max, !ungreedy);
+    case Repetition::Explicit: {
+        auto repetition_mark_id = m_parser_state.repetition_mark_count++;
+        ByteCode::transform_bytecode_repetition_min_max(stack, repeat_min.value(), repeat_max, repetition_mark_id, !ungreedy);
         match_length_minimum *= repeat_min.value();
         break;
+    }
     case Repetition::None:
         VERIFY_NOT_REACHED();
     }
@@ -1193,7 +1200,7 @@ bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minim
     return true;
 }
 
-bool ECMA262Parser::parse_interval_quantifier(Optional<size_t>& repeat_min, Optional<size_t>& repeat_max)
+bool ECMA262Parser::parse_interval_quantifier(Optional<u64>& repeat_min, Optional<u64>& repeat_max)
 {
     VERIFY(match(TokenType::LeftCurly));
     consume();
@@ -1202,7 +1209,7 @@ bool ECMA262Parser::parse_interval_quantifier(Optional<size_t>& repeat_min, Opti
     auto low_bound_string = read_digits_as_string();
     chars_consumed += low_bound_string.length();
 
-    auto low_bound = low_bound_string.to_uint();
+    auto low_bound = low_bound_string.to_uint<u64>();
 
     if (!low_bound.has_value()) {
         if (!m_should_use_browser_extended_grammar && done())
@@ -1218,7 +1225,7 @@ bool ECMA262Parser::parse_interval_quantifier(Optional<size_t>& repeat_min, Opti
         consume();
         ++chars_consumed;
         auto high_bound_string = read_digits_as_string();
-        auto high_bound = high_bound_string.to_uint();
+        auto high_bound = high_bound_string.to_uint<u64>();
         if (high_bound.has_value()) {
             repeat_max = high_bound.value();
             chars_consumed += high_bound_string.length();
@@ -1243,6 +1250,9 @@ bool ECMA262Parser::parse_interval_quantifier(Optional<size_t>& repeat_min, Opti
             set_error(Error::InvalidBraceContent);
     }
 
+    if ((*repeat_min > s_ecma262_maximum_repetition_count) || (repeat_max.has_value() && (*repeat_max > s_ecma262_maximum_repetition_count)))
+        return set_error(Error::InvalidBraceContent);
+
     return true;
 }
 
diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h
index c1f54be69c95511576b3893779908c95ef0c9c2e..d9825bcd78889f569941a2f9bd6adb0d5a101d3b 100644
--- a/Userland/Libraries/LibRegex/RegexParser.h
+++ b/Userland/Libraries/LibRegex/RegexParser.h
@@ -102,6 +102,7 @@ protected:
         size_t capture_groups_count { 0 };
         size_t named_capture_groups_count { 0 };
         size_t match_length_minimum { 0 };
+        size_t repetition_mark_count { 0 };
         AllOptions regex_options;
         HashMap<int, size_t> capture_group_minimum_lengths;
         HashMap<FlyString, NamedCaptureGroup> named_capture_groups;
@@ -232,7 +233,7 @@ private:
     bool parse_assertion(ByteCode&, size_t&, bool unicode, bool named);
     bool parse_atom(ByteCode&, size_t&, bool unicode, bool named);
     bool parse_quantifier(ByteCode&, size_t&, bool unicode, bool named);
-    bool parse_interval_quantifier(Optional<size_t>& repeat_min, Optional<size_t>& repeat_max);
+    bool parse_interval_quantifier(Optional<u64>& repeat_min, Optional<u64>& repeat_max);
     bool parse_atom_escape(ByteCode&, size_t&, bool unicode, bool named);
     bool parse_character_class(ByteCode&, size_t&, bool unicode, bool named);
     bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);