Sfoglia il codice sorgente

LibRegex: Implement an ECMA262-compatible parser

This also adds support for lookarounds and individually-negated
comparisons.
The only unimplemented part of the parser spec is the unicode stuff.
AnotherTest 4 anni fa
parent
commit
dbef2b1ee9

+ 2 - 0
Libraries/LibRegex/Forward.h

@@ -32,6 +32,7 @@ namespace regex {
 enum class Error : u8;
 class Lexer;
 class PosixExtendedParser;
+class ECMA262Parser;
 
 class ByteCode;
 class OpCode;
@@ -50,6 +51,7 @@ class OpCode_Compare;
 class RegexStringView;
 }
 
+using regex::ECMA262Parser;
 using regex::Error;
 using regex::Lexer;
 using regex::PosixExtendedParser;

+ 171 - 9
Libraries/LibRegex/RegexByteCode.cpp

@@ -65,6 +65,20 @@ const char* execution_result_name(ExecutionResult result)
     }
 }
 
+// Returns the enumerator name of `ty` as a string literal.
+// The case labels are generated by expanding ENUMERATE_BOUNDARY_CHECK_TYPES, one `case` per listed type.
+const char* boundary_check_type_name(BoundaryCheckType ty)
+{
+    switch (ty) {
+#define __ENUMERATE_BOUNDARY_CHECK_TYPE(x) \
+    case BoundaryCheckType::x:             \
+        return #x;
+        ENUMERATE_BOUNDARY_CHECK_TYPES
+#undef __ENUMERATE_BOUNDARY_CHECK_TYPE
+    default:
+        // A value outside the enumerated list is treated as a programming error.
+        ASSERT_NOT_REACHED();
+        return "<Unknown>";
+    }
+}
+
 const char* character_compare_type_name(CharacterCompareType ch_compare_type)
 {
     switch (ch_compare_type) {
@@ -112,12 +126,27 @@ ALWAYS_INLINE OpCode* ByteCode::get_opcode_by_id(OpCodeId id) const
             case OpCodeId::CheckEnd:
                 s_opcodes.set(i, make<OpCode_CheckEnd>(*const_cast<ByteCode*>(this)));
                 break;
+            case OpCodeId::CheckBoundary:
+                s_opcodes.set(i, make<OpCode_CheckBoundary>(*const_cast<ByteCode*>(this)));
+                break;
             case OpCodeId::ForkJump:
                 s_opcodes.set(i, make<OpCode_ForkJump>(*const_cast<ByteCode*>(this)));
                 break;
             case OpCodeId::ForkStay:
                 s_opcodes.set(i, make<OpCode_ForkStay>(*const_cast<ByteCode*>(this)));
                 break;
+            case OpCodeId::FailForks:
+                s_opcodes.set(i, make<OpCode_FailForks>(*const_cast<ByteCode*>(this)));
+                break;
+            case OpCodeId::Save:
+                s_opcodes.set(i, make<OpCode_Save>(*const_cast<ByteCode*>(this)));
+                break;
+            case OpCodeId::Restore:
+                s_opcodes.set(i, make<OpCode_Restore>(*const_cast<ByteCode*>(this)));
+                break;
+            case OpCodeId::GoBack:
+                s_opcodes.set(i, make<OpCode_GoBack>(*const_cast<ByteCode*>(this)));
+                break;
             case OpCodeId::CheckBegin:
                 s_opcodes.set(i, make<OpCode_CheckBegin>(*const_cast<ByteCode*>(this)));
                 break;
@@ -166,6 +195,38 @@ ALWAYS_INLINE ExecutionResult OpCode_Exit::execute(const MatchInput& input, Matc
     return ExecutionResult::Failed;
 }
 
+// Pushes the current string position onto the save stack kept in the match input,
+// so that a later Restore opcode can return to it.
+ALWAYS_INLINE ExecutionResult OpCode_Save::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
+{
+    auto& position_stack = input.saved_positions;
+    position_stack.append(state.string_position);
+    return ExecutionResult::Continue;
+}
+
+// Pops the most recently saved string position and makes it the current one.
+// Fails outright when there is nothing on the save stack to restore.
+ALWAYS_INLINE ExecutionResult OpCode_Restore::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
+{
+    auto& position_stack = input.saved_positions;
+    if (!position_stack.is_empty()) {
+        state.string_position = position_stack.take_last();
+        return ExecutionResult::Continue;
+    }
+
+    return ExecutionResult::Failed;
+}
+
+// Moves the string position back by count() characters.
+// If that would move before the start of the input, the current path fails instead.
+ALWAYS_INLINE ExecutionResult OpCode_GoBack::execute(const MatchInput&, MatchState& state, MatchOutput&) const
+{
+    const auto distance = count();
+    if (state.string_position < distance)
+        return ExecutionResult::Failed_ExecuteLowPrioForks;
+
+    state.string_position -= distance;
+    return ExecutionResult::Continue;
+}
+
+// Fails the current path and queues count() - 1 further failures in the match input's fail counter.
+// The returned failure is the first of the count() requested ones, hence the "- 1".
+ALWAYS_INLINE ExecutionResult OpCode_FailForks::execute(const MatchInput& input, MatchState&, MatchOutput&) const
+{
+    const auto forks_to_fail = count();
+    ASSERT(forks_to_fail > 0);
+
+    input.fail_counter += forks_to_fail - 1;
+    return ExecutionResult::Failed_ExecuteLowPrioForks;
+}
+
 ALWAYS_INLINE ExecutionResult OpCode_Jump::execute(const MatchInput&, MatchState& state, MatchOutput&) const
 {
 
@@ -198,6 +259,40 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckBegin::execute(const MatchInput& input
     return ExecutionResult::Failed_ExecuteLowPrioForks;
 }
 
+// Checks whether the current string position is on a word boundary (BoundaryCheckType::Word)
+// or not on one (BoundaryCheckType::NonWord). Continues on success, otherwise fails the current path.
+// A boundary exists where exactly one of the characters before and after the position is a word character;
+// positions outside the input count as non-word.
+ALWAYS_INLINE ExecutionResult OpCode_CheckBoundary::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
+{
+    // Word characters are exactly [A-Za-z0-9_]. The ranges are spelled out instead of calling isalnum(),
+    // whose behaviour is undefined for values that are not representable as unsigned char.
+    auto isword = [](auto ch) {
+        return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || (ch >= '0' && ch <= '9') || ch == '_';
+    };
+    auto is_word_boundary = [&] {
+        const auto position = state.string_position;
+        const auto length = input.view.length();
+
+        // Only index the view where the index is known to be in range.
+        const bool word_before = position > 0 && position <= length && isword(input.view[position - 1]);
+        const bool word_after = position < length && isword(input.view[position]);
+
+        return word_before != word_after;
+    };
+    switch (type()) {
+    case BoundaryCheckType::Word: {
+        if (is_word_boundary())
+            return ExecutionResult::Continue;
+        return ExecutionResult::Failed_ExecuteLowPrioForks;
+    }
+    case BoundaryCheckType::NonWord: {
+        if (!is_word_boundary())
+            return ExecutionResult::Continue;
+        return ExecutionResult::Failed_ExecuteLowPrioForks;
+    }
+    }
+    ASSERT_NOT_REACHED();
+}
+
 ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
 {
     if (state.string_position == input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine))
@@ -293,9 +388,13 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(const M
     return ExecutionResult::Continue;
 }
 
-ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
+ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
 {
     bool inverse { false };
+    bool temporary_inverse { false };
+    bool reset_temp_inverse { false };
+
+    auto current_inversion_state = [&]() -> bool { return temporary_inverse ^ inverse; };
 
     size_t string_position = state.string_position;
     bool inverse_matched { false };
@@ -305,30 +404,45 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
         if (state.string_position > string_position)
             break;
 
+        if (reset_temp_inverse) {
+            reset_temp_inverse = false;
+            temporary_inverse = false;
+        } else {
+            reset_temp_inverse = true;
+        }
+
         auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);
 
         if (compare_type == CharacterCompareType::Inverse)
             inverse = true;
 
-        else if (compare_type == CharacterCompareType::Char) {
-            char ch = m_bytecode->at(offset++);
+        else if (compare_type == CharacterCompareType::TemporaryInverse) {
+            // If "TemporaryInverse" is given, negate the current inversion state only for the next opcode.
+            // it follows that this cannot be the last compare element.
+            ASSERT(i != arguments_count() - 1);
+
+            temporary_inverse = true;
+            reset_temp_inverse = false;
+
+        } else if (compare_type == CharacterCompareType::Char) {
+            u32 ch = m_bytecode->at(offset++);
 
             // We want to compare a string that is longer or equal in length to the available string
             if (input.view.length() - state.string_position < 1)
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
 
-            compare_char(input, state, ch, inverse, inverse_matched);
+            compare_char(input, state, ch, current_inversion_state(), inverse_matched);
 
         } else if (compare_type == CharacterCompareType::AnyChar) {
             // We want to compare a string that is definitely longer than the available string
             if (input.view.length() - state.string_position < 1)
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
 
-            ASSERT(!inverse);
+            ASSERT(!current_inversion_state());
             ++state.string_position;
 
         } else if (compare_type == CharacterCompareType::String) {
-            ASSERT(!inverse);
+            ASSERT(!current_inversion_state());
 
             char* str = reinterpret_cast<char*>(m_bytecode->at(offset++));
             auto& length = m_bytecode->at(offset++);
@@ -348,7 +462,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
             auto character_class = (CharClass)m_bytecode->at(offset++);
             auto ch = input.view[state.string_position];
 
-            compare_character_class(input, state, character_class, ch, inverse, inverse_matched);
+            compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched);
 
         } else if (compare_type == CharacterCompareType::CharRange) {
             auto value = (CharRange)m_bytecode->at(offset++);
@@ -357,7 +471,40 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
             auto to = value.to;
             auto ch = input.view[state.string_position];
 
-            compare_character_range(input, state, from, to, ch, inverse, inverse_matched);
+            compare_character_range(input, state, from, to, ch, current_inversion_state(), inverse_matched);
+
+        } else if (compare_type == CharacterCompareType::Reference) {
+            auto reference_number = (size_t)m_bytecode->at(offset++);
+            auto& groups = output.capture_group_matches.at(input.match_index);
+            if (groups.size() <= reference_number)
+                return ExecutionResult::Failed_ExecuteLowPrioForks;
+
+            auto str = groups.at(reference_number).view;
+
+            // Fail early if the referenced capture is longer than what is left of the input
+            if (input.view.length() - state.string_position < str.length())
+                return ExecutionResult::Failed_ExecuteLowPrioForks;
+
+            if (!compare_string(input, state, str.characters_without_null_termination(), str.length()))
+                return ExecutionResult::Failed_ExecuteLowPrioForks;
+
+        } else if (compare_type == CharacterCompareType::NamedReference) {
+            auto ptr = (const char*)m_bytecode->at(offset++);
+            auto length = (size_t)m_bytecode->at(offset++);
+            StringView name { ptr, length };
+
+            auto group = output.named_capture_group_matches.at(input.match_index).get(name);
+            if (!group.has_value())
+                return ExecutionResult::Failed_ExecuteLowPrioForks;
+
+            auto str = group.value().view;
+
+            // Fail early if the referenced capture is longer than what is left of the input
+            if (input.view.length() - state.string_position < str.length())
+                return ExecutionResult::Failed_ExecuteLowPrioForks;
+
+            if (!compare_string(input, state, str.characters_without_null_termination(), str.length()))
+                return ExecutionResult::Failed_ExecuteLowPrioForks;
 
         } else {
             fprintf(stderr, "Undefined comparison: %i\n", (int)compare_type);
@@ -366,7 +513,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
         }
     }
 
-    if (inverse && !inverse_matched)
+    if (current_inversion_state() && !inverse_matched)
         ++state.string_position;
 
     if (string_position == state.string_position || state.string_position > input.view.length())
@@ -502,6 +649,14 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(const MatchInput& inp
                 ++state.string_position;
         }
         break;
+    case CharClass::Word:
+        if (isalnum(ch) || ch == '_') {
+            if (inverse)
+                inverse_matched = true;
+            else
+                ++state.string_position;
+        }
+        break;
     case CharClass::Xdigit:
         if (isxdigit(ch)) {
             if (inverse)
@@ -550,6 +705,13 @@ const Vector<String> OpCode_Compare::variable_arguments_to_string(Optional<Match
             result.empend(String::format("value='%c'", ch));
             if (!view.is_null())
                 result.empend(String::format("compare against: '%s'", view.substring_view(state().string_position, state().string_position + 1 > view.length() ? 0 : 1).to_string().characters()));
+        } else if (compare_type == CharacterCompareType::NamedReference) {
+            auto ptr = (const char*)m_bytecode->at(offset++);
+            auto length = m_bytecode->at(offset++);
+            result.empend(String::format("name='%.*s'", length, ptr));
+        } else if (compare_type == CharacterCompareType::Reference) {
+            auto ref = m_bytecode->at(offset++);
+            result.empend(String::format("number=%lu", ref));
         } else if (compare_type == CharacterCompareType::String) {
             char* str = reinterpret_cast<char*>(m_bytecode->at(offset++));
             auto& length = m_bytecode->at(offset++);

+ 209 - 14
Libraries/LibRegex/RegexByteCode.h

@@ -29,6 +29,7 @@
 #include "RegexMatch.h"
 #include "RegexOptions.h"
 
+#include <AK/Format.h>
 #include <AK/Forward.h>
 #include <AK/HashMap.h>
 #include <AK/NonnullOwnPtr.h>
@@ -46,33 +47,41 @@ using ByteCodeValueType = u64;
     __ENUMERATE_OPCODE(Jump)                       \
     __ENUMERATE_OPCODE(ForkJump)                   \
     __ENUMERATE_OPCODE(ForkStay)                   \
+    __ENUMERATE_OPCODE(FailForks)                  \
     __ENUMERATE_OPCODE(SaveLeftCaptureGroup)       \
     __ENUMERATE_OPCODE(SaveRightCaptureGroup)      \
     __ENUMERATE_OPCODE(SaveLeftNamedCaptureGroup)  \
     __ENUMERATE_OPCODE(SaveRightNamedCaptureGroup) \
     __ENUMERATE_OPCODE(CheckBegin)                 \
     __ENUMERATE_OPCODE(CheckEnd)                   \
+    __ENUMERATE_OPCODE(CheckBoundary)              \
+    __ENUMERATE_OPCODE(Save)                       \
+    __ENUMERATE_OPCODE(Restore)                    \
+    __ENUMERATE_OPCODE(GoBack)                     \
     __ENUMERATE_OPCODE(Exit)
 
+// clang-format off
 enum class OpCodeId : ByteCodeValueType {
 #define __ENUMERATE_OPCODE(x) x,
     ENUMERATE_OPCODES
 #undef __ENUMERATE_OPCODE
 
-        First
-    = Compare,
-    Last
-    = Exit,
+    First = Compare,
+    Last = Exit,
 };
-
-#define ENUMERATE_CHARACTER_COMPARE_TYPES         \
-    __ENUMERATE_CHARACTER_COMPARE_TYPE(Undefined) \
-    __ENUMERATE_CHARACTER_COMPARE_TYPE(Inverse)   \
-    __ENUMERATE_CHARACTER_COMPARE_TYPE(AnyChar)   \
-    __ENUMERATE_CHARACTER_COMPARE_TYPE(Char)      \
-    __ENUMERATE_CHARACTER_COMPARE_TYPE(String)    \
-    __ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \
-    __ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \
+// clang-format on
+
+#define ENUMERATE_CHARACTER_COMPARE_TYPES                \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(Undefined)        \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(Inverse)          \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(TemporaryInverse) \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(AnyChar)          \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(Char)             \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(String)           \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass)        \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange)        \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(Reference)        \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(NamedReference)   \
     __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy)
 
 enum class CharacterCompareType : ByteCodeValueType {
@@ -93,6 +102,7 @@ enum class CharacterCompareType : ByteCodeValueType {
     __ENUMERATE_CHARACTER_CLASS(Blank) \
     __ENUMERATE_CHARACTER_CLASS(Graph) \
     __ENUMERATE_CHARACTER_CLASS(Punct) \
+    __ENUMERATE_CHARACTER_CLASS(Word)  \
     __ENUMERATE_CHARACTER_CLASS(Xdigit)
 
 enum class CharClass : ByteCodeValueType {
@@ -101,6 +111,16 @@ enum class CharClass : ByteCodeValueType {
 #undef __ENUMERATE_CHARACTER_CLASS
 };
 
+// List of boundary assertion kinds, written as an X-macro so that the enum below and
+// boundary_check_type_name() are both generated from the same list.
+#define ENUMERATE_BOUNDARY_CHECK_TYPES    \
+    __ENUMERATE_BOUNDARY_CHECK_TYPE(Word) \
+    __ENUMERATE_BOUNDARY_CHECK_TYPE(NonWord)
+
+// Argument of the CheckBoundary opcode; stored in the bytecode as a ByteCodeValueType.
+enum class BoundaryCheckType : ByteCodeValueType {
+#define __ENUMERATE_BOUNDARY_CHECK_TYPE(x) x,
+    ENUMERATE_BOUNDARY_CHECK_TYPES
+#undef __ENUMERATE_BOUNDARY_CHECK_TYPE
+};
+
 struct CharRange {
     const u32 from;
     const u32 to;
@@ -144,9 +164,10 @@ public:
             ASSERT(value.type != CharacterCompareType::RangeExpressionDummy);
             ASSERT(value.type != CharacterCompareType::Undefined);
             ASSERT(value.type != CharacterCompareType::String);
+            ASSERT(value.type != CharacterCompareType::NamedReference);
 
             arguments.append((ByteCodeValueType)value.type);
-            if (value.type != CharacterCompareType::Inverse && value.type != CharacterCompareType::AnyChar)
+            if (value.type != CharacterCompareType::Inverse && value.type != CharacterCompareType::AnyChar && value.type != CharacterCompareType::TemporaryInverse)
                 arguments.append(move(value.value));
         }
 
@@ -156,6 +177,15 @@ public:
         append(move(bytecode));
     }
 
+    // Appends a CheckBoundary instruction, encoded as [opcode id, boundary type].
+    void insert_bytecode_check_boundary(BoundaryCheckType type)
+    {
+        ByteCode check;
+        check.empend(static_cast<ByteCodeValueType>(OpCodeId::CheckBoundary));
+        check.empend(static_cast<ByteCodeValueType>(type));
+
+        append(move(check));
+    }
+
     void insert_bytecode_compare_string(StringView view, size_t length)
     {
         ByteCode bytecode;
@@ -175,6 +205,25 @@ public:
         append(move(bytecode));
     }
 
+    // Appends a Compare instruction with a single NamedReference argument.
+    // Only a pointer to the name's characters is stored (not a copy), so the memory behind `name`
+    // has to stay valid for as long as this bytecode is in use.
+    void insert_bytecode_compare_named_reference(StringView name, size_t length)
+    {
+        // Argument list: [NamedReference, pointer to the name, length of the name].
+        ByteCode compare_arguments;
+        compare_arguments.empend(static_cast<ByteCodeValueType>(CharacterCompareType::NamedReference));
+        compare_arguments.empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination()));
+        compare_arguments.empend(length);
+
+        // Instruction header: [opcode id, number of arguments, size of the argument list].
+        ByteCode compare;
+        compare.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
+        compare.empend(1);
+        compare.empend(compare_arguments.size());
+        compare.append(move(compare_arguments));
+
+        append(move(compare));
+    }
+
     void insert_bytecode_group_capture_left(size_t capture_groups_count)
     {
         empend(static_cast<ByteCodeValueType>(OpCodeId::SaveLeftCaptureGroup));
@@ -201,6 +250,87 @@ public:
         empend(name.length());
     }
 
+    // The four kinds of lookaround assertions that insert_bytecode_lookaround() can emit.
+    enum class LookAroundType {
+        LookAhead,
+        LookBehind,
+        NegatedLookAhead,
+        NegatedLookBehind,
+    };
+    // Appends the bytecode for a lookaround of the given type around `lookaround_body`.
+    // `match_length` is only emitted for the lookbehind variants, as the argument of the GoBack opcode
+    // that precedes the body. The instruction layout of each variant is sketched in its case below.
+    void insert_bytecode_lookaround(ByteCode&& lookaround_body, LookAroundType type, size_t match_length = 0)
+    {
+        // FIXME: The save stack will grow infinitely with repeated failures
+        //        as we do not discard that on failure (we don't necessarily know how many to pop with the current architecture).
+        switch (type) {
+        case LookAroundType::LookAhead: {
+            // SAVE
+            // REGEXP BODY
+            // RESTORE
+            empend((ByteCodeValueType)OpCodeId::Save);
+            append(move(lookaround_body));
+            empend((ByteCodeValueType)OpCodeId::Restore);
+            return;
+        }
+        case LookAroundType::NegatedLookAhead: {
+            // JUMP _A
+            // LABEL _L
+            // REGEXP BODY
+            // FAIL 2
+            // LABEL _A
+            // SAVE
+            // FORKJUMP _L
+            // RESTORE
+            auto body_length = lookaround_body.size();
+            empend((ByteCodeValueType)OpCodeId::Jump);
+            empend((ByteCodeValueType)body_length + 2); // JUMP to label _A
+            append(move(lookaround_body));
+            empend((ByteCodeValueType)OpCodeId::FailForks);
+            empend((ByteCodeValueType)2); // Fail two forks
+            empend((ByteCodeValueType)OpCodeId::Save);
+            empend((ByteCodeValueType)OpCodeId::ForkJump);
+            empend((ByteCodeValueType) - (body_length + 5)); // JUMP to label _L
+            empend((ByteCodeValueType)OpCodeId::Restore);
+            return;
+        }
+        case LookAroundType::LookBehind:
+            // SAVE
+            // GOBACK match_length(BODY)
+            // REGEXP BODY
+            // RESTORE
+            empend((ByteCodeValueType)OpCodeId::Save);
+            empend((ByteCodeValueType)OpCodeId::GoBack);
+            empend((ByteCodeValueType)match_length);
+            append(move(lookaround_body));
+            empend((ByteCodeValueType)OpCodeId::Restore);
+            return;
+        case LookAroundType::NegatedLookBehind: {
+            // JUMP _A
+            // LABEL _L
+            // GOBACK match_length(BODY)
+            // REGEXP BODY
+            // FAIL 2
+            // LABEL _A
+            // SAVE
+            // FORKJUMP _L
+            // RESTORE
+            auto body_length = lookaround_body.size();
+            empend((ByteCodeValueType)OpCodeId::Jump);
+            empend((ByteCodeValueType)body_length + 4); // JUMP to label _A
+            empend((ByteCodeValueType)OpCodeId::GoBack);
+            empend((ByteCodeValueType)match_length);
+            append(move(lookaround_body));
+            empend((ByteCodeValueType)OpCodeId::FailForks);
+            empend((ByteCodeValueType)2); // Fail two forks
+            empend((ByteCodeValueType)OpCodeId::Save);
+            empend((ByteCodeValueType)OpCodeId::ForkJump);
+            empend((ByteCodeValueType) - (body_length + 7)); // JUMP to label _L
+            empend((ByteCodeValueType)OpCodeId::Restore);
+            return;
+        }
+        }
+
+        ASSERT_NOT_REACHED();
+    }
+
     void insert_bytecode_alternation(ByteCode&& left, ByteCode&& right)
     {
 
@@ -348,6 +478,7 @@ enum class ExecutionResult : u8 {
 
 const char* execution_result_name(ExecutionResult result);
 const char* opcode_id_name(OpCodeId opcode_id);
+const char* boundary_check_type_name(BoundaryCheckType);
 const char* character_compare_type_name(CharacterCompareType result);
 const char* execution_result_name(ExecutionResult result);
 
@@ -419,6 +550,56 @@ public:
     const String arguments_string() const override { return ""; }
 };
 
+// Opcode that fails the current path and requests further failures via the match input's fail counter.
+// Occupies two bytecode values: the opcode id and the failure count (argument 0).
+class OpCode_FailForks final : public OpCode {
+public:
+    OpCode_FailForks(ByteCode& bytecode)
+        : OpCode(bytecode)
+    {
+    }
+    ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
+    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::FailForks; }
+    ALWAYS_INLINE size_t size() const override { return 2; }
+    // Total number of failures to produce, including the one caused by executing this opcode.
+    ALWAYS_INLINE size_t count() const { return argument(0); }
+    const String arguments_string() const override { return String::formatted("count={}", count()); }
+};
+
+// Opcode that pushes the current string position onto the save stack.
+// Occupies a single bytecode value (the opcode id) and takes no arguments.
+class OpCode_Save final : public OpCode {
+public:
+    OpCode_Save(ByteCode& bytecode)
+        : OpCode(bytecode)
+    {
+    }
+    ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
+    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Save; }
+    ALWAYS_INLINE size_t size() const override { return 1; }
+    const String arguments_string() const override { return ""; }
+};
+
+// Opcode that pops the most recently saved string position and makes it current again.
+// Occupies a single bytecode value (the opcode id) and takes no arguments.
+class OpCode_Restore final : public OpCode {
+public:
+    OpCode_Restore(ByteCode& bytecode)
+        : OpCode(bytecode)
+    {
+    }
+    ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
+    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Restore; }
+    ALWAYS_INLINE size_t size() const override { return 1; }
+    const String arguments_string() const override { return ""; }
+};
+
+// Opcode that moves the string position backwards; used in front of lookbehind bodies.
+// Occupies two bytecode values: the opcode id and the distance to go back (argument 0).
+class OpCode_GoBack final : public OpCode {
+public:
+    OpCode_GoBack(ByteCode& bytecode)
+        : OpCode(bytecode)
+    {
+    }
+    ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
+    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::GoBack; }
+    ALWAYS_INLINE size_t size() const override { return 2; }
+    // Number of positions to move back by.
+    ALWAYS_INLINE size_t count() const { return argument(0); }
+    const String arguments_string() const override { return String::formatted("count={}", count()); }
+};
+
 class OpCode_Jump final : public OpCode {
 public:
     OpCode_Jump(ByteCode& bytecode)
@@ -491,6 +672,20 @@ public:
     const String arguments_string() const override { return ""; }
 };
 
+// Opcode asserting that the current string position is (or is not) on a word boundary.
+// Occupies two bytecode values: the opcode id and a BoundaryCheckType (argument 0).
+class OpCode_CheckBoundary final : public OpCode {
+public:
+    OpCode_CheckBoundary(ByteCode& bytecode)
+        : OpCode(bytecode)
+    {
+    }
+    ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
+    ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckBoundary; }
+    ALWAYS_INLINE size_t size() const override { return 2; }
+    ALWAYS_INLINE size_t arguments_count() const { return 1; }
+    // The kind of boundary check, decoded from the raw bytecode value.
+    ALWAYS_INLINE BoundaryCheckType type() const { return static_cast<BoundaryCheckType>(argument(0)); }
+    // Uses the type-safe String::formatted() like the other opcodes introduced with this one.
+    // The previous printf-style "%lu" only matched the argument where unsigned long is as wide as
+    // a bytecode value (presumably u64 — ByteCodeValueType).
+    const String arguments_string() const override { return String::formatted("kind={} ({})", argument(0), boundary_check_type_name(type())); }
+};
+
 class OpCode_SaveLeftCaptureGroup final : public OpCode {
 public:
     OpCode_SaveLeftCaptureGroup(ByteCode& bytecode)

+ 1 - 0
Libraries/LibRegex/RegexDebug.h

@@ -102,6 +102,7 @@ public:
     {
         StringBuilder builder;
         builder.append(execution_result_name(result));
+        builder.appendff(", fc: {}, ss: {}", input.fail_counter, input.saved_positions.size());
         if (result == ExecutionResult::Succeeded) {
             builder.appendf(", ip: %lu/%lu, sp: %lu/%lu", state.instruction_position, bytecode.size() - 1, state.string_position, input.view.length() - 1);
         } else if (result == ExecutionResult::Fork_PrioHigh) {

+ 13 - 1
Libraries/LibRegex/RegexLexer.cpp

@@ -26,6 +26,7 @@
 
 #include "RegexLexer.h"
 #include <AK/Assertions.h>
+#include <AK/LogStream.h>
 #include <stdio.h>
 
 namespace regex {
@@ -89,6 +90,15 @@ void Lexer::reset()
     m_previous_position = 0;
 }
 
+// Consumes the next character if it equals `c`.
+// Returns whether a character was consumed; on a mismatch the lexer is left untouched.
+bool Lexer::try_skip(char c)
+{
+    const bool is_next = peek() == c;
+    if (is_next)
+        consume();
+
+    return is_next;
+}
+
 Token Lexer::next()
 {
     size_t token_start_position;
@@ -127,7 +137,9 @@ Token Lexer::next()
         case '\\':
             return 2;
         default:
-            fprintf(stderr, "[LEXER] Found invalid escape sequence: \\%c\n", peek(1));
+#ifdef REGEX_DEBUG
+            fprintf(stderr, "[LEXER] Found invalid escape sequence: \\%c (the parser will have to deal with this!)\n", peek(1));
+#endif
             return 0;
         }
     };

+ 3 - 0
Libraries/LibRegex/RegexLexer.h

@@ -91,6 +91,9 @@ public:
     void reset();
     void back(size_t offset);
     void set_source(const StringView source) { m_source = source; }
+    bool try_skip(char);
+
+    StringView slice_back(size_t offset) const { return m_source.substring_view(m_position - offset - 1, offset); }
 
 private:
     ALWAYS_INLINE char peek(size_t offset = 0) const;

+ 3 - 0
Libraries/LibRegex/RegexMatch.h

@@ -267,6 +267,9 @@ struct MatchInput {
     size_t column { 0 };
 
     size_t global_offset { 0 }; // For multiline matching, knowing the offset from start could be important
+
+    mutable size_t fail_counter { 0 };
+    mutable Vector<size_t> saved_positions;
 };
 
 struct MatchState {

+ 10 - 1
Libraries/LibRegex/RegexMatcher.cpp

@@ -264,7 +264,13 @@ Optional<bool> Matcher<Parser>::execute(const MatchInput& input, MatchState& sta
         s_regex_dbg.print_opcode("VM", *opcode, state, recursion_level, false);
 #endif
 
-        auto result = opcode->execute(input, state, output);
+        ExecutionResult result;
+        if (input.fail_counter > 0) {
+            --input.fail_counter;
+            result = ExecutionResult::Failed_ExecuteLowPrioForks;
+        } else {
+            result = opcode->execute(input, state, output);
+        }
 
 #ifdef REGEX_DEBUG
         s_regex_dbg.print_result(*opcode, bytecode, input, state, result);
@@ -330,4 +336,7 @@ ALWAYS_INLINE Optional<bool> Matcher<Parser>::execute_low_prio_forks(const Match
 
 template class Matcher<PosixExtendedParser>;
 template class Regex<PosixExtendedParser>;
+
+template class Matcher<ECMA262Parser>;
+template class Regex<ECMA262Parser>;
 }

+ 779 - 1
Libraries/LibRegex/RegexParser.cpp

@@ -28,7 +28,7 @@
 #include "RegexDebug.h"
 #include <AK/String.h>
 #include <AK/StringBuilder.h>
-#include <cstdio>
+#include <AK/StringUtils.h>
 
 namespace regex {
 
@@ -88,6 +88,26 @@ ALWAYS_INLINE bool Parser::consume(const String& str)
     return true;
 }
 
+// Tries to skip over the literal text `str`, which must begin with the current token's value;
+// the rest of `str` is matched character by character against the lexer.
+// On success the parser moves on to the next token and true is returned.
+// On a mismatch the characters taken from the lexer so far are given back and false is returned.
+ALWAYS_INLINE bool Parser::try_skip(StringView str)
+{
+    const auto token_value = m_parser_state.current_token.value();
+    if (!str.starts_with(token_value))
+        return false;
+
+    // The current token already covers the beginning of `str`; only the rest comes from the lexer.
+    const auto remainder = str.substring_view(token_value.length(), str.length() - token_value.length());
+
+    size_t consumed_from_lexer { 0 };
+    for (auto ch : remainder) {
+        if (!m_parser_state.lexer.try_skip(ch)) {
+            m_parser_state.lexer.back(consumed_from_lexer);
+            return false;
+        }
+        ++consumed_from_lexer;
+    }
+
+    m_parser_state.current_token = m_parser_state.lexer.next();
+    return true;
+}
+
 ALWAYS_INLINE void Parser::reset()
 {
     m_parser_state.bytecode.clear();
@@ -595,4 +615,762 @@ bool PosixExtendedParser::parse_root(ByteCode& stack, size_t& match_length_minim
     return !has_error();
 }
 
+// =============================
+// ECMA262 Parser
+// =============================
+
+// Entry point of the ECMA262 parser.
+// With the Unicode flag set, the pattern is parsed once with both `unicode` and `named` on.
+// Otherwise it is first parsed with `named` off into scratch bytecode; if that pass saw any
+// named capture group, the parser is reset and the pattern is parsed again with `named` on.
+bool ECMA262Parser::parse_internal(ByteCode& stack, size_t& match_length_minimum)
+{
+    if (m_parser_state.regex_options & AllFlags::Unicode)
+        return parse_pattern(stack, match_length_minimum, true, true);
+
+    ByteCode first_pass_bytecode;
+    size_t first_pass_min_length = 0;
+    auto first_pass_ok = parse_pattern(first_pass_bytecode, first_pass_min_length, false, false);
+
+    // The result of the first pass is thrown away when a second pass is needed.
+    if (m_parser_state.named_capture_groups_count > 0) {
+        reset();
+        return parse_pattern(stack, match_length_minimum, false, true);
+    }
+
+    if (!first_pass_ok)
+        return false;
+
+    stack.append(first_pass_bytecode);
+    match_length_minimum = first_pass_min_length;
+    return true;
+}
+
+// Pattern: forwards all arguments to parse_disjunction() and returns its result.
+bool ECMA262Parser::parse_pattern(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
+{
+    auto disjunction_ok = parse_disjunction(stack, match_length_minimum, unicode, named);
+    return disjunction_ok;
+}
+
+// Disjunction: one alternative, optionally followed by '|' and another disjunction.
+// Without a '|' the alternative's bytecode is appended as-is; with one, both sides are
+// combined through insert_bytecode_alternation() and the minimum match length becomes
+// the smaller of the two sides.
+bool ECMA262Parser::parse_disjunction(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
+{
+    ByteCode lhs_bytecode;
+    size_t lhs_min_length = 0;
+    if (!parse_alternative(lhs_bytecode, lhs_min_length, unicode, named))
+        return false;
+
+    if (match(TokenType::Pipe)) {
+        consume();
+
+        ByteCode rhs_bytecode;
+        size_t rhs_min_length = 0;
+        if (!parse_disjunction(rhs_bytecode, rhs_min_length, unicode, named))
+            return false;
+
+        stack.insert_bytecode_alternation(move(lhs_bytecode), move(rhs_bytecode));
+        match_length_minimum = min(lhs_min_length, rhs_min_length);
+        return true;
+    }
+
+    stack.append(lhs_bytecode);
+    match_length_minimum = lhs_min_length;
+    return true;
+}
+
+// Alternative: parses terms until the end of input is reached (returns true) or a term
+// fails to parse, in which case the result is whether no error has been recorded.
+bool ECMA262Parser::parse_alternative(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
+{
+    while (!match(TokenType::Eof)) {
+        if (!parse_term(stack, match_length_minimum, unicode, named))
+            return !has_error();
+    }
+
+    return true;
+}
+
+// Term: either an assertion, or an atom followed by an optional quantifier.
+// The atom is built in scratch bytecode so the quantifier can work on it before it is
+// appended to `stack`.
+bool ECMA262Parser::parse_term(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
+{
+    if (parse_assertion(stack, match_length_minimum, unicode, named))
+        return true;
+
+    ByteCode atom_bytecode;
+    size_t atom_min_length = 0;
+
+    // The quantifier is only attempted once the atom itself has parsed.
+    auto atom_ok = parse_atom(atom_bytecode, atom_min_length, unicode, named)
+        && parse_quantifier(atom_bytecode, atom_min_length, unicode, named);
+    if (!atom_ok)
+        return false;
+
+    stack.append(move(atom_bytecode));
+    match_length_minimum += atom_min_length;
+    return true;
+}
+
+// Assertion: '^', '$', '\b', '\B' and the four lookaround forms
+// '(?=...)', '(?!...)', '(?<=...)' and '(?<!...)'.
+// Returns true when an assertion was parsed and its bytecode was added to `stack`.
+bool ECMA262Parser::parse_assertion(ByteCode& stack, [[maybe_unused]] size_t& match_length_minimum, bool unicode, bool named)
+{
+    if (match(TokenType::Circumflex)) {
+        consume();
+        stack.empend((ByteCodeValueType)OpCodeId::CheckBegin);
+        return true;
+    }
+
+    if (match(TokenType::Dollar)) {
+        consume();
+        stack.empend((ByteCodeValueType)OpCodeId::CheckEnd);
+        return true;
+    }
+
+    if (try_skip("\\b")) {
+        stack.insert_bytecode_check_boundary(BoundaryCheckType::Word);
+        return true;
+    }
+
+    if (try_skip("\\B")) {
+        stack.insert_bytecode_check_boundary(BoundaryCheckType::NonWord);
+        return true;
+    }
+
+    // Everything that is left starts with '(?'.
+    if (!match(TokenType::LeftParen))
+        return false;
+
+    if (!try_skip("(?"))
+        return false;
+
+    ByteCode lookaround_bytecode;
+    size_t lookaround_min_length = 0;
+
+    // Parses the disjunction inside the lookaround and the closing ')'.
+    auto parse_lookaround_body = [&]() -> bool {
+        if (!parse_disjunction(lookaround_bytecode, lookaround_min_length, unicode, named))
+            return false;
+        consume(TokenType::RightParen, Error::MismatchingParen);
+        return true;
+    };
+
+    if (try_skip("=")) {
+        auto body_ok = parse_lookaround_body();
+        if (body_ok)
+            stack.insert_bytecode_lookaround(move(lookaround_bytecode), ByteCode::LookAroundType::LookAhead);
+        return body_ok;
+    }
+
+    if (try_skip("!")) {
+        auto body_ok = parse_lookaround_body();
+        if (body_ok)
+            stack.insert_bytecode_lookaround(move(lookaround_bytecode), ByteCode::LookAroundType::NegatedLookAhead);
+        return body_ok;
+    }
+
+    if (try_skip("<=")) {
+        auto body_ok = parse_lookaround_body();
+        // FIXME: Somehow ensure that this assertion regexp has a fixed length.
+        if (body_ok)
+            stack.insert_bytecode_lookaround(move(lookaround_bytecode), ByteCode::LookAroundType::LookBehind, lookaround_min_length);
+        return body_ok;
+    }
+
+    if (try_skip("<!")) {
+        auto body_ok = parse_lookaround_body();
+        if (body_ok)
+            stack.insert_bytecode_lookaround(move(lookaround_bytecode), ByteCode::LookAroundType::NegatedLookBehind, lookaround_min_length);
+        return body_ok;
+    }
+
+    // If none of these matched, put the '(?' back.
+    m_parser_state.lexer.back(3);
+    m_parser_state.current_token = m_parser_state.lexer.next();
+    return false;
+}
+
+// Consumes a run of Char tokens and converts the consumed text to a number.
+//  - initial_zero: whether the first token may be, must not be, or has to be "0".
+//  - follow_policy: DisallowDigit stops at a token that converts to a number,
+//    DisallowNonDigit stops at one that does not, Any never stops on that basis.
+//  - hex: convert with convert_to_uint_from_hex() instead of to_uint().
+//  - max_count: when positive, the maximum number of tokens to consume.
+// Returns an empty Optional if the first token is not a Char, the initial-zero rule is
+// violated, or the final conversion yields no value.
+Optional<unsigned> ECMA262Parser::read_digits(ECMA262Parser::ReadDigitsInitialZeroState initial_zero, ECMA262Parser::ReadDigitFollowPolicy follow_policy, bool hex, int max_count)
+{
+    if (!match(TokenType::Char))
+        return {};
+
+    if (initial_zero != ReadDigitsInitialZeroState::Allow) {
+        auto starts_with_zero = m_parser_state.current_token.value() == "0";
+        auto zero_is_required = initial_zero == ReadDigitsInitialZeroState::Require;
+        // Disallow rejects a leading zero, Require rejects everything else.
+        if (starts_with_zero != zero_is_required)
+            return {};
+    }
+
+    auto converts_to_number = [&](const auto& text) -> bool {
+        if (hex)
+            return AK::StringUtils::convert_to_uint_from_hex(text).has_value();
+        return text.to_uint().has_value();
+    };
+
+    int consumed_tokens = 0;
+    size_t consumed_length = 0;
+    while (match(TokenType::Char)) {
+        auto text = m_parser_state.current_token.value();
+
+        if (follow_policy == ReadDigitFollowPolicy::DisallowDigit && converts_to_number(text))
+            break;
+
+        if (follow_policy == ReadDigitFollowPolicy::DisallowNonDigit && !converts_to_number(text))
+            break;
+
+        if (max_count > 0 && consumed_tokens >= max_count)
+            break;
+
+        consumed_length += consume().value().length();
+        ++consumed_tokens;
+    }
+
+    // Convert everything consumed above in one go.
+    auto digits = m_parser_state.lexer.slice_back(consumed_length);
+    if (hex)
+        return AK::StringUtils::convert_to_uint_from_hex(digits);
+
+    return digits.to_uint();
+}
+
+// Quantifier: '*', '+', '?' or '{min[,max]}', optionally followed by '?' (ungreedy).
+// `stack` holds the bytecode of the atom being quantified and is passed to the matching
+// insert_bytecode_repetition_* helper; `match_length_minimum` is that atom's minimum length
+// and is adjusted for the repetition.
+// Returns true if there is no quantifier at all or it was parsed successfully, and false
+// (with an error set) for malformed input.
+// NOTE(review): ECMA-262 also allows '{n,}' (no upper bound) and a lazy '{n,m}?'; both are
+// reported as errors here - confirm whether that is intended.
+bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minimum, bool, bool)
+{
+    enum class Repetition {
+        OneOrMore,
+        ZeroOrMore,
+        Optional,
+        Explicit,
+        None,
+    } repetition_mark { Repetition::None };
+
+    bool ungreedy = false;
+    Optional<size_t> repeat_min, repeat_max;
+
+    if (match(TokenType::Asterisk)) {
+        consume();
+        repetition_mark = Repetition::ZeroOrMore;
+    } else if (match(TokenType::Plus)) {
+        consume();
+        repetition_mark = Repetition::OneOrMore;
+    } else if (match(TokenType::Questionmark)) {
+        consume();
+        repetition_mark = Repetition::Optional;
+    } else if (match(TokenType::LeftCurly)) {
+        consume();
+        repetition_mark = Repetition::Explicit;
+
+        auto low_bound = read_digits();
+
+        if (!low_bound.has_value()) {
+            set_error(Error::InvalidBraceContent);
+            return false;
+        }
+
+        repeat_min = low_bound.value();
+
+        if (match(TokenType::Comma)) {
+            consume();
+            auto high_bound = read_digits();
+            if (!high_bound.has_value()) {
+                set_error(Error::InvalidBraceContent);
+                return false;
+            }
+
+            repeat_max = high_bound.value();
+        }
+
+        if (!match(TokenType::RightCurly)) {
+            set_error(Error::MismatchingBrace);
+            return false;
+        }
+        consume();
+
+        // A range whose lower bound exceeds its upper bound is invalid. Stop right away like
+        // every other error path does, instead of emitting bytecode for a broken range.
+        if (repeat_max.has_value() && repeat_min.value() > repeat_max.value()) {
+            set_error(Error::InvalidBraceContent);
+            return false;
+        }
+    } else {
+        // No quantifier follows the atom.
+        return true;
+    }
+
+    if (match(TokenType::Questionmark)) {
+        if (repetition_mark == Repetition::Explicit) {
+            set_error(Error::InvalidRepetitionMarker);
+            return false;
+        }
+        consume();
+        ungreedy = true;
+    }
+
+    ByteCode new_bytecode;
+    switch (repetition_mark) {
+    case Repetition::OneOrMore:
+        new_bytecode.insert_bytecode_repetition_min_one(stack, !ungreedy);
+        break;
+    case Repetition::ZeroOrMore:
+        new_bytecode.insert_bytecode_repetition_any(stack, !ungreedy);
+        match_length_minimum = 0;
+        break;
+    case Repetition::Optional:
+        new_bytecode.insert_bytecode_repetition_zero_or_one(stack, !ungreedy);
+        match_length_minimum = 0;
+        break;
+    case Repetition::Explicit:
+        new_bytecode.insert_bytecode_repetition_min_max(stack, repeat_min.value(), repeat_max);
+        match_length_minimum *= repeat_min.value();
+        break;
+    case Repetition::None:
+        ASSERT_NOT_REACHED();
+    }
+
+    return true;
+}
+
+// Atom: an escape, a character class, a group, '.', or a single plain character.
+// Returns false without setting an error when the current token cannot start an atom
+// (for example a quantifier or a closing bracket); any other unexpected token is reported
+// as InvalidPattern.
+bool ECMA262Parser::parse_atom(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
+{
+    // AtomEscape.
+    if (try_skip("\\"))
+        return parse_atom_escape(stack, match_length_minimum, unicode, named);
+
+    // Character class.
+    if (match(TokenType::LeftBracket))
+        return parse_character_class(stack, match_length_minimum, unicode, named);
+
+    // Non-capturing group, or a capture group.
+    if (match(TokenType::LeftParen))
+        return parse_capture_group(stack, match_length_minimum, unicode, named);
+
+    if (match(TokenType::Period)) {
+        consume();
+        match_length_minimum += 1;
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::AnyChar, 0 } });
+        return true;
+    }
+
+    // None of these tokens can start an atom; leave them for the caller.
+    const TokenType non_atom_tokens[] = {
+        TokenType::Circumflex,
+        TokenType::Dollar,
+        TokenType::RightBracket,
+        TokenType::RightCurly,
+        TokenType::RightParen,
+        TokenType::Pipe,
+        TokenType::Plus,
+        TokenType::Asterisk,
+        TokenType::Questionmark,
+    };
+    for (auto token_type : non_atom_tokens) {
+        if (match(token_type))
+            return false;
+    }
+
+    if (!match(TokenType::Char)) {
+        set_error(Error::InvalidPattern);
+        return false;
+    }
+
+    auto character = consume().value();
+    match_length_minimum += 1;
+    stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)character[0] } });
+    return true;
+}
+
+// AtomEscape: everything that may follow a backslash outside of a character class.
+// The caller has already skipped the backslash itself. Handles, in this order: numbered
+// backreferences, control escapes (\f \n \r \t \v), control letters (\cX), \0, hex escapes
+// (\xHH), identity escapes, named backreferences (\k<name>, only when `named` is set) and
+// character class escapes (\d \D \s \S \w \W). Unicode escapes and properties are TODO.
+// Returns true and appends a compare to `stack` on success; returns false with an error set
+// otherwise.
+bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
+{
+    // Numbered backreference; the referenced group has to be known already.
+    if (auto escape = read_digits(ReadDigitsInitialZeroState::Disallow, ReadDigitFollowPolicy::DisallowNonDigit); escape.has_value()) {
+        auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(escape.value());
+        if (!maybe_length.has_value()) {
+            set_error(Error::InvalidNumber);
+            return false;
+        }
+        match_length_minimum += maybe_length.value();
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)escape.value() } });
+        return true;
+    }
+
+    // CharacterEscape > ControlEscape
+    struct ControlEscape {
+        char letter;
+        char value;
+    };
+    const ControlEscape control_escapes[] = {
+        { 'f', '\f' },
+        { 'n', '\n' },
+        { 'r', '\r' },
+        { 't', '\t' },
+        { 'v', '\v' },
+    };
+    for (auto& control_escape : control_escapes) {
+        if (try_skip({ &control_escape.letter, 1 })) {
+            match_length_minimum += 1;
+            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)control_escape.value } });
+            return true;
+        }
+    }
+
+    // CharacterEscape > ControlLetter
+    // NOTE(review): the range 'A'..'z' also covers the six non-letters between 'Z' and 'a',
+    // while ECMA-262 only allows letters here - confirm whether that is intended.
+    if (try_skip("c")) {
+        for (auto c = 'A'; c <= 'z'; ++c) {
+            if (try_skip({ &c, 1 })) {
+                match_length_minimum += 1;
+                // The control character is the letter's code point modulo 32, so that
+                // "\cA" and "\ca" both yield 0x01.
+                stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)(c % 32) } });
+                return true;
+            }
+        }
+    }
+
+    // '\0'
+    if (read_digits(ReadDigitsInitialZeroState::Require, ReadDigitFollowPolicy::DisallowDigit).has_value()) {
+        match_length_minimum += 1;
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } });
+        return true;
+    }
+
+    // HexEscape
+    if (try_skip("x")) {
+        if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 2); hex_escape.has_value()) {
+            match_length_minimum += 1;
+            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)hex_escape.value() } });
+            return true;
+        }
+    }
+
+    if (try_skip("u")) {
+        // FIXME: Implement this path, unicode escape sequence.
+        TODO();
+    }
+
+    // IdentityEscape: the escaped character stands for itself.
+    if (match(TokenType::EscapeSequence)) {
+        match_length_minimum += 1;
+        auto token = consume().value();
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token[token.length() - 1] } });
+        return true;
+    }
+
+    // Named backreference; the referenced group has to be known already.
+    if (named && try_skip("k")) {
+        auto name = read_capture_group_specifier(true);
+        if (name.is_empty()) {
+            set_error(Error::InvalidNameForCaptureGroup);
+            return false;
+        }
+        auto maybe_length = m_parser_state.named_capture_group_minimum_lengths.get(name);
+        if (!maybe_length.has_value()) {
+            set_error(Error::InvalidNameForCaptureGroup);
+            return false;
+        }
+        match_length_minimum += maybe_length.value();
+
+        stack.insert_bytecode_compare_named_reference(name, name.length());
+        return true;
+    }
+
+    if (unicode) {
+        if (try_skip("p{")) {
+            // FIXME: Implement this path, Unicode property match.
+            TODO();
+        }
+        if (try_skip("P{")) {
+            // FIXME: Implement this path, Unicode property match.
+            TODO();
+        }
+    }
+
+    // CharacterClassEscape; the uppercase forms add an Inverse compare in front.
+    bool negate = false;
+    auto ch = parse_character_class_escape(negate);
+    if (!ch.has_value()) {
+        set_error(Error::InvalidCharacterClass);
+        return false;
+    }
+
+    Vector<CompareTypeAndValuePair> compares;
+    if (negate)
+        compares.empend(CharacterCompareType::Inverse, 0);
+    compares.empend(CharacterCompareType::CharClass, (ByteCodeValueType)ch.value());
+    match_length_minimum += 1;
+    stack.insert_bytecode_compare_values(move(compares));
+    return true;
+}
+
+// CharacterClassEscape: one of d, D, s, S, w, W (preceded by a backslash when
+// `expect_backslash` is set). Returns the matching CharClass; for the uppercase forms
+// `negate` is additionally set to true (it is never reset to false here). Returns an empty
+// Optional if nothing matched.
+Optional<CharClass> ECMA262Parser::parse_character_class_escape(bool& negate, bool expect_backslash)
+{
+    if (expect_backslash && !try_skip("\\"))
+        return {};
+
+    struct ClassEscape {
+        char letter;
+        CharClass character_class;
+        bool is_negated;
+    };
+    const ClassEscape class_escapes[] = {
+        { 'd', CharClass::Digit, false },
+        { 'D', CharClass::Digit, true },
+        { 's', CharClass::Space, false },
+        { 'S', CharClass::Space, true },
+        { 'w', CharClass::Word, false },
+        { 'W', CharClass::Word, true },
+    };
+
+    for (auto& class_escape : class_escapes) {
+        if (!try_skip({ &class_escape.letter, 1 }))
+            continue;
+
+        if (class_escape.is_negated)
+            negate = true;
+        return class_escape.character_class;
+    }
+
+    return {};
+}
+
+// CharacterClass: '[' ['^'] ClassRanges ']'.
+// A leading '^' adds an Inverse compare in front of the ranges. The ranges and the closing
+// bracket are handled by parse_nonempty_class_ranges().
+// NOTE(review): an empty class ("[]" or "[^]") returns true without emitting any compare -
+// confirm that this is the intended matching behavior.
+bool ECMA262Parser::parse_character_class(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool)
+{
+    consume(TokenType::LeftBracket, Error::InvalidPattern);
+
+    Vector<CompareTypeAndValuePair> class_compares;
+
+    // Negated charclass
+    if (match(TokenType::Circumflex)) {
+        consume();
+        class_compares.empend(CharacterCompareType::Inverse, 0);
+    }
+
+    if (match(TokenType::RightBracket)) {
+        consume();
+        return true;
+    }
+
+    auto ranges_ok = parse_nonempty_class_ranges(class_compares, unicode);
+    if (ranges_ok) {
+        match_length_minimum += 1;
+        stack.insert_bytecode_compare_values(move(class_compares));
+    }
+    return ranges_ok;
+}
+
+// One parsed class atom: either a single code point or a character class escape.
+// `is_character_class` tells which member of the union is in use.
+struct CharClassRangeElement {
+    union {
+        CharClass character_class;
+        u32 code_point { 0 };
+    };
+
+    bool is_negated { false };
+    bool is_character_class { false };
+};
+
+// NonemptyClassRanges: parses the contents of a character class up to and including the
+// closing ']' and appends one compare per atom or range to `ranges`.
+// Returns false if an atom could not be read or a range is invalid (a character class used
+// as a range bound, or bounds in descending order).
+bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& ranges, bool unicode)
+{
+    auto read_class_atom_no_dash = [&]() -> Optional<CharClassRangeElement> {
+        if (match(TokenType::EscapeSequence)) {
+            auto token = consume().value();
+            return { { .code_point = (u32)token[1], .is_character_class = false } };
+        }
+
+        if (try_skip("\\")) {
+            if (try_skip("f"))
+                return { { .code_point = '\f', .is_character_class = false } };
+            if (try_skip("n"))
+                return { { .code_point = '\n', .is_character_class = false } };
+            if (try_skip("r"))
+                return { { .code_point = '\r', .is_character_class = false } };
+            if (try_skip("t"))
+                return { { .code_point = '\t', .is_character_class = false } };
+            if (try_skip("v"))
+                return { { .code_point = '\v', .is_character_class = false } };
+            if (try_skip("b"))
+                return { { .code_point = '\b', .is_character_class = false } };
+
+            // CharacterEscape > ControlLetter
+            if (try_skip("c")) {
+                for (auto c = 'A'; c <= 'z'; ++c) {
+                    // The control character is the letter's code point modulo 32, so that
+                    // "\cA" and "\ca" both yield 0x01.
+                    if (try_skip({ &c, 1 }))
+                        return { { .code_point = (u32)(c % 32), .is_character_class = false } };
+                }
+            }
+
+            // '\0'
+            if (read_digits(ReadDigitsInitialZeroState::Require, ReadDigitFollowPolicy::DisallowDigit).has_value())
+                return { { .code_point = 0, .is_character_class = false } };
+
+            // HexEscape
+            if (try_skip("x")) {
+                if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 2); hex_escape.has_value())
+                    return { { .code_point = hex_escape.value(), .is_character_class = false } };
+            }
+
+            if (try_skip("u")) {
+                // FIXME: Implement this path, unicode escape sequence.
+                TODO();
+            }
+
+            if (unicode) {
+                if (try_skip("-"))
+                    return { { .code_point = '-', .is_character_class = false } };
+            }
+
+            if (try_skip("p{") || try_skip("P{")) {
+                // FIXME: Implement these; unicode properties.
+                TODO();
+            }
+
+            if (try_skip("d"))
+                return { { .character_class = CharClass::Digit, .is_character_class = true } };
+            if (try_skip("s"))
+                return { { .character_class = CharClass::Space, .is_character_class = true } };
+            if (try_skip("w"))
+                return { { .character_class = CharClass::Word, .is_character_class = true } };
+            if (try_skip("D"))
+                return { { .character_class = CharClass::Digit, .is_negated = true, .is_character_class = true } };
+            if (try_skip("S"))
+                return { { .character_class = CharClass::Space, .is_negated = true, .is_character_class = true } };
+            if (try_skip("W"))
+                return { { .character_class = CharClass::Word, .is_negated = true, .is_character_class = true } };
+        }
+
+        if (match(TokenType::RightBracket) || match(TokenType::HyphenMinus))
+            return {};
+
+        // Only a plain character is accepted at this point. Report anything else (including
+        // the end of an unterminated class) and stop, rather than consuming the token and
+        // indexing into its text as if it were a character.
+        if (!match(TokenType::Char)) {
+            set_error(Error::InvalidCharacterClass);
+            return {};
+        }
+
+        auto token = consume();
+        return { { .code_point = (u32)token.value()[0], .is_character_class = false } };
+    };
+    auto read_class_atom = [&]() -> Optional<CharClassRangeElement> {
+        if (match(TokenType::HyphenMinus)) {
+            consume();
+            return { { .code_point = '-', .is_character_class = false } };
+        }
+
+        return read_class_atom_no_dash();
+    };
+    // Appends the compare(s) for an atom that is not part of a range.
+    auto append_single_atom = [&](const CharClassRangeElement& atom) {
+        if (atom.is_character_class) {
+            if (atom.is_negated)
+                ranges.empend(CharacterCompareType::TemporaryInverse, 0);
+            ranges.empend(CharacterCompareType::CharClass, (ByteCodeValueType)atom.character_class);
+        } else {
+            ASSERT(!atom.is_negated);
+            ranges.empend(CharacterCompareType::Char, atom.code_point);
+        }
+    };
+
+    while (!match(TokenType::RightBracket)) {
+        auto first_atom = read_class_atom();
+        if (!first_atom.has_value())
+            return false;
+
+        if (match(TokenType::HyphenMinus)) {
+            consume();
+
+            // A '-' directly in front of the closing ']' (as in "[a-]") does not start a
+            // range; it is a literal hyphen following the previous atom.
+            if (match(TokenType::RightBracket)) {
+                append_single_atom(first_atom.value());
+                ranges.empend(CharacterCompareType::Char, (u32)'-');
+                continue;
+            }
+
+            auto second_atom = read_class_atom();
+            if (!second_atom.has_value())
+                return false;
+
+            // Character classes cannot be range bounds.
+            if (first_atom.value().is_character_class || second_atom.value().is_character_class) {
+                set_error(Error::InvalidRange);
+                return false;
+            }
+
+            if (first_atom.value().code_point > second_atom.value().code_point) {
+                set_error(Error::InvalidRange);
+                return false;
+            }
+
+            ASSERT(!first_atom.value().is_negated);
+            ASSERT(!second_atom.value().is_negated);
+
+            ranges.empend(CharacterCompareType::CharRange, CharRange { first_atom.value().code_point, second_atom.value().code_point });
+            continue;
+        }
+
+        append_single_atom(first_atom.value());
+    }
+
+    consume(TokenType::RightBracket, Error::MismatchingBracket);
+
+    return true;
+}
+
+// Reads a capture group name up to and including the closing '>'.
+// With `take_starting_angle_bracket` set, the opening '<' is consumed first and an empty
+// view is returned if it is missing. InvalidNameForCaptureGroup is set when the closing
+// '>' is missing or the name is empty; the name read so far is returned either way.
+StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
+{
+    if (take_starting_angle_bracket) {
+        if (!consume("<"))
+            return {};
+    }
+
+    size_t name_length = 0;
+    for (;;) {
+        if (!match(TokenType::Char))
+            break;
+        if (m_parser_state.current_token.value() == ">")
+            break;
+        name_length += consume().value().length();
+    }
+
+    auto group_name = m_parser_state.lexer.slice_back(name_length);
+
+    auto has_closing_bracket = consume(">");
+    if (!has_closing_bracket || group_name.is_empty())
+        set_error(Error::InvalidNameForCaptureGroup);
+
+    return group_name;
+}
+
+// Parses a parenthesized group: a numbered capture group '(...)', a non-capturing group
+// '(?:...)' or a named capture group '(?<name>...)'.
+// The minimum length of every capture group is recorded in the parser state, keyed by its
+// index or name. Any other '(?' form is reported as InvalidCaptureGroup.
+bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
+{
+    consume(TokenType::LeftParen, Error::InvalidPattern);
+
+    if (!match(TokenType::Questionmark)) {
+        // Numbered capture group.
+        auto group_index = ++m_parser_state.capture_groups_count;
+        stack.insert_bytecode_group_capture_left(group_index);
+
+        ByteCode group_bytecode;
+        size_t group_min_length = 0;
+        if (!parse_disjunction(group_bytecode, group_min_length, unicode, named))
+            return set_error(Error::InvalidPattern);
+
+        stack.append(move(group_bytecode));
+        m_parser_state.capture_group_minimum_lengths.set(group_index, group_min_length);
+
+        consume(TokenType::RightParen, Error::MismatchingParen);
+        stack.insert_bytecode_group_capture_right(group_index);
+
+        match_length_minimum += group_min_length;
+        return true;
+    }
+
+    // Non-capturing group or group with specifier.
+    consume();
+
+    if (match(TokenType::Colon)) {
+        consume();
+
+        ByteCode group_bytecode;
+        size_t group_min_length = 0;
+        if (!parse_disjunction(group_bytecode, group_min_length, unicode, named))
+            return set_error(Error::InvalidPattern);
+
+        consume(TokenType::RightParen, Error::MismatchingParen);
+
+        stack.append(move(group_bytecode));
+        match_length_minimum += group_min_length;
+        return true;
+    }
+
+    if (consume("<")) {
+        ++m_parser_state.named_capture_groups_count;
+
+        auto group_name = read_capture_group_specifier();
+        if (group_name.is_empty()) {
+            set_error(Error::InvalidNameForCaptureGroup);
+            return false;
+        }
+
+        ByteCode group_bytecode;
+        size_t group_min_length = 0;
+        if (!parse_disjunction(group_bytecode, group_min_length, unicode, named))
+            return set_error(Error::InvalidPattern);
+
+        consume(TokenType::RightParen, Error::MismatchingParen);
+
+        stack.insert_bytecode_group_capture_left(group_name);
+        stack.append(move(group_bytecode));
+        stack.insert_bytecode_group_capture_right(group_name);
+
+        match_length_minimum += group_min_length;
+        m_parser_state.named_capture_group_minimum_lengths.set(group_name, group_min_length);
+        return true;
+    }
+
+    set_error(Error::InvalidCaptureGroup);
+    return false;
+}
 }

+ 56 - 0
Libraries/LibRegex/RegexParser.h

@@ -39,6 +39,7 @@
 namespace regex {
 
 class PosixExtendedParser;
+class ECMA262Parser;
 
 template<typename T>
 struct GenericParserTraits {
@@ -53,6 +54,10 @@ template<>
 struct ParserTraits<PosixExtendedParser> : public GenericParserTraits<PosixOptions> {
 };
 
+// Traits for ECMA262Parser: derives from GenericParserTraits instantiated with ECMAScriptOptions.
+template<>
+struct ParserTraits<ECMA262Parser> : public GenericParserTraits<ECMAScriptOptions> {
+};
+
 class Parser {
 public:
     struct Result {
@@ -88,6 +93,7 @@ protected:
     ALWAYS_INLINE Token consume();
     ALWAYS_INLINE Token consume(TokenType type, Error error);
     ALWAYS_INLINE bool consume(const String&);
+    ALWAYS_INLINE bool try_skip(StringView);
     ALWAYS_INLINE void reset();
     ALWAYS_INLINE bool done() const;
     ALWAYS_INLINE bool set_error(Error error);
@@ -102,6 +108,10 @@ protected:
         size_t named_capture_groups_count { 0 };
         size_t match_length_minimum { 0 };
         AllOptions regex_options;
+        HashMap<int, size_t> capture_group_minimum_lengths;
+        HashMap<FlyString, size_t> named_capture_group_minimum_lengths;
+        HashMap<size_t, FlyString> named_capture_groups;
+
         explicit ParserState(Lexer& lexer)
             : lexer(lexer)
             , current_token(lexer.next())
@@ -144,8 +154,54 @@ private:
     ALWAYS_INLINE bool parse_repetition_symbol(ByteCode&, size_t&);
 };
 
+// Parser for ECMA262 regular expression syntax.
+class ECMA262Parser final : public Parser {
+public:
+    explicit ECMA262Parser(Lexer& lexer)
+        : Parser(lexer)
+    {
+    }
+
+    // Missing options fall back to a default-constructed value.
+    ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
+        : Parser(lexer, regex_options.value_or({}))
+    {
+    }
+
+    ~ECMA262Parser() = default;
+
+private:
+    // How read_digits() treats a "0" as the very first token.
+    enum class ReadDigitsInitialZeroState {
+        Allow,
+        Disallow,
+        Require,
+    };
+    // Which kind of token makes read_digits() stop reading.
+    enum class ReadDigitFollowPolicy {
+        Any,
+        DisallowDigit,
+        DisallowNonDigit,
+    };
+
+    bool parse_internal(ByteCode&, size_t&) override;
+
+    // One function per grammar production; each adds to the given bytecode and minimum match length.
+    bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_alternative(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_term(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_assertion(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_atom(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_quantifier(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_atom_escape(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_character_class(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);
+    bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, bool unicode);
+    Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
+
+    // Lower-level readers used by the productions above.
+    Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy follow_policy = ReadDigitFollowPolicy::Any, bool hex = false, int max_count = -1);
+    StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
+};
+
 using PosixExtended = PosixExtendedParser;
+using ECMA262 = ECMA262Parser;
 
 }
 
+using regex::ECMA262;
 using regex::PosixExtended;

+ 75 - 0
Libraries/LibRegex/Tests/Regex.cpp

@@ -472,4 +472,79 @@ TEST_CASE(simple_period_end_benchmark)
     EXPECT_EQ(re.search("hello?", m), true);
 }
 
+TEST_CASE(ECMA262_parse)
+{
+    constexpr const char* patterns[] {
+        "^hello.$",
+        "^(hello.)$",
+        "^h{0,1}ello.$",
+        "^hello\\W$",
+        "^hell\\w.$",
+        "^hell\\x6f1$", // ^hello1$
+        "^hel(?:l\\w).$",
+        "^hel(?<LO>l\\w).$",
+        "^[-a-zA-Z\\w\\s]+$",
+        "\\bhello\\B",
+    };
+
+    for (auto& pattern : patterns) {
+        Regex<ECMA262> re(pattern);
+        EXPECT_EQ(re.parser_result.error, Error::NoError);
+#ifdef REGEX_DEBUG
+        dbg() << "\n";
+        RegexDebug regex_dbg(stderr);
+        regex_dbg.print_raw_bytecode(re);
+        regex_dbg.print_header();
+        regex_dbg.print_bytecode(re);
+        dbg() << "\n";
+#endif
+    }
+}
+
+TEST_CASE(ECMA262_match)
+{
+    struct _test {
+        const char* pattern;
+        const char* subject;
+        bool matches { true };
+        ECMAScriptFlags options {};
+    };
+
+    constexpr _test tests[] {
+        { "^hello.$", "hello1" },
+        { "^(hello.)$", "hello1" },
+        { "^h{0,1}ello.$", "ello1" },
+        { "^hello\\W$", "hello!" },
+        { "^hell\\w.$", "hellx!" },
+        { "^hell\\x6f1$", "hello1" },
+        { "^hel(?<LO>l.)1$", "hello1" },
+        { "^hel(?<LO>l.)1*\\k<LO>.$", "hello1lo1" },
+        { "^[-a-z1-3\\s]+$", "hell2 o1" },
+        { .pattern = "\\bhello\\B", .subject = "hello1", .options = ECMAScriptFlags::Global },
+        { "\\b.*\\b", "hello1" },
+        { "[^\\D\\S]{2}", "1 " },
+        { "bar(?=f.)foo", "barfoo" },
+        { "bar(?=foo)bar", "barbar", false },
+        { "bar(?!foo)bar", "barbar", true },
+        { "bar(?!bar)bar", "barbar", false },
+        { "bar.*(?<=foo)", "barbar", false },
+        { "bar.*(?<!foo)", "barbar", true },
+        { "((...)X)+", "fooXbarXbazX", true },
+    };
+
+    for (auto& test : tests) {
+        Regex<ECMA262> re(test.pattern, test.options);
+#ifdef REGEX_DEBUG
+        dbg() << "\n";
+        RegexDebug regex_dbg(stderr);
+        regex_dbg.print_raw_bytecode(re);
+        regex_dbg.print_header();
+        regex_dbg.print_bytecode(re);
+        dbg() << "\n";
+#endif
+        EXPECT_EQ(re.parser_result.error, Error::NoError);
+        EXPECT_EQ(re.match(test.subject).success, test.matches);
+    }
+}
+
 TEST_MAIN(Regex)