mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-25 00:50:22 +00:00
LibRegex: Implement an ECMA262-compatible parser
This also adds support for lookarounds and individually-negated comparisons. The only unimplemented part of the parser spec is the unicode stuff.
This commit is contained in:
parent
3200ff5f4f
commit
dbef2b1ee9
Notes:
sideshowbarker
2024-07-19 01:14:40 +09:00
Author: https://github.com/alimpfard Commit: https://github.com/SerenityOS/serenity/commit/dbef2b1ee9d Pull-request: https://github.com/SerenityOS/serenity/pull/4103 Reviewed-by: https://github.com/linusg ✅
11 changed files with 1321 additions and 25 deletions
|
@ -32,6 +32,7 @@ namespace regex {
|
|||
enum class Error : u8;
|
||||
class Lexer;
|
||||
class PosixExtendedParser;
|
||||
class ECMA262Parser;
|
||||
|
||||
class ByteCode;
|
||||
class OpCode;
|
||||
|
@ -50,6 +51,7 @@ class OpCode_Compare;
|
|||
class RegexStringView;
|
||||
}
|
||||
|
||||
using regex::ECMA262Parser;
|
||||
using regex::Error;
|
||||
using regex::Lexer;
|
||||
using regex::PosixExtendedParser;
|
||||
|
|
|
@ -65,6 +65,20 @@ const char* execution_result_name(ExecutionResult result)
|
|||
}
|
||||
}
|
||||
|
||||
// Maps a BoundaryCheckType enumerator to its spelled-out name (e.g. "Word").
// The cases are generated from ENUMERATE_BOUNDARY_CHECK_TYPES, so any
// enumerator added to that list is picked up here automatically.
const char* boundary_check_type_name(BoundaryCheckType ty)
{
    switch (ty) {
#define __ENUMERATE_BOUNDARY_CHECK_TYPE(x) \
    case BoundaryCheckType::x:             \
        return #x;
        ENUMERATE_BOUNDARY_CHECK_TYPES
#undef __ENUMERATE_BOUNDARY_CHECK_TYPE
    default:
        break;
    }

    // Only reachable when `ty` holds a value that is not a known enumerator.
    ASSERT_NOT_REACHED();
    return "<Unknown>";
}
|
||||
|
||||
const char* character_compare_type_name(CharacterCompareType ch_compare_type)
|
||||
{
|
||||
switch (ch_compare_type) {
|
||||
|
@ -112,12 +126,27 @@ ALWAYS_INLINE OpCode* ByteCode::get_opcode_by_id(OpCodeId id) const
|
|||
case OpCodeId::CheckEnd:
|
||||
s_opcodes.set(i, make<OpCode_CheckEnd>(*const_cast<ByteCode*>(this)));
|
||||
break;
|
||||
case OpCodeId::CheckBoundary:
|
||||
s_opcodes.set(i, make<OpCode_CheckBoundary>(*const_cast<ByteCode*>(this)));
|
||||
break;
|
||||
case OpCodeId::ForkJump:
|
||||
s_opcodes.set(i, make<OpCode_ForkJump>(*const_cast<ByteCode*>(this)));
|
||||
break;
|
||||
case OpCodeId::ForkStay:
|
||||
s_opcodes.set(i, make<OpCode_ForkStay>(*const_cast<ByteCode*>(this)));
|
||||
break;
|
||||
case OpCodeId::FailForks:
|
||||
s_opcodes.set(i, make<OpCode_FailForks>(*const_cast<ByteCode*>(this)));
|
||||
break;
|
||||
case OpCodeId::Save:
|
||||
s_opcodes.set(i, make<OpCode_Save>(*const_cast<ByteCode*>(this)));
|
||||
break;
|
||||
case OpCodeId::Restore:
|
||||
s_opcodes.set(i, make<OpCode_Restore>(*const_cast<ByteCode*>(this)));
|
||||
break;
|
||||
case OpCodeId::GoBack:
|
||||
s_opcodes.set(i, make<OpCode_GoBack>(*const_cast<ByteCode*>(this)));
|
||||
break;
|
||||
case OpCodeId::CheckBegin:
|
||||
s_opcodes.set(i, make<OpCode_CheckBegin>(*const_cast<ByteCode*>(this)));
|
||||
break;
|
||||
|
@ -166,6 +195,38 @@ ALWAYS_INLINE ExecutionResult OpCode_Exit::execute(const MatchInput& input, Matc
|
|||
return ExecutionResult::Failed;
|
||||
}
|
||||
|
||||
// SAVE: pushes the current string position onto the saved-position stack so
// that a later RESTORE can rewind to it.
ALWAYS_INLINE ExecutionResult OpCode_Save::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
    // `saved_positions` is a mutable member of MatchInput, which is why it can
    // be modified through the const reference.
    auto& position_stack = input.saved_positions;
    position_stack.append(state.string_position);
    return ExecutionResult::Continue;
}
|
||||
|
||||
// RESTORE: pops the most recently saved string position and makes it the
// current one. Fails outright if nothing has been saved.
ALWAYS_INLINE ExecutionResult OpCode_Restore::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
    auto& position_stack = input.saved_positions;
    if (!position_stack.is_empty()) {
        state.string_position = position_stack.take_last();
        return ExecutionResult::Continue;
    }

    return ExecutionResult::Failed;
}
|
||||
|
||||
// GOBACK n: moves the string position `count()` characters backwards.
// If fewer than `count()` characters precede the current position the opcode
// fails (letting lower-priority forks run) instead of underflowing.
ALWAYS_INLINE ExecutionResult OpCode_GoBack::execute(const MatchInput&, MatchState& state, MatchOutput&) const
{
    const auto distance = count();
    if (distance <= state.string_position) {
        state.string_position -= distance;
        return ExecutionResult::Continue;
    }

    return ExecutionResult::Failed_ExecuteLowPrioForks;
}
|
||||
|
||||
// FAIL n: fails `count()` forks in total.
// This opcode's own result accounts for one failure; the remaining
// `count() - 1` are recorded in `input.fail_counter`, which the matcher loop
// consumes by failing subsequent opcode executions until it reaches zero.
ALWAYS_INLINE ExecutionResult OpCode_FailForks::execute(const MatchInput& input, MatchState&, MatchOutput&) const
{
    const auto forks_to_fail = count();
    ASSERT(forks_to_fail > 0);

    input.fail_counter += forks_to_fail - 1;
    return ExecutionResult::Failed_ExecuteLowPrioForks;
}
|
||||
|
||||
ALWAYS_INLINE ExecutionResult OpCode_Jump::execute(const MatchInput&, MatchState& state, MatchOutput&) const
|
||||
{
|
||||
|
||||
|
@ -198,6 +259,40 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckBegin::execute(const MatchInput& input
|
|||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
}
|
||||
|
||||
// CHECKBOUNDARY <type>: zero-width assertion for `\b` (Word) and `\B` (NonWord).
// Continues when the current position satisfies the requested boundary kind,
// otherwise fails and lets lower-priority forks run. Never consumes input.
ALWAYS_INLINE ExecutionResult OpCode_CheckBoundary::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
    // Word characters are exactly [A-Za-z0-9_]. This is spelled out rather than
    // using isalnum(), which depends on the current locale and has undefined
    // behaviour for arguments that are not representable as unsigned char
    // (negative chars, or code units above 0xff).
    auto isword = [](auto ch) {
        return (ch >= 'a' && ch <= 'z')
            || (ch >= 'A' && ch <= 'Z')
            || (ch >= '0' && ch <= '9')
            || ch == '_';
    };

    auto is_word_boundary = [&] {
        // At the end of the input: a boundary only if the last character is a word character.
        // (This branch also covers the empty input, where position == length == 0.)
        if (state.string_position == input.view.length())
            return state.string_position > 0 && isword(input.view[state.string_position - 1]);

        // At the start of a non-empty input: a boundary only if the first character is a word character.
        if (state.string_position == 0)
            return isword(input.view[0]);

        // In the middle: a boundary exists where exactly one neighbour is a word character.
        return isword(input.view[state.string_position]) != isword(input.view[state.string_position - 1]);
    };

    switch (type()) {
    case BoundaryCheckType::Word: {
        if (is_word_boundary())
            return ExecutionResult::Continue;
        return ExecutionResult::Failed_ExecuteLowPrioForks;
    }
    case BoundaryCheckType::NonWord: {
        if (!is_word_boundary())
            return ExecutionResult::Continue;
        return ExecutionResult::Failed_ExecuteLowPrioForks;
    }
    }

    // The bytecode carried a boundary type that is not a known enumerator.
    ASSERT_NOT_REACHED();
}
|
||||
|
||||
ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
|
||||
{
|
||||
if (state.string_position == input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine))
|
||||
|
@ -293,9 +388,13 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(const M
|
|||
return ExecutionResult::Continue;
|
||||
}
|
||||
|
||||
ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
|
||||
ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
|
||||
{
|
||||
bool inverse { false };
|
||||
bool temporary_inverse { false };
|
||||
bool reset_temp_inverse { false };
|
||||
|
||||
auto current_inversion_state = [&]() -> bool { return temporary_inverse ^ inverse; };
|
||||
|
||||
size_t string_position = state.string_position;
|
||||
bool inverse_matched { false };
|
||||
|
@ -305,30 +404,45 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
|
|||
if (state.string_position > string_position)
|
||||
break;
|
||||
|
||||
if (reset_temp_inverse) {
|
||||
reset_temp_inverse = false;
|
||||
temporary_inverse = false;
|
||||
} else {
|
||||
reset_temp_inverse = true;
|
||||
}
|
||||
|
||||
auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);
|
||||
|
||||
if (compare_type == CharacterCompareType::Inverse)
|
||||
inverse = true;
|
||||
|
||||
else if (compare_type == CharacterCompareType::Char) {
|
||||
char ch = m_bytecode->at(offset++);
|
||||
else if (compare_type == CharacterCompareType::TemporaryInverse) {
|
||||
// If "TemporaryInverse" is given, negate the current inversion state only for the next opcode.
|
||||
// it follows that this cannot be the last compare element.
|
||||
ASSERT(i != arguments_count() - 1);
|
||||
|
||||
temporary_inverse = true;
|
||||
reset_temp_inverse = false;
|
||||
|
||||
} else if (compare_type == CharacterCompareType::Char) {
|
||||
u32 ch = m_bytecode->at(offset++);
|
||||
|
||||
// We want to compare a string that is longer or equal in length to the available string
|
||||
if (input.view.length() - state.string_position < 1)
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
compare_char(input, state, ch, inverse, inverse_matched);
|
||||
compare_char(input, state, ch, current_inversion_state(), inverse_matched);
|
||||
|
||||
} else if (compare_type == CharacterCompareType::AnyChar) {
|
||||
// We want to compare a string that is definitely longer than the available string
|
||||
if (input.view.length() - state.string_position < 1)
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
ASSERT(!inverse);
|
||||
ASSERT(!current_inversion_state());
|
||||
++state.string_position;
|
||||
|
||||
} else if (compare_type == CharacterCompareType::String) {
|
||||
ASSERT(!inverse);
|
||||
ASSERT(!current_inversion_state());
|
||||
|
||||
char* str = reinterpret_cast<char*>(m_bytecode->at(offset++));
|
||||
auto& length = m_bytecode->at(offset++);
|
||||
|
@ -348,7 +462,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
|
|||
auto character_class = (CharClass)m_bytecode->at(offset++);
|
||||
auto ch = input.view[state.string_position];
|
||||
|
||||
compare_character_class(input, state, character_class, ch, inverse, inverse_matched);
|
||||
compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched);
|
||||
|
||||
} else if (compare_type == CharacterCompareType::CharRange) {
|
||||
auto value = (CharRange)m_bytecode->at(offset++);
|
||||
|
@ -357,7 +471,40 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
|
|||
auto to = value.to;
|
||||
auto ch = input.view[state.string_position];
|
||||
|
||||
compare_character_range(input, state, from, to, ch, inverse, inverse_matched);
|
||||
compare_character_range(input, state, from, to, ch, current_inversion_state(), inverse_matched);
|
||||
|
||||
} else if (compare_type == CharacterCompareType::Reference) {
|
||||
auto reference_number = (size_t)m_bytecode->at(offset++);
|
||||
auto& groups = output.capture_group_matches.at(input.match_index);
|
||||
if (groups.size() <= reference_number)
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
auto str = groups.at(reference_number).view;
|
||||
|
||||
// We want to compare a string that is definitely longer than the available string
|
||||
if (input.view.length() - state.string_position < str.length())
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
if (!compare_string(input, state, str.characters_without_null_termination(), str.length()))
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
} else if (compare_type == CharacterCompareType::NamedReference) {
|
||||
auto ptr = (const char*)m_bytecode->at(offset++);
|
||||
auto length = (size_t)m_bytecode->at(offset++);
|
||||
StringView name { ptr, length };
|
||||
|
||||
auto group = output.named_capture_group_matches.at(input.match_index).get(name);
|
||||
if (!group.has_value())
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
auto str = group.value().view;
|
||||
|
||||
// We want to compare a string that is definitely longer than the available string
|
||||
if (input.view.length() - state.string_position < str.length())
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
if (!compare_string(input, state, str.characters_without_null_termination(), str.length()))
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
} else {
|
||||
fprintf(stderr, "Undefined comparison: %i\n", (int)compare_type);
|
||||
|
@ -366,7 +513,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
|
|||
}
|
||||
}
|
||||
|
||||
if (inverse && !inverse_matched)
|
||||
if (current_inversion_state() && !inverse_matched)
|
||||
++state.string_position;
|
||||
|
||||
if (string_position == state.string_position || state.string_position > input.view.length())
|
||||
|
@ -502,6 +649,14 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(const MatchInput& inp
|
|||
++state.string_position;
|
||||
}
|
||||
break;
|
||||
case CharClass::Word:
|
||||
if (isalnum(ch) || ch == '_') {
|
||||
if (inverse)
|
||||
inverse_matched = true;
|
||||
else
|
||||
++state.string_position;
|
||||
}
|
||||
break;
|
||||
case CharClass::Xdigit:
|
||||
if (isxdigit(ch)) {
|
||||
if (inverse)
|
||||
|
@ -550,6 +705,13 @@ const Vector<String> OpCode_Compare::variable_arguments_to_string(Optional<Match
|
|||
result.empend(String::format("value='%c'", ch));
|
||||
if (!view.is_null())
|
||||
result.empend(String::format("compare against: '%s'", view.substring_view(state().string_position, state().string_position + 1 > view.length() ? 0 : 1).to_string().characters()));
|
||||
} else if (compare_type == CharacterCompareType::NamedReference) {
|
||||
auto ptr = (const char*)m_bytecode->at(offset++);
|
||||
auto length = m_bytecode->at(offset++);
|
||||
result.empend(String::format("name='%.*s'", length, ptr));
|
||||
} else if (compare_type == CharacterCompareType::Reference) {
|
||||
auto ref = m_bytecode->at(offset++);
|
||||
result.empend(String::format("number=%lu", ref));
|
||||
} else if (compare_type == CharacterCompareType::String) {
|
||||
char* str = reinterpret_cast<char*>(m_bytecode->at(offset++));
|
||||
auto& length = m_bytecode->at(offset++);
|
||||
|
|
|
@ -29,6 +29,7 @@
|
|||
#include "RegexMatch.h"
|
||||
#include "RegexOptions.h"
|
||||
|
||||
#include <AK/Format.h>
|
||||
#include <AK/Forward.h>
|
||||
#include <AK/HashMap.h>
|
||||
#include <AK/NonnullOwnPtr.h>
|
||||
|
@ -46,33 +47,41 @@ using ByteCodeValueType = u64;
|
|||
__ENUMERATE_OPCODE(Jump) \
|
||||
__ENUMERATE_OPCODE(ForkJump) \
|
||||
__ENUMERATE_OPCODE(ForkStay) \
|
||||
__ENUMERATE_OPCODE(FailForks) \
|
||||
__ENUMERATE_OPCODE(SaveLeftCaptureGroup) \
|
||||
__ENUMERATE_OPCODE(SaveRightCaptureGroup) \
|
||||
__ENUMERATE_OPCODE(SaveLeftNamedCaptureGroup) \
|
||||
__ENUMERATE_OPCODE(SaveRightNamedCaptureGroup) \
|
||||
__ENUMERATE_OPCODE(CheckBegin) \
|
||||
__ENUMERATE_OPCODE(CheckEnd) \
|
||||
__ENUMERATE_OPCODE(CheckBoundary) \
|
||||
__ENUMERATE_OPCODE(Save) \
|
||||
__ENUMERATE_OPCODE(Restore) \
|
||||
__ENUMERATE_OPCODE(GoBack) \
|
||||
__ENUMERATE_OPCODE(Exit)
|
||||
|
||||
// clang-format off
|
||||
enum class OpCodeId : ByteCodeValueType {
|
||||
#define __ENUMERATE_OPCODE(x) x,
|
||||
ENUMERATE_OPCODES
|
||||
#undef __ENUMERATE_OPCODE
|
||||
|
||||
First
|
||||
= Compare,
|
||||
Last
|
||||
= Exit,
|
||||
First = Compare,
|
||||
Last = Exit,
|
||||
};
|
||||
// clang-format on
|
||||
|
||||
#define ENUMERATE_CHARACTER_COMPARE_TYPES \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(Undefined) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(Inverse) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(AnyChar) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(Char) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(String) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \
|
||||
#define ENUMERATE_CHARACTER_COMPARE_TYPES \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(Undefined) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(Inverse) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(TemporaryInverse) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(AnyChar) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(Char) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(String) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(NamedReference) \
|
||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy)
|
||||
|
||||
enum class CharacterCompareType : ByteCodeValueType {
|
||||
|
@ -93,6 +102,7 @@ enum class CharacterCompareType : ByteCodeValueType {
|
|||
__ENUMERATE_CHARACTER_CLASS(Blank) \
|
||||
__ENUMERATE_CHARACTER_CLASS(Graph) \
|
||||
__ENUMERATE_CHARACTER_CLASS(Punct) \
|
||||
__ENUMERATE_CHARACTER_CLASS(Word) \
|
||||
__ENUMERATE_CHARACTER_CLASS(Xdigit)
|
||||
|
||||
enum class CharClass : ByteCodeValueType {
|
||||
|
@ -101,6 +111,16 @@ enum class CharClass : ByteCodeValueType {
|
|||
#undef __ENUMERATE_CHARACTER_CLASS
|
||||
};
|
||||
|
||||
#define ENUMERATE_BOUNDARY_CHECK_TYPES \
|
||||
__ENUMERATE_BOUNDARY_CHECK_TYPE(Word) \
|
||||
__ENUMERATE_BOUNDARY_CHECK_TYPE(NonWord)
|
||||
|
||||
enum class BoundaryCheckType : ByteCodeValueType {
|
||||
#define __ENUMERATE_BOUNDARY_CHECK_TYPE(x) x,
|
||||
ENUMERATE_BOUNDARY_CHECK_TYPES
|
||||
#undef __ENUMERATE_BOUNDARY_CHECK_TYPE
|
||||
};
|
||||
|
||||
struct CharRange {
|
||||
const u32 from;
|
||||
const u32 to;
|
||||
|
@ -144,9 +164,10 @@ public:
|
|||
ASSERT(value.type != CharacterCompareType::RangeExpressionDummy);
|
||||
ASSERT(value.type != CharacterCompareType::Undefined);
|
||||
ASSERT(value.type != CharacterCompareType::String);
|
||||
ASSERT(value.type != CharacterCompareType::NamedReference);
|
||||
|
||||
arguments.append((ByteCodeValueType)value.type);
|
||||
if (value.type != CharacterCompareType::Inverse && value.type != CharacterCompareType::AnyChar)
|
||||
if (value.type != CharacterCompareType::Inverse && value.type != CharacterCompareType::AnyChar && value.type != CharacterCompareType::TemporaryInverse)
|
||||
arguments.append(move(value.value));
|
||||
}
|
||||
|
||||
|
@ -156,6 +177,15 @@ public:
|
|||
append(move(bytecode));
|
||||
}
|
||||
|
||||
// Appends a two-word instruction: CHECKBOUNDARY <type>.
void insert_bytecode_check_boundary(BoundaryCheckType type)
{
    ByteCode check;
    check.empend(static_cast<ByteCodeValueType>(OpCodeId::CheckBoundary));
    check.empend(static_cast<ByteCodeValueType>(type));

    append(move(check));
}
|
||||
|
||||
void insert_bytecode_compare_string(StringView view, size_t length)
|
||||
{
|
||||
ByteCode bytecode;
|
||||
|
@ -175,6 +205,25 @@ public:
|
|||
append(move(bytecode));
|
||||
}
|
||||
|
||||
// Appends a COMPARE instruction with a single NamedReference argument.
//
// Emitted layout:
//   COMPARE, <argument count = 1>, <payload size>, NamedReference, <name pointer>, <length>
//
// Note that the raw character pointer of `name` is stored in the bytecode (and
// read back as a `const char*` when the comparison executes), so the storage
// behind `name` has to stay alive for as long as the bytecode is used.
void insert_bytecode_compare_named_reference(StringView name, size_t length)
{
    // Build the argument payload first so its size is known for the header.
    ByteCode compare_arguments;
    compare_arguments.empend(static_cast<ByteCodeValueType>(CharacterCompareType::NamedReference));
    compare_arguments.empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination()));
    compare_arguments.empend(length);

    ByteCode compare;
    compare.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
    compare.empend(1); // number of arguments
    compare.empend(compare_arguments.size()); // size of arguments
    compare.append(move(compare_arguments));

    append(move(compare));
}
|
||||
|
||||
void insert_bytecode_group_capture_left(size_t capture_groups_count)
|
||||
{
|
||||
empend(static_cast<ByteCodeValueType>(OpCodeId::SaveLeftCaptureGroup));
|
||||
|
@ -201,6 +250,87 @@ public:
|
|||
empend(name.length());
|
||||
}
|
||||
|
||||
// The four lookaround flavours that insert_bytecode_lookaround() can emit.
enum class LookAroundType {
    LookAhead,         // body must match starting at the current position
    LookBehind,        // body must match after stepping back match_length characters
    NegatedLookAhead,  // as LookAhead, but a match of the body is a failure
    NegatedLookBehind, // as LookBehind, but a match of the body is a failure
};
|
||||
// Wraps `lookaround_body` in the bytecode needed to run it as a zero-width
// lookaround of the given `type`, and appends the result to this bytecode.
//
// `match_length` is only used by the look-behind variants: it is the number of
// characters GOBACK steps backwards before the body is attempted.
//
// Jump offsets below are relative to the end of the jumping instruction
// (JUMP and FORKJUMP each occupy two words).
void insert_bytecode_lookaround(ByteCode&& lookaround_body, LookAroundType type, size_t match_length = 0)
{
    // FIXME: The save stack will grow infinitely with repeated failures
    //        as we do not discard that on failure (we don't necessarily know how many to pop with the current architecture).
    switch (type) {
    case LookAroundType::LookAhead: {
        // SAVE
        // REGEXP BODY
        // RESTORE
        empend(static_cast<ByteCodeValueType>(OpCodeId::Save));
        append(move(lookaround_body));
        empend(static_cast<ByteCodeValueType>(OpCodeId::Restore));
        return;
    }
    case LookAroundType::NegatedLookAhead: {
        // JUMP _A
        // LABEL _L
        // REGEXP BODY
        // FAIL 2
        // LABEL _A
        // SAVE
        // FORKJUMP _L
        // RESTORE
        const auto body_length = lookaround_body.size();
        // Forward: skip the body and the two-word FAIL to land on SAVE (_A).
        const auto forward_to_setup = static_cast<ByteCodeValueType>(body_length) + 2;
        // Backward: from the end of FORKJUMP over FORKJUMP(2) + SAVE(1) + FAIL(2) + body to _L.
        const auto backward_to_body = static_cast<ByteCodeValueType>(-(body_length + 5));

        empend(static_cast<ByteCodeValueType>(OpCodeId::Jump));
        empend(forward_to_setup); // JUMP to label _A
        append(move(lookaround_body));
        empend(static_cast<ByteCodeValueType>(OpCodeId::FailForks));
        empend(static_cast<ByteCodeValueType>(2)); // Fail two forks
        empend(static_cast<ByteCodeValueType>(OpCodeId::Save));
        empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
        empend(backward_to_body); // JUMP to label _L
        empend(static_cast<ByteCodeValueType>(OpCodeId::Restore));
        return;
    }
    case LookAroundType::LookBehind:
        // SAVE
        // GOBACK match_length(BODY)
        // REGEXP BODY
        // RESTORE
        empend(static_cast<ByteCodeValueType>(OpCodeId::Save));
        empend(static_cast<ByteCodeValueType>(OpCodeId::GoBack));
        empend(static_cast<ByteCodeValueType>(match_length));
        append(move(lookaround_body));
        empend(static_cast<ByteCodeValueType>(OpCodeId::Restore));
        return;
    case LookAroundType::NegatedLookBehind: {
        // JUMP _A
        // LABEL _L
        // GOBACK match_length(BODY)
        // REGEXP BODY
        // FAIL 2
        // LABEL _A
        // SAVE
        // FORKJUMP _L
        // RESTORE
        const auto body_length = lookaround_body.size();
        // Same shape as the negated look-ahead, plus the two-word GOBACK in front of the body.
        const auto forward_to_setup = static_cast<ByteCodeValueType>(body_length) + 4;
        const auto backward_to_body = static_cast<ByteCodeValueType>(-(body_length + 7));

        empend(static_cast<ByteCodeValueType>(OpCodeId::Jump));
        empend(forward_to_setup); // JUMP to label _A
        empend(static_cast<ByteCodeValueType>(OpCodeId::GoBack));
        empend(static_cast<ByteCodeValueType>(match_length));
        append(move(lookaround_body));
        empend(static_cast<ByteCodeValueType>(OpCodeId::FailForks));
        empend(static_cast<ByteCodeValueType>(2)); // Fail two forks
        empend(static_cast<ByteCodeValueType>(OpCodeId::Save));
        empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
        empend(backward_to_body); // JUMP to label _L
        empend(static_cast<ByteCodeValueType>(OpCodeId::Restore));
        return;
    }
    }

    // `type` was not one of the known LookAroundType enumerators.
    ASSERT_NOT_REACHED();
}
|
||||
|
||||
void insert_bytecode_alternation(ByteCode&& left, ByteCode&& right)
|
||||
{
|
||||
|
||||
|
@ -348,6 +478,7 @@ enum class ExecutionResult : u8 {
|
|||
|
||||
const char* execution_result_name(ExecutionResult result);
|
||||
const char* opcode_id_name(OpCodeId opcode_id);
|
||||
const char* boundary_check_type_name(BoundaryCheckType);
|
||||
const char* character_compare_type_name(CharacterCompareType result);
|
||||
const char* execution_result_name(ExecutionResult result);
|
||||
|
||||
|
@ -419,6 +550,56 @@ public:
|
|||
const String arguments_string() const override { return ""; }
|
||||
};
|
||||
|
||||
class OpCode_FailForks final : public OpCode {
|
||||
public:
|
||||
OpCode_FailForks(ByteCode& bytecode)
|
||||
: OpCode(bytecode)
|
||||
{
|
||||
}
|
||||
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::FailForks; }
|
||||
ALWAYS_INLINE size_t size() const override { return 2; }
|
||||
ALWAYS_INLINE size_t count() const { return argument(0); }
|
||||
const String arguments_string() const override { return String::formatted("count={}", count()); }
|
||||
};
|
||||
|
||||
class OpCode_Save final : public OpCode {
|
||||
public:
|
||||
OpCode_Save(ByteCode& bytecode)
|
||||
: OpCode(bytecode)
|
||||
{
|
||||
}
|
||||
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Save; }
|
||||
ALWAYS_INLINE size_t size() const override { return 1; }
|
||||
const String arguments_string() const override { return ""; }
|
||||
};
|
||||
|
||||
class OpCode_Restore final : public OpCode {
|
||||
public:
|
||||
OpCode_Restore(ByteCode& bytecode)
|
||||
: OpCode(bytecode)
|
||||
{
|
||||
}
|
||||
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Restore; }
|
||||
ALWAYS_INLINE size_t size() const override { return 1; }
|
||||
const String arguments_string() const override { return ""; }
|
||||
};
|
||||
|
||||
class OpCode_GoBack final : public OpCode {
|
||||
public:
|
||||
OpCode_GoBack(ByteCode& bytecode)
|
||||
: OpCode(bytecode)
|
||||
{
|
||||
}
|
||||
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::GoBack; }
|
||||
ALWAYS_INLINE size_t size() const override { return 2; }
|
||||
ALWAYS_INLINE size_t count() const { return argument(0); }
|
||||
const String arguments_string() const override { return String::formatted("count={}", count()); }
|
||||
};
|
||||
|
||||
class OpCode_Jump final : public OpCode {
|
||||
public:
|
||||
OpCode_Jump(ByteCode& bytecode)
|
||||
|
@ -491,6 +672,20 @@ public:
|
|||
const String arguments_string() const override { return ""; }
|
||||
};
|
||||
|
||||
class OpCode_CheckBoundary final : public OpCode {
|
||||
public:
|
||||
OpCode_CheckBoundary(ByteCode& bytecode)
|
||||
: OpCode(bytecode)
|
||||
{
|
||||
}
|
||||
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
|
||||
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckBoundary; }
|
||||
ALWAYS_INLINE size_t size() const override { return 2; }
|
||||
ALWAYS_INLINE size_t arguments_count() const { return 1; }
|
||||
ALWAYS_INLINE BoundaryCheckType type() const { return static_cast<BoundaryCheckType>(argument(0)); }
|
||||
const String arguments_string() const override { return String::format("kind=%lu (%s)", argument(0), boundary_check_type_name(type())); }
|
||||
};
|
||||
|
||||
class OpCode_SaveLeftCaptureGroup final : public OpCode {
|
||||
public:
|
||||
OpCode_SaveLeftCaptureGroup(ByteCode& bytecode)
|
||||
|
|
|
@ -102,6 +102,7 @@ public:
|
|||
{
|
||||
StringBuilder builder;
|
||||
builder.append(execution_result_name(result));
|
||||
builder.appendff(", fc: {}, ss: {}", input.fail_counter, input.saved_positions.size());
|
||||
if (result == ExecutionResult::Succeeded) {
|
||||
builder.appendf(", ip: %lu/%lu, sp: %lu/%lu", state.instruction_position, bytecode.size() - 1, state.string_position, input.view.length() - 1);
|
||||
} else if (result == ExecutionResult::Fork_PrioHigh) {
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
|
||||
#include "RegexLexer.h"
|
||||
#include <AK/Assertions.h>
|
||||
#include <AK/LogStream.h>
|
||||
#include <stdio.h>
|
||||
|
||||
namespace regex {
|
||||
|
@ -89,6 +90,15 @@ void Lexer::reset()
|
|||
m_previous_position = 0;
|
||||
}
|
||||
|
||||
bool Lexer::try_skip(char c)
|
||||
{
|
||||
if (peek() != c)
|
||||
return false;
|
||||
|
||||
consume();
|
||||
return true;
|
||||
}
|
||||
|
||||
Token Lexer::next()
|
||||
{
|
||||
size_t token_start_position;
|
||||
|
@ -127,7 +137,9 @@ Token Lexer::next()
|
|||
case '\\':
|
||||
return 2;
|
||||
default:
|
||||
fprintf(stderr, "[LEXER] Found invalid escape sequence: \\%c\n", peek(1));
|
||||
#ifdef REGEX_DEBUG
|
||||
fprintf(stderr, "[LEXER] Found invalid escape sequence: \\%c (the parser will have to deal with this!)\n", peek(1));
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
|
|
@ -91,6 +91,9 @@ public:
|
|||
void reset();
|
||||
void back(size_t offset);
|
||||
void set_source(const StringView source) { m_source = source; }
|
||||
bool try_skip(char);
|
||||
|
||||
StringView slice_back(size_t offset) const { return m_source.substring_view(m_position - offset - 1, offset); }
|
||||
|
||||
private:
|
||||
ALWAYS_INLINE char peek(size_t offset = 0) const;
|
||||
|
|
|
@ -267,6 +267,9 @@ struct MatchInput {
|
|||
size_t column { 0 };
|
||||
|
||||
size_t global_offset { 0 }; // For multiline matching, knowing the offset from start could be important
|
||||
|
||||
mutable size_t fail_counter { 0 };
|
||||
mutable Vector<size_t> saved_positions;
|
||||
};
|
||||
|
||||
struct MatchState {
|
||||
|
|
|
@ -264,7 +264,13 @@ Optional<bool> Matcher<Parser>::execute(const MatchInput& input, MatchState& sta
|
|||
s_regex_dbg.print_opcode("VM", *opcode, state, recursion_level, false);
|
||||
#endif
|
||||
|
||||
auto result = opcode->execute(input, state, output);
|
||||
ExecutionResult result;
|
||||
if (input.fail_counter > 0) {
|
||||
--input.fail_counter;
|
||||
result = ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
} else {
|
||||
result = opcode->execute(input, state, output);
|
||||
}
|
||||
|
||||
#ifdef REGEX_DEBUG
|
||||
s_regex_dbg.print_result(*opcode, bytecode, input, state, result);
|
||||
|
@ -330,4 +336,7 @@ ALWAYS_INLINE Optional<bool> Matcher<Parser>::execute_low_prio_forks(const Match
|
|||
|
||||
template class Matcher<PosixExtendedParser>;
|
||||
template class Regex<PosixExtendedParser>;
|
||||
|
||||
template class Matcher<ECMA262Parser>;
|
||||
template class Regex<ECMA262Parser>;
|
||||
}
|
||||
|
|
|
@ -28,7 +28,7 @@
|
|||
#include "RegexDebug.h"
|
||||
#include <AK/String.h>
|
||||
#include <AK/StringBuilder.h>
|
||||
#include <cstdio>
|
||||
#include <AK/StringUtils.h>
|
||||
|
||||
namespace regex {
|
||||
|
||||
|
@ -88,6 +88,26 @@ ALWAYS_INLINE bool Parser::consume(const String& str)
|
|||
return true;
|
||||
}
|
||||
|
||||
// Tries to skip the literal text `str`, which may span more than one token.
//
// `str` has to begin with the text of the current token; whatever is left of
// `str` after that is matched character by character directly against the
// lexer. On success the parser advances to the token following the skipped
// text. On a mismatch the lexer is moved back over the characters skipped so
// far, the current token is left as it was, and false is returned.
ALWAYS_INLINE bool Parser::try_skip(StringView str)
{
    const auto& token_text = m_parser_state.current_token.value();
    if (!str.starts_with(token_text))
        return false;

    // The part of `str` that is not covered by the current token.
    const auto remainder = str.substring_view(token_text.length(), str.length() - token_text.length());

    size_t skipped_count { 0 };
    for (auto ch : remainder) {
        if (m_parser_state.lexer.try_skip(ch)) {
            ++skipped_count;
            continue;
        }

        // Mismatch: undo the characters that were already skipped.
        m_parser_state.lexer.back(skipped_count);
        return false;
    }

    m_parser_state.current_token = m_parser_state.lexer.next();
    return true;
}
|
||||
|
||||
ALWAYS_INLINE void Parser::reset()
|
||||
{
|
||||
m_parser_state.bytecode.clear();
|
||||
|
@ -595,4 +615,762 @@ bool PosixExtendedParser::parse_root(ByteCode& stack, size_t& match_length_minim
|
|||
return !has_error();
|
||||
}
|
||||
|
||||
// =============================
|
||||
// ECMA262 Parser
|
||||
// =============================
|
||||
|
||||
// Entry point of the ECMA262 parser.
//
// With the Unicode flag set, the pattern is parsed once with both `unicode`
// and `named` enabled. Otherwise it is first parsed into scratch storage with
// both disabled; if that pass registered any named capture groups, the parser
// is reset and the pattern is parsed again with `named` enabled, this time
// straight into the caller's outputs.
bool ECMA262Parser::parse_internal(ByteCode& stack, size_t& match_length_minimum)
{
    if (m_parser_state.regex_options & AllFlags::Unicode)
        return parse_pattern(stack, match_length_minimum, true, true);

    ByteCode scratch_stack;
    size_t scratch_match_length = 0;
    const auto first_pass_ok = parse_pattern(scratch_stack, scratch_match_length, false, false);

    // Checked before looking at the first pass' result: named groups force a
    // re-parse regardless of whether the first pass succeeded.
    if (m_parser_state.named_capture_groups_count > 0) {
        reset();
        return parse_pattern(stack, match_length_minimum, false, true);
    }

    if (!first_pass_ok)
        return false;

    stack.append(scratch_stack);
    match_length_minimum = scratch_match_length;
    return first_pass_ok;
}
|
||||
|
||||
// Pattern :: Disjunction
// A pattern is just a disjunction, so this forwards all arguments unchanged.
bool ECMA262Parser::parse_pattern(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
{
    const bool disjunction_ok = parse_disjunction(stack, match_length_minimum, unicode, named);
    return disjunction_ok;
}
|
||||
|
||||
// Disjunction :: Alternative | Alternative '|' Disjunction
//
// Parses one alternative and, if a '|' follows, recursively parses the rest as
// the right-hand side. `match_length_minimum` is set to the left alternative's
// minimum when there is no '|', and to the smaller of the two sides otherwise.
// Returns false as soon as either side fails to parse; the outputs are left
// untouched in that case.
bool ECMA262Parser::parse_disjunction(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
{
    ByteCode lhs_stack;
    size_t lhs_min_length = 0;
    if (!parse_alternative(lhs_stack, lhs_min_length, unicode, named))
        return false;

    if (match(TokenType::Pipe)) {
        consume();

        ByteCode rhs_stack;
        size_t rhs_min_length = 0;
        if (!parse_disjunction(rhs_stack, rhs_min_length, unicode, named))
            return false;

        stack.insert_bytecode_alternation(move(lhs_stack), move(rhs_stack));
        match_length_minimum = min(lhs_min_length, rhs_min_length);
        return true;
    }

    // No '|': the disjunction consists of the single alternative.
    stack.append(lhs_stack);
    match_length_minimum = lhs_min_length;
    return true;
}
|
||||
|
||||
// Alternative :: a (possibly empty) sequence of Terms
// Keeps parsing terms until the input ends or a term cannot be parsed. A term that
// fails without recording an error just ends the alternative (for example at '|'
// or ')'); a recorded error makes the whole alternative fail.
bool ECMA262Parser::parse_alternative(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
{
    while (!match(TokenType::Eof)) {
        if (!parse_term(stack, match_length_minimum, unicode, named))
            return !has_error();
    }

    return true;
}
|
||||
|
||||
// Term :: Assertion | Atom | Atom Quantifier
// An assertion is emitted straight into `stack`. Otherwise an atom is parsed into a
// temporary bytecode buffer so that an optional trailing quantifier can wrap it
// before it is appended.
bool ECMA262Parser::parse_term(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
{
    if (parse_assertion(stack, match_length_minimum, unicode, named))
        return true;

    ByteCode term_bytecode;
    size_t term_min_length = 0;

    const bool term_ok = parse_atom(term_bytecode, term_min_length, unicode, named)
        && parse_quantifier(term_bytecode, term_min_length, unicode, named);
    if (!term_ok)
        return false;

    stack.append(move(term_bytecode));
    match_length_minimum += term_min_length;
    return true;
}
|
||||
|
||||
// Assertion :: '^' | '$' | '\b' | '\B' | '(?=' D ')' | '(?!' D ')' | '(?<=' D ')' | '(?<!' D ')'
// Returns true if an assertion was parsed and its bytecode appended to `stack`.
// Returns false if the input does not start with an assertion (or the lookaround
// body failed to parse); when "(?" was skipped but no lookaround marker followed,
// the lexer is rewound so the caller can try parsing a group instead.
bool ECMA262Parser::parse_assertion(ByteCode& stack, [[maybe_unused]] size_t& match_length_minimum, bool unicode, bool named)
{
    // Line anchors.
    const bool at_circumflex = match(TokenType::Circumflex);
    if (at_circumflex || match(TokenType::Dollar)) {
        consume();
        if (at_circumflex)
            stack.empend((ByteCodeValueType)OpCodeId::CheckBegin);
        else
            stack.empend((ByteCodeValueType)OpCodeId::CheckEnd);
        return true;
    }

    // Word boundaries.
    if (try_skip("\\b")) {
        stack.insert_bytecode_check_boundary(BoundaryCheckType::Word);
        return true;
    }

    if (try_skip("\\B")) {
        stack.insert_bytecode_check_boundary(BoundaryCheckType::NonWord);
        return true;
    }

    // Everything that remains is a lookaround, which must start with "(?".
    if (!match(TokenType::LeftParen) || !try_skip("(?"))
        return false;

    ByteCode lookaround_bytecode;
    size_t lookaround_length = 0;

    // Parses the disjunction inside the lookaround and its closing parenthesis.
    auto parse_lookaround_body = [&] {
        if (!parse_disjunction(lookaround_bytecode, lookaround_length, unicode, named))
            return false;
        consume(TokenType::RightParen, Error::MismatchingParen);
        return true;
    };

    if (try_skip("=")) {
        if (!parse_lookaround_body())
            return false;
        stack.insert_bytecode_lookaround(move(lookaround_bytecode), ByteCode::LookAroundType::LookAhead);
        return true;
    }

    if (try_skip("!")) {
        if (!parse_lookaround_body())
            return false;
        stack.insert_bytecode_lookaround(move(lookaround_bytecode), ByteCode::LookAroundType::NegatedLookAhead);
        return true;
    }

    if (try_skip("<=")) {
        if (!parse_lookaround_body())
            return false;
        // FIXME: Somehow ensure that this assertion regexp has a fixed length.
        stack.insert_bytecode_lookaround(move(lookaround_bytecode), ByteCode::LookAroundType::LookBehind, lookaround_length);
        return true;
    }

    if (try_skip("<!")) {
        if (!parse_lookaround_body())
            return false;
        stack.insert_bytecode_lookaround(move(lookaround_bytecode), ByteCode::LookAroundType::NegatedLookBehind, lookaround_length);
        return true;
    }

    // If none of these matched, put the '(?' back.
    m_parser_state.lexer.back(3);
    m_parser_state.current_token = m_parser_state.lexer.next();
    return false;
}
|
||||
|
||||
// Consumes a run of Char tokens and converts the consumed text to a number.
//   initial_zero  - whether the first token may, must not, or must be "0".
//   follow_policy - Any: consume every Char token; DisallowDigit: stop at the first
//                   digit token; DisallowNonDigit: stop at the first non-digit token.
//   hex           - treat tokens (and the final text) as hexadecimal rather than decimal.
//   max_count     - when positive, the maximum number of tokens to consume.
// Returns an empty Optional if the current token is not a Char, if the initial-zero
// requirement is not met, or if the consumed text does not convert to a number.
Optional<unsigned> ECMA262Parser::read_digits(ECMA262Parser::ReadDigitsInitialZeroState initial_zero, ECMA262Parser::ReadDigitFollowPolicy follow_policy, bool hex, int max_count)
{
    if (!match(TokenType::Char))
        return {};

    if (initial_zero != ReadDigitsInitialZeroState::Allow) {
        const bool starts_with_zero = m_parser_state.current_token.value() == "0";
        const bool zero_is_required = initial_zero == ReadDigitsInitialZeroState::Require;
        const bool zero_is_forbidden = initial_zero == ReadDigitsInitialZeroState::Disallow;

        if (zero_is_forbidden && starts_with_zero)
            return {};
        if (zero_is_required && !starts_with_zero)
            return {};
    }

    int consumed_tokens = 0;
    size_t consumed_length = 0;
    while (match(TokenType::Char)) {
        auto token_text = m_parser_state.current_token.value();

        // Only evaluated when the follow policy actually cares about digit-ness.
        auto token_is_digit = [&] {
            if (hex)
                return AK::StringUtils::convert_to_uint_from_hex(token_text).has_value();
            return token_text.to_uint().has_value();
        };

        if (follow_policy == ReadDigitFollowPolicy::DisallowDigit && token_is_digit())
            break;

        if (follow_policy == ReadDigitFollowPolicy::DisallowNonDigit && !token_is_digit())
            break;

        if (max_count > 0 && consumed_tokens >= max_count)
            break;

        consumed_length += consume().value().length();
        ++consumed_tokens;
    }

    // Re-read everything that was consumed as one piece of text and convert it.
    auto digits = m_parser_state.lexer.slice_back(consumed_length);
    return hex ? AK::StringUtils::convert_to_uint_from_hex(digits) : digits.to_uint();
}
|
||||
|
||||
// Quantifier :: '*' | '+' | '?' | '{' min '}' | '{' min ',' max '}', optionally followed by '?'
// Parses an optional quantifier that follows an atom. `stack` holds the bytecode of
// that atom and `match_length_minimum` its minimum match length; both are adjusted
// for the repetition.
// Returns true if there is no quantifier (nothing is touched) or if one was parsed;
// returns false after recording an error.
bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minimum, bool, bool)
{
    enum class Repetition {
        OneOrMore,
        ZeroOrMore,
        Optional,
        Explicit,
        None,
    } repetition_mark { Repetition::None };

    bool ungreedy = false;
    Optional<size_t> repeat_min, repeat_max;

    if (match(TokenType::Asterisk)) {
        consume();
        repetition_mark = Repetition::ZeroOrMore;
    } else if (match(TokenType::Plus)) {
        consume();
        repetition_mark = Repetition::OneOrMore;
    } else if (match(TokenType::Questionmark)) {
        consume();
        repetition_mark = Repetition::Optional;
    } else if (match(TokenType::LeftCurly)) {
        consume();
        repetition_mark = Repetition::Explicit;

        auto low_bound = read_digits();

        if (!low_bound.has_value()) {
            set_error(Error::InvalidBraceContent);
            return false;
        }

        repeat_min = low_bound.value();

        if (match(TokenType::Comma)) {
            consume();
            // NOTE(review): an open-ended "{min,}" is rejected here because an upper
            // bound is required after the comma; ECMA-262 permits that form.
            auto high_bound = read_digits();
            if (!high_bound.has_value()) {
                set_error(Error::InvalidBraceContent);
                return false;
            }

            repeat_max = high_bound.value();
        }

        if (!match(TokenType::RightCurly)) {
            set_error(Error::MismatchingBrace);
            return false;
        }
        consume();

        // A range whose lower bound exceeds its upper bound is invalid. Stop here
        // instead of generating bytecode for it and reporting success.
        if (repeat_max.has_value() && repeat_min.value() > repeat_max.value()) {
            set_error(Error::InvalidBraceContent);
            return false;
        }
    } else {
        // No quantifier follows the atom.
        return true;
    }

    // A trailing '?' makes the quantifier lazy; this is not accepted after an
    // explicit {min,max} repetition.
    if (match(TokenType::Questionmark)) {
        if (repetition_mark == Repetition::Explicit) {
            set_error(Error::InvalidRepetitionMarker);
            return false;
        }
        consume();
        ungreedy = true;
    }

    ByteCode new_bytecode;
    switch (repetition_mark) {
    case Repetition::OneOrMore:
        // At least one repetition is required, so the minimum length is unchanged.
        new_bytecode.insert_bytecode_repetition_min_one(stack, !ungreedy);
        break;
    case Repetition::ZeroOrMore:
        new_bytecode.insert_bytecode_repetition_any(stack, !ungreedy);
        match_length_minimum = 0;
        break;
    case Repetition::Optional:
        new_bytecode.insert_bytecode_repetition_zero_or_one(stack, !ungreedy);
        match_length_minimum = 0;
        break;
    case Repetition::Explicit:
        new_bytecode.insert_bytecode_repetition_min_max(stack, repeat_min.value(), repeat_max);
        match_length_minimum *= repeat_min.value();
        break;
    case Repetition::None:
        ASSERT_NOT_REACHED();
    }

    return true;
}
|
||||
|
||||
// Atom :: '\' AtomEscape | CharacterClass | '(' group ')' | '.' | PatternCharacter
// Parses a single atom into `stack` and adds its minimum length to
// `match_length_minimum`. Returns false without recording an error when the current
// token cannot start an atom (anchors, closing brackets, '|' and quantifier symbols);
// any other unexpected token records Error::InvalidPattern.
bool ECMA262Parser::parse_atom(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
{
    // AtomEscape.
    if (try_skip("\\"))
        return parse_atom_escape(stack, match_length_minimum, unicode, named);

    // Character class.
    if (match(TokenType::LeftBracket))
        return parse_character_class(stack, match_length_minimum, unicode, named);

    // Non-capturing group, or a capture group.
    if (match(TokenType::LeftParen))
        return parse_capture_group(stack, match_length_minimum, unicode, named);

    if (match(TokenType::Period)) {
        consume();
        match_length_minimum += 1;
        stack.insert_bytecode_compare_values({ { CharacterCompareType::AnyChar, 0 } });
        return true;
    }

    // Tokens that end the current atom sequence instead of starting a new atom.
    constexpr TokenType non_atom_tokens[] = {
        TokenType::Circumflex,
        TokenType::Dollar,
        TokenType::RightBracket,
        TokenType::RightCurly,
        TokenType::RightParen,
        TokenType::Pipe,
        TokenType::Plus,
        TokenType::Asterisk,
        TokenType::Questionmark,
    };
    for (auto token_type : non_atom_tokens) {
        if (match(token_type))
            return false;
    }

    if (!match(TokenType::Char)) {
        set_error(Error::InvalidPattern);
        return false;
    }

    // A plain pattern character is compared by its first byte.
    auto character = consume().value();
    match_length_minimum += 1;
    stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)character[0] } });
    return true;
}
|
||||
|
||||
// AtomEscape: parses whatever follows a '\' outside of a character class (the
// backslash itself has already been skipped by the caller).
// Handles, in this order: numbered backreferences, control escapes (\f \n \r \t \v),
// control letters (\cX), \0, hex escapes (\xHH), identity escapes, named
// backreferences (\k<name>, only if `named`) and character class escapes
// (\d \D \s \S \w \W). Unicode escapes and Unicode property escapes are not
// implemented yet.
// On success the comparison bytecode is appended to `stack`, the escape's minimum
// length is added to `match_length_minimum`, and true is returned. On failure an
// error is recorded and false is returned.
bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
{
    // Emits a comparison against one literal character value.
    auto emit_literal = [&](ByteCodeValueType value) {
        match_length_minimum += 1;
        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, value } });
        return true;
    };

    // Numbered backreference; the referenced group must already have a recorded length.
    if (auto escape = read_digits(ReadDigitsInitialZeroState::Disallow, ReadDigitFollowPolicy::DisallowNonDigit); escape.has_value()) {
        auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(escape.value());
        if (!maybe_length.has_value()) {
            set_error(Error::InvalidNumber);
            return false;
        }
        match_length_minimum += maybe_length.value();
        stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)escape.value() } });
        return true;
    }

    // CharacterEscape > ControlEscape
    if (try_skip("f"))
        return emit_literal((ByteCodeValueType)'\f');

    if (try_skip("n"))
        return emit_literal((ByteCodeValueType)'\n');

    if (try_skip("r"))
        return emit_literal((ByteCodeValueType)'\r');

    if (try_skip("t"))
        return emit_literal((ByteCodeValueType)'\t');

    if (try_skip("v"))
        return emit_literal((ByteCodeValueType)'\v');

    // CharacterEscape > ControlLetter
    // Per ECMA-262, \cX denotes the code point of X modulo 32, and X must be an
    // ASCII letter (so both \cA and \ca mean U+0001).
    if (try_skip("c")) {
        for (auto c = 'A'; c <= 'z'; ++c) {
            // Skip the punctuation that sits between 'Z' and 'a' in ASCII.
            if (c > 'Z' && c < 'a')
                continue;
            if (try_skip({ &c, 1 }))
                return emit_literal((ByteCodeValueType)(c % 32));
        }
        // NOTE(review): when no letter follows, the 'c' stays consumed and parsing
        // falls through to the remaining escape kinds.
    }

    // '\0'
    if (read_digits(ReadDigitsInitialZeroState::Require, ReadDigitFollowPolicy::DisallowDigit).has_value())
        return emit_literal((ByteCodeValueType)0);

    // HexEscape
    if (try_skip("x")) {
        if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 2); hex_escape.has_value())
            return emit_literal((ByteCodeValueType)hex_escape.value());
    }

    if (try_skip("u")) {
        // FIXME: Implement this path, unicode escape sequence.
        TODO();
    }

    // IdentityEscape: the escaped character is the last character of the token.
    if (match(TokenType::EscapeSequence)) {
        auto token = consume().value();
        return emit_literal((ByteCodeValueType)token[token.length() - 1]);
    }

    // Named backreference; the referenced group must already have a recorded length.
    if (named && try_skip("k")) {
        auto name = read_capture_group_specifier(true);
        if (name.is_empty()) {
            set_error(Error::InvalidNameForCaptureGroup);
            return false;
        }
        auto maybe_length = m_parser_state.named_capture_group_minimum_lengths.get(name);
        if (!maybe_length.has_value()) {
            set_error(Error::InvalidNameForCaptureGroup);
            return false;
        }
        match_length_minimum += maybe_length.value();

        stack.insert_bytecode_compare_named_reference(name, name.length());
        return true;
    }

    if (unicode) {
        if (try_skip("p{")) {
            // FIXME: Implement this path, Unicode property match.
            TODO();
        }
        if (try_skip("P{")) {
            // FIXME: Implement this path, Unicode property match.
            TODO();
        }
    }

    // CharacterClassEscape (\d, \D, \s, \S, \w, \W).
    bool negate = false;
    auto ch = parse_character_class_escape(negate);
    if (!ch.has_value()) {
        set_error(Error::InvalidCharacterClass);
        return false;
    }

    Vector<CompareTypeAndValuePair> compares;
    if (negate)
        compares.empend(CharacterCompareType::Inverse, 0);
    compares.empend(CharacterCompareType::CharClass, (ByteCodeValueType)ch.value());
    match_length_minimum += 1;
    stack.insert_bytecode_compare_values(move(compares));
    return true;
}
|
||||
|
||||
// CharacterClassEscape :: d | D | s | S | w | W
// When `expect_backslash` is set, a leading '\' must be present and is skipped first.
// Returns the character class named by the escape, or an empty Optional if none
// matches. `negate` is set to true for the upper-case (negated) forms and is left
// untouched otherwise.
Optional<CharClass> ECMA262Parser::parse_character_class_escape(bool& negate, bool expect_backslash)
{
    if (expect_backslash && !try_skip("\\"))
        return {};

    struct ClassEscape {
        const char* letter;
        CharClass char_class;
        bool is_negated;
    };

    // Tried in this order; the first escape letter that can be skipped wins.
    const ClassEscape class_escapes[] = {
        { "d", CharClass::Digit, false },
        { "D", CharClass::Digit, true },
        { "s", CharClass::Space, false },
        { "S", CharClass::Space, true },
        { "w", CharClass::Word, false },
        { "W", CharClass::Word, true },
    };

    for (auto& class_escape : class_escapes) {
        if (!try_skip(class_escape.letter))
            continue;

        if (class_escape.is_negated)
            negate = true;
        return class_escape.char_class;
    }

    return {};
}
|
||||
|
||||
// CharacterClass :: '[' ClassRanges ']' | '[^' ClassRanges ']'
// Parses a bracketed character class and emits a single compare instruction holding
// all of its members, preceded by an Inverse marker for negated classes.
// A class with no members ("[]" or "[^]") is accepted but emits no bytecode and
// does not contribute to `match_length_minimum`.
bool ECMA262Parser::parse_character_class(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool)
{
    consume(TokenType::LeftBracket, Error::InvalidPattern);

    Vector<CompareTypeAndValuePair> class_members;

    // Negated charclass
    const bool is_negated = match(TokenType::Circumflex);
    if (is_negated) {
        consume();
        class_members.empend(CharacterCompareType::Inverse, 0);
    }

    const bool is_empty = match(TokenType::RightBracket);
    if (is_empty) {
        consume();
        return true;
    }

    // This also consumes the closing ']'.
    const bool ranges_ok = parse_nonempty_class_ranges(class_members, unicode);
    if (!ranges_ok)
        return false;

    match_length_minimum += 1;
    stack.insert_bytecode_compare_values(move(class_members));
    return true;
}
|
||||
|
||||
struct CharClassRangeElement {
|
||||
union {
|
||||
CharClass character_class;
|
||||
u32 code_point { 0 };
|
||||
};
|
||||
|
||||
bool is_negated { false };
|
||||
bool is_character_class { false };
|
||||
};
|
||||
|
||||
bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& ranges, bool unicode)
|
||||
{
|
||||
auto read_class_atom_no_dash = [&]() -> Optional<CharClassRangeElement> {
|
||||
if (match(TokenType::EscapeSequence)) {
|
||||
auto token = consume().value();
|
||||
return { { .code_point = (u32)token[1], .is_character_class = false } };
|
||||
}
|
||||
|
||||
if (try_skip("\\")) {
|
||||
if (try_skip("f"))
|
||||
return { { .code_point = '\f', .is_character_class = false } };
|
||||
if (try_skip("n"))
|
||||
return { { .code_point = '\n', .is_character_class = false } };
|
||||
if (try_skip("r"))
|
||||
return { { .code_point = '\r', .is_character_class = false } };
|
||||
if (try_skip("t"))
|
||||
return { { .code_point = '\t', .is_character_class = false } };
|
||||
if (try_skip("v"))
|
||||
return { { .code_point = '\v', .is_character_class = false } };
|
||||
if (try_skip("b"))
|
||||
return { { .code_point = '\b', .is_character_class = false } };
|
||||
|
||||
// CharacterEscape > ControlLetter
|
||||
if (try_skip("c")) {
|
||||
for (auto c = 'A'; c <= 'z'; ++c) {
|
||||
if (try_skip({ &c, 1 }))
|
||||
return { { .code_point = (u32)(c & 0x3f), .is_character_class = false } };
|
||||
}
|
||||
}
|
||||
|
||||
// '\0'
|
||||
if (read_digits(ReadDigitsInitialZeroState::Require, ReadDigitFollowPolicy::DisallowDigit).has_value())
|
||||
return { { .code_point = 0, .is_character_class = false } };
|
||||
|
||||
// HexEscape
|
||||
if (try_skip("x")) {
|
||||
if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 2); hex_escape.has_value())
|
||||
return { { .code_point = hex_escape.value(), .is_character_class = false } };
|
||||
}
|
||||
|
||||
if (try_skip("u")) {
|
||||
// FIXME: Implement this path, unicode escape sequence.
|
||||
TODO();
|
||||
}
|
||||
|
||||
if (unicode) {
|
||||
if (try_skip("-"))
|
||||
return { { .code_point = '-', .is_character_class = false } };
|
||||
}
|
||||
|
||||
if (try_skip("p{") || try_skip("P{")) {
|
||||
// FIXME: Implement these; unicode properties.
|
||||
TODO();
|
||||
}
|
||||
|
||||
if (try_skip("d"))
|
||||
return { { .character_class = CharClass::Digit, .is_character_class = true } };
|
||||
if (try_skip("s"))
|
||||
return { { .character_class = CharClass::Space, .is_character_class = true } };
|
||||
if (try_skip("w"))
|
||||
return { { .character_class = CharClass::Word, .is_character_class = true } };
|
||||
if (try_skip("D"))
|
||||
return { { .character_class = CharClass::Digit, .is_negated = true, .is_character_class = true } };
|
||||
if (try_skip("S"))
|
||||
return { { .character_class = CharClass::Space, .is_negated = true, .is_character_class = true } };
|
||||
if (try_skip("W"))
|
||||
return { { .character_class = CharClass::Word, .is_negated = true, .is_character_class = true } };
|
||||
}
|
||||
|
||||
if (match(TokenType::RightBracket) || match(TokenType::HyphenMinus))
|
||||
return {};
|
||||
|
||||
auto token = consume(TokenType::Char, Error::InvalidCharacterClass);
|
||||
|
||||
return { { .code_point = (u32)token.value()[0], .is_character_class = false } };
|
||||
};
|
||||
auto read_class_atom = [&]() -> Optional<CharClassRangeElement> {
|
||||
if (match(TokenType::HyphenMinus)) {
|
||||
consume();
|
||||
return { { .code_point = '-', .is_character_class = false } };
|
||||
}
|
||||
|
||||
return read_class_atom_no_dash();
|
||||
};
|
||||
|
||||
while (!match(TokenType::RightBracket)) {
|
||||
auto first_atom = read_class_atom();
|
||||
if (!first_atom.has_value())
|
||||
return false;
|
||||
|
||||
if (match(TokenType::HyphenMinus)) {
|
||||
consume();
|
||||
auto second_atom = read_class_atom();
|
||||
if (!second_atom.has_value())
|
||||
return false;
|
||||
|
||||
if (first_atom.value().is_character_class || second_atom.value().is_character_class) {
|
||||
set_error(Error::InvalidRange);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (first_atom.value().code_point > second_atom.value().code_point) {
|
||||
set_error(Error::InvalidRange);
|
||||
return false;
|
||||
}
|
||||
|
||||
ASSERT(!first_atom.value().is_negated);
|
||||
ASSERT(!second_atom.value().is_negated);
|
||||
|
||||
ranges.empend(CharacterCompareType::CharRange, CharRange { first_atom.value().code_point, second_atom.value().code_point });
|
||||
continue;
|
||||
}
|
||||
|
||||
auto atom = first_atom.value();
|
||||
|
||||
if (atom.is_character_class) {
|
||||
if (atom.is_negated)
|
||||
ranges.empend(CharacterCompareType::TemporaryInverse, 0);
|
||||
ranges.empend(CharacterCompareType::CharClass, (ByteCodeValueType)first_atom.value().character_class);
|
||||
} else {
|
||||
ASSERT(!atom.is_negated);
|
||||
ranges.empend(CharacterCompareType::Char, first_atom.value().code_point);
|
||||
}
|
||||
}
|
||||
|
||||
consume(TokenType::RightBracket, Error::MismatchingBracket);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Reads a capture group name terminated by '>' and returns it.
// When `take_starting_angle_bracket` is set, the opening '<' is consumed here too,
// and an empty view is returned if it is missing. A missing '>' or an empty name
// records Error::InvalidNameForCaptureGroup; the name read so far is still returned.
StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
{
    if (take_starting_angle_bracket && !consume("<"))
        return {};

    // Consume Char tokens up to (but excluding) the closing '>'.
    size_t name_length = 0;
    while (match(TokenType::Char) && !(m_parser_state.current_token.value() == ">"))
        name_length += consume().value().length();

    auto group_name = m_parser_state.lexer.slice_back(name_length);

    const bool has_closing_bracket = consume(">");
    if (!has_closing_bracket || group_name.is_empty())
        set_error(Error::InvalidNameForCaptureGroup);

    return group_name;
}
|
||||
|
||||
// Parses a parenthesised group: "(...)" (numbered capture), "(?:...)" (non-capturing)
// or "(?<name>...)" (named capture).
// The group's bytecode is appended to `stack` (wrapped in capture markers for
// capturing groups) and its minimum length is added to `match_length_minimum`.
// The minimum length of each capturing group is also recorded in the parser state,
// keyed by the group's index or name.
bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
{
    consume(TokenType::LeftParen, Error::InvalidPattern);

    // Plain numbered capture group.
    if (!match(TokenType::Questionmark)) {
        auto group_index = ++m_parser_state.capture_groups_count;
        stack.insert_bytecode_group_capture_left(group_index);

        ByteCode group_bytecode;
        size_t group_min_length = 0;

        if (!parse_disjunction(group_bytecode, group_min_length, unicode, named))
            return set_error(Error::InvalidPattern);

        stack.append(move(group_bytecode));

        m_parser_state.capture_group_minimum_lengths.set(group_index, group_min_length);

        consume(TokenType::RightParen, Error::MismatchingParen);

        stack.insert_bytecode_group_capture_right(group_index);

        match_length_minimum += group_min_length;

        return true;
    }

    // Non-capturing group or group with specifier.
    consume();

    if (match(TokenType::Colon)) {
        consume();

        ByteCode group_bytecode;
        size_t group_min_length = 0;
        if (!parse_disjunction(group_bytecode, group_min_length, unicode, named))
            return set_error(Error::InvalidPattern);

        consume(TokenType::RightParen, Error::MismatchingParen);

        stack.append(move(group_bytecode));
        match_length_minimum += group_min_length;
        return true;
    }

    // Anything other than "(?:" must be a named group, "(?<name>".
    if (!consume("<")) {
        set_error(Error::InvalidCaptureGroup);
        return false;
    }

    ++m_parser_state.named_capture_groups_count;
    auto group_name = read_capture_group_specifier();

    if (group_name.is_empty()) {
        set_error(Error::InvalidNameForCaptureGroup);
        return false;
    }

    ByteCode group_bytecode;
    size_t group_min_length = 0;
    if (!parse_disjunction(group_bytecode, group_min_length, unicode, named))
        return set_error(Error::InvalidPattern);

    consume(TokenType::RightParen, Error::MismatchingParen);

    stack.insert_bytecode_group_capture_left(group_name);
    stack.append(move(group_bytecode));
    stack.insert_bytecode_group_capture_right(group_name);

    match_length_minimum += group_min_length;

    m_parser_state.named_capture_group_minimum_lengths.set(group_name, group_min_length);
    return true;
}
|
||||
}
|
||||
|
|
|
@ -39,6 +39,7 @@
|
|||
namespace regex {
|
||||
|
||||
class PosixExtendedParser;
|
||||
class ECMA262Parser;
|
||||
|
||||
template<typename T>
|
||||
struct GenericParserTraits {
|
||||
|
@ -53,6 +54,10 @@ template<>
|
|||
struct ParserTraits<PosixExtendedParser> : public GenericParserTraits<PosixOptions> {
|
||||
};
|
||||
|
||||
template<>
|
||||
struct ParserTraits<ECMA262Parser> : public GenericParserTraits<ECMAScriptOptions> {
|
||||
};
|
||||
|
||||
class Parser {
|
||||
public:
|
||||
struct Result {
|
||||
|
@ -88,6 +93,7 @@ protected:
|
|||
ALWAYS_INLINE Token consume();
|
||||
ALWAYS_INLINE Token consume(TokenType type, Error error);
|
||||
ALWAYS_INLINE bool consume(const String&);
|
||||
ALWAYS_INLINE bool try_skip(StringView);
|
||||
ALWAYS_INLINE void reset();
|
||||
ALWAYS_INLINE bool done() const;
|
||||
ALWAYS_INLINE bool set_error(Error error);
|
||||
|
@ -102,6 +108,10 @@ protected:
|
|||
size_t named_capture_groups_count { 0 };
|
||||
size_t match_length_minimum { 0 };
|
||||
AllOptions regex_options;
|
||||
HashMap<int, size_t> capture_group_minimum_lengths;
|
||||
HashMap<FlyString, size_t> named_capture_group_minimum_lengths;
|
||||
HashMap<size_t, FlyString> named_capture_groups;
|
||||
|
||||
explicit ParserState(Lexer& lexer)
|
||||
: lexer(lexer)
|
||||
, current_token(lexer.next())
|
||||
|
@ -144,8 +154,54 @@ private:
|
|||
ALWAYS_INLINE bool parse_repetition_symbol(ByteCode&, size_t&);
|
||||
};
|
||||
|
||||
class ECMA262Parser final : public Parser {
|
||||
public:
|
||||
explicit ECMA262Parser(Lexer& lexer)
|
||||
: Parser(lexer)
|
||||
{
|
||||
}
|
||||
|
||||
ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
|
||||
: Parser(lexer, regex_options.value_or({}))
|
||||
{
|
||||
}
|
||||
|
||||
~ECMA262Parser() = default;
|
||||
|
||||
private:
|
||||
bool parse_internal(ByteCode&, size_t&) override;
|
||||
|
||||
enum class ReadDigitsInitialZeroState {
|
||||
Allow,
|
||||
Disallow,
|
||||
Require,
|
||||
};
|
||||
enum class ReadDigitFollowPolicy {
|
||||
Any,
|
||||
DisallowDigit,
|
||||
DisallowNonDigit,
|
||||
};
|
||||
Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy follow_policy = ReadDigitFollowPolicy::Any, bool hex = false, int max_count = -1);
|
||||
StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
|
||||
|
||||
bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
|
||||
bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named);
|
||||
bool parse_alternative(ByteCode&, size_t&, bool unicode, bool named);
|
||||
bool parse_term(ByteCode&, size_t&, bool unicode, bool named);
|
||||
bool parse_assertion(ByteCode&, size_t&, bool unicode, bool named);
|
||||
bool parse_atom(ByteCode&, size_t&, bool unicode, bool named);
|
||||
bool parse_quantifier(ByteCode&, size_t&, bool unicode, bool named);
|
||||
bool parse_atom_escape(ByteCode&, size_t&, bool unicode, bool named);
|
||||
bool parse_character_class(ByteCode&, size_t&, bool unicode, bool named);
|
||||
bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);
|
||||
Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
|
||||
bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, bool unicode);
|
||||
};
|
||||
|
||||
using PosixExtended = PosixExtendedParser;
|
||||
using ECMA262 = ECMA262Parser;
|
||||
|
||||
}
|
||||
|
||||
using regex::ECMA262;
|
||||
using regex::PosixExtended;
|
||||
|
|
|
@ -472,4 +472,79 @@ TEST_CASE(simple_period_end_benchmark)
|
|||
EXPECT_EQ(re.search("hello?", m), true);
|
||||
}
|
||||
|
||||
// Every pattern below must be accepted by the ECMA262 parser without an error.
TEST_CASE(ECMA262_parse)
{
    constexpr const char* valid_patterns[] {
        "^hello.$",
        "^(hello.)$",
        "^h{0,1}ello.$",
        "^hello\\W$",
        "^hell\\w.$",
        "^hell\\x6f1$", // ^hello1$
        "^hel(?:l\\w).$",
        "^hel(?<LO>l\\w).$",
        "^[-a-zA-Z\\w\\s]+$",
        "\\bhello\\B",
    };

    for (auto& valid_pattern : valid_patterns) {
        Regex<ECMA262> re(valid_pattern);
        EXPECT_EQ(re.parser_result.error, Error::NoError);
#ifdef REGEX_DEBUG
        // Dump the generated bytecode to stderr to help with debugging.
        dbg() << "\n";
        RegexDebug regex_dbg(stderr);
        regex_dbg.print_raw_bytecode(re);
        regex_dbg.print_header();
        regex_dbg.print_bytecode(re);
        dbg() << "\n";
#endif
    }
}
|
||||
|
||||
// Each case must parse without an error, and matching `subject` against `pattern`
// must succeed exactly when `matches` is set.
TEST_CASE(ECMA262_match)
{
    struct MatchCase {
        const char* pattern;
        const char* subject;
        bool matches { true };
        ECMAScriptFlags options {};
    };

    constexpr MatchCase match_cases[] {
        // Literals, groups, quantifiers and escapes.
        { "^hello.$", "hello1" },
        { "^(hello.)$", "hello1" },
        { "^h{0,1}ello.$", "ello1" },
        { "^hello\\W$", "hello!" },
        { "^hell\\w.$", "hellx!" },
        { "^hell\\x6f1$", "hello1" },
        // Named groups and named backreferences.
        { "^hel(?<LO>l.)1$", "hello1" },
        { "^hel(?<LO>l.)1*\\k<LO>.$", "hello1lo1" },
        // Character classes and word boundaries.
        { "^[-a-z1-3\\s]+$", "hell2 o1" },
        { .pattern = "\\bhello\\B", .subject = "hello1", .options = ECMAScriptFlags::Global },
        { "\\b.*\\b", "hello1" },
        { "[^\\D\\S]{2}", "1 " },
        // Lookahead and lookbehind.
        { "bar(?=f.)foo", "barfoo" },
        { "bar(?=foo)bar", "barbar", false },
        { "bar(?!foo)bar", "barbar", true },
        { "bar(?!bar)bar", "barbar", false },
        { "bar.*(?<=foo)", "barbar", false },
        { "bar.*(?<!foo)", "barbar", true },
        // Nested capture groups under a quantifier.
        { "((...)X)+", "fooXbarXbazX", true },
    };

    for (auto& test : match_cases) {
        Regex<ECMA262> re(test.pattern, test.options);
#ifdef REGEX_DEBUG
        // Dump the generated bytecode to stderr to help with debugging.
        dbg() << "\n";
        RegexDebug regex_dbg(stderr);
        regex_dbg.print_raw_bytecode(re);
        regex_dbg.print_header();
        regex_dbg.print_bytecode(re);
        dbg() << "\n";
#endif
        EXPECT_EQ(re.parser_result.error, Error::NoError);
        EXPECT_EQ(re.match(test.subject).success, test.matches);
    }
}
|
||||
|
||||
TEST_MAIN(Regex)
|
||||
|
|
Loading…
Reference in a new issue