LibRegex+Everywhere: Make LibRegex more unicode-aware
This commit makes LibRegex (mostly) capable of operating on any of the three main string views:
- StringView for raw strings
- Utf8View for UTF-8 encoded strings
- Utf32View for raw Unicode strings

As a result, regexps with Unicode strings should be able to properly handle UTF-8 and not stop in the middle of a code point.
A future commit will update LibJS to use the correct type of string depending on the flags.
This commit is contained in:
parent
e5af15a6e9
commit
f364fcec5d
Notes:
sideshowbarker
2024-07-18 08:47:27 +09:00
Author: https://github.com/alimpfard Commit: https://github.com/SerenityOS/serenity/commit/f364fcec5da Pull-request: https://github.com/SerenityOS/serenity/pull/8847 Reviewed-by: https://github.com/MaxWipfli Reviewed-by: https://github.com/linusg ✅ Reviewed-by: https://github.com/trflynn89
8 changed files with 310 additions and 207 deletions
|
@ -249,7 +249,7 @@ TEST_CASE(char_utf8)
|
|||
Regex<PosixExtended> re("😀");
|
||||
RegexResult result;
|
||||
|
||||
EXPECT_EQ((result = match("Привет, мир! 😀 γειά σου κόσμος 😀 こんにちは世界", re, PosixFlags::Global)).success, true);
|
||||
EXPECT_EQ((result = match(Utf8View { "Привет, мир! 😀 γειά σου κόσμος 😀 こんにちは世界" }, re, PosixFlags::Global)).success, true);
|
||||
EXPECT_EQ(result.count, 2u);
|
||||
}
|
||||
|
||||
|
@ -312,7 +312,6 @@ TEST_CASE(match_all_character_class)
|
|||
EXPECT_EQ(result.matches.at(0).view, "W");
|
||||
EXPECT_EQ(result.matches.at(1).view, "i");
|
||||
EXPECT_EQ(result.matches.at(2).view, "n");
|
||||
EXPECT(&result.matches.at(0).view.characters_without_null_termination()[0] != &str.view().characters_without_null_termination()[1]);
|
||||
}
|
||||
|
||||
TEST_CASE(match_character_class_with_assertion)
|
||||
|
|
|
@ -359,7 +359,7 @@ String CppComprehensionEngine::document_path_from_include_path(const StringView&
|
|||
if (!library_include.search(include_path, result))
|
||||
return {};
|
||||
|
||||
auto path = result.capture_group_matches.at(0).at(0).view.u8view();
|
||||
auto path = result.capture_group_matches.at(0).at(0).view.string_view();
|
||||
return String::formatted("/usr/include/{}", path);
|
||||
};
|
||||
|
||||
|
@ -368,7 +368,7 @@ String CppComprehensionEngine::document_path_from_include_path(const StringView&
|
|||
if (!user_defined_include.search(include_path, result))
|
||||
return {};
|
||||
|
||||
return result.capture_group_matches.at(0).at(0).view.u8view();
|
||||
return result.capture_group_matches.at(0).at(0).view.string_view();
|
||||
};
|
||||
|
||||
auto result = document_path_for_library_include(include_path);
|
||||
|
|
|
@ -422,7 +422,7 @@ void DebugSession::update_loaded_libs()
|
|||
auto rc = re.search(vm_name, result);
|
||||
if (!rc)
|
||||
return {};
|
||||
auto lib_name = result.capture_group_matches.at(0).at(0).view.u8view().to_string();
|
||||
auto lib_name = result.capture_group_matches.at(0).at(0).view.string_view().to_string();
|
||||
if (lib_name.starts_with("/"))
|
||||
return lib_name;
|
||||
return String::formatted("/usr/lib/{}", lib_name);
|
||||
|
|
|
@ -288,30 +288,33 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(const MatchInput& input,
|
|||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
}
|
||||
|
||||
ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
|
||||
ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
|
||||
{
|
||||
if (input.match_index >= output.capture_group_matches.size()) {
|
||||
output.capture_group_matches.ensure_capacity(input.match_index);
|
||||
auto capacity = output.capture_group_matches.capacity();
|
||||
for (size_t i = output.capture_group_matches.size(); i <= capacity; ++i)
|
||||
output.capture_group_matches.empend();
|
||||
if (input.match_index >= state.capture_group_matches.size()) {
|
||||
state.capture_group_matches.ensure_capacity(input.match_index);
|
||||
auto capacity = state.capture_group_matches.capacity();
|
||||
for (size_t i = state.capture_group_matches.size(); i <= capacity; ++i)
|
||||
state.capture_group_matches.empend();
|
||||
}
|
||||
|
||||
if (id() >= output.capture_group_matches.at(input.match_index).size()) {
|
||||
output.capture_group_matches.at(input.match_index).ensure_capacity(id());
|
||||
auto capacity = output.capture_group_matches.at(input.match_index).capacity();
|
||||
for (size_t i = output.capture_group_matches.at(input.match_index).size(); i <= capacity; ++i)
|
||||
output.capture_group_matches.at(input.match_index).empend();
|
||||
if (id() >= state.capture_group_matches.at(input.match_index).size()) {
|
||||
state.capture_group_matches.at(input.match_index).ensure_capacity(id());
|
||||
auto capacity = state.capture_group_matches.at(input.match_index).capacity();
|
||||
for (size_t i = state.capture_group_matches.at(input.match_index).size(); i <= capacity; ++i)
|
||||
state.capture_group_matches.at(input.match_index).empend();
|
||||
}
|
||||
|
||||
output.capture_group_matches.at(input.match_index).at(id()).left_column = state.string_position;
|
||||
state.capture_group_matches.at(input.match_index).at(id()).left_column = state.string_position;
|
||||
return ExecutionResult::Continue;
|
||||
}
|
||||
|
||||
ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
|
||||
ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
|
||||
{
|
||||
auto& match = output.capture_group_matches.at(input.match_index).at(id());
|
||||
auto& match = state.capture_group_matches.at(input.match_index).at(id());
|
||||
auto start_position = match.left_column;
|
||||
if (state.string_position < start_position)
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
auto length = state.string_position - start_position;
|
||||
|
||||
if (start_position < match.column)
|
||||
|
@ -330,27 +333,27 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchI
|
|||
return ExecutionResult::Continue;
|
||||
}
|
||||
|
||||
ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
|
||||
ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
|
||||
{
|
||||
if (input.match_index >= output.named_capture_group_matches.size()) {
|
||||
output.named_capture_group_matches.ensure_capacity(input.match_index);
|
||||
auto capacity = output.named_capture_group_matches.capacity();
|
||||
for (size_t i = output.named_capture_group_matches.size(); i <= capacity; ++i)
|
||||
output.named_capture_group_matches.empend();
|
||||
if (input.match_index >= state.named_capture_group_matches.size()) {
|
||||
state.named_capture_group_matches.ensure_capacity(input.match_index);
|
||||
auto capacity = state.named_capture_group_matches.capacity();
|
||||
for (size_t i = state.named_capture_group_matches.size(); i <= capacity; ++i)
|
||||
state.named_capture_group_matches.empend();
|
||||
}
|
||||
output.named_capture_group_matches.at(input.match_index).ensure(name()).column = state.string_position;
|
||||
state.named_capture_group_matches.at(input.match_index).ensure(name()).column = state.string_position;
|
||||
return ExecutionResult::Continue;
|
||||
}
|
||||
|
||||
ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
|
||||
ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
|
||||
{
|
||||
StringView capture_group_name = name();
|
||||
|
||||
if (output.named_capture_group_matches.at(input.match_index).contains(capture_group_name)) {
|
||||
auto start_position = output.named_capture_group_matches.at(input.match_index).ensure(capture_group_name).column;
|
||||
if (state.named_capture_group_matches.at(input.match_index).contains(capture_group_name)) {
|
||||
auto start_position = state.named_capture_group_matches.at(input.match_index).ensure(capture_group_name).column;
|
||||
auto length = state.string_position - start_position;
|
||||
|
||||
auto& map = output.named_capture_group_matches.at(input.match_index);
|
||||
auto& map = state.named_capture_group_matches.at(input.match_index);
|
||||
|
||||
if constexpr (REGEX_DEBUG) {
|
||||
VERIFY(start_position + length <= input.view.length());
|
||||
|
@ -371,7 +374,7 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(const M
|
|||
return ExecutionResult::Continue;
|
||||
}
|
||||
|
||||
ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
|
||||
ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
|
||||
{
|
||||
bool inverse { false };
|
||||
bool temporary_inverse { false };
|
||||
|
@ -414,14 +417,14 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
|
|||
u32 ch = m_bytecode->at(offset++);
|
||||
|
||||
// We want to compare a string that is longer or equal in length to the available string
|
||||
if (input.view.length() - state.string_position < 1)
|
||||
if (input.view.length() <= state.string_position)
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
compare_char(input, state, ch, current_inversion_state(), inverse_matched);
|
||||
|
||||
} else if (compare_type == CharacterCompareType::AnyChar) {
|
||||
// We want to compare a string that is definitely longer than the available string
|
||||
if (input.view.length() - state.string_position < 1)
|
||||
if (input.view.length() <= state.string_position)
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
VERIFY(!current_inversion_state());
|
||||
|
@ -431,20 +434,25 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
|
|||
VERIFY(!current_inversion_state());
|
||||
|
||||
const auto& length = m_bytecode->at(offset++);
|
||||
StringBuilder str_builder;
|
||||
for (size_t i = 0; i < length; ++i)
|
||||
str_builder.append(m_bytecode->at(offset++));
|
||||
|
||||
// We want to compare a string that is definitely longer than the available string
|
||||
if (input.view.length() - state.string_position < length)
|
||||
if (input.view.length() < state.string_position + length)
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
if (!compare_string(input, state, str_builder.string_view().characters_without_null_termination(), length, had_zero_length_match))
|
||||
Optional<String> str;
|
||||
Vector<u32> data;
|
||||
data.ensure_capacity(length);
|
||||
for (size_t i = offset; i < offset + length; ++i)
|
||||
data.unchecked_append(m_bytecode->at(i));
|
||||
|
||||
auto view = input.view.construct_as_same(data, str);
|
||||
offset += length;
|
||||
if (!compare_string(input, state, view, had_zero_length_match))
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
} else if (compare_type == CharacterCompareType::CharClass) {
|
||||
|
||||
if (input.view.length() - state.string_position < 1)
|
||||
if (input.view.length() <= state.string_position)
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
auto character_class = (CharClass)m_bytecode->at(offset++);
|
||||
|
@ -453,6 +461,9 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
|
|||
compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched);
|
||||
|
||||
} else if (compare_type == CharacterCompareType::CharRange) {
|
||||
if (input.view.length() <= state.string_position)
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
auto value = (CharRange)m_bytecode->at(offset++);
|
||||
|
||||
auto from = value.from;
|
||||
|
@ -463,17 +474,17 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
|
|||
|
||||
} else if (compare_type == CharacterCompareType::Reference) {
|
||||
auto reference_number = (size_t)m_bytecode->at(offset++);
|
||||
auto& groups = output.capture_group_matches.at(input.match_index);
|
||||
auto& groups = state.capture_group_matches.at(input.match_index);
|
||||
if (groups.size() <= reference_number)
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
auto str = groups.at(reference_number).view;
|
||||
|
||||
// We want to compare a string that is definitely longer than the available string
|
||||
if (input.view.length() - state.string_position < str.length())
|
||||
if (input.view.length() < state.string_position + str.length())
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
if (!compare_string(input, state, str.characters_without_null_termination(), str.length(), had_zero_length_match))
|
||||
if (!compare_string(input, state, str, had_zero_length_match))
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
} else if (compare_type == CharacterCompareType::NamedReference) {
|
||||
|
@ -481,17 +492,17 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
|
|||
auto length = (size_t)m_bytecode->at(offset++);
|
||||
StringView name { ptr, length };
|
||||
|
||||
auto group = output.named_capture_group_matches.at(input.match_index).get(name);
|
||||
auto group = state.named_capture_group_matches.at(input.match_index).get(name);
|
||||
if (!group.has_value())
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
auto str = group.value().view;
|
||||
|
||||
// We want to compare a string that is definitely longer than the available string
|
||||
if (input.view.length() - state.string_position < str.length())
|
||||
if (input.view.length() < state.string_position + str.length())
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
if (!compare_string(input, state, str.characters_without_null_termination(), str.length(), had_zero_length_match))
|
||||
if (!compare_string(input, state, str, had_zero_length_match))
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
} else {
|
||||
|
@ -512,14 +523,19 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, M
|
|||
|
||||
ALWAYS_INLINE void OpCode_Compare::compare_char(const MatchInput& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched)
|
||||
{
|
||||
u32 ch2 = input.view[state.string_position];
|
||||
if (state.string_position == input.view.length())
|
||||
return;
|
||||
|
||||
if (input.regex_options & AllFlags::Insensitive) {
|
||||
ch1 = to_ascii_lowercase(ch1);
|
||||
ch2 = to_ascii_lowercase(ch2);
|
||||
}
|
||||
auto input_view = input.view.substring_view(state.string_position, 1);
|
||||
Optional<String> str;
|
||||
auto compare_view = input_view.construct_as_same({ &ch1, 1 }, str);
|
||||
bool equal;
|
||||
if (input.regex_options & AllFlags::Insensitive)
|
||||
equal = input_view.equals_ignoring_case(compare_view);
|
||||
else
|
||||
equal = input_view.equals(compare_view);
|
||||
|
||||
if (ch1 == ch2) {
|
||||
if (equal) {
|
||||
if (inverse)
|
||||
inverse_matched = true;
|
||||
else
|
||||
|
@ -527,41 +543,32 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(const MatchInput& input, MatchSt
|
|||
}
|
||||
}
|
||||
|
||||
ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length, bool& had_zero_length_match)
|
||||
ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, MatchState& state, RegexStringView const& str, bool& had_zero_length_match)
|
||||
{
|
||||
if (length == 0) {
|
||||
if (state.string_position + str.length() > input.view.length()) {
|
||||
if (str.is_empty()) {
|
||||
had_zero_length_match = true;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
if (str.length() == 0) {
|
||||
had_zero_length_match = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (input.view.is_u8_view()) {
|
||||
auto str_view1 = StringView(str, length);
|
||||
auto str_view2 = StringView(&input.view.u8view()[state.string_position], length);
|
||||
auto subject = input.view.substring_view(state.string_position, str.length());
|
||||
bool equals;
|
||||
if (input.regex_options & AllFlags::Insensitive)
|
||||
equals = subject.equals_ignoring_case(str);
|
||||
else
|
||||
equals = subject.equals(str);
|
||||
|
||||
bool string_equals;
|
||||
if (input.regex_options & AllFlags::Insensitive)
|
||||
string_equals = str_view1.equals_ignoring_case(str_view2);
|
||||
else
|
||||
string_equals = str_view1 == str_view2;
|
||||
if (equals)
|
||||
state.string_position += str.length();
|
||||
|
||||
if (string_equals) {
|
||||
state.string_position += length;
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
bool equals;
|
||||
if (input.regex_options & AllFlags::Insensitive)
|
||||
TODO();
|
||||
else
|
||||
equals = __builtin_memcmp(str, &input.view.u32view().code_points()[state.string_position], length) == 0;
|
||||
|
||||
if (equals) {
|
||||
state.string_position += length;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
return equals;
|
||||
}
|
||||
|
||||
ALWAYS_INLINE void OpCode_Compare::compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched)
|
||||
|
|
|
@ -682,7 +682,7 @@ public:
|
|||
|
||||
private:
|
||||
ALWAYS_INLINE static void compare_char(const MatchInput& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched);
|
||||
ALWAYS_INLINE static bool compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length, bool& had_zero_length_match);
|
||||
ALWAYS_INLINE static bool compare_string(const MatchInput& input, MatchState& state, RegexStringView const& str, bool& had_zero_length_match);
|
||||
ALWAYS_INLINE static void compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched);
|
||||
ALWAYS_INLINE static void compare_character_range(const MatchInput& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched);
|
||||
};
|
||||
|
|
|
@ -15,6 +15,8 @@
|
|||
#include <AK/StringBuilder.h>
|
||||
#include <AK/StringView.h>
|
||||
#include <AK/Utf32View.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <AK/Variant.h>
|
||||
#include <AK/Vector.h>
|
||||
|
||||
namespace regex {
|
||||
|
@ -22,124 +24,172 @@ namespace regex {
|
|||
class RegexStringView {
|
||||
public:
|
||||
RegexStringView(const char* chars)
|
||||
: m_u8view(chars)
|
||||
: m_view(StringView { chars })
|
||||
{
|
||||
}
|
||||
|
||||
RegexStringView(const String& string)
|
||||
: m_u8view(string)
|
||||
: m_view(string.view())
|
||||
{
|
||||
}
|
||||
|
||||
RegexStringView(const StringView view)
|
||||
: m_u8view(view)
|
||||
{
|
||||
}
|
||||
RegexStringView(const Utf32View view)
|
||||
: m_u32view(view)
|
||||
: m_view(view)
|
||||
{
|
||||
}
|
||||
|
||||
bool is_u8_view() const { return m_u8view.has_value(); }
|
||||
bool is_u32_view() const { return m_u32view.has_value(); }
|
||||
|
||||
const StringView& u8view() const
|
||||
RegexStringView(Utf32View view)
|
||||
: m_view(view)
|
||||
{
|
||||
VERIFY(m_u8view.has_value());
|
||||
return m_u8view.value();
|
||||
};
|
||||
}
|
||||
|
||||
const Utf32View& u32view() const
|
||||
RegexStringView(Utf8View view)
|
||||
: m_view(view)
|
||||
{
|
||||
VERIFY(m_u32view.has_value());
|
||||
return m_u32view.value();
|
||||
};
|
||||
}
|
||||
|
||||
const StringView& string_view() const
|
||||
{
|
||||
return m_view.get<StringView>();
|
||||
}
|
||||
|
||||
const Utf32View& u32_view() const
|
||||
{
|
||||
return m_view.get<Utf32View>();
|
||||
}
|
||||
|
||||
const Utf8View& u8_view() const
|
||||
{
|
||||
return m_view.get<Utf8View>();
|
||||
}
|
||||
|
||||
bool is_empty() const
|
||||
{
|
||||
if (is_u8_view())
|
||||
return m_u8view.value().is_empty();
|
||||
else
|
||||
return m_u32view.value().is_empty();
|
||||
return m_view.visit([](auto& view) { return view.is_empty(); });
|
||||
}
|
||||
|
||||
bool is_null() const
|
||||
{
|
||||
if (is_u8_view())
|
||||
return m_u8view.value().is_null();
|
||||
else
|
||||
return m_u32view.value().code_points() == nullptr;
|
||||
return m_view.visit([](auto& view) { return view.is_null(); });
|
||||
}
|
||||
|
||||
size_t length() const
|
||||
{
|
||||
if (is_u8_view())
|
||||
return m_u8view.value().length();
|
||||
else
|
||||
return m_u32view.value().length();
|
||||
return m_view.visit([](auto& view) { return view.length(); });
|
||||
}
|
||||
|
||||
RegexStringView construct_as_same(Span<u32> data, Optional<String>& optional_string_storage) const
|
||||
{
|
||||
return m_view.visit(
|
||||
[&]<typename T>(T const&) {
|
||||
StringBuilder builder;
|
||||
for (auto ch : data)
|
||||
builder.append(ch); // Note: The type conversion is intentional.
|
||||
optional_string_storage = builder.build();
|
||||
return RegexStringView { T { *optional_string_storage } };
|
||||
},
|
||||
[&](Utf32View) {
|
||||
return RegexStringView { Utf32View { data.data(), data.size() } };
|
||||
});
|
||||
}
|
||||
|
||||
Vector<RegexStringView> lines() const
|
||||
{
|
||||
if (is_u8_view()) {
|
||||
auto views = u8view().lines(false);
|
||||
Vector<RegexStringView> new_views;
|
||||
for (auto& view : views)
|
||||
new_views.append(move(view));
|
||||
return new_views;
|
||||
}
|
||||
|
||||
Vector<RegexStringView> views;
|
||||
auto view = u32view();
|
||||
u32 newline = '\n';
|
||||
while (!view.is_empty()) {
|
||||
auto position = AK::memmem_optional(view.code_points(), view.length() * sizeof(u32), &newline, sizeof(u32));
|
||||
if (!position.has_value())
|
||||
break;
|
||||
auto offset = position.value() / sizeof(u32);
|
||||
views.append(view.substring_view(0, offset));
|
||||
view = view.substring_view(offset + 1, view.length() - offset - 1);
|
||||
}
|
||||
if (!view.is_empty())
|
||||
views.append(view);
|
||||
return views;
|
||||
return m_view.visit(
|
||||
[](StringView view) {
|
||||
auto views = view.lines(false);
|
||||
Vector<RegexStringView> new_views;
|
||||
for (auto& view : views)
|
||||
new_views.empend(view);
|
||||
return new_views;
|
||||
},
|
||||
[](Utf32View view) {
|
||||
Vector<RegexStringView> views;
|
||||
u32 newline = '\n';
|
||||
while (!view.is_empty()) {
|
||||
auto position = AK::memmem_optional(view.code_points(), view.length() * sizeof(u32), &newline, sizeof(u32));
|
||||
if (!position.has_value())
|
||||
break;
|
||||
auto offset = position.value() / sizeof(u32);
|
||||
views.empend(view.substring_view(0, offset));
|
||||
view = view.substring_view(offset + 1, view.length() - offset - 1);
|
||||
}
|
||||
if (!view.is_empty())
|
||||
views.empend(view);
|
||||
return views;
|
||||
},
|
||||
[](Utf8View& view) {
|
||||
Vector<RegexStringView> views;
|
||||
auto it = view.begin();
|
||||
auto previous_newline_position_it = it;
|
||||
for (;;) {
|
||||
if (*it == '\n') {
|
||||
auto previous_offset = view.byte_offset_of(previous_newline_position_it);
|
||||
auto new_offset = view.byte_offset_of(it);
|
||||
auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
|
||||
views.empend(slice);
|
||||
++it;
|
||||
previous_newline_position_it = it;
|
||||
}
|
||||
if (it.done())
|
||||
break;
|
||||
++it;
|
||||
}
|
||||
if (it != previous_newline_position_it) {
|
||||
auto previous_offset = view.byte_offset_of(previous_newline_position_it);
|
||||
auto new_offset = view.byte_offset_of(it);
|
||||
auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
|
||||
views.empend(slice);
|
||||
}
|
||||
return views;
|
||||
});
|
||||
}
|
||||
|
||||
RegexStringView substring_view(size_t offset, size_t length) const
|
||||
{
|
||||
if (is_u8_view()) {
|
||||
return u8view().substring_view(offset, length);
|
||||
}
|
||||
return u32view().substring_view(offset, length);
|
||||
return m_view.visit(
|
||||
[&](auto view) { return RegexStringView { view.substring_view(offset, length) }; },
|
||||
[&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; });
|
||||
}
|
||||
|
||||
String to_string() const
|
||||
{
|
||||
if (is_u8_view()) {
|
||||
return u8view().to_string();
|
||||
}
|
||||
|
||||
StringBuilder builder;
|
||||
builder.append(u32view());
|
||||
return builder.to_string();
|
||||
return m_view.visit(
|
||||
[](StringView view) { return view.to_string(); },
|
||||
[](auto& view) {
|
||||
StringBuilder builder;
|
||||
for (auto it = view.begin(); it != view.end(); ++it)
|
||||
builder.append_code_point(*it);
|
||||
return builder.to_string();
|
||||
});
|
||||
}
|
||||
|
||||
u32 operator[](size_t index) const
|
||||
{
|
||||
if (is_u8_view()) {
|
||||
i8 ch = u8view()[index];
|
||||
u8 value = *reinterpret_cast<u8*>(&ch);
|
||||
return static_cast<u32>(value);
|
||||
}
|
||||
return u32view().code_points()[index];
|
||||
return m_view.visit(
|
||||
[&](StringView view) -> u32 {
|
||||
auto ch = view[index];
|
||||
if (ch < 0)
|
||||
return 256u + ch;
|
||||
return ch;
|
||||
},
|
||||
[&](auto view) -> u32 { return view[index]; },
|
||||
[&](Utf8View& view) -> u32 {
|
||||
size_t i = index;
|
||||
for (auto it = view.begin(); it != view.end(); ++it, --i) {
|
||||
if (i == 0)
|
||||
return *it;
|
||||
}
|
||||
VERIFY_NOT_REACHED();
|
||||
});
|
||||
}
|
||||
|
||||
bool operator==(const char* cstring) const
|
||||
{
|
||||
if (is_u8_view())
|
||||
return u8view() == cstring;
|
||||
|
||||
return to_string() == cstring;
|
||||
return m_view.visit(
|
||||
[&](Utf32View) { return to_string() == cstring; },
|
||||
[&](Utf8View const& view) { return view.as_string() == cstring; },
|
||||
[&](StringView view) { return view == cstring; });
|
||||
}
|
||||
|
||||
bool operator!=(const char* cstring) const
|
||||
|
@ -149,18 +199,18 @@ public:
|
|||
|
||||
bool operator==(const String& string) const
|
||||
{
|
||||
if (is_u8_view())
|
||||
return u8view() == string;
|
||||
|
||||
return to_string() == string;
|
||||
return m_view.visit(
|
||||
[&](Utf32View) { return to_string() == string; },
|
||||
[&](Utf8View const& view) { return view.as_string() == string; },
|
||||
[&](StringView view) { return view == string; });
|
||||
}
|
||||
|
||||
bool operator==(const StringView& other) const
|
||||
bool operator==(const StringView& string) const
|
||||
{
|
||||
if (is_u8_view())
|
||||
return u8view() == other;
|
||||
|
||||
return false;
|
||||
return m_view.visit(
|
||||
[&](Utf32View) { return to_string() == string; },
|
||||
[&](Utf8View const& view) { return view.as_string() == string; },
|
||||
[&](StringView view) { return view == string; });
|
||||
}
|
||||
|
||||
bool operator!=(const StringView& other) const
|
||||
|
@ -170,13 +220,12 @@ public:
|
|||
|
||||
bool operator==(const Utf32View& other) const
|
||||
{
|
||||
if (is_u32_view()) {
|
||||
StringBuilder builder;
|
||||
builder.append(other);
|
||||
return to_string() == builder.to_string();
|
||||
}
|
||||
|
||||
return false;
|
||||
return m_view.visit(
|
||||
[&](Utf32View view) {
|
||||
return view.length() == other.length() && __builtin_memcmp(view.code_points(), other.code_points(), view.length() * sizeof(u32)) == 0;
|
||||
},
|
||||
[&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); },
|
||||
[&](StringView view) { return view == RegexStringView { other }.to_string(); });
|
||||
}
|
||||
|
||||
bool operator!=(const Utf32View& other) const
|
||||
|
@ -184,34 +233,78 @@ public:
|
|||
return !(*this == other);
|
||||
}
|
||||
|
||||
const char* characters_without_null_termination() const
|
||||
bool operator==(const Utf8View& other) const
|
||||
{
|
||||
if (is_u8_view())
|
||||
return u8view().characters_without_null_termination();
|
||||
return m_view.visit(
|
||||
[&](Utf32View) {
|
||||
return to_string() == other.as_string();
|
||||
},
|
||||
[&](Utf8View const& view) { return view.as_string() == other.as_string(); },
|
||||
[&](StringView view) { return other.as_string() == view; });
|
||||
}
|
||||
|
||||
return to_string().characters(); // FIXME: it contains the null termination, does that actually matter?
|
||||
bool operator!=(const Utf8View& other) const
|
||||
{
|
||||
return !(*this == other);
|
||||
}
|
||||
|
||||
bool equals(const RegexStringView& other) const
|
||||
{
|
||||
return other.m_view.visit([&](auto const& view) { return operator==(view); });
|
||||
}
|
||||
|
||||
bool equals_ignoring_case(const RegexStringView& other) const
|
||||
{
|
||||
// FIXME: Implement equals_ignoring_case() for unicode.
|
||||
return m_view.visit(
|
||||
[&](StringView view) {
|
||||
return other.m_view.visit(
|
||||
[&](StringView other_view) { return view.equals_ignoring_case(other_view); },
|
||||
[](auto&) -> bool { TODO(); });
|
||||
},
|
||||
[](auto&) -> bool { TODO(); });
|
||||
}
|
||||
|
||||
bool starts_with(const StringView& str) const
|
||||
{
|
||||
if (is_u32_view())
|
||||
return false;
|
||||
return u8view().starts_with(str);
|
||||
return m_view.visit(
|
||||
[&](Utf32View) -> bool {
|
||||
TODO();
|
||||
},
|
||||
[&](Utf8View const& view) { return view.as_string().starts_with(str); },
|
||||
[&](StringView view) { return view.starts_with(str); });
|
||||
}
|
||||
|
||||
bool starts_with(const Utf32View& str) const
|
||||
{
|
||||
if (is_u8_view())
|
||||
return false;
|
||||
|
||||
StringBuilder builder;
|
||||
builder.append(str);
|
||||
return to_string().starts_with(builder.to_string());
|
||||
return m_view.visit(
|
||||
[&](Utf32View view) -> bool {
|
||||
if (str.length() > view.length())
|
||||
return false;
|
||||
if (str.length() == view.length())
|
||||
return operator==(str);
|
||||
for (size_t i = 0; i < str.length(); ++i) {
|
||||
if (str.at(i) != view.at(i))
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
},
|
||||
[&](Utf8View const& view) {
|
||||
auto it = view.begin();
|
||||
for (auto code_point : str) {
|
||||
if (it.done())
|
||||
return false;
|
||||
if (code_point != *it)
|
||||
return false;
|
||||
++it;
|
||||
}
|
||||
return true;
|
||||
},
|
||||
[&](StringView) -> bool { TODO(); });
|
||||
}
|
||||
|
||||
private:
|
||||
Optional<StringView> m_u8view;
|
||||
Optional<Utf32View> m_u32view;
|
||||
Variant<StringView, Utf8View, Utf32View> m_view;
|
||||
};
|
||||
|
||||
class Match final {
|
||||
|
@ -271,6 +364,9 @@ struct MatchState {
|
|||
size_t string_position { 0 };
|
||||
size_t instruction_position { 0 };
|
||||
size_t fork_at_position { 0 };
|
||||
Vector<Match> matches;
|
||||
Vector<Vector<Match>> capture_group_matches;
|
||||
Vector<HashMap<String, Match>> named_capture_group_matches;
|
||||
};
|
||||
|
||||
struct MatchOutput {
|
||||
|
@ -288,6 +384,7 @@ template<>
|
|||
struct AK::Formatter<regex::RegexStringView> : Formatter<StringView> {
|
||||
void format(FormatBuilder& builder, const regex::RegexStringView& value)
|
||||
{
|
||||
return Formatter<StringView>::format(builder, { value.characters_without_null_termination(), value.length() });
|
||||
auto string = value.to_string();
|
||||
return Formatter<StringView>::format(builder, string);
|
||||
}
|
||||
};
|
||||
|
|
|
@ -99,34 +99,34 @@ RegexResult Matcher<Parser>::match(const Vector<RegexStringView> views, Optional
|
|||
}
|
||||
|
||||
if (c_match_preallocation_count) {
|
||||
output.matches.ensure_capacity(c_match_preallocation_count);
|
||||
output.capture_group_matches.ensure_capacity(c_match_preallocation_count);
|
||||
output.named_capture_group_matches.ensure_capacity(c_match_preallocation_count);
|
||||
state.matches.ensure_capacity(c_match_preallocation_count);
|
||||
state.capture_group_matches.ensure_capacity(c_match_preallocation_count);
|
||||
state.named_capture_group_matches.ensure_capacity(c_match_preallocation_count);
|
||||
|
||||
auto& capture_groups_count = m_pattern.parser_result.capture_groups_count;
|
||||
auto& named_capture_groups_count = m_pattern.parser_result.named_capture_groups_count;
|
||||
|
||||
for (size_t j = 0; j < c_match_preallocation_count; ++j) {
|
||||
output.matches.empend();
|
||||
output.capture_group_matches.unchecked_append({});
|
||||
output.capture_group_matches.at(j).ensure_capacity(capture_groups_count);
|
||||
state.matches.empend();
|
||||
state.capture_group_matches.unchecked_append({});
|
||||
state.capture_group_matches.at(j).ensure_capacity(capture_groups_count);
|
||||
for (size_t k = 0; k < capture_groups_count; ++k)
|
||||
output.capture_group_matches.at(j).unchecked_append({});
|
||||
state.capture_group_matches.at(j).unchecked_append({});
|
||||
|
||||
output.named_capture_group_matches.unchecked_append({});
|
||||
output.named_capture_group_matches.at(j).ensure_capacity(named_capture_groups_count);
|
||||
state.named_capture_group_matches.unchecked_append({});
|
||||
state.named_capture_group_matches.at(j).ensure_capacity(named_capture_groups_count);
|
||||
}
|
||||
}
|
||||
|
||||
// Stores the match spanning [start_position, state.string_position) of
// input.view into the slot at index input.match_index, appending a fresh slot
// first when the list is exactly that long.
// NOTE(review): rendered diff -- the variant taking `output` and writing to
// output.matches is presumably the removed version, and the variant writing to
// state.matches the added one; confirm against the real source file.
auto append_match = [](auto& input, auto& state, auto& output, auto& start_position) {
|
||||
if (output.matches.size() == input.match_index)
|
||||
output.matches.empend();
|
||||
auto append_match = [](auto& input, auto& state, auto& start_position) {
|
||||
if (state.matches.size() == input.match_index)
|
||||
state.matches.empend();
|
||||
|
||||
// start_position is added and then subtracted, so algebraically this checks
// state.string_position <= input.view.length().
VERIFY(start_position + state.string_position - start_position <= input.view.length());
|
||||
// With StringCopyMatches set, the matched substring is converted via
// to_string() before being stored; otherwise the substring view is stored as-is.
if (input.regex_options.has_flag_set(AllFlags::StringCopyMatches)) {
|
||||
output.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position).to_string(), input.line, start_position, input.global_offset + start_position };
|
||||
state.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position).to_string(), input.line, start_position, input.global_offset + start_position };
|
||||
} else { // let the view point to the original string ...
|
||||
output.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position), input.line, start_position, input.global_offset + start_position };
|
||||
state.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position), input.line, start_position, input.global_offset + start_position };
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -171,7 +171,7 @@ RegexResult Matcher<Parser>::match(const Vector<RegexStringView> views, Optional
|
|||
output = move(temp_output);
|
||||
if (!match_count) {
|
||||
// Nothing was *actually* matched, so append an empty match.
|
||||
append_match(input, state, output, view_index);
|
||||
append_match(input, state, view_index);
|
||||
++match_count;
|
||||
}
|
||||
}
|
||||
|
@ -219,21 +219,21 @@ RegexResult Matcher<Parser>::match(const Vector<RegexStringView> views, Optional
|
|||
++match_count;
|
||||
|
||||
if (continue_search) {
|
||||
append_match(input, state, output, view_index);
|
||||
append_match(input, state, view_index);
|
||||
|
||||
bool has_zero_length = state.string_position == view_index;
|
||||
view_index = state.string_position - (has_zero_length ? 0 : 1);
|
||||
continue;
|
||||
|
||||
} else if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) {
|
||||
append_match(input, state, output, view_index);
|
||||
append_match(input, state, view_index);
|
||||
break;
|
||||
|
||||
} else if (state.string_position < view_length) {
|
||||
return { false, 0, {}, {}, {}, output.operations };
|
||||
}
|
||||
|
||||
append_match(input, state, output, view_index);
|
||||
append_match(input, state, view_index);
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -253,7 +253,7 @@ RegexResult Matcher<Parser>::match(const Vector<RegexStringView> views, Optional
|
|||
|
||||
MatchOutput output_copy;
|
||||
if (match_count) {
|
||||
output_copy.capture_group_matches = output.capture_group_matches;
|
||||
output_copy.capture_group_matches = state.capture_group_matches;
|
||||
// Make sure there are as many capture matches as there are actual matches.
|
||||
if (output_copy.capture_group_matches.size() < match_count)
|
||||
output_copy.capture_group_matches.resize(match_count);
|
||||
|
@ -264,12 +264,12 @@ RegexResult Matcher<Parser>::match(const Vector<RegexStringView> views, Optional
|
|||
matches.template remove_all_matching([](auto& match) { return match.view.is_null(); });
|
||||
}
|
||||
|
||||
output_copy.named_capture_group_matches = output.named_capture_group_matches;
|
||||
output_copy.named_capture_group_matches = state.named_capture_group_matches;
|
||||
// Make sure there are as many capture matches as there are actual matches.
|
||||
if (output_copy.named_capture_group_matches.size() < match_count)
|
||||
output_copy.named_capture_group_matches.resize(match_count);
|
||||
|
||||
output_copy.matches = output.matches;
|
||||
output_copy.matches = state.matches;
|
||||
} else {
|
||||
output_copy.capture_group_matches.clear_with_capacity();
|
||||
output_copy.named_capture_group_matches.clear_with_capacity();
|
||||
|
|
|
@ -417,7 +417,7 @@ private:
|
|||
|
||||
StringBuilder result;
|
||||
for (auto& e : match.capture_group_matches[0])
|
||||
result.append(e.view.u8view());
|
||||
result.append(e.view.string_view());
|
||||
|
||||
return result.build();
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue