12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706 |
- /*
- * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
- *
- * SPDX-License-Identifier: BSD-2-Clause
- */
- #pragma once
- #include "Forward.h"
- #include "RegexOptions.h"
- #include <AK/Error.h>
- #include <AK/ByteString.h>
- #include <AK/DeprecatedFlyString.h>
- #include <AK/HashMap.h>
- #include <AK/MemMem.h>
- #include <AK/RedBlackTree.h>
- #include <AK/StringBuilder.h>
- #include <AK/StringView.h>
- #include <AK/Utf16View.h>
- #include <AK/Utf32View.h>
- #include <AK/Utf8View.h>
- #include <AK/Variant.h>
- #include <AK/Vector.h>
- namespace regex {
- template<typename T>
- class COWVector {
- struct Detail : RefCounted<Detail> {
- Vector<T> m_members;
- };
- public:
- COWVector()
- : m_detail(make_ref_counted<Detail>())
- {
- }
- COWVector(COWVector const&) = default;
- COWVector(COWVector&&) = default;
- COWVector& operator=(COWVector const&) = default;
- COWVector& operator=(COWVector&&) = default;
- Vector<T> release() &&
- {
- if (m_detail->ref_count() == 1)
- return exchange(m_detail->m_members, Vector<T>());
- return m_detail->m_members;
- }
- void append(T const& value)
- {
- return append(T { value });
- }
- void append(T&& value)
- {
- copy();
- m_detail->m_members.append(move(value));
- }
- void resize(size_t size)
- {
- copy();
- m_detail->m_members.resize(size);
- }
- void ensure_capacity(size_t capacity)
- {
- if (m_detail->m_members.capacity() >= capacity)
- return;
- copy();
- m_detail->m_members.ensure_capacity(capacity);
- }
- template<typename... Args>
- void empend(Args&&... args)
- {
- copy();
- m_detail->m_members.empend(forward<Args>(args)...);
- }
- void clear()
- {
- if (m_detail->ref_count() > 1)
- m_detail = make_ref_counted<Detail>();
- else
- m_detail->m_members.clear();
- }
- T& at(size_t index)
- {
- // We're handing out a mutable reference, so make sure we own the data exclusively.
- copy();
- return m_detail->m_members.at(index);
- }
- T const& at(size_t index) const
- {
- return m_detail->m_members.at(index);
- }
- T& operator[](size_t index)
- {
- // We're handing out a mutable reference, so make sure we own the data exclusively.
- copy();
- return m_detail->m_members[index];
- }
- T const& operator[](size_t index) const
- {
- return m_detail->m_members[index];
- }
- size_t capacity() const
- {
- return m_detail->m_members.capacity();
- }
- size_t size() const
- {
- return m_detail->m_members.size();
- }
- bool is_empty() const
- {
- return m_detail->m_members.is_empty();
- }
- T const& first() const
- {
- return m_detail->m_members.first();
- }
- T const& last() const
- {
- return m_detail->m_members.last();
- }
- private:
- void copy()
- {
- if (m_detail->ref_count() <= 1)
- return;
- auto new_detail = make_ref_counted<Detail>();
- new_detail->m_members = m_detail->m_members;
- m_detail = new_detail;
- }
- NonnullRefPtr<Detail> m_detail;
- };
- class RegexStringView {
- public:
- RegexStringView() = default;
- RegexStringView(ByteString const& string)
- : m_view(string.view())
- {
- }
- RegexStringView(String const& string)
- : m_view(string.bytes_as_string_view())
- {
- }
- RegexStringView(StringView const view)
- : m_view(view)
- {
- }
- RegexStringView(Utf32View view)
- : m_view(view)
- {
- }
- RegexStringView(Utf16View view)
- : m_view(view)
- {
- }
- RegexStringView(Utf8View view)
- : m_view(view)
- {
- }
- explicit RegexStringView(ByteString&&) = delete;
- bool is_string_view() const
- {
- return m_view.has<StringView>();
- }
- StringView string_view() const
- {
- return m_view.get<StringView>();
- }
- Utf32View const& u32_view() const
- {
- return m_view.get<Utf32View>();
- }
- Utf16View const& u16_view() const
- {
- return m_view.get<Utf16View>();
- }
- Utf8View const& u8_view() const
- {
- return m_view.get<Utf8View>();
- }
- bool unicode() const { return m_unicode; }
- void set_unicode(bool unicode) { m_unicode = unicode; }
- bool is_empty() const
- {
- return m_view.visit([](auto& view) { return view.is_empty(); });
- }
- bool is_null() const
- {
- return m_view.visit([](auto& view) { return view.is_null(); });
- }
- size_t length() const
- {
- if (unicode()) {
- return m_view.visit(
- [](Utf16View const& view) { return view.length_in_code_points(); },
- [](auto const& view) { return view.length(); });
- }
- return length_in_code_units();
- }
- size_t length_in_code_units() const
- {
- return m_view.visit(
- [](Utf16View const& view) { return view.length_in_code_units(); },
- [](Utf8View const& view) { return view.byte_length(); },
- [](auto const& view) { return view.length(); });
- }
- size_t length_of_code_point(u32 code_point) const
- {
- return m_view.visit(
- [](Utf32View const&) { return 1; },
- [&](Utf16View const&) {
- if (code_point < 0x10000)
- return 1;
- return 2;
- },
- [&](auto const&) {
- if (code_point <= 0x7f)
- return 1;
- if (code_point <= 0x07ff)
- return 2;
- if (code_point <= 0xffff)
- return 3;
- return 4;
- });
- }
- RegexStringView typed_null_view()
- {
- auto view = m_view.visit(
- [&]<typename T>(T const&) {
- return RegexStringView { T {} };
- });
- view.set_unicode(unicode());
- return view;
- }
- RegexStringView construct_as_same(Span<u32> data, Optional<ByteString>& optional_string_storage, Utf16Data& optional_utf16_storage) const
- {
- auto view = m_view.visit(
- [&]<typename T>(T const&) {
- StringBuilder builder;
- for (auto ch : data)
- builder.append(ch); // Note: The type conversion is intentional.
- optional_string_storage = builder.to_byte_string();
- return RegexStringView { T { *optional_string_storage } };
- },
- [&](Utf32View) {
- return RegexStringView { Utf32View { data.data(), data.size() } };
- },
- [&](Utf16View) {
- optional_utf16_storage = AK::utf32_to_utf16(Utf32View { data.data(), data.size() }).release_value_but_fixme_should_propagate_errors();
- return RegexStringView { Utf16View { optional_utf16_storage } };
- });
- view.set_unicode(unicode());
- return view;
- }
- Vector<RegexStringView> lines() const
- {
- return m_view.visit(
- [](StringView view) {
- auto views = view.lines(false);
- Vector<RegexStringView> new_views;
- for (auto& view : views)
- new_views.empend(view);
- return new_views;
- },
- [](Utf32View view) {
- if (view.is_empty())
- return Vector<RegexStringView> { view };
- Vector<RegexStringView> views;
- u32 newline = '\n';
- while (!view.is_empty()) {
- auto position = AK::memmem_optional(view.code_points(), view.length() * sizeof(u32), &newline, sizeof(u32));
- if (!position.has_value())
- break;
- auto offset = position.value() / sizeof(u32);
- views.empend(view.substring_view(0, offset));
- view = view.substring_view(offset + 1, view.length() - offset - 1);
- }
- if (!view.is_empty())
- views.empend(view);
- return views;
- },
- [](Utf16View view) {
- if (view.is_empty())
- return Vector<RegexStringView> { view };
- Vector<RegexStringView> views;
- u16 newline = '\n';
- while (!view.is_empty()) {
- auto position = AK::memmem_optional(view.data(), view.length_in_code_units() * sizeof(u16), &newline, sizeof(u16));
- if (!position.has_value())
- break;
- auto offset = position.value() / sizeof(u16);
- views.empend(view.substring_view(0, offset));
- view = view.substring_view(offset + 1, view.length_in_code_units() - offset - 1);
- }
- if (!view.is_empty())
- views.empend(view);
- return views;
- },
- [](Utf8View const& view) {
- if (view.is_empty())
- return Vector<RegexStringView> { view };
- Vector<RegexStringView> views;
- auto it = view.begin();
- auto previous_newline_position_it = it;
- for (;;) {
- if (*it == '\n') {
- auto previous_offset = view.byte_offset_of(previous_newline_position_it);
- auto new_offset = view.byte_offset_of(it);
- auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
- views.empend(slice);
- ++it;
- previous_newline_position_it = it;
- }
- if (it.done())
- break;
- ++it;
- }
- if (it != previous_newline_position_it) {
- auto previous_offset = view.byte_offset_of(previous_newline_position_it);
- auto new_offset = view.byte_offset_of(it);
- auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
- views.empend(slice);
- }
- return views;
- });
- }
- RegexStringView substring_view(size_t offset, size_t length) const
- {
- if (unicode()) {
- auto view = m_view.visit(
- [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; },
- [&](Utf16View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; },
- [&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; });
- view.set_unicode(unicode());
- return view;
- }
- auto view = m_view.visit([&](auto view) { return RegexStringView { view.substring_view(offset, length) }; });
- view.set_unicode(unicode());
- return view;
- }
- ByteString to_byte_string() const
- {
- return m_view.visit(
- [](StringView view) { return view.to_byte_string(); },
- [](Utf16View view) { return view.to_byte_string(Utf16View::AllowInvalidCodeUnits::Yes).release_value_but_fixme_should_propagate_errors(); },
- [](auto& view) {
- StringBuilder builder;
- for (auto it = view.begin(); it != view.end(); ++it)
- builder.append_code_point(*it);
- return builder.to_byte_string();
- });
- }
- ErrorOr<String> to_string() const
- {
- return m_view.visit(
- [](StringView view) { return String::from_utf8(view); },
- [](Utf16View view) { return view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes); },
- [](auto& view) -> ErrorOr<String> {
- StringBuilder builder;
- for (auto it = view.begin(); it != view.end(); ++it)
- TRY(builder.try_append_code_point(*it));
- return builder.to_string();
- });
- }
- // Note: index must always be the code unit offset to return.
- u32 operator[](size_t index) const
- {
- return m_view.visit(
- [&](StringView view) -> u32 {
- auto ch = view[index];
- if constexpr (IsSigned<char>) {
- if (ch < 0)
- return 256u + ch;
- return ch;
- }
- },
- [&](Utf32View const& view) -> u32 { return view[index]; },
- [&](Utf16View const& view) -> u32 { return view.code_point_at(index); },
- [&](Utf8View const& view) -> u32 {
- auto it = view.iterator_at_byte_offset(index);
- VERIFY(it != view.end());
- return *it;
- });
- }
- u32 code_unit_at(size_t code_unit_index) const
- {
- if (unicode())
- return operator[](code_unit_index);
- return m_view.visit(
- [&](StringView view) -> u32 {
- auto ch = view[code_unit_index];
- if constexpr (IsSigned<char>) {
- if (ch < 0)
- return 256u + ch;
- return ch;
- }
- },
- [&](Utf32View const& view) -> u32 { return view[code_unit_index]; },
- [&](Utf16View const& view) -> u32 { return view.code_unit_at(code_unit_index); },
- [&](Utf8View const& view) -> u32 {
- auto it = view.iterator_at_byte_offset(code_unit_index);
- VERIFY(it != view.end());
- return *it;
- });
- }
- size_t code_unit_offset_of(size_t code_point_index) const
- {
- return m_view.visit(
- [&](StringView view) -> u32 {
- Utf8View utf8_view { view };
- return utf8_view.byte_offset_of(code_point_index);
- },
- [&](Utf32View const&) -> u32 { return code_point_index; },
- [&](Utf16View const& view) -> u32 {
- return view.code_unit_offset_of(code_point_index);
- },
- [&](Utf8View const& view) -> u32 {
- return view.byte_offset_of(code_point_index);
- });
- }
- bool operator==(char const* cstring) const
- {
- return m_view.visit(
- [&](Utf32View) { return to_byte_string() == cstring; },
- [&](Utf16View) { return to_byte_string() == cstring; },
- [&](Utf8View const& view) { return view.as_string() == cstring; },
- [&](StringView view) { return view == cstring; });
- }
- bool operator==(ByteString const& string) const
- {
- return m_view.visit(
- [&](Utf32View) { return to_byte_string() == string; },
- [&](Utf16View) { return to_byte_string() == string; },
- [&](Utf8View const& view) { return view.as_string() == string; },
- [&](StringView view) { return view == string; });
- }
- bool operator==(StringView string) const
- {
- return m_view.visit(
- [&](Utf32View) { return to_byte_string() == string; },
- [&](Utf16View) { return to_byte_string() == string; },
- [&](Utf8View const& view) { return view.as_string() == string; },
- [&](StringView view) { return view == string; });
- }
- bool operator==(Utf32View const& other) const
- {
- return m_view.visit(
- [&](Utf32View view) {
- return view.length() == other.length() && __builtin_memcmp(view.code_points(), other.code_points(), view.length() * sizeof(u32)) == 0;
- },
- [&](Utf16View) { return to_byte_string() == RegexStringView { other }.to_byte_string(); },
- [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_byte_string(); },
- [&](StringView view) { return view == RegexStringView { other }.to_byte_string(); });
- }
- bool operator==(Utf16View const& other) const
- {
- return m_view.visit(
- [&](Utf32View) { return to_byte_string() == RegexStringView { other }.to_byte_string(); },
- [&](Utf16View const& view) { return view == other; },
- [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_byte_string(); },
- [&](StringView view) { return view == RegexStringView { other }.to_byte_string(); });
- }
- bool operator==(Utf8View const& other) const
- {
- return m_view.visit(
- [&](Utf32View) { return to_byte_string() == other.as_string(); },
- [&](Utf16View) { return to_byte_string() == other.as_string(); },
- [&](Utf8View const& view) { return view.as_string() == other.as_string(); },
- [&](StringView view) { return other.as_string() == view; });
- }
- bool equals(RegexStringView other) const
- {
- return other.m_view.visit([this](auto const& view) { return operator==(view); });
- }
- bool equals_ignoring_case(RegexStringView other) const
- {
- // FIXME: Implement equals_ignoring_case() for unicode.
- return m_view.visit(
- [&](StringView view) {
- return other.m_view.visit(
- [&](StringView other_view) { return view.equals_ignoring_ascii_case(other_view); },
- [](auto&) -> bool { TODO(); });
- },
- [&](Utf16View view) {
- return other.m_view.visit(
- [&](Utf16View other_view) { return view.equals_ignoring_case(other_view); },
- [](auto&) -> bool { TODO(); });
- },
- [](auto&) -> bool { TODO(); });
- }
- bool starts_with(StringView str) const
- {
- return m_view.visit(
- [&](Utf32View) -> bool {
- TODO();
- },
- [&](Utf16View) -> bool {
- TODO();
- },
- [&](Utf8View const& view) { return view.as_string().starts_with(str); },
- [&](StringView view) { return view.starts_with(str); });
- }
- bool starts_with(Utf32View const& str) const
- {
- return m_view.visit(
- [&](Utf32View view) -> bool {
- if (str.length() > view.length())
- return false;
- if (str.length() == view.length())
- return operator==(str);
- for (size_t i = 0; i < str.length(); ++i) {
- if (str.at(i) != view.at(i))
- return false;
- }
- return true;
- },
- [&](Utf16View) -> bool { TODO(); },
- [&](Utf8View const& view) {
- auto it = view.begin();
- for (auto code_point : str) {
- if (it.done())
- return false;
- if (code_point != *it)
- return false;
- ++it;
- }
- return true;
- },
- [&](StringView) -> bool { TODO(); });
- }
- private:
- Variant<StringView, Utf8View, Utf16View, Utf32View> m_view { StringView {} };
- bool m_unicode { false };
- };
- class Match final {
- private:
- Optional<DeprecatedFlyString> string;
- public:
- Match() = default;
- ~Match() = default;
- Match(RegexStringView view_, size_t const line_, size_t const column_, size_t const global_offset_)
- : view(view_)
- , line(line_)
- , column(column_)
- , global_offset(global_offset_)
- , left_column(column_)
- {
- }
- Match(ByteString string_, size_t const line_, size_t const column_, size_t const global_offset_)
- : string(move(string_))
- , view(string.value().view())
- , line(line_)
- , column(column_)
- , global_offset(global_offset_)
- {
- }
- Match(RegexStringView const view_, StringView capture_group_name_, size_t const line_, size_t const column_, size_t const global_offset_)
- : view(view_)
- , capture_group_name(capture_group_name_)
- , line(line_)
- , column(column_)
- , global_offset(global_offset_)
- , left_column(column_)
- {
- }
- void reset()
- {
- view = view.typed_null_view();
- capture_group_name.clear();
- line = 0;
- column = 0;
- global_offset = 0;
- left_column = 0;
- }
- RegexStringView view {};
- Optional<DeprecatedFlyString> capture_group_name {};
- size_t line { 0 };
- size_t column { 0 };
- size_t global_offset { 0 };
- // ugly, as not usable by user, but needed to prevent to create extra vectors that are
- // able to store the column when the left paren has been found
- size_t left_column { 0 };
- };
- struct MatchInput { // Input text, options and position bookkeeping handed to a match attempt.
- RegexStringView view {}; // The text to match against.
- AllOptions regex_options {};
- size_t start_offset { 0 }; // For Stateful matches, saved and restored from Regex::start_offset.
- size_t match_index { 0 };
- size_t line { 0 };
- size_t column { 0 };
- size_t global_offset { 0 }; // For multiline matching, knowing the offset from start could be important
- mutable size_t fail_counter { 0 }; // The members below are `mutable`, so they can be updated through a const MatchInput.
- mutable Vector<size_t> saved_positions;
- mutable Vector<size_t> saved_code_unit_positions;
- mutable Vector<size_t> saved_forks_since_last_save;
- mutable Vector<u64, 64> checkpoints;
- mutable Optional<size_t> fork_to_replace;
- };
- struct MatchState { // Positions and collected results of a match attempt; all counters start at zero.
- size_t string_position_before_match { 0 };
- size_t string_position { 0 };
- size_t string_position_in_code_units { 0 }; // NOTE(review): presumably string_position expressed in code units - confirm against the matcher.
- size_t instruction_position { 0 };
- size_t fork_at_position { 0 };
- size_t forks_since_last_save { 0 };
- Optional<size_t> initiating_fork;
- COWVector<Match> matches; // COWVector members: a copied MatchState shares this storage until either copy modifies it.
- COWVector<Vector<Match>> capture_group_matches;
- COWVector<u64> repetition_marks;
- };
- }
- using regex::RegexStringView;
- template<>
- struct AK::Formatter<regex::RegexStringView> : Formatter<StringView> {
- ErrorOr<void> format(FormatBuilder& builder, regex::RegexStringView value)
- {
- auto string = value.to_byte_string();
- return Formatter<StringView>::format(builder, string);
- }
- };
|