123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282 |
- /*
- * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
- *
- * SPDX-License-Identifier: BSD-2-Clause
- */
- #pragma once
- #include "RegexOptions.h"
- #include "AK/FlyString.h"
- #include "AK/HashMap.h"
- #include "AK/String.h"
- #include "AK/StringBuilder.h"
- #include "AK/StringView.h"
- #include "AK/Utf32View.h"
- #include "AK/Vector.h"
- namespace regex {
- class RegexStringView {
- public:
- RegexStringView(const char* chars)
- : m_u8view(chars)
- {
- }
- RegexStringView(const String& string)
- : m_u8view(string)
- {
- }
- RegexStringView(const StringView view)
- : m_u8view(view)
- {
- }
- RegexStringView(const Utf32View view)
- : m_u32view(view)
- {
- }
- bool is_u8_view() const { return m_u8view.has_value(); }
- bool is_u32_view() const { return m_u32view.has_value(); }
- const StringView& u8view() const
- {
- VERIFY(m_u8view.has_value());
- return m_u8view.value();
- };
- const Utf32View& u32view() const
- {
- VERIFY(m_u32view.has_value());
- return m_u32view.value();
- };
- bool is_empty() const
- {
- if (is_u8_view())
- return m_u8view.value().is_empty();
- else
- return m_u32view.value().is_empty();
- }
- bool is_null() const
- {
- if (is_u8_view())
- return m_u8view.value().is_null();
- else
- return m_u32view.value().code_points() == nullptr;
- }
- size_t length() const
- {
- if (is_u8_view())
- return m_u8view.value().length();
- else
- return m_u32view.value().length();
- }
- Vector<RegexStringView> lines() const
- {
- if (is_u8_view()) {
- auto views = u8view().lines(false);
- Vector<RegexStringView> new_views;
- for (auto& view : views)
- new_views.append(move(view));
- return new_views;
- }
- // FIXME: line splitting for Utf32View needed
- Vector<RegexStringView> views;
- views.append(m_u32view.value());
- return views;
- }
- RegexStringView substring_view(size_t offset, size_t length) const
- {
- if (is_u8_view()) {
- return u8view().substring_view(offset, length);
- }
- return u32view().substring_view(offset, length);
- }
- String to_string() const
- {
- if (is_u8_view()) {
- return u8view().to_string();
- }
- StringBuilder builder;
- builder.append(u32view());
- return builder.to_string();
- }
- u32 operator[](size_t index) const
- {
- if (is_u8_view()) {
- i8 ch = u8view()[index];
- u8 value = *reinterpret_cast<u8*>(&ch);
- return static_cast<u32>(value);
- }
- return u32view().code_points()[index];
- }
- bool operator==(const char* cstring) const
- {
- if (is_u8_view())
- return u8view() == cstring;
- return to_string() == cstring;
- }
- bool operator!=(const char* cstring) const
- {
- return !(*this == cstring);
- }
- bool operator==(const String& string) const
- {
- if (is_u8_view())
- return u8view() == string;
- return to_string() == string;
- }
- bool operator==(const StringView& other) const
- {
- if (is_u8_view())
- return u8view() == other;
- return false;
- }
- bool operator!=(const StringView& other) const
- {
- return !(*this == other);
- }
- bool operator==(const Utf32View& other) const
- {
- if (is_u32_view()) {
- StringBuilder builder;
- builder.append(other);
- return to_string() == builder.to_string();
- }
- return false;
- }
- bool operator!=(const Utf32View& other) const
- {
- return !(*this == other);
- }
- const char* characters_without_null_termination() const
- {
- if (is_u8_view())
- return u8view().characters_without_null_termination();
- return to_string().characters(); // FIXME: it contains the null termination, does that actually matter?
- }
- bool starts_with(const StringView& str) const
- {
- if (is_u32_view())
- return false;
- return u8view().starts_with(str);
- }
- bool starts_with(const Utf32View& str) const
- {
- if (is_u8_view())
- return false;
- StringBuilder builder;
- builder.append(str);
- return to_string().starts_with(builder.to_string());
- }
- private:
- Optional<StringView> m_u8view;
- Optional<Utf32View> m_u32view;
- };
- class Match final {
- private:
- Optional<FlyString> string;
- public:
- Match() = default;
- ~Match() = default;
- Match(const RegexStringView view_, const size_t line_, const size_t column_, const size_t global_offset_)
- : view(view_)
- , line(line_)
- , column(column_)
- , global_offset(global_offset_)
- , left_column(column_)
- {
- }
- Match(const String string_, const size_t line_, const size_t column_, const size_t global_offset_)
- : string(string_)
- , view(string.value().view())
- , line(line_)
- , column(column_)
- , global_offset(global_offset_)
- , left_column(column_)
- {
- }
- RegexStringView view { nullptr };
- size_t line { 0 };
- size_t column { 0 };
- size_t global_offset { 0 };
- // ugly, as not usable by user, but needed to prevent to create extra vectors that are
- // able to store the column when the left paren has been found
- size_t left_column { 0 };
- };
- struct MatchInput {
- RegexStringView view { nullptr };
- AllOptions regex_options {};
- size_t start_offset { 0 }; // For Stateful matches, saved and restored from Regex::start_offset.
- size_t match_index { 0 };
- size_t line { 0 };
- size_t column { 0 };
- size_t global_offset { 0 }; // For multiline matching, knowing the offset from start could be important
- mutable size_t fail_counter { 0 };
- mutable Vector<size_t> saved_positions;
- };
// Position counters tracked while matching; every field starts at zero.
struct MatchState {
    size_t string_position_before_match = 0;
    size_t string_position = 0;
    size_t instruction_position = 0;
    size_t fork_at_position = 0;
};
- struct MatchOutput {
- size_t operations;
- Vector<Match> matches;
- Vector<Vector<Match>> capture_group_matches;
- Vector<HashMap<String, Match>> named_capture_group_matches;
- };
- }
- using regex::RegexStringView;
- template<>
- struct AK::Formatter<regex::RegexStringView> : Formatter<StringView> {
- void format(FormatBuilder& builder, const regex::RegexStringView& value)
- {
- return Formatter<StringView>::format(builder, { value.characters_without_null_termination(), value.length() });
- }
- };
|