From 3f10a5701d9634e47111203b837283bdfc2d8b18 Mon Sep 17 00:00:00 2001
From: Sam Atkins
Date: Fri, 15 Nov 2024 15:43:59 +0000
Subject: [PATCH] AK: Add Utf8View::for_each_split_view() method

Returns one Utf8View at a time, using a callback function to identify
code points to split on.
---
 AK/Utf8View.h         | 53 +++++++++++++++++++++++++++++++++++++++++++
 Tests/AK/TestUtf8.cpp | 23 +++++++++++++++++++
 2 files changed, 76 insertions(+)

diff --git a/AK/Utf8View.h b/AK/Utf8View.h
index 79a26594d8f..9158bf669c1 100644
--- a/AK/Utf8View.h
+++ b/AK/Utf8View.h
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include <AK/Function.h>
 #include
 #include
 
@@ -139,6 +140,58 @@ public:
 
     bool validate(size_t& valid_bytes, AllowSurrogates allow_surrogates = AllowSurrogates::Yes) const;
 
+    // Splits the view at each code point for which `splitter` returns true and passes every part to `callback`
+    // as a Utf8View, in order. KeepEmpty also reports zero-length parts. KeepTrailingSeparator leaves the
+    // separator on the end of the part it terminates; unless KeepEmpty is set too, parts that are only that
+    // separator are skipped. Any value returned by `callback` is ignored.
+    template<typename Callback>
+    auto for_each_split_view(Function<bool(u32)> splitter, SplitBehavior split_behavior, Callback callback) const
+    {
+        bool const keep_empty = has_flag(split_behavior, SplitBehavior::KeepEmpty);
+        bool const keep_trailing_separator = has_flag(split_behavior, SplitBehavior::KeepTrailingSeparator);
+
+        // Code point offsets; size_t instead of the unsigned int that `0u` deduces to.
+        size_t start_offset = 0;
+        size_t offset = 0;
+
+        auto run_callback = [&]() {
+            auto const length = offset - start_offset;
+
+            if (length == 0 && !keep_empty)
+                return;
+
+            // NOTE(review): if unicode_substring_view() walks from the start of the view to reach
+            // start_offset, splitting is quadratic - confirm, and consider slicing by byte offset.
+            auto substring = unicode_substring_view(start_offset, length);
+
+            // Reject splitter-only entries if we're not keeping empty results
+            if (keep_trailing_separator && !keep_empty && length == 1 && splitter(*substring.begin()))
+                return;
+
+            callback(substring);
+        };
+
+        for (auto iterator = begin(); iterator != end(); ++iterator) {
+            if (!splitter(*iterator)) {
+                ++offset;
+                continue;
+            }
+
+            // Count a kept separator before emitting so it ends the part; step over a dropped one afterwards.
+            if (keep_trailing_separator) {
+                ++offset;
+                run_callback();
+            } else {
+                run_callback();
+                ++offset;
+            }
+            start_offset = offset;
+        }
+
+        // Whatever follows the last separator is emitted under the same rules.
+        run_callback();
+    }
+
 private:
     friend class Utf8CodePointIterator;
 
diff --git a/Tests/AK/TestUtf8.cpp b/Tests/AK/TestUtf8.cpp
index 988707c41d9..c007cdd7e6c 100644
--- a/Tests/AK/TestUtf8.cpp
+++ b/Tests/AK/TestUtf8.cpp
@@ -312,3
+312,26 @@ TEST_CASE(trim)
         EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "\u180E");
     }
 }
+
+static bool is_period(u32 code_point)
+{
+    return code_point == '.';
+}
+
+TEST_CASE(for_each_split_view)
+{
+    Utf8View input { "...Well..hello.friends!..."sv };
+    auto split_with = [&input](auto behavior) {
+        Vector<StringView> parts;
+        input.for_each_split_view(is_period, behavior, [&parts](auto piece) {
+            parts.append(piece.as_string());
+        });
+        return parts;
+    };
+
+    // Leading, consecutive and trailing periods produce empty parts, which each flag combination treats differently.
+    EXPECT_EQ(split_with(SplitBehavior::Nothing), Vector<StringView>({ "Well"sv, "hello"sv, "friends!"sv }));
+    EXPECT_EQ(split_with(SplitBehavior::KeepEmpty), Vector<StringView>({ ""sv, ""sv, ""sv, "Well"sv, ""sv, "hello"sv, "friends!"sv, ""sv, ""sv, ""sv }));
+    EXPECT_EQ(split_with(SplitBehavior::KeepTrailingSeparator), Vector<StringView>({ "Well."sv, "hello."sv, "friends!."sv }));
+    EXPECT_EQ(split_with(SplitBehavior::KeepEmpty | SplitBehavior::KeepTrailingSeparator), Vector<StringView>({ "."sv, "."sv, "."sv, "Well."sv, "."sv, "hello."sv, "friends!."sv, "."sv, "."sv, ""sv }));
+}