From cfcb29bdfd8d07a562af515e2d7c195ae2f3c201 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Sun, 3 Nov 2024 17:13:56 -0500 Subject: [PATCH] AK+LibUnicode: Add a method to trim non-ASCII whitespace from a String Required by WebDriver. --- AK/String.h | 3 +- Tests/AK/TestString.cpp | 76 +++++++++++++++++++ .../Libraries/LibUnicode/CharacterTypes.cpp | 5 ++ .../Libraries/LibUnicode/CharacterTypes.h | 1 + Userland/Libraries/LibUnicode/String.cpp | 35 ++++++++- 5 files changed, 118 insertions(+), 2 deletions(-) diff --git a/AK/String.h b/AK/String.h index c10b3fbf296..825677b9408 100644 --- a/AK/String.h +++ b/AK/String.h @@ -1,6 +1,6 @@ /* * Copyright (c) 2018-2022, Andreas Kling - * Copyright (c) 2023, Tim Flynn + * Copyright (c) 2023-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -144,6 +144,7 @@ public: ErrorOr trim(Utf8View const& code_points_to_trim, TrimMode mode = TrimMode::Both) const; ErrorOr trim(StringView code_points_to_trim, TrimMode mode = TrimMode::Both) const; + ErrorOr trim_whitespace(TrimMode mode = TrimMode::Both) const; ErrorOr trim_ascii_whitespace(TrimMode mode = TrimMode::Both) const; ErrorOr> split_limit(u32 separator, size_t limit, SplitBehavior = SplitBehavior::Nothing) const; diff --git a/Tests/AK/TestString.cpp b/Tests/AK/TestString.cpp index ccffba67179..59b89b59af1 100644 --- a/Tests/AK/TestString.cpp +++ b/Tests/AK/TestString.cpp @@ -1289,6 +1289,82 @@ TEST_CASE(trim) } } +/* Checks String::trim_whitespace with TrimMode::Both, TrimMode::Left and TrimMode::Right on an empty string, on whitespace-only strings (including tab, newline, carriage return, U+00A0 and U+202F), and on strings with leading, trailing and interior whitespace. Interior whitespace is expected to be preserved in every mode. */ TEST_CASE(trim_whitespace) +{ + { + String string {}; + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), String {}); + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), String {}); + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), String {}); + } + { + auto string = " "_string; + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), String {}); + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), String {}); + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), String {}); + } + { + auto string = " "_string; + 
EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), String {}); + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), String {}); + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), String {}); + } + { + auto string = " \t \n \r \u00A0 \u202F "_string; + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), String {}); + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), String {}); + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), String {}); + } + { + auto string = "abcdef"_string; + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), "abcdef"_string); + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), "abcdef"_string); + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), "abcdef"_string); + } + { + auto string = " \u00A0 abcdef"_string; + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), "abcdef"_string); + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), "abcdef"_string); + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), " \u00A0 abcdef"_string); + } + { + auto string = "abcdef \u202F "_string; + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), "abcdef"_string); + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), "abcdef \u202F "_string); + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), "abcdef"_string); + } + { + auto string = " \u00A0 abcdef \u202F "_string; + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), "abcdef"_string); + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), "abcdef \u202F "_string); + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), " \u00A0 abcdef"_string); + } + { + auto string = "ab \t cd \n ef"_string; + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), "ab \t cd \n ef"_string); + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), "ab \t cd \n ef"_string); + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), "ab \t cd \n ef"_string); + } + { + auto string = " \u00A0 ab \t cd \n ef"_string; + 
EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), "ab \t cd \n ef"_string); + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), "ab \t cd \n ef"_string); + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), " \u00A0 ab \t cd \n ef"_string); + } + { + auto string = "ab \t cd \n ef \u202F "_string; + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), "ab \t cd \n ef"_string); + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), "ab \t cd \n ef \u202F "_string); + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), "ab \t cd \n ef"_string); + } + { + auto string = " \u00A0 ab \t cd \n ef \u202F "_string; + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), "ab \t cd \n ef"_string); + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), "ab \t cd \n ef \u202F "_string); + EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), " \u00A0 ab \t cd \n ef"_string); + } +} + TEST_CASE(contains) { EXPECT(!String {}.contains({})); diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index 6bb304dcedb..112eaf1e641 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -231,6 +231,11 @@ bool code_point_has_variation_selector_property(u32 code_point) return code_point_has_property(code_point, UCHAR_VARIATION_SELECTOR); } +/* Returns whether code_point has the UCHAR_WHITE_SPACE property, as reported by code_point_has_property. */ bool code_point_has_white_space_property(u32 code_point) +{ + return code_point_has_property(code_point, UCHAR_WHITE_SPACE); +} + // https://tc39.es/ecma262/#table-binary-unicode-properties bool is_ecma262_property(Property property) { diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h index c16ee908599..c2a98ebb2ff 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.h +++ b/Userland/Libraries/LibUnicode/CharacterTypes.h @@ -36,6 +36,7 @@ bool code_point_has_identifier_start_property(u32 code_point); bool 
code_point_has_identifier_continue_property(u32 code_point); bool code_point_has_regional_indicator_property(u32 code_point); bool code_point_has_variation_selector_property(u32 code_point); +bool code_point_has_white_space_property(u32 code_point); bool is_ecma262_property(Property); diff --git a/Userland/Libraries/LibUnicode/String.cpp b/Userland/Libraries/LibUnicode/String.cpp index c9810cd8d32..aadcef61dbc 100644 --- a/Userland/Libraries/LibUnicode/String.cpp +++ b/Userland/Libraries/LibUnicode/String.cpp @@ -1,11 +1,12 @@ /* - * Copyright (c) 2023-2024, Tim Flynn + * Copyright (c) 2023-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ #include #include +#include #include #include @@ -155,4 +156,36 @@ Optional String::find_byte_offset_ignoring_case(StringView needle, size_ return {}; } +/* Trims code points for which Unicode::code_point_has_white_space_property is true from the start and/or end of this string. TrimMode::Left trims only the start, TrimMode::Right trims only the end, and any other mode trims both; whitespace between non-whitespace code points is kept. Returns an empty String when nothing remains, this string itself when nothing is trimmed, and otherwise the result of substring_from_byte_offset_with_shared_superstring over the kept byte range. */ ErrorOr String::trim_whitespace(TrimMode mode) const +{ + auto code_points = this->code_points(); + + Optional start; /* Byte offset of the first non-whitespace code point; stays empty if none is found. */ + size_t length = 0; /* Despite the name, this is the end byte offset (exclusive) of the last non-whitespace code point seen so far. */ + + for (auto it = code_points.begin(); it != code_points.end(); ++it) { + if (Unicode::code_point_has_white_space_property(*it)) + continue; + + auto offset = code_points.byte_offset_of(it); + + if (!start.has_value()) + start = offset; + + length = offset + it.underlying_code_point_length_in_bytes(); + } + + if (mode == TrimMode::Right) /* Leading whitespace is kept, so the result starts at byte 0. */ + start = 0; + if (mode == TrimMode::Left) /* Trailing whitespace is kept, so the result runs to the end of the string. */ + length = bytes_as_string_view().length(); + + if (!start.has_value() || start == length) /* No bytes are left after trimming. */ + return String {}; + if (start == 0uz && length == bytes_as_string_view().length()) /* The kept range is the whole string, so no substring is needed. */ + return *this; + + return substring_from_byte_offset_with_shared_superstring(*start, length - *start); +} + }