AK+LibUnicode: Add a method to trim non-ASCII whitespace from a String

Required by WebDriver.
This commit is contained in:
Timothy Flynn 2024-11-03 17:13:56 -05:00 committed by Tim Flynn
parent 969ee0f3e0
commit cfcb29bdfd
Notes: github-actions[bot] 2024-11-04 01:43:52 +00:00
5 changed files with 118 additions and 2 deletions

View file

@ -1,6 +1,6 @@
/* /*
* Copyright (c) 2018-2022, Andreas Kling <andreas@ladybird.org> * Copyright (c) 2018-2022, Andreas Kling <andreas@ladybird.org>
* Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org> * Copyright (c) 2023-2024, Tim Flynn <trflynn89@ladybird.org>
* *
* SPDX-License-Identifier: BSD-2-Clause * SPDX-License-Identifier: BSD-2-Clause
*/ */
@ -144,6 +144,7 @@ public:
ErrorOr<String> trim(Utf8View const& code_points_to_trim, TrimMode mode = TrimMode::Both) const; ErrorOr<String> trim(Utf8View const& code_points_to_trim, TrimMode mode = TrimMode::Both) const;
ErrorOr<String> trim(StringView code_points_to_trim, TrimMode mode = TrimMode::Both) const; ErrorOr<String> trim(StringView code_points_to_trim, TrimMode mode = TrimMode::Both) const;
// Trims code points that have the Unicode White_Space property (ASCII and non-ASCII alike) from the side(s) selected by `mode`.
ErrorOr<String> trim_whitespace(TrimMode mode = TrimMode::Both) const;
ErrorOr<String> trim_ascii_whitespace(TrimMode mode = TrimMode::Both) const; ErrorOr<String> trim_ascii_whitespace(TrimMode mode = TrimMode::Both) const;
ErrorOr<Vector<String>> split_limit(u32 separator, size_t limit, SplitBehavior = SplitBehavior::Nothing) const; ErrorOr<Vector<String>> split_limit(u32 separator, size_t limit, SplitBehavior = SplitBehavior::Nothing) const;

View file

@ -1289,6 +1289,82 @@ TEST_CASE(trim)
} }
} }
// Exercises String::trim_whitespace() for every TrimMode against empty input, whitespace-only input
// (ASCII, non-ASCII and mixed), and text with leading, trailing and interior whitespace.
TEST_CASE(trim_whitespace)
{
    // Empty string: nothing to trim.
    {
        String string {};
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), String {});
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), String {});
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), String {});
    }
    // A single ASCII space.
    {
        auto string = " "_string;
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), String {});
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), String {});
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), String {});
    }
    // A run of several ASCII spaces.
    {
        auto string = "   "_string;
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), String {});
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), String {});
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), String {});
    }
    // Only multi-byte (non-ASCII) whitespace, so the string neither starts nor ends on an ASCII byte.
    {
        auto string = "\u00A0\u202F"_string;
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), String {});
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), String {});
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), String {});
    }
    // A mix of ASCII and non-ASCII whitespace and nothing else.
    {
        auto string = " \t \n \r \u00A0 \u202F "_string;
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), String {});
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), String {});
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), String {});
    }
    // No whitespace at all: the string comes back unchanged.
    {
        auto string = "abcdef"_string;
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), "abcdef"_string);
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), "abcdef"_string);
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), "abcdef"_string);
    }
    // Leading whitespace only.
    {
        auto string = " \u00A0 abcdef"_string;
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), "abcdef"_string);
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), "abcdef"_string);
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), " \u00A0 abcdef"_string);
    }
    // Trailing whitespace only.
    {
        auto string = "abcdef \u202F "_string;
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), "abcdef"_string);
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), "abcdef \u202F "_string);
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), "abcdef"_string);
    }
    // Whitespace on both ends.
    {
        auto string = " \u00A0 abcdef \u202F "_string;
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), "abcdef"_string);
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), "abcdef \u202F "_string);
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), " \u00A0 abcdef"_string);
    }
    // Interior whitespace only: it must be preserved.
    {
        auto string = "ab \t cd \n ef"_string;
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), "ab \t cd \n ef"_string);
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), "ab \t cd \n ef"_string);
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), "ab \t cd \n ef"_string);
    }
    // Interior whitespace plus leading whitespace.
    {
        auto string = " \u00A0 ab \t cd \n ef"_string;
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), "ab \t cd \n ef"_string);
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), "ab \t cd \n ef"_string);
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), " \u00A0 ab \t cd \n ef"_string);
    }
    // Interior whitespace plus trailing whitespace.
    {
        auto string = "ab \t cd \n ef \u202F "_string;
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), "ab \t cd \n ef"_string);
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), "ab \t cd \n ef \u202F "_string);
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), "ab \t cd \n ef"_string);
    }
    // Interior whitespace plus whitespace on both ends.
    {
        auto string = " \u00A0 ab \t cd \n ef \u202F "_string;
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Both)), "ab \t cd \n ef"_string);
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Left)), "ab \t cd \n ef \u202F "_string);
        EXPECT_EQ(MUST(string.trim_whitespace(TrimMode::Right)), " \u00A0 ab \t cd \n ef"_string);
    }
}
TEST_CASE(contains) TEST_CASE(contains)
{ {
EXPECT(!String {}.contains({})); EXPECT(!String {}.contains({}));

View file

@ -231,6 +231,11 @@ bool code_point_has_variation_selector_property(u32 code_point)
return code_point_has_property(code_point, UCHAR_VARIATION_SELECTOR); return code_point_has_property(code_point, UCHAR_VARIATION_SELECTOR);
} }
// Reports whether `code_point` has the binary property identified by UCHAR_WHITE_SPACE.
bool code_point_has_white_space_property(u32 code_point)
{
    auto const is_white_space = code_point_has_property(code_point, UCHAR_WHITE_SPACE);
    return is_white_space;
}
// https://tc39.es/ecma262/#table-binary-unicode-properties // https://tc39.es/ecma262/#table-binary-unicode-properties
bool is_ecma262_property(Property property) bool is_ecma262_property(Property property)
{ {

View file

@ -36,6 +36,7 @@ bool code_point_has_identifier_start_property(u32 code_point);
bool code_point_has_identifier_continue_property(u32 code_point); bool code_point_has_identifier_continue_property(u32 code_point);
bool code_point_has_regional_indicator_property(u32 code_point); bool code_point_has_regional_indicator_property(u32 code_point);
bool code_point_has_variation_selector_property(u32 code_point); bool code_point_has_variation_selector_property(u32 code_point);
// Returns whether `code_point` has the Unicode White_Space binary property.
bool code_point_has_white_space_property(u32 code_point);
bool is_ecma262_property(Property); bool is_ecma262_property(Property);

View file

@ -1,11 +1,12 @@
/* /*
* Copyright (c) 2023-2024, Tim Flynn <trflynn89@serenityos.org> * Copyright (c) 2023-2024, Tim Flynn <trflynn89@ladybird.org>
* *
* SPDX-License-Identifier: BSD-2-Clause * SPDX-License-Identifier: BSD-2-Clause
*/ */
#include <AK/String.h> #include <AK/String.h>
#include <AK/StringBuilder.h> #include <AK/StringBuilder.h>
#include <LibUnicode/CharacterTypes.h>
#include <LibUnicode/ICU.h> #include <LibUnicode/ICU.h>
#include <unicode/bytestream.h> #include <unicode/bytestream.h>
@ -155,4 +156,36 @@ Optional<size_t> String::find_byte_offset_ignoring_case(StringView needle, size_
return {}; return {};
} }
// Strips code points that have the Unicode White_Space property from the start and/or end of this
// string, depending on `mode`. Whitespace between non-whitespace code points is never removed.
// Returns an empty String when nothing remains, and a copy of this string when nothing was trimmed.
ErrorOr<String> String::trim_whitespace(TrimMode mode) const
{
    auto view = code_points();
    auto const total_byte_length = bytes_as_string_view().length();

    // Byte offset of the first non-whitespace code point (unset while none has been seen), and the
    // byte offset one past the end of the last non-whitespace code point.
    Optional<size_t> first_kept_offset;
    size_t kept_end_offset = 0;

    for (auto iterator = view.begin(); iterator != view.end(); ++iterator) {
        if (!Unicode::code_point_has_white_space_property(*iterator)) {
            auto const byte_offset = view.byte_offset_of(iterator);

            if (!first_kept_offset.has_value())
                first_kept_offset = byte_offset;

            kept_end_offset = byte_offset + iterator.underlying_code_point_length_in_bytes();
        }
    }

    // Widen the kept range back out on whichever side the caller asked us to leave alone.
    if (mode == TrimMode::Right)
        first_kept_offset = 0;
    if (mode == TrimMode::Left)
        kept_end_offset = total_byte_length;

    if (!first_kept_offset.has_value())
        return String {};

    auto const kept_start_offset = *first_kept_offset;

    if (kept_start_offset == kept_end_offset)
        return String {};

    // Nothing was trimmed, so hand back this string as-is.
    if (kept_start_offset == 0 && kept_end_offset == total_byte_length)
        return *this;

    return substring_from_byte_offset_with_shared_superstring(kept_start_offset, kept_end_offset - kept_start_offset);
}
} }