From 0c14a9417ad9bad0a6457c3019ed3a448736aeed Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Tue, 16 Jul 2024 16:05:46 -0400 Subject: [PATCH] AK: Replace converting to and from UTF-16 with simdutf The one behavior difference is that we will now actually fail on invalid code units with Utf16View::to_utf8(AllowInvalidCodeUnits::No). It was arguably a bug that this wasn't already the case. --- AK/String.cpp | 29 ++++++++++++++++++ AK/String.h | 4 +++ AK/Utf16View.cpp | 67 +++++++++++++++++++++++++++++------------- Tests/AK/TestUtf16.cpp | 4 +-- 4 files changed, 81 insertions(+), 23 deletions(-) diff --git a/AK/String.cpp b/AK/String.cpp index acd794006d8..a7ea72d8ea8 100644 --- a/AK/String.cpp +++ b/AK/String.cpp @@ -4,6 +4,8 @@ * SPDX-License-Identifier: BSD-2-Clause */ +#define AK_DONT_REPLACE_STD + #include #include #include @@ -11,9 +13,12 @@ #include #include #include +#include #include #include +#include + namespace AK { String String::from_utf8_without_validation(ReadonlyBytes bytes) @@ -39,6 +44,30 @@ ErrorOr String::from_utf8(StringView view) return result; } +ErrorOr String::from_utf16(Utf16View const& utf16) +{ + if (!utf16.validate()) + return Error::from_string_literal("String::from_utf16: Input was not valid UTF-16"); + + String result; + + auto utf8_length = simdutf::utf8_length_from_utf16( + reinterpret_cast(utf16.data()), + utf16.length_in_code_units()); + + TRY(result.replace_with_new_string(utf8_length, [&](Bytes buffer) -> ErrorOr { + [[maybe_unused]] auto result = simdutf::convert_utf16_to_utf8( + reinterpret_cast(utf16.data()), + utf16.length_in_code_units(), + reinterpret_cast(buffer.data())); + ASSERT(result == buffer.size()); + + return {}; + })); + + return result; +} + ErrorOr String::from_stream(Stream& stream, size_t byte_count) { String result; diff --git a/AK/String.h b/AK/String.h index b4fe68f6d74..1294496e204 100644 --- a/AK/String.h +++ b/AK/String.h @@ -50,12 +50,16 @@ public: // Creates a new String from a sequence of UTF-8 encoded code points. static ErrorOr from_utf8(StringView); + template requires(IsOneOf, ByteString, DeprecatedFlyString, FlyString, String>) static ErrorOr from_utf8(T&&) = delete; [[nodiscard]] static String from_utf8_without_validation(ReadonlyBytes); + // Creates a new String from a sequence of UTF-16 encoded code points. + static ErrorOr from_utf16(Utf16View const&); + // Creates a new String by reading byte_count bytes from a UTF-8 encoded Stream. static ErrorOr from_stream(Stream&, size_t byte_count); diff --git a/AK/Utf16View.cpp b/AK/Utf16View.cpp index 2b3a534e24d..a6e569fbb45 100644 --- a/AK/Utf16View.cpp +++ b/AK/Utf16View.cpp @@ -26,7 +26,7 @@ static constexpr u32 replacement_code_point = 0xfffd; static constexpr u32 first_supplementary_plane_code_point = 0x10000; template UtfViewType> -static ErrorOr to_utf16_impl(UtfViewType const& view) +static ErrorOr to_utf16_slow(UtfViewType const& view) { Utf16Data utf16_data; TRY(utf16_data.try_ensure_capacity(view.length())); @@ -39,17 +39,45 @@ static ErrorOr to_utf16_impl(UtfViewType const& view) ErrorOr utf8_to_utf16(StringView utf8_view) { - return to_utf16_impl(Utf8View { utf8_view }); + return utf8_to_utf16(Utf8View { utf8_view }); } ErrorOr utf8_to_utf16(Utf8View const& utf8_view) { - return to_utf16_impl(utf8_view); + // All callers want to allow lonely surrogates, which simdutf does not permit. + if (!utf8_view.validate(Utf8View::AllowSurrogates::No)) [[unlikely]] + return to_utf16_slow(utf8_view); + + Utf16Data utf16_data; + + TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf8( + reinterpret_cast(utf8_view.bytes()), + utf8_view.byte_length()))); + + [[maybe_unused]] auto result = simdutf::convert_utf8_to_utf16( + reinterpret_cast(utf8_view.bytes()), + utf8_view.byte_length(), + reinterpret_cast(utf16_data.data())); + ASSERT(result == utf16_data.size()); + + return utf16_data; } ErrorOr utf32_to_utf16(Utf32View const& utf32_view) { - return to_utf16_impl(utf32_view); + Utf16Data utf16_data; + + TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf32( + reinterpret_cast(utf32_view.code_points()), + utf32_view.length()))); + + [[maybe_unused]] auto result = simdutf::convert_utf32_to_utf16( + reinterpret_cast(utf32_view.code_points()), + utf32_view.length(), + reinterpret_cast(utf16_data.data())); + ASSERT(result == utf16_data.size()); + + return utf16_data; } ErrorOr code_point_to_utf16(Utf16Data& string, u32 code_point) @@ -92,30 +120,27 @@ ErrorOr Utf16View::to_byte_string(AllowInvalidCodeUnits allow_invali ErrorOr Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_units) const { + if (allow_invalid_code_units == AllowInvalidCodeUnits::No) + return String::from_utf16(*this); + StringBuilder builder; - if (allow_invalid_code_units == AllowInvalidCodeUnits::Yes) { - for (auto const* ptr = begin_ptr(); ptr < end_ptr(); ++ptr) { - if (is_high_surrogate(*ptr)) { - auto const* next = ptr + 1; + for (auto const* ptr = begin_ptr(); ptr < end_ptr(); ++ptr) { + if (is_high_surrogate(*ptr)) { + auto const* next = ptr + 1; - if ((next < end_ptr()) && is_low_surrogate(*next)) { - auto code_point = decode_surrogate_pair(*ptr, *next); - TRY(builder.try_append_code_point(code_point)); - ++ptr; - continue; - } + if ((next < end_ptr()) && is_low_surrogate(*next)) { + auto code_point = decode_surrogate_pair(*ptr, *next); + TRY(builder.try_append_code_point(code_point)); + ++ptr; + continue; } - - TRY(builder.try_append_code_point(static_cast(*ptr))); } - return builder.to_string_without_validation(); + + TRY(builder.try_append_code_point(static_cast(*ptr))); } - for (auto code_point : *this) - TRY(builder.try_append_code_point(code_point)); - - return builder.to_string(); + return builder.to_string_without_validation(); } size_t Utf16View::length_in_code_points() const diff --git a/Tests/AK/TestUtf16.cpp b/Tests/AK/TestUtf16.cpp index fef4082818e..79aba4edeb7 100644 --- a/Tests/AK/TestUtf16.cpp +++ b/Tests/AK/TestUtf16.cpp @@ -63,7 +63,7 @@ TEST_CASE(encode_utf8) auto encoded = Array { (u16)0xd83d }; Utf16View view { encoded }; EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)), "\xed\xa0\xbd"sv); - EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::No)), "\ufffd"sv); + EXPECT(view.to_utf8(Utf16View::AllowInvalidCodeUnits::No).is_error()); } } @@ -307,7 +307,7 @@ TEST_CASE(substring_view) EXPECT(view.length_in_code_units() == 1); EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)), "\xed\xa0\xbd"sv); - EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::No)), "\ufffd"sv); + EXPECT(view.to_utf8(Utf16View::AllowInvalidCodeUnits::No).is_error()); } }