From 3c039903fbb73134548bb8ff057cf1e361f88b00 Mon Sep 17 00:00:00 2001
From: Andreas Kling
Date: Fri, 29 Dec 2023 15:30:15 +0100
Subject: [PATCH] LibTextCodec+AK: Don't validate UTF-8 strings twice

UTF8Decoder was already converting invalid data into replacement
characters while converting, so we know for sure we have valid UTF-8
by the time conversion is finished.

This patch adds a new StringBuilder::to_string_without_validation()
and uses it to make UTF8Decoder avoid half the work it was doing.
---
 AK/String.cpp                               | 13 +++++++++++++
 AK/String.h                                 |  2 ++
 AK/StringBuilder.cpp                        |  5 +++++
 AK/StringBuilder.h                          |  1 +
 Userland/Libraries/LibTextCodec/Decoder.cpp |  2 +-
 5 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/AK/String.cpp b/AK/String.cpp
index 2f4aaac8b73..34608280232 100644
--- a/AK/String.cpp
+++ b/AK/String.cpp
@@ -233,6 +233,19 @@ void String::destroy_string()
         m_data->unref();
 }
 
+String String::from_utf8_without_validation(ReadonlyBytes bytes)
+{
+    if (bytes.size() <= MAX_SHORT_STRING_BYTE_COUNT) {
+        ShortString short_string;
+        if (!bytes.is_empty())
+            memcpy(short_string.storage, bytes.data(), bytes.size());
+        short_string.byte_count_and_short_string_flag = (bytes.size() << 1) | SHORT_STRING_FLAG;
+        return String { short_string };
+    }
+    auto data = MUST(Detail::StringData::from_utf8(reinterpret_cast<char const*>(bytes.data()), bytes.size()));
+    return String { move(data) };
+}
+
 ErrorOr<String> String::from_utf8(StringView view)
 {
     if (!Utf8View { view }.validate())
diff --git a/AK/String.h b/AK/String.h
index a608f772bbc..756a0e620be 100644
--- a/AK/String.h
+++ b/AK/String.h
@@ -70,6 +70,8 @@ public:
     requires(IsOneOf<RemoveCVReference<T>, ByteString, DeprecatedFlyString, FlyString, String>)
     static ErrorOr<String> from_utf8(T&&) = delete;
 
+    [[nodiscard]] static String from_utf8_without_validation(ReadonlyBytes);
+
     // Creates a new String by reading byte_count bytes from a UTF-8 encoded Stream.
     static ErrorOr<String> from_stream(Stream&, size_t byte_count);
 
diff --git a/AK/StringBuilder.cpp b/AK/StringBuilder.cpp
index 1db8dea923c..60e0d3d332e 100644
--- a/AK/StringBuilder.cpp
+++ b/AK/StringBuilder.cpp
@@ -156,6 +156,11 @@ ErrorOr<String> StringBuilder::to_string() const
     return String::from_utf8(string_view());
 }
 
+String StringBuilder::to_string_without_validation() const
+{
+    return String::from_utf8_without_validation(string_view().bytes());
+}
+
 ErrorOr<FlyString> StringBuilder::to_fly_string() const
 {
     return FlyString::from_utf8(string_view());
diff --git a/AK/StringBuilder.h b/AK/StringBuilder.h
index 8d2e09a48fa..f710503a386 100644
--- a/AK/StringBuilder.h
+++ b/AK/StringBuilder.h
@@ -73,6 +73,7 @@ public:
     [[nodiscard]] ByteString to_byte_string() const;
 #endif
 
+    [[nodiscard]] String to_string_without_validation() const;
     ErrorOr<String> to_string() const;
     ErrorOr<FlyString> to_fly_string() const;
 
diff --git a/Userland/Libraries/LibTextCodec/Decoder.cpp b/Userland/Libraries/LibTextCodec/Decoder.cpp
index 995f0a06481..9b121b1f545 100644
--- a/Userland/Libraries/LibTextCodec/Decoder.cpp
+++ b/Userland/Libraries/LibTextCodec/Decoder.cpp
@@ -247,7 +247,7 @@ ErrorOr<String> Decoder::to_utf8(StringView input)
 {
     StringBuilder builder(input.length());
     TRY(process(input, [&builder](u32 c) { return builder.try_append_code_point(c); }));
-    return builder.to_string();
+    return builder.to_string_without_validation();
 }
 
 ErrorOr<void> UTF8Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)