AK: Use simdutf when appending UTF-16 to StringBuilder

Adds a fast path for valid UTF-16 using `simdutf`, and falls back to a slow path when unmatched surrogates are encountered.
Author: https://github.com/yyny Commit: https://github.com/LadybirdBrowser/ladybird/commit/04920d06f0c Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/1973
2024-10-26 11:05:31 +02:00 · 2024-10-26 11:05:31 +02:00 · 04920d06f0 · 2024-10-30 09:29:15 +00:00
commit 04920d06f0
parent ff6020c207
4 changed files with 120 additions and 17 deletions
--- a/AK/ByteBuffer.h
+++ b/AK/ByteBuffer.h
@ -202,6 +202,17 @@ public:
        MUST(try_ensure_capacity(new_capacity));
    }
    // Sets the logical size of the buffer to `new_size` without touching the allocation.
    // `new_size` must not exceed capacity() (checked with ASSERT only).
    // When `zero_fill_new_elements` is Yes and the buffer grows, the bytes between the
    // old size and `new_size` are cleared; otherwise they are left as they are.
    void set_size(size_t new_size, ZeroFillNewElements zero_fill_new_elements = ZeroFillNewElements::No)
    {
        ASSERT(new_size <= capacity());
        // Only zero-fill when actually growing: for new_size < m_size the unsigned
        // subtraction below would wrap around and memset far past the buffer.
        if (zero_fill_new_elements == ZeroFillNewElements::Yes && new_size > m_size)
            __builtin_memset(data() + m_size, 0, new_size - m_size);
        m_size = new_size;
    }
    ErrorOr<void> try_resize(size_t new_size, ZeroFillNewElements zero_fill_new_elements = ZeroFillNewElements::No)
    {
        if (new_size <= m_size) {
@ -210,11 +221,8 @@ public:
        }
        TRY(try_ensure_capacity(new_size));
-        if (zero_fill_new_elements == ZeroFillNewElements::Yes) {
+        set_size(new_size, zero_fill_new_elements);
            __builtin_memset(data() + m_size, 0, new_size - m_size);
        }
        m_size = new_size;
        return {};
    }
--- a/AK/StringBuilder.cpp
+++ b/AK/StringBuilder.cpp
@ -17,6 +17,8 @@
 #include <AK/Utf16View.h>
 #include <AK/Utf32View.h>
 #include <simdutf.h>
 namespace AK {
 // Size in bytes of Detail::StringData.
 // NOTE(review): presumably the header that precedes the character data in a String allocation - confirm against its uses.
 static constexpr auto STRING_BASE_PREFIX_SIZE = sizeof(Detail::StringData);
@ -224,24 +226,69 @@ void StringBuilder::append_code_point(u32 code_point)
 // Appends `utf16_view` to the builder, transcoded to UTF-8.
 // The new code reserves an upper bound of the UTF-8 length up front, then repeatedly lets
 // simdutf convert as much as it can directly into the buffer's spare capacity. When simdutf
 // stops at an unmatched surrogate, that code unit (and any low surrogates directly after it)
 // is encoded by hand as a 3-byte sequence and conversion resumes on the rest of the view.
 // Returns an error only if reserving capacity via will_append() fails.
 ErrorOr<void> StringBuilder::try_append(Utf16View const& utf16_view)
 {
-    // NOTE: This may under-allocate in the presence of surrogate pairs.
+    if (utf16_view.is_empty())
-    //       That's okay, appending will still grow the buffer as needed.
+        return {};
    // NOTE(review): the next line carries no +/- marker in this rendering; it looks like the
    // pre-change reservation that belongs with the removed NOTE above - confirm against the commit.
    TRY(will_append(utf16_view.length_in_code_units()));
-    for (size_t i = 0; i < utf16_view.length_in_code_units();) {
+    auto maximum_utf8_length = UnicodeUtils::maximum_utf8_length_from_utf16(utf16_view.span());
-        // OPTIMIZATION: Fast path for ASCII characters.
+
-        auto code_unit = utf16_view.data()[i];
+    // Possibly over-allocate a little to ensure we don't have to allocate later.
-        if (code_unit <= 0x7f) {
+    TRY(will_append(maximum_utf8_length));
-            append(static_cast<char>(code_unit));
+
-            ++i;
+    Utf16View remaining_view = utf16_view;
-            continue;
+    for (;;) {
        // Output goes straight into the reserved-but-unused tail of the buffer; the size is
        // only bumped (via set_size) once we know how many bytes were produced.
        auto uninitialized_data_pointer = static_cast<char*>(m_buffer.end_pointer());
        // Fast path.
        auto result = [&]() {
            switch (remaining_view.endianness()) {
            case Endianness::Host:
                return simdutf::convert_utf16_to_utf8_with_errors(remaining_view.char_data(), remaining_view.length_in_code_units(), uninitialized_data_pointer);
            case Endianness::Big:
                return simdutf::convert_utf16be_to_utf8_with_errors(remaining_view.char_data(), remaining_view.length_in_code_units(), uninitialized_data_pointer);
            case Endianness::Little:
                return simdutf::convert_utf16le_to_utf8_with_errors(remaining_view.char_data(), remaining_view.length_in_code_units(), uninitialized_data_pointer);
            }
            VERIFY_NOT_REACHED();
        }();
        if (result.error == simdutf::SUCCESS) {
            // On success, `count` is used as the number of UTF-8 bytes produced.
            auto bytes_just_written = result.count;
            m_buffer.set_size(m_buffer.size() + bytes_just_written);
            break;
        }
-        auto code_point = utf16_view.code_point_at(i);
+        // Slow path. Found unmatched surrogate code unit.
-        TRY(try_append_code_point(code_point));
+        auto first_invalid_code_unit = result.count;
        ASSERT(first_invalid_code_unit < remaining_view.length_in_code_units());
-        i += (code_point > 0xffff ? 2 : 1);
+        // Unfortunately, `simdutf` does not tell us how many bytes it just wrote in case of an error, so we have to calculate it ourselves.
        auto bytes_just_written = [&]() {
            switch (remaining_view.endianness()) {
            case Endianness::Host:
                return simdutf::utf8_length_from_utf16(remaining_view.char_data(), first_invalid_code_unit);
            case Endianness::Big:
                return simdutf::utf8_length_from_utf16be(remaining_view.char_data(), first_invalid_code_unit);
            case Endianness::Little:
                return simdutf::utf8_length_from_utf16le(remaining_view.char_data(), first_invalid_code_unit);
            }
            VERIFY_NOT_REACHED();
        }();
        do {
            auto code_unit = remaining_view.code_unit_at(first_invalid_code_unit++);
            // Invalid surrogate code units are U+D800 - U+DFFF, so they are always encoded using 3 bytes.
            ASSERT(code_unit >= 0xD800 && code_unit <= 0xDFFF);
            // Writing 3 bytes at this offset needs offset + 3 <= capacity; a strict '<' would
            // trip spuriously when the reservation is used up exactly.
            ASSERT(m_buffer.size() + bytes_just_written + 3 <= m_buffer.capacity());
            uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 12) & 0x0f) | 0xe0);
            uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 6) & 0x3f) | 0x80);
            uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 0) & 0x3f) | 0x80);
            // Keep consuming directly following low surrogates: they cannot start a valid pair.
            // Read through code_unit_at() exactly like the loop body does, so the check and the
            // encode see the same value (presumably code_unit_at() accounts for the view's
            // endianness, which a raw data()[] read would not - confirm).
        } while (first_invalid_code_unit < remaining_view.length_in_code_units() && Utf16View::is_low_surrogate(remaining_view.code_unit_at(first_invalid_code_unit)));
        // Code unit might no longer be invalid, retry on the remaining data.
        m_buffer.set_size(m_buffer.size() + bytes_just_written);
        remaining_view = remaining_view.substring_view(first_invalid_code_unit);
    }
    return {};
 }
--- a/AK/UnicodeUtils.h
+++ b/AK/UnicodeUtils.h
@ -78,4 +78,50 @@ template<FallibleFunction<char> Callback>
    return -1;
 }
 /**
 * Returns an upper bound on the number of UTF-8 bytes needed to encode the given UTF-16 code units,
 * where unmatched surrogates are encoded as 3-byte sequences.
 * The bound is exact except for valid surrogate pairs: each pair is counted as 6 bytes although it
 * only needs 4, so at most one third of the returned value is slack.
 */
 [[nodiscard]] static inline size_t maximum_utf8_length_from_utf16(ReadonlySpan<u16> code_units)
 {
    // Unicode code point -> UTF-8 bytes needed:
    //   U+0000  - U+007F   => 1
    //   U+0080  - U+07FF   => 2
    //   U+0800  - U+FFFF   => 3
    //   U+10000 - U+10FFFF => 4
    //
    // UTF-16 code unit -> bytes charged here:
    //   0x0000 - 0x007f => 1
    //   0x0080 - 0x07ff => 2
    //   0x0800 - 0xd7ff => 3
    //   0xd800 - 0xdbff => 3 (high surrogate)
    //   0xdc00 - 0xdfff => 3 (low surrogate)
    //   0xe000 - 0xffff => 3
    //
    // Every surrogate is charged 3 bytes on its own. An unmatched surrogate needs exactly those
    // 3 bytes. A valid high/low pair is charged 3 + 3 = 6, which covers the 4 bytes the combined
    // code point (U+10000 - U+10FFFF) actually needs.
    size_t byte_budget = 0;
    // NOTE: The per-unit cost is computed without branches; the loop is meant to be easy to vectorize.
    for (auto unit : code_units)
        byte_budget += 1u + (unit >= 0x0080) + (unit >= 0x0800);
    return byte_budget;
 }
 }
--- a/AK/Utf16View.h
+++ b/AK/Utf16View.h
@ -109,6 +109,8 @@ public:
    // Pointer to the first code unit of the underlying span, as stored.
    u16 const* data() const { return m_code_units.data(); }
    // Same pointer as data(), reinterpreted as char16_t for APIs that take char16_t input.
    char16_t const* char_data() const { return reinterpret_cast<char16_t const*>(data()); }
    // The underlying code units as a read-only span.
    ReadonlySpan<u16> span() const { return m_code_units; }
    // Defined out of line.
    // NOTE(review): presumably returns the code unit / decoded code point starting at `index` - confirm in the implementation file.
    u16 code_unit_at(size_t index) const;
    u32 code_point_at(size_t index) const;