diff --git a/AK/ByteBuffer.h b/AK/ByteBuffer.h
index d7aa530fc57..3bf92f6374b 100644
--- a/AK/ByteBuffer.h
+++ b/AK/ByteBuffer.h
@@ -202,6 +202,17 @@ public:
         MUST(try_ensure_capacity(new_capacity));
     }
 
+    void set_size(size_t new_size, ZeroFillNewElements zero_fill_new_elements = ZeroFillNewElements::No)
+    {
+        VERIFY(new_size <= capacity());
+
+        if (zero_fill_new_elements == ZeroFillNewElements::Yes) {
+            __builtin_memset(data() + m_size, 0, new_size - m_size);
+        }
+
+        m_size = new_size;
+    }
+
     ErrorOr<void> try_resize(size_t new_size, ZeroFillNewElements zero_fill_new_elements = ZeroFillNewElements::No)
     {
         if (new_size <= m_size) {
@@ -210,11 +221,8 @@ public:
         }
 
         TRY(try_ensure_capacity(new_size));
-        if (zero_fill_new_elements == ZeroFillNewElements::Yes) {
-            __builtin_memset(data() + m_size, 0, new_size - m_size);
-        }
+        set_size(new_size, zero_fill_new_elements);
 
-        m_size = new_size;
         return {};
     }
 
diff --git a/AK/StringBuilder.cpp b/AK/StringBuilder.cpp
index e259269964b..28d1c3126d0 100644
--- a/AK/StringBuilder.cpp
+++ b/AK/StringBuilder.cpp
@@ -17,6 +17,8 @@
 #include
 #include
 
+#include <simdutf.h>
+
 namespace AK {
 
 static constexpr auto STRING_BASE_PREFIX_SIZE = sizeof(Detail::StringData);
@@ -224,24 +226,69 @@ void StringBuilder::append_code_point(u32 code_point)
 ErrorOr<void> StringBuilder::try_append(Utf16View const& utf16_view)
 {
-    // NOTE: This may under-allocate in the presence of surrogate pairs.
-    // That's okay, appending will still grow the buffer as needed.
-    TRY(will_append(utf16_view.length_in_code_units()));
+    if (utf16_view.is_empty())
+        return {};
 
-    for (size_t i = 0; i < utf16_view.length_in_code_units();) {
-        // OPTIMIZATION: Fast path for ASCII characters.
-        auto code_unit = utf16_view.data()[i];
-        if (code_unit <= 0x7f) {
-            append(static_cast<char>(code_unit));
-            ++i;
-            continue;
+    auto maximum_utf8_length = UnicodeUtils::maximum_utf8_length_from_utf16(utf16_view.span());
+
+    // Possibly over-allocate a little to ensure we don't have to reallocate later.
+    TRY(will_append(maximum_utf8_length));
+
+    Utf16View remaining_view = utf16_view;
+    for (;;) {
+        auto uninitialized_data_pointer = static_cast<char*>(m_buffer.end_pointer());
+
+        // Fast path: let simdutf convert as much as it can in one go.
+        auto result = [&]() {
+            switch (remaining_view.endianness()) {
+            case Endianness::Host:
+                return simdutf::convert_utf16_to_utf8_with_errors(remaining_view.char_data(), remaining_view.length_in_code_units(), uninitialized_data_pointer);
+            case Endianness::Big:
+                return simdutf::convert_utf16be_to_utf8_with_errors(remaining_view.char_data(), remaining_view.length_in_code_units(), uninitialized_data_pointer);
+            case Endianness::Little:
+                return simdutf::convert_utf16le_to_utf8_with_errors(remaining_view.char_data(), remaining_view.length_in_code_units(), uninitialized_data_pointer);
+            }
+            VERIFY_NOT_REACHED();
+        }();
+        if (result.error == simdutf::SUCCESS) {
+            auto bytes_just_written = result.count;
+            m_buffer.set_size(m_buffer.size() + bytes_just_written);
+            break;
         }
 
-        auto code_point = utf16_view.code_point_at(i);
-        TRY(try_append_code_point(code_point));
+        // Slow path: simdutf found an unmatched surrogate code unit.
+        auto first_invalid_code_unit = result.count;
+        VERIFY(first_invalid_code_unit < remaining_view.length_in_code_units());
 
-        i += (code_point > 0xffff ? 2 : 1);
+        // Unfortunately, simdutf does not tell us how many bytes it wrote before hitting the error, so we have to calculate that ourselves.
+        auto bytes_just_written = [&]() {
+            switch (remaining_view.endianness()) {
+            case Endianness::Host:
+                return simdutf::utf8_length_from_utf16(remaining_view.char_data(), first_invalid_code_unit);
+            case Endianness::Big:
+                return simdutf::utf8_length_from_utf16be(remaining_view.char_data(), first_invalid_code_unit);
+            case Endianness::Little:
+                return simdutf::utf8_length_from_utf16le(remaining_view.char_data(), first_invalid_code_unit);
+            }
+            VERIFY_NOT_REACHED();
+        }();
+
+        do {
+            auto code_unit = remaining_view.code_unit_at(first_invalid_code_unit++);
+
+            // Unmatched surrogate code units are in the range U+D800 - U+DFFF, so they are always encoded using 3 bytes.
+            VERIFY(code_unit >= 0xD800 && code_unit <= 0xDFFF);
+            VERIFY(m_buffer.size() + bytes_just_written + 3 <= m_buffer.capacity());
+            uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 12) & 0x0f) | 0xe0);
+            uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 6) & 0x3f) | 0x80);
+            uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 0) & 0x3f) | 0x80);
+        } while (first_invalid_code_unit < remaining_view.length_in_code_units() && Utf16View::is_low_surrogate(remaining_view.data()[first_invalid_code_unit]));
+
+        // The next code unit might be valid again, so retry the fast path on the remaining data.
+        m_buffer.set_size(m_buffer.size() + bytes_just_written);
+        remaining_view = remaining_view.substring_view(first_invalid_code_unit);
     }
+
     return {};
 }
 
diff --git a/AK/UnicodeUtils.h b/AK/UnicodeUtils.h
index 44045507209..7c57a262cd4 100644
--- a/AK/UnicodeUtils.h
+++ b/AK/UnicodeUtils.h
@@ -78,4 +78,50 @@ template Callback>
     return -1;
 }
 
+/**
+ * Compute the maximum number of UTF-8 bytes needed to store a given UTF-16 string, accounting for unmatched UTF-16 surrogates.
+ * This function may overcount by 2 bytes for every valid surrogate pair (code points U+10000 - U+10FFFF), since a pair is counted as 6 UTF-8 bytes but only needs 4; the estimate is never more than 1.5x the exact length.
+ */
+[[nodiscard]] static inline size_t maximum_utf8_length_from_utf16(ReadonlySpan<u16> code_units)
+{
+    // # Unicode code point -> no. of UTF-8 bytes needed
+    // U+0000   - U+007F   => 1 UTF-8 byte
+    // U+0080   - U+07FF   => 2 UTF-8 bytes
+    // U+0800   - U+FFFF   => 3 UTF-8 bytes
+    // U+010000 - U+10FFFF => 4 UTF-8 bytes
+
+    // # UTF-16 code unit -> no. of UTF-8 bytes needed
+    // 0x0000 - 0x007f [U+000000 - U+00007F] = 1 UTF-8 byte
+    // 0x0080 - 0x07ff [U+000080 - U+0007FF] = 2 UTF-8 bytes
+    // 0x0800 - 0xd7ff [U+000800 - U+00D7FF] = 3 UTF-8 bytes
+    // 0xd800 - 0xdbff [U+010000 - U+10FFFF] = 4 UTF-8 bytes to encode a valid surrogate pair,
+    //                                         or 3 UTF-8 bytes to encode the unmatched surrogate code unit.
+    // 0xdc00 - 0xdfff [U+010000 - U+10FFFF] = 0 UTF-8 bytes to encode a valid surrogate pair (the pair is already accounted for by its leading 0xd800 - 0xdbff code unit),
+    //                                         or 3 UTF-8 bytes to encode the unmatched surrogate code unit.
+    // 0xe000 - 0xffff [U+00E000 - U+00FFFF] = 3 UTF-8 bytes
+
+    // # UTF-16 code unit -> length actually added
+    // 0x0000 - 0x007f = 1
+    // 0x0080 - 0x07ff = 2
+    // 0x0800 - 0xd7ff = 3
+    // 0xd800 - 0xdbff = 3
+    //   ^ If the next code unit is 0xdc00 - 0xdfff, the two sum to 6, which covers the 4 bytes the pair requires.
+    //     Otherwise, 3 bytes are needed to encode the unmatched U+D800 - U+DBFF code unit.
+    // 0xdc00 - 0xdfff = 3
+    //   ^ If the previous code unit was 0xd800 - 0xdbff, this ensures that the combined sum covers the 4 bytes the pair requires.
+    //     Otherwise, 3 bytes are needed to encode the unmatched U+DC00 - U+DFFF code unit.
+    // 0xe000 - 0xffff = 3
+
+    size_t maximum_utf8_length = 0;
+
+    // NOTE: This loop is designed to be easy to vectorize.
+    for (auto code_unit : code_units) {
+        maximum_utf8_length += 1;
+        maximum_utf8_length += code_unit > 0x007f;
+        maximum_utf8_length += code_unit > 0x07ff;
+    }
+
+    return maximum_utf8_length;
+}
+
 }
diff --git a/AK/Utf16View.h b/AK/Utf16View.h
index f3a4fa3e209..19ae89c5ff3 100644
--- a/AK/Utf16View.h
+++ b/AK/Utf16View.h
@@ -109,6 +109,8 @@ public:
     u16 const* data() const { return m_code_units.data(); }
    char16_t const* char_data() const { return reinterpret_cast<char16_t const*>(data()); }
 
+    ReadonlySpan<u16> span() const { return m_code_units; }
+
     u16 code_unit_at(size_t index) const;
     u32 code_point_at(size_t index) const;
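
A note on the new ByteBuffer::set_size(): it is what lets StringBuilder hand uninitialized buffer memory to simdutf and only afterwards publish how many bytes were actually written. The snippet below is an illustrative, standalone sketch of that usage pattern and is not part of the patch; the 16-byte reservation and the "hello" payload are invented for the example.

#include <AK/ByteBuffer.h>
#include <AK/Format.h>
#include <AK/Try.h>

int main()
{
    // Reserve capacity up front, write into the uninitialized tail through
    // data() + size(), then commit the written bytes with set_size().
    ByteBuffer buffer;
    MUST(buffer.try_ensure_capacity(16));

    char const* message = "hello";
    __builtin_memcpy(buffer.data() + buffer.size(), message, 5);

    // set_size() only checks that the new size fits within the existing capacity;
    // it never reallocates, so pointers into the buffer stay valid while writing.
    buffer.set_size(buffer.size() + 5);

    dbgln("buffer now holds {} bytes", buffer.size());
    return 0;
}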
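
On the StringBuilder slow path: each unmatched surrogate code unit is written with the generalized 3-byte UTF-8 pattern (1110xxxx 10xxxxxx 10xxxxxx), which is what WTF-8 does for lone surrogates. Below is a standalone sketch of that bit manipulation, separate from the patch, using plain standard C++ types.

#include <cstdint>
#include <cstdio>

// Encode a lone UTF-16 surrogate code unit (U+D800 - U+DFFF) as three bytes,
// using the same shifts and masks as the patch's slow path.
static void encode_surrogate_as_three_bytes(uint16_t code_unit, uint8_t out[3])
{
    out[0] = ((code_unit >> 12) & 0x0f) | 0xe0; // 1110xxxx
    out[1] = ((code_unit >> 6) & 0x3f) | 0x80;  // 10xxxxxx
    out[2] = ((code_unit >> 0) & 0x3f) | 0x80;  // 10xxxxxx
}

int main()
{
    uint8_t bytes[3];
    encode_surrogate_as_three_bytes(0xD800, bytes);

    // Prints "ed a0 80": a lone surrogate always costs exactly 3 bytes,
    // which is why the length estimate counts 3 per surrogate code unit.
    std::printf("%02x %02x %02x\n", bytes[0], bytes[1], bytes[2]);
    return 0;
}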
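
The counting rule in maximum_utf8_length_from_utf16() can be sanity-checked in isolation: 1 byte per code unit, plus 1 for anything above 0x7f, plus 1 for anything above 0x7ff. The following standalone sketch mirrors that rule with std types instead of AK's ReadonlySpan<u16>; the sample string and the helper name are invented for illustration.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Mirror of the per-code-unit counting rule from maximum_utf8_length_from_utf16().
static std::size_t estimated_utf8_length(std::vector<uint16_t> const& code_units)
{
    std::size_t length = 0;
    for (auto code_unit : code_units) {
        length += 1;                  // every code unit needs at least 1 byte
        length += code_unit > 0x007f; // 2 bytes above U+007F
        length += code_unit > 0x07ff; // 3 bytes above U+07FF (including surrogates)
    }
    return length;
}

int main()
{
    // "a", "é", "€", then U+1F600 (the surrogate pair 0xD83D 0xDE00).
    std::vector<uint16_t> code_units { 0x0061, 0x00E9, 0x20AC, 0xD83D, 0xDE00 };

    // Exact UTF-8 length is 1 + 2 + 3 + 4 = 10 bytes; the estimate is
    // 1 + 2 + 3 + 3 + 3 = 12 bytes, i.e. the valid pair is over-counted by 2.
    std::printf("estimated %zu bytes\n", estimated_utf8_length(code_units));
    return 0;
}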