ladybird/AK/UnicodeUtils.h

/*
 * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#pragma once

#include <AK/Concepts.h>
#include <AK/Error.h>
#include <AK/Forward.h>

namespace AK::UnicodeUtils {

// Returns how many bytes the UTF-8 encoding of `code_point` occupies (1 - 4),
// or 0 if `code_point` is above U+10FFFF and therefore cannot be encoded.
// NOTE: Surrogate values (U+D800 - U+DFFF) are not rejected; they are reported as 3 bytes.
constexpr int bytes_to_store_code_point_in_utf8(u32 code_point)
{
    // Walk the ranges from the top down; the first lower bound that is exceeded decides the length.
    if (code_point > 0x10ffff)
        return 0;
    if (code_point > 0xffff)
        return 4;
    if (code_point > 0x7ff)
        return 3;
    if (code_point > 0x7f)
        return 2;
    return 1;
}

// Encodes `code_point` as UTF-8 and passes each encoded byte to `callback`, lead byte first.
// Returns the number of bytes emitted (1 - 4), or -1 without invoking `callback` when
// `code_point` is above U+10FFFF.
// NOTE: Surrogate values (U+D800 - U+DFFF) are not special-cased; they are emitted as 3 bytes.
template<typename Callback>
[[nodiscard]] constexpr int code_point_to_utf8(u32 code_point, Callback callback)
{
    // ASCII maps to itself in a single byte.
    if (code_point <= 0x7f) {
        callback(static_cast<char>(code_point));
        return 1;
    }

    // Multi-byte sequences differ only in their length and in the marker/payload mask of the lead byte.
    int sequence_length = 0;
    u32 lead_marker = 0;
    u32 lead_payload_mask = 0;
    if (code_point <= 0x07ff) {
        sequence_length = 2;
        lead_marker = 0xc0;
        lead_payload_mask = 0x1f;
    } else if (code_point <= 0xffff) {
        sequence_length = 3;
        lead_marker = 0xe0;
        lead_payload_mask = 0x0f;
    } else if (code_point <= 0x10ffff) {
        sequence_length = 4;
        lead_marker = 0xf0;
        lead_payload_mask = 0x07;
    } else {
        return -1;
    }

    // The lead byte carries the topmost payload bits; every following byte carries the next six bits.
    int shift = 6 * (sequence_length - 1);
    callback(static_cast<char>(((code_point >> shift) & lead_payload_mask) | lead_marker));
    while (shift > 0) {
        shift -= 6;
        callback(static_cast<char>(((code_point >> shift) & 0x3f) | 0x80));
    }
    return sequence_length;
}

// Fallible counterpart of code_point_to_utf8(): encodes `code_point` as UTF-8 and passes each
// encoded byte to `callback`, lead byte first. Every callback invocation is wrapped in TRY().
// Returns the number of bytes emitted (1 - 4), or -1 without invoking `callback` when
// `code_point` is above U+10FFFF.
// NOTE: Surrogate values (U+D800 - U+DFFF) are not special-cased; they are emitted as 3 bytes.
template<FallibleFunction<char> Callback>
[[nodiscard]] ErrorOr<int> try_code_point_to_utf8(u32 code_point, Callback&& callback)
{
    // ASCII maps to itself in a single byte.
    if (code_point <= 0x7f) {
        TRY(callback(static_cast<char>(code_point)));
        return 1;
    }

    // Multi-byte sequences differ only in their length and in the marker/payload mask of the lead byte.
    int sequence_length = 0;
    u32 lead_marker = 0;
    u32 lead_payload_mask = 0;
    if (code_point <= 0x07ff) {
        sequence_length = 2;
        lead_marker = 0xc0;
        lead_payload_mask = 0x1f;
    } else if (code_point <= 0xffff) {
        sequence_length = 3;
        lead_marker = 0xe0;
        lead_payload_mask = 0x0f;
    } else if (code_point <= 0x10ffff) {
        sequence_length = 4;
        lead_marker = 0xf0;
        lead_payload_mask = 0x07;
    } else {
        return -1;
    }

    // The lead byte carries the topmost payload bits; every following byte carries the next six bits.
    int shift = 6 * (sequence_length - 1);
    TRY(callback(static_cast<char>(((code_point >> shift) & lead_payload_mask) | lead_marker)));
    while (shift > 0) {
        shift -= 6;
        TRY(callback(static_cast<char>(((code_point >> shift) & 0x3f) | 0x80)));
    }
    return sequence_length;
}

/**
 * Compute an upper bound on the number of UTF-8 bytes needed to store the given UTF-16 code units,
 * counting every surrogate code unit (paired or not) as 3 bytes.
 * A valid surrogate pair (U+10000 - U+10FFFF) is therefore counted as 6 bytes although it encodes to 4,
 * so at most one third of the returned value is slack.
 */
[[nodiscard]] static inline size_t maximum_utf8_length_from_utf16(ReadonlySpan<u16> code_units)
{
    // # Unicode code point -> UTF-8 bytes needed
    // U+0000   - U+007F   => 1 byte
    // U+0080   - U+07FF   => 2 bytes
    // U+0800   - U+FFFF   => 3 bytes
    // U+010000 - U+10FFFF => 4 bytes

    // # UTF-16 code unit -> UTF-8 bytes needed
    // 0x0000 - 0x007f [U+000000 - U+00007F] => 1 byte
    // 0x0080 - 0x07ff [U+000080 - U+0007FF] => 2 bytes
    // 0x0800 - 0xd7ff [U+000800 - U+00D7FF] => 3 bytes
    // 0xd800 - 0xdbff (high surrogate)      => 4 bytes when followed by a low surrogate (the pair encodes U+010000 - U+10FFFF),
    //                                          or 3 bytes when unmatched.
    // 0xdc00 - 0xdfff (low surrogate)       => 0 bytes when preceded by a high surrogate (already accounted for at 0xd800 - 0xdbff),
    //                                          or 3 bytes when unmatched.
    // 0xe000 - 0xffff [U+00E000 - U+00FFFF] => 3 bytes

    // # UTF-16 code unit -> length actually added below
    // 0x0000 - 0x007f => 1
    // 0x0080 - 0x07ff => 2
    // 0x0800 - 0xffff => 3
    //   ^ This includes both surrogate ranges. A matched pair contributes 3 + 3 = 6, which covers the 4 bytes required;
    //     an unmatched surrogate contributes exactly the 3 bytes needed to encode it on its own.

    size_t upper_bound = 0;

    // NOTE: Kept branch-free so that the loop is easy to vectorize.
    for (auto code_unit : code_units)
        upper_bound += 1 + (code_unit > 0x007f) + (code_unit > 0x07ff);

    return upper_bound;
}

}
AK: Add UnicodeUtils with Unicode-related helper functions This introduces the UnicodeUtils file, which contains helper functions related to Unicode. This is in contrast to StringUtils, whose functions are not directly related to Unicode and are, in theory, encoding-agnostic. 2021-05-20 10:56:38 +00:00			`/*`
			`* Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>`
			`*`
			`* SPDX-License-Identifier: BSD-2-Clause`
			`*/`

			`#pragma once`

AK: Make StringBuilder::try_append_code_point actually fallible It currently uses the non-fallible `append` method to append each UTF-8 encoded byte of the code point. 2023-01-07 19:48:20 +00:00			`#include <AK/Concepts.h>`
			`#include <AK/Error.h>`
AK: Add UnicodeUtils with Unicode-related helper functions This introduces the UnicodeUtils file, which contains helper functions related to Unicode. This is in contrast to StringUtils, whose functions are not directly related to Unicode and are, in theory, encoding-agnostic. 2021-05-20 10:56:38 +00:00			`#include <AK/Forward.h>`

			`namespace AK::UnicodeUtils {`

AK: Stop using ShortString in String::from_code_point Refactor it to use StringBase::replace_with_new_short_string instead. 2023-10-28 21:06:51 +00:00			`constexpr int bytes_to_store_code_point_in_utf8(u32 code_point)`
			`{`
			`if (code_point <= 0x7f)`
			`return 1;`
			`if (code_point <= 0x7ff)`
			`return 2;`
			`if (code_point <= 0xffff)`
			`return 3;`
			`if (code_point <= 0x10ffff)`
			`return 4;`
			`return 0;`
			`}`

LibC: Implement wcrtomb This function converts a single wide character into its multibyte representation (UTF-8 in our case). It is called from libc++'s `std::basic_ostream<wchar_t>::flush`, which gets called at program exit from a global destructor in order to flush `std::wcout`. 2021-10-04 14:59:13 +00:00			`template<typename Callback>`
			`[[nodiscard]] constexpr int code_point_to_utf8(u32 code_point, Callback callback)`
			`{`
			`if (code_point <= 0x7f) {`
AK: Replace C-style casts 2023-03-07 14:28:21 +00:00			`callback(static_cast<char>(code_point));`
LibC: Implement wcrtomb This function converts a single wide character into its multibyte representation (UTF-8 in our case). It is called from libc++'s `std::basic_ostream<wchar_t>::flush`, which gets called at program exit from a global destructor in order to flush `std::wcout`. 2021-10-04 14:59:13 +00:00			`return 1;`
			`} else if (code_point <= 0x07ff) {`
AK: Replace C-style casts 2023-03-07 14:28:21 +00:00			`callback(static_cast<char>(((code_point >> 6) & 0x1f) \| 0xc0));`
			`callback(static_cast<char>(((code_point >> 0) & 0x3f) \| 0x80));`
LibC: Implement wcrtomb This function converts a single wide character into its multibyte representation (UTF-8 in our case). It is called from libc++'s `std::basic_ostream<wchar_t>::flush`, which gets called at program exit from a global destructor in order to flush `std::wcout`. 2021-10-04 14:59:13 +00:00			`return 2;`
			`} else if (code_point <= 0xffff) {`
AK: Replace C-style casts 2023-03-07 14:28:21 +00:00			`callback(static_cast<char>(((code_point >> 12) & 0x0f) \| 0xe0));`
			`callback(static_cast<char>(((code_point >> 6) & 0x3f) \| 0x80));`
			`callback(static_cast<char>(((code_point >> 0) & 0x3f) \| 0x80));`
LibC: Implement wcrtomb This function converts a single wide character into its multibyte representation (UTF-8 in our case). It is called from libc++'s `std::basic_ostream<wchar_t>::flush`, which gets called at program exit from a global destructor in order to flush `std::wcout`. 2021-10-04 14:59:13 +00:00			`return 3;`
			`} else if (code_point <= 0x10ffff) {`
AK: Replace C-style casts 2023-03-07 14:28:21 +00:00			`callback(static_cast<char>(((code_point >> 18) & 0x07) \| 0xf0));`
			`callback(static_cast<char>(((code_point >> 12) & 0x3f) \| 0x80));`
			`callback(static_cast<char>(((code_point >> 6) & 0x3f) \| 0x80));`
			`callback(static_cast<char>(((code_point >> 0) & 0x3f) \| 0x80));`
LibC: Implement wcrtomb This function converts a single wide character into its multibyte representation (UTF-8 in our case). It is called from libc++'s `std::basic_ostream<wchar_t>::flush`, which gets called at program exit from a global destructor in order to flush `std::wcout`. 2021-10-04 14:59:13 +00:00			`return 4;`
			`}`
			`return -1;`
			`}`

AK: Make StringBuilder::try_append_code_point actually fallible It currently uses the non-fallible `append` method to append each UTF-8 encoded byte of the code point. 2023-01-07 19:48:20 +00:00			`template<FallibleFunction<char> Callback>`
			`[[nodiscard]] ErrorOr<int> try_code_point_to_utf8(u32 code_point, Callback&& callback)`
			`{`
			`if (code_point <= 0x7f) {`
			`TRY(callback(static_cast<char>(code_point)));`
			`return 1;`
			`}`
			`if (code_point <= 0x07ff) {`
			`TRY(callback(static_cast<char>((((code_point >> 6) & 0x1f) \| 0xc0))));`
			`TRY(callback(static_cast<char>((((code_point >> 0) & 0x3f) \| 0x80))));`
			`return 2;`
			`}`
			`if (code_point <= 0xffff) {`
			`TRY(callback(static_cast<char>((((code_point >> 12) & 0x0f) \| 0xe0))));`
			`TRY(callback(static_cast<char>((((code_point >> 6) & 0x3f) \| 0x80))));`
			`TRY(callback(static_cast<char>((((code_point >> 0) & 0x3f) \| 0x80))));`
			`return 3;`
			`}`
			`if (code_point <= 0x10ffff) {`
			`TRY(callback(static_cast<char>((((code_point >> 18) & 0x07) \| 0xf0))));`
			`TRY(callback(static_cast<char>((((code_point >> 12) & 0x3f) \| 0x80))));`
			`TRY(callback(static_cast<char>((((code_point >> 6) & 0x3f) \| 0x80))));`
			`TRY(callback(static_cast<char>((((code_point >> 0) & 0x3f) \| 0x80))));`
			`return 4;`
			`}`
			`return -1;`
			`}`

AK: Use `simdutf` when appending UTF-16 to StringBuilder Adds a fast path for valid UTF-16 using `simdutf`, and fall back to the slow path for unmatched surrogates. 2024-10-26 09:05:31 +00:00			`/**`
			`* Compute the maximum number of UTF-8 bytes needed to store a given UTF-16 string, accounting for unmatched UTF-16 surrogates.`
			`* This function will overcount by at most 33%; 2 bytes for every valid UTF-16 codepoint between U+100000 and U+10FFFF.`
			`*/`
			`[[nodiscard]] static inline size_t maximum_utf8_length_from_utf16(ReadonlySpan<u16> code_units)`
			`{`
			`// # UTF-8 code point -> no. UTF-8 bytes needed`
			`// U+0000 - U+007F => 1 UTF-8 bytes`
			`// U+0080 - U+07FF => 2 UTF-8 bytes`
			`// U+0800 - U+FFFF => 3 UTF-8 bytes`
			`// U+010000 - U+10FFFF => 4 UTF-8 bytes`

			`// # UTF-16 code unit -> no. UTF-8 bytes needed`
			`// 0x0000 - 0x007f [U+000000 - U+00007F] = 1 UTF-8 bytes`
			`// 0x0080 - 0x07ff [U+000080 - U+0007FF] = 2 UTF-8 bytes`
			`// 0x0800 - 0xd7ff [U+000800 - U+00FFFF] = 3 UTF-8 bytes`
			`// 0xd800 - 0xdbff [U+010000 - U+10FFFF] = 4 UTF-8 bytes to encode valid UTF-16 code units,`
			`// or 3 UTF-8 bytes to encode the unmatched surrogate code unit.`
			`// 0xdc00 - 0xdfff [U+010000 - U+10FFFF] = 0 UTF-8 bytes to encode valid UTF-16 code units (because it is already accounted for in 0xdc00 - 0xdfff),`
			`// or 3 UTF-8 bytes to encode the unmatched surrogate code unit.`
			`// 0xe000 - 0xffff [U+00E000 - U+00FFFF] = 3 UTF-8 bytes`

			`// # UTF-16 code unit -> actual length added.`
			`// 0x0000 - 0x007f = 1`
			`// 0x0080 - 0x07ff = 2`
			`// 0x0800 - 0xd7ff = 3`
			`// 0xd800 - 0xdbff = 3`
			`// ^ If the next code unit is 0xdc00 - 0xdfff, they will combined sum to 6, which is greater than the 4 required.`
			`// Otherwise, 3 bytes are needed to encode U+D800 - U+DBFF.`
			`// 0xdc00 - 0xdfff = 3`
			`// ^ If the previous code unit was, 0xd800 - 0xdbff, this will ensure that the combined sum is greater than 4.`
			`// Otherwise, 3 bytes are needed to encode U+DC00 - U+DFFF.`
			`// 0xe000 - 0xffff = 3`

			`size_t maximum_utf8_length = 0;`

			`// NOTE: This loop is designed to be easy to vectorize.`
			`for (auto code_unit : code_units) {`
			`maximum_utf8_length += 1;`
			`maximum_utf8_length += code_unit > 0x007f;`
			`maximum_utf8_length += code_unit > 0x07ff;`
			`}`

			`return maximum_utf8_length;`
			`}`

AK: Add UnicodeUtils with Unicode-related helper functions This introduces the UnicodeUtils file, which contains helper functions related to Unicode. This is in contrast to StringUtils, whose functions are not directly related to Unicode and are, in theory, encoding-agnostic. 2021-05-20 10:56:38 +00:00			`}`