LibTextCodec: Use AK facilities to validate and convert UTF-16 to UTF-8

This allows LibTextCodec to make use of simdutf, and also reduces the number of places with manual UTF-16 implementations.
Author: https://github.com/trflynn89 Commit: https://github.com/LadybirdBrowser/ladybird/commit/368dad54ef2 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/698
2024-11-22 07:30:19 +00:00 · 2024-07-18 12:07:37 -04:00 · 2024-07-18 12:07:37 -04:00 · 368dad54ef · 2024-07-19 06:22:43 +09:00
commit 368dad54ef
parent 71c29504af
2 changed files with 30 additions and 112 deletions
--- a/Tests/LibTextCodec/TestTextDecoders.cpp
+++ b/Tests/LibTextCodec/TestTextDecoders.cpp
@@ -15,6 +15,8 @@ TEST_CASE(test_utf8_decode)
    // Bytes for U+1F600 GRINNING FACE
    auto test_string = "\xf0\x9f\x98\x80"sv;

+    EXPECT(decoder.validate(test_string));
+
    Vector<u32> processed_code_points;
    MUST(decoder.process(test_string, [&](u32 code_point) {
        return processed_code_points.try_append(code_point);
@@ -31,6 +33,8 @@ TEST_CASE(test_utf16be_decode)
    // This is the output of `python3 -c "print('säk😀'.encode('utf-16be'))"`.
    auto test_string = "\x00s\x00\xe4\x00k\xd8=\xde\x00"sv;

+    EXPECT(decoder.validate(test_string));
+
    Vector<u32> processed_code_points;
    MUST(decoder.process(test_string, [&](u32 code_point) {
        return processed_code_points.try_append(code_point);
@@ -40,6 +44,9 @@ TEST_CASE(test_utf16be_decode)
    EXPECT(processed_code_points[1] == 0xE4);
    EXPECT(processed_code_points[2] == 0x6B);
    EXPECT(processed_code_points[3] == 0x1F600);
+
+    auto utf8 = MUST(decoder.to_utf8(test_string));
+    EXPECT_EQ(utf8, "säk😀"sv);
 }

 TEST_CASE(test_utf16le_decode)
@@ -48,6 +55,8 @@ TEST_CASE(test_utf16le_decode)
    // This is the output of `python3 -c "print('säk😀'.encode('utf-16le'))"`.
    auto test_string = "s\x00\xe4\x00k\x00=\xd8\x00\xde"sv;

+    EXPECT(decoder.validate(test_string));
+
    Vector<u32> processed_code_points;
    MUST(decoder.process(test_string, [&](u32 code_point) {
        return processed_code_points.try_append(code_point);
@@ -57,4 +66,7 @@ TEST_CASE(test_utf16le_decode)
    EXPECT(processed_code_points[1] == 0xE4);
    EXPECT(processed_code_points[2] == 0x6B);
    EXPECT(processed_code_points[3] == 0x1F600);
+
+    auto utf8 = MUST(decoder.to_utf8(test_string));
+    EXPECT_EQ(utf8, "säk😀"sv);
 }
--- a/Userland/Libraries/LibTextCodec/Decoder.cpp
+++ b/Userland/Libraries/LibTextCodec/Decoder.cpp
@@ -375,150 +375,56 @@ ErrorOr<String> UTF8Decoder::to_utf8(StringView input)
    return Decoder::to_utf8(bomless_input);
 }

+static Utf16View as_utf16(StringView view, AK::Endianness endianness)
+{
+    return Utf16View {
+        { reinterpret_cast<u16 const*>(view.bytes().data()), view.length() / 2 },
+        endianness
+    };
+}
+
 ErrorOr<void> UTF16BEDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
 {
-    // rfc2781, 2.2 Decoding UTF-16
-    size_t utf16_length = input.length() - (input.length() % 2);
-    for (size_t i = 0; i < utf16_length; i += 2) {
-        // 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value
-        //    of W1. Terminate.
-        u16 w1 = (static_cast<u8>(input[i]) << 8) | static_cast<u8>(input[i + 1]);
-        if (!is_unicode_surrogate(w1)) {
-            TRY(on_code_point(w1));
-            continue;
-        }
-
-        // 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
-        //    is in error and no valid character can be obtained using W1.
-        //    Terminate.
-        // 3) If there is no W2 (that is, the sequence ends with W1), or if W2
-        //    is not between 0xDC00 and 0xDFFF, the sequence is in error.
-        //    Terminate.
-        if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length) {
-            TRY(on_code_point(replacement_code_point));
-            continue;
-        }
-
-        u16 w2 = (static_cast<u8>(input[i + 2]) << 8) | static_cast<u8>(input[i + 3]);
-        if (!Utf16View::is_low_surrogate(w2)) {
-            TRY(on_code_point(replacement_code_point));
-            continue;
-        }
-
-        // 4) Construct a 20-bit unsigned integer U', taking the 10 low-order
-        //    bits of W1 as its 10 high-order bits and the 10 low-order bits of
-        //    W2 as its 10 low-order bits.
-        // 5) Add 0x10000 to U' to obtain the character value U. Terminate.
-        TRY(on_code_point(Utf16View::decode_surrogate_pair(w1, w2)));
-        i += 2;
-    }
+    for (auto code_point : as_utf16(input, AK::Endianness::Big))
+        TRY(on_code_point(code_point));

    return {};
 }

 bool UTF16BEDecoder::validate(StringView input)
 {
-    size_t utf16_length = input.length() - (input.length() % 2);
-    for (size_t i = 0; i < utf16_length; i += 2) {
-        u16 w1 = (static_cast<u8>(input[i]) << 8) | static_cast<u8>(input[i + 1]);
-        if (!is_unicode_surrogate(w1))
-            continue;
-
-        if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length)
-            return false;
-
-        u16 w2 = (static_cast<u8>(input[i + 2]) << 8) | static_cast<u8>(input[i + 3]);
-        if (!Utf16View::is_low_surrogate(w2))
-            return false;
-
-        i += 2;
-    }
-    return true;
+    return as_utf16(input, AK::Endianness::Big).validate();
 }

 ErrorOr<String> UTF16BEDecoder::to_utf8(StringView input)
 {
    // Discard the BOM
-    auto bomless_input = input;
    if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF)
-        bomless_input = input.substring_view(2);
+        input = input.substring_view(2);

-    StringBuilder builder(bomless_input.length() / 2);
-    TRY(process(bomless_input, [&builder](u32 c) { return builder.try_append_code_point(c); }));
-    return builder.to_string();
+    return String::from_utf16(as_utf16(input, AK::Endianness::Big));
 }

 ErrorOr<void> UTF16LEDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
 {
-    // rfc2781, 2.2 Decoding UTF-16
-    size_t utf16_length = input.length() - (input.length() % 2);
-    for (size_t i = 0; i < utf16_length; i += 2) {
-        // 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value
-        //    of W1. Terminate.
-        u16 w1 = static_cast<u8>(input[i]) | (static_cast<u8>(input[i + 1]) << 8);
-        if (!is_unicode_surrogate(w1)) {
-            TRY(on_code_point(w1));
-            continue;
-        }
-
-        // 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
-        //    is in error and no valid character can be obtained using W1.
-        //    Terminate.
-        // 3) If there is no W2 (that is, the sequence ends with W1), or if W2
-        //    is not between 0xDC00 and 0xDFFF, the sequence is in error.
-        //    Terminate.
-        if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length) {
-            TRY(on_code_point(replacement_code_point));
-            continue;
-        }
-
-        u16 w2 = static_cast<u8>(input[i + 2]) | (static_cast<u8>(input[i + 3]) << 8);
-        if (!Utf16View::is_low_surrogate(w2)) {
-            TRY(on_code_point(replacement_code_point));
-            continue;
-        }
-
-        // 4) Construct a 20-bit unsigned integer U', taking the 10 low-order
-        //    bits of W1 as its 10 high-order bits and the 10 low-order bits of
-        //    W2 as its 10 low-order bits.
-        // 5) Add 0x10000 to U' to obtain the character value U. Terminate.
-        TRY(on_code_point(Utf16View::decode_surrogate_pair(w1, w2)));
-        i += 2;
-    }
+    for (auto code_point : as_utf16(input, AK::Endianness::Little))
+        TRY(on_code_point(code_point));

    return {};
 }

 bool UTF16LEDecoder::validate(StringView input)
 {
-    size_t utf16_length = input.length() - (input.length() % 2);
-    for (size_t i = 0; i < utf16_length; i += 2) {
-        u16 w1 = static_cast<u8>(input[i]) | (static_cast<u8>(input[i + 1]) << 8);
-        if (!is_unicode_surrogate(w1))
-            continue;
-
-        if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length)
-            return false;
-
-        u16 w2 = static_cast<u8>(input[i + 2]) | (static_cast<u8>(input[i + 3]) << 8);
-        if (!Utf16View::is_low_surrogate(w2))
-            return false;
-
-        i += 2;
-    }
-    return true;
+    return as_utf16(input, AK::Endianness::Little).validate();
 }

 ErrorOr<String> UTF16LEDecoder::to_utf8(StringView input)
 {
    // Discard the BOM
-    auto bomless_input = input;
    if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE)
-        bomless_input = input.substring_view(2);
+        input = input.substring_view(2);

-    StringBuilder builder(bomless_input.length() / 2);
-    TRY(process(bomless_input, [&builder](u32 c) { return builder.try_append_code_point(c); }));
-    return builder.to_string();
+    return String::from_utf16(as_utf16(input, AK::Endianness::Little));
 }

 ErrorOr<void> Latin1Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)