Przeglądaj źródła

LibTextCodec: Use AK facilities to validate and convert UTF-16 to UTF-8

This allows LibTextCodec to make use of simdutf, and also reduces the
number of places with manual UTF-16 implementations.
Timothy Flynn 1 rok temu
rodzic
commit
368dad54ef

+ 12 - 0
Tests/LibTextCodec/TestTextDecoders.cpp

@@ -15,6 +15,8 @@ TEST_CASE(test_utf8_decode)
     // Bytes for U+1F600 GRINNING FACE
     auto test_string = "\xf0\x9f\x98\x80"sv;
 
+    EXPECT(decoder.validate(test_string));
+
     Vector<u32> processed_code_points;
     MUST(decoder.process(test_string, [&](u32 code_point) {
         return processed_code_points.try_append(code_point);
@@ -31,6 +33,8 @@ TEST_CASE(test_utf16be_decode)
     // This is the output of `python3 -c "print('säk😀'.encode('utf-16be'))"`.
     auto test_string = "\x00s\x00\xe4\x00k\xd8=\xde\x00"sv;
 
+    EXPECT(decoder.validate(test_string));
+
     Vector<u32> processed_code_points;
     MUST(decoder.process(test_string, [&](u32 code_point) {
         return processed_code_points.try_append(code_point);
@@ -40,6 +44,9 @@ TEST_CASE(test_utf16be_decode)
     EXPECT(processed_code_points[1] == 0xE4);
     EXPECT(processed_code_points[2] == 0x6B);
     EXPECT(processed_code_points[3] == 0x1F600);
+
+    auto utf8 = MUST(decoder.to_utf8(test_string));
+    EXPECT_EQ(utf8, "säk😀"sv);
 }
 
 TEST_CASE(test_utf16le_decode)
@@ -48,6 +55,8 @@ TEST_CASE(test_utf16le_decode)
     // This is the output of `python3 -c "print('säk😀'.encode('utf-16le'))"`.
     auto test_string = "s\x00\xe4\x00k\x00=\xd8\x00\xde"sv;
 
+    EXPECT(decoder.validate(test_string));
+
     Vector<u32> processed_code_points;
     MUST(decoder.process(test_string, [&](u32 code_point) {
         return processed_code_points.try_append(code_point);
@@ -57,4 +66,7 @@ TEST_CASE(test_utf16le_decode)
     EXPECT(processed_code_points[1] == 0xE4);
     EXPECT(processed_code_points[2] == 0x6B);
     EXPECT(processed_code_points[3] == 0x1F600);
+
+    auto utf8 = MUST(decoder.to_utf8(test_string));
+    EXPECT_EQ(utf8, "säk😀"sv);
 }

+ 18 - 112
Userland/Libraries/LibTextCodec/Decoder.cpp

@@ -375,150 +375,56 @@ ErrorOr<String> UTF8Decoder::to_utf8(StringView input)
     return Decoder::to_utf8(bomless_input);
 }
 
-ErrorOr<void> UTF16BEDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
+static Utf16View as_utf16(StringView view, AK::Endianness endianness)
 {
-    // rfc2781, 2.2 Decoding UTF-16
-    size_t utf16_length = input.length() - (input.length() % 2);
-    for (size_t i = 0; i < utf16_length; i += 2) {
-        // 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value
-        //    of W1. Terminate.
-        u16 w1 = (static_cast<u8>(input[i]) << 8) | static_cast<u8>(input[i + 1]);
-        if (!is_unicode_surrogate(w1)) {
-            TRY(on_code_point(w1));
-            continue;
-        }
-
-        // 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
-        //    is in error and no valid character can be obtained using W1.
-        //    Terminate.
-        // 3) If there is no W2 (that is, the sequence ends with W1), or if W2
-        //    is not between 0xDC00 and 0xDFFF, the sequence is in error.
-        //    Terminate.
-        if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length) {
-            TRY(on_code_point(replacement_code_point));
-            continue;
-        }
-
-        u16 w2 = (static_cast<u8>(input[i + 2]) << 8) | static_cast<u8>(input[i + 3]);
-        if (!Utf16View::is_low_surrogate(w2)) {
-            TRY(on_code_point(replacement_code_point));
-            continue;
-        }
+    return Utf16View {
+        { reinterpret_cast<u16 const*>(view.bytes().data()), view.length() / 2 },
+        endianness
+    };
+}
 
-        // 4) Construct a 20-bit unsigned integer U', taking the 10 low-order
-        //    bits of W1 as its 10 high-order bits and the 10 low-order bits of
-        //    W2 as its 10 low-order bits.
-        // 5) Add 0x10000 to U' to obtain the character value U. Terminate.
-        TRY(on_code_point(Utf16View::decode_surrogate_pair(w1, w2)));
-        i += 2;
-    }
+ErrorOr<void> UTF16BEDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
+{
+    for (auto code_point : as_utf16(input, AK::Endianness::Big))
+        TRY(on_code_point(code_point));
 
     return {};
 }
 
 bool UTF16BEDecoder::validate(StringView input)
 {
-    size_t utf16_length = input.length() - (input.length() % 2);
-    for (size_t i = 0; i < utf16_length; i += 2) {
-        u16 w1 = (static_cast<u8>(input[i]) << 8) | static_cast<u8>(input[i + 1]);
-        if (!is_unicode_surrogate(w1))
-            continue;
-
-        if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length)
-            return false;
-
-        u16 w2 = (static_cast<u8>(input[i + 2]) << 8) | static_cast<u8>(input[i + 3]);
-        if (!Utf16View::is_low_surrogate(w2))
-            return false;
-
-        i += 2;
-    }
-    return true;
+    return as_utf16(input, AK::Endianness::Big).validate();
 }
 
 ErrorOr<String> UTF16BEDecoder::to_utf8(StringView input)
 {
     // Discard the BOM
-    auto bomless_input = input;
     if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF)
-        bomless_input = input.substring_view(2);
+        input = input.substring_view(2);
 
-    StringBuilder builder(bomless_input.length() / 2);
-    TRY(process(bomless_input, [&builder](u32 c) { return builder.try_append_code_point(c); }));
-    return builder.to_string();
+    return String::from_utf16(as_utf16(input, AK::Endianness::Big));
 }
 
 ErrorOr<void> UTF16LEDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
 {
-    // rfc2781, 2.2 Decoding UTF-16
-    size_t utf16_length = input.length() - (input.length() % 2);
-    for (size_t i = 0; i < utf16_length; i += 2) {
-        // 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value
-        //    of W1. Terminate.
-        u16 w1 = static_cast<u8>(input[i]) | (static_cast<u8>(input[i + 1]) << 8);
-        if (!is_unicode_surrogate(w1)) {
-            TRY(on_code_point(w1));
-            continue;
-        }
-
-        // 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
-        //    is in error and no valid character can be obtained using W1.
-        //    Terminate.
-        // 3) If there is no W2 (that is, the sequence ends with W1), or if W2
-        //    is not between 0xDC00 and 0xDFFF, the sequence is in error.
-        //    Terminate.
-        if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length) {
-            TRY(on_code_point(replacement_code_point));
-            continue;
-        }
-
-        u16 w2 = static_cast<u8>(input[i + 2]) | (static_cast<u8>(input[i + 3]) << 8);
-        if (!Utf16View::is_low_surrogate(w2)) {
-            TRY(on_code_point(replacement_code_point));
-            continue;
-        }
-
-        // 4) Construct a 20-bit unsigned integer U', taking the 10 low-order
-        //    bits of W1 as its 10 high-order bits and the 10 low-order bits of
-        //    W2 as its 10 low-order bits.
-        // 5) Add 0x10000 to U' to obtain the character value U. Terminate.
-        TRY(on_code_point(Utf16View::decode_surrogate_pair(w1, w2)));
-        i += 2;
-    }
+    for (auto code_point : as_utf16(input, AK::Endianness::Little))
+        TRY(on_code_point(code_point));
 
     return {};
 }
 
 bool UTF16LEDecoder::validate(StringView input)
 {
-    size_t utf16_length = input.length() - (input.length() % 2);
-    for (size_t i = 0; i < utf16_length; i += 2) {
-        u16 w1 = static_cast<u8>(input[i]) | (static_cast<u8>(input[i + 1]) << 8);
-        if (!is_unicode_surrogate(w1))
-            continue;
-
-        if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length)
-            return false;
-
-        u16 w2 = static_cast<u8>(input[i + 2]) | (static_cast<u8>(input[i + 3]) << 8);
-        if (!Utf16View::is_low_surrogate(w2))
-            return false;
-
-        i += 2;
-    }
-    return true;
+    return as_utf16(input, AK::Endianness::Little).validate();
 }
 
 ErrorOr<String> UTF16LEDecoder::to_utf8(StringView input)
 {
     // Discard the BOM
-    auto bomless_input = input;
     if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE)
-        bomless_input = input.substring_view(2);
+        input = input.substring_view(2);
 
-    StringBuilder builder(bomless_input.length() / 2);
-    TRY(process(bomless_input, [&builder](u32 c) { return builder.try_append_code_point(c); }));
-    return builder.to_string();
+    return String::from_utf16(as_utf16(input, AK::Endianness::Little));
 }
 
 ErrorOr<void> Latin1Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)