diff --git a/Tests/LibTextCodec/TestTextDecoders.cpp b/Tests/LibTextCodec/TestTextDecoders.cpp
index c5964695a61..fe873bf7ffc 100644
--- a/Tests/LibTextCodec/TestTextDecoders.cpp
+++ b/Tests/LibTextCodec/TestTextDecoders.cpp
@@ -23,3 +23,37 @@ TEST_CASE(test_utf8_decode)
 
     EXPECT(decoder.to_utf8(test_string) == test_string);
 }
+
+TEST_CASE(test_utf16be_decode)
+{
+    auto decoder = TextCodec::UTF16BEDecoder();
+    // This is the output of `python3 -c "print('säk😀'.encode('utf-16be'))"`; the last four bytes are the surrogate pair D83D DE00 for U+1F600.
+    auto test_string = "\x00s\x00\xe4\x00k\xd8=\xde\x00"sv;
+
+    Vector processed_code_points;
+    decoder.process(test_string, [&](u32 code_point) {
+        processed_code_points.append(code_point);
+    });
+    EXPECT(processed_code_points.size() == 4);
+    EXPECT(processed_code_points[0] == 0x73);
+    EXPECT(processed_code_points[1] == 0xE4);
+    EXPECT(processed_code_points[2] == 0x6B);
+    EXPECT(processed_code_points[3] == 0x1F600);
+}
+
+TEST_CASE(test_utf16le_decode)
+{
+    auto decoder = TextCodec::UTF16LEDecoder();
+    // This is the output of `python3 -c "print('säk😀'.encode('utf-16le'))"`; the last four bytes are the surrogate pair D83D DE00 for U+1F600, each unit stored low byte first.
+    auto test_string = "s\x00\xe4\x00k\x00=\xd8\x00\xde"sv;
+
+    Vector processed_code_points;
+    decoder.process(test_string, [&](u32 code_point) {
+        processed_code_points.append(code_point);
+    });
+    EXPECT(processed_code_points.size() == 4);
+    EXPECT(processed_code_points[0] == 0x73);
+    EXPECT(processed_code_points[1] == 0xE4);
+    EXPECT(processed_code_points[2] == 0x6B);
+    EXPECT(processed_code_points[3] == 0x1F600);
+}
diff --git a/Userland/Libraries/LibTextCodec/Decoder.cpp b/Userland/Libraries/LibTextCodec/Decoder.cpp
index 47f4805bd85..1bb32a28a7f 100644
--- a/Userland/Libraries/LibTextCodec/Decoder.cpp
+++ b/Userland/Libraries/LibTextCodec/Decoder.cpp
@@ -7,11 +7,14 @@
 
 #include
 #include
+#include
 #include
 #include
 
 namespace TextCodec {
 
+static constexpr u32 replacement_code_point = 0xfffd;
+
 namespace {
 Latin1Decoder s_latin1_decoder;
 UTF8Decoder s_utf8_decoder;
@@ -234,10 +237,40 @@ DeprecatedString UTF8Decoder::to_utf8(StringView input)
 
 void UTF16BEDecoder::process(StringView input, Function on_code_point)
 {
+    // Decodes big-endian UTF-16 per RFC 2781, section 2.2 (Decoding UTF-16); a trailing odd byte is never read.
     size_t utf16_length = input.length() - (input.length() % 2);
     for (size_t i = 0; i < utf16_length; i += 2) {
-        u16 code_point = (input[i] << 8) | input[i + 1];
-        on_code_point(code_point);
+        // 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value of W1.
+        //    Terminate. (W1 is assembled from two input bytes, high byte first.)
+        u16 w1 = (static_cast(input[i]) << 8) | static_cast(input[i + 1]);
+        if (!is_unicode_surrogate(w1)) {
+            on_code_point(w1);
+            continue;
+        }
+
+        // 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
+        //    is in error and no valid character can be obtained using W1. Terminate.
+        // 3) If there is no W2 (that is, the sequence ends with W1), or if W2
+        //    is not between 0xDC00 and 0xDFFF, the sequence is in error. Terminate.
+        // Each error reports replacement_code_point (U+FFFD) and advances one code
+        // unit, so a rejected W2 is examined again as W1 on the next iteration.
+        if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length) {
+            on_code_point(replacement_code_point);
+            continue;
+        }
+
+        u16 w2 = (static_cast(input[i + 2]) << 8) | static_cast(input[i + 3]);
+        if (!Utf16View::is_low_surrogate(w2)) {
+            on_code_point(replacement_code_point);
+            continue;
+        }
+
+        // 4) Construct a 20-bit unsigned integer U', taking the 10 low-order bits of W1 as
+        //    its 10 high-order bits and the 10 low-order bits of W2 as its 10 low-order bits.
+        // 5) Add 0x10000 to U' to obtain the character value U. Terminate.
+        // The extra `i += 2` after the call skips W2, which was consumed as part of the pair.
+        on_code_point(Utf16View::decode_surrogate_pair(w1, w2));
+        i += 2;
     }
 }
 
@@ -255,10 +288,40 @@ DeprecatedString UTF16BEDecoder::to_utf8(StringView input)
 
 void UTF16LEDecoder::process(StringView input, Function on_code_point)
 {
+    // Decodes little-endian UTF-16 per RFC 2781, section 2.2 (Decoding UTF-16); a trailing odd byte is never read.
     size_t utf16_length = input.length() - (input.length() % 2);
     for (size_t i = 0; i < utf16_length; i += 2) {
-        u16 code_point = input[i] | (input[i + 1] << 8);
-        on_code_point(code_point);
+        // 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value of W1.
+        //    Terminate. (W1 is assembled from two input bytes, low byte first.)
+        u16 w1 = static_cast(input[i]) | (static_cast(input[i + 1]) << 8);
+        if (!is_unicode_surrogate(w1)) {
+            on_code_point(w1);
+            continue;
+        }
+
+        // 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
+        //    is in error and no valid character can be obtained using W1. Terminate.
+        // 3) If there is no W2 (that is, the sequence ends with W1), or if W2
+        //    is not between 0xDC00 and 0xDFFF, the sequence is in error. Terminate.
+        // Each error reports replacement_code_point (U+FFFD) and advances one code
+        // unit, so a rejected W2 is examined again as W1 on the next iteration.
+        if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length) {
+            on_code_point(replacement_code_point);
+            continue;
+        }
+
+        u16 w2 = static_cast(input[i + 2]) | (static_cast(input[i + 3]) << 8);
+        if (!Utf16View::is_low_surrogate(w2)) {
+            on_code_point(replacement_code_point);
+            continue;
+        }
+
+        // 4) Construct a 20-bit unsigned integer U', taking the 10 low-order bits of W1 as
+        //    its 10 high-order bits and the 10 low-order bits of W2 as its 10 low-order bits.
+        // 5) Add 0x10000 to U' to obtain the character value U. Terminate.
+        // The extra `i += 2` after the call skips W2, which was consumed as part of the pair.
+        on_code_point(Utf16View::decode_surrogate_pair(w1, w2));
+        i += 2;
     }
 }
 