mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-26 01:20:25 +00:00
LibTextCodec: Make utf-16be and utf-16le codecs actually work
There were two problems: 1. They didn't handle surrogate pairs. 2. They used signed chars, leading to e.g. 0x00e4 being treated as 0xffe4. Also add a basic test that catches both issues. There's some code duplication with Utf16CodePointIterator::operator*(), but let's get things working first.
This commit is contained in:
parent
aa9037eed4
commit
3423b54eb9
Notes:
sideshowbarker
2024-07-17 01:22:50 +09:00
Author: https://github.com/nico Commit: https://github.com/SerenityOS/serenity/commit/3423b54eb9 Pull-request: https://github.com/SerenityOS/serenity/pull/17133
2 changed files with 101 additions and 4 deletions
|
@ -23,3 +23,37 @@ TEST_CASE(test_utf8_decode)
|
||||||
|
|
||||||
EXPECT(decoder.to_utf8(test_string) == test_string);
|
EXPECT(decoder.to_utf8(test_string) == test_string);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Verifies that the UTF-16BE decoder handles both BMP characters and
// surrogate pairs (the emoji below lives in a supplementary plane).
TEST_CASE(test_utf16be_decode)
{
    auto decoder = TextCodec::UTF16BEDecoder();

    // This is the output of `python3 -c "print('säk😀'.encode('utf-16be'))"`.
    auto test_string = "\x00s\x00\xe4\x00k\xd8=\xde\x00"sv;

    Vector<u32> decoded;
    decoder.process(test_string, [&decoded](u32 code_point) {
        decoded.append(code_point);
    });

    EXPECT(decoded.size() == 4);
    EXPECT(decoded[0] == 0x73);    // 's'
    EXPECT(decoded[1] == 0xE4);    // 'ä' — would be 0xFFE4 with signed-char arithmetic
    EXPECT(decoded[2] == 0x6B);    // 'k'
    EXPECT(decoded[3] == 0x1F600); // 😀, decoded from the surrogate pair D83D DE00
}
|
||||||
|
|
||||||
|
// Same scenario as the BE test above, but with the byte order swapped:
// the little-endian decoder must also survive surrogate pairs.
TEST_CASE(test_utf16le_decode)
{
    auto decoder = TextCodec::UTF16LEDecoder();

    // This is the output of `python3 -c "print('säk😀'.encode('utf-16le'))"`.
    auto test_string = "s\x00\xe4\x00k\x00=\xd8\x00\xde"sv;

    Vector<u32> decoded;
    decoder.process(test_string, [&decoded](u32 code_point) {
        decoded.append(code_point);
    });

    EXPECT(decoded.size() == 4);
    EXPECT(decoded[0] == 0x73);    // 's'
    EXPECT(decoded[1] == 0xE4);    // 'ä' — would be 0xFFE4 with signed-char arithmetic
    EXPECT(decoded[2] == 0x6B);    // 'k'
    EXPECT(decoded[3] == 0x1F600); // 😀, decoded from the surrogate pair D83D DE00
}
|
||||||
|
|
|
@ -7,11 +7,14 @@
|
||||||
|
|
||||||
#include <AK/DeprecatedString.h>
|
#include <AK/DeprecatedString.h>
|
||||||
#include <AK/StringBuilder.h>
|
#include <AK/StringBuilder.h>
|
||||||
|
#include <AK/Utf16View.h>
|
||||||
#include <AK/Utf8View.h>
|
#include <AK/Utf8View.h>
|
||||||
#include <LibTextCodec/Decoder.h>
|
#include <LibTextCodec/Decoder.h>
|
||||||
|
|
||||||
namespace TextCodec {
|
namespace TextCodec {
|
||||||
|
|
||||||
|
static constexpr u32 replacement_code_point = 0xfffd;
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
Latin1Decoder s_latin1_decoder;
|
Latin1Decoder s_latin1_decoder;
|
||||||
UTF8Decoder s_utf8_decoder;
|
UTF8Decoder s_utf8_decoder;
|
||||||
|
@ -234,10 +237,40 @@ DeprecatedString UTF8Decoder::to_utf8(StringView input)
|
||||||
|
|
||||||
// Decodes big-endian UTF-16 bytes into code points, following
// rfc2781, 2.2 "Decoding UTF-16". A dangling odd byte at the end is
// ignored; malformed surrogate sequences yield U+FFFD per unit.
void UTF16BEDecoder::process(StringView input, Function<void(u32)> on_code_point)
{
    // Only complete 16-bit units participate; truncate to an even length.
    size_t const even_length = input.length() - (input.length() % 2);

    // Bytes must be widened as unsigned — going through a signed char
    // would sign-extend e.g. 0xE4 into 0xFFE4.
    auto read_unit = [&](size_t offset) -> u16 {
        return (static_cast<u8>(input[offset]) << 8) | static_cast<u8>(input[offset + 1]);
    };

    for (size_t position = 0; position < even_length; position += 2) {
        u16 const first_unit = read_unit(position);

        // 1) A non-surrogate unit is a code point by itself.
        if (!is_unicode_surrogate(first_unit)) {
            on_code_point(first_unit);
            continue;
        }

        // 2) A lone low surrogate is an error; so is
        // 3) a high surrogate with no following unit.
        if (!Utf16View::is_high_surrogate(first_unit) || position + 2 == even_length) {
            on_code_point(replacement_code_point);
            continue;
        }

        // 3) (cont.) The follower must be a low surrogate.
        u16 const second_unit = read_unit(position + 2);
        if (!Utf16View::is_low_surrogate(second_unit)) {
            on_code_point(replacement_code_point);
            continue;
        }

        // 4) + 5) Combine the pair into a supplementary-plane code point,
        // then skip past the low surrogate we just consumed.
        on_code_point(Utf16View::decode_surrogate_pair(first_unit, second_unit));
        position += 2;
    }
}
|
||||||
|
|
||||||
|
@ -255,10 +288,40 @@ DeprecatedString UTF16BEDecoder::to_utf8(StringView input)
|
||||||
|
|
||||||
// Decodes little-endian UTF-16 bytes into code points, following
// rfc2781, 2.2 "Decoding UTF-16". A dangling odd byte at the end is
// ignored; malformed surrogate sequences yield U+FFFD per unit.
void UTF16LEDecoder::process(StringView input, Function<void(u32)> on_code_point)
{
    // Only complete 16-bit units participate; truncate to an even length.
    size_t const even_length = input.length() - (input.length() % 2);

    // Bytes must be widened as unsigned — going through a signed char
    // would sign-extend e.g. 0xE4 into 0xFFE4.
    auto read_unit = [&](size_t offset) -> u16 {
        return static_cast<u8>(input[offset]) | (static_cast<u8>(input[offset + 1]) << 8);
    };

    for (size_t position = 0; position < even_length; position += 2) {
        u16 const first_unit = read_unit(position);

        // 1) A non-surrogate unit is a code point by itself.
        if (!is_unicode_surrogate(first_unit)) {
            on_code_point(first_unit);
            continue;
        }

        // 2) A lone low surrogate is an error; so is
        // 3) a high surrogate with no following unit.
        if (!Utf16View::is_high_surrogate(first_unit) || position + 2 == even_length) {
            on_code_point(replacement_code_point);
            continue;
        }

        // 3) (cont.) The follower must be a low surrogate.
        u16 const second_unit = read_unit(position + 2);
        if (!Utf16View::is_low_surrogate(second_unit)) {
            on_code_point(replacement_code_point);
            continue;
        }

        // 4) + 5) Combine the pair into a supplementary-plane code point,
        // then skip past the low surrogate we just consumed.
        on_code_point(Utf16View::decode_surrogate_pair(first_unit, second_unit));
        position += 2;
    }
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue