mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-25 09:00:22 +00:00
LibTextCodec: Use AK facilities to validate and convert UTF-16 to UTF-8
This allows LibTextCodec to make use of simdutf, and also reduces the number of places with manual UTF-16 implementations.
This commit is contained in:
parent
71c29504af
commit
368dad54ef
Notes:
sideshowbarker
2024-07-19 06:22:43 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/LadybirdBrowser/ladybird/commit/368dad54ef2 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/698
2 changed files with 30 additions and 112 deletions
|
@ -15,6 +15,8 @@ TEST_CASE(test_utf8_decode)
|
||||||
// Bytes for U+1F600 GRINNING FACE
|
// Bytes for U+1F600 GRINNING FACE
|
||||||
auto test_string = "\xf0\x9f\x98\x80"sv;
|
auto test_string = "\xf0\x9f\x98\x80"sv;
|
||||||
|
|
||||||
|
EXPECT(decoder.validate(test_string));
|
||||||
|
|
||||||
Vector<u32> processed_code_points;
|
Vector<u32> processed_code_points;
|
||||||
MUST(decoder.process(test_string, [&](u32 code_point) {
|
MUST(decoder.process(test_string, [&](u32 code_point) {
|
||||||
return processed_code_points.try_append(code_point);
|
return processed_code_points.try_append(code_point);
|
||||||
|
@ -31,6 +33,8 @@ TEST_CASE(test_utf16be_decode)
|
||||||
// This is the output of `python3 -c "print('säk😀'.encode('utf-16be'))"`.
|
// This is the output of `python3 -c "print('säk😀'.encode('utf-16be'))"`.
|
||||||
auto test_string = "\x00s\x00\xe4\x00k\xd8=\xde\x00"sv;
|
auto test_string = "\x00s\x00\xe4\x00k\xd8=\xde\x00"sv;
|
||||||
|
|
||||||
|
EXPECT(decoder.validate(test_string));
|
||||||
|
|
||||||
Vector<u32> processed_code_points;
|
Vector<u32> processed_code_points;
|
||||||
MUST(decoder.process(test_string, [&](u32 code_point) {
|
MUST(decoder.process(test_string, [&](u32 code_point) {
|
||||||
return processed_code_points.try_append(code_point);
|
return processed_code_points.try_append(code_point);
|
||||||
|
@ -40,6 +44,9 @@ TEST_CASE(test_utf16be_decode)
|
||||||
EXPECT(processed_code_points[1] == 0xE4);
|
EXPECT(processed_code_points[1] == 0xE4);
|
||||||
EXPECT(processed_code_points[2] == 0x6B);
|
EXPECT(processed_code_points[2] == 0x6B);
|
||||||
EXPECT(processed_code_points[3] == 0x1F600);
|
EXPECT(processed_code_points[3] == 0x1F600);
|
||||||
|
|
||||||
|
auto utf8 = MUST(decoder.to_utf8(test_string));
|
||||||
|
EXPECT_EQ(utf8, "säk😀"sv);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_CASE(test_utf16le_decode)
|
TEST_CASE(test_utf16le_decode)
|
||||||
|
@ -48,6 +55,8 @@ TEST_CASE(test_utf16le_decode)
|
||||||
// This is the output of `python3 -c "print('säk😀'.encode('utf-16le'))"`.
|
// This is the output of `python3 -c "print('säk😀'.encode('utf-16le'))"`.
|
||||||
auto test_string = "s\x00\xe4\x00k\x00=\xd8\x00\xde"sv;
|
auto test_string = "s\x00\xe4\x00k\x00=\xd8\x00\xde"sv;
|
||||||
|
|
||||||
|
EXPECT(decoder.validate(test_string));
|
||||||
|
|
||||||
Vector<u32> processed_code_points;
|
Vector<u32> processed_code_points;
|
||||||
MUST(decoder.process(test_string, [&](u32 code_point) {
|
MUST(decoder.process(test_string, [&](u32 code_point) {
|
||||||
return processed_code_points.try_append(code_point);
|
return processed_code_points.try_append(code_point);
|
||||||
|
@ -57,4 +66,7 @@ TEST_CASE(test_utf16le_decode)
|
||||||
EXPECT(processed_code_points[1] == 0xE4);
|
EXPECT(processed_code_points[1] == 0xE4);
|
||||||
EXPECT(processed_code_points[2] == 0x6B);
|
EXPECT(processed_code_points[2] == 0x6B);
|
||||||
EXPECT(processed_code_points[3] == 0x1F600);
|
EXPECT(processed_code_points[3] == 0x1F600);
|
||||||
|
|
||||||
|
auto utf8 = MUST(decoder.to_utf8(test_string));
|
||||||
|
EXPECT_EQ(utf8, "säk😀"sv);
|
||||||
}
|
}
|
||||||
|
|
|
@ -375,150 +375,56 @@ ErrorOr<String> UTF8Decoder::to_utf8(StringView input)
|
||||||
return Decoder::to_utf8(bomless_input);
|
return Decoder::to_utf8(bomless_input);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Wraps the raw bytes of `view` in a Utf16View with the given endianness, without copying.
// The code-unit count is view.length() / 2 (integer division), so when the input has an odd
// number of bytes the final byte is not part of the returned view.
// NOTE(review): the reinterpret_cast presumes the byte pointer may be read as u16 (alignment);
// confirm that Utf16View tolerates arbitrarily aligned input.
// NOTE(review): the returned view presumably borrows `view`'s storage, so it must not outlive
// the underlying bytes; verify against Utf16View.
static Utf16View as_utf16(StringView view, AK::Endianness endianness)
|
||||||
|
{
|
||||||
|
return Utf16View {
|
||||||
|
{ reinterpret_cast<u16 const*>(view.bytes().data()), view.length() / 2 },
|
||||||
|
endianness
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
ErrorOr<void> UTF16BEDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
|
ErrorOr<void> UTF16BEDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
|
||||||
{
|
{
|
||||||
// rfc2781, 2.2 Decoding UTF-16
|
for (auto code_point : as_utf16(input, AK::Endianness::Big))
|
||||||
size_t utf16_length = input.length() - (input.length() % 2);
|
TRY(on_code_point(code_point));
|
||||||
for (size_t i = 0; i < utf16_length; i += 2) {
|
|
||||||
// 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value
|
|
||||||
// of W1. Terminate.
|
|
||||||
u16 w1 = (static_cast<u8>(input[i]) << 8) | static_cast<u8>(input[i + 1]);
|
|
||||||
if (!is_unicode_surrogate(w1)) {
|
|
||||||
TRY(on_code_point(w1));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
|
|
||||||
// is in error and no valid character can be obtained using W1.
|
|
||||||
// Terminate.
|
|
||||||
// 3) If there is no W2 (that is, the sequence ends with W1), or if W2
|
|
||||||
// is not between 0xDC00 and 0xDFFF, the sequence is in error.
|
|
||||||
// Terminate.
|
|
||||||
if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length) {
|
|
||||||
TRY(on_code_point(replacement_code_point));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
u16 w2 = (static_cast<u8>(input[i + 2]) << 8) | static_cast<u8>(input[i + 3]);
|
|
||||||
if (!Utf16View::is_low_surrogate(w2)) {
|
|
||||||
TRY(on_code_point(replacement_code_point));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 4) Construct a 20-bit unsigned integer U', taking the 10 low-order
|
|
||||||
// bits of W1 as its 10 high-order bits and the 10 low-order bits of
|
|
||||||
// W2 as its 10 low-order bits.
|
|
||||||
// 5) Add 0x10000 to U' to obtain the character value U. Terminate.
|
|
||||||
TRY(on_code_point(Utf16View::decode_surrogate_pair(w1, w2)));
|
|
||||||
i += 2;
|
|
||||||
}
|
|
||||||
|
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
bool UTF16BEDecoder::validate(StringView input)
|
bool UTF16BEDecoder::validate(StringView input)
|
||||||
{
|
{
|
||||||
size_t utf16_length = input.length() - (input.length() % 2);
|
return as_utf16(input, AK::Endianness::Big).validate();
|
||||||
for (size_t i = 0; i < utf16_length; i += 2) {
|
|
||||||
u16 w1 = (static_cast<u8>(input[i]) << 8) | static_cast<u8>(input[i + 1]);
|
|
||||||
if (!is_unicode_surrogate(w1))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
u16 w2 = (static_cast<u8>(input[i + 2]) << 8) | static_cast<u8>(input[i + 3]);
|
|
||||||
if (!Utf16View::is_low_surrogate(w2))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
i += 2;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ErrorOr<String> UTF16BEDecoder::to_utf8(StringView input)
|
ErrorOr<String> UTF16BEDecoder::to_utf8(StringView input)
|
||||||
{
|
{
|
||||||
// Discard the BOM
|
// Discard the BOM
|
||||||
auto bomless_input = input;
|
|
||||||
if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF)
|
if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF)
|
||||||
bomless_input = input.substring_view(2);
|
input = input.substring_view(2);
|
||||||
|
|
||||||
StringBuilder builder(bomless_input.length() / 2);
|
return String::from_utf16(as_utf16(input, AK::Endianness::Big));
|
||||||
TRY(process(bomless_input, [&builder](u32 c) { return builder.try_append_code_point(c); }));
|
|
||||||
return builder.to_string();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ErrorOr<void> UTF16LEDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
|
ErrorOr<void> UTF16LEDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
|
||||||
{
|
{
|
||||||
// rfc2781, 2.2 Decoding UTF-16
|
for (auto code_point : as_utf16(input, AK::Endianness::Little))
|
||||||
size_t utf16_length = input.length() - (input.length() % 2);
|
TRY(on_code_point(code_point));
|
||||||
for (size_t i = 0; i < utf16_length; i += 2) {
|
|
||||||
// 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value
|
|
||||||
// of W1. Terminate.
|
|
||||||
u16 w1 = static_cast<u8>(input[i]) | (static_cast<u8>(input[i + 1]) << 8);
|
|
||||||
if (!is_unicode_surrogate(w1)) {
|
|
||||||
TRY(on_code_point(w1));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
|
|
||||||
// is in error and no valid character can be obtained using W1.
|
|
||||||
// Terminate.
|
|
||||||
// 3) If there is no W2 (that is, the sequence ends with W1), or if W2
|
|
||||||
// is not between 0xDC00 and 0xDFFF, the sequence is in error.
|
|
||||||
// Terminate.
|
|
||||||
if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length) {
|
|
||||||
TRY(on_code_point(replacement_code_point));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
u16 w2 = static_cast<u8>(input[i + 2]) | (static_cast<u8>(input[i + 3]) << 8);
|
|
||||||
if (!Utf16View::is_low_surrogate(w2)) {
|
|
||||||
TRY(on_code_point(replacement_code_point));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 4) Construct a 20-bit unsigned integer U', taking the 10 low-order
|
|
||||||
// bits of W1 as its 10 high-order bits and the 10 low-order bits of
|
|
||||||
// W2 as its 10 low-order bits.
|
|
||||||
// 5) Add 0x10000 to U' to obtain the character value U. Terminate.
|
|
||||||
TRY(on_code_point(Utf16View::decode_surrogate_pair(w1, w2)));
|
|
||||||
i += 2;
|
|
||||||
}
|
|
||||||
|
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
bool UTF16LEDecoder::validate(StringView input)
|
bool UTF16LEDecoder::validate(StringView input)
|
||||||
{
|
{
|
||||||
size_t utf16_length = input.length() - (input.length() % 2);
|
return as_utf16(input, AK::Endianness::Little).validate();
|
||||||
for (size_t i = 0; i < utf16_length; i += 2) {
|
|
||||||
u16 w1 = static_cast<u8>(input[i]) | (static_cast<u8>(input[i + 1]) << 8);
|
|
||||||
if (!is_unicode_surrogate(w1))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
u16 w2 = static_cast<u8>(input[i + 2]) | (static_cast<u8>(input[i + 3]) << 8);
|
|
||||||
if (!Utf16View::is_low_surrogate(w2))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
i += 2;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ErrorOr<String> UTF16LEDecoder::to_utf8(StringView input)
|
ErrorOr<String> UTF16LEDecoder::to_utf8(StringView input)
|
||||||
{
|
{
|
||||||
// Discard the BOM
|
// Discard the BOM
|
||||||
auto bomless_input = input;
|
|
||||||
if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE)
|
if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE)
|
||||||
bomless_input = input.substring_view(2);
|
input = input.substring_view(2);
|
||||||
|
|
||||||
StringBuilder builder(bomless_input.length() / 2);
|
return String::from_utf16(as_utf16(input, AK::Endianness::Little));
|
||||||
TRY(process(bomless_input, [&builder](u32 c) { return builder.try_append_code_point(c); }));
|
|
||||||
return builder.to_string();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ErrorOr<void> Latin1Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
|
ErrorOr<void> Latin1Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
|
||||||
|
|
Loading…
Reference in a new issue