LibTextCodec: Add support for the UTF-16LE encoding

This commit is contained in:
Jelle Raaijmakers 2022-03-08 14:27:11 +01:00 committed by Linus Groh
parent efd9c70d94
commit 9c2a7c0e03
Notes: sideshowbarker 2024-07-17 17:48:43 +09:00
2 changed files with 33 additions and 3 deletions

View file

@ -1,5 +1,6 @@
/*
* Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
* Copyright (c) 2022, Jelle Raaijmakers <jelle@gmta.nl>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
@ -14,6 +15,7 @@ namespace {
Latin1Decoder s_latin1_decoder;
UTF8Decoder s_utf8_decoder;
UTF16BEDecoder s_utf16be_decoder;
UTF16LEDecoder s_utf16le_decoder;
Latin2Decoder s_latin2_decoder;
HebrewDecoder s_hebrew_decoder;
CyrillicDecoder s_cyrillic_decoder;
@ -33,6 +35,8 @@ Decoder* decoder_for(const String& a_encoding)
return &s_utf8_decoder;
if (encoding.value().equals_ignoring_case("utf-16be"))
return &s_utf16be_decoder;
if (encoding.value().equals_ignoring_case("utf-16le"))
return &s_utf16le_decoder;
if (encoding.value().equals_ignoring_case("iso-8859-2"))
return &s_latin2_decoder;
if (encoding.value().equals_ignoring_case("windows-1255"))
@ -172,8 +176,7 @@ Decoder* bom_sniff_to_decoder(StringView input)
case 0xFE: // UTF-16BE
return bytes[1] == 0xFF ? &s_utf16be_decoder : nullptr;
case 0xFF: // UTF-16LE
// FIXME: There is currently no UTF-16LE decoder.
TODO();
return bytes[1] == 0xFE ? &s_utf16le_decoder : nullptr;
}
return nullptr;
@ -241,9 +244,29 @@ String UTF16BEDecoder::to_utf8(StringView input)
{
// Discard the BOM
auto bomless_input = input;
if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF) {
if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF)
bomless_input = input.substring_view(2);
StringBuilder builder(bomless_input.length() / 2);
process(bomless_input, [&builder](u32 c) { builder.append_code_point(c); });
return builder.to_string();
}
// Decodes `input` as UTF-16LE and invokes `on_code_point` once per decoded code point.
//
// Decoding follows RFC 2781, section 2.2:
// - A code unit outside the surrogate range (0xD800-0xDFFF) is reported as-is.
// - A high surrogate immediately followed by a low surrogate is combined into a single
//   supplementary code point (U+10000 and above).
// - An unpaired surrogate is reported as U+FFFD REPLACEMENT CHARACTER.
//
// A trailing odd byte cannot form a code unit and is ignored.
void UTF16LEDecoder::process(StringView input, Function<void(u32)> on_code_point)
{
    constexpr u32 replacement_code_point = 0xFFFD;

    // Each byte is converted to unsigned before widening. If the element type of `input` is a
    // signed char, a byte >= 0x80 would otherwise be sign-extended and set all the high bits
    // of the resulting code unit.
    auto read_code_unit = [&input](size_t index) -> u16 {
        auto low_byte = static_cast<unsigned char>(input[index]);
        auto high_byte = static_cast<unsigned char>(input[index + 1]);
        return static_cast<u16>(low_byte | (high_byte << 8));
    };

    size_t utf16_length = input.length() - (input.length() % 2);
    for (size_t i = 0; i < utf16_length; i += 2) {
        u16 w1 = read_code_unit(i);

        // Not a surrogate: the code unit is the code point.
        if (w1 < 0xD800 || w1 > 0xDFFF) {
            on_code_point(w1);
            continue;
        }

        // A low surrogate with no preceding high surrogate is invalid.
        if (w1 >= 0xDC00) {
            on_code_point(replacement_code_point);
            continue;
        }

        // A high surrogate at the very end of the input has nothing to pair with.
        if (i + 2 >= utf16_length) {
            on_code_point(replacement_code_point);
            continue;
        }

        u16 w2 = read_code_unit(i + 2);
        if (w2 < 0xDC00 || w2 > 0xDFFF) {
            // The high surrogate is unpaired; w2 is not consumed here and is decoded on its
            // own by the next iteration.
            on_code_point(replacement_code_point);
            continue;
        }

        // Valid pair: the low 10 bits of each surrogate form a 20-bit offset from U+10000.
        u32 code_point = 0x10000 + ((static_cast<u32>(w1 & 0x3FF) << 10) | static_cast<u32>(w2 & 0x3FF));
        on_code_point(code_point);
        i += 2; // Skip the low surrogate that was just consumed.
    }
}
String UTF16LEDecoder::to_utf8(StringView input)
{
// Discard the BOM
auto bomless_input = input;
if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE)
bomless_input = input.substring_view(2);
StringBuilder builder(bomless_input.length() / 2);
process(bomless_input, [&builder](u32 c) { builder.append_code_point(c); });

View file

@ -1,5 +1,6 @@
/*
* Copyright (c) 2020-2021, Andreas Kling <kling@serenityos.org>
* Copyright (c) 2022, Jelle Raaijmakers <jelle@gmta.nl>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
@ -32,6 +33,12 @@ public:
virtual String to_utf8(StringView) override;
};
// Decoder for text stored in the UTF-16LE (little-endian UTF-16) encoding.
class UTF16LEDecoder final : public Decoder {
public:
    // Decodes `input` two bytes at a time (least significant byte first) and reports the
    // result through `on_code_point`. A trailing odd byte is ignored.
    virtual void process(StringView input, Function<void(u32)> on_code_point) override;

    // Converts `input` to a String; a leading FF FE byte-order mark is skipped before decoding.
    virtual String to_utf8(StringView input) override;
};
class Latin1Decoder final : public Decoder {
public:
virtual void process(StringView, Function<void(u32)> on_code_point) override;