LibTextCodec: Implement gb18030 and gbk encoders

Implements the `gb18030` and `gbk` encoders, as specified by
https://encoding.spec.whatwg.org/#gb18030-encoder
https://encoding.spec.whatwg.org/#gbk-encoder
This commit is contained in:
BenJilks 2024-08-05 21:14:51 +01:00 committed by Tim Ledbetter
parent 34c8c559c1
commit d80575a410
Notes: github-actions[bot] 2024-08-08 16:51:17 +00:00
4 changed files with 152 additions and 1 deletions

View file

@ -240,7 +240,7 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
auto json_data = TRY(json_file->read_until_eof());
auto data = TRY(JsonValue::from_string(json_data)).as_object();
auto gb18030_table = prepare_table(data.get("gb18030"sv)->as_array(), GenerateAccessor::Yes);
auto gb18030_table = prepare_table(data.get("gb18030"sv)->as_array(), GenerateAccessor::Yes, GenerateInverseAccessor::Yes);
// FIXME: Encoding specification is not updated to GB-18030-2022 yet (https://github.com/whatwg/encoding/issues/312)
// NOTE: See https://commits.webkit.org/264918@main

View file

@ -79,3 +79,22 @@ TEST_CASE(test_big5_encoder)
EXPECT(processed_bytes[2] == 0xD2);
EXPECT(processed_bytes[3] == 0x71);
}
// Encodes one code point from each end of index gb18030 and checks the two-byte sequences produced.
TEST_CASE(test_gb18030_encoder)
{
    TextCodec::GB18030Encoder gb18030_encoder;

    // Input, in order:
    //   U+20AC Euro Sign
    //   U+E4C5 Private Use Area
    auto input = "\U000020AC\U0000E4C5"sv;

    // Collect every byte the encoder emits.
    Vector<u8> encoded;
    MUST(gb18030_encoder.process(Utf8View(input), [&](u8 emitted_byte) {
        return encoded.try_append(emitted_byte);
    }));

    // Two code points, two bytes each.
    EXPECT(encoded.size() == 4);

    // First pair of output bytes.
    EXPECT(encoded[0] == 0xA2);
    EXPECT(encoded[1] == 0xE3);

    // Second pair of output bytes.
    EXPECT(encoded[2] == 0xFE);
    EXPECT(encoded[3] == 0xFE);
}

View file

@ -4,6 +4,7 @@
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/BinarySearch.h>
#include <AK/Error.h>
#include <AK/Utf8View.h>
#include <LibTextCodec/Decoder.h>
@ -14,6 +15,8 @@ namespace TextCodec {
namespace {
UTF8Encoder s_utf8_encoder;
GB18030Encoder s_gb18030_encoder;
GB18030Encoder s_gbk_encoder(GB18030Encoder::IsGBK::Yes);
Big5Encoder s_big5_encoder;
EUCJPEncoder s_euc_jp_encoder;
EUCKREncoder s_euc_kr_encoder;
@ -29,6 +32,10 @@ Optional<Encoder&> encoder_for_exact_name(StringView encoding)
return s_euc_jp_encoder;
if (encoding.equals_ignoring_ascii_case("euc-kr"sv))
return s_euc_kr_encoder;
if (encoding.equals_ignoring_ascii_case("gb18030"sv))
return s_gb18030_encoder;
if (encoding.equals_ignoring_ascii_case("gbk"sv))
return s_gbk_encoder;
dbgln("TextCodec: No encoder implemented for encoding '{}'", encoding);
return {};
}
@ -207,4 +214,114 @@ ErrorOr<void> Big5Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> o
return {};
}
// https://encoding.spec.whatwg.org/#index-gb18030-ranges-pointer
// Computes the index gb18030 ranges pointer for `code_point`, using the s_gb18030_ranges table.
static u32 index_gb18030_ranges_pointer(u32 code_point)
{
    // 1. If code point is U+E7C7, return pointer 7457.
    if (code_point == 0xe7c7)
        return 7457;

    // 2. Let offset be the last code point in index gb18030 ranges that is less than
    //    or equal to code point and let pointer offset be its corresponding pointer.
    // The search's return value is ignored; only the index written through the out-parameter is used.
    // NOTE(review): this relies on binary_search always writing that index, even when there is no
    // exact match - confirm against AK/BinarySearch.h.
    size_t range_index;
    binary_search(s_gb18030_ranges, code_point, &range_index, [](auto const needle, auto const& range) {
        return needle - range.code_point;
    });
    auto const& matching_range = s_gb18030_ranges[range_index];

    // 3. Return a pointer whose value is pointer offset + code point − offset.
    return matching_range.pointer + code_point - matching_range.code_point;
}
GB18030Encoder::GB18030Encoder(IsGBK is_gbk)
: m_is_gbk(is_gbk)
{
}
// https://encoding.spec.whatwg.org/#gb18030-encoder
// Walks the code points of `input` and hands each encoded byte to `on_byte`, in order.
// Any error returned by `on_byte` is propagated to the caller right away. Code points the
// spec treats as errors are currently skipped without emitting anything (see the TODOs).
ErrorOr<void> GB18030Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte)
{
    auto const encoding_as_gbk = m_is_gbk == IsGBK::Yes;

    for (u32 code_point : input) {
        // 1. If code point is end-of-queue, return finished.
        //    (The loop simply ends once the input is exhausted.)

        // 2. If code point is an ASCII code point, return a byte whose value is code point.
        if (is_ascii(code_point)) {
            TRY(on_byte(static_cast<u8>(code_point)));
            continue;
        }

        // 3. If code point is U+E5E5, return error with code point.
        if (code_point == 0xE5E5) {
            // TODO: Report error.
            continue;
        }

        // 4. If is GBK is true and code point is U+20AC, return byte 0x80.
        if (encoding_as_gbk && code_point == 0x20AC) {
            TRY(on_byte(0x80));
            continue;
        }

        // 5. Let pointer be the index pointer for code point in index gb18030.
        auto pointer = code_point_gb18030_index(code_point);

        // 6. If pointer is non-null, then:
        if (pointer.has_value()) {
            // 1. Let lead be pointer / 190 + 0x81.
            auto const lead = *pointer / 190 + 0x81;

            // 2. Let trail be pointer % 190.
            auto const trail = *pointer % 190;

            // 3. Let offset be 0x40 if trail is less than 0x3F, otherwise 0x41.
            auto const offset = trail < 0x3f ? 0x40 : 0x41;

            // 4. Return two bytes whose values are lead and trail + offset.
            TRY(on_byte(static_cast<u8>(lead)));
            TRY(on_byte(static_cast<u8>(trail + offset)));
            continue;
        }

        // 7. If is GBK is true, return error with code point.
        if (encoding_as_gbk) {
            // TODO: Report error.
            continue;
        }

        // 8. Set pointer to the index gb18030 ranges pointer for code point.
        pointer = index_gb18030_ranges_pointer(code_point);

        // 9. Let byte1 be pointer / (10 × 126 × 10).
        auto const byte1 = *pointer / (10 * 126 * 10);

        // 10. Set pointer to pointer % (10 × 126 × 10).
        pointer = *pointer % (10 * 126 * 10);

        // 11. Let byte2 be pointer / (10 × 126).
        auto const byte2 = *pointer / (10 * 126);

        // 12. Set pointer to pointer % (10 × 126).
        pointer = *pointer % (10 * 126);

        // 13. Let byte3 be pointer / 10.
        auto const byte3 = *pointer / 10;

        // 14. Let byte4 be pointer % 10.
        auto const byte4 = *pointer % 10;

        // 15. Return four bytes whose values are byte1 + 0x81, byte2 + 0x30, byte3 + 0x81, byte4 + 0x30.
        u8 const four_byte_sequence[] = {
            static_cast<u8>(byte1 + 0x81),
            static_cast<u8>(byte2 + 0x30),
            static_cast<u8>(byte3 + 0x81),
            static_cast<u8>(byte4 + 0x30),
        };
        for (auto const encoded_byte : four_byte_sequence)
            TRY(on_byte(encoded_byte));
    }

    return {};
}
}

View file

@ -39,6 +39,21 @@ public:
virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override;
};
// Encoder implementing https://encoding.spec.whatwg.org/#gb18030-encoder.
// The same class also acts as the gbk encoder (https://encoding.spec.whatwg.org/#gbk-encoder)
// when it is constructed with IsGBK::Yes.
class GB18030Encoder final : public Encoder {
public:
// Selects whether process() follows the "is GBK" steps of the gb18030 encoder algorithm.
enum class IsGBK {
Yes,
No,
};
// Constructs a gb18030 encoder by default; pass IsGBK::Yes to get gbk behavior.
GB18030Encoder(IsGBK is_gbk = IsGBK::No);
// Encodes the code points of the given view, passing each output byte to on_byte.
virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override;
private:
// Mode chosen at construction time.
IsGBK m_is_gbk { IsGBK::No };
};
// Returns the encoder whose name matches `encoding` (compared ignoring ASCII case),
// or an empty Optional when no encoder is implemented for that name.
Optional<Encoder&> encoder_for_exact_name(StringView encoding);
// NOTE(review): the definition is not visible here; presumably this resolves an encoding label
// to its encoding name before looking up the encoder - confirm against the implementation.
Optional<Encoder&> encoder_for(StringView label);