diff --git a/Meta/Lagom/Tools/CodeGenerators/LibTextCodec/GenerateEncodingIndexes.cpp b/Meta/Lagom/Tools/CodeGenerators/LibTextCodec/GenerateEncodingIndexes.cpp
index b568f4a60df..53918f50d54 100644
--- a/Meta/Lagom/Tools/CodeGenerators/LibTextCodec/GenerateEncodingIndexes.cpp
+++ b/Meta/Lagom/Tools/CodeGenerators/LibTextCodec/GenerateEncodingIndexes.cpp
@@ -240,7 +240,7 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
     auto json_data = TRY(json_file->read_until_eof());
     auto data = TRY(JsonValue::from_string(json_data)).as_object();
 
-    auto gb18030_table = prepare_table(data.get("gb18030"sv)->as_array(), GenerateAccessor::Yes);
+    auto gb18030_table = prepare_table(data.get("gb18030"sv)->as_array(), GenerateAccessor::Yes, GenerateInverseAccessor::Yes);
 
     // FIXME: Encoding specification is not updated to GB-18030-2022 yet (https://github.com/whatwg/encoding/issues/312)
     // NOTE: See https://commits.webkit.org/264918@main
diff --git a/Tests/LibTextCodec/TestTextEncoders.cpp b/Tests/LibTextCodec/TestTextEncoders.cpp
index 1ee6adf20da..2dc71dc21a0 100644
--- a/Tests/LibTextCodec/TestTextEncoders.cpp
+++ b/Tests/LibTextCodec/TestTextEncoders.cpp
@@ -79,3 +79,45 @@ TEST_CASE(test_big5_encoder)
     EXPECT(processed_bytes[2] == 0xD2);
     EXPECT(processed_bytes[3] == 0x71);
 }
+
+TEST_CASE(test_gb18030_encoder)
+{
+    TextCodec::GB18030Encoder encoder;
+    // U+20AC Euro Sign
+    // U+E4C5 Private Use Area
+    // U+1F600 Grinning Face (expected to take the four-byte path, assuming the
+    // generated tables match the WHATWG gb18030 indexes)
+    auto test_string = "\U000020AC\U0000E4C5\U0001F600"sv;
+
+    Vector<u8> processed_bytes;
+    MUST(encoder.process(Utf8View(test_string), [&](u8 byte) {
+        return processed_bytes.try_append(byte);
+    }));
+
+    EXPECT(processed_bytes.size() == 8);
+    EXPECT(processed_bytes[0] == 0xA2);
+    EXPECT(processed_bytes[1] == 0xE3);
+    EXPECT(processed_bytes[2] == 0xFE);
+    EXPECT(processed_bytes[3] == 0xFE);
+    EXPECT(processed_bytes[4] == 0x94);
+    EXPECT(processed_bytes[5] == 0x39);
+    EXPECT(processed_bytes[6] == 0xFC);
+    EXPECT(processed_bytes[7] == 0x36);
+}
+
+TEST_CASE(test_gbk_encoder)
+{
+    TextCodec::GB18030Encoder encoder(TextCodec::GB18030Encoder::IsGBK::Yes);
+    // U+0041 Latin Capital Letter A (ASCII is passed through unchanged)
+    // U+20AC Euro Sign (encoded as the single byte 0x80 in GBK mode)
+    auto test_string = "A\U000020AC"sv;
+
+    Vector<u8> processed_bytes;
+    MUST(encoder.process(Utf8View(test_string), [&](u8 byte) {
+        return processed_bytes.try_append(byte);
+    }));
+
+    EXPECT(processed_bytes.size() == 2);
+    EXPECT(processed_bytes[0] == 0x41);
+    EXPECT(processed_bytes[1] == 0x80);
+}
diff --git a/Userland/Libraries/LibTextCodec/Encoder.cpp b/Userland/Libraries/LibTextCodec/Encoder.cpp
index 08304dfe15b..ee3429d3fa5 100644
--- a/Userland/Libraries/LibTextCodec/Encoder.cpp
+++ 
b/Userland/Libraries/LibTextCodec/Encoder.cpp
@@ -4,6 +4,7 @@
  * SPDX-License-Identifier: BSD-2-Clause
  */
 
+#include <AK/BinarySearch.h>
 #include <AK/Error.h>
 #include <AK/Utf8View.h>
 #include <LibTextCodec/Decoder.h>
@@ -14,6 +15,8 @@ namespace TextCodec {
 
 namespace {
 UTF8Encoder s_utf8_encoder;
+GB18030Encoder s_gb18030_encoder;
+GB18030Encoder s_gbk_encoder(GB18030Encoder::IsGBK::Yes);
 Big5Encoder s_big5_encoder;
 EUCJPEncoder s_euc_jp_encoder;
 EUCKREncoder s_euc_kr_encoder;
@@ -29,6 +32,10 @@ Optional<Encoder&> encoder_for_exact_name(StringView encoding)
         return s_euc_jp_encoder;
     if (encoding.equals_ignoring_ascii_case("euc-kr"sv))
         return s_euc_kr_encoder;
+    if (encoding.equals_ignoring_ascii_case("gb18030"sv))
+        return s_gb18030_encoder;
+    if (encoding.equals_ignoring_ascii_case("gbk"sv))
+        return s_gbk_encoder;
     dbgln("TextCodec: No encoder implemented for encoding '{}'", encoding);
     return {};
 }
@@ -207,4 +214,125 @@ ErrorOr<void> Big5Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> o
     return {};
 }
 
+// https://encoding.spec.whatwg.org/#index-gb18030-ranges-pointer
+// Maps a code point to its pointer in index gb18030 ranges, which
+// GB18030Encoder::process() then turns into a four-byte sequence.
+static u32 index_gb18030_ranges_pointer(u32 code_point)
+{
+    // 1. If code point is U+E7C7, return pointer 7457.
+    if (code_point == 0xe7c7)
+        return 7457;
+
+    // 2. Let offset be the last code point in index gb18030 ranges that is less than
+    //    or equal to code point and let pointer offset be its corresponding pointer.
+    // NOTE: Assumes binary_search() stores the index of the closest preceding entry in
+    //       last_index when there is no exact match - TODO confirm against AK/BinarySearch.h.
+    size_t last_index = 0;
+    binary_search(s_gb18030_ranges, code_point, &last_index, [](auto const needle, auto const& entry) -> int {
+        // Spell the ordering out instead of returning an unsigned difference, whose
+        // sign only shows up once the value is narrowed to a signed type.
+        if (needle < entry.code_point)
+            return -1;
+        return needle > entry.code_point ? 1 : 0;
+    });
+    auto offset = s_gb18030_ranges[last_index].code_point;
+    auto pointer_offset = s_gb18030_ranges[last_index].pointer;
+
+    // 3. Return a pointer whose value is pointer offset + code point − offset.
+    return pointer_offset + code_point - offset;
+}
+
+GB18030Encoder::GB18030Encoder(IsGBK is_gbk)
+    : m_is_gbk(is_gbk)
+{
+}
+
+// https://encoding.spec.whatwg.org/#gb18030-encoder
+// Passes the encoded form of every code point in input to on_byte, one byte at a time.
+// Code points the spec treats as errors (steps 3 and 7) are currently skipped silently.
+ErrorOr<void> GB18030Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte)
+{
+    bool gbk = (m_is_gbk == IsGBK::Yes);
+
+    for (u32 item : input) {
+        // 1. If code point is end-of-queue, return finished.
+
+        // 2. If code point is an ASCII code point, return a byte whose value is code point.
+        if (is_ascii(item)) {
+            TRY(on_byte(static_cast<u8>(item)));
+            continue;
+        }
+
+        // 3. If code point is U+E5E5, return error with code point.
+        if (item == 0xE5E5) {
+            // TODO: Report error.
+            continue;
+        }
+
+        // 4. If is GBK is true and code point is U+20AC, return byte 0x80.
+        if (gbk && item == 0x20AC) {
+            TRY(on_byte(0x80));
+            continue;
+        }
+
+        // 5. Let pointer be the index pointer for code point in index gb18030.
+        auto pointer = code_point_gb18030_index(item);
+
+        // 6. If pointer is non-null, then:
+        if (pointer.has_value()) {
+            // 1. Let lead be pointer / 190 + 0x81.
+            auto lead = *pointer / 190 + 0x81;
+
+            // 2. Let trail be pointer % 190.
+            auto trail = *pointer % 190;
+
+            // 3. Let offset be 0x40 if trail is less than 0x3F, otherwise 0x41.
+            auto offset = 0x41;
+            if (trail < 0x3f)
+                offset = 0x40;
+
+            // 4. Return two bytes whose values are lead and trail + offset.
+            TRY(on_byte(static_cast<u8>(lead)));
+            TRY(on_byte(static_cast<u8>(trail + offset)));
+            continue;
+        }
+
+        // 7. If is GBK is true, return error with code point.
+        if (gbk) {
+            // TODO: Report error.
+            continue;
+        }
+
+        // 8. Set pointer to the index gb18030 ranges pointer for code point.
+        // NOTE: The ranges pointer always has a value, so it lives in a plain integer.
+        u32 ranges_pointer = index_gb18030_ranges_pointer(item);
+
+        // 9. Let byte1 be pointer / (10 × 126 × 10).
+        auto byte1 = ranges_pointer / (10 * 126 * 10);
+
+        // 10. Set pointer to pointer % (10 × 126 × 10).
+        ranges_pointer %= 10 * 126 * 10;
+
+        // 11. Let byte2 be pointer / (10 × 126).
+        auto byte2 = ranges_pointer / (10 * 126);
+
+        // 12. Set pointer to pointer % (10 × 126).
+        ranges_pointer %= 10 * 126;
+
+        // 13. Let byte3 be pointer / 10.
+        auto byte3 = ranges_pointer / 10;
+
+        // 14. Let byte4 be pointer % 10.
+        auto byte4 = ranges_pointer % 10;
+
+        // 15. Return four bytes whose values are byte1 + 0x81, byte2 + 0x30, byte3 + 0x81, byte4 + 0x30.
+        TRY(on_byte(static_cast<u8>(byte1 + 0x81)));
+        TRY(on_byte(static_cast<u8>(byte2 + 0x30)));
+        TRY(on_byte(static_cast<u8>(byte3 + 0x81)));
+        TRY(on_byte(static_cast<u8>(byte4 + 0x30)));
+    }
+
+    return {};
+}
+
 }
diff --git a/Userland/Libraries/LibTextCodec/Encoder.h b/Userland/Libraries/LibTextCodec/Encoder.h
index 78529b3dbb5..57f97adc039 100644
--- a/Userland/Libraries/LibTextCodec/Encoder.h
+++ b/Userland/Libraries/LibTextCodec/Encoder.h
@@ -39,6 +39,22 @@ public:
     virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override;
 };
 
+// Encoder for gb18030 and, when constructed with IsGBK::Yes, for GBK.
+class GB18030Encoder final : public Encoder {
+public:
+    enum class IsGBK {
+        Yes,
+        No,
+    };
+
+    GB18030Encoder(IsGBK is_gbk = IsGBK::No);
+
+    virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override;
+
+private:
+    IsGBK m_is_gbk { IsGBK::No };
+};
+
 Optional<Encoder&> encoder_for_exact_name(StringView encoding);
 Optional<Encoder&> encoder_for(StringView label);
 