Browse Source

LibTextCodec: Implement `gb18030` and `gbk` encoders

Implements the `gb18030` and `gbk` encoders, as specified by:
- https://encoding.spec.whatwg.org/#gb18030-encoder
- https://encoding.spec.whatwg.org/#gbk-encoder
BenJilks 11 months ago
parent
commit
d80575a410

+ 1 - 1
Meta/Lagom/Tools/CodeGenerators/LibTextCodec/GenerateEncodingIndexes.cpp

@@ -240,7 +240,7 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
     auto json_data = TRY(json_file->read_until_eof());
     auto data = TRY(JsonValue::from_string(json_data)).as_object();
 
-    auto gb18030_table = prepare_table(data.get("gb18030"sv)->as_array(), GenerateAccessor::Yes);
+    auto gb18030_table = prepare_table(data.get("gb18030"sv)->as_array(), GenerateAccessor::Yes, GenerateInverseAccessor::Yes);
 
     // FIXME: Encoding specification is not updated to GB-18030-2022 yet (https://github.com/whatwg/encoding/issues/312)
     // NOTE: See https://commits.webkit.org/264918@main

+ 19 - 0
Tests/LibTextCodec/TestTextEncoders.cpp

@@ -79,3 +79,22 @@ TEST_CASE(test_big5_encoder)
     EXPECT(processed_bytes[2] == 0xD2);
     EXPECT(processed_bytes[3] == 0x71);
 }
+
+TEST_CASE(test_gb18030_encoder) // Checks the bytes GB18030Encoder emits for two non-ASCII code points.
+{
+    TextCodec::GB18030Encoder encoder; // Default-constructed, so not in GBK mode.
+    // U+20AC Euro Sign
+    // U+E4C5 Private Use Area
+    auto test_string = "\U000020AC\U0000E4C5"sv;
+
+    Vector<u8> processed_bytes;
+    MUST(encoder.process(Utf8View(test_string), [&](u8 byte) { // Collect every emitted byte, in order.
+        return processed_bytes.try_append(byte);
+    }));
+
+    EXPECT(processed_bytes.size() == 4); // Two bytes are expected for each of the two code points.
+    EXPECT(processed_bytes[0] == 0xA2); // U+20AC -> 0xA2 0xE3
+    EXPECT(processed_bytes[1] == 0xE3);
+    EXPECT(processed_bytes[2] == 0xFE); // U+E4C5 -> 0xFE 0xFE
+    EXPECT(processed_bytes[3] == 0xFE);
+}

+ 117 - 0
Userland/Libraries/LibTextCodec/Encoder.cpp

@@ -4,6 +4,7 @@
  * SPDX-License-Identifier: BSD-2-Clause
  */
 
+#include <AK/BinarySearch.h>
 #include <AK/Error.h>
 #include <AK/Utf8View.h>
 #include <LibTextCodec/Decoder.h>
@@ -14,6 +15,8 @@ namespace TextCodec {
 
 namespace {
 UTF8Encoder s_utf8_encoder;
+GB18030Encoder s_gb18030_encoder; // Default-constructed: IsGBK::No, i.e. plain gb18030.
+GB18030Encoder s_gbk_encoder(GB18030Encoder::IsGBK::Yes); // Same encoder class, constructed in gbk mode.
 Big5Encoder s_big5_encoder;
 EUCJPEncoder s_euc_jp_encoder;
 EUCKREncoder s_euc_kr_encoder;
@@ -29,6 +32,10 @@ Optional<Encoder&> encoder_for_exact_name(StringView encoding)
         return s_euc_jp_encoder;
     if (encoding.equals_ignoring_ascii_case("euc-kr"sv))
         return s_euc_kr_encoder;
+    if (encoding.equals_ignoring_ascii_case("gb18030"sv))
+        return s_gb18030_encoder;
+    if (encoding.equals_ignoring_ascii_case("gbk"sv))
+        return s_gbk_encoder;
     dbgln("TextCodec: No encoder implemented for encoding '{}'", encoding);
     return {};
 }
@@ -207,4 +214,114 @@ ErrorOr<void> Big5Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> o
     return {};
 }
 
+// https://encoding.spec.whatwg.org/#index-gb18030-ranges-pointer
+static u32 index_gb18030_ranges_pointer(u32 code_point) // Maps code_point to a pointer via the s_gb18030_ranges table (spec steps below).
+{
+    // 1. If code point is U+E7C7, return pointer 7457.
+    if (code_point == 0xe7c7)
+        return 7457;
+
+    // 2. Let offset be the last code point in index gb18030 ranges that is less than
+    //    or equal to code point and let pointer offset be its corresponding pointer.
+    size_t last_index; // Not initialized here; presumably always filled in by binary_search() via the pointer below - confirm.
+    binary_search(s_gb18030_ranges, code_point, &last_index, [](auto const code_point, auto const& entry) {
+        return code_point - entry.code_point; // NOTE(review): if both operands are unsigned, a "less than" result relies on wrap-around/conversion inside binary_search - confirm.
+    });
+    auto offset = s_gb18030_ranges[last_index].code_point; // NOTE(review): assumes last_index is the nearest entry <= code_point when there is no exact match - confirm.
+    auto pointer_offset = s_gb18030_ranges[last_index].pointer;
+
+    // 3. Return a pointer whose value is pointer offset + code point − offset.
+    return pointer_offset + code_point - offset;
+}
+
+GB18030Encoder::GB18030Encoder(IsGBK is_gbk) // Records the requested mode; process() reads it to pick gbk or gb18030 behaviour.
+    : m_is_gbk(is_gbk)
+{
+}
+
+// https://encoding.spec.whatwg.org/#gb18030-encoder
+ErrorOr<void> GB18030Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte) // Encodes each item of input and hands every output byte to on_byte.
+{
+    bool gbk = (m_is_gbk == IsGBK::Yes); // Cached once; the mode does not change during the loop.
+
+    for (u32 item : input) { // Each item is treated as one code point; the loop ending stands in for end-of-queue.
+        // 1. If code point is end-of-queue, return finished.
+
+        // 2. If code point is an ASCII code point, return a byte whose value is code point.
+        if (is_ascii(item)) {
+            TRY(on_byte(static_cast<u8>(item)));
+            continue;
+        }
+
+        // 3. If code point is U+E5E5, return error with code point.
+        if (item == 0xE5E5) {
+            // TODO: Report error.
+            continue; // Until the TODO is resolved, the code point is skipped and no byte is emitted.
+        }
+
+        // 4. If is GBK is true and code point is U+20AC, return byte 0x80.
+        if (gbk && item == 0x20AC) {
+            TRY(on_byte(0x80));
+            continue;
+        }
+
+        // 5. Let pointer be the index pointer for code point in index gb18030.
+        auto pointer = code_point_gb18030_index(item);
+
+        // 6. If pointer is non-null, then:
+        if (pointer.has_value()) { // Two-byte form.
+            // 1. Let lead be pointer / 190 + 0x81.
+            auto lead = *pointer / 190 + 0x81;
+
+            // 2. Let trail be pointer % 190.
+            auto trail = *pointer % 190;
+
+            // 3. Let offset be 0x40 if trail is less than 0x3F, otherwise 0x41.
+            auto offset = 0x41; // Value used when trail >= 0x3F.
+            if (trail < 0x3f)
+                offset = 0x40;
+
+            // 4. Return two bytes whose values are lead and trail + offset.
+            TRY(on_byte(static_cast<u8>(lead)));
+            TRY(on_byte(static_cast<u8>(trail + offset)));
+            continue;
+        }
+
+        // 7. If is GBK is true, return error with code point.
+        if (gbk) {
+            // TODO: Report error.
+            continue; // Until the TODO is resolved, the code point is skipped and no byte is emitted.
+        }
+
+        // 8. Set pointer to the index gb18030 ranges pointer for code point.
+        pointer = index_gb18030_ranges_pointer(item); // Reuses the variable from step 5, which holds no value on this path.
+
+        // 9. Let byte1 be pointer / (10 × 126 × 10).
+        auto byte1 = *pointer / (10 * 126 * 10); // 10 * 126 * 10 == 12600
+
+        // 10. Set pointer to pointer % (10 × 126 × 10).
+        pointer = *pointer % (10 * 126 * 10);
+
+        // 11. Let byte2 be pointer / (10 × 126).
+        auto byte2 = *pointer / (10 * 126); // 10 * 126 == 1260
+
+        // 12. Set pointer to pointer % (10 × 126).
+        pointer = *pointer % (10 * 126);
+
+        // 13. Let byte3 be pointer / 10.
+        auto byte3 = *pointer / 10;
+
+        // 14. Let byte4 be pointer % 10.
+        auto byte4 = *pointer % 10;
+
+        // 15. Return four bytes whose values are byte1 + 0x81, byte2 + 0x30, byte3 + 0x81, byte4 + 0x30.
+        TRY(on_byte(static_cast<u8>(byte1 + 0x81)));
+        TRY(on_byte(static_cast<u8>(byte2 + 0x30)));
+        TRY(on_byte(static_cast<u8>(byte3 + 0x81)));
+        TRY(on_byte(static_cast<u8>(byte4 + 0x30)));
+    }
+
+    return {}; // All input consumed.
+}
+
 }

+ 15 - 0
Userland/Libraries/LibTextCodec/Encoder.h

@@ -39,6 +39,21 @@ public:
     virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override;
 };
 
+class GB18030Encoder final : public Encoder { // One encoder class for both gb18030 and gbk; the mode is fixed at construction.
+public:
+    enum class IsGBK { // Yes selects gbk behaviour, No selects gb18030 behaviour.
+        Yes,
+        No,
+    };
+
+    GB18030Encoder(IsGBK is_gbk = IsGBK::No); // Defaults to gb18030 mode when no argument is given.
+
+    virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override; // Output bytes are delivered one at a time through on_byte.
+
+private:
+    IsGBK m_is_gbk { IsGBK::No }; // Mode of this instance; overwritten by the constructor's argument.
+};
+
 Optional<Encoder&> encoder_for_exact_name(StringView encoding);
 Optional<Encoder&> encoder_for(StringView label);