mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-21 23:20:20 +00:00
LibTextCodec: Implement gb18030 and gbk encoders
Implements the `gb18030` and `gbk` encoders, as specified by https://encoding.spec.whatwg.org/#gb18030-encoder and https://encoding.spec.whatwg.org/#gbk-encoder.
This commit is contained in:
parent
34c8c559c1
commit
d80575a410
Notes:
github-actions[bot]
2024-08-08 16:51:17 +00:00
Author: https://github.com/BenJilks Commit: https://github.com/LadybirdBrowser/ladybird/commit/d80575a4101 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/975 Reviewed-by: https://github.com/shannonbooth Reviewed-by: https://github.com/skyrising Reviewed-by: https://github.com/tcl3 ✅
4 changed files with 152 additions and 1 deletions
|
@ -240,7 +240,7 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
|
|||
auto json_data = TRY(json_file->read_until_eof());
|
||||
auto data = TRY(JsonValue::from_string(json_data)).as_object();
|
||||
|
||||
auto gb18030_table = prepare_table(data.get("gb18030"sv)->as_array(), GenerateAccessor::Yes);
|
||||
auto gb18030_table = prepare_table(data.get("gb18030"sv)->as_array(), GenerateAccessor::Yes, GenerateInverseAccessor::Yes);
|
||||
|
||||
// FIXME: Encoding specification is not updated to GB-18030-2022 yet (https://github.com/whatwg/encoding/issues/312)
|
||||
// NOTE: See https://commits.webkit.org/264918@main
|
||||
|
|
|
@ -79,3 +79,22 @@ TEST_CASE(test_big5_encoder)
|
|||
EXPECT(processed_bytes[2] == 0xD2);
|
||||
EXPECT(processed_bytes[3] == 0x71);
|
||||
}
|
||||
|
||||
// Feeds two non-ASCII code points through the gb18030 encoder and checks the
// exact byte sequence that comes out.
TEST_CASE(test_gb18030_encoder)
{
    // Input, in order:
    //   U+20AC Euro Sign
    //   U+E4C5 Private Use Area
    auto input = "\U000020AC\U0000E4C5"sv;

    TextCodec::GB18030Encoder gb18030_encoder;

    // Collect every emitted byte; an allocation failure is propagated to the encoder.
    Vector<u8> encoded;
    MUST(gb18030_encoder.process(Utf8View(input), [&](u8 byte) {
        return encoded.try_append(byte);
    }));

    // Four bytes are expected in total for the two code points.
    EXPECT(encoded.size() == 4);
    EXPECT(encoded[0] == 0xA2);
    EXPECT(encoded[1] == 0xE3);
    EXPECT(encoded[2] == 0xFE);
    EXPECT(encoded[3] == 0xFE);
}
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include <AK/BinarySearch.h>
|
||||
#include <AK/Error.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <LibTextCodec/Decoder.h>
|
||||
|
@ -14,6 +15,8 @@ namespace TextCodec {
|
|||
|
||||
namespace {
|
||||
UTF8Encoder s_utf8_encoder;
|
||||
GB18030Encoder s_gb18030_encoder;
|
||||
GB18030Encoder s_gbk_encoder(GB18030Encoder::IsGBK::Yes);
|
||||
Big5Encoder s_big5_encoder;
|
||||
EUCJPEncoder s_euc_jp_encoder;
|
||||
EUCKREncoder s_euc_kr_encoder;
|
||||
|
@ -29,6 +32,10 @@ Optional<Encoder&> encoder_for_exact_name(StringView encoding)
|
|||
return s_euc_jp_encoder;
|
||||
if (encoding.equals_ignoring_ascii_case("euc-kr"sv))
|
||||
return s_euc_kr_encoder;
|
||||
if (encoding.equals_ignoring_ascii_case("gb18030"sv))
|
||||
return s_gb18030_encoder;
|
||||
if (encoding.equals_ignoring_ascii_case("gbk"sv))
|
||||
return s_gbk_encoder;
|
||||
dbgln("TextCodec: No encoder implemented for encoding '{}'", encoding);
|
||||
return {};
|
||||
}
|
||||
|
@ -207,4 +214,114 @@ ErrorOr<void> Big5Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> o
|
|||
return {};
|
||||
}
|
||||
|
||||
// https://encoding.spec.whatwg.org/#index-gb18030-ranges-pointer
|
||||
static u32 index_gb18030_ranges_pointer(u32 code_point)
|
||||
{
|
||||
// 1. If code point is U+E7C7, return pointer 7457.
|
||||
if (code_point == 0xe7c7)
|
||||
return 7457;
|
||||
|
||||
// 2. Let offset be the last code point in index gb18030 ranges that is less than
|
||||
// or equal to code point and let pointer offset be its corresponding pointer.
|
||||
size_t last_index;
|
||||
binary_search(s_gb18030_ranges, code_point, &last_index, [](auto const code_point, auto const& entry) {
|
||||
return code_point - entry.code_point;
|
||||
});
|
||||
auto offset = s_gb18030_ranges[last_index].code_point;
|
||||
auto pointer_offset = s_gb18030_ranges[last_index].pointer;
|
||||
|
||||
// 3. Return a pointer whose value is pointer offset + code point − offset.
|
||||
return pointer_offset + code_point - offset;
|
||||
}
|
||||
|
||||
GB18030Encoder::GB18030Encoder(IsGBK is_gbk)
|
||||
: m_is_gbk(is_gbk)
|
||||
{
|
||||
}
|
||||
|
||||
// https://encoding.spec.whatwg.org/#gb18030-encoder
|
||||
ErrorOr<void> GB18030Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte)
|
||||
{
|
||||
bool gbk = (m_is_gbk == IsGBK::Yes);
|
||||
|
||||
for (u32 item : input) {
|
||||
// 1. If code point is end-of-queue, return finished.
|
||||
|
||||
// 2. If code point is an ASCII code point, return a byte whose value is code point.
|
||||
if (is_ascii(item)) {
|
||||
TRY(on_byte(static_cast<u8>(item)));
|
||||
continue;
|
||||
}
|
||||
|
||||
// 3. If code point is U+E5E5, return error with code point.
|
||||
if (item == 0xE5E5) {
|
||||
// TODO: Report error.
|
||||
continue;
|
||||
}
|
||||
|
||||
// 4. If is GBK is true and code point is U+20AC, return byte 0x80.
|
||||
if (gbk && item == 0x20AC) {
|
||||
TRY(on_byte(0x80));
|
||||
continue;
|
||||
}
|
||||
|
||||
// 5. Let pointer be the index pointer for code point in index gb18030.
|
||||
auto pointer = code_point_gb18030_index(item);
|
||||
|
||||
// 6. If pointer is non-null, then:
|
||||
if (pointer.has_value()) {
|
||||
// 1. Let lead be pointer / 190 + 0x81.
|
||||
auto lead = *pointer / 190 + 0x81;
|
||||
|
||||
// 2. Let trail be pointer % 190.
|
||||
auto trail = *pointer % 190;
|
||||
|
||||
// 3. Let offset be 0x40 if trail is less than 0x3F, otherwise 0x41.
|
||||
auto offset = 0x41;
|
||||
if (trail < 0x3f)
|
||||
offset = 0x40;
|
||||
|
||||
// 4. Return two bytes whose values are lead and trail + offset.
|
||||
TRY(on_byte(static_cast<u8>(lead)));
|
||||
TRY(on_byte(static_cast<u8>(trail + offset)));
|
||||
continue;
|
||||
}
|
||||
|
||||
// 7. If is GBK is true, return error with code point.
|
||||
if (gbk) {
|
||||
// TODO: Report error.
|
||||
continue;
|
||||
}
|
||||
|
||||
// 8. Set pointer to the index gb18030 ranges pointer for code point.
|
||||
pointer = index_gb18030_ranges_pointer(item);
|
||||
|
||||
// 9. Let byte1 be pointer / (10 × 126 × 10).
|
||||
auto byte1 = *pointer / (10 * 126 * 10);
|
||||
|
||||
// 10. Set pointer to pointer % (10 × 126 × 10).
|
||||
pointer = *pointer % (10 * 126 * 10);
|
||||
|
||||
// 11. Let byte2 be pointer / (10 × 126).
|
||||
auto byte2 = *pointer / (10 * 126);
|
||||
|
||||
// 12. Set pointer to pointer % (10 × 126).
|
||||
pointer = *pointer % (10 * 126);
|
||||
|
||||
// 13. Let byte3 be pointer / 10.
|
||||
auto byte3 = *pointer / 10;
|
||||
|
||||
// 14. Let byte4 be pointer % 10.
|
||||
auto byte4 = *pointer % 10;
|
||||
|
||||
// 15. Return four bytes whose values are byte1 + 0x81, byte2 + 0x30, byte3 + 0x81, byte4 + 0x30.
|
||||
TRY(on_byte(static_cast<u8>(byte1 + 0x81)));
|
||||
TRY(on_byte(static_cast<u8>(byte2 + 0x30)));
|
||||
TRY(on_byte(static_cast<u8>(byte3 + 0x81)));
|
||||
TRY(on_byte(static_cast<u8>(byte4 + 0x30)));
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -39,6 +39,21 @@ public:
|
|||
virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override;
|
||||
};
|
||||
|
||||
class GB18030Encoder final : public Encoder {
|
||||
public:
|
||||
enum class IsGBK {
|
||||
Yes,
|
||||
No,
|
||||
};
|
||||
|
||||
GB18030Encoder(IsGBK is_gbk = IsGBK::No);
|
||||
|
||||
virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override;
|
||||
|
||||
private:
|
||||
IsGBK m_is_gbk { IsGBK::No };
|
||||
};
|
||||
|
||||
Optional<Encoder&> encoder_for_exact_name(StringView encoding);
|
||||
Optional<Encoder&> encoder_for(StringView label);
|
||||
|
||||
|
|
Loading…
Reference in a new issue