LibTextCodec: Add alternate Cyrillic (aka Koi8-r) encoding

Fixes #6840.
This commit is contained in:
Dmitry Petrov 2021-12-14 01:19:56 +01:00 committed by Andreas Kling
parent 2c1a6ce9a5
commit 6f5102f435
Notes: sideshowbarker 2024-07-17 22:42:20 +09:00
2 changed files with 41 additions and 0 deletions

View file

@ -59,6 +59,14 @@ CyrillicDecoder& cyrillic_decoder()
return *decoder;
}
// Returns the shared Koi8RDecoder instance. The decoder is heap-allocated the
// first time this function is called and is never freed afterwards.
Koi8RDecoder& koi8r_decoder()
{
    static Koi8RDecoder* instance = new Koi8RDecoder;
    return *instance;
}
Latin9Decoder& latin9_decoder()
{
static Latin9Decoder* decoder = nullptr;
@ -93,6 +101,8 @@ Decoder* decoder_for(const String& a_encoding)
return &hebrew_decoder();
if (encoding.value().equals_ignoring_case("windows-1251"))
return &cyrillic_decoder();
if (encoding.value().equals_ignoring_case("koi8-r"))
return &koi8r_decoder();
if (encoding.value().equals_ignoring_case("iso-8859-15"))
return &latin9_decoder();
if (encoding.value().equals_ignoring_case("windows-1254"))
@ -165,6 +175,8 @@ Optional<String> get_standardized_encoding(const String& encoding)
return "windows-1258";
if (trimmed_lowercase_encoding.is_one_of("x-mac-cyrillic", "x-mac-ukrainian"))
return "x-mac-cyrillic";
if (trimmed_lowercase_encoding.is_one_of("koi8-r", "koi8r"))
return "koi8-r";
if (trimmed_lowercase_encoding.is_one_of("chinese", "csgb2312", "csiso58gb231280", "gb2312", "gb_2312", "gb_2312-80", "gbk", "iso-ir-58", "x-gbk"))
return "GBK";
if (trimmed_lowercase_encoding == "gb18030")
@ -376,6 +388,30 @@ void CyrillicDecoder::process(StringView input, Function<void(u32)> on_code_poin
}
}
// Decodes `input` as KOI8-R (RFC 1489), calling `on_code_point` exactly once per
// input byte with the Unicode code point that byte represents.
//
// Bytes below 0x80 are passed through unchanged; bytes 0x80..0xFF are looked up
// in the table below.
void Koi8RDecoder::process(StringView input, Function<void(u32)> on_code_point)
{
    // Unicode code points for bytes 0x80..0xFF, indexed by (byte - 0x80).
    // Every entry must be a code point, NOT a UTF-8 byte sequence: in particular
    // 0xA3 is U+0451 (ё), 0xB3 is U+0401 (Ё) and 0xE2 is U+0411 (Б).
    // clang-format off
    static constexpr Array<u32, 128> translation_table = {
        // 0x80..0x8F: box drawing and block elements
        0x2500,0x2502,0x250c,0x2510,0x2514,0x2518,0x251c,0x2524,0x252c,0x2534,0x253c,0x2580,0x2584,0x2588,0x258c,0x2590,
        // 0x90..0x9F: shades, math symbols, NBSP
        0x2591,0x2592,0x2593,0x2320,0x25a0,0x2219,0x221a,0x2248,0x2264,0x2265,0xa0,0x2321,0xb0,0xb2,0xb7,0xf7,
        // 0xA0..0xAF: double-line box drawing, with U+0451 at 0xA3
        0x2550,0x2551,0x2552,0x451,0x2553,0x2554,0x2555,0x2556,0x2557,0x2558,0x2559,0x255a,0x255b,0x255c,0x255d,0x255e,
        // 0xB0..0xBF: double-line box drawing, with U+0401 at 0xB3 and the copyright sign at 0xBF
        0x255f,0x2560,0x2561,0x401,0x2562,0x2563,0x2564,0x2565,0x2566,0x2567,0x2568,0x2569,0x256a,0x256b,0x256c,0xa9,
        // 0xC0..0xDF: lowercase Cyrillic letters
        0x44e,0x430,0x431,0x446,0x434,0x435,0x444,0x433,0x445,0x438,0x439,0x43a,0x43b,0x43c,0x43d,0x43e,
        0x43f,0x44f,0x440,0x441,0x442,0x443,0x436,0x432,0x44c,0x44b,0x437,0x448,0x44d,0x449,0x447,0x44a,
        // 0xE0..0xFF: uppercase Cyrillic letters
        0x42e,0x410,0x411,0x426,0x414,0x415,0x424,0x413,0x425,0x418,0x419,0x41a,0x41b,0x41c,0x41d,0x41e,
        0x41f,0x42f,0x420,0x421,0x422,0x423,0x416,0x412,0x42c,0x42b,0x417,0x428,0x42d,0x429,0x427,0x42a,
    };
    // clang-format on

    // Iterate as unsigned char so that bytes >= 0x80 do not become negative
    // values before the range check and table index computation.
    for (unsigned char ch : input) {
        if (ch < 0x80) { // Superset of ASCII
            on_code_point(ch);
        } else {
            on_code_point(translation_table[ch - 0x80]);
        }
    }
}
void Latin9Decoder::process(StringView input, Function<void(u32)> on_code_point)
{
auto convert_latin9_to_utf8 = [](u8 ch) -> u32 {

View file

@ -52,6 +52,11 @@ public:
virtual void process(StringView, Function<void(u32)> on_code_point) override;
};
// Decoder for the KOI8-R single-byte encoding; decoder_for() hands out this
// decoder for the "koi8-r" label.
class Koi8RDecoder final : public Decoder {
public:
    // Feeds the code point for each byte of `input` to `on_code_point`, in order.
    virtual void process(StringView input, Function<void(u32)> on_code_point) override;
};
class Latin9Decoder final : public Decoder {
public:
virtual void process(StringView, Function<void(u32)> on_code_point) override;