LibTextCodec: Add Turkish (aka ISO-8859-9, Windows-1254) encoding

This commit is contained in:
Aatos Majava 2021-06-23 16:18:50 +03:00 committed by Linus Groh
parent 21ee0ad6fc
commit 3b2a528b33
Notes: sideshowbarker 2024-07-18 11:37:28 +09:00
2 changed files with 44 additions and 0 deletions

View file

@ -67,6 +67,14 @@ Latin9Decoder& latin9_decoder()
return *decoder;
}
TurkishDecoder& turkish_decoder()
{
    // Single shared instance: allocated on the first call and handed back
    // unchanged by every subsequent call.
    static TurkishDecoder* s_instance = nullptr;
    if (s_instance == nullptr)
        s_instance = new TurkishDecoder;
    return *s_instance;
}
}
Decoder* decoder_for(const String& a_encoding)
@ -87,6 +95,8 @@ Decoder* decoder_for(const String& a_encoding)
return &cyrillic_decoder();
if (encoding.value().equals_ignoring_case("iso-8859-15"))
return &latin9_decoder();
if (encoding.value().equals_ignoring_case("windows-1254"))
return &turkish_decoder();
}
dbgln("TextCodec: No decoder implemented for encoding '{}'", a_encoding);
return nullptr;
@ -383,4 +393,33 @@ String Latin9Decoder::to_utf8(const StringView& input)
return builder.to_string();
}
// Decodes a Windows-1254 (Turkish) byte string into UTF-8.
//
// Every input byte maps to exactly one Unicode code point:
//   - 0x00..0x7F and most of 0xA0..0xFF are identical to the code point of the same value,
//   - six bytes in 0xD0..0xFE are replaced by Turkish letters,
//   - 0x80..0x9F follow the Windows-1254 table (mostly printable punctuation) rather than
//     the C1 control characters that a plain ISO-8859-9 reading would yield.
String TurkishDecoder::to_utf8(const StringView& input)
{
    // Code points for bytes 0x80..0x9F, taken from the WHATWG "index windows-1254" table.
    // Bytes that Windows-1254 leaves unassigned (0x81, 0x8D..0x90, 0x9D, 0x9E) map to the
    // C1 control code point of the same value, as that table specifies.
    static constexpr u32 windows_1254_high_range[32] = {
        0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 0x80..0x87
        0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x008E, 0x008F, // 0x88..0x8F
        0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 0x90..0x97
        0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x009E, 0x0178, // 0x98..0x9F
    };

    auto convert_turkish_to_code_point = [](u8 ch) -> u32 {
        if (ch >= 0x80 && ch <= 0x9F)
            return windows_1254_high_range[ch - 0x80];

        // Outside 0x80..0x9F the encoding matches the first 256 Unicode code points,
        // except for these 6 Turkish letters.
        switch (ch) {
        case 0xD0:
            return 0x11E; // LATIN CAPITAL LETTER G WITH BREVE
        case 0xDD:
            return 0x130; // LATIN CAPITAL LETTER I WITH DOT ABOVE
        case 0xDE:
            return 0x15E; // LATIN CAPITAL LETTER S WITH CEDILLA
        case 0xF0:
            return 0x11F; // LATIN SMALL LETTER G WITH BREVE
        case 0xFD:
            return 0x131; // LATIN SMALL LETTER DOTLESS I
        case 0xFE:
            return 0x15F; // LATIN SMALL LETTER S WITH CEDILLA
        default:
            return ch;
        }
    };

    StringBuilder builder(input.length());
    for (auto ch : input) {
        // Convert explicitly so bytes >= 0x80 are treated as unsigned values.
        builder.append_code_point(convert_turkish_to_code_point(static_cast<u8>(ch)));
    }
    return builder.to_string();
}
}

View file

@ -53,6 +53,11 @@ public:
virtual String to_utf8(const StringView&) override;
};
// Decoder for the Turkish single-byte encoding; decoder_for() hands it out for the
// "windows-1254" encoding name.
class TurkishDecoder final : public Decoder {
public:
    // Converts the encoded bytes in `input` to a UTF-8 String.
    virtual String to_utf8(const StringView& input) override;
};
// Looks up the decoder for the given encoding name.
// Returns nullptr (after logging a debug message) when no decoder is implemented for it.
Decoder* decoder_for(const String& encoding);
// NOTE(review): presumably maps an encoding label to its canonical name and yields an
// empty Optional for unknown labels — confirm against the definition.
Optional<String> get_standardized_encoding(const String& encoding);
// NOTE(review): presumably reports whether `encoding` is already a canonical encoding
// name — confirm against the definition.
bool is_standardized_encoding(const String& encoding);