mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-12-04 05:20:30 +00:00
LibTextCodec: Add x-user-defined decoder
It's a pretty simple charset: byte values 0x00-0x7F decode as standard ASCII, while byte values 0x80-0xFF map to a portion of the Unicode Private Use Area, specifically U+F780-U+F7FF. Google Maps uses this encoding for certain blobs.
This commit is contained in:
parent
835a344337
commit
0e0f98a45e
Notes:
sideshowbarker
2024-07-17 18:58:23 +09:00
Author: https://github.com/Lubrsi Commit: https://github.com/SerenityOS/serenity/commit/0e0f98a45e Pull-request: https://github.com/SerenityOS/serenity/pull/12448
2 changed files with 30 additions and 0 deletions
|
@ -20,6 +20,7 @@ CyrillicDecoder s_cyrillic_decoder;
|
|||
Koi8RDecoder s_koi8r_decoder;
|
||||
Latin9Decoder s_latin9_decoder;
|
||||
TurkishDecoder s_turkish_decoder;
|
||||
XUserDefinedDecoder s_x_user_defined_decoder;
|
||||
}
|
||||
|
||||
Decoder* decoder_for(const String& a_encoding)
|
||||
|
@ -44,6 +45,8 @@ Decoder* decoder_for(const String& a_encoding)
|
|||
return &s_latin9_decoder;
|
||||
if (encoding.value().equals_ignoring_case("windows-1254"))
|
||||
return &s_turkish_decoder;
|
||||
if (encoding.value().equals_ignoring_case("x-user-defined"))
|
||||
return &s_x_user_defined_decoder;
|
||||
}
|
||||
dbgln("TextCodec: No decoder implemented for encoding '{}'", a_encoding);
|
||||
return nullptr;
|
||||
|
@ -466,4 +469,26 @@ void TurkishDecoder::process(StringView input, Function<void(u32)> on_code_point
|
|||
}
|
||||
}
|
||||
|
||||
// https://encoding.spec.whatwg.org/#x-user-defined-decoder
|
||||
void XUserDefinedDecoder::process(StringView input, Function<void(u32)> on_code_point)
{
    // Each input element yields exactly one call to on_code_point, in input order.
    // The numbered steps quote the x-user-defined decoder of the WHATWG Encoding Standard.
    for (auto raw_byte : input) {
        // Work on the element as an unsigned 8-bit value so the range check and the offset
        // arithmetic below operate on 0x00..0xFF.
        auto const byte = static_cast<u8>(raw_byte);

        // 2. If byte is an ASCII byte, return a code point whose value is byte.
        // https://infra.spec.whatwg.org/#ascii-byte
        // An ASCII byte is a byte in the range 0x00 (NUL) to 0x7F (DEL), inclusive.
        // NOTE: No lower-bound check is needed, an unsigned value is never below 0x00.
        if (byte < 0x80) {
            on_code_point(byte);
            continue;
        }

        // 3. Return a code point whose value is 0xF780 + byte − 0x80.
        // Bytes 0x80..0xFF therefore land on 0xF780..0xF7FF.
        on_code_point(0xF780 + (byte - 0x80));
    }

    // 1. If byte is end-of-queue, return finished.
    // Running off the end of the loop is the "finished" case; there is nothing left to flush.
}
|
||||
|
||||
}
|
||||
|
|
|
@ -67,6 +67,11 @@ public:
|
|||
virtual void process(StringView, Function<void(u32)> on_code_point) override;
|
||||
};
|
||||
|
||||
class XUserDefinedDecoder final : public Decoder {
|
||||
public:
|
||||
virtual void process(StringView, Function<void(u32)> on_code_point) override;
|
||||
};
|
||||
|
||||
Decoder* decoder_for(String const& encoding);
|
||||
Optional<String> get_standardized_encoding(const String& encoding);
|
||||
|
||||
|
|
Loading…
Reference in a new issue