mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-21 23:20:20 +00:00
LibTextCodec: Fix ISO-8859-1 vs. windows-1252 handling in web contexts
The Encoding specification maps the ISO-8859-1 label to windows-1252 and expects the windows-1252 translation table to be used, which differs from ISO-8859-1 for bytes 0x80-0x9F. Other contexts that request ISO-8859-1 expect the actual ISO-8859-1 encoding, which maps every byte 1-to-1 to U+0000-U+00FF. This commit introduces `decoder_for_exact_name`, which skips the mapping from aliases to the encoding name done by `get_standardized_encoding`.
This commit is contained in:
parent
46d5cf0443
commit
6b2c459901
Notes:
sideshowbarker
2024-07-17 00:49:59 +09:00
Author: https://github.com/skyrising Commit: https://github.com/LadybirdBrowser/ladybird/commit/6b2c459901 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/32 Reviewed-by: https://github.com/jamierocks
7 changed files with 107 additions and 95 deletions
|
@ -211,6 +211,7 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
|
|||
{ "jis0208"sv, prepare_table(data.get("jis0208"sv)->as_array()) },
|
||||
{ "jis0212"sv, prepare_table(data.get("jis0212"sv)->as_array()) },
|
||||
{ "euc_kr"sv, prepare_table(data.get("euc-kr"sv)->as_array()) },
|
||||
{ "windows_1252"sv, prepare_table(data.get("windows-1252"sv)->as_array()) },
|
||||
},
|
||||
};
|
||||
|
||||
|
|
|
@ -2497,8 +2497,8 @@ static ErrorOr<void> decode_extension(JBIG2LoadingContext&, SegmentData const& s
|
|||
|
||||
auto second_bytes = TRY(read_string.template operator()<u8>());
|
||||
|
||||
auto first = TRY(TextCodec::decoder_for("ISO-8859-1"sv)->to_utf8(StringView { first_bytes }));
|
||||
auto second = TRY(TextCodec::decoder_for("ISO-8859-1"sv)->to_utf8(StringView { second_bytes }));
|
||||
auto first = TRY(TextCodec::decoder_for_exact_name("ISO-8859-1"sv)->to_utf8(StringView { first_bytes }));
|
||||
auto second = TRY(TextCodec::decoder_for_exact_name("ISO-8859-1"sv)->to_utf8(StringView { second_bytes }));
|
||||
dbgln("JBIG2ImageDecoderPlugin: key '{}', value '{}'", first, second);
|
||||
}
|
||||
if (!stream.is_eof())
|
||||
|
|
|
@ -496,7 +496,7 @@ static ErrorOr<Comment> read_comment(ReadonlyBytes data)
|
|||
|
||||
dbgln_if(JPEG2000_DEBUG, "JPEG2000ImageDecoderPlugin: COM marker segment: comment_type={}, size()={}", (int)com.type, com.data.size());
|
||||
if (com.type == Comment::ISO_IEC_8859_15)
|
||||
dbgln_if(JPEG2000_DEBUG, "JPEG2000ImageDecoderPlugin: COM marker segment, ISO/IEC 8859-15 text: '{}'", TRY(TextCodec::decoder_for("ISO-8859-1"sv)->to_utf8(StringView { com.data })));
|
||||
dbgln_if(JPEG2000_DEBUG, "JPEG2000ImageDecoderPlugin: COM marker segment, ISO/IEC 8859-15 text: '{}'", TRY(TextCodec::decoder_for("ISO-8859-15"sv)->to_utf8(StringView { com.data })));
|
||||
|
||||
return com;
|
||||
}
|
||||
|
|
|
@ -235,6 +235,8 @@ SingleByteDecoder s_windows874_decoder {{
|
|||
0x0E40, 0x0E41, 0x0E42, 0x0E43, 0x0E44, 0x0E45, 0x0E46, 0x0E47, 0x0E48, 0x0E49, 0x0E4A, 0x0E4B, 0x0E4C, 0x0E4D, 0x0E4E, 0x0E4F,
|
||||
0x0E50, 0x0E51, 0x0E52, 0x0E53, 0x0E54, 0x0E55, 0x0E56, 0x0E57, 0x0E58, 0x0E59, 0x0E5A, 0x0E5B, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
|
||||
}};
|
||||
// https://encoding.spec.whatwg.org/index-windows-1252.txt
|
||||
SingleByteDecoder s_windows1252_decoder { s_windows_1252_index };
|
||||
// https://encoding.spec.whatwg.org/index-windows-1253.txt
|
||||
SingleByteDecoder s_windows1253_decoder {{
|
||||
0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 0x0088, 0x2030, 0x008A, 0x2039, 0x008C, 0x008D, 0x008E, 0x008F,
|
||||
|
@ -294,94 +296,97 @@ SingleByteDecoder s_mac_cyrillic_decoder {{
|
|||
|
||||
}
|
||||
|
||||
Optional<Decoder&> decoder_for(StringView a_encoding)
|
||||
Optional<Decoder&> decoder_for(StringView label)
|
||||
{
|
||||
auto encoding = get_standardized_encoding(a_encoding);
|
||||
if (encoding.has_value()) {
|
||||
if (encoding.value().equals_ignoring_ascii_case("windows-1252"sv))
|
||||
return s_latin1_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("utf-8"sv))
|
||||
return s_utf8_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("utf-16be"sv))
|
||||
return s_utf16be_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("utf-16le"sv))
|
||||
return s_utf16le_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("big5"sv))
|
||||
return s_big5_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("euc-jp"sv))
|
||||
return s_euc_jp_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("euc-kr"sv))
|
||||
return s_euc_kr_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("gbk"sv))
|
||||
return s_gb18030_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("gb18030"sv))
|
||||
return s_gb18030_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("ibm866"sv))
|
||||
return s_ibm866_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("iso-2022-jp"sv))
|
||||
return s_iso_2022_jp_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("iso-8859-2"sv))
|
||||
return s_latin2_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("iso-8859-3"sv))
|
||||
return s_latin3_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("iso-8859-4"sv))
|
||||
return s_latin4_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("iso-8859-5"sv))
|
||||
return s_latin_cyrillic_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("iso-8859-6"sv))
|
||||
return s_latin_arabic_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("iso-8859-7"sv))
|
||||
return s_latin_greek_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("iso-8859-8"sv))
|
||||
return s_latin_hebrew_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("iso-8859-8-i"sv))
|
||||
return s_latin_hebrew_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("iso-8859-10"sv))
|
||||
return s_latin6_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("iso-8859-13"sv))
|
||||
return s_latin7_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("iso-8859-14"sv))
|
||||
return s_latin8_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("iso-8859-15"sv))
|
||||
return s_latin9_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("iso-8859-16"sv))
|
||||
return s_latin10_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("koi8-r"sv))
|
||||
return s_koi8r_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("koi8-u"sv))
|
||||
return s_koi8u_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("macintosh"sv))
|
||||
return s_mac_roman_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("PDFDocEncoding"sv))
|
||||
return s_pdf_doc_encoding_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("replacement"sv))
|
||||
return s_replacement_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("shift_jis"sv))
|
||||
return s_shift_jis_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("windows-874"sv))
|
||||
return s_windows874_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("windows-1250"sv))
|
||||
return s_centraleurope_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("windows-1251"sv))
|
||||
return s_cyrillic_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("windows-1253"sv))
|
||||
return s_windows1253_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("windows-1254"sv))
|
||||
return s_turkish_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("windows-1255"sv))
|
||||
return s_hebrew_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("windows-1256"sv))
|
||||
return s_windows1256_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("windows-1257"sv))
|
||||
return s_windows1257_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("windows-1258"sv))
|
||||
return s_windows1258_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("x-mac-cyrillic"sv))
|
||||
return s_mac_cyrillic_decoder;
|
||||
if (encoding.value().equals_ignoring_ascii_case("x-user-defined"sv))
|
||||
return s_x_user_defined_decoder;
|
||||
}
|
||||
dbgln("TextCodec: No decoder implemented for encoding '{}'", a_encoding);
|
||||
auto encoding = get_standardized_encoding(label);
|
||||
return encoding.has_value() ? decoder_for_exact_name(encoding.value()) : Optional<Decoder&> {};
|
||||
}
|
||||
|
||||
Optional<Decoder&> decoder_for_exact_name(StringView encoding)
|
||||
{
|
||||
if (encoding.equals_ignoring_ascii_case("iso-8859-1"sv))
|
||||
return s_latin1_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("windows-1252"sv))
|
||||
return s_windows1252_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("utf-8"sv))
|
||||
return s_utf8_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("utf-16be"sv))
|
||||
return s_utf16be_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("utf-16le"sv))
|
||||
return s_utf16le_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("big5"sv))
|
||||
return s_big5_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("euc-jp"sv))
|
||||
return s_euc_jp_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("euc-kr"sv))
|
||||
return s_euc_kr_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("gbk"sv))
|
||||
return s_gb18030_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("gb18030"sv))
|
||||
return s_gb18030_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("ibm866"sv))
|
||||
return s_ibm866_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("iso-2022-jp"sv))
|
||||
return s_iso_2022_jp_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("iso-8859-2"sv))
|
||||
return s_latin2_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("iso-8859-3"sv))
|
||||
return s_latin3_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("iso-8859-4"sv))
|
||||
return s_latin4_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("iso-8859-5"sv))
|
||||
return s_latin_cyrillic_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("iso-8859-6"sv))
|
||||
return s_latin_arabic_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("iso-8859-7"sv))
|
||||
return s_latin_greek_decoder;
|
||||
if (encoding.is_one_of_ignoring_ascii_case("iso-8859-8"sv, "iso-8859-8-i"sv))
|
||||
return s_latin_hebrew_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("iso-8859-10"sv))
|
||||
return s_latin6_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("iso-8859-13"sv))
|
||||
return s_latin7_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("iso-8859-14"sv))
|
||||
return s_latin8_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("iso-8859-15"sv))
|
||||
return s_latin9_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("iso-8859-16"sv))
|
||||
return s_latin10_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("koi8-r"sv))
|
||||
return s_koi8r_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("koi8-u"sv))
|
||||
return s_koi8u_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("macintosh"sv))
|
||||
return s_mac_roman_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("PDFDocEncoding"sv))
|
||||
return s_pdf_doc_encoding_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("replacement"sv))
|
||||
return s_replacement_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("shift_jis"sv))
|
||||
return s_shift_jis_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("windows-874"sv))
|
||||
return s_windows874_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("windows-1250"sv))
|
||||
return s_centraleurope_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("windows-1251"sv))
|
||||
return s_cyrillic_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("windows-1253"sv))
|
||||
return s_windows1253_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("windows-1254"sv))
|
||||
return s_turkish_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("windows-1255"sv))
|
||||
return s_hebrew_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("windows-1256"sv))
|
||||
return s_windows1256_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("windows-1257"sv))
|
||||
return s_windows1257_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("windows-1258"sv))
|
||||
return s_windows1258_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("x-mac-cyrillic"sv))
|
||||
return s_mac_cyrillic_decoder;
|
||||
if (encoding.equals_ignoring_ascii_case("x-user-defined"sv))
|
||||
return s_x_user_defined_decoder;
|
||||
dbgln("TextCodec: No decoder implemented for encoding '{}'", encoding);
|
||||
return {};
|
||||
}
|
||||
|
||||
|
@ -979,7 +984,8 @@ ErrorOr<void> XUserDefinedDecoder::process(StringView input, Function<ErrorOr<vo
|
|||
}
|
||||
|
||||
// https://encoding.spec.whatwg.org/#single-byte-decoder
|
||||
ErrorOr<void> SingleByteDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
|
||||
template<Integral ArrayType>
|
||||
ErrorOr<void> SingleByteDecoder<ArrayType>::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
|
||||
{
|
||||
for (u8 const byte : input) {
|
||||
if (byte < 0x80) {
|
||||
|
|
|
@ -46,9 +46,10 @@ public:
|
|||
virtual ErrorOr<String> to_utf8(StringView) override;
|
||||
};
|
||||
|
||||
template<Integral ArrayType = u32>
|
||||
class SingleByteDecoder final : public Decoder {
|
||||
public:
|
||||
SingleByteDecoder(Array<u32, 128> translation_table)
|
||||
SingleByteDecoder(Array<ArrayType, 128> translation_table)
|
||||
: m_translation_table(translation_table)
|
||||
{
|
||||
}
|
||||
|
@ -56,7 +57,7 @@ public:
|
|||
virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) override;
|
||||
|
||||
private:
|
||||
Array<u32, 128> m_translation_table;
|
||||
Array<ArrayType, 128> m_translation_table;
|
||||
};
|
||||
|
||||
class Latin1Decoder final : public Decoder {
|
||||
|
@ -131,6 +132,10 @@ public:
|
|||
virtual bool validate(StringView input) override { return input.is_empty(); }
|
||||
};
|
||||
|
||||
// This will return a decoder for the exact name specified, skipping get_standardized_encoding.
|
||||
// Use this when you want ISO-8859-1 instead of windows-1252.
|
||||
Optional<Decoder&> decoder_for_exact_name(StringView encoding);
|
||||
|
||||
Optional<Decoder&> decoder_for(StringView encoding);
|
||||
Optional<StringView> get_standardized_encoding(StringView encoding);
|
||||
|
||||
|
|
|
@ -41,7 +41,7 @@ WebIDL::ExceptionOr<JS::NonnullGCPtr<TextDecoder>> TextDecoder::construct_impl(J
|
|||
auto ignore_bom = options.value_or({}).ignore_bom;
|
||||
|
||||
// NOTE: This should happen in decode(), but we don't support streaming yet and share decoders across calls.
|
||||
auto decoder = TextCodec::decoder_for(encoding.value());
|
||||
auto decoder = TextCodec::decoder_for_exact_name(encoding.value());
|
||||
VERIFY(decoder.has_value());
|
||||
|
||||
return realm.heap().allocate<TextDecoder>(realm, realm, *decoder, lowercase_encoding_name, fatal, ignore_bom);
|
||||
|
|
|
@ -137,7 +137,7 @@ WebIDL::ExceptionOr<String> WindowOrWorkerGlobalScopeMixin::atob(String const& d
|
|||
|
||||
// 3. Return decodedData.
|
||||
// decode_base64() returns a byte string. LibJS uses UTF-8 for strings. Use Latin1Decoder to convert bytes 128-255 to UTF-8.
|
||||
auto decoder = TextCodec::decoder_for("windows-1252"sv);
|
||||
auto decoder = TextCodec::decoder_for_exact_name("ISO-8859-1"sv);
|
||||
VERIFY(decoder.has_value());
|
||||
return TRY_OR_THROW_OOM(vm, decoder->to_utf8(decoded_data.value()));
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue