LibTextCodec+Everywhere: Return Optional<Decoder&> from decoder_for()
This commit is contained in:
parent
2f4c463920
commit
f2a9426885
Notes:
sideshowbarker
2024-07-17 00:01:08 +09:00
Author: https://github.com/AtkinsSJ Commit: https://github.com/SerenityOS/serenity/commit/f2a9426885 Pull-request: https://github.com/SerenityOS/serenity/pull/17511 Reviewed-by: https://github.com/ADKaster Reviewed-by: https://github.com/nico
16 changed files with 42 additions and 39 deletions
|
@ -11,8 +11,8 @@
|
||||||
|
|
||||||
extern "C" int LLVMFuzzerTestOneInput(uint8_t const* data, size_t size)
|
extern "C" int LLVMFuzzerTestOneInput(uint8_t const* data, size_t size)
|
||||||
{
|
{
|
||||||
auto* decoder = TextCodec::decoder_for("windows-1251"sv);
|
auto decoder = TextCodec::decoder_for("windows-1251"sv);
|
||||||
VERIFY(decoder);
|
VERIFY(decoder.has_value());
|
||||||
decoder->to_utf8({ data, size });
|
decoder->to_utf8({ data, size });
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,8 +11,8 @@
|
||||||
|
|
||||||
extern "C" int LLVMFuzzerTestOneInput(uint8_t const* data, size_t size)
|
extern "C" int LLVMFuzzerTestOneInput(uint8_t const* data, size_t size)
|
||||||
{
|
{
|
||||||
auto* decoder = TextCodec::decoder_for("windows-1255"sv);
|
auto decoder = TextCodec::decoder_for("windows-1255"sv);
|
||||||
VERIFY(decoder);
|
VERIFY(decoder.has_value());
|
||||||
decoder->to_utf8({ data, size });
|
decoder->to_utf8({ data, size });
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,8 +11,8 @@
|
||||||
|
|
||||||
extern "C" int LLVMFuzzerTestOneInput(uint8_t const* data, size_t size)
|
extern "C" int LLVMFuzzerTestOneInput(uint8_t const* data, size_t size)
|
||||||
{
|
{
|
||||||
auto* decoder = TextCodec::decoder_for("windows-1252"sv);
|
auto decoder = TextCodec::decoder_for("windows-1252"sv);
|
||||||
VERIFY(decoder);
|
VERIFY(decoder.has_value());
|
||||||
decoder->to_utf8({ data, size });
|
decoder->to_utf8({ data, size });
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,8 +11,8 @@
|
||||||
|
|
||||||
extern "C" int LLVMFuzzerTestOneInput(uint8_t const* data, size_t size)
|
extern "C" int LLVMFuzzerTestOneInput(uint8_t const* data, size_t size)
|
||||||
{
|
{
|
||||||
auto* decoder = TextCodec::decoder_for("iso-8859-2"sv);
|
auto decoder = TextCodec::decoder_for("iso-8859-2"sv);
|
||||||
VERIFY(decoder);
|
VERIFY(decoder.has_value());
|
||||||
decoder->to_utf8({ data, size });
|
decoder->to_utf8({ data, size });
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,8 +11,8 @@
|
||||||
|
|
||||||
extern "C" int LLVMFuzzerTestOneInput(uint8_t const* data, size_t size)
|
extern "C" int LLVMFuzzerTestOneInput(uint8_t const* data, size_t size)
|
||||||
{
|
{
|
||||||
auto* decoder = TextCodec::decoder_for("utf-16be"sv);
|
auto decoder = TextCodec::decoder_for("utf-16be"sv);
|
||||||
VERIFY(decoder);
|
VERIFY(decoder.has_value());
|
||||||
decoder->to_utf8({ data, size });
|
decoder->to_utf8({ data, size });
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
|
* Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
|
||||||
* Copyright (c) 2022, Jelle Raaijmakers <jelle@gmta.nl>
|
* Copyright (c) 2022, Jelle Raaijmakers <jelle@gmta.nl>
|
||||||
|
* Copyright (c) 2023, Sam Atkins <atkinssj@serenityos.org>
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: BSD-2-Clause
|
* SPDX-License-Identifier: BSD-2-Clause
|
||||||
*/
|
*/
|
||||||
|
@ -30,37 +31,37 @@ TurkishDecoder s_turkish_decoder;
|
||||||
XUserDefinedDecoder s_x_user_defined_decoder;
|
XUserDefinedDecoder s_x_user_defined_decoder;
|
||||||
}
|
}
|
||||||
|
|
||||||
Decoder* decoder_for(StringView a_encoding)
|
Optional<Decoder&> decoder_for(StringView a_encoding)
|
||||||
{
|
{
|
||||||
auto encoding = get_standardized_encoding(a_encoding);
|
auto encoding = get_standardized_encoding(a_encoding);
|
||||||
if (encoding.has_value()) {
|
if (encoding.has_value()) {
|
||||||
if (encoding.value().equals_ignoring_case("windows-1252"sv))
|
if (encoding.value().equals_ignoring_case("windows-1252"sv))
|
||||||
return &s_latin1_decoder;
|
return s_latin1_decoder;
|
||||||
if (encoding.value().equals_ignoring_case("utf-8"sv))
|
if (encoding.value().equals_ignoring_case("utf-8"sv))
|
||||||
return &s_utf8_decoder;
|
return s_utf8_decoder;
|
||||||
if (encoding.value().equals_ignoring_case("utf-16be"sv))
|
if (encoding.value().equals_ignoring_case("utf-16be"sv))
|
||||||
return &s_utf16be_decoder;
|
return s_utf16be_decoder;
|
||||||
if (encoding.value().equals_ignoring_case("utf-16le"sv))
|
if (encoding.value().equals_ignoring_case("utf-16le"sv))
|
||||||
return &s_utf16le_decoder;
|
return s_utf16le_decoder;
|
||||||
if (encoding.value().equals_ignoring_case("iso-8859-2"sv))
|
if (encoding.value().equals_ignoring_case("iso-8859-2"sv))
|
||||||
return &s_latin2_decoder;
|
return s_latin2_decoder;
|
||||||
if (encoding.value().equals_ignoring_case("windows-1255"sv))
|
if (encoding.value().equals_ignoring_case("windows-1255"sv))
|
||||||
return &s_hebrew_decoder;
|
return s_hebrew_decoder;
|
||||||
if (encoding.value().equals_ignoring_case("windows-1251"sv))
|
if (encoding.value().equals_ignoring_case("windows-1251"sv))
|
||||||
return &s_cyrillic_decoder;
|
return s_cyrillic_decoder;
|
||||||
if (encoding.value().equals_ignoring_case("koi8-r"sv))
|
if (encoding.value().equals_ignoring_case("koi8-r"sv))
|
||||||
return &s_koi8r_decoder;
|
return s_koi8r_decoder;
|
||||||
if (encoding.value().equals_ignoring_case("iso-8859-15"sv))
|
if (encoding.value().equals_ignoring_case("iso-8859-15"sv))
|
||||||
return &s_latin9_decoder;
|
return s_latin9_decoder;
|
||||||
if (encoding.value().equals_ignoring_case("macintosh"sv))
|
if (encoding.value().equals_ignoring_case("macintosh"sv))
|
||||||
return &s_mac_roman_decoder;
|
return s_mac_roman_decoder;
|
||||||
if (encoding.value().equals_ignoring_case("windows-1254"sv))
|
if (encoding.value().equals_ignoring_case("windows-1254"sv))
|
||||||
return &s_turkish_decoder;
|
return s_turkish_decoder;
|
||||||
if (encoding.value().equals_ignoring_case("x-user-defined"sv))
|
if (encoding.value().equals_ignoring_case("x-user-defined"sv))
|
||||||
return &s_x_user_defined_decoder;
|
return s_x_user_defined_decoder;
|
||||||
}
|
}
|
||||||
dbgln("TextCodec: No decoder implemented for encoding '{}'", a_encoding);
|
dbgln("TextCodec: No decoder implemented for encoding '{}'", a_encoding);
|
||||||
return nullptr;
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
// https://encoding.spec.whatwg.org/#concept-encoding-get
|
// https://encoding.spec.whatwg.org/#concept-encoding-get
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2020-2021, Andreas Kling <kling@serenityos.org>
|
* Copyright (c) 2020-2021, Andreas Kling <kling@serenityos.org>
|
||||||
* Copyright (c) 2022, Jelle Raaijmakers <jelle@gmta.nl>
|
* Copyright (c) 2022, Jelle Raaijmakers <jelle@gmta.nl>
|
||||||
|
* Copyright (c) 2023, Sam Atkins <atkinssj@serenityos.org>
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: BSD-2-Clause
|
* SPDX-License-Identifier: BSD-2-Clause
|
||||||
*/
|
*/
|
||||||
|
@ -9,6 +10,7 @@
|
||||||
|
|
||||||
#include <AK/Forward.h>
|
#include <AK/Forward.h>
|
||||||
#include <AK/Function.h>
|
#include <AK/Function.h>
|
||||||
|
#include <AK/Optional.h>
|
||||||
|
|
||||||
namespace TextCodec {
|
namespace TextCodec {
|
||||||
|
|
||||||
|
@ -84,7 +86,7 @@ public:
|
||||||
virtual void process(StringView, Function<void(u32)> on_code_point) override;
|
virtual void process(StringView, Function<void(u32)> on_code_point) override;
|
||||||
};
|
};
|
||||||
|
|
||||||
Decoder* decoder_for(StringView encoding);
|
Optional<Decoder&> decoder_for(StringView encoding);
|
||||||
Optional<StringView> get_standardized_encoding(StringView encoding);
|
Optional<StringView> get_standardized_encoding(StringView encoding);
|
||||||
|
|
||||||
// This returns the appropriate Unicode decoder for the sniffed BOM or nullptr if there is no appropriate decoder.
|
// This returns the appropriate Unicode decoder for the sniffed BOM or nullptr if there is no appropriate decoder.
|
||||||
|
|
|
@ -199,8 +199,8 @@ Tokenizer::Tokenizer(StringView input, StringView encoding)
|
||||||
{
|
{
|
||||||
// https://www.w3.org/TR/css-syntax-3/#css-filter-code-points
|
// https://www.w3.org/TR/css-syntax-3/#css-filter-code-points
|
||||||
auto filter_code_points = [](StringView input, auto encoding) -> ErrorOr<String> {
|
auto filter_code_points = [](StringView input, auto encoding) -> ErrorOr<String> {
|
||||||
auto* decoder = TextCodec::decoder_for(encoding);
|
auto decoder = TextCodec::decoder_for(encoding);
|
||||||
VERIFY(decoder);
|
VERIFY(decoder.has_value());
|
||||||
|
|
||||||
StringBuilder builder { input.length() };
|
StringBuilder builder { input.length() };
|
||||||
bool last_was_carriage_return = false;
|
bool last_was_carriage_return = false;
|
||||||
|
|
|
@ -15,7 +15,7 @@ namespace Web::Encoding {
|
||||||
WebIDL::ExceptionOr<JS::NonnullGCPtr<TextDecoder>> TextDecoder::construct_impl(JS::Realm& realm, DeprecatedFlyString encoding)
|
WebIDL::ExceptionOr<JS::NonnullGCPtr<TextDecoder>> TextDecoder::construct_impl(JS::Realm& realm, DeprecatedFlyString encoding)
|
||||||
{
|
{
|
||||||
auto decoder = TextCodec::decoder_for(encoding);
|
auto decoder = TextCodec::decoder_for(encoding);
|
||||||
if (!decoder)
|
if (!decoder.has_value())
|
||||||
return WebIDL::SimpleException { WebIDL::SimpleExceptionType::TypeError, DeprecatedString::formatted("Invalid encoding {}", encoding) };
|
return WebIDL::SimpleException { WebIDL::SimpleExceptionType::TypeError, DeprecatedString::formatted("Invalid encoding {}", encoding) };
|
||||||
|
|
||||||
return MUST_OR_THROW_OOM(realm.heap().allocate<TextDecoder>(realm, realm, *decoder, move(encoding), false, false));
|
return MUST_OR_THROW_OOM(realm.heap().allocate<TextDecoder>(realm, realm, *decoder, move(encoding), false, false));
|
||||||
|
|
|
@ -514,7 +514,7 @@ void HTMLScriptElement::resource_did_load()
|
||||||
// If the resource has an explicit encoding (i.e from a HTTP Content-Type header)
|
// If the resource has an explicit encoding (i.e from a HTTP Content-Type header)
|
||||||
// we have to re-encode it to UTF-8.
|
// we have to re-encode it to UTF-8.
|
||||||
if (resource()->has_encoding()) {
|
if (resource()->has_encoding()) {
|
||||||
if (auto* codec = TextCodec::decoder_for(resource()->encoding().value())) {
|
if (auto codec = TextCodec::decoder_for(resource()->encoding().value()); codec.has_value()) {
|
||||||
data = codec->to_utf8(data).to_byte_buffer();
|
data = codec->to_utf8(data).to_byte_buffer();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2798,8 +2798,8 @@ HTMLTokenizer::HTMLTokenizer()
|
||||||
|
|
||||||
HTMLTokenizer::HTMLTokenizer(StringView input, DeprecatedString const& encoding)
|
HTMLTokenizer::HTMLTokenizer(StringView input, DeprecatedString const& encoding)
|
||||||
{
|
{
|
||||||
auto* decoder = TextCodec::decoder_for(encoding);
|
auto decoder = TextCodec::decoder_for(encoding);
|
||||||
VERIFY(decoder);
|
VERIFY(decoder.has_value());
|
||||||
m_decoded_input = decoder->to_utf8(input);
|
m_decoded_input = decoder->to_utf8(input);
|
||||||
m_utf8_view = Utf8View(m_decoded_input);
|
m_utf8_view = Utf8View(m_decoded_input);
|
||||||
m_utf8_iterator = m_utf8_view.begin();
|
m_utf8_iterator = m_utf8_view.begin();
|
||||||
|
|
|
@ -1420,7 +1420,7 @@ JS_DEFINE_NATIVE_FUNCTION(Window::atob)
|
||||||
// The bytes object might contain bytes greater than 128, encode them in UTF8
|
// The bytes object might contain bytes greater than 128, encode them in UTF8
|
||||||
// NOTE: Any 8-bit encoding -> utf-8 decoder will work for this
|
// NOTE: Any 8-bit encoding -> utf-8 decoder will work for this
|
||||||
auto text_decoder = TextCodec::decoder_for("windows-1252"sv);
|
auto text_decoder = TextCodec::decoder_for("windows-1252"sv);
|
||||||
VERIFY(text_decoder);
|
VERIFY(text_decoder.has_value());
|
||||||
auto text = text_decoder->to_utf8(decoded.release_value());
|
auto text = text_decoder->to_utf8(decoded.release_value());
|
||||||
|
|
||||||
return JS::PrimitiveString::create(vm, DeprecatedString(text));
|
return JS::PrimitiveString::create(vm, DeprecatedString(text));
|
||||||
|
|
|
@ -161,8 +161,8 @@ WebIDL::ExceptionOr<DeprecatedString> WorkerGlobalScope::atob(DeprecatedString c
|
||||||
|
|
||||||
// 3. Return decodedData.
|
// 3. Return decodedData.
|
||||||
// decode_base64() returns a byte string. LibJS uses UTF-8 for strings. Use Latin1Decoder to convert bytes 128-255 to UTF-8.
|
// decode_base64() returns a byte string. LibJS uses UTF-8 for strings. Use Latin1Decoder to convert bytes 128-255 to UTF-8.
|
||||||
auto* decoder = TextCodec::decoder_for("windows-1252"sv);
|
auto decoder = TextCodec::decoder_for("windows-1252"sv);
|
||||||
VERIFY(decoder);
|
VERIFY(decoder.has_value());
|
||||||
return decoder->to_utf8(decoded_data.value());
|
return decoder->to_utf8(decoded_data.value());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -83,7 +83,7 @@ static DeprecatedString mime_type_from_content_type(DeprecatedString const& cont
|
||||||
|
|
||||||
static bool is_valid_encoding(StringView encoding)
|
static bool is_valid_encoding(StringView encoding)
|
||||||
{
|
{
|
||||||
return TextCodec::decoder_for(encoding);
|
return TextCodec::decoder_for(encoding).has_value();
|
||||||
}
|
}
|
||||||
|
|
||||||
void Resource::did_load(Badge<ResourceLoader>, ReadonlyBytes data, HashMap<DeprecatedString, DeprecatedString, CaseInsensitiveStringTraits> const& headers, Optional<u32> status_code)
|
void Resource::did_load(Badge<ResourceLoader>, ReadonlyBytes data, HashMap<DeprecatedString, DeprecatedString, CaseInsensitiveStringTraits> const& headers, Optional<u32> status_code)
|
||||||
|
|
|
@ -219,10 +219,10 @@ DeprecatedString XMLHttpRequest::get_text_response() const
|
||||||
charset = "UTF-8"sv;
|
charset = "UTF-8"sv;
|
||||||
|
|
||||||
// 5. Return the result of running decode on xhr’s received bytes using fallback encoding charset.
|
// 5. Return the result of running decode on xhr’s received bytes using fallback encoding charset.
|
||||||
auto* decoder = TextCodec::decoder_for(charset.value());
|
auto decoder = TextCodec::decoder_for(charset.value());
|
||||||
|
|
||||||
// If we don't support the decoder yet, let's crash instead of attempting to return something, as the result would be incorrect and create obscure bugs.
|
// If we don't support the decoder yet, let's crash instead of attempting to return something, as the result would be incorrect and create obscure bugs.
|
||||||
VERIFY(decoder);
|
VERIFY(decoder.has_value());
|
||||||
|
|
||||||
return TextCodec::convert_input_to_utf8_using_given_decoder_unless_there_is_a_byte_order_mark(*decoder, m_received_bytes);
|
return TextCodec::convert_input_to_utf8_using_given_decoder_unless_there_is_a_byte_order_mark(*decoder, m_received_bytes);
|
||||||
}
|
}
|
||||||
|
|
|
@ -882,8 +882,8 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
|
||||||
if (Utf8View { file_contents }.validate()) {
|
if (Utf8View { file_contents }.validate()) {
|
||||||
builder.append(source);
|
builder.append(source);
|
||||||
} else {
|
} else {
|
||||||
auto* decoder = TextCodec::decoder_for("windows-1252"sv);
|
auto decoder = TextCodec::decoder_for("windows-1252"sv);
|
||||||
VERIFY(decoder);
|
VERIFY(decoder.has_value());
|
||||||
|
|
||||||
auto utf8_source = TextCodec::convert_input_to_utf8_using_given_decoder_unless_there_is_a_byte_order_mark(*decoder, source);
|
auto utf8_source = TextCodec::convert_input_to_utf8_using_given_decoder_unless_there_is_a_byte_order_mark(*decoder, source);
|
||||||
builder.append(utf8_source);
|
builder.append(utf8_source);
|
||||||
|
|
Loading…
Add table
Reference in a new issue