mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-21 23:20:20 +00:00
LibTextCodec+LibURL: Implement utf-8
and euc-jp
encoders
Implements the corresponding encoders and selects the appropriate one when encoding URL search params. If an encoder for the given encoding cannot be found, fall back to UTF-8.
This commit is contained in:
parent
82a63e350c
commit
72d0e3284b
Notes:
github-actions[bot]
2024-08-08 16:51:38 +00:00
Author: https://github.com/BenJilks Commit: https://github.com/LadybirdBrowser/ladybird/commit/72d0e3284b6 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/975 Reviewed-by: https://github.com/shannonbooth Reviewed-by: https://github.com/skyrising Reviewed-by: https://github.com/tcl3 ✅
11 changed files with 260 additions and 22 deletions
|
@ -21,6 +21,7 @@ struct LookupTable {
|
|||
u32 max_code_point;
|
||||
Vector<u32> code_points;
|
||||
bool generate_accessor;
|
||||
bool generate_inverse_accessor;
|
||||
};
|
||||
|
||||
struct LookupTables {
|
||||
|
@ -33,7 +34,12 @@ enum class GenerateAccessor {
|
|||
Yes,
|
||||
};
|
||||
|
||||
LookupTable prepare_table(JsonArray const& data, GenerateAccessor generate_accessor = GenerateAccessor::No)
|
||||
// Selects whether prepare_table() flags a table as needing an inverse
// (code point -> pointer) accessor, i.e. `code_point_<name>_index()`.
// prepare_table() defaults this to No.
enum class GenerateInverseAccessor {
|
||||
No,
|
||||
Yes,
|
||||
};
|
||||
|
||||
LookupTable prepare_table(JsonArray const& data, GenerateAccessor generate_accessor = GenerateAccessor::No, GenerateInverseAccessor generate_inverse_accessor = GenerateInverseAccessor::No)
|
||||
{
|
||||
Vector<u32> code_points;
|
||||
code_points.ensure_capacity(data.size());
|
||||
|
@ -58,7 +64,7 @@ LookupTable prepare_table(JsonArray const& data, GenerateAccessor generate_acces
|
|||
} else {
|
||||
VERIFY(first_pointer == 0);
|
||||
}
|
||||
return { first_pointer, max, move(code_points), generate_accessor == GenerateAccessor::Yes };
|
||||
return { first_pointer, max, move(code_points), generate_accessor == GenerateAccessor::Yes, generate_inverse_accessor == GenerateInverseAccessor::Yes };
|
||||
}
|
||||
|
||||
void generate_table(SourceGenerator generator, StringView name, LookupTable& table)
|
||||
|
@ -81,6 +87,8 @@ void generate_table(SourceGenerator generator, StringView name, LookupTable& tab
|
|||
generator.appendln("\n};");
|
||||
if (table.generate_accessor)
|
||||
generator.appendln("Optional<u32> index_@name@_code_point(u32 pointer);");
|
||||
if (table.generate_inverse_accessor)
|
||||
generator.appendln("Optional<u32> code_point_@name@_index(u32 code_point);");
|
||||
}
|
||||
|
||||
ErrorOr<void> generate_header_file(LookupTables& tables, Core::File& file)
|
||||
|
@ -155,6 +163,42 @@ Optional<u32> index_@name@_code_point(u32 pointer)
|
|||
}
|
||||
}
|
||||
|
||||
// Emits the definition of `Optional<u32> code_point_<name>_index(u32 code_point)`, the inverse of
// the pointer -> code point accessor. The generated function scans `s_<name>_index` from the start
// and returns the pointer of the first entry equal to `code_point`, or an empty Optional when no
// entry matches.
//
// NOTE(review): if unmapped slots of the index are stored as a placeholder code point, looking up
// that placeholder would be reported as a match — confirm against prepare_table().
void generate_inverse_table_accessor(SourceGenerator generator, StringView name, LookupTable& table)
{
    generator.set("name", name);

    // FIXME - Doing a linear search here is really slow, should be generating
    //         some kind of reverse lookup table.

    // Everything up to the return statement is identical for both variants of the accessor.
    generator.append(R"~~~(
Optional<u32> code_point_@name@_index(u32 code_point)
{
    for (u32 i = 0; i < s_@name@_index.size(); ++i) {
        if (s_@name@_index[i] == code_point) {
)~~~");

    // When the table's first pointer is non-zero, the array offset is rebased by the generated
    // s_<name>_index_first_pointer constant; otherwise the array offset already is the pointer.
    if (table.first_pointer > 0)
        generator.append("            return s_@name@_index_first_pointer + i;\n");
    else
        generator.append("            return i;\n");

    generator.append(R"~~~(        }
    }
    return {};
}
)~~~");
}
|
||||
|
||||
ErrorOr<void> generate_implementation_file(LookupTables& tables, Core::File& file)
|
||||
{
|
||||
StringBuilder builder;
|
||||
|
@ -169,6 +213,8 @@ namespace TextCodec {
|
|||
for (auto& [key, table] : tables.indexes) {
|
||||
if (table.generate_accessor)
|
||||
generate_table_accessor(generator.fork(), key, table);
|
||||
if (table.generate_inverse_accessor)
|
||||
generate_inverse_table_accessor(generator.fork(), key, table);
|
||||
}
|
||||
|
||||
generator.appendln("\n}");
|
||||
|
@ -222,7 +268,7 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
|
|||
.indexes = {
|
||||
{ "gb18030"sv, move(gb18030_table) },
|
||||
{ "big5"sv, prepare_table(data.get("big5"sv)->as_array(), GenerateAccessor::Yes) },
|
||||
{ "jis0208"sv, prepare_table(data.get("jis0208"sv)->as_array(), GenerateAccessor::Yes) },
|
||||
{ "jis0208"sv, prepare_table(data.get("jis0208"sv)->as_array(), GenerateAccessor::Yes, GenerateInverseAccessor::Yes) },
|
||||
{ "jis0212"sv, prepare_table(data.get("jis0212"sv)->as_array(), GenerateAccessor::Yes) },
|
||||
{ "euc_kr"sv, prepare_table(data.get("euc-kr"sv)->as_array(), GenerateAccessor::Yes) },
|
||||
{ "ibm866"sv, prepare_table(data.get("ibm866"sv)->as_array()) },
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
set(TEST_SOURCES
|
||||
TestTextDecoders.cpp
|
||||
TestTextEncoders.cpp
|
||||
)
|
||||
|
||||
foreach(source IN LISTS TEST_SOURCES)
|
||||
|
|
45
Tests/LibTextCodec/TestTextEncoders.cpp
Normal file
45
Tests/LibTextCodec/TestTextEncoders.cpp
Normal file
|
@ -0,0 +1,45 @@
|
|||
/*
|
||||
* Copyright (c) 2024, Ben Jilks <benjyjilks@gmail.com>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include <LibTest/TestCase.h>
|
||||
#include <LibTextCodec/Encoder.h>
|
||||
|
||||
// Feeding U+1F600 through the UTF-8 encoder must produce its four-byte UTF-8 sequence.
TEST_CASE(test_utf8_encode)
{
    // Unicode character U+1F600 GRINNING FACE
    auto grinning_face = "\U0001F600"sv;
    TextCodec::UTF8Encoder utf8_encoder;

    // Collect every byte the encoder emits so the sequence can be inspected afterwards.
    Vector<u8> encoded;
    MUST(utf8_encoder.process(Utf8View(grinning_face), [&](u8 byte) {
        return encoded.try_append(byte);
    }));

    EXPECT(encoded.size() == 4);
    EXPECT(encoded[0] == 0xF0);
    EXPECT(encoded[1] == 0x9F);
    EXPECT(encoded[2] == 0x98);
    EXPECT(encoded[3] == 0x80);
}
|
||||
|
||||
// Exercises three EUC-JP encoder paths: the U+00A5 special case (one byte) and two code points
// that are expected to encode as two-byte sequences.
TEST_CASE(test_euc_jp_encoder)
{
    // U+A5 Yen Sign
    // U+3088 Hiragana Letter Yo
    // U+30C4 Katakana Letter Tu
    auto japanese_text = "\U000000A5\U00003088\U000030C4"sv;
    TextCodec::EUCJPEncoder euc_jp_encoder;

    // Collect every byte the encoder emits so the sequence can be inspected afterwards.
    Vector<u8> encoded;
    MUST(euc_jp_encoder.process(Utf8View(japanese_text), [&](u8 byte) {
        return encoded.try_append(byte);
    }));

    EXPECT(encoded.size() == 5);
    EXPECT(encoded[0] == 0x5C);
    EXPECT(encoded[1] == 0xA4);
    EXPECT(encoded[2] == 0xE8);
    EXPECT(encoded[3] == 0xA5);
    EXPECT(encoded[4] == 0xC4);
}
|
|
@ -2,6 +2,7 @@ include(libtextcodec_generators)
|
|||
|
||||
set(SOURCES
|
||||
Decoder.cpp
|
||||
Encoder.cpp
|
||||
)
|
||||
|
||||
generate_encoding_indexes()
|
||||
|
|
103
Userland/Libraries/LibTextCodec/Encoder.cpp
Normal file
103
Userland/Libraries/LibTextCodec/Encoder.cpp
Normal file
|
@ -0,0 +1,103 @@
|
|||
/*
|
||||
* Copyright (c) 2024, Ben Jilks <benjyjilks@gmail.com>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include <AK/Error.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <LibTextCodec/Decoder.h>
|
||||
#include <LibTextCodec/Encoder.h>
|
||||
#include <LibTextCodec/LookupTables.h>
|
||||
|
||||
namespace TextCodec {
|
||||
|
||||
// File-local encoder instances. encoder_for_exact_name() hands out references to these
// objects rather than creating a new encoder per lookup.
namespace {
|
||||
UTF8Encoder s_utf8_encoder;
|
||||
EUCJPEncoder s_euc_jp_encoder;
|
||||
}
|
||||
|
||||
// Looks up the encoder registered for `encoding`, comparing names ASCII case-insensitively.
// Logs a debug message and returns an empty Optional when no encoder exists for the name.
Optional<Encoder&> encoder_for_exact_name(StringView encoding)
{
    auto const is_named = [&](StringView candidate) {
        return encoding.equals_ignoring_ascii_case(candidate);
    };

    if (is_named("utf-8"sv))
        return s_utf8_encoder;

    if (is_named("euc-jp"sv))
        return s_euc_jp_encoder;

    dbgln("TextCodec: No encoder implemented for encoding '{}'", encoding);
    return {};
}
|
||||
|
||||
// Resolves `label` through get_standardized_encoding() and, if that yields a name, returns the
// encoder for that exact name. Returns an empty Optional when the label cannot be standardized
// or no encoder exists for the resulting name.
Optional<Encoder&> encoder_for(StringView label)
{
    auto standardized_name = get_standardized_encoding(label);
    if (!standardized_name.has_value())
        return {};

    return encoder_for_exact_name(standardized_name.value());
}
|
||||
|
||||
// https://encoding.spec.whatwg.org/#utf-8-encoder
|
||||
ErrorOr<void> UTF8Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte)
|
||||
{
|
||||
ReadonlyBytes bytes { input.bytes(), input.byte_length() };
|
||||
for (auto byte : bytes)
|
||||
TRY(on_byte(byte));
|
||||
return {};
|
||||
}
|
||||
|
||||
// https://encoding.spec.whatwg.org/#euc-jp-encoder
|
||||
ErrorOr<void> EUCJPEncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte)
|
||||
{
|
||||
for (auto item : input) {
|
||||
// 1. If code point is end-of-queue, return finished.
|
||||
|
||||
// 2. If code point is an ASCII code point, return a byte whose value is code point.
|
||||
if (is_ascii(item)) {
|
||||
TRY(on_byte(static_cast<u8>(item)));
|
||||
continue;
|
||||
}
|
||||
|
||||
// 3. If code point is U+00A5, return byte 0x5C.
|
||||
if (item == 0x00A5) {
|
||||
TRY(on_byte(static_cast<u8>(0x5C)));
|
||||
continue;
|
||||
}
|
||||
|
||||
// 4. If code point is U+203E, return byte 0x7E.
|
||||
if (item == 0x203E) {
|
||||
TRY(on_byte(static_cast<u8>(0x7E)));
|
||||
continue;
|
||||
}
|
||||
|
||||
// 5. If code point is in the range U+FF61 to U+FF9F, inclusive, return two bytes whose values are 0x8E and code point − 0xFF61 + 0xA1.
|
||||
if (item >= 0xFF61 && item <= 0xFF9F) {
|
||||
TRY(on_byte(0x8E));
|
||||
TRY(on_byte(static_cast<u8>(item - 0xFF61 + 0xA1)));
|
||||
continue;
|
||||
}
|
||||
|
||||
// 6. If code point is U+2212, set it to U+FF0D.
|
||||
if (item == 0x2212)
|
||||
item = 0xFF0D;
|
||||
|
||||
// 7. Let pointer be the index pointer for code point in index jis0208.
|
||||
auto pointer = code_point_jis0208_index(item);
|
||||
|
||||
// 8. If pointer is null, return error with code point.
|
||||
if (!pointer.has_value()) {
|
||||
// TODO: Report error.
|
||||
continue;
|
||||
}
|
||||
|
||||
// 9. Let lead be pointer / 94 + 0xA1.
|
||||
auto lead = *pointer / 94 + 0xA1;
|
||||
|
||||
// 10. Let trail be pointer % 94 + 0xA1.
|
||||
auto trail = *pointer % 94 + 0xA1;
|
||||
|
||||
// 11. Return two bytes whose values are lead and trail.
|
||||
TRY(on_byte(static_cast<u8>(lead)));
|
||||
TRY(on_byte(static_cast<u8>(trail)));
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
}
|
35
Userland/Libraries/LibTextCodec/Encoder.h
Normal file
35
Userland/Libraries/LibTextCodec/Encoder.h
Normal file
|
@ -0,0 +1,35 @@
|
|||
/*
|
||||
* Copyright (c) 2024, Ben Jilks <benjyjilks@gmail.com>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <AK/Forward.h>
|
||||
#include <AK/Function.h>
|
||||
|
||||
namespace TextCodec {
|
||||
|
||||
// Abstract interface for text encoders.
class Encoder {
|
||||
public:
|
||||
// Encodes the given view and hands the output to on_byte one byte at a time.
// Both the result and the callback use ErrorOr<void> to report failure.
virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) = 0;
|
||||
|
||||
protected:
|
||||
// Protected: code outside the class hierarchy cannot destroy an encoder through an Encoder
// pointer or reference.
virtual ~Encoder() = default;
|
||||
};
|
||||
|
||||
// Encoder for UTF-8 (https://encoding.spec.whatwg.org/#utf-8-encoder).
class UTF8Encoder final : public Encoder {
|
||||
public:
|
||||
// Passes the bytes underlying the view to on_byte, one at a time.
virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override;
|
||||
};
|
||||
|
||||
// Encoder for EUC-JP (https://encoding.spec.whatwg.org/#euc-jp-encoder).
class EUCJPEncoder final : public Encoder {
|
||||
public:
|
||||
// Passes the EUC-JP bytes for each code point of the view to on_byte.
// NOTE: Code points with no jis0208 index entry are currently skipped instead of being
//       reported as errors.
virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override;
|
||||
};
|
||||
|
||||
// Returns the encoder for an encoding name (matched ASCII case-insensitively), or an empty
// Optional if no encoder is implemented for it.
Optional<Encoder&> encoder_for_exact_name(StringView encoding);
|
||||
// Standardizes `label` with get_standardized_encoding() and then looks up the encoder for the
// resulting name. Returns an empty Optional if either step fails.
Optional<Encoder&> encoder_for(StringView label);
|
||||
|
||||
}
|
|
@ -4,4 +4,4 @@ set(SOURCES
|
|||
)
|
||||
|
||||
serenity_lib(LibURL url)
|
||||
target_link_libraries(LibURL PRIVATE LibUnicode)
|
||||
target_link_libraries(LibURL PRIVATE LibUnicode LibTextCodec)
|
||||
|
|
|
@ -14,6 +14,8 @@
|
|||
#include <AK/StringBuilder.h>
|
||||
#include <AK/StringUtils.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <LibTextCodec/Decoder.h>
|
||||
#include <LibTextCodec/Encoder.h>
|
||||
#include <LibURL/Parser.h>
|
||||
#include <LibUnicode/IDNA.h>
|
||||
|
||||
|
@ -768,18 +770,17 @@ void Parser::shorten_urls_path(URL& url)
|
|||
}
|
||||
|
||||
// https://url.spec.whatwg.org/#string-percent-encode-after-encoding
|
||||
ErrorOr<String> Parser::percent_encode_after_encoding(StringView input, PercentEncodeSet percent_encode_set, bool space_as_plus)
|
||||
ErrorOr<String> Parser::percent_encode_after_encoding(TextCodec::Encoder& encoder, StringView input, PercentEncodeSet percent_encode_set, bool space_as_plus)
|
||||
{
|
||||
// NOTE: This is written somewhat ad-hoc since we don't yet implement the Encoding spec.
|
||||
|
||||
// 1. Let encodeOutput be an empty I/O queue.
|
||||
StringBuilder output;
|
||||
|
||||
// 3. For each byte of encodeOutput converted to a byte sequence:
|
||||
for (u8 byte : input) {
|
||||
TRY(encoder.process(Utf8View(input), [&](u8 byte) -> ErrorOr<void> {
|
||||
// 1. If spaceAsPlus is true and byte is 0x20 (SP), then append U+002B (+) to output and continue.
|
||||
if (space_as_plus && byte == ' ') {
|
||||
output.append('+');
|
||||
continue;
|
||||
return {};
|
||||
}
|
||||
|
||||
// 2. Let isomorph be a code point whose value is byte’s value.
|
||||
|
@ -796,7 +797,9 @@ ErrorOr<String> Parser::percent_encode_after_encoding(StringView input, PercentE
|
|||
else {
|
||||
output.appendff("%{:02X}", byte);
|
||||
}
|
||||
}
|
||||
|
||||
return {};
|
||||
}));
|
||||
|
||||
// 6. Return output.
|
||||
return output.to_string();
|
||||
|
@ -851,7 +854,9 @@ URL Parser::basic_parse(StringView raw_input, Optional<URL> const& base_url, Opt
|
|||
// 4. Let state be state override if given, or scheme start state otherwise.
|
||||
State state = state_override.value_or(State::SchemeStart);
|
||||
|
||||
// FIXME: 5. Set encoding to the result of getting an output encoding from encoding.
|
||||
// 5. Set encoding to the result of getting an output encoding from encoding.
|
||||
auto encoder = TextCodec::encoder_for("utf-8"sv);
|
||||
VERIFY(encoder.has_value());
|
||||
|
||||
// 6. Let buffer be the empty string.
|
||||
StringBuilder buffer;
|
||||
|
@ -1684,7 +1689,7 @@ URL Parser::basic_parse(StringView raw_input, Optional<URL> const& base_url, Opt
|
|||
auto query_percent_encode_set = url->is_special() ? PercentEncodeSet::SpecialQuery : PercentEncodeSet::Query;
|
||||
|
||||
// 2. Percent-encode after encoding, with encoding, buffer, and queryPercentEncodeSet, and append the result to url’s query.
|
||||
url->m_data->query = percent_encode_after_encoding(buffer.string_view(), query_percent_encode_set).release_value_but_fixme_should_propagate_errors();
|
||||
url->m_data->query = percent_encode_after_encoding(*encoder, buffer.string_view(), query_percent_encode_set).release_value_but_fixme_should_propagate_errors();
|
||||
|
||||
// 3. Set buffer to the empty string.
|
||||
buffer.clear();
|
||||
|
@ -1726,7 +1731,7 @@ URL Parser::basic_parse(StringView raw_input, Optional<URL> const& base_url, Opt
|
|||
// NOTE: The percent-encode is done on EOF on the entire buffer.
|
||||
buffer.append_code_point(code_point);
|
||||
} else {
|
||||
url->m_data->fragment = percent_encode_after_encoding(buffer.string_view(), PercentEncodeSet::Fragment).release_value_but_fixme_should_propagate_errors();
|
||||
url->m_data->fragment = percent_encode_after_encoding(*encoder, buffer.string_view(), PercentEncodeSet::Fragment).release_value_but_fixme_should_propagate_errors();
|
||||
buffer.clear();
|
||||
}
|
||||
break;
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
|
||||
#include <AK/Optional.h>
|
||||
#include <AK/StringView.h>
|
||||
#include <LibTextCodec/Encoder.h>
|
||||
#include <LibURL/URL.h>
|
||||
|
||||
namespace URL {
|
||||
|
@ -60,7 +61,7 @@ public:
|
|||
static URL basic_parse(StringView input, Optional<URL> const& base_url = {}, Optional<URL> url = {}, Optional<State> state_override = {});
|
||||
|
||||
// https://url.spec.whatwg.org/#string-percent-encode-after-encoding
|
||||
static ErrorOr<String> percent_encode_after_encoding(StringView input, PercentEncodeSet percent_encode_set, bool space_as_plus = false);
|
||||
static ErrorOr<String> percent_encode_after_encoding(TextCodec::Encoder&, StringView input, PercentEncodeSet percent_encode_set, bool space_as_plus = false);
|
||||
|
||||
// https://url.spec.whatwg.org/#concept-host-serializer
|
||||
static ErrorOr<String> serialize_host(Host const&);
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
#include <AK/StringBuilder.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <LibTextCodec/Decoder.h>
|
||||
#include <LibTextCodec/Encoder.h>
|
||||
#include <LibURL/Parser.h>
|
||||
#include <LibWeb/Bindings/ExceptionOrUtils.h>
|
||||
#include <LibWeb/Bindings/Intrinsics.h>
|
||||
|
@ -47,6 +48,12 @@ ErrorOr<String> url_encode(Vector<QueryParam> const& tuples, StringView encoding
|
|||
// 1. Set encoding to the result of getting an output encoding from encoding.
|
||||
encoding = TextCodec::get_output_encoding(encoding);
|
||||
|
||||
auto encoder = TextCodec::encoder_for(encoding);
|
||||
if (!encoder.has_value()) {
|
||||
// NOTE: Fallback to default utf-8 encoder.
|
||||
encoder = TextCodec::encoder_for("utf-8"sv);
|
||||
}
|
||||
|
||||
// 2. Let output be the empty string.
|
||||
StringBuilder output;
|
||||
|
||||
|
@ -55,12 +62,10 @@ ErrorOr<String> url_encode(Vector<QueryParam> const& tuples, StringView encoding
|
|||
// 1. Assert: tuple’s name and tuple’s value are scalar value strings.
|
||||
|
||||
// 2. Let name be the result of running percent-encode after encoding with encoding, tuple’s name, the application/x-www-form-urlencoded percent-encode set, and true.
|
||||
// FIXME: URL::Parser does not currently implement encoding.
|
||||
auto name = TRY(URL::Parser::percent_encode_after_encoding(tuple.name, URL::PercentEncodeSet::ApplicationXWWWFormUrlencoded, true));
|
||||
auto name = TRY(URL::Parser::percent_encode_after_encoding(*encoder, tuple.name, URL::PercentEncodeSet::ApplicationXWWWFormUrlencoded, true));
|
||||
|
||||
// 3. Let value be the result of running percent-encode after encoding with encoding, tuple’s value, the application/x-www-form-urlencoded percent-encode set, and true.
|
||||
// FIXME: URL::Parser does not currently implement encoding.
|
||||
auto value = TRY(URL::Parser::percent_encode_after_encoding(tuple.value, URL::PercentEncodeSet::ApplicationXWWWFormUrlencoded, true));
|
||||
auto value = TRY(URL::Parser::percent_encode_after_encoding(*encoder, tuple.value, URL::PercentEncodeSet::ApplicationXWWWFormUrlencoded, true));
|
||||
|
||||
// 4. If output is not the empty string, then append U+0026 (&) to output.
|
||||
if (!output.is_empty())
|
||||
|
|
|
@ -160,10 +160,6 @@ WebIDL::ExceptionOr<void> HTMLFormElement::submit_form(JS::NonnullGCPtr<HTMLElem
|
|||
|
||||
// 6. Let encoding be the result of picking an encoding for the form.
|
||||
auto encoding = TRY_OR_THROW_OOM(vm, pick_an_encoding());
|
||||
if (encoding != "UTF-8"sv) {
|
||||
dbgln("FIXME: Support encodings other than UTF-8 in form submission. Returning from form submission.");
|
||||
return {};
|
||||
}
|
||||
|
||||
// 7. Let entry list be the result of constructing the entry list with form, submitter, and encoding.
|
||||
auto entry_list_or_null = TRY(construct_entry_list(realm, *this, submitter, encoding));
|
||||
|
|
Loading…
Reference in a new issue