From 72d0e3284b604c4c1373fb019250cdf5bd492300 Mon Sep 17 00:00:00 2001 From: BenJilks Date: Mon, 5 Aug 2024 16:03:53 +0100 Subject: [PATCH] LibTextCodec+LibURL: Implement `utf-8` and `euc-jp` encoders Implements the corresponding encoders, selects the appropriate one when encoding URL search params. If an encoder for the given encoding could not be found, fallback to utf-8. --- .../LibTextCodec/GenerateEncodingIndexes.cpp | 52 ++++++++- Tests/LibTextCodec/CMakeLists.txt | 1 + Tests/LibTextCodec/TestTextEncoders.cpp | 45 ++++++++ .../Libraries/LibTextCodec/CMakeLists.txt | 1 + Userland/Libraries/LibTextCodec/Encoder.cpp | 103 ++++++++++++++++++ Userland/Libraries/LibTextCodec/Encoder.h | 35 ++++++ Userland/Libraries/LibURL/CMakeLists.txt | 2 +- Userland/Libraries/LibURL/Parser.cpp | 23 ++-- Userland/Libraries/LibURL/Parser.h | 3 +- .../LibWeb/DOMURL/URLSearchParams.cpp | 13 ++- .../Libraries/LibWeb/HTML/HTMLFormElement.cpp | 4 - 11 files changed, 260 insertions(+), 22 deletions(-) create mode 100644 Tests/LibTextCodec/TestTextEncoders.cpp create mode 100644 Userland/Libraries/LibTextCodec/Encoder.cpp create mode 100644 Userland/Libraries/LibTextCodec/Encoder.h diff --git a/Meta/Lagom/Tools/CodeGenerators/LibTextCodec/GenerateEncodingIndexes.cpp b/Meta/Lagom/Tools/CodeGenerators/LibTextCodec/GenerateEncodingIndexes.cpp index 0c64138f25a..3f268c15c39 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibTextCodec/GenerateEncodingIndexes.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibTextCodec/GenerateEncodingIndexes.cpp @@ -21,6 +21,7 @@ struct LookupTable { u32 max_code_point; Vector code_points; bool generate_accessor; + bool generate_inverse_accessor; }; struct LookupTables { @@ -33,7 +34,12 @@ enum class GenerateAccessor { Yes, }; -LookupTable prepare_table(JsonArray const& data, GenerateAccessor generate_accessor = GenerateAccessor::No) +enum class GenerateInverseAccessor { + No, + Yes, +}; + +LookupTable prepare_table(JsonArray const& data, GenerateAccessor generate_accessor = GenerateAccessor::No, GenerateInverseAccessor generate_inverse_accessor = GenerateInverseAccessor::No) { Vector code_points; code_points.ensure_capacity(data.size()); @@ -58,7 +64,7 @@ LookupTable prepare_table(JsonArray const& data, GenerateAccessor generate_acces } else { VERIFY(first_pointer == 0); } - return { first_pointer, max, move(code_points), generate_accessor == GenerateAccessor::Yes }; + return { first_pointer, max, move(code_points), generate_accessor == GenerateAccessor::Yes, generate_inverse_accessor == GenerateInverseAccessor::Yes }; } void generate_table(SourceGenerator generator, StringView name, LookupTable& table) @@ -81,6 +87,8 @@ void generate_table(SourceGenerator generator, StringView name, LookupTable& tab generator.appendln("\n};"); if (table.generate_accessor) generator.appendln("Optional index_@name@_code_point(u32 pointer);"); + if (table.generate_inverse_accessor) + generator.appendln("Optional code_point_@name@_index(u32 code_point);"); } ErrorOr generate_header_file(LookupTables& tables, Core::File& file) @@ -155,6 +163,42 @@ Optional index_@name@_code_point(u32 pointer) } } +void generate_inverse_table_accessor(SourceGenerator generator, StringView name, LookupTable& table) +{ + generator.set("name", name); + generator.set("first_pointer", MUST(String::number(table.first_pointer))); + generator.set("size", MUST(String::number(table.code_points.size()))); + + // FIXME - Doing a linear search here is really slow, should be generating + // some kind of reverse lookup table. + + if (table.first_pointer > 0) { + generator.append(R"~~~( +Optional code_point_@name@_index(u32 code_point) +{ + for (u32 i = 0; i < s_@name@_index.size(); ++i) { + if (s_@name@_index[i] == code_point) { + return s_@name@_index_first_pointer + i; + } + } + return {}; +} +)~~~"); + } else { + generator.append(R"~~~( +Optional code_point_@name@_index(u32 code_point) +{ + for (u32 i = 0; i < s_@name@_index.size(); ++i) { + if (s_@name@_index[i] == code_point) { + return i; + } + } + return {}; +} +)~~~"); + } +} + ErrorOr generate_implementation_file(LookupTables& tables, Core::File& file) { StringBuilder builder; @@ -169,6 +213,8 @@ namespace TextCodec { for (auto& [key, table] : tables.indexes) { if (table.generate_accessor) generate_table_accessor(generator.fork(), key, table); + if (table.generate_inverse_accessor) + generate_inverse_table_accessor(generator.fork(), key, table); } generator.appendln("\n}"); @@ -222,7 +268,7 @@ ErrorOr serenity_main(Main::Arguments arguments) .indexes = { { "gb18030"sv, move(gb18030_table) }, { "big5"sv, prepare_table(data.get("big5"sv)->as_array(), GenerateAccessor::Yes) }, - { "jis0208"sv, prepare_table(data.get("jis0208"sv)->as_array(), GenerateAccessor::Yes) }, + { "jis0208"sv, prepare_table(data.get("jis0208"sv)->as_array(), GenerateAccessor::Yes, GenerateInverseAccessor::Yes) }, { "jis0212"sv, prepare_table(data.get("jis0212"sv)->as_array(), GenerateAccessor::Yes) }, { "euc_kr"sv, prepare_table(data.get("euc-kr"sv)->as_array(), GenerateAccessor::Yes) }, { "ibm866"sv, prepare_table(data.get("ibm866"sv)->as_array()) }, diff --git a/Tests/LibTextCodec/CMakeLists.txt b/Tests/LibTextCodec/CMakeLists.txt index ec39ad694f1..9cfade466c6 100644 --- a/Tests/LibTextCodec/CMakeLists.txt +++ b/Tests/LibTextCodec/CMakeLists.txt @@ -1,5 +1,6 @@ set(TEST_SOURCES TestTextDecoders.cpp + TestTextEncoders.cpp ) foreach(source IN LISTS TEST_SOURCES) diff --git a/Tests/LibTextCodec/TestTextEncoders.cpp b/Tests/LibTextCodec/TestTextEncoders.cpp new file mode 100644 index 00000000000..17c298123f6 --- /dev/null +++ b/Tests/LibTextCodec/TestTextEncoders.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2024, Ben Jilks + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include +#include + +TEST_CASE(test_utf8_encode) +{ + TextCodec::UTF8Encoder encoder; + // Unicode character U+1F600 GRINNING FACE + auto test_string = "\U0001F600"sv; + + Vector processed_bytes; + MUST(encoder.process(Utf8View(test_string), [&](u8 byte) { + return processed_bytes.try_append(byte); + })); + EXPECT(processed_bytes.size() == 4); + EXPECT(processed_bytes[0] == 0xF0); + EXPECT(processed_bytes[1] == 0x9F); + EXPECT(processed_bytes[2] == 0x98); + EXPECT(processed_bytes[3] == 0x80); +} + +TEST_CASE(test_euc_jp_encoder) +{ + TextCodec::EUCJPEncoder encoder; + // U+A5 Yen Sign + // U+3088 Hiragana Letter Yo + // U+30C4 Katakana Letter Tu + auto test_string = "\U000000A5\U00003088\U000030C4"sv; + + Vector processed_bytes; + MUST(encoder.process(Utf8View(test_string), [&](u8 byte) { + return processed_bytes.try_append(byte); + })); + EXPECT(processed_bytes.size() == 5); + EXPECT(processed_bytes[0] == 0x5C); + EXPECT(processed_bytes[1] == 0xA4); + EXPECT(processed_bytes[2] == 0xE8); + EXPECT(processed_bytes[3] == 0xA5); + EXPECT(processed_bytes[4] == 0xC4); +} diff --git a/Userland/Libraries/LibTextCodec/CMakeLists.txt b/Userland/Libraries/LibTextCodec/CMakeLists.txt index 152024d2eb1..6fb8fda0d6f 100644 --- a/Userland/Libraries/LibTextCodec/CMakeLists.txt +++ b/Userland/Libraries/LibTextCodec/CMakeLists.txt @@ -2,6 +2,7 @@ include(libtextcodec_generators) set(SOURCES Decoder.cpp + Encoder.cpp ) generate_encoding_indexes() diff --git a/Userland/Libraries/LibTextCodec/Encoder.cpp b/Userland/Libraries/LibTextCodec/Encoder.cpp new file mode 100644 index 00000000000..b41cce99de3 --- /dev/null +++ b/Userland/Libraries/LibTextCodec/Encoder.cpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2024, Ben Jilks + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include +#include +#include +#include +#include + +namespace TextCodec { + +namespace { +UTF8Encoder s_utf8_encoder; +EUCJPEncoder s_euc_jp_encoder; +} + +Optional encoder_for_exact_name(StringView encoding) +{ + if (encoding.equals_ignoring_ascii_case("utf-8"sv)) + return s_utf8_encoder; + if (encoding.equals_ignoring_ascii_case("euc-jp"sv)) + return s_euc_jp_encoder; + dbgln("TextCodec: No encoder implemented for encoding '{}'", encoding); + return {}; +} + +Optional encoder_for(StringView label) +{ + auto encoding = get_standardized_encoding(label); + return encoding.has_value() ? encoder_for_exact_name(encoding.value()) : Optional {}; +} + +// https://encoding.spec.whatwg.org/#utf-8-encoder +ErrorOr UTF8Encoder::process(Utf8View input, Function(u8)> on_byte) +{ + ReadonlyBytes bytes { input.bytes(), input.byte_length() }; + for (auto byte : bytes) + TRY(on_byte(byte)); + return {}; +} + +// https://encoding.spec.whatwg.org/#euc-jp-encoder +ErrorOr EUCJPEncoder::process(Utf8View input, Function(u8)> on_byte) +{ + for (auto item : input) { + // 1. If code point is end-of-queue, return finished. + + // 2. If code point is an ASCII code point, return a byte whose value is code point. + if (is_ascii(item)) { + TRY(on_byte(static_cast(item))); + continue; + } + + // 3. If code point is U+00A5, return byte 0x5C. + if (item == 0x00A5) { + TRY(on_byte(static_cast(0x5C))); + continue; + } + + // 4. If code point is U+203E, return byte 0x7E. + if (item == 0x203E) { + TRY(on_byte(static_cast(0x7E))); + continue; + } + + // 5. If code point is in the range U+FF61 to U+FF9F, inclusive, return two bytes whose values are 0x8E and code point − 0xFF61 + 0xA1. + if (item >= 0xFF61 && item <= 0xFF9F) { + TRY(on_byte(0x8E)); + TRY(on_byte(static_cast(item - 0xFF61 + 0xA1))); + continue; + } + + // 6. If code point is U+2212, set it to U+FF0D. + if (item == 0x2212) + item = 0xFF0D; + + // 7. Let pointer be the index pointer for code point in index jis0208. + auto pointer = code_point_jis0208_index(item); + + // 8. If pointer is null, return error with code point. + if (!pointer.has_value()) { + // TODO: Report error. + continue; + } + + // 9. Let lead be pointer / 94 + 0xA1. + auto lead = *pointer / 94 + 0xA1; + + // 10. Let trail be pointer % 94 + 0xA1. + auto trail = *pointer % 94 + 0xA1; + + // 11. Return two bytes whose values are lead and trail. + TRY(on_byte(static_cast(lead))); + TRY(on_byte(static_cast(trail))); + } + + return {}; +} + +} diff --git a/Userland/Libraries/LibTextCodec/Encoder.h b/Userland/Libraries/LibTextCodec/Encoder.h new file mode 100644 index 00000000000..b3b0c0423dc --- /dev/null +++ b/Userland/Libraries/LibTextCodec/Encoder.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024, Ben Jilks + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include +#include + +namespace TextCodec { + +class Encoder { +public: + virtual ErrorOr process(Utf8View, Function(u8)> on_byte) = 0; + +protected: + virtual ~Encoder() = default; +}; + +class UTF8Encoder final : public Encoder { +public: + virtual ErrorOr process(Utf8View, Function(u8)> on_byte) override; +}; + +class EUCJPEncoder final : public Encoder { +public: + virtual ErrorOr process(Utf8View, Function(u8)> on_byte) override; +}; + +Optional encoder_for_exact_name(StringView encoding); +Optional encoder_for(StringView label); + +} diff --git a/Userland/Libraries/LibURL/CMakeLists.txt b/Userland/Libraries/LibURL/CMakeLists.txt index 73802ab5de9..4cd8e745910 100644 --- a/Userland/Libraries/LibURL/CMakeLists.txt +++ b/Userland/Libraries/LibURL/CMakeLists.txt @@ -4,4 +4,4 @@ set(SOURCES ) serenity_lib(LibURL url) -target_link_libraries(LibURL PRIVATE LibUnicode) +target_link_libraries(LibURL PRIVATE LibUnicode LibTextCodec) diff --git a/Userland/Libraries/LibURL/Parser.cpp b/Userland/Libraries/LibURL/Parser.cpp index f69afda01bd..2be0e708968 100644 --- a/Userland/Libraries/LibURL/Parser.cpp +++ b/Userland/Libraries/LibURL/Parser.cpp @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include @@ -768,18 +770,17 @@ void Parser::shorten_urls_path(URL& url) } // https://url.spec.whatwg.org/#string-percent-encode-after-encoding -ErrorOr Parser::percent_encode_after_encoding(StringView input, PercentEncodeSet percent_encode_set, bool space_as_plus) +ErrorOr Parser::percent_encode_after_encoding(TextCodec::Encoder& encoder, StringView input, PercentEncodeSet percent_encode_set, bool space_as_plus) { - // NOTE: This is written somewhat ad-hoc since we don't yet implement the Encoding spec. - + // 1. Let encodeOutput be an empty I/O queue. StringBuilder output; // 3. For each byte of encodeOutput converted to a byte sequence: - for (u8 byte : input) { + TRY(encoder.process(Utf8View(input), [&](u8 byte) -> ErrorOr { // 1. If spaceAsPlus is true and byte is 0x20 (SP), then append U+002B (+) to output and continue. if (space_as_plus && byte == ' ') { output.append('+'); - continue; + return {}; } // 2. Let isomorph be a code point whose value is byte’s value. @@ -796,7 +797,9 @@ ErrorOr Parser::percent_encode_after_encoding(StringView input, PercentE else { output.appendff("%{:02X}", byte); } - } + + return {}; + })); // 6. Return output. return output.to_string(); @@ -851,7 +854,9 @@ URL Parser::basic_parse(StringView raw_input, Optional const& base_url, Opt // 4. Let state be state override if given, or scheme start state otherwise. State state = state_override.value_or(State::SchemeStart); - // FIXME: 5. Set encoding to the result of getting an output encoding from encoding. + // 5. Set encoding to the result of getting an output encoding from encoding. + auto encoder = TextCodec::encoder_for("utf-8"sv); + VERIFY(encoder.has_value()); // 6. Let buffer be the empty string. StringBuilder buffer; @@ -1684,7 +1689,7 @@ URL Parser::basic_parse(StringView raw_input, Optional const& base_url, Opt auto query_percent_encode_set = url->is_special() ? PercentEncodeSet::SpecialQuery : PercentEncodeSet::Query; // 2. Percent-encode after encoding, with encoding, buffer, and queryPercentEncodeSet, and append the result to url’s query. - url->m_data->query = percent_encode_after_encoding(buffer.string_view(), query_percent_encode_set).release_value_but_fixme_should_propagate_errors(); + url->m_data->query = percent_encode_after_encoding(*encoder, buffer.string_view(), query_percent_encode_set).release_value_but_fixme_should_propagate_errors(); // 3. Set buffer to the empty string. buffer.clear(); @@ -1726,7 +1731,7 @@ URL Parser::basic_parse(StringView raw_input, Optional const& base_url, Opt // NOTE: The percent-encode is done on EOF on the entire buffer. buffer.append_code_point(code_point); } else { - url->m_data->fragment = percent_encode_after_encoding(buffer.string_view(), PercentEncodeSet::Fragment).release_value_but_fixme_should_propagate_errors(); + url->m_data->fragment = percent_encode_after_encoding(*encoder, buffer.string_view(), PercentEncodeSet::Fragment).release_value_but_fixme_should_propagate_errors(); buffer.clear(); } break; diff --git a/Userland/Libraries/LibURL/Parser.h b/Userland/Libraries/LibURL/Parser.h index 185e2a2126f..6f4d04e4ef9 100644 --- a/Userland/Libraries/LibURL/Parser.h +++ b/Userland/Libraries/LibURL/Parser.h @@ -9,6 +9,7 @@ #include #include +#include #include namespace URL { @@ -60,7 +61,7 @@ public: static URL basic_parse(StringView input, Optional const& base_url = {}, Optional url = {}, Optional state_override = {}); // https://url.spec.whatwg.org/#string-percent-encode-after-encoding - static ErrorOr percent_encode_after_encoding(StringView input, PercentEncodeSet percent_encode_set, bool space_as_plus = false); + static ErrorOr percent_encode_after_encoding(TextCodec::Encoder&, StringView input, PercentEncodeSet percent_encode_set, bool space_as_plus = false); // https://url.spec.whatwg.org/#concept-host-serializer static ErrorOr serialize_host(Host const&); diff --git a/Userland/Libraries/LibWeb/DOMURL/URLSearchParams.cpp b/Userland/Libraries/LibWeb/DOMURL/URLSearchParams.cpp index 0ea85c69398..b84347f7c85 100644 --- a/Userland/Libraries/LibWeb/DOMURL/URLSearchParams.cpp +++ b/Userland/Libraries/LibWeb/DOMURL/URLSearchParams.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -47,6 +48,12 @@ ErrorOr url_encode(Vector const& tuples, StringView encoding // 1. Set encoding to the result of getting an output encoding from encoding. encoding = TextCodec::get_output_encoding(encoding); + auto encoder = TextCodec::encoder_for(encoding); + if (!encoder.has_value()) { + // NOTE: Fallback to default utf-8 encoder. + encoder = TextCodec::encoder_for("utf-8"sv); + } + // 2. Let output be the empty string. StringBuilder output; @@ -55,12 +62,10 @@ ErrorOr url_encode(Vector const& tuples, StringView encoding // 1. Assert: tuple’s name and tuple’s value are scalar value strings. // 2. Let name be the result of running percent-encode after encoding with encoding, tuple’s name, the application/x-www-form-urlencoded percent-encode set, and true. - // FIXME: URL::Parser does not currently implement encoding. - auto name = TRY(URL::Parser::percent_encode_after_encoding(tuple.name, URL::PercentEncodeSet::ApplicationXWWWFormUrlencoded, true)); + auto name = TRY(URL::Parser::percent_encode_after_encoding(*encoder, tuple.name, URL::PercentEncodeSet::ApplicationXWWWFormUrlencoded, true)); // 3. Let value be the result of running percent-encode after encoding with encoding, tuple’s value, the application/x-www-form-urlencoded percent-encode set, and true. - // FIXME: URL::Parser does not currently implement encoding. - auto value = TRY(URL::Parser::percent_encode_after_encoding(tuple.value, URL::PercentEncodeSet::ApplicationXWWWFormUrlencoded, true)); + auto value = TRY(URL::Parser::percent_encode_after_encoding(*encoder, tuple.value, URL::PercentEncodeSet::ApplicationXWWWFormUrlencoded, true)); // 4. If output is not the empty string, then append U+0026 (&) to output. if (!output.is_empty()) diff --git a/Userland/Libraries/LibWeb/HTML/HTMLFormElement.cpp b/Userland/Libraries/LibWeb/HTML/HTMLFormElement.cpp index 43e4d5a4e89..e499a543176 100644 --- a/Userland/Libraries/LibWeb/HTML/HTMLFormElement.cpp +++ b/Userland/Libraries/LibWeb/HTML/HTMLFormElement.cpp @@ -160,10 +160,6 @@ WebIDL::ExceptionOr HTMLFormElement::submit_form(JS::NonnullGCPtr