Jelajahi Sumber

LibTextCodec+LibURL: Implement `utf-8` and `euc-jp` encoders

Implements the corresponding encoders and selects the appropriate one when
encoding URL search params. If an encoder for the given encoding cannot be
found, fall back to utf-8.
BenJilks 1 tahun lalu
induk
melakukan
72d0e3284b

+ 49 - 3
Meta/Lagom/Tools/CodeGenerators/LibTextCodec/GenerateEncodingIndexes.cpp

@@ -21,6 +21,7 @@ struct LookupTable {
     u32 max_code_point;
     Vector<u32> code_points;
     bool generate_accessor;
+    bool generate_inverse_accessor;
 };
 
 struct LookupTables {
@@ -33,7 +34,12 @@ enum class GenerateAccessor {
     Yes,
 };
 
-LookupTable prepare_table(JsonArray const& data, GenerateAccessor generate_accessor = GenerateAccessor::No)
+enum class GenerateInverseAccessor { // Tells prepare_table() whether to flag the table for a code point -> pointer accessor.
+    No,
+    Yes,
+};
+
+LookupTable prepare_table(JsonArray const& data, GenerateAccessor generate_accessor = GenerateAccessor::No, GenerateInverseAccessor generate_inverse_accessor = GenerateInverseAccessor::No)
 {
     Vector<u32> code_points;
     code_points.ensure_capacity(data.size());
@@ -58,7 +64,7 @@ LookupTable prepare_table(JsonArray const& data, GenerateAccessor generate_acces
     } else {
         VERIFY(first_pointer == 0);
     }
-    return { first_pointer, max, move(code_points), generate_accessor == GenerateAccessor::Yes };
+    return { first_pointer, max, move(code_points), generate_accessor == GenerateAccessor::Yes, generate_inverse_accessor == GenerateInverseAccessor::Yes };
 }
 
 void generate_table(SourceGenerator generator, StringView name, LookupTable& table)
@@ -81,6 +87,8 @@ void generate_table(SourceGenerator generator, StringView name, LookupTable& tab
     generator.appendln("\n};");
     if (table.generate_accessor)
         generator.appendln("Optional<u32> index_@name@_code_point(u32 pointer);");
+    if (table.generate_inverse_accessor)
+        generator.appendln("Optional<u32> code_point_@name@_index(u32 code_point);");
 }
 
 ErrorOr<void> generate_header_file(LookupTables& tables, Core::File& file)
@@ -155,6 +163,42 @@ Optional<u32> index_@name@_code_point(u32 pointer)
     }
 }
 
+// Emits the definition of `code_point_<name>_index(u32 code_point)`, the
+// inverse of the pointer -> code point accessor for the given table.
+//
+// The generated function scans `s_<name>_index` from the start and returns the
+// pointer of the first entry equal to `code_point`, or an empty Optional when
+// no entry matches. For tables that do not start at pointer 0, the array
+// position is offset by `s_<name>_index_first_pointer`.
+//
+// NOTE(review): if unmapped slots in the table are filled with a placeholder
+// code point, looking up that placeholder will "succeed" and yield the pointer
+// of an unmapped slot - confirm against prepare_table().
+void generate_inverse_table_accessor(SourceGenerator generator, StringView name, LookupTable& table)
+{
+    // Only @name@ is referenced by the templates below.
+    generator.set("name", name);
+
+    // FIXME - Doing a linear search here is really slow, should be generating
+    //         some kind of reverse lookup table.
+
+    if (table.first_pointer > 0) {
+        generator.append(R"~~~(
+Optional<u32> code_point_@name@_index(u32 code_point)
+{
+    for (u32 i = 0; i < s_@name@_index.size(); ++i) {
+        if (s_@name@_index[i] == code_point) {
+            return s_@name@_index_first_pointer + i;
+        }
+    }
+    return {};
+}
+)~~~");
+    } else {
+        generator.append(R"~~~(
+Optional<u32> code_point_@name@_index(u32 code_point)
+{
+    for (u32 i = 0; i < s_@name@_index.size(); ++i) {
+        if (s_@name@_index[i] == code_point) {
+            return i;
+        }
+    }
+    return {};
+}
+)~~~");
+    }
+}
+
 ErrorOr<void> generate_implementation_file(LookupTables& tables, Core::File& file)
 {
     StringBuilder builder;
@@ -169,6 +213,8 @@ namespace TextCodec {
     for (auto& [key, table] : tables.indexes) {
         if (table.generate_accessor)
             generate_table_accessor(generator.fork(), key, table);
+        if (table.generate_inverse_accessor)
+            generate_inverse_table_accessor(generator.fork(), key, table);
     }
 
     generator.appendln("\n}");
@@ -222,7 +268,7 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
         .indexes = {
             { "gb18030"sv, move(gb18030_table) },
             { "big5"sv, prepare_table(data.get("big5"sv)->as_array(), GenerateAccessor::Yes) },
-            { "jis0208"sv, prepare_table(data.get("jis0208"sv)->as_array(), GenerateAccessor::Yes) },
+            { "jis0208"sv, prepare_table(data.get("jis0208"sv)->as_array(), GenerateAccessor::Yes, GenerateInverseAccessor::Yes) },
             { "jis0212"sv, prepare_table(data.get("jis0212"sv)->as_array(), GenerateAccessor::Yes) },
             { "euc_kr"sv, prepare_table(data.get("euc-kr"sv)->as_array(), GenerateAccessor::Yes) },
             { "ibm866"sv, prepare_table(data.get("ibm866"sv)->as_array()) },

+ 1 - 0
Tests/LibTextCodec/CMakeLists.txt

@@ -1,5 +1,6 @@
 set(TEST_SOURCES
     TestTextDecoders.cpp
+    TestTextEncoders.cpp
 )
 
 foreach(source IN LISTS TEST_SOURCES)

+ 45 - 0
Tests/LibTextCodec/TestTextEncoders.cpp

@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2024, Ben Jilks <benjyjilks@gmail.com>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <LibTest/TestCase.h>
+#include <LibTextCodec/Encoder.h>
+
+TEST_CASE(test_utf8_encode)
+{
+    // U+1F600 GRINNING FACE; the checks below expect its four-byte UTF-8 form.
+    auto input = "\U0001F600"sv;
+    TextCodec::UTF8Encoder utf8_encoder;
+
+    // Collect every byte the encoder reports.
+    Vector<u8> encoded;
+    MUST(utf8_encoder.process(Utf8View(input), [&](u8 byte) {
+        return encoded.try_append(byte);
+    }));
+
+    EXPECT(encoded.size() == 4);
+    EXPECT(encoded[0] == 0xF0);
+    EXPECT(encoded[1] == 0x9F);
+    EXPECT(encoded[2] == 0x98);
+    EXPECT(encoded[3] == 0x80);
+}
+
+TEST_CASE(test_euc_jp_encoder)
+{
+    // Input code points and the EUC-JP bytes expected for each:
+    //   U+00A5 Yen Sign           -> 0x5C
+    //   U+3088 Hiragana Letter Yo -> 0xA4 0xE8
+    //   U+30C4 Katakana Letter Tu -> 0xA5 0xC4
+    auto input = "\U000000A5\U00003088\U000030C4"sv;
+    TextCodec::EUCJPEncoder euc_jp_encoder;
+
+    // Collect every byte the encoder reports.
+    Vector<u8> encoded;
+    MUST(euc_jp_encoder.process(Utf8View(input), [&](u8 byte) {
+        return encoded.try_append(byte);
+    }));
+
+    EXPECT(encoded.size() == 5);
+    EXPECT(encoded[0] == 0x5C);
+    EXPECT(encoded[1] == 0xA4);
+    EXPECT(encoded[2] == 0xE8);
+    EXPECT(encoded[3] == 0xA5);
+    EXPECT(encoded[4] == 0xC4);
+}

+ 1 - 0
Userland/Libraries/LibTextCodec/CMakeLists.txt

@@ -2,6 +2,7 @@ include(libtextcodec_generators)
 
 set(SOURCES
     Decoder.cpp
+    Encoder.cpp
 )
 
 generate_encoding_indexes()

+ 103 - 0
Userland/Libraries/LibTextCodec/Encoder.cpp

@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2024, Ben Jilks <benjyjilks@gmail.com>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/Error.h>
+#include <AK/Utf8View.h>
+#include <LibTextCodec/Decoder.h>
+#include <LibTextCodec/Encoder.h>
+#include <LibTextCodec/LookupTables.h>
+
+namespace TextCodec {
+
+// File-local encoder instances handed out by encoder_for_exact_name().
+static UTF8Encoder s_utf8_encoder;
+static EUCJPEncoder s_euc_jp_encoder;
+
+// Returns the encoder registered for `encoding`, compared ASCII
+// case-insensitively against the supported names. Logs a debug message and
+// returns an empty Optional when no encoder is implemented for it.
+Optional<Encoder&> encoder_for_exact_name(StringView encoding)
+{
+    if (encoding.equals_ignoring_ascii_case("euc-jp"sv))
+        return s_euc_jp_encoder;
+
+    if (encoding.equals_ignoring_ascii_case("utf-8"sv))
+        return s_utf8_encoder;
+
+    dbgln("TextCodec: No encoder implemented for encoding '{}'", encoding);
+    return {};
+}
+
+// Maps `label` to its standardized encoding name and returns the matching
+// encoder. Yields an empty Optional when the label cannot be standardized or
+// no encoder exists for the resulting name.
+Optional<Encoder&> encoder_for(StringView label)
+{
+    auto standardized_name = get_standardized_encoding(label);
+    if (!standardized_name.has_value())
+        return {};
+
+    return encoder_for_exact_name(standardized_name.value());
+}
+
+// https://encoding.spec.whatwg.org/#utf-8-encoder
+ErrorOr<void> UTF8Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte)
+{
+    // The view's underlying bytes are forwarded to the callback unchanged;
+    // the first error reported by on_byte aborts the walk and is returned.
+    auto const* data = input.bytes();
+    auto const length = input.byte_length();
+    for (size_t offset = 0; offset < length; ++offset)
+        TRY(on_byte(data[offset]));
+
+    return {};
+}
+
+// https://encoding.spec.whatwg.org/#euc-jp-encoder
+//
+// Walks the code points of `input` and hands the encoded bytes to `on_byte`.
+// An error returned by `on_byte` is propagated immediately. Code points with
+// no jis0208 pointer are currently skipped without being reported.
+ErrorOr<void> EUCJPEncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte)
+{
+    for (u32 code_point : input) {
+        // 1. If code point is end-of-queue, return finished.
+        //    (The loop ends once the view has no more code points.)
+
+        // 2. If code point is an ASCII code point, return a byte whose value is code point.
+        if (is_ascii(code_point)) {
+            TRY(on_byte(static_cast<u8>(code_point)));
+            continue;
+        }
+
+        // 3. If code point is U+00A5, return byte 0x5C.
+        if (code_point == 0x00A5) {
+            TRY(on_byte(0x5C));
+            continue;
+        }
+
+        // 4. If code point is U+203E, return byte 0x7E.
+        if (code_point == 0x203E) {
+            TRY(on_byte(0x7E));
+            continue;
+        }
+
+        // 5. If code point is in the range U+FF61 to U+FF9F, inclusive, return two bytes whose values are 0x8E and code point − 0xFF61 + 0xA1.
+        if (code_point >= 0xFF61 && code_point <= 0xFF9F) {
+            auto const second_byte = static_cast<u8>(code_point - 0xFF61 + 0xA1);
+            TRY(on_byte(0x8E));
+            TRY(on_byte(second_byte));
+            continue;
+        }
+
+        // 6. If code point is U+2212, set it to U+FF0D.
+        if (code_point == 0x2212)
+            code_point = 0xFF0D;
+
+        // 7. Let pointer be the index pointer for code point in index jis0208.
+        auto const pointer = code_point_jis0208_index(code_point);
+
+        // 8. If pointer is null, return error with code point.
+        if (!pointer.has_value()) {
+            // TODO: Report error.
+            continue;
+        }
+
+        // 9. Let lead be pointer / 94 + 0xA1.
+        auto const lead = static_cast<u8>(pointer.value() / 94 + 0xA1);
+
+        // 10. Let trail be pointer % 94 + 0xA1.
+        auto const trail = static_cast<u8>(pointer.value() % 94 + 0xA1);
+
+        // 11. Return two bytes whose values are lead and trail.
+        TRY(on_byte(lead));
+        TRY(on_byte(trail));
+    }
+
+    return {};
+}
+
+}

+ 35 - 0
Userland/Libraries/LibTextCodec/Encoder.h

@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2024, Ben Jilks <benjyjilks@gmail.com>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/Forward.h>
+#include <AK/Function.h>
+
+namespace TextCodec {
+
+class Encoder { // Abstract interface implemented by each concrete text encoder.
+public:
+    virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) = 0; // Encodes the view; on_byte receives the output one byte at a time.
+
+protected:
+    virtual ~Encoder() = default; // Protected: outside code cannot destroy an encoder through this base type.
+};
+
+class UTF8Encoder final : public Encoder { // https://encoding.spec.whatwg.org/#utf-8-encoder
+public:
+    virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override;
+};
+
+class EUCJPEncoder final : public Encoder { // https://encoding.spec.whatwg.org/#euc-jp-encoder
+public:
+    virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override;
+};
+
+Optional<Encoder&> encoder_for_exact_name(StringView encoding); // ASCII case-insensitive name match; empty if no encoder is implemented.
+Optional<Encoder&> encoder_for(StringView label); // Standardizes the label first, then defers to encoder_for_exact_name().
+
+}

+ 1 - 1
Userland/Libraries/LibURL/CMakeLists.txt

@@ -4,4 +4,4 @@ set(SOURCES
 )
 
 serenity_lib(LibURL url)
-target_link_libraries(LibURL PRIVATE LibUnicode)
+target_link_libraries(LibURL PRIVATE LibUnicode LibTextCodec)

+ 14 - 9
Userland/Libraries/LibURL/Parser.cpp

@@ -14,6 +14,8 @@
 #include <AK/StringBuilder.h>
 #include <AK/StringUtils.h>
 #include <AK/Utf8View.h>
+#include <LibTextCodec/Decoder.h>
+#include <LibTextCodec/Encoder.h>
 #include <LibURL/Parser.h>
 #include <LibUnicode/IDNA.h>
 
@@ -768,18 +770,17 @@ void Parser::shorten_urls_path(URL& url)
 }
 
 // https://url.spec.whatwg.org/#string-percent-encode-after-encoding
-ErrorOr<String> Parser::percent_encode_after_encoding(StringView input, PercentEncodeSet percent_encode_set, bool space_as_plus)
+ErrorOr<String> Parser::percent_encode_after_encoding(TextCodec::Encoder& encoder, StringView input, PercentEncodeSet percent_encode_set, bool space_as_plus)
 {
-    // NOTE: This is written somewhat ad-hoc since we don't yet implement the Encoding spec.
-
+    // 1. Let encodeOutput be an empty I/O queue.
     StringBuilder output;
 
     // 3. For each byte of encodeOutput converted to a byte sequence:
-    for (u8 byte : input) {
+    TRY(encoder.process(Utf8View(input), [&](u8 byte) -> ErrorOr<void> {
         // 1. If spaceAsPlus is true and byte is 0x20 (SP), then append U+002B (+) to output and continue.
         if (space_as_plus && byte == ' ') {
             output.append('+');
-            continue;
+            return {};
         }
 
         // 2. Let isomorph be a code point whose value is byte’s value.
@@ -796,7 +797,9 @@ ErrorOr<String> Parser::percent_encode_after_encoding(StringView input, PercentE
         else {
             output.appendff("%{:02X}", byte);
         }
-    }
+
+        return {};
+    }));
 
     // 6. Return output.
     return output.to_string();
@@ -851,7 +854,9 @@ URL Parser::basic_parse(StringView raw_input, Optional<URL> const& base_url, Opt
     // 4. Let state be state override if given, or scheme start state otherwise.
     State state = state_override.value_or(State::SchemeStart);
 
-    // FIXME: 5. Set encoding to the result of getting an output encoding from encoding.
+    // 5. Set encoding to the result of getting an output encoding from encoding.
+    auto encoder = TextCodec::encoder_for("utf-8"sv);
+    VERIFY(encoder.has_value());
 
     // 6. Let buffer be the empty string.
     StringBuilder buffer;
@@ -1684,7 +1689,7 @@ URL Parser::basic_parse(StringView raw_input, Optional<URL> const& base_url, Opt
                 auto query_percent_encode_set = url->is_special() ? PercentEncodeSet::SpecialQuery : PercentEncodeSet::Query;
 
                 // 2. Percent-encode after encoding, with encoding, buffer, and queryPercentEncodeSet, and append the result to url’s query.
-                url->m_data->query = percent_encode_after_encoding(buffer.string_view(), query_percent_encode_set).release_value_but_fixme_should_propagate_errors();
+                url->m_data->query = percent_encode_after_encoding(*encoder, buffer.string_view(), query_percent_encode_set).release_value_but_fixme_should_propagate_errors();
 
                 // 3. Set buffer to the empty string.
                 buffer.clear();
@@ -1726,7 +1731,7 @@ URL Parser::basic_parse(StringView raw_input, Optional<URL> const& base_url, Opt
                 // NOTE: The percent-encode is done on EOF on the entire buffer.
                 buffer.append_code_point(code_point);
             } else {
-                url->m_data->fragment = percent_encode_after_encoding(buffer.string_view(), PercentEncodeSet::Fragment).release_value_but_fixme_should_propagate_errors();
+                url->m_data->fragment = percent_encode_after_encoding(*encoder, buffer.string_view(), PercentEncodeSet::Fragment).release_value_but_fixme_should_propagate_errors();
                 buffer.clear();
             }
             break;

+ 2 - 1
Userland/Libraries/LibURL/Parser.h

@@ -9,6 +9,7 @@
 
 #include <AK/Optional.h>
 #include <AK/StringView.h>
+#include <LibTextCodec/Encoder.h>
 #include <LibURL/URL.h>
 
 namespace URL {
@@ -60,7 +61,7 @@ public:
     static URL basic_parse(StringView input, Optional<URL> const& base_url = {}, Optional<URL> url = {}, Optional<State> state_override = {});
 
     // https://url.spec.whatwg.org/#string-percent-encode-after-encoding
-    static ErrorOr<String> percent_encode_after_encoding(StringView input, PercentEncodeSet percent_encode_set, bool space_as_plus = false);
+    static ErrorOr<String> percent_encode_after_encoding(TextCodec::Encoder&, StringView input, PercentEncodeSet percent_encode_set, bool space_as_plus = false);
 
     // https://url.spec.whatwg.org/#concept-host-serializer
     static ErrorOr<String> serialize_host(Host const&);

+ 9 - 4
Userland/Libraries/LibWeb/DOMURL/URLSearchParams.cpp

@@ -9,6 +9,7 @@
 #include <AK/StringBuilder.h>
 #include <AK/Utf8View.h>
 #include <LibTextCodec/Decoder.h>
+#include <LibTextCodec/Encoder.h>
 #include <LibURL/Parser.h>
 #include <LibWeb/Bindings/ExceptionOrUtils.h>
 #include <LibWeb/Bindings/Intrinsics.h>
@@ -47,6 +48,12 @@ ErrorOr<String> url_encode(Vector<QueryParam> const& tuples, StringView encoding
     // 1. Set encoding to the result of getting an output encoding from encoding.
     encoding = TextCodec::get_output_encoding(encoding);
 
+    auto encoder = TextCodec::encoder_for(encoding);
+    if (!encoder.has_value()) {
+        // NOTE: Fallback to default utf-8 encoder.
+        encoder = TextCodec::encoder_for("utf-8"sv);
+    }
+
     // 2. Let output be the empty string.
     StringBuilder output;
 
@@ -55,12 +62,10 @@ ErrorOr<String> url_encode(Vector<QueryParam> const& tuples, StringView encoding
         // 1. Assert: tuple’s name and tuple’s value are scalar value strings.
 
         // 2. Let name be the result of running percent-encode after encoding with encoding, tuple’s name, the application/x-www-form-urlencoded percent-encode set, and true.
-        // FIXME: URL::Parser does not currently implement encoding.
-        auto name = TRY(URL::Parser::percent_encode_after_encoding(tuple.name, URL::PercentEncodeSet::ApplicationXWWWFormUrlencoded, true));
+        auto name = TRY(URL::Parser::percent_encode_after_encoding(*encoder, tuple.name, URL::PercentEncodeSet::ApplicationXWWWFormUrlencoded, true));
 
         // 3. Let value be the result of running percent-encode after encoding with encoding, tuple’s value, the application/x-www-form-urlencoded percent-encode set, and true.
-        // FIXME: URL::Parser does not currently implement encoding.
-        auto value = TRY(URL::Parser::percent_encode_after_encoding(tuple.value, URL::PercentEncodeSet::ApplicationXWWWFormUrlencoded, true));
+        auto value = TRY(URL::Parser::percent_encode_after_encoding(*encoder, tuple.value, URL::PercentEncodeSet::ApplicationXWWWFormUrlencoded, true));
 
         // 4. If output is not the empty string, then append U+0026 (&) to output.
         if (!output.is_empty())

+ 0 - 4
Userland/Libraries/LibWeb/HTML/HTMLFormElement.cpp

@@ -160,10 +160,6 @@ WebIDL::ExceptionOr<void> HTMLFormElement::submit_form(JS::NonnullGCPtr<HTMLElem
 
     // 6. Let encoding be the result of picking an encoding for the form.
     auto encoding = TRY_OR_THROW_OOM(vm, pick_an_encoding());
-    if (encoding != "UTF-8"sv) {
-        dbgln("FIXME: Support encodings other than UTF-8 in form submission. Returning from form submission.");
-        return {};
-    }
 
     // 7. Let entry list be the result of constructing the entry list with form, submitter, and encoding.
     auto entry_list_or_null = TRY(construct_entry_list(realm, *this, submitter, encoding));