LibTextCodec+LibURL: Implement utf-8 and euc-jp encoders

Implement the corresponding encoders and select the appropriate one when
encoding URL search params. If no encoder can be found for the given
encoding, fall back to utf-8.
This commit is contained in:
BenJilks 2024-08-05 16:03:53 +01:00 committed by Tim Ledbetter
parent 82a63e350c
commit 72d0e3284b
Notes: github-actions[bot] 2024-08-08 16:51:38 +00:00
11 changed files with 260 additions and 22 deletions

View file

@ -21,6 +21,7 @@ struct LookupTable {
u32 max_code_point;
Vector<u32> code_points;
bool generate_accessor;
bool generate_inverse_accessor;
};
struct LookupTables {
@ -33,7 +34,12 @@ enum class GenerateAccessor {
Yes,
};
LookupTable prepare_table(JsonArray const& data, GenerateAccessor generate_accessor = GenerateAccessor::No)
// Tells prepare_table() whether the resulting LookupTable should be flagged so
// that a code_point_<name>_index() reverse-lookup accessor is generated for it.
enum class GenerateInverseAccessor {
No,
Yes,
};
LookupTable prepare_table(JsonArray const& data, GenerateAccessor generate_accessor = GenerateAccessor::No, GenerateInverseAccessor generate_inverse_accessor = GenerateInverseAccessor::No)
{
Vector<u32> code_points;
code_points.ensure_capacity(data.size());
@ -58,7 +64,7 @@ LookupTable prepare_table(JsonArray const& data, GenerateAccessor generate_acces
} else {
VERIFY(first_pointer == 0);
}
return { first_pointer, max, move(code_points), generate_accessor == GenerateAccessor::Yes };
return { first_pointer, max, move(code_points), generate_accessor == GenerateAccessor::Yes, generate_inverse_accessor == GenerateInverseAccessor::Yes };
}
void generate_table(SourceGenerator generator, StringView name, LookupTable& table)
@ -81,6 +87,8 @@ void generate_table(SourceGenerator generator, StringView name, LookupTable& tab
generator.appendln("\n};");
if (table.generate_accessor)
generator.appendln("Optional<u32> index_@name@_code_point(u32 pointer);");
if (table.generate_inverse_accessor)
generator.appendln("Optional<u32> code_point_@name@_index(u32 code_point);");
}
ErrorOr<void> generate_header_file(LookupTables& tables, Core::File& file)
@ -155,6 +163,42 @@ Optional<u32> index_@name@_code_point(u32 pointer)
}
}
// Emits the definition of code_point_@name@_index(), the generated function that
// maps a code point back to its pointer in the s_@name@_index table.
void generate_inverse_table_accessor(SourceGenerator generator, StringView name, LookupTable& table)
{
    generator.set("name", name);
    generator.set("first_pointer", MUST(String::number(table.first_pointer)));
    generator.set("size", MUST(String::number(table.code_points.size())));

    // FIXME - Doing a linear search here is really slow, should be generating
    //         some kind of reverse lookup table.
    bool const has_pointer_offset = table.first_pointer > 0;
    if (!has_pointer_offset) {
        // The table starts at pointer 0, so the array position is returned as-is.
        generator.append(R"~~~(
Optional<u32> code_point_@name@_index(u32 code_point)
{
for (u32 i = 0; i < s_@name@_index.size(); ++i) {
if (s_@name@_index[i] == code_point) {
return i;
}
}
return {};
}
)~~~");
        return;
    }

    // The table starts at a non-zero pointer, so the generated code adds the
    // table's first-pointer constant to the array position.
    generator.append(R"~~~(
Optional<u32> code_point_@name@_index(u32 code_point)
{
for (u32 i = 0; i < s_@name@_index.size(); ++i) {
if (s_@name@_index[i] == code_point) {
return s_@name@_index_first_pointer + i;
}
}
return {};
}
)~~~");
}
ErrorOr<void> generate_implementation_file(LookupTables& tables, Core::File& file)
{
StringBuilder builder;
@ -169,6 +213,8 @@ namespace TextCodec {
for (auto& [key, table] : tables.indexes) {
if (table.generate_accessor)
generate_table_accessor(generator.fork(), key, table);
if (table.generate_inverse_accessor)
generate_inverse_table_accessor(generator.fork(), key, table);
}
generator.appendln("\n}");
@ -222,7 +268,7 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
.indexes = {
{ "gb18030"sv, move(gb18030_table) },
{ "big5"sv, prepare_table(data.get("big5"sv)->as_array(), GenerateAccessor::Yes) },
{ "jis0208"sv, prepare_table(data.get("jis0208"sv)->as_array(), GenerateAccessor::Yes) },
{ "jis0208"sv, prepare_table(data.get("jis0208"sv)->as_array(), GenerateAccessor::Yes, GenerateInverseAccessor::Yes) },
{ "jis0212"sv, prepare_table(data.get("jis0212"sv)->as_array(), GenerateAccessor::Yes) },
{ "euc_kr"sv, prepare_table(data.get("euc-kr"sv)->as_array(), GenerateAccessor::Yes) },
{ "ibm866"sv, prepare_table(data.get("ibm866"sv)->as_array()) },

View file

@ -1,5 +1,6 @@
set(TEST_SOURCES
TestTextDecoders.cpp
TestTextEncoders.cpp
)
foreach(source IN LISTS TEST_SOURCES)

View file

@ -0,0 +1,45 @@
/*
* Copyright (c) 2024, Ben Jilks <benjyjilks@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <LibTest/TestCase.h>
#include <LibTextCodec/Encoder.h>
// Checks that the UTF-8 encoder emits the four bytes F0 9F 98 80 for U+1F600.
TEST_CASE(test_utf8_encode)
{
    // Unicode character U+1F600 GRINNING FACE
    auto grinning_face = "\U0001F600"sv;

    Vector<u8> processed_bytes;
    TextCodec::UTF8Encoder encoder;
    MUST(encoder.process(Utf8View(grinning_face), [&processed_bytes](u8 byte) {
        return processed_bytes.try_append(byte);
    }));

    EXPECT(processed_bytes.size() == 4);
    EXPECT(processed_bytes[0] == 0xF0);
    EXPECT(processed_bytes[1] == 0x9F);
    EXPECT(processed_bytes[2] == 0x98);
    EXPECT(processed_bytes[3] == 0x80);
}
// Checks the EUC-JP encoder against one single-byte special case and two
// double-byte characters.
TEST_CASE(test_euc_jp_encoder)
{
    // Input code points and the bytes this test expects for them:
    //   U+00A5 Yen Sign            -> 5C
    //   U+3088 Hiragana Letter Yo  -> A4 E8
    //   U+30C4 Katakana Letter Tu  -> A5 C4
    auto japanese_sample = "\U000000A5\U00003088\U000030C4"sv;

    Vector<u8> processed_bytes;
    TextCodec::EUCJPEncoder encoder;
    MUST(encoder.process(Utf8View(japanese_sample), [&processed_bytes](u8 byte) {
        return processed_bytes.try_append(byte);
    }));

    EXPECT(processed_bytes.size() == 5);
    EXPECT(processed_bytes[0] == 0x5C);
    EXPECT(processed_bytes[1] == 0xA4);
    EXPECT(processed_bytes[2] == 0xE8);
    EXPECT(processed_bytes[3] == 0xA5);
    EXPECT(processed_bytes[4] == 0xC4);
}

View file

@ -2,6 +2,7 @@ include(libtextcodec_generators)
set(SOURCES
Decoder.cpp
Encoder.cpp
)
generate_encoding_indexes()

View file

@ -0,0 +1,103 @@
/*
* Copyright (c) 2024, Ben Jilks <benjyjilks@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Error.h>
#include <AK/Utf8View.h>
#include <LibTextCodec/Decoder.h>
#include <LibTextCodec/Encoder.h>
#include <LibTextCodec/LookupTables.h>
namespace TextCodec {
namespace {
// File-local encoder instances; encoder_for_exact_name() returns references to these.
UTF8Encoder s_utf8_encoder;
EUCJPEncoder s_euc_jp_encoder;
}
// Returns the encoder for the given encoding name, compared ASCII
// case-insensitively. Logs a debug message and returns an empty Optional when
// no encoder is implemented for that name.
Optional<Encoder&> encoder_for_exact_name(StringView encoding)
{
    auto const is_named = [&encoding](StringView candidate) {
        return encoding.equals_ignoring_ascii_case(candidate);
    };

    if (is_named("utf-8"sv))
        return s_utf8_encoder;
    if (is_named("euc-jp"sv))
        return s_euc_jp_encoder;

    dbgln("TextCodec: No encoder implemented for encoding '{}'", encoding);
    return {};
}
// Resolves an encoding label to its standardized name and returns the matching
// encoder, or an empty Optional when the label is unknown or has no encoder.
Optional<Encoder&> encoder_for(StringView label)
{
    auto const standardized_name = get_standardized_encoding(label);
    if (!standardized_name.has_value())
        return {};

    return encoder_for_exact_name(standardized_name.value());
}
// https://encoding.spec.whatwg.org/#utf-8-encoder
// The input view is already UTF-8, so its underlying bytes are forwarded to
// on_byte unchanged, stopping at the first error on_byte returns.
ErrorOr<void> UTF8Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte)
{
    ReadonlyBytes const encoded { input.bytes(), input.byte_length() };
    for (size_t offset = 0; offset < encoded.size(); ++offset)
        TRY(on_byte(encoded[offset]));

    return {};
}
// https://encoding.spec.whatwg.org/#euc-jp-encoder
// Encodes each code point of the input as EUC-JP and passes the resulting bytes
// to on_byte, stopping at the first error on_byte returns. Code points with no
// entry in index jis0208 are currently skipped without reporting an error.
ErrorOr<void> EUCJPEncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte)
{
    // 1. If code point is end-of-queue, return finished.
    //    (The loop ends once the input is exhausted.)
    for (u32 code_point : input) {
        if (is_ascii(code_point)) {
            // 2. If code point is an ASCII code point, return a byte whose value is code point.
            TRY(on_byte(static_cast<u8>(code_point)));
        } else if (code_point == 0x00A5) {
            // 3. If code point is U+00A5, return byte 0x5C.
            TRY(on_byte(static_cast<u8>(0x5C)));
        } else if (code_point == 0x203E) {
            // 4. If code point is U+203E, return byte 0x7E.
            TRY(on_byte(static_cast<u8>(0x7E)));
        } else if (code_point >= 0xFF61 && code_point <= 0xFF9F) {
            // 5. If code point is in the range U+FF61 to U+FF9F, inclusive, return two bytes
            //    whose values are 0x8E and code point - 0xFF61 + 0xA1.
            TRY(on_byte(0x8E));
            TRY(on_byte(static_cast<u8>(code_point - 0xFF61 + 0xA1)));
        } else {
            // 6. If code point is U+2212, set it to U+FF0D.
            u32 const lookup_code_point = (code_point == 0x2212) ? 0xFF0D : code_point;

            // 7. Let pointer be the index pointer for code point in index jis0208.
            auto const pointer = code_point_jis0208_index(lookup_code_point);

            // 8. If pointer is null, return error with code point.
            // TODO: Report error.
            if (pointer.has_value()) {
                // 9. Let lead be pointer / 94 + 0xA1.
                auto const lead = pointer.value() / 94 + 0xA1;

                // 10. Let trail be pointer % 94 + 0xA1.
                auto const trail = pointer.value() % 94 + 0xA1;

                // 11. Return two bytes whose values are lead and trail.
                TRY(on_byte(static_cast<u8>(lead)));
                TRY(on_byte(static_cast<u8>(trail)));
            }
        }
    }

    return {};
}
}

View file

@ -0,0 +1,35 @@
/*
* Copyright (c) 2024, Ben Jilks <benjyjilks@gmail.com>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Forward.h>
#include <AK/Function.h>
namespace TextCodec {
// Abstract interface for text encoders: process() takes the input as a
// Utf8View and hands the encoded output to on_byte one byte at a time.
class Encoder {
public:
// Encodes the input; the result of on_byte has type ErrorOr<void>, as does the overall call.
virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) = 0;
protected:
// Protected, so outside code cannot destroy an encoder through an Encoder reference or pointer.
virtual ~Encoder() = default;
};
// Encoder for UTF-8 (https://encoding.spec.whatwg.org/#utf-8-encoder).
class UTF8Encoder final : public Encoder {
public:
virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override;
};
// Encoder for EUC-JP (https://encoding.spec.whatwg.org/#euc-jp-encoder).
class EUCJPEncoder final : public Encoder {
public:
virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override;
};
// Looks up an encoder by encoding name (matched ASCII case-insensitively);
// returns an empty Optional if no encoder is implemented for that name.
Optional<Encoder&> encoder_for_exact_name(StringView encoding);
// Looks up an encoder by encoding label: the label is first mapped to its
// standardized encoding name, then resolved via encoder_for_exact_name().
Optional<Encoder&> encoder_for(StringView label);
}

View file

@ -4,4 +4,4 @@ set(SOURCES
)
serenity_lib(LibURL url)
target_link_libraries(LibURL PRIVATE LibUnicode)
target_link_libraries(LibURL PRIVATE LibUnicode LibTextCodec)

View file

@ -14,6 +14,8 @@
#include <AK/StringBuilder.h>
#include <AK/StringUtils.h>
#include <AK/Utf8View.h>
#include <LibTextCodec/Decoder.h>
#include <LibTextCodec/Encoder.h>
#include <LibURL/Parser.h>
#include <LibUnicode/IDNA.h>
@ -768,18 +770,17 @@ void Parser::shorten_urls_path(URL& url)
}
// https://url.spec.whatwg.org/#string-percent-encode-after-encoding
ErrorOr<String> Parser::percent_encode_after_encoding(StringView input, PercentEncodeSet percent_encode_set, bool space_as_plus)
ErrorOr<String> Parser::percent_encode_after_encoding(TextCodec::Encoder& encoder, StringView input, PercentEncodeSet percent_encode_set, bool space_as_plus)
{
// NOTE: This is written somewhat ad-hoc since we don't yet implement the Encoding spec.
// 1. Let encodeOutput be an empty I/O queue.
StringBuilder output;
// 3. For each byte of encodeOutput converted to a byte sequence:
for (u8 byte : input) {
TRY(encoder.process(Utf8View(input), [&](u8 byte) -> ErrorOr<void> {
// 1. If spaceAsPlus is true and byte is 0x20 (SP), then append U+002B (+) to output and continue.
if (space_as_plus && byte == ' ') {
output.append('+');
continue;
return {};
}
// 2. Let isomorph be a code point whose value is byte's value.
@ -796,7 +797,9 @@ ErrorOr<String> Parser::percent_encode_after_encoding(StringView input, PercentE
else {
output.appendff("%{:02X}", byte);
}
}
return {};
}));
// 6. Return output.
return output.to_string();
@ -851,7 +854,9 @@ URL Parser::basic_parse(StringView raw_input, Optional<URL> const& base_url, Opt
// 4. Let state be state override if given, or scheme start state otherwise.
State state = state_override.value_or(State::SchemeStart);
// FIXME: 5. Set encoding to the result of getting an output encoding from encoding.
// 5. Set encoding to the result of getting an output encoding from encoding.
auto encoder = TextCodec::encoder_for("utf-8"sv);
VERIFY(encoder.has_value());
// 6. Let buffer be the empty string.
StringBuilder buffer;
@ -1684,7 +1689,7 @@ URL Parser::basic_parse(StringView raw_input, Optional<URL> const& base_url, Opt
auto query_percent_encode_set = url->is_special() ? PercentEncodeSet::SpecialQuery : PercentEncodeSet::Query;
// 2. Percent-encode after encoding, with encoding, buffer, and queryPercentEncodeSet, and append the result to url's query.
url->m_data->query = percent_encode_after_encoding(buffer.string_view(), query_percent_encode_set).release_value_but_fixme_should_propagate_errors();
url->m_data->query = percent_encode_after_encoding(*encoder, buffer.string_view(), query_percent_encode_set).release_value_but_fixme_should_propagate_errors();
// 3. Set buffer to the empty string.
buffer.clear();
@ -1726,7 +1731,7 @@ URL Parser::basic_parse(StringView raw_input, Optional<URL> const& base_url, Opt
// NOTE: The percent-encode is done on EOF on the entire buffer.
buffer.append_code_point(code_point);
} else {
url->m_data->fragment = percent_encode_after_encoding(buffer.string_view(), PercentEncodeSet::Fragment).release_value_but_fixme_should_propagate_errors();
url->m_data->fragment = percent_encode_after_encoding(*encoder, buffer.string_view(), PercentEncodeSet::Fragment).release_value_but_fixme_should_propagate_errors();
buffer.clear();
}
break;

View file

@ -9,6 +9,7 @@
#include <AK/Optional.h>
#include <AK/StringView.h>
#include <LibTextCodec/Encoder.h>
#include <LibURL/URL.h>
namespace URL {
@ -60,7 +61,7 @@ public:
static URL basic_parse(StringView input, Optional<URL> const& base_url = {}, Optional<URL> url = {}, Optional<State> state_override = {});
// https://url.spec.whatwg.org/#string-percent-encode-after-encoding
static ErrorOr<String> percent_encode_after_encoding(StringView input, PercentEncodeSet percent_encode_set, bool space_as_plus = false);
static ErrorOr<String> percent_encode_after_encoding(TextCodec::Encoder&, StringView input, PercentEncodeSet percent_encode_set, bool space_as_plus = false);
// https://url.spec.whatwg.org/#concept-host-serializer
static ErrorOr<String> serialize_host(Host const&);

View file

@ -9,6 +9,7 @@
#include <AK/StringBuilder.h>
#include <AK/Utf8View.h>
#include <LibTextCodec/Decoder.h>
#include <LibTextCodec/Encoder.h>
#include <LibURL/Parser.h>
#include <LibWeb/Bindings/ExceptionOrUtils.h>
#include <LibWeb/Bindings/Intrinsics.h>
@ -47,6 +48,12 @@ ErrorOr<String> url_encode(Vector<QueryParam> const& tuples, StringView encoding
// 1. Set encoding to the result of getting an output encoding from encoding.
encoding = TextCodec::get_output_encoding(encoding);
auto encoder = TextCodec::encoder_for(encoding);
if (!encoder.has_value()) {
// NOTE: Fallback to default utf-8 encoder.
encoder = TextCodec::encoder_for("utf-8"sv);
}
// 2. Let output be the empty string.
StringBuilder output;
@ -55,12 +62,10 @@ ErrorOr<String> url_encode(Vector<QueryParam> const& tuples, StringView encoding
// 1. Assert: tuple's name and tuple's value are scalar value strings.
// 2. Let name be the result of running percent-encode after encoding with encoding, tuple's name, the application/x-www-form-urlencoded percent-encode set, and true.
// FIXME: URL::Parser does not currently implement encoding.
auto name = TRY(URL::Parser::percent_encode_after_encoding(tuple.name, URL::PercentEncodeSet::ApplicationXWWWFormUrlencoded, true));
auto name = TRY(URL::Parser::percent_encode_after_encoding(*encoder, tuple.name, URL::PercentEncodeSet::ApplicationXWWWFormUrlencoded, true));
// 3. Let value be the result of running percent-encode after encoding with encoding, tuple's value, the application/x-www-form-urlencoded percent-encode set, and true.
// FIXME: URL::Parser does not currently implement encoding.
auto value = TRY(URL::Parser::percent_encode_after_encoding(tuple.value, URL::PercentEncodeSet::ApplicationXWWWFormUrlencoded, true));
auto value = TRY(URL::Parser::percent_encode_after_encoding(*encoder, tuple.value, URL::PercentEncodeSet::ApplicationXWWWFormUrlencoded, true));
// 4. If output is not the empty string, then append U+0026 (&) to output.
if (!output.is_empty())

View file

@ -160,10 +160,6 @@ WebIDL::ExceptionOr<void> HTMLFormElement::submit_form(JS::NonnullGCPtr<HTMLElem
// 6. Let encoding be the result of picking an encoding for the form.
auto encoding = TRY_OR_THROW_OOM(vm, pick_an_encoding());
if (encoding != "UTF-8"sv) {
dbgln("FIXME: Support encodings other than UTF-8 in form submission. Returning from form submission.");
return {};
}
// 7. Let entry list be the result of constructing the entry list with form, submitter, and encoding.
auto entry_list_or_null = TRY(construct_entry_list(realm, *this, submitter, encoding));