This commit is contained in:
Ryan Liptak 2025-01-02 11:40:21 +00:00 committed by GitHub
commit 82246271b8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
21 changed files with 5251 additions and 4544 deletions

View file

@ -874,6 +874,8 @@ invoke_generator(
generate_css_implementation()
generate_html_implementation()
invoke_generator(
"WebGLRenderingContextImpl.cpp"
Lagom::GenerateWebGLRenderingContext
@ -917,6 +919,7 @@ set(GENERATED_SOURCES
WebGL/WebGLRenderingContextImpl.cpp
Worker/WebWorkerClientEndpoint.h
Worker/WebWorkerServerEndpoint.h
HTML/Parser/NamedCharacterReferences.cpp
)
serenity_lib(LibWeb web)

File diff suppressed because it is too large Load diff

View file

@ -6,16 +6,47 @@
#pragma once
#include <AK/StringView.h>
#include <AK/Vector.h>
#include <AK/Optional.h>
#include <AK/Types.h>
#include <LibWeb/HTML/Parser/NamedCharacterReferences.h>
namespace Web::HTML {
struct EntityMatch {
Vector<u32, 2> code_points;
StringView entity;
// Incremental matcher for named character references. Characters are fed in one
// at a time and the matcher keeps track of its position (`m_node_index`) in the
// generated lookup structure as well as information about the last match seen.
class NamedCharacterReferenceMatcher {
public:
    NamedCharacterReferenceMatcher() = default;

    // Attempts to advance using the code point `c`.
    // Code points above 0x7F are rejected immediately; everything else is forwarded
    // to `update_with_ascii_char`. Returns `true` if `c` is a child of the current
    // `node_index` (which is then updated to that child); otherwise the `node_index`
    // is left unchanged and `false` is returned.
    bool update_with_codepoint(u32 c)
    {
        bool const is_ascii = c <= 0x7F;
        return is_ascii && update_with_ascii_char(static_cast<u8>(c));
    }

    // Attempts to advance using the character `c`.
    // Returns `true` if `c` is a child of the current `node_index` (which is then
    // updated to that child); otherwise the `node_index` is left unchanged and
    // `false` is returned.
    bool update_with_ascii_char(u8 c);

    // Whether the current `node_index` is marked as the end of a word.
    bool currently_matches() const
    {
        return named_character_reference_is_end_of_word(m_node_index);
    }

    // The code points associated with the last match, or an empty Optional if
    // there has not been a match.
    Optional<NamedCharacterReferenceCodepoints> code_points() const
    {
        return named_character_reference_codepoints_from_unique_index(m_last_matched_unique_index);
    }

    bool last_match_ends_with_semicolon() const
    {
        return m_ends_with_semicolon;
    }

    u8 overconsumed_code_points() const
    {
        return m_overconsumed_code_points;
    }

private:
    u16 m_node_index { 0 };
    u16 m_last_matched_unique_index { 0 };
    u16 m_pending_unique_index { 0 };
    u8 m_overconsumed_code_points { 0 };
    bool m_ends_with_semicolon { false };
};
Optional<EntityMatch> code_points_from_entity(StringView);
}

File diff suppressed because it is too large Load diff

View file

@ -94,9 +94,10 @@ namespace Web::HTML {
} \
} while (0)
#define DONT_CONSUME_NEXT_INPUT_CHARACTER \
do { \
restore_to(m_prev_utf8_iterator); \
#define DONT_CONSUME_NEXT_INPUT_CHARACTER \
do { \
if (current_input_character.has_value()) \
restore_to(m_prev_utf8_iterator); \
} while (0)
#define ON(code_point) \
@ -1679,6 +1680,7 @@ _StartOfFunction:
ON_ASCII_ALPHANUMERIC
{
m_named_character_reference_matcher = {};
RECONSUME_IN(NamedCharacterReference);
}
ON('#')
@ -1697,16 +1699,31 @@ _StartOfFunction:
// 13.2.5.73 Named character reference state, https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
BEGIN_STATE(NamedCharacterReference)
{
size_t byte_offset = m_utf8_view.byte_offset_of(m_prev_utf8_iterator);
if (current_input_character.has_value()) {
if (m_named_character_reference_matcher.update_with_codepoint(current_input_character.value())) {
m_temporary_buffer.append(current_input_character.value());
continue;
} else {
DONT_CONSUME_NEXT_INPUT_CHARACTER;
}
}
auto match = HTML::code_points_from_entity(m_decoded_input.substring_view(byte_offset, m_decoded_input.length() - byte_offset));
// Only consume the characters within the longest match. It's possible that we've overconsumed code points,
// though, so we want to backtrack to the longest match found. For example, `&notindo` (which could still
// have led to `&notindot;`) would need to backtrack to `&not`.
auto overconsumed_code_points = m_named_character_reference_matcher.overconsumed_code_points();
if (overconsumed_code_points > 0) {
auto current_byte_offset = m_utf8_view.byte_offset_of(m_utf8_iterator);
// All consumed code points during character reference matching are guaranteed to be
// within the ASCII range, so they are always 1 byte wide.
restore_to(m_utf8_view.iterator_at_byte_offset_without_validation(current_byte_offset - overconsumed_code_points));
m_temporary_buffer.resize_and_keep_capacity(m_temporary_buffer.size() - overconsumed_code_points);
}
if (match.has_value()) {
skip(match->entity.length() - 1);
for (auto ch : match.value().entity)
m_temporary_buffer.append(ch);
if (consumed_as_part_of_an_attribute() && !match.value().entity.ends_with(';')) {
auto mapped_codepoints = m_named_character_reference_matcher.code_points();
// If there is a match
if (mapped_codepoints.has_value()) {
if (consumed_as_part_of_an_attribute() && !m_named_character_reference_matcher.last_match_ends_with_semicolon()) {
auto next_code_point = peek_code_point(0, stop_at_insertion_point);
if (next_code_point.has_value() && (next_code_point.value() == '=' || is_ascii_alphanumeric(next_code_point.value()))) {
FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
@ -1714,19 +1731,22 @@ _StartOfFunction:
}
}
if (!match.value().entity.ends_with(';')) {
if (!m_named_character_reference_matcher.last_match_ends_with_semicolon()) {
log_parse_error();
}
m_temporary_buffer = match.value().code_points;
m_temporary_buffer.clear_with_capacity();
m_temporary_buffer.append(mapped_codepoints.value().first);
auto second_codepoint = named_character_reference_second_codepoint_value(mapped_codepoints.value().second);
if (second_codepoint.has_value()) {
m_temporary_buffer.append(second_codepoint.value());
}
FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
SWITCH_TO_RETURN_STATE;
} else {
FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
// FIXME: This should be SWITCH_TO, but we always lose the first character on this path, so just reconsume it.
// I can't wrap my head around how to do it as the spec says.
RECONSUME_IN(AmbiguousAmpersand);
SWITCH_TO_WITH_UNCLEAN_BUILDER(AmbiguousAmpersand);
}
}
END_STATE
@ -1818,16 +1838,11 @@ _StartOfFunction:
m_character_reference_code += current_input_character.value() - 0x30;
continue;
}
ON_ASCII_UPPER_ALPHA
ON_ASCII_HEX_DIGIT
{
m_character_reference_code *= 16;
m_character_reference_code += current_input_character.value() - 0x37;
continue;
}
ON_ASCII_LOWER_ALPHA
{
m_character_reference_code *= 16;
m_character_reference_code += current_input_character.value() - 0x57;
auto hex_digit_min_ascii_value = is_ascii_upper_alpha(current_input_character.value()) ? 0x37 : 0x57;
m_character_reference_code += current_input_character.value() - hex_digit_min_ascii_value;
continue;
}
ON(';')

View file

@ -14,6 +14,7 @@
#include <AK/Utf8View.h>
#include <LibGC/Ptr.h>
#include <LibWeb/Forward.h>
#include <LibWeb/HTML/Parser/Entities.h>
#include <LibWeb/HTML/Parser/HTMLToken.h>
namespace Web::HTML {
@ -211,6 +212,8 @@ private:
HTMLToken m_current_token;
StringBuilder m_current_builder;
NamedCharacterReferenceMatcher m_named_character_reference_matcher;
Optional<ByteString> m_last_emitted_start_tag_name;
bool m_explicit_eof_inserted { false };

View file

@ -1779,6 +1779,8 @@ public class HTMLTokenizer {
}
// 13.2.5.73 Named character reference state, https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
case .NamedCharacterReference:
// FIXME: This should be updated to match the C++ implementation, this
// won't handle one-character-at-a-time document.write calls properly.
var subString = self.input[self.previousCursor...]
let entityMatch = subString.withUTF8 { utf8 in
return Web.HTML.match_entity_for_named_character_reference(AK.StringView(utf8.baseAddress!, utf8.count))

View file

@ -5,7 +5,9 @@
*/
#include <LibTextCodec/Decoder.h>
#include <LibWeb/HTML/Parser/Entities.h>
#include <LibWeb/HTML/Parser/HTMLTokenizerHelpers.h>
#include <LibWeb/HTML/Parser/NamedCharacterReferences.h>
namespace Web::HTML {
@ -22,9 +24,27 @@ OptionalString decode_to_utf8(StringView text, StringView encoding)
// Swift-friendly helper: runs a NamedCharacterReferenceMatcher over the start of `entity`
// and, if the matcher reports code points for a match, returns them together with the
// matched portion of the input. Returns std::nullopt when nothing matched.
OptionalEntityMatch match_entity_for_named_character_reference(StringView entity)
{
auto entity_match = code_points_from_entity(entity);
if (entity_match.has_value())
return entity_match.release_value();
NamedCharacterReferenceMatcher matcher;
int consumed_length = 0;
// Feed the input to the matcher one character at a time, stopping at the first
// character the matcher does not accept.
for (auto c : entity) {
if (!matcher.update_with_ascii_char(c))
break;
consumed_length++;
}
auto codepoints = matcher.code_points();
if (codepoints.has_value()) {
EntityMatch match;
// The matcher may have accepted characters past the end of the last match;
// exclude those so that the returned view only covers the match itself.
auto matched_length = consumed_length - matcher.overconsumed_code_points();
auto matched_string_view = entity.substring_view(0, matched_length);
// A match maps to either one or two code points.
auto second_codepoint = named_character_reference_second_codepoint_value(codepoints.value().second);
if (second_codepoint.has_value()) {
match = { { codepoints.value().first, second_codepoint.release_value() }, matched_string_view };
} else {
match = { { codepoints.value().first }, matched_string_view };
}
return match;
}
return std::nullopt;
}

View file

@ -8,16 +8,24 @@
#include <AK/String.h>
#include <AK/StringView.h>
#include <LibWeb/HTML/Parser/Entities.h>
#include <optional>
namespace Web::HTML {
// FIXME: This is a temporary stop-gap solution, and it should be removed once the C++
// NamedCharacterReference state implementation is implemented in Swift.
struct EntityMatch {
// The code point(s) the matched named character reference maps to (one or two entries).
Vector<u32, 2> code_points;
// The portion of the input that made up the match.
StringView entity;
};
// Swift-friendly wrapper for TextCodec::Decoder::to_utf8
using OptionalString = std::optional<String>;
OptionalString decode_to_utf8(StringView text, StringView encoding);
// Swift-friendly wrapper for HTML::code_points_from_entity
// FIXME: This is a temporary stop-gap solution, and it should be removed once the C++
// NamedCharacterReference state implementation is implemented in Swift.
using OptionalEntityMatch = std::optional<EntityMatch>;
OptionalEntityMatch match_entity_for_named_character_reference(StringView entity);

View file

@ -131,6 +131,29 @@ function (generate_css_implementation)
set(LIBWEB_ALL_GENERATED_IDL ${LIBWEB_ALL_GENERATED_IDL} PARENT_SCOPE)
endfunction()
# Generates the HTML parser's named character reference tables and registers the
# generated header with LIBWEB_ALL_GENERATED_HEADERS (propagated to the parent scope).
function (generate_html_implementation)
    set(LIBWEB_INPUT_FOLDER "${CMAKE_CURRENT_SOURCE_DIR}")

    # Runs Lagom::GenerateNamedCharacterReferences on Entities.json to produce
    # HTML/Parser/NamedCharacterReferences.{h,cpp}.
    invoke_generator(
        "NamedCharacterReferences.cpp"
        Lagom::GenerateNamedCharacterReferences
        "${LIBWEB_INPUT_FOLDER}/HTML/Parser/Entities.json"
        "HTML/Parser/NamedCharacterReferences.h"
        "HTML/Parser/NamedCharacterReferences.cpp"
        arguments -j "${LIBWEB_INPUT_FOLDER}/HTML/Parser/Entities.json"
    )

    set(HTML_GENERATED_HEADERS
        "HTML/Parser/NamedCharacterReferences.h"
    )
    list(TRANSFORM HTML_GENERATED_HEADERS PREPEND "${CMAKE_CURRENT_BINARY_DIR}/")
    if (ENABLE_INSTALL_HEADERS)
        # The header is included as <LibWeb/HTML/Parser/NamedCharacterReferences.h>, so it
        # has to be installed into the Parser subdirectory for installed includes to resolve.
        install(FILES ${HTML_GENERATED_HEADERS} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/LibWeb/HTML/Parser")
    endif()
    list(APPEND LIBWEB_ALL_GENERATED_HEADERS ${HTML_GENERATED_HEADERS})
    set(LIBWEB_ALL_GENERATED_HEADERS ${LIBWEB_ALL_GENERATED_HEADERS} PARENT_SCOPE)
endfunction()
function (generate_js_bindings target)
set(LIBWEB_INPUT_FOLDER "${CMAKE_CURRENT_SOURCE_DIR}")
set(generated_idl_targets ${LIBWEB_ALL_GENERATED_IDL})

View file

@ -11,5 +11,6 @@ lagom_tool(GenerateCSSTransformFunctions SOURCES GenerateCSSTransformFunction
lagom_tool(GenerateWindowOrWorkerInterfaces SOURCES GenerateWindowOrWorkerInterfaces.cpp LIBS LibMain LibIDL)
lagom_tool(GenerateAriaRoles SOURCES GenerateAriaRoles.cpp LIBS LibMain)
lagom_tool(GenerateWebGLRenderingContext SOURCES GenerateWebGLRenderingContext.cpp BindingsGenerator/IDLGenerators.cpp LIBS LibMain LibIDL)
lagom_tool(GenerateNamedCharacterReferences SOURCES GenerateNamedCharacterReferences.cpp LIBS LibMain)
add_subdirectory(BindingsGenerator)

View file

@ -0,0 +1,503 @@
/*
* Copyright (c) 2024, the SerenityOS developers.
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include "GeneratorUtil.h"
#include <AK/Array.h>
#include <AK/FixedArray.h>
#include <AK/SourceGenerator.h>
#include <AK/StringBuilder.h>
#include <LibCore/ArgsParser.h>
#include <LibMain/Main.h>
ErrorOr<void> generate_header_file(Core::File& file);
ErrorOr<void> generate_implementation_file(JsonObject& named_character_reference_data, Core::File& file);
// Entry point of the generator: parses the command line, reads the JSON input and
// writes the generated header and implementation files.
ErrorOr<int> serenity_main(Main::Arguments arguments)
{
    StringView header_output_path;
    StringView implementation_output_path;
    StringView entities_json_path;

    Core::ArgsParser parser;
    parser.add_option(header_output_path, "Path to the Entities header file to generate", "generated-header-path", 'h', "generated-header-path");
    parser.add_option(implementation_output_path, "Path to the Entities implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
    parser.add_option(entities_json_path, "Path to the JSON file to read from", "json-path", 'j', "json-path");
    parser.parse(arguments);

    // The input must be a JSON object; its members are the named character references.
    auto parsed_json = TRY(read_entire_file_as_json(entities_json_path));
    VERIFY(parsed_json.is_object());
    auto entities = parsed_json.as_object();

    // Both output files are opened before anything is generated.
    auto header_file = TRY(Core::File::open(header_output_path, Core::File::OpenMode::Write));
    auto implementation_file = TRY(Core::File::open(implementation_output_path, Core::File::OpenMode::Write));

    TRY(generate_header_file(*header_file));
    TRY(generate_implementation_file(entities, *implementation_file));
    return 0;
}
// The code point(s) a named character reference maps to, as read from the JSON input.
struct Codepoints {
// The first (or only) code point.
u32 first;
// The second code point, or 0 when the reference maps to a single code point.
u32 second;
};
// Maps the second code point of a named character reference to the name of the
// corresponding `NamedCharacterReferenceSecondCodepoint` enumerator in the generated header.
//
// `codepoint` is 0 when the reference maps to a single code point, which yields "None".
// Any other value that has no enumerator aborts the generator: silently emitting "None"
// for it would drop the second code point from the generated table. If the JSON input
// ever gains a new second code point, a new enumerator has to be added here and in the
// generated enum.
inline static StringView get_second_codepoint_enum_name(u32 codepoint)
{
    switch (codepoint) {
    case 0:
        return "None"sv;
    case 0x0338:
        return "CombiningLongSolidusOverlay"sv;
    case 0x20D2:
        return "CombiningLongVerticalLineOverlay"sv;
    case 0x200A:
        return "HairSpace"sv;
    case 0x0333:
        return "CombiningDoubleLowLine"sv;
    case 0x20E5:
        return "CombiningReverseSolidusOverlay"sv;
    case 0xFE00:
        return "VariationSelector1"sv;
    case 0x006A:
        return "LatinSmallLetterJ"sv;
    case 0x0331:
        return "CombiningMacronBelow"sv;
    default:
        // Unknown second code point: fail loudly instead of generating a wrong table.
        VERIFY_NOT_REACHED();
    }
}
// Writes the generated header to `file`. The header text is a fixed template: it declares
// the second-code-point enum (plus a helper to turn an enumerator back into its value),
// the packed code point pair struct and the lookup functions that the generated
// implementation file defines.
ErrorOr<void> generate_header_file(Core::File& file)
{
    StringBuilder header_text;
    SourceGenerator header_generator { header_text };

    header_generator.append(R"~~~(
#pragma once
#include <AK/Optional.h>
#include <AK/Types.h>
namespace Web::HTML {
enum class NamedCharacterReferenceSecondCodepoint {
None,
CombiningLongSolidusOverlay, // U+0338
CombiningLongVerticalLineOverlay, // U+20D2
HairSpace, // U+200A
CombiningDoubleLowLine, // U+0333
CombiningReverseSolidusOverlay, // U+20E5
VariationSelector1, // U+FE00
LatinSmallLetterJ, // U+006A
CombiningMacronBelow, // U+0331
};
inline Optional<u16> named_character_reference_second_codepoint_value(NamedCharacterReferenceSecondCodepoint codepoint)
{
switch (codepoint) {
case NamedCharacterReferenceSecondCodepoint::None:
return {};
case NamedCharacterReferenceSecondCodepoint::CombiningLongSolidusOverlay:
return 0x0338;
case NamedCharacterReferenceSecondCodepoint::CombiningLongVerticalLineOverlay:
return 0x20D2;
case NamedCharacterReferenceSecondCodepoint::HairSpace:
return 0x200A;
case NamedCharacterReferenceSecondCodepoint::CombiningDoubleLowLine:
return 0x0333;
case NamedCharacterReferenceSecondCodepoint::CombiningReverseSolidusOverlay:
return 0x20E5;
case NamedCharacterReferenceSecondCodepoint::VariationSelector1:
return 0xFE00;
case NamedCharacterReferenceSecondCodepoint::LatinSmallLetterJ:
return 0x006A;
case NamedCharacterReferenceSecondCodepoint::CombiningMacronBelow:
return 0x0331;
default:
VERIFY_NOT_REACHED();
}
}
// Note: The first codepoint could fit in 17 bits, and the second could fit in 4 (if unsigned).
// However, to get any benefit from minimizing the struct size, it would need to be accompanied by
// bit-packing the g_named_character_reference_codepoints_lookup array, and then either
// using 5 bits for the second field (since enum bitfields are signed), or using a 4-bit wide
// unsigned integer type.
struct NamedCharacterReferenceCodepoints {
u32 first : 24; // Largest value is U+1D56B
NamedCharacterReferenceSecondCodepoint second : 8;
};
static_assert(sizeof(NamedCharacterReferenceCodepoints) == 4);
u16 named_character_reference_child_index(u16 node_index);
bool named_character_reference_is_end_of_word(u16 node_index);
Optional<NamedCharacterReferenceCodepoints> named_character_reference_codepoints_from_unique_index(u16 unique_index);
Optional<u16> named_character_reference_find_sibling_and_update_unique_index(u16 first_child_index, u8 character, u16& unique_index);
} // namespace Web::HTML
)~~~");

    auto const rendered = header_generator.as_string_view();
    TRY(file.write_until_depleted(rendered.bytes()));
    return {};
}
// A single state of the automaton being built. Children are stored in a 128-entry
// table indexed by character value; an empty slot is a null pointer.
class Node final : public RefCounted<Node> {
private:
    // Hash table traits under which two nodes are the same if they have the same
    // terminal flag and exactly the same child pointers. This is what allows
    // equivalent nodes to be merged during minimization.
    struct NonnullRefPtrNodeTraits {
        static unsigned hash(NonnullRefPtr<Node> const& node)
        {
            u32 combined = 0;
            for (auto* child : node->m_children)
                combined ^= ptr_hash(child);
            combined ^= int_hash(static_cast<u32>(node->m_is_terminal));
            return combined;
        }

        static bool equals(NonnullRefPtr<Node> const& a, NonnullRefPtr<Node> const& b)
        {
            if (a->m_is_terminal != b->m_is_terminal)
                return false;
            for (size_t slot = 0; slot < a->m_children.size(); ++slot) {
                if (a->m_children[slot] != b->m_children[slot])
                    return false;
            }
            return true;
        }
    };

public:
    static NonnullRefPtr<Node> create()
    {
        return adopt_ref(*new (nothrow) Node());
    }

    using NodeTableType = HashTable<NonnullRefPtr<Node>, NonnullRefPtrNodeTraits, false>;

    // Recursively computes, for this node and everything below it, the number of
    // words accepted when starting from the node: one if the node itself is
    // terminal, plus the numbers of all of its children.
    void calc_numbers()
    {
        u16 accepted_words = m_is_terminal ? 1 : 0;
        for (auto* child : m_children) {
            if (child == nullptr)
                continue;
            child->calc_numbers();
            accepted_words += child->m_number;
        }
        m_number = accepted_words;
    }

    // Counts the non-null entries of the child table.
    u8 num_direct_children()
    {
        u8 count = 0;
        for (auto* child : m_children) {
            if (child != nullptr)
                count += 1;
        }
        return count;
    }

    Array<Node*, 128>& children() { return m_children; }

    void set_as_terminal() { m_is_terminal = true; }

    bool is_terminal() const { return m_is_terminal; }

    u16 number() const { return m_number; }

private:
    Node() = default;

    Array<Node*, 128> m_children { nullptr };
    bool m_is_terminal { false };
    u16 m_number { 0 };
};
// Records one edge added while inserting a word: `parent` gained `child` under `character`.
// DafsaBuilder keeps these until `minimize` has checked `child` against the already
// minimized nodes.
struct UncheckedNode {
RefPtr<Node> parent;
char character;
RefPtr<Node> child;
};
class DafsaBuilder {
AK_MAKE_NONCOPYABLE(DafsaBuilder);
public:
using MappingType = HashMap<StringView, String>;
DafsaBuilder()
: m_root(Node::create())
{
}
void insert(StringView str)
{
// Must be inserted in sorted order
VERIFY(str > m_previous_word);
size_t common_prefix_len = 0;
for (size_t i = 0; i < min(str.length(), m_previous_word.length()); i++) {
if (str[i] != m_previous_word[i])
break;
common_prefix_len++;
}
minimize(common_prefix_len);
RefPtr<Node> node;
if (m_unchecked_nodes.size() == 0)
node = m_root;
else
node = m_unchecked_nodes.last().child;
auto remaining = str.substring_view(common_prefix_len);
for (char const c : remaining) {
VERIFY(node.ptr()->children().at(c) == nullptr);
auto child = Node::create();
node.ptr()->children().at(c) = child;
m_unchecked_nodes.append(UncheckedNode { node, c, child });
node = child;
}
node.ptr()->set_as_terminal();
VERIFY(str.copy_characters_to_buffer(m_previous_word_buf, sizeof(m_previous_word_buf)));
m_previous_word = StringView(m_previous_word_buf, str.length());
}
void minimize(size_t down_to)
{
if (m_unchecked_nodes.size() == 0)
return;
while (m_unchecked_nodes.size() > down_to) {
auto unchecked_node = m_unchecked_nodes.take_last();
auto child = unchecked_node.child.release_nonnull();
auto it = m_minimized_nodes.find(child);
if (it != m_minimized_nodes.end()) {
unchecked_node.parent.ptr()->children().at(unchecked_node.character) = *it;
} else {
m_minimized_nodes.set(child);
}
}
}
void calc_numbers()
{
m_root.ptr()->calc_numbers();
}
Optional<size_t> get_unique_index(StringView str)
{
size_t index = 0;
Node* node = m_root.ptr();
for (char const c : str) {
if (node->children().at(c) == nullptr)
return {};
for (int sibling_c = 0; sibling_c < 128; sibling_c++) {
if (node->children().at(sibling_c) == nullptr)
continue;
if (sibling_c < c) {
index += node->children().at(sibling_c)->number();
}
}
node = node->children().at(c);
if (node->is_terminal())
index += 1;
}
return index;
}
Node* root()
{
return m_root.ptr();
}
private:
NonnullRefPtr<Node> m_root;
Node::NodeTableType m_minimized_nodes;
Vector<UncheckedNode> m_unchecked_nodes;
char m_previous_word_buf[64];
StringView m_previous_word = ""sv;
};
// Emits one array row for every child of `node`, in ascending character order.
//
// A child that has not been assigned a child index yet is appended to `queue`; if it has
// children of its own, it is also assigned the next free block of indexes (one per child)
// in `child_indexes`. Children without an assigned index are emitted with a child index of 0.
//
// Returns the first index that is still available after this call.
static u16 write_children(Node* node, SourceGenerator& generator, Vector<Node*>& queue, HashMap<Node*, u16>& child_indexes, u16 first_available_index)
{
    auto next_free_index = first_available_index;
    auto const total_children = node->num_direct_children();
    u16 emitted_children = 0;

    for (u8 c = 0; c < 128; c++) {
        auto* child = node->children().at(c);
        if (child == nullptr)
            continue;

        auto is_last_child = (emitted_children + 1) == total_children;

        if (!child_indexes.contains(child)) {
            if (auto grandchildren = child->num_direct_children(); grandchildren > 0) {
                child_indexes.set(child, next_free_index);
                next_free_index += grandchildren;
            }
            queue.append(child);
        }

        auto row_generator = generator.fork();
        row_generator.set("char", StringView(&c, 1));
        row_generator.set("number", MUST(String::formatted("{}", child->number())));
        row_generator.set("end_of_word", MUST(String::formatted("{}", child->is_terminal())));
        row_generator.set("end_of_list", MUST(String::formatted("{}", is_last_child)));
        auto child_index = child_indexes.get(child).value_or(0);
        row_generator.set("child_index", MUST(String::formatted("{}", child_index)));
        row_generator.append(R"~~~( { '@char@', @number@, @end_of_word@, @end_of_list@, @child_index@ },
)~~~");

        emitted_children++;
    }

    return next_free_index;
}
// Writes the generated implementation to `file`: the code point lookup table (indexed by
// unique index - 1), the flattened node array, and the functions that query them.
//
// Every key of `named_character_reference_data` is inserted without its first character
// (presumably the leading '&' of the keys in the JSON input - NOTE(review): confirm).
ErrorOr<void> generate_implementation_file(JsonObject& named_character_reference_data, Core::File& file)
{
    StringBuilder output;
    SourceGenerator generator { output };

    // Build the automaton from all names, then minimize whatever is still pending
    // and compute the node numbers used for the unique index calculation.
    DafsaBuilder dafsa;
    named_character_reference_data.for_each_member([&](auto& name, auto&) {
        dafsa.insert(name.substring_view(1));
    });
    dafsa.minimize(0);
    dafsa.calc_numbers();

    // As a sanity check, confirm that the minimal perfect hashing doesn't
    // have any collisions
    {
        HashTable<size_t> seen_indexes;
        named_character_reference_data.for_each_member([&](auto& name, auto&) {
            auto unique_index = dafsa.get_unique_index(name.substring_view(1)).value();
            VERIFY(!seen_indexes.contains(unique_index));
            seen_indexes.set(unique_index);
        });
        VERIFY(named_character_reference_data.size() == seen_indexes.size());
    }

    // Place the code points of every name at the position given by its unique index
    // (which is 1-based, hence the -1).
    auto codepoints_by_index = MUST(FixedArray<Codepoints>::create(named_character_reference_data.size()));
    named_character_reference_data.for_each_member([&](auto& name, auto& entry) {
        auto codepoint_array = entry.as_object().get_array("codepoints"sv).value();
        auto table_slot = dafsa.get_unique_index(name.substring_view(1)).value() - 1;

        u32 first_codepoint = codepoint_array[0].template as_integer<u32>();
        u32 second_codepoint = 0;
        if (codepoint_array.size() == 2)
            second_codepoint = codepoint_array[1].template as_integer<u32>();

        codepoints_by_index[table_slot] = Codepoints { first_codepoint, second_codepoint };
    });

    generator.append(R"~~~(
#include <LibWeb/HTML/Parser/Entities.h>
#include <AK/Format.h>
namespace Web::HTML {
static NamedCharacterReferenceCodepoints g_named_character_reference_codepoints_lookup[] = {
)~~~");

    for (auto const& entry : codepoints_by_index) {
        auto row_generator = generator.fork();
        row_generator.set("first_codepoint", MUST(String::formatted("0x{:X}", entry.first)));
        row_generator.set("second_codepoint_name", get_second_codepoint_enum_name(entry.second));
        row_generator.append(R"~~~( {@first_codepoint@, NamedCharacterReferenceSecondCodepoint::@second_codepoint_name@},
)~~~");
    }

    generator.append(R"~~~(};
struct DafsaNode {
// The actual alphabet of characters used in the list of named character references only
// includes 61 unique characters ('1'...'8', ';', 'a'...'z', 'A'...'Z'), but we have
// bits to spare and encoding this as a `u8` allows us to avoid the need for converting
// between an `enum(u6)` containing only the alphabet and the actual `u8` character value.
u8 character;
// Nodes are numbered with "an integer which gives the number of words that
// would be accepted by the automaton starting from that state." This numbering
// allows calculating "a one-to-one correspondence between the integers 1 to L
// (L is the number of words accepted by the automaton) and the words themselves."
//
// Essentially, this allows us to have a minimal perfect hashing scheme such that
// it's possible to store & lookup the codepoint transformations of each named character
// reference using a separate array.
//
// Empirically, the largest number in our DAFSA is 168, so all number values fit in a u8.
u8 number;
// If true, this node is the end of a valid named character reference.
// Note: This does not necessarily mean that this node does not have child nodes.
bool end_of_word : 1;
// If true, this node is the end of a sibling list.
// If false, then (index + 1) will contain the next sibling.
bool end_of_list : 1;
// Index of the first child of this node.
// There are 3872 nodes in our DAFSA, so all indexes could fit in a u12.
u16 child_index : 14;
};
static_assert(sizeof(DafsaNode) == 4);
static DafsaNode g_named_character_reference_dafsa[] = {
{ 0, 0, false, true, 1 },
)~~~");

    // Emit the nodes level by level: the root's children come first (index 0 is the
    // root row written above), followed by the children of every queued node in
    // the order in which the nodes were queued.
    Vector<Node*> pending_nodes;
    HashMap<Node*, u16> child_indexes;
    u16 next_free_index = dafsa.root()->num_direct_children() + 1;

    next_free_index = write_children(dafsa.root(), generator, pending_nodes, child_indexes, next_free_index);
    while (!pending_nodes.is_empty()) {
        auto* current = pending_nodes.take_first();
        next_free_index = write_children(current, generator, pending_nodes, child_indexes, next_free_index);
    }

    generator.append(R"~~~(};
u16 named_character_reference_child_index(u16 node_index) {
return g_named_character_reference_dafsa[node_index].child_index;
}
bool named_character_reference_is_end_of_word(u16 node_index) {
return g_named_character_reference_dafsa[node_index].end_of_word;
}
// Note: The unique index is 1-based.
Optional<NamedCharacterReferenceCodepoints> named_character_reference_codepoints_from_unique_index(u16 unique_index) {
if (unique_index == 0) return {};
return g_named_character_reference_codepoints_lookup[unique_index - 1];
}
// Search `first_child_index` and siblings of `first_child_index` for a node with the value `character`.
// If found, returns the index of the node within the `dafsa` array. Otherwise, returns `null`.
// Updates `unique_index` as the array is traversed
Optional<u16> named_character_reference_find_sibling_and_update_unique_index(u16 first_child_index, u8 character, u16& unique_index) {
auto index = first_child_index;
while (true) {
if (g_named_character_reference_dafsa[index].character < character) {
unique_index += g_named_character_reference_dafsa[index].number;
}
if (g_named_character_reference_dafsa[index].character == character) {
if (g_named_character_reference_dafsa[index].end_of_word) unique_index++;
return index;
}
if (g_named_character_reference_dafsa[index].end_of_list) return {};
index += 1;
}
VERIFY_NOT_REACHED();
}
} // namespace Web::HTML
)~~~");

    auto const rendered = generator.as_string_view();
    TRY(file.write_until_depleted(rendered.bytes()));
    return {};
}

View file

@ -76,3 +76,11 @@ lagom_tool("GenerateWindowOrWorkerInterfaces") {
"//Userland/Libraries/LibMain",
]
}
# Host tool built from GenerateNamedCharacterReferences.cpp; it generates the
# NamedCharacterReferences header and implementation from a JSON input.
lagom_tool("GenerateNamedCharacterReferences") {
sources = [ "GenerateNamedCharacterReferences.cpp" ]
deps = [
":headers",
"//Userland/Libraries/LibMain",
]
}

View file

@ -248,6 +248,23 @@ embed_as_string("generate_quirks_mode_stylesheet_source") {
namespace = "Web::CSS"
}
# Runs the GenerateNamedCharacterReferences tool on HTML/Parser/Entities.json to
# produce NamedCharacterReferences.h and NamedCharacterReferences.cpp.
compiled_action("generate_named_character_references") {
tool = "//Meta/Lagom/Tools/CodeGenerators/LibWeb:GenerateNamedCharacterReferences"
inputs = [ "HTML/Parser/Entities.json" ]
outputs = [
"$target_gen_dir/HTML/Parser/NamedCharacterReferences.h",
"$target_gen_dir/HTML/Parser/NamedCharacterReferences.cpp",
]
# -h: generated header path, -c: generated implementation path, -j: JSON input path.
args = [
"-h",
rebase_path(outputs[0], root_build_dir),
"-c",
rebase_path(outputs[1], root_build_dir),
"-j",
rebase_path(inputs[0], root_build_dir),
]
}
source_set("all_generated") {
generated_deps = [
":WebWorkerClientEndpoint",

View file

@ -30,7 +30,6 @@ Text/input/Worker/Worker-performance.html
Text/input/Worker/Worker-postMessage-transfer.html
; Skipped due to assertion failures
Text/input/wpt-import/html/syntax/parsing/html5lib_entities01.html
Text/input/wpt-import/html/syntax/parsing/html5lib_plain-text-unsafe.html
Text/input/wpt-import/html/syntax/parsing/html5lib_template.html
Text/input/wpt-import/html/syntax/parsing/html5lib_tests1.html
@ -39,7 +38,6 @@ Text/input/wpt-import/html/syntax/parsing/html5lib_tests16.html
Text/input/wpt-import/html/syntax/parsing/html5lib_tests19.html
Text/input/wpt-import/html/syntax/parsing/html5lib_tests5.html
Text/input/wpt-import/html/syntax/parsing/html5lib_webkit01.html
Text/input/wpt-import/html/syntax/parsing/named-character-references.html
; Unknown, imported as skipped in #2148
Text/input/wpt-import/html/infrastructure/safe-passing-of-structured-data/structuredclone_0.html

View file

@ -199,6 +199,25 @@ TEST_CASE(character_reference_in_attribute)
END_ENUMERATION();
}
// A decimal numeric character reference that runs into the end of the input
// (no terminating ';') is expected to produce the referenced code point.
TEST_CASE(numeric_character_reference)
{
auto tokens = run_tokenizer("&#1111"sv);
BEGIN_ENUMERATION(tokens);
EXPECT_CHARACTER_TOKEN(1111);
EXPECT_END_OF_FILE_TOKEN();
END_ENUMERATION();
}
// A hexadecimal character reference with mixed-case digits: "A12b" is expected to be
// read as 0xA12B, and the following 'Z' (not a hex digit) to be emitted as its own
// character token.
TEST_CASE(hex_character_reference)
{
auto tokens = run_tokenizer("&#xA12bZ"sv);
BEGIN_ENUMERATION(tokens);
EXPECT_CHARACTER_TOKEN(0xA12B);
EXPECT_CHARACTER_TOKEN('Z');
EXPECT_END_OF_FILE_TOKEN();
END_ENUMERATION();
}
TEST_CASE(comment)
{
auto tokens = run_tokenizer("<p><!-- This is a comment --></p>"sv);

View file

@ -2,8 +2,7 @@ Harness status: OK
Found 3 tests
2 Pass
1 Fail
3 Pass
Pass Check number of divs
Pass Check div structure: network
Fail Check div structure: document.write
Pass Check div structure: document.write

View file

@ -0,0 +1,80 @@
Harness status: OK
Found 75 tests
75 Pass
Pass html5lib_entities01.html 16c694bcf0b3ff3723fa070eea7e1e82ef12a337
Pass html5lib_entities01.html 05e04b39ef06e2367a33326f5dd566913aa6628f
Pass html5lib_entities01.html fbf7d9fec595585869c5c595d5588b34fd175278
Pass html5lib_entities01.html e59b0a76d7bcfb429b27e00e469f35e08a9bdd1a
Pass html5lib_entities01.html 5ea854d6ecd4d6dd459cb36d4faf3ed36e11c073
Pass html5lib_entities01.html 119cd15b852615cd0fce759769b4a3788595e3bb
Pass html5lib_entities01.html 903cefcfae1125cb71fc77f4a6b7d3546e8f4020
Pass html5lib_entities01.html 69f08b40c7506153e809415ca98e2ed98992216b
Pass html5lib_entities01.html 9c00a1833e8cf4af28c8bd94902412ad7052b4b0
Pass html5lib_entities01.html b5bcdcbc6e88b380be0e48ca2620fbbb8e92e497
Pass html5lib_entities01.html bf6c90305b2856c2d9c9a146dfff867fe7a5e0f3
Pass html5lib_entities01.html 6b9c8d175a3d7b6cf04ffd72e44a7dc88686460f
Pass html5lib_entities01.html 76c184d9ce64b8a52c2e67eafeb8d332c096f2be
Pass html5lib_entities01.html 4c30f8f931eb44c2f208e837555c0cc444dd4612
Pass html5lib_entities01.html 1db77ef761092d65ce847c0bcd6e7cb892db754d
Pass html5lib_entities01.html 284c18aa40a73e4052525a9ffb30b23182f237ea
Pass html5lib_entities01.html 6b336a43e394d3ab7ceb2ab54c63409e8a27aded
Pass html5lib_entities01.html 436c30dd76bf9b5c2b29a181d9a9412ec0ab4bdf
Pass html5lib_entities01.html 1373a52ddcb71f20f29d92abb6714eaabeba7424
Pass html5lib_entities01.html d60f4f324a1ad9c09c4d3590c8c537af2852eeb4
Pass html5lib_entities01.html d21511e2df56c306c78e1449c960c66e565e016e
Pass html5lib_entities01.html 39107d16f24d4c7bcd40ad1239b5f4f677877ee8
Pass html5lib_entities01.html a44b740e8b2349e75c9eb0376f665eab13ff821d
Pass html5lib_entities01.html a0e38b1c19eba037b34c68864634cff032f0b892
Pass html5lib_entities01.html 390d9571a24be0961c8fcd78c69eea16a6414246
Pass html5lib_entities01.html ceba8404405dd3b3b423c45411bde15bf72a846d
Pass html5lib_entities01.html f8dd2fccc21d3a08790a7877186840a692adf111
Pass html5lib_entities01.html d526830d439d3c4e966b22fbedf819d465d3107b
Pass html5lib_entities01.html 240af7bacbfecce6e2a973de9c89fad817fb8d42
Pass html5lib_entities01.html d657585ca1df5b86693fca8a0a2eae76bd9b1c2b
Pass html5lib_entities01.html b508fcffb2d9f2424c7837270e51824321fb4570
Pass html5lib_entities01.html 897ab551df27df14418a46ff1c3acef8338c53f3
Pass html5lib_entities01.html 493a472ccf903088c813ff6874d54482a161df8f
Pass html5lib_entities01.html 67d19edb1f6ecbbb6ccb90df2345e52e5c58efc3
Pass html5lib_entities01.html dded422b7406c966c944555f220d7d3dfcf2a143
Pass html5lib_entities01.html 60a76c9e10e4ac53f836f9e45eb0518dd0b7b73b
Pass html5lib_entities01.html b47fcc6a614247319908b00935f10ec134399917
Pass html5lib_entities01.html 3b5c7d0331ae900e1179eaf3545c78d147434fbf
Pass html5lib_entities01.html a0f119508046dbb4f8059232f6e99f66c1e8e7a6
Pass html5lib_entities01.html daab384be8471edeb755353c5dccaad0c415dac5
Pass html5lib_entities01.html 8485e4d103a517615f39c0d0b71fe5065c5437db
Pass html5lib_entities01.html 4c28749faddb096d1f04792b7daf039268c43181
Pass html5lib_entities01.html c84c576954c4c493528eaa34233c926653152be5
Pass html5lib_entities01.html b2797e18c499df32296545225c259dbf4bea2908
Pass html5lib_entities01.html 5b2dfe6f187413faecd91336bd353c05768ea722
Pass html5lib_entities01.html 34af7be5bcff18ec869a306b19daea70f61f7088
Pass html5lib_entities01.html 8f2e74688427858fc1a895fc472d074a0528a7ae
Pass html5lib_entities01.html 530d6251a43d688e69959237e519812585de8266
Pass html5lib_entities01.html bced9b8c339d0d2838ea0fffe9a64027b4f3a877
Pass html5lib_entities01.html 9c9e8079df25999c606bc84f46b348544a23b9b4
Pass html5lib_entities01.html ec61d22b3bc6f93e54bbff964311bba9a3a06b21
Pass html5lib_entities01.html 391f0136aebdd0e874c8eb85651ffda7e9f86f24
Pass html5lib_entities01.html e2974e7029b008539aacc1ee885705764f8c53f4
Pass html5lib_entities01.html 23e105ca5329cc0338a96fe7e088ba9b319c46d1
Pass html5lib_entities01.html 87af28752724c400edef3970e52440639be5b1a8
Pass html5lib_entities01.html f50d9e39e2bda3d8c0b1c69d1f1e4e86c4b39ac3
Pass html5lib_entities01.html 12827fadb8d36b829e9c2c7315e0848d2d7ef278
Pass html5lib_entities01.html f603997321070e6ccada6fefe4240a9e6c7e870a
Pass html5lib_entities01.html a88e381bf36e74aaa8ac5b0877153b7158bae579
Pass html5lib_entities01.html 11019fa64a25748a5bfb864fc200ec2710b54aa3
Pass html5lib_entities01.html 3d238b9146102bd11e898ff4913f86e8ded65be4
Pass html5lib_entities01.html a1375bfde7be56e514471700e030b1c7e7090e2f
Pass html5lib_entities01.html 7936b73efa385d183e93453bea24fd0c4dff4742
Pass html5lib_entities01.html 41cdf6978b0c48e7044d5e4534fc8bb08de4cdf2
Pass html5lib_entities01.html 5138f572a4db2e2edc2d723e1bb87af72ab501f4
Pass html5lib_entities01.html f30757617b6df330deba1cb607e8d47f71bda13f
Pass html5lib_entities01.html a6fd8cdca1fa8cf07519d9a0c5b779eafa438b70
Pass html5lib_entities01.html ada342466887e85d89c3b815b127bfced036ac76
Pass html5lib_entities01.html 74bd99a9263f0b8e8a5fac4d2684fe37d5a1a9cc
Pass html5lib_entities01.html bffe7b00046407080251ab6bf58cb97ce2a34893
Pass html5lib_entities01.html 5aef37f1f2b9ac45adfade044c882eb09a297569
Pass html5lib_entities01.html 6e2d817539fb3b2023c7bcb88ad220c136f70cf0
Pass html5lib_entities01.html d4ac52727ff405f61a1d878a0aa1951ae5264c80
Pass html5lib_entities01.html d2584faaa4dda5283955b2dc22812a018d04a72d
Pass html5lib_entities01.html 56dc3e612fbfa06cfeb26957e357defcf73aa220

View file

@ -2,8 +2,7 @@ Harness status: OK
Found 63 tests
62 Pass
1 Fail
63 Pass
Pass html5lib_tests2.html e070301fb578bd639ecbc7ec720fa60222d05826
Pass html5lib_tests2.html aaf24dabcb42470e447d241a40def0d136c12b93
Pass html5lib_tests2.html b6c1142484570bb90c36e454ee193cca17bb618a
@ -27,7 +26,7 @@ Pass html5lib_tests2.html 73b97cd984a62703ec54ec4a876ec32aa5fd3b8c
Pass html5lib_tests2.html 2db9616ed62fc2a26056f3395459869cf556974d
Pass html5lib_tests2.html b59aa1c714892618eaccd51696658887fcbd2045
Pass html5lib_tests2.html 98818e7fda2506603bd208662613edb40297c2d3
Fail html5lib_tests2.html e0c43080cf61c0696031bdb097bea4f2a647cfc2
Pass html5lib_tests2.html e0c43080cf61c0696031bdb097bea4f2a647cfc2
Pass html5lib_tests2.html f7753d80a422c40b5fa04d99e52d8ae83369757a
Pass html5lib_tests2.html 7cbd584aef9508a90c98f80040078149a92ec869
Pass html5lib_tests2.html e0f7f130b1e3653dd06f10f3492e4f0bf4cd3cfa

View file

@ -2,5 +2,5 @@ Harness status: OK
Found 1 tests
1 Fail
Fail document.write entity
1 Pass
Pass document.write entity