
Currently this can parse XML and resolve external resources/references, and read a DTD (but not apply or verify its rules). That's good enough for _most_ XHTML documents as the HTML 5 spec enforces its own rules about document well-formedness, and does not make use of XML DTDs (aside from a list of predefined entities). An accompanying `xml` utility is provided that can read and dump XML documents, and can also run the XML conformance test suite.
1780 lines
57 KiB
C++
1780 lines
57 KiB
C++
/*
|
|
* Copyright (c) 2022, Ali Mohammad Pur <mpfard@serenityos.org>
|
|
*
|
|
* SPDX-License-Identifier: BSD-2-Clause
|
|
*/
|
|
|
|
#include <LibXML/DOM/Document.h>
|
|
#include <LibXML/Parser/Parser.h>
|
|
|
|
struct Range {
|
|
consteval Range(u32 start, u32 end)
|
|
: start(start)
|
|
, end(end)
|
|
{
|
|
}
|
|
|
|
u32 start;
|
|
u32 end;
|
|
};
|
|
|
|
template<auto... ranges>
|
|
struct ranges_for_search {
|
|
auto contains(u32 value) const
|
|
{
|
|
return ((value >= ranges.start && value <= ranges.end) || ...);
|
|
}
|
|
|
|
bool operator()(u32 value) const
|
|
{
|
|
return contains(value);
|
|
}
|
|
|
|
template<auto... ranges_to_include>
|
|
consteval auto with() const
|
|
{
|
|
return ranges_for_search<ranges..., ranges_to_include...>();
|
|
}
|
|
|
|
template<auto... ranges_to_include>
|
|
consteval auto unify(ranges_for_search<ranges_to_include...> const&) const
|
|
{
|
|
return ranges_for_search<ranges..., ranges_to_include...>();
|
|
}
|
|
};
|
|
|
|
template<size_t Count, typename Element>
|
|
struct StringSet {
|
|
consteval StringSet(Element const (&entries)[Count])
|
|
{
|
|
for (size_t i = 0; i < Count - 1; ++i)
|
|
elements[i] = entries[i];
|
|
}
|
|
|
|
consteval auto operator[](size_t i) const { return elements[i]; }
|
|
|
|
Element elements[Count - 1];
|
|
};
|
|
|
|
template<StringSet chars>
|
|
consteval static auto set_to_search()
|
|
{
|
|
return ([&]<auto... Ix>(IndexSequence<Ix...>) {
|
|
return ranges_for_search<Range(chars[Ix], chars[Ix])...>();
|
|
}(MakeIndexSequence<array_size(chars.elements)>()));
|
|
}
|
|
|
|
namespace XML {
|
|
|
|
// Out-of-class definition of the static member; it starts at zero.
// NOTE(review): presumably the nesting depth used for debug output — confirm in Parser.h.
size_t Parser::s_debug_indent_level = 0;
|
|
|
|
// Attaches `node` to the tree being built: as a child of the currently entered
// element, or as the root when no element has been entered yet.
void Parser::append_node(NonnullOwnPtr<Node> node)
{
    if (!m_entered_node) {
        // First node seen: it becomes the root and the current element.
        m_root_node = move(node);
        m_entered_node = m_root_node.ptr();
        return;
    }

    auto& parent_element = m_entered_node->content.get<Node::Element>();
    parent_element.children.append(move(node));
}
|
|
|
|
// Records character data. With a listener, the text is forwarded and no tree is touched.
// Otherwise it is merged into the trailing text child of the entered element,
// or a new text node is created.
void Parser::append_text(String text)
{
    if (m_listener) {
        m_listener->text(text);
        return;
    }

    if (!m_entered_node) {
        // No element entered: the text node itself becomes the root.
        Node::Text root_text;
        root_text.builder.append(text);
        m_root_node = make<Node>(move(root_text));
        return;
    }

    m_entered_node->content.visit(
        [&](Node::Element& element) {
            // Extend the last child if it is already a text node.
            if (!element.children.is_empty()) {
                if (auto* trailing_text = element.children.last().content.get_pointer<Node::Text>()) {
                    trailing_text->builder.append(text);
                    return;
                }
            }

            Node::Text fresh_text;
            fresh_text.builder.append(text);
            element.children.append(make<Node>(move(fresh_text)));
        },
        [&](auto&) {
            // Can't enter a text or comment node.
            VERIFY_NOT_REACHED();
        });
}
|
|
|
|
// Records a comment. With a listener, the comment is forwarded; otherwise it is
// appended as a child of the currently entered element.
void Parser::append_comment(String text)
{
    if (m_listener) {
        m_listener->comment(text);
        return;
    }

    // Without an entered node there is nothing to attach to (e.g. comments in
    // the prolog), so the comment is silently discarded.
    if (!m_entered_node)
        return;

    m_entered_node->content.visit(
        [&](Node::Element& element) {
            auto comment_node = make<Node>(Node::Comment { move(text) });
            element.children.append(move(comment_node));
        },
        [&](auto&) {
            // Can't enter a text or comment node.
            VERIFY_NOT_REACHED();
        });
}
|
|
|
|
// Makes `node` the current element, notifying the listener (if any) that the
// element started. Every node except the root gets the previous current node as parent.
void Parser::enter_node(Node& node)
{
    if (m_listener) {
        auto& started_element = node.content.get<Node::Element>();
        m_listener->element_start(started_element.name, started_element.attributes);
    }

    bool const is_root = (&node == m_root_node.ptr());
    if (!is_root)
        node.parent = m_entered_node;

    m_entered_node = &node;
}
|
|
|
|
void Parser::leave_node()
|
|
{
|
|
if (m_listener) {
|
|
auto& element = m_entered_node->content.get<Node::Element>();
|
|
m_listener->element_end(element.name);
|
|
}
|
|
|
|
m_entered_node = m_entered_node->parent;
|
|
}
|
|
|
|
// Parses the whole source and returns the resulting Document.
// On failure, the first entry of m_parse_errors is returned when there is one;
// otherwise the error produced by parse_internal() is returned.
ErrorOr<Document, ParseError> Parser::parse()
{
    auto outcome = parse_internal();
    if (outcome.is_error()) {
        if (!m_parse_errors.is_empty())
            return m_parse_errors.take_first();
        return outcome.release_error();
    }

    // Hand the accumulated parse state over to the document.
    return Document {
        m_root_node.release_nonnull(),
        move(m_doctype),
        move(m_processing_instructions),
        m_version,
    };
}
|
|
|
|
// Parses the source while reporting events to `listener` instead of keeping a tree.
// The listener is detached again when this function returns; any root node
// created along the way is dropped.
ErrorOr<void, ParseError> Parser::parse_with_listener(Listener& listener)
{
    m_listener = &listener;
    ScopeGuard detach_listener { [this] { m_listener = nullptr; } };

    listener.document_start();

    auto outcome = parse_internal();
    if (outcome.is_error())
        listener.error(outcome.error());

    // document_end is reported whether or not parsing succeeded.
    listener.document_end();
    m_root_node.clear();
    return outcome;
}
|
|
|
|
// 2.3.3. S, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-S
|
|
// Consumes a run of XML whitespace. Fails only when `required` is Required::Yes
// and no whitespace was present.
ErrorOr<void, ParseError> Parser::skip_whitespace(Required required)
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // S ::= (#x20 | #x9 | #xD | #xA)+
    auto whitespace_run = m_lexer.consume_while(is_any_of("\x20\x09\x0d\x0a"));

    bool const missing_required_whitespace = whitespace_run.is_empty() && required == Required::Yes;
    if (missing_required_whitespace)
        return parse_error(m_lexer.tell(), "Expected whitespace");

    rollback.disarm();
    return {};
}
|
|
|
|
// 2.2.a. RestrictedChar, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-RestrictedChar
|
|
// RestrictedChar ::= [#x1-#x8] | [#xB-#xC] | [#xE-#x1F] | [#x7F-#x84] | [#x86-#x9F]
constexpr static auto s_restricted_characters = ranges_for_search<
    Range(0x1, 0x8),
    Range(0xb, 0xc),
    Range(0xe, 0x1f),
    Range(0x7f, 0x84),
    Range(0x86, 0x9f)> {};
|
|
|
|
// 2.1.1. Document, https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-well-formed
|
|
// Parses a complete document: prolog, root element, and trailing Misc items.
// Afterwards the consumed source is scanned for restricted characters, and
// any unconsumed input is reported as an error.
ErrorOr<void, ParseError> Parser::parse_internal()
{
    auto rule = enter_rule();

    // document ::= ( prolog element Misc* ) - ( Char* RestrictedChar Char* )
    TRY(parse_prolog());
    TRY(parse_element());

    // Misc*: keep going until one attempt fails.
    while (!parse_misc().is_error()) {
    }

    auto consumed_source = m_source.substring_view(0, m_lexer.tell());
    auto restricted = find_if(consumed_source.begin(), consumed_source.end(), s_restricted_characters);
    if (!restricted.is_end()) {
        return parse_error(
            restricted.index(),
            String::formatted("Invalid character #{:x} used in document", *restricted));
    }

    if (m_lexer.is_eof())
        return {};

    return parse_error(m_lexer.tell(), "Garbage after document");
}
|
|
|
|
// Consumes the literal `expected` from the input.
// A mismatch is only reported as an error when m_options.treat_errors_as_fatal
// is set; otherwise the mismatch is tolerated and success is returned.
ErrorOr<void, ParseError> Parser::expect(StringView expected)
{
    auto rollback = rollback_point();

    bool const matched = m_lexer.consume_specific(expected);
    if (!matched && m_options.treat_errors_as_fatal)
        return parse_error(m_lexer.tell(), String::formatted("Expected '{}'", expected));

    rollback.disarm();
    return {};
}
|
|
|
|
template<typename Pred>
|
|
requires(IsCallableWithArguments<Pred, char>) ErrorOr<StringView, ParseError> Parser::expect(Pred predicate, StringView description)
|
|
{
|
|
auto rollback = rollback_point();
|
|
auto start = m_lexer.tell();
|
|
if (!m_lexer.next_is(predicate)) {
|
|
if (m_options.treat_errors_as_fatal)
|
|
return parse_error(m_lexer.tell(), String::formatted("Expected {}", description));
|
|
}
|
|
|
|
m_lexer.ignore();
|
|
rollback.disarm();
|
|
return m_source.substring_view(start, m_lexer.tell() - start);
|
|
}
|
|
|
|
template<typename Pred>
|
|
requires(IsCallableWithArguments<Pred, char>) ErrorOr<StringView, ParseError> Parser::expect_many(Pred predicate, StringView description)
|
|
{
|
|
auto rollback = rollback_point();
|
|
auto start = m_lexer.tell();
|
|
while (m_lexer.next_is(predicate)) {
|
|
if (m_lexer.is_eof())
|
|
break;
|
|
m_lexer.ignore();
|
|
}
|
|
|
|
if (m_lexer.tell() == start) {
|
|
if (m_options.treat_errors_as_fatal) {
|
|
return parse_error(m_lexer.tell(), String::formatted("Expected {}", description));
|
|
}
|
|
}
|
|
|
|
rollback.disarm();
|
|
return m_source.substring_view(start, m_lexer.tell() - start);
|
|
}
|
|
|
|
// 2.8.22. Prolog, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-prolog
|
|
// Parses the document prolog. A missing or invalid XML declaration is not an
// error: the parser falls back to XML 1.0 and enables compatibility mode.
ErrorOr<void, ParseError> Parser::parse_prolog()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // prolog ::= XMLDecl Misc* (doctypedecl Misc*)?
    // The following is valid in XML 1.0.
    // prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
    if (parse_xml_decl().is_error()) {
        m_version = Version::Version10;
        m_in_compatibility_mode = true;
    }
    auto accept = accept_rule();

    // Misc*: consume items until one attempt fails.
    auto consume_misc_items = [this] {
        while (!parse_misc().is_error()) {
        }
    };

    consume_misc_items();

    // The doctype declaration is optional; trailing Misc only follows a successful one.
    if (!parse_doctype_decl().is_error())
        consume_misc_items();

    rollback.disarm();
    return {};
}
|
|
|
|
// 2.8.23. XMLDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-XMLDecl
|
|
// Parses the XML declaration at the very start of a document.
ErrorOr<void, ParseError> Parser::parse_xml_decl()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // XMLDecl::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
    TRY(expect("<?xml"));
    auto accept = accept_rule();

    // The version is mandatory.
    TRY(parse_version_info());

    // Encoding and standalone declarations are optional, so a failed attempt
    // is deliberately ignored.
    (void)parse_encoding_decl();
    (void)parse_standalone_document_decl();

    TRY(skip_whitespace());
    TRY(expect("?>"));

    rollback.disarm();
    return {};
}
|
|
|
|
// 2.8.24. VersionInfo, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-VersionInfo
|
|
// Parses the version attribute of the XML declaration and records it in m_version.
// "1.0" selects Version10 and turns on compatibility mode; anything else is
// treated as 1.1, and is rejected when it is not literally "1.1" and
// m_options.treat_errors_as_fatal is set.
ErrorOr<void, ParseError> Parser::parse_version_info()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
    TRY(skip_whitespace(Required::Yes));
    TRY(expect("version"));
    auto accept = accept_rule();

    TRY(parse_eq());
    TRY(expect(is_any_of("'\""), "one of ' or \""));
    // Step back onto the opening quote so the whole quoted string can be read at once.
    m_lexer.retreat();

    auto version_string = m_lexer.consume_quoted_string();
    if (version_string == "1.0") {
        // FIXME: Compatibility mode, figure out which rules are different in XML 1.0.
        m_version = Version::Version10;
        m_in_compatibility_mode = true;
    } else {
        if (version_string != "1.1" && m_options.treat_errors_as_fatal)
            return parse_error(m_lexer.tell(), String::formatted("Expected '1.1', found '{}'", version_string));

        // Only assign 1.1 on this branch, so that a declared "1.0" is not overwritten.
        m_version = Version::Version11;
    }

    rollback.disarm();
    return {};
}
|
|
|
|
// 2.8.25. Eq, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Eq
|
|
// Parses an equals sign, allowing whitespace on either side of it.
ErrorOr<void, ParseError> Parser::parse_eq()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // Eq ::= S? '=' S?
    auto accept = accept_rule();

    TRY(skip_whitespace()); // leading S?
    TRY(expect("="));
    TRY(skip_whitespace()); // trailing S?

    rollback.disarm();
    return {};
}
|
|
|
|
// 4.3.3.80. EncodingDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EncodingDecl
|
|
// Parses the encoding attribute of the XML declaration and stores the quoted
// value in m_encoding.
ErrorOr<void, ParseError> Parser::parse_encoding_decl()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )
    TRY(skip_whitespace(Required::Yes));
    TRY(expect("encoding"));
    auto accept = accept_rule();

    TRY(parse_eq());
    TRY(expect(is_any_of("'\""), "one of ' or \""));
    // Step back onto the opening quote so the quoted string can be read as a whole.
    m_lexer.retreat();

    // FIXME: Actually do something with this encoding.
    m_encoding = m_lexer.consume_quoted_string();

    rollback.disarm();
    return {};
}
|
|
|
|
// 2.9.32 SDDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-rmd
|
|
// Parses the standalone attribute of the XML declaration and stores the result
// in m_standalone (true for "yes", false for "no"). Any other value is an error.
ErrorOr<void, ParseError> Parser::parse_standalone_document_decl()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
    TRY(skip_whitespace(Required::Yes));
    TRY(expect("standalone"));
    auto accept = accept_rule();

    // The grammar requires an Eq between the keyword and the quoted value.
    TRY(parse_eq());
    TRY(expect(is_any_of("'\""), "one of ' or \""));
    // Step back onto the opening quote so the quoted string can be read as a whole.
    m_lexer.retreat();

    auto value = m_lexer.consume_quoted_string();
    if (!value.is_one_of("yes", "no"))
        return parse_error(m_lexer.tell() - value.length(), "Expected one of 'yes' or 'no'");

    m_standalone = value == "yes";

    rollback.disarm();
    return {};
}
|
|
|
|
// 2.8.27. Misc, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Misc
|
|
// Parses a single Misc item: a comment, a processing instruction, or a run of
// whitespace. The alternatives are tried in that order and the first match wins.
ErrorOr<void, ParseError> Parser::parse_misc()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // Misc ::= Comment | PI | S
    bool const matched_any = !parse_comment().is_error()
        || !parse_processing_instruction().is_error()
        || !skip_whitespace(Required::Yes).is_error();

    if (!matched_any)
        return parse_error(m_lexer.tell(), "Expected a match for 'Misc', but found none");

    rollback.disarm();
    return {};
}
|
|
|
|
// 2.5.15 Comment, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Comment
|
|
// Parses a comment. The text is passed to append_comment() only when
// m_options.preserve_comments is set.
ErrorOr<void, ParseError> Parser::parse_comment()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
    TRY(expect("<!--"));
    auto accept = accept_rule();

    // Consume until two dashes in a row show up; a single dash is ordinary comment text.
    // FIXME: This should disallow surrogate blocks
    bool previous_was_dash = false;
    auto text = m_lexer.consume_while([&](auto ch) {
        bool const is_dash = (ch == '-');
        if (is_dash && previous_was_dash)
            return false;
        previous_was_dash = is_dash;
        return true;
    });

    // The last consumed dash belongs to the terminator, not to the comment text.
    if (previous_was_dash) {
        m_lexer.retreat();
        text = text.substring_view(0, text.length() - 1);
    }

    TRY(expect("-->"));

    if (m_options.preserve_comments)
        append_comment(text);

    rollback.disarm();
    return {};
}
|
|
|
|
// 2.6.16 PI, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PI
|
|
// Parses a processing instruction and records it in m_processing_instructions,
// keyed by its target. The data is empty when no whitespace follows the target.
ErrorOr<void, ParseError> Parser::parse_processing_instruction()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
    TRY(expect("<?"));
    auto accept = accept_rule();

    auto target = TRY(parse_processing_instruction_target());

    // Data is only present when the target is followed by whitespace.
    String data;
    if (!skip_whitespace(Required::Yes).is_error())
        data = m_lexer.consume_until("?>");

    TRY(expect("?>"));

    m_processing_instructions.set(target, data);

    rollback.disarm();
    return {};
}
|
|
|
|
// 2.6.17. PITarget, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
|
|
// Parses the target name of a processing instruction. The reserved name "xml"
// (in any letter case) is rejected when m_options.treat_errors_as_fatal is set.
ErrorOr<Name, ParseError> Parser::parse_processing_instruction_target()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
    auto target = TRY(parse_name());
    auto accept = accept_rule();

    bool const uses_reserved_name = target.equals_ignoring_case("xml");
    if (uses_reserved_name && m_options.treat_errors_as_fatal) {
        auto const target_offset = m_lexer.tell() - target.length();
        return parse_error(
            target_offset,
            "Use of the reserved 'xml' name for processing instruction target name is disallowed");
    }

    rollback.disarm();
    return target;
}
|
|
|
|
// NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
|
|
// Code points accepted as the first character of a Name, one Range per
// alternative of the NameStartChar production.
constexpr static auto s_name_start_characters = ranges_for_search<
    Range(':', ':'),
    Range('A', 'Z'),
    Range('_', '_'),
    Range('a', 'z'),
    Range(0xc0, 0xd6),
    Range(0xd8, 0xf6),
    Range(0xf8, 0x2ff),
    Range(0x370, 0x37d),
    Range(0x37f, 0x1fff),
    Range(0x200c, 0x200d),
    Range(0x2070, 0x218f),
    Range(0x2c00, 0x2fef),
    Range(0x3001, 0xd7ff),
    Range(0xf900, 0xfdcf),
    Range(0xfdf0, 0xfffd),
    Range(0x10000, 0xeffff)> {};
|
|
|
|
// NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
|
|
// Code points accepted after the first character of a Name: everything in
// s_name_start_characters plus the extra NameChar alternatives.
constexpr static auto s_name_characters = s_name_start_characters.with<
    Range('-', '-'),
    Range('.', '.'),
    Range('0', '9'),
    Range(0xb7, 0xb7),
    Range(0x0300, 0x036f),
    Range(0x203f, 0x2040)>();
|
|
|
|
// 2.3.5. Name, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Name
|
|
// Parses a Name: one name-start character followed by any number of name characters.
ErrorOr<Name, ParseError> Parser::parse_name()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // Name ::= NameStartChar (NameChar)*
    auto first_character = TRY(expect(s_name_start_characters, "a NameStartChar"));
    auto accept = accept_rule();

    auto remaining_characters = m_lexer.consume_while(s_name_characters);

    // Join both pieces into a single owned string.
    StringBuilder name_builder;
    name_builder.append(first_character);
    name_builder.append(remaining_characters);

    rollback.disarm();
    return name_builder.to_string();
}
|
|
|
|
// 2.8.28. doctypedecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-doctypedecl
|
|
// Parses a document type declaration and stores the result in m_doctype.
// When an external ID is present and m_options.resolve_external_resource is
// set, the external subset is resolved and parsed as well. Declarations from the
// external subset are collected before those from the internal subset.
ErrorOr<void, ParseError> Parser::parse_doctype_decl()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();
    Doctype doctype;

    // doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
    TRY(expect("<!DOCTYPE"));
    auto accept = accept_rule();

    TRY(skip_whitespace(Required::Yes));
    doctype.type = TRY(parse_name());

    // (S ExternalID)?
    if (!skip_whitespace(Required::Yes).is_error()) {
        auto const external_id_offset = m_lexer.tell();
        auto external_id = parse_external_id();
        if (!external_id.is_error()) {
            doctype.external_id = external_id.release_value();

            if (m_options.resolve_external_resource) {
                auto resolved = m_options.resolve_external_resource(doctype.external_id->system_id, doctype.external_id->public_id);
                if (resolved.is_error()) {
                    return parse_error(
                        external_id_offset,
                        String::formatted("Failed to resolve external subset '{}': {}", doctype.external_id->system_id.system_literal, resolved.error()));
                }

                // Swap source and lexer to the external resource for the rest of
                // this scope, so the external subset is parsed from it.
                StringView external_source = resolved.value();
                TemporaryChange source { m_source, external_source };
                TemporaryChange lexer { m_lexer, GenericLexer(m_source) };

                auto external_declarations = TRY(parse_external_subset());
                if (!m_lexer.is_eof()) {
                    return parse_error(
                        m_lexer.tell(),
                        String::formatted("Failed to resolve external subset '{}': garbage after declarations", doctype.external_id->system_id.system_literal));
                }
                doctype.markup_declarations.extend(move(external_declarations));
            }
        }
    }

    TRY(skip_whitespace(Required::No));

    // ('[' intSubset ']' S?)?
    if (m_lexer.consume_specific('[')) {
        auto internal_declarations = TRY(parse_internal_subset());
        TRY(expect("]"));
        TRY(skip_whitespace());
        doctype.markup_declarations.extend(internal_declarations);
    }

    TRY(expect(">"));

    rollback.disarm();
    m_doctype = move(doctype);
    return {};
}
|
|
|
|
// 3.39. element, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-element
|
|
// Parses one element, either an empty-element tag or a start tag, content and
// end tag, and attaches it to the tree via append_node().
ErrorOr<void, ParseError> Parser::parse_element()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // element ::= EmptyElemTag
    //           | STag content ETag
    if (auto empty_element = parse_empty_element_tag(); !empty_element.is_error()) {
        append_node(empty_element.release_value());
        rollback.disarm();
        return {};
    }

    auto start_tag = TRY(parse_start_tag());

    // Keep references to the node before ownership moves into the tree.
    auto& element_node = *start_tag;
    auto& opened_element = element_node.content.get<Node::Element>();
    append_node(move(start_tag));
    enter_node(element_node);

    // Whatever happens below, the element is left again when this function exits.
    ScopeGuard leave_on_exit { [this] { leave_node(); } };

    TRY(parse_content());

    auto const end_tag_offset = m_lexer.tell();
    auto closing_name = TRY(parse_end_tag());

    // Well-formedness constraint: The Name in an element's end-tag MUST match the element type in the start-tag.
    bool const names_match = !(closing_name != opened_element.name);
    if (m_options.treat_errors_as_fatal && !names_match)
        return parse_error(end_tag_offset, "Invalid closing tag");

    rollback.disarm();
    return {};
}
|
|
|
|
// 3.1.44. EmptyElemTag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EmptyElemTag
|
|
// Parses an empty-element tag and returns a new element node with the parsed
// name and attributes, and no children.
ErrorOr<NonnullOwnPtr<Node>, ParseError> Parser::parse_empty_element_tag()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
    TRY(expect("<"));
    auto accept = accept_rule();

    auto name = TRY(parse_name());
    HashMap<Name, String> attributes;

    // (S Attribute)*: stop at the first missing whitespace or unparsable attribute.
    while (!skip_whitespace(Required::Yes).is_error()) {
        auto parsed_attribute = parse_attribute();
        if (parsed_attribute.is_error())
            break;

        auto attribute = parsed_attribute.release_value();
        attributes.set(move(attribute.name), move(attribute.value));
    }

    TRY(skip_whitespace());
    TRY(expect("/>"));

    rollback.disarm();
    return make<Node>(Node::Element { move(name), move(attributes), {} });
}
|
|
|
|
// 3.1.41. Attribute, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Attribute
|
|
// Parses a single `name = value` attribute.
ErrorOr<Attribute, ParseError> Parser::parse_attribute()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // Attribute ::= Name Eq AttValue
    auto attribute_name = TRY(parse_name());
    auto accept = accept_rule();

    TRY(parse_eq());
    auto attribute_value = TRY(parse_attribute_value());

    rollback.disarm();
    return Attribute {
        move(attribute_name),
        move(attribute_value),
    };
}
|
|
|
|
// 2.3.10. AttValue, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttValue
|
|
// Parses a quoted attribute value and returns its contents without the quotes.
// The closing quote must match the opening one.
ErrorOr<String, ParseError> Parser::parse_attribute_value()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // AttValue ::= '"' ([^<&"] | Reference)* '"'
    //            | "'" ([^<&'] | Reference)* "'"
    auto opening_quote = TRY(expect(is_any_of("'\""), "one of ' or \""));
    auto accept = accept_rule();

    // The opening quote doubles as the terminator for the inner value.
    auto value = TRY(parse_attribute_value_inner(opening_quote));
    TRY(expect(opening_quote));

    rollback.disarm();
    return value;
}
|
|
|
|
// Reads attribute value text up to (but not including) any character in
// `disallow`, or the end of input. References are expanded along the way;
// an unescaped '<' is an error.
ErrorOr<String, ParseError> Parser::parse_attribute_value_inner(StringView disallow)
{
    StringBuilder value_builder;

    while (!m_lexer.next_is(is_any_of(disallow)) && !m_lexer.is_eof()) {
        if (m_lexer.next_is('<')) {
            // Not allowed, return a nice error to make it easier to debug.
            return parse_error(m_lexer.tell(), "Unescaped '<' not allowed in attribute values");
        }

        if (!m_lexer.next_is('&')) {
            // Plain character: copy it through.
            value_builder.append(m_lexer.consume());
            continue;
        }

        // Character references carry their text directly; entity references are resolved.
        auto reference = TRY(parse_reference());
        if (auto* char_reference = reference.get_pointer<String>())
            value_builder.append(*char_reference);
        else
            value_builder.append(TRY(resolve_reference(reference.get<EntityReference>(), ReferencePlacement::AttributeValue)));
    }

    return value_builder.to_string();
}
|
|
|
|
// Char ::= [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
|
|
// Code points accepted by the Char production, one Range per alternative.
constexpr static auto s_characters = ranges_for_search<
    Range(0x1, 0xd7ff),
    Range(0xe000, 0xfffd),
    Range(0x10000, 0x10ffff)> {};
|
|
|
|
// 4.1.67. Reference, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Reference
|
|
// Parses a reference. The result is either an EntityReference holding the
// entity name (for `&name;`), or a String holding the referenced character
// (for `&#...;` / `&#x...;`).
// A character reference is rejected when its number cannot be converted to a
// u32 or is not in s_characters.
ErrorOr<Variant<Parser::EntityReference, String>, ParseError> Parser::parse_reference()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();
    // Reference ::= EntityRef | CharRef

    // 4.1.68. EntityRef
    // EntityRef ::= '&' Name ';'

    // 4.1.66. CharRef
    // CharRef ::= '&#' [0-9]+ ';'
    //           | '&#x' [0-9a-fA-F]+ ';'

    auto reference_start = m_lexer.tell();
    TRY(expect("&"));
    auto accept = accept_rule();

    auto name_result = parse_name();
    if (name_result.is_error()) {
        // Not a name, so this has to be a character reference.
        TRY(expect("#"));

        Optional<u32> code_point;
        if (m_lexer.consume_specific('x')) {
            auto hex = TRY(expect_many(
                ranges_for_search<Range('0', '9'), Range('a', 'f'), Range('A', 'F')>(),
                "any of [0-9a-fA-F]"));
            code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(hex);
        } else {
            auto decimal = TRY(expect_many(
                ranges_for_search<Range('0', '9')>(),
                "any of [0-9]"));
            code_point = decimal.to_uint<u32>();
        }

        // The conversion yields no value for digit runs that do not fit in a u32
        // (and for an empty run, which expect_many can return in lenient mode).
        // Treat that as an invalid reference rather than dereferencing an empty Optional.
        if (!code_point.has_value() || !s_characters.contains(code_point.value()))
            return parse_error(reference_start, "Invalid character reference");

        TRY(expect(";"));

        StringBuilder builder;
        builder.append_code_point(code_point.value());

        rollback.disarm();
        return builder.to_string();
    }

    auto name = name_result.release_value();
    TRY(expect(";"));

    rollback.disarm();
    return EntityReference { move(name) };
}
|
|
|
|
// 3.1.40 STag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-STag
|
|
// Parses a start tag and returns a new element node with the parsed name and
// attributes, and no children yet.
ErrorOr<NonnullOwnPtr<Node>, ParseError> Parser::parse_start_tag()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // STag ::= '<' Name (S Attribute)* S? '>'
    TRY(expect("<"));
    auto accept = accept_rule();

    auto name = TRY(parse_name());
    HashMap<Name, String> attributes;

    // (S Attribute)*: stop at the first missing whitespace or unparsable attribute.
    while (!skip_whitespace(Required::Yes).is_error()) {
        auto parsed_attribute = parse_attribute();
        if (parsed_attribute.is_error())
            break;

        auto attribute = parsed_attribute.release_value();
        attributes.set(move(attribute.name), move(attribute.value));
    }

    TRY(skip_whitespace());
    TRY(expect(">"));

    rollback.disarm();
    return make<Node>(Node::Element { move(name), move(attributes), {} });
}
|
|
|
|
// 3.1.42 ETag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-ETag
|
|
// Parses an end tag and returns the element name it closes. Matching the name
// against the start tag is left to the caller.
ErrorOr<Name, ParseError> Parser::parse_end_tag()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // ETag ::= '</' Name S? '>'
    TRY(expect("</"));
    auto accept = accept_rule();

    auto closed_name = TRY(parse_name());
    TRY(skip_whitespace());
    TRY(expect(">"));

    rollback.disarm();
    return closed_name;
}
|
|
|
|
// 3.1.43 content, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-content
|
|
// Parses element content: character data interleaved with child elements,
// references, CDATA sections, processing instructions and comments.
// Parsing stops at the first position where none of these items match.
ErrorOr<void, ParseError> Parser::parse_content()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*

    // CharData?: append whatever character data is available, if any.
    auto append_char_data_if_any = [&] {
        if (auto char_data = parse_char_data(); !char_data.is_error())
            append_text(char_data.release_value());
    };

    // Tries each non-CharData alternative in order. Yields true when one matched,
    // false when none did, and an error only when resolving an entity reference fails.
    auto parse_one_item = [&]() -> ErrorOr<bool, ParseError> {
        if (!parse_element().is_error())
            return true;

        if (auto parsed_reference = parse_reference(); !parsed_reference.is_error()) {
            auto reference = parsed_reference.release_value();
            if (auto char_reference = reference.get_pointer<String>())
                append_text(*char_reference);
            else
                TRY(resolve_reference(reference.get<EntityReference>(), ReferencePlacement::Content));
            return true;
        }

        if (auto cdata = parse_cdata_section(); !cdata.is_error()) {
            if (m_options.preserve_cdata)
                append_text(cdata.release_value());
            return true;
        }

        if (!parse_processing_instruction().is_error())
            return true;

        return !parse_comment().is_error();
    };

    append_char_data_if_any();
    while (TRY(parse_one_item()))
        append_char_data_if_any();

    rollback.disarm();
    return {};
}
|
|
|
|
// 2.4.14 CharData, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-CharData
|
|
// Consumes character data up to the next '<' or '&', or up to a "]]>" sequence,
// and returns a view of it. When "]]>" is encountered, the returned text ends
// just before it and the lexer is left positioned at its first ']'.
ErrorOr<StringView, ParseError> Parser::parse_char_data()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
    // 0-2: number of consecutive ']' just seen (capped at 2), 3: "]]>" found.
    auto cend_state = 0;
    auto text = m_lexer.consume_while([&](auto ch) {
        if (ch == '<' || ch == '&')
            return false;

        if (ch == ']') {
            // A third ']' still leaves two trailing ']' behind, so "]]]>" is caught too.
            if (cend_state < 2)
                cend_state++;
            return true;
        }

        if (ch == '>' && cend_state == 2) {
            cend_state = 3;
            return false;
        }

        cend_state = 0;
        return true;
    });

    if (cend_state == 3) {
        // The '>' was rejected by the predicate and therefore never consumed;
        // only the two ']' in front of it have to be handed back.
        m_lexer.retreat(2);
        text = text.substring_view(0, text.length() - 2);
    }

    rollback.disarm();
    return text;
}
|
|
|
|
// 2.8.28b intSubset, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-intSubset
|
|
// Parses the internal DTD subset and returns the markup declarations it
// contains, in order. Parsing stops at the first position that is neither
// a markup declaration nor a declaration separator.
ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_internal_subset()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();
    Vector<MarkupDeclaration> collected_declarations;

    // intSubset ::= (markupdecl | DeclSep)*
    for (;;) {
        if (auto markup = parse_markup_declaration(); !markup.is_error()) {
            // PIs and comments parse successfully but yield no declaration.
            if (auto declaration = markup.release_value(); declaration.has_value())
                collected_declarations.append(declaration.release_value());
            continue;
        }

        auto separator = parse_declaration_separator();
        if (separator.is_error())
            break;

        // The markup declarations may be made up in whole or in part of the replacement text of parameter entities.
        // The replacement text of a parameter entity reference in a DeclSep MUST match the production extSubsetDecl.
        auto replacement_text = separator.release_value();
        if (replacement_text.has_value()) {
            // Parse the replacement text as its own source for the rest of this scope.
            TemporaryChange<StringView> source { m_source, replacement_text.value() };
            TemporaryChange lexer { m_lexer, GenericLexer { m_source } };

            auto nested_declarations = TRY(parse_external_subset_declaration());
            collected_declarations.extend(move(nested_declarations));
        }
    }

    rollback.disarm();
    return collected_declarations;
}
|
|
|
|
// 2.8.29 markupdecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-markupdecl
|
|
// Parses one markup declaration. Element, attribute-list, entity and notation
// declarations are returned; processing instructions and comments are accepted
// but yield an empty Optional. The alternatives are tried in grammar order.
ErrorOr<Optional<MarkupDeclaration>, ParseError> Parser::parse_markup_declaration()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment
    if (auto element = parse_element_declaration(); !element.is_error()) {
        rollback.disarm();
        return MarkupDeclaration { element.release_value() };
    }

    if (auto attribute_list = parse_attribute_list_declaration(); !attribute_list.is_error()) {
        rollback.disarm();
        return MarkupDeclaration { attribute_list.release_value() };
    }

    if (auto entity = parse_entity_declaration(); !entity.is_error()) {
        rollback.disarm();
        return MarkupDeclaration { entity.release_value() };
    }

    if (auto notation = parse_notation_declaration(); !notation.is_error()) {
        rollback.disarm();
        return MarkupDeclaration { notation.release_value() };
    }

    // PI and Comment match without producing a declaration.
    if (!parse_processing_instruction().is_error() || !parse_comment().is_error()) {
        rollback.disarm();
        return Optional<MarkupDeclaration> {};
    }

    return parse_error(m_lexer.tell(), "Expected one of elementdecl, attlistdecl, entitydecl, notationdecl, PI or comment");
}
|
|
|
|
// 2.8.28a DeclSep, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-DeclSep
|
|
// Parses a declaration separator. A parameter entity reference yields its
// replacement text (currently always an empty string, see the FIXME);
// whitespace yields an empty Optional.
ErrorOr<Optional<String>, ParseError> Parser::parse_declaration_separator()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // DeclSep ::= PEReference | S
    if (!parse_parameter_entity_reference().is_error()) {
        rollback.disarm();
        // FIXME: Resolve this PEReference.
        return "";
    }

    if (!skip_whitespace(Required::Yes).is_error()) {
        rollback.disarm();
        return Optional<String> {};
    }

    return parse_error(m_lexer.tell(), "Expected either whitespace, or a PEReference");
}
|
|
|
|
// 4.1.69 PEReference, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PEReference
|
|
// Parses a parameter entity reference (`%name;`) and returns the entity name.
ErrorOr<Name, ParseError> Parser::parse_parameter_entity_reference()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // PEReference ::= '%' Name ';'
    TRY(expect("%"));
    auto accept = accept_rule();

    auto entity_name = TRY(parse_name());
    TRY(expect(";"));

    rollback.disarm();
    return entity_name;
}
|
|
|
|
// 3.2.46 elementdecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-elementdecl
|
|
ErrorOr<ElementDeclaration, ParseError> Parser::parse_element_declaration()
|
|
{
|
|
auto rollback = rollback_point();
|
|
auto rule = enter_rule();
|
|
|
|
// FIXME: Apparently both name _and_ contentspec here are allowed to be PEReferences,
|
|
// but the grammar does not allow that, figure this out.
|
|
// elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'
|
|
TRY(expect("<!ELEMENT"));
|
|
auto accept = accept_rule();
|
|
|
|
TRY(skip_whitespace(Required::Yes));
|
|
auto name = TRY(parse_name());
|
|
TRY(skip_whitespace(Required::Yes));
|
|
auto spec = TRY(parse_content_spec());
|
|
TRY(expect(">"));
|
|
|
|
rollback.disarm();
|
|
return ElementDeclaration {
|
|
move(name),
|
|
move(spec),
|
|
};
|
|
}
|
|
|
|
// 3.3.52 AttlistDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttlistDecl
|
|
ErrorOr<AttributeListDeclaration, ParseError> Parser::parse_attribute_list_declaration()
|
|
{
|
|
auto rollback = rollback_point();
|
|
auto rule = enter_rule();
|
|
AttributeListDeclaration declaration;
|
|
|
|
// AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
|
|
TRY(expect("<!ATTLIST"));
|
|
auto accept = accept_rule();
|
|
|
|
TRY(skip_whitespace(Required::Yes));
|
|
declaration.type = TRY(parse_name());
|
|
|
|
while (true) {
|
|
if (auto result = parse_attribute_definition(); !result.is_error())
|
|
declaration.attributes.append(result.release_value());
|
|
else
|
|
break;
|
|
}
|
|
|
|
TRY(skip_whitespace());
|
|
TRY(expect(">"));
|
|
|
|
rollback.disarm();
|
|
return declaration;
|
|
}
|
|
|
|
// 3.3.53 AttDef, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttDef
|
|
ErrorOr<AttributeListDeclaration::Definition, ParseError> Parser::parse_attribute_definition()
|
|
{
|
|
auto rollback = rollback_point();
|
|
auto rule = enter_rule();
|
|
Optional<AttributeListDeclaration::Type> type;
|
|
Optional<AttributeListDeclaration::Default> default_;
|
|
|
|
// AttDef ::= S Name S AttType S DefaultDecl
|
|
TRY(skip_whitespace(Required::Yes));
|
|
auto name = TRY(parse_name());
|
|
auto accept = accept_rule();
|
|
|
|
TRY(skip_whitespace(Required::Yes));
|
|
|
|
// AttType ::= StringType | TokenizedType | EnumeratedType
|
|
// StringType ::= 'CDATA'
|
|
// TokenizedType ::= 'ID'
|
|
// | 'IDREF'
|
|
// | 'IDREFS'
|
|
// | 'ENTITY'
|
|
// | 'ENTITIES'
|
|
// | 'NMTOKEN'
|
|
// | 'NMTOKENS'
|
|
// EnumeratedType ::= NotationType | Enumeration
|
|
// NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
|
|
// Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
|
|
if (m_lexer.consume_specific("CDATA")) {
|
|
type = AttributeListDeclaration::StringType::CData;
|
|
} else if (m_lexer.consume_specific("IDREFS")) {
|
|
type = AttributeListDeclaration::TokenizedType::IDRefs;
|
|
} else if (m_lexer.consume_specific("IDREF")) {
|
|
type = AttributeListDeclaration::TokenizedType::IDRef;
|
|
} else if (m_lexer.consume_specific("ID")) {
|
|
type = AttributeListDeclaration::TokenizedType::ID;
|
|
} else if (m_lexer.consume_specific("ENTITIES")) {
|
|
type = AttributeListDeclaration::TokenizedType::Entities;
|
|
} else if (m_lexer.consume_specific("ENTITY")) {
|
|
type = AttributeListDeclaration::TokenizedType::Entity;
|
|
} else if (m_lexer.consume_specific("NMTOKENS")) {
|
|
type = AttributeListDeclaration::TokenizedType::NMTokens;
|
|
} else if (m_lexer.consume_specific("NMTOKEN")) {
|
|
type = AttributeListDeclaration::TokenizedType::NMToken;
|
|
} else if (m_lexer.consume_specific("NOTATION")) {
|
|
HashTable<Name> names;
|
|
TRY(skip_whitespace(Required::Yes));
|
|
TRY(expect("("));
|
|
TRY(skip_whitespace());
|
|
names.set(TRY(parse_name()));
|
|
while (true) {
|
|
TRY(skip_whitespace());
|
|
if (auto result = expect("|"); result.is_error())
|
|
break;
|
|
TRY(skip_whitespace());
|
|
names.set(TRY(parse_name()));
|
|
}
|
|
TRY(skip_whitespace());
|
|
TRY(expect(")"));
|
|
type = AttributeListDeclaration::NotationType { move(names) };
|
|
} else {
|
|
HashTable<String> names;
|
|
TRY(expect("("));
|
|
TRY(skip_whitespace());
|
|
names.set(TRY(parse_nm_token()));
|
|
while (true) {
|
|
TRY(skip_whitespace());
|
|
if (auto result = expect("|"); result.is_error())
|
|
break;
|
|
TRY(skip_whitespace());
|
|
names.set(TRY(parse_nm_token()));
|
|
}
|
|
TRY(skip_whitespace());
|
|
TRY(expect(")"));
|
|
type = AttributeListDeclaration::Enumeration { move(names) };
|
|
}
|
|
|
|
TRY(skip_whitespace(Required::Yes));
|
|
|
|
// DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
|
|
// | (('#FIXED' S)? AttValue)
|
|
if (m_lexer.consume_specific("#REQUIRED")) {
|
|
default_ = AttributeListDeclaration::Required {};
|
|
} else if (m_lexer.consume_specific("#IMPLIED")) {
|
|
default_ = AttributeListDeclaration::Implied {};
|
|
} else {
|
|
bool fixed = false;
|
|
if (m_lexer.consume_specific("#FIXED")) {
|
|
TRY(skip_whitespace(Required::Yes));
|
|
fixed = true;
|
|
}
|
|
auto value = TRY(parse_attribute_value());
|
|
if (fixed)
|
|
default_ = AttributeListDeclaration::Fixed { move(value) };
|
|
else
|
|
default_ = AttributeListDeclaration::DefaultValue { move(value) };
|
|
}
|
|
|
|
rollback.disarm();
|
|
return AttributeListDeclaration::Definition {
|
|
move(name),
|
|
type.release_value(),
|
|
default_.release_value(),
|
|
};
|
|
}
|
|
|
|
// 2.3.7 Nmtoken, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Nmtoken
|
|
ErrorOr<StringView, ParseError> Parser::parse_nm_token()
|
|
{
|
|
auto rollback = rollback_point();
|
|
auto rule = enter_rule();
|
|
|
|
// Nmtoken ::= (NameChar)+
|
|
auto token = TRY(expect_many(s_name_characters, "a NameChar"));
|
|
|
|
rollback.disarm();
|
|
return token;
|
|
}
|
|
|
|
// 4.7.82 NotationDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#Notations
|
|
ErrorOr<NotationDeclaration, ParseError> Parser::parse_notation_declaration()
|
|
{
|
|
auto rollback = rollback_point();
|
|
auto rule = enter_rule();
|
|
Variant<ExternalID, PublicID, Empty> notation;
|
|
|
|
// NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>'
|
|
TRY(expect("<!NOTATION"));
|
|
auto accept = accept_rule();
|
|
|
|
TRY(skip_whitespace(Required::Yes));
|
|
auto name = TRY(parse_name());
|
|
TRY(skip_whitespace(Required::Yes));
|
|
|
|
if (auto result = parse_external_id(); !result.is_error())
|
|
notation = result.release_value();
|
|
else
|
|
notation = TRY(parse_public_id());
|
|
|
|
TRY(expect(">"));
|
|
|
|
rollback.disarm();
|
|
return NotationDeclaration {
|
|
move(name),
|
|
move(notation).downcast<ExternalID, PublicID>(),
|
|
};
|
|
}
|
|
|
|
// 3.2.46 contentspec, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-contentspec
|
|
ErrorOr<ElementDeclaration::ContentSpec, ParseError> Parser::parse_content_spec()
|
|
{
|
|
auto rollback = rollback_point();
|
|
auto rule = enter_rule();
|
|
Optional<ElementDeclaration::ContentSpec> content_spec;
|
|
|
|
// contentspec ::= 'EMPTY' | 'ANY' | Mixed | children
|
|
if (m_lexer.consume_specific("EMPTY")) {
|
|
content_spec = ElementDeclaration::Empty {};
|
|
} else if (m_lexer.consume_specific("ANY")) {
|
|
content_spec = ElementDeclaration::Any {};
|
|
} else {
|
|
TRY(expect("("));
|
|
TRY(skip_whitespace());
|
|
if (m_lexer.consume_specific("#PCDATA")) {
|
|
HashTable<Name> names;
|
|
// Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*'
|
|
// | '(' S? '#PCDATA' S? ')'
|
|
TRY(skip_whitespace());
|
|
if (m_lexer.consume_specific(")*")) {
|
|
content_spec = ElementDeclaration::Mixed { .types = {}, .many = true };
|
|
} else if (m_lexer.consume_specific(')')) {
|
|
content_spec = ElementDeclaration::Mixed { .types = {}, .many = false };
|
|
} else {
|
|
while (true) {
|
|
TRY(skip_whitespace());
|
|
if (!m_lexer.consume_specific('|'))
|
|
break;
|
|
TRY(skip_whitespace());
|
|
if (auto result = parse_name(); !result.is_error())
|
|
names.set(result.release_value());
|
|
else
|
|
return parse_error(m_lexer.tell(), "Expected a Name");
|
|
}
|
|
TRY(skip_whitespace());
|
|
TRY(expect(")*"));
|
|
content_spec = ElementDeclaration::Mixed { .types = move(names), .many = true };
|
|
}
|
|
} else {
|
|
while (!m_lexer.next_is('('))
|
|
m_lexer.retreat();
|
|
// children ::= (choice | seq) ('?' | '*' | '+')?
|
|
// cp ::= (Name | choice | seq) ('?' | '*' | '+')?
|
|
// choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')'
|
|
// seq ::= '(' S? cp ( S? ',' S? cp )* S? ')'
|
|
Function<ErrorOr<ElementDeclaration::Children::Choice, ParseError>()> parse_choice;
|
|
Function<ErrorOr<ElementDeclaration::Children::Sequence, ParseError>()> parse_sequence;
|
|
|
|
auto parse_cp_init = [&]() -> ErrorOr<Variant<Name, ElementDeclaration::Children::Choice, ElementDeclaration::Children::Sequence>, ParseError> {
|
|
if (auto result = parse_name(); !result.is_error())
|
|
return result.release_value();
|
|
if (auto result = parse_choice(); !result.is_error())
|
|
return result.release_value();
|
|
return TRY(parse_sequence());
|
|
};
|
|
auto parse_qualifier = [&]() -> ElementDeclaration::Children::Qualifier {
|
|
ElementDeclaration::Children::Qualifier qualifier { ElementDeclaration::Children::Qualifier::ExactlyOnce };
|
|
if (m_lexer.consume_specific('?'))
|
|
qualifier = ElementDeclaration::Children::Qualifier::Optional;
|
|
else if (m_lexer.consume_specific('*'))
|
|
qualifier = ElementDeclaration::Children::Qualifier::Any;
|
|
else if (m_lexer.consume_specific('+'))
|
|
qualifier = ElementDeclaration::Children::Qualifier::OneOrMore;
|
|
return qualifier;
|
|
};
|
|
auto parse_cp = [&]() -> ErrorOr<ElementDeclaration::Children::Entry, ParseError> {
|
|
auto sub_entry = TRY(parse_cp_init());
|
|
auto qualifier = parse_qualifier();
|
|
return ElementDeclaration::Children::Entry {
|
|
move(sub_entry),
|
|
qualifier,
|
|
};
|
|
};
|
|
parse_choice = [&]() -> ErrorOr<ElementDeclaration::Children::Choice, ParseError> {
|
|
auto rollback = rollback_point();
|
|
auto rule = enter_rule();
|
|
|
|
TRY(expect("("));
|
|
auto accept = accept_rule();
|
|
|
|
TRY(skip_whitespace());
|
|
Vector<ElementDeclaration::Children::Entry> choices;
|
|
choices.append(TRY(parse_cp()));
|
|
while (true) {
|
|
TRY(skip_whitespace());
|
|
if (!m_lexer.consume_specific('|'))
|
|
break;
|
|
TRY(skip_whitespace());
|
|
choices.append(TRY(parse_cp()));
|
|
}
|
|
|
|
TRY(expect(")"));
|
|
|
|
if (choices.size() < 2)
|
|
return parse_error(m_lexer.tell(), "Expected more than one choice");
|
|
|
|
TRY(skip_whitespace());
|
|
auto qualifier = parse_qualifier();
|
|
|
|
rollback.disarm();
|
|
return ElementDeclaration::Children::Choice {
|
|
move(choices),
|
|
qualifier,
|
|
};
|
|
};
|
|
parse_sequence = [&]() -> ErrorOr<ElementDeclaration::Children::Sequence, ParseError> {
|
|
auto rollback = rollback_point();
|
|
auto rule = enter_rule();
|
|
|
|
TRY(expect("("));
|
|
auto accept = accept_rule();
|
|
|
|
TRY(skip_whitespace());
|
|
Vector<ElementDeclaration::Children::Entry> entries;
|
|
entries.append(TRY(parse_cp()));
|
|
while (true) {
|
|
TRY(skip_whitespace());
|
|
if (!m_lexer.consume_specific(','))
|
|
break;
|
|
TRY(skip_whitespace());
|
|
entries.append(TRY(parse_cp()));
|
|
}
|
|
|
|
TRY(expect(")"));
|
|
|
|
TRY(skip_whitespace());
|
|
auto qualifier = parse_qualifier();
|
|
|
|
rollback.disarm();
|
|
return ElementDeclaration::Children::Sequence {
|
|
move(entries),
|
|
qualifier,
|
|
};
|
|
};
|
|
if (auto result = parse_choice(); !result.is_error()) {
|
|
auto qualifier = parse_qualifier();
|
|
content_spec = ElementDeclaration::Children {
|
|
result.release_value(),
|
|
qualifier,
|
|
};
|
|
} else {
|
|
auto sequence = TRY(parse_sequence());
|
|
auto qualifier = parse_qualifier();
|
|
content_spec = ElementDeclaration::Children {
|
|
move(sequence),
|
|
qualifier,
|
|
};
|
|
}
|
|
}
|
|
}
|
|
|
|
rollback.disarm();
|
|
return content_spec.release_value();
|
|
}
|
|
|
|
// 2.8.31 extSubsetDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-extSubsetDecl
|
|
ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_external_subset_declaration()
|
|
{
|
|
auto rollback = rollback_point();
|
|
auto rule = enter_rule();
|
|
Vector<MarkupDeclaration> declarations;
|
|
|
|
// extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep )*
|
|
while (true) {
|
|
if (auto result = parse_markup_declaration(); !result.is_error()) {
|
|
if (result.value().has_value())
|
|
declarations.append(result.release_value().release_value());
|
|
continue;
|
|
}
|
|
|
|
// FIXME: conditionalSect
|
|
|
|
if (auto result = parse_declaration_separator(); !result.is_error())
|
|
continue;
|
|
|
|
break;
|
|
}
|
|
|
|
rollback.disarm();
|
|
return declarations;
|
|
}
|
|
|
|
// 4.2.70 EntityDecl, https://www.w3.org/TR/xml/#NT-EntityDecl
|
|
ErrorOr<EntityDeclaration, ParseError> Parser::parse_entity_declaration()
|
|
{
|
|
// EntityDecl ::= GEDecl | PEDecl
|
|
if (auto result = parse_general_entity_declaration(); !result.is_error())
|
|
return result;
|
|
|
|
return parse_parameter_entity_declaration();
|
|
}
|
|
|
|
// 4.2.71 GEDecl, https://www.w3.org/TR/xml/#NT-GEDecl
|
|
ErrorOr<EntityDeclaration, ParseError> Parser::parse_general_entity_declaration()
|
|
{
|
|
auto rollback = rollback_point();
|
|
auto rule = enter_rule();
|
|
Variant<String, EntityDefinition, Empty> definition;
|
|
|
|
// GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
|
|
TRY(expect("<!ENTITY"));
|
|
auto accept = accept_rule();
|
|
|
|
TRY(skip_whitespace(Required::Yes));
|
|
auto name = TRY(parse_name());
|
|
TRY(skip_whitespace(Required::Yes));
|
|
// EntityDef ::= EntityValue | (ExternalID NDataDecl?)
|
|
if (auto result = parse_entity_value(); !result.is_error()) {
|
|
definition = result.release_value();
|
|
} else {
|
|
auto external_id = TRY(parse_external_id());
|
|
Optional<Name> notation;
|
|
if (auto notation_result = parse_notation_data_declaration(); !notation_result.is_error())
|
|
notation = notation_result.release_value();
|
|
|
|
definition = EntityDefinition {
|
|
move(external_id),
|
|
move(notation),
|
|
};
|
|
}
|
|
|
|
TRY(skip_whitespace());
|
|
TRY(expect(">"));
|
|
|
|
rollback.disarm();
|
|
return GEDeclaration {
|
|
move(name),
|
|
move(definition).downcast<String, EntityDefinition>(),
|
|
};
|
|
}
|
|
|
|
// 4.2.72 PEDecl, https://www.w3.org/TR/xml/#NT-PEDecl
|
|
ErrorOr<EntityDeclaration, ParseError> Parser::parse_parameter_entity_declaration()
|
|
{
|
|
auto rollback = rollback_point();
|
|
auto rule = enter_rule();
|
|
|
|
Variant<String, ExternalID, Empty> definition;
|
|
// PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
|
|
TRY(expect("<!ENTITY"));
|
|
auto accept = accept_rule();
|
|
|
|
TRY(skip_whitespace(Required::Yes));
|
|
TRY(expect("%"));
|
|
TRY(skip_whitespace(Required::Yes));
|
|
auto name = TRY(parse_name());
|
|
TRY(skip_whitespace(Required::Yes));
|
|
// PEDef ::= EntityValue | ExternalID
|
|
if (auto result = parse_entity_value(); !result.is_error())
|
|
definition = result.release_value();
|
|
else
|
|
definition = TRY(parse_external_id());
|
|
|
|
TRY(skip_whitespace());
|
|
TRY(expect(">"));
|
|
|
|
rollback.disarm();
|
|
return PEDeclaration {
|
|
move(name),
|
|
move(definition).downcast<String, ExternalID>(),
|
|
};
|
|
}
|
|
|
|
// 4.7.83 PublicID, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PublicID
|
|
ErrorOr<PublicID, ParseError> Parser::parse_public_id()
|
|
{
|
|
auto rollback = rollback_point();
|
|
auto rule = enter_rule();
|
|
|
|
// PublicID ::= 'PUBLIC' S PubidLiteral
|
|
TRY(expect("PUBLIC"));
|
|
auto accept = accept_rule();
|
|
|
|
TRY(skip_whitespace(Required::Yes));
|
|
auto text = TRY(parse_public_id_literal());
|
|
|
|
rollback.disarm();
|
|
return PublicID {
|
|
text,
|
|
};
|
|
}
|
|
|
|
// Character predicate for public identifier literals: space, CR, LF, the punctuation listed in
// the string below, and the ranges a-z, A-Z and 0-9. Used via `.contains(code_point)`.
constexpr static auto s_public_id_characters = set_to_search<StringSet("\x20\x0d\x0a-'()+,./:=?;!*#@$_%")>().unify(ranges_for_search<Range('a', 'z'), Range('A', 'Z'), Range('0', '9')>());
|
|
|
|
// 2.3.12, PubidLiteral, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidLiteral
|
|
ErrorOr<StringView, ParseError> Parser::parse_public_id_literal()
|
|
{
|
|
auto rollback = rollback_point();
|
|
auto rule = enter_rule();
|
|
|
|
// PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
|
|
auto quote = TRY(expect(is_any_of("'\""), "any of ' or \""));
|
|
auto accept = accept_rule();
|
|
|
|
auto id = TRY(expect_many(
|
|
[q = quote[0]](auto x) {
|
|
return (q == '\'' ? x != '\'' : true) && s_public_id_characters.contains(x);
|
|
},
|
|
"a PubidChar"));
|
|
TRY(expect(quote));
|
|
|
|
rollback.disarm();
|
|
return id;
|
|
}
|
|
|
|
// 2.3.11 SystemLiteral, https://www.w3.org/TR/xml/#NT-SystemLiteral
|
|
ErrorOr<StringView, ParseError> Parser::parse_system_id_literal()
|
|
{
|
|
auto rollback = rollback_point();
|
|
auto rule = enter_rule();
|
|
|
|
// SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
|
|
auto quote = TRY(expect(is_any_of("'\""), "any of ' or \""));
|
|
auto accept = accept_rule();
|
|
|
|
auto id = TRY(expect_many(is_not_any_of(quote), "not a quote"));
|
|
TRY(expect(quote));
|
|
|
|
rollback.disarm();
|
|
return id;
|
|
}
|
|
|
|
// 4.2.75 ExternalID, https://www.w3.org/TR/xml/#NT-ExternalID
|
|
ErrorOr<ExternalID, ParseError> Parser::parse_external_id()
|
|
{
|
|
auto rollback = rollback_point();
|
|
auto rule = enter_rule();
|
|
|
|
// ExternalID ::= 'SYSTEM' S SystemLiteral
|
|
// | 'PUBLIC' S PubidLiteral S SystemLiteral
|
|
Optional<PublicID> public_id;
|
|
SystemID system_id;
|
|
|
|
if (m_lexer.consume_specific("SYSTEM")) {
|
|
auto accept = accept_rule();
|
|
TRY(skip_whitespace(Required::Yes));
|
|
system_id = SystemID { TRY(parse_system_id_literal()) };
|
|
} else {
|
|
TRY(expect("PUBLIC"));
|
|
auto accept = accept_rule();
|
|
|
|
TRY(skip_whitespace(Required::Yes));
|
|
public_id = PublicID { TRY(parse_public_id_literal()) };
|
|
TRY(skip_whitespace(Required::Yes));
|
|
system_id = SystemID { TRY(parse_system_id_literal()) };
|
|
}
|
|
|
|
rollback.disarm();
|
|
return ExternalID {
|
|
move(public_id),
|
|
move(system_id),
|
|
};
|
|
}
|
|
|
|
// 4.2.2.76 NDataDecl, https://www.w3.org/TR/xml/#NT-NDataDecl
|
|
ErrorOr<Name, ParseError> Parser::parse_notation_data_declaration()
|
|
{
|
|
auto rollback = rollback_point();
|
|
auto rule = enter_rule();
|
|
|
|
// NDataDecl ::= S 'NDATA' S Name
|
|
TRY(skip_whitespace(Required::Yes));
|
|
auto accept = accept_rule();
|
|
|
|
TRY(expect("NDATA"));
|
|
TRY(skip_whitespace(Required::Yes));
|
|
auto name = TRY(parse_name());
|
|
|
|
rollback.disarm();
|
|
return name;
|
|
}
|
|
|
|
// 2.3.9 EntityValue, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityValue
|
|
ErrorOr<String, ParseError> Parser::parse_entity_value()
|
|
{
|
|
auto rollback = rollback_point();
|
|
auto rule = enter_rule();
|
|
StringBuilder builder;
|
|
|
|
// EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
|
|
// | "'" ([^%&'] | PEReference | Reference)* "'"
|
|
auto quote = TRY(expect(is_any_of("'\""), "any of ' or \""));
|
|
auto accept = accept_rule();
|
|
|
|
while (true) {
|
|
if (m_lexer.is_eof())
|
|
break;
|
|
if (m_lexer.next_is(quote))
|
|
break;
|
|
if (m_lexer.next_is('%')) {
|
|
auto start = m_lexer.tell();
|
|
TRY(parse_parameter_entity_reference());
|
|
builder.append(m_source.substring_view(start, m_lexer.tell() - start));
|
|
continue;
|
|
}
|
|
if (m_lexer.next_is('&')) {
|
|
auto start = m_lexer.tell();
|
|
TRY(parse_reference());
|
|
builder.append(m_source.substring_view(start, m_lexer.tell() - start));
|
|
continue;
|
|
}
|
|
builder.append(m_lexer.consume());
|
|
}
|
|
TRY(expect(quote));
|
|
|
|
rollback.disarm();
|
|
return builder.to_string();
|
|
}
|
|
|
|
// 2.7.18 CDSect, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-CDSect
|
|
ErrorOr<StringView, ParseError> Parser::parse_cdata_section()
|
|
{
|
|
auto rollback = rollback_point();
|
|
auto rule = enter_rule();
|
|
|
|
// CDSect ::= CDStart CData CDEnd
|
|
// CDStart ::= '<![CDATA['
|
|
// CData ::= (Char* - (Char* ']]>' Char*))
|
|
// CDEnd ::= ']]>'
|
|
TRY(expect("<![CDATA["));
|
|
auto accept = accept_rule();
|
|
|
|
auto section_start = m_lexer.tell();
|
|
while (!m_lexer.next_is("]]>")) {
|
|
if (m_lexer.is_eof())
|
|
break;
|
|
m_lexer.ignore();
|
|
}
|
|
auto section_end = m_lexer.tell();
|
|
TRY(expect("]]>"));
|
|
|
|
rollback.disarm();
|
|
return m_source.substring_view(section_start, section_end - section_start);
|
|
}
|
|
|
|
// 2.8.30 extSubset, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-extSubset
|
|
ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_external_subset()
|
|
{
|
|
auto rollback = rollback_point();
|
|
auto rule = enter_rule();
|
|
|
|
// extSubset ::= TextDecl? extSubsetDecl
|
|
(void)parse_text_declaration();
|
|
auto result = TRY(parse_external_subset_declaration());
|
|
|
|
rollback.disarm();
|
|
return result;
|
|
}
|
|
|
|
// 4.3.1.77 TextDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-TextDecl
|
|
ErrorOr<void, ParseError> Parser::parse_text_declaration()
|
|
{
|
|
auto rollback = rollback_point();
|
|
auto rule = enter_rule();
|
|
|
|
// TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
|
|
TRY(expect("<?xml"));
|
|
auto accept = accept_rule();
|
|
|
|
(void)parse_version_info();
|
|
TRY(parse_encoding_decl());
|
|
TRY(skip_whitespace());
|
|
TRY(expect("?>"));
|
|
|
|
rollback.disarm();
|
|
return {};
|
|
}
|
|
|
|
// Resolves a general entity reference.
//
// The entity is looked up among the general entity declarations of the document type (if
// any); the first declaration with a matching name wins. If none is found, the five
// predefined entities (amp, lt, gt, apos, quot) are returned directly. Otherwise the
// replacement text temporarily becomes the parser's source and is parsed according to
// `placement`:
//   - AttributeValue: returns the result of parse_attribute_value_inner("").
//   - Content: runs parse_content() and returns an empty string.
//
// Errors are returned for recursive definitions, undeclared entities, external entities used
// in attribute values, unparsed (NDATA) entities, and external entities that cannot be
// resolved through m_options.resolve_external_resource.
ErrorOr<String, ParseError> Parser::resolve_reference(EntityReference const& reference, ReferencePlacement placement)
{
    // Names of the entities currently being resolved, used to detect recursion.
    // NOTE(review): this is a function-local static, so it is shared by every Parser instance.
    static HashTable<Name> s_entities_in_resolution {};
    if (s_entities_in_resolution.contains(reference.name))
        return parse_error(m_lexer.tell(), String::formatted("Invalid recursive definition for '{}'", reference.name));

    s_entities_in_resolution.set(reference.name);
    ScopeGuard forget_entity {
        [&] {
            s_entities_in_resolution.remove(reference.name);
        }
    };

    Optional<String> replacement_text;
    if (m_doctype.has_value()) {
        // FIXME: Split these up and resolve them ahead of time.
        for (auto& declaration : m_doctype->markup_declarations) {
            // Only general entity declarations with a matching name are of interest.
            auto* entity = declaration.get_pointer<EntityDeclaration>();
            auto* general_entity = entity ? entity->get_pointer<GEDeclaration>() : nullptr;
            if (!general_entity || general_entity->name != reference.name)
                continue;

            TRY(general_entity->definition.visit(
                // Internal entity: the literal value is the replacement text.
                [&](String const& definition) -> ErrorOr<void, ParseError> {
                    replacement_text = definition;
                    return {};
                },
                // External entity: fetch the replacement text through the resolver callback.
                [&](EntityDefinition const& definition) -> ErrorOr<void, ParseError> {
                    if (placement == ReferencePlacement::AttributeValue)
                        return parse_error(m_lexer.tell(), String::formatted("Attribute references external entity '{}'", reference.name));

                    if (definition.notation.has_value())
                        return parse_error(0u, String::formatted("Entity reference to unparsed entity '{}'", reference.name));

                    if (!m_options.resolve_external_resource)
                        return parse_error(0u, String::formatted("Failed to resolve external entity '{}'", reference.name));

                    auto resource = m_options.resolve_external_resource(definition.id.system_id, definition.id.public_id);
                    if (resource.is_error())
                        return parse_error(0u, String::formatted("Failed to resolve external entity '{}': {}", reference.name, resource.error()));

                    replacement_text = resource.release_value();
                    return {};
                }));
            break;
        }
    }

    if (!replacement_text.has_value()) {
        // No declaration found: fall back to the predefined entities.
        if (reference.name == "amp")
            return "&";
        if (reference.name == "lt")
            return "<";
        if (reference.name == "gt")
            return ">";
        if (reference.name == "apos")
            return "'";
        if (reference.name == "quot")
            return "\"";
        return parse_error(0u, String::formatted("Reference to undeclared entity '{}'", reference.name));
    }

    // Parse the replacement text in place of the current source; both temporary changes are
    // undone when this function returns. The source must be swapped before the lexer, since
    // the new lexer is built from m_source.
    StringView replacement_source = *replacement_text;
    TemporaryChange source_override { m_source, replacement_source };
    TemporaryChange lexer_override { m_lexer, GenericLexer(m_source) };
    switch (placement) {
    case ReferencePlacement::AttributeValue:
        return TRY(parse_attribute_value_inner(""));
    case ReferencePlacement::Content:
        TRY(parse_content());
        return "";
    default:
        VERIFY_NOT_REACHED();
    }
}
|
|
|
|
}
|