12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791 |
- /*
- * Copyright (c) 2022, Ali Mohammad Pur <mpfard@serenityos.org>
- *
- * SPDX-License-Identifier: BSD-2-Clause
- */
- #include <LibXML/DOM/Document.h>
- #include <LibXML/Parser/Parser.h>
- struct Range {
- consteval Range(u32 start, u32 end)
- : start(start)
- , end(end)
- {
- }
- u32 start;
- u32 end;
- };
- template<auto... ranges>
- struct ranges_for_search {
- auto contains(u32 value) const
- {
- return ((value >= ranges.start && value <= ranges.end) || ...);
- }
- bool operator()(u32 value) const
- {
- return contains(value);
- }
- template<auto... ranges_to_include>
- consteval auto with() const
- {
- return ranges_for_search<ranges..., ranges_to_include...>();
- }
- template<auto... ranges_to_include>
- consteval auto unify(ranges_for_search<ranges_to_include...> const&) const
- {
- return ranges_for_search<ranges..., ranges_to_include...>();
- }
- };
- template<size_t Count, typename Element>
- struct StringSet {
- consteval StringSet(Element const (&entries)[Count])
- {
- for (size_t i = 0; i < Count - 1; ++i)
- elements[i] = entries[i];
- }
- consteval auto operator[](size_t i) const { return elements[i]; }
- Element elements[Count - 1];
- };
- template<StringSet chars>
- consteval static auto set_to_search()
- {
- return ([&]<auto... Ix>(IndexSequence<Ix...>) {
- return ranges_for_search<Range(chars[Ix], chars[Ix])...>();
- }(MakeIndexSequence<array_size(chars.elements)>()));
- }
- namespace XML {
- size_t Parser::s_debug_indent_level { 0 };
- void Parser::append_node(NonnullOwnPtr<Node> node)
- {
- if (m_entered_node) {
- auto& entered_element = m_entered_node->content.get<Node::Element>();
- entered_element.children.append(move(node));
- enter_node(*entered_element.children.last());
- } else {
- m_root_node = move(node);
- enter_node(*m_root_node);
- }
- }
- void Parser::append_text(StringView text, LineTrackingLexer::Position position)
- {
- if (m_listener) {
- m_listener->text(text);
- return;
- }
- if (!m_entered_node) {
- Node::Text node;
- node.builder.append(text);
- m_root_node = make<Node>(position, move(node));
- return;
- }
- m_entered_node->content.visit(
- [&](Node::Element& node) {
- if (!node.children.is_empty()) {
- auto* text_node = node.children.last()->content.get_pointer<Node::Text>();
- if (text_node) {
- text_node->builder.append(text);
- return;
- }
- }
- Node::Text text_node;
- text_node.builder.append(text);
- node.children.append(make<Node>(position, move(text_node), m_entered_node));
- },
- [&](auto&) {
- // Can't enter a text or comment node.
- VERIFY_NOT_REACHED();
- });
- }
- void Parser::append_comment(StringView text, LineTrackingLexer::Position position)
- {
- if (m_listener) {
- m_listener->comment(text);
- return;
- }
- // If there's no node to attach this to, drop it on the floor.
- // This can happen to comments in the prolog.
- if (!m_entered_node)
- return;
- m_entered_node->content.visit(
- [&](Node::Element& node) {
- node.children.append(make<Node>(position, Node::Comment { text }, m_entered_node));
- },
- [&](auto&) {
- // Can't enter a text or comment node.
- VERIFY_NOT_REACHED();
- });
- }
- void Parser::enter_node(Node& node)
- {
- if (m_listener) {
- auto& element = node.content.get<Node::Element>();
- m_listener->element_start(element.name, element.attributes);
- }
- if (&node != m_root_node.ptr())
- node.parent = m_entered_node;
- m_entered_node = &node;
- }
- void Parser::leave_node()
- {
- if (m_listener) {
- auto& element = m_entered_node->content.get<Node::Element>();
- m_listener->element_end(element.name);
- }
- m_entered_node = m_entered_node->parent;
- }
- ErrorOr<Document, ParseError> Parser::parse()
- {
- if (auto result = parse_internal(); result.is_error()) {
- if (m_parse_errors.is_empty())
- return result.release_error();
- return m_parse_errors.take_first();
- }
- return Document {
- m_root_node.release_nonnull(),
- move(m_doctype),
- move(m_processing_instructions),
- m_version,
- };
- }
- ErrorOr<void, ParseError> Parser::parse_with_listener(Listener& listener)
- {
- m_listener = &listener;
- ScopeGuard unset_listener { [this] { m_listener = nullptr; } };
- m_listener->set_source(m_source);
- m_listener->document_start();
- auto result = parse_internal();
- if (result.is_error())
- m_listener->error(result.error());
- m_listener->document_end();
- m_root_node.clear();
- return result;
- }
- // 2.3.3. S, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-S
- ErrorOr<void, ParseError> Parser::skip_whitespace(Required required)
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // S ::= (#x20 | #x9 | #xD | #xA)+
- auto matched = m_lexer.consume_while(is_any_of("\x20\x09\x0d\x0a"sv));
- if (required == Required::Yes && matched.is_empty())
- return parse_error(m_lexer.tell(), "Expected whitespace");
- rollback.disarm();
- return {};
- }
- // 2.2.a. RestrictedChar, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-RestrictedChar
- constexpr static auto s_restricted_characters = ranges_for_search<Range(0x1, 0x8), Range(0xb, 0xc), Range(0xe, 0x1f), Range(0x7f, 0x84), Range(0x86, 0x9f)>();
- // 2.1.1. Document, https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-well-formed
- ErrorOr<void, ParseError> Parser::parse_internal()
- {
- auto rule = enter_rule();
- // document ::= ( prolog element Misc* ) - ( Char* RestrictedChar Char* )
- TRY(parse_prolog());
- TRY(parse_element());
- while (true) {
- if (auto result = parse_misc(); result.is_error())
- break;
- }
- auto matched_source = m_source.substring_view(0, m_lexer.tell());
- if (auto it = find_if(matched_source.begin(), matched_source.end(), s_restricted_characters); !it.is_end()) {
- return parse_error(
- it.index(),
- ByteString::formatted("Invalid character #{:x} used in document", *it));
- }
- if (!m_lexer.is_eof())
- return parse_error(m_lexer.tell(), "Garbage after document");
- return {};
- }
- ErrorOr<void, ParseError> Parser::expect(StringView expected)
- {
- auto rollback = rollback_point();
- if (!m_lexer.consume_specific(expected)) {
- if (m_options.treat_errors_as_fatal)
- return parse_error(m_lexer.tell(), ByteString::formatted("Expected '{}'", expected));
- }
- rollback.disarm();
- return {};
- }
- template<typename Pred>
- requires(IsCallableWithArguments<Pred, bool, char>) ErrorOr<StringView, ParseError> Parser::expect(Pred predicate, StringView description)
- {
- auto rollback = rollback_point();
- auto start = m_lexer.tell();
- if (!m_lexer.next_is(predicate)) {
- if (m_options.treat_errors_as_fatal)
- return parse_error(m_lexer.tell(), ByteString::formatted("Expected {}", description));
- }
- m_lexer.ignore();
- rollback.disarm();
- return m_source.substring_view(start, m_lexer.tell() - start);
- }
- template<typename Pred>
- requires(IsCallableWithArguments<Pred, bool, char>) ErrorOr<StringView, ParseError> Parser::expect_many(Pred predicate, StringView description)
- {
- auto rollback = rollback_point();
- auto start = m_lexer.tell();
- while (m_lexer.next_is(predicate)) {
- if (m_lexer.is_eof())
- break;
- m_lexer.ignore();
- }
- if (m_lexer.tell() == start) {
- if (m_options.treat_errors_as_fatal) {
- return parse_error(m_lexer.tell(), ByteString::formatted("Expected {}", description));
- }
- }
- rollback.disarm();
- return m_source.substring_view(start, m_lexer.tell() - start);
- }
- // 2.8.22. Prolog, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-prolog
- ErrorOr<void, ParseError> Parser::parse_prolog()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // prolog ::= XMLDecl Misc* (doctypedecl Misc*)?
- // The following is valid in XML 1.0.
- // prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
- if (auto result = parse_xml_decl(); result.is_error()) {
- m_version = Version::Version10;
- m_in_compatibility_mode = true;
- }
- auto accept = accept_rule();
- while (true) {
- if (auto result = parse_misc(); result.is_error())
- break;
- }
- if (auto result = parse_doctype_decl(); !result.is_error()) {
- while (true) {
- if (auto result = parse_misc(); result.is_error())
- break;
- }
- }
- rollback.disarm();
- return {};
- }
- // 2.8.23. XMLDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-XMLDecl
- ErrorOr<void, ParseError> Parser::parse_xml_decl()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // XMLDecl::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
- TRY(expect("<?xml"sv));
- auto accept = accept_rule();
- TRY(parse_version_info());
- (void)parse_encoding_decl();
- (void)parse_standalone_document_decl();
- TRY(skip_whitespace());
- TRY(expect("?>"sv));
- rollback.disarm();
- return {};
- }
- // 2.8.24. VersionInfo, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-VersionInfo
- ErrorOr<void, ParseError> Parser::parse_version_info()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
- TRY(skip_whitespace(Required::Yes));
- TRY(expect("version"sv));
- auto accept = accept_rule();
- TRY(parse_eq());
- TRY(expect(is_any_of("'\""sv), "one of ' or \""sv));
- m_lexer.retreat();
- auto version_string = m_lexer.consume_quoted_string();
- if (version_string == "1.0") {
- // FIXME: Compatibility mode, figure out which rules are different in XML 1.0.
- m_version = Version::Version10;
- m_in_compatibility_mode = true;
- } else {
- if (version_string != "1.1" && m_options.treat_errors_as_fatal)
- return parse_error(m_lexer.tell(), ByteString::formatted("Expected '1.1', found '{}'", version_string));
- }
- m_version = Version::Version11;
- rollback.disarm();
- return {};
- }
- // 2.8.25. Eq, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Eq
- ErrorOr<void, ParseError> Parser::parse_eq()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // Eq ::= S? '=' S?
- auto accept = accept_rule();
- TRY(skip_whitespace());
- TRY(expect("="sv));
- TRY(skip_whitespace());
- rollback.disarm();
- return {};
- }
- // 4.3.3.80. EncodingDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EncodingDecl
- ErrorOr<void, ParseError> Parser::parse_encoding_decl()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )
- TRY(skip_whitespace(Required::Yes));
- TRY(expect("encoding"sv));
- auto accept = accept_rule();
- TRY(parse_eq());
- TRY(expect(is_any_of("'\""sv), "one of ' or \""sv));
- m_lexer.retreat();
- // FIXME: Actually do something with this encoding.
- m_encoding = m_lexer.consume_quoted_string();
- rollback.disarm();
- return {};
- }
- // 2.9.32 SDDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-rmd
- ErrorOr<void, ParseError> Parser::parse_standalone_document_decl()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
- TRY(skip_whitespace(Required::Yes));
- TRY(expect("standalone"sv));
- auto accept = accept_rule();
- TRY(parse_eq());
- TRY(expect(is_any_of("'\""sv), "one of ' or \""sv));
- m_lexer.retreat();
- auto value = m_lexer.consume_quoted_string();
- if (!value.is_one_of("yes", "no"))
- return parse_error(m_lexer.tell() - value.length(), "Expected one of 'yes' or 'no'");
- m_standalone = value == "yes";
- rollback.disarm();
- return {};
- }
- // 2.8.27. Misc, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Misc
- ErrorOr<void, ParseError> Parser::parse_misc()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // Misc ::= Comment | PI | S
- if (auto result = parse_comment(); !result.is_error()) {
- rollback.disarm();
- return {};
- }
- if (auto result = parse_processing_instruction(); !result.is_error()) {
- rollback.disarm();
- return {};
- }
- if (auto result = skip_whitespace(Required::Yes); !result.is_error()) {
- rollback.disarm();
- return {};
- }
- return parse_error(m_lexer.tell(), "Expected a match for 'Misc', but found none");
- }
- // 2.5.15 Comment, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Comment
- ErrorOr<void, ParseError> Parser::parse_comment()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
- auto comment_start = m_lexer.tell();
- TRY(expect("<!--"sv));
- auto accept = accept_rule();
- bool last_seen_a_dash = false;
- // FIXME: This should disallow surrogate blocks
- auto text = m_lexer.consume_while([&](auto ch) {
- if (ch != '-') {
- last_seen_a_dash = false;
- return true;
- }
- if (last_seen_a_dash)
- return false;
- last_seen_a_dash = true;
- return true;
- });
- if (last_seen_a_dash) {
- m_lexer.retreat();
- text = text.substring_view(0, text.length() - 1);
- }
- TRY(expect("-->"sv));
- if (m_options.preserve_comments)
- append_comment(text, m_lexer.position_for(comment_start));
- rollback.disarm();
- return {};
- }
- // 2.6.16 PI, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PI
- ErrorOr<void, ParseError> Parser::parse_processing_instruction()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
- TRY(expect("<?"sv));
- auto accept = accept_rule();
- auto target = TRY(parse_processing_instruction_target());
- ByteString data;
- if (auto result = skip_whitespace(Required::Yes); !result.is_error())
- data = m_lexer.consume_until("?>");
- TRY(expect("?>"sv));
- m_processing_instructions.set(target, data);
- rollback.disarm();
- return {};
- }
- // 2.6.17. PITarget, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
- ErrorOr<Name, ParseError> Parser::parse_processing_instruction_target()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
- auto target = TRY(parse_name());
- auto accept = accept_rule();
- if (target.equals_ignoring_ascii_case("xml"sv) && m_options.treat_errors_as_fatal) {
- return parse_error(
- m_lexer.tell() - target.length(),
- "Use of the reserved 'xml' name for processing instruction target name is disallowed");
- }
- rollback.disarm();
- return target;
- }
- // NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
- constexpr static auto s_name_start_characters = ranges_for_search<Range(':', ':'), Range('A', 'Z'), Range('_', '_'), Range('a', 'z'), Range(0xc0, 0xd6), Range(0xd8, 0xf6), Range(0xf8, 0x2ff), Range(0x370, 0x37d), Range(0x37f, 0x1fff), Range(0x200c, 0x200d), Range(0x2070, 0x218f), Range(0x2c00, 0x2fef), Range(0x3001, 0xd7ff), Range(0xf900, 0xfdcf), Range(0xfdf0, 0xfffd), Range(0x10000, 0xeffff)> {};
- // NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
- constexpr static auto s_name_characters = s_name_start_characters.with<Range('-', '-'), Range('.', '.'), Range('0', '9'), Range(0xb7, 0xb7), Range(0x0300, 0x036f), Range(0x203f, 0x2040)>();
- // 2.3.5. Name, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Name
- ErrorOr<Name, ParseError> Parser::parse_name()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // Name ::= NameStartChar (NameChar)*
- auto start = TRY(expect(s_name_start_characters, "a NameStartChar"sv));
- auto accept = accept_rule();
- auto rest = m_lexer.consume_while(s_name_characters);
- StringBuilder builder;
- builder.append(start);
- builder.append(rest);
- rollback.disarm();
- return builder.to_byte_string();
- }
- // 2.8.28. doctypedecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-doctypedecl
- ErrorOr<void, ParseError> Parser::parse_doctype_decl()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- Doctype doctype;
- // doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
- TRY(expect("<!DOCTYPE"sv));
- auto accept = accept_rule();
- TRY(skip_whitespace(Required::Yes));
- doctype.type = TRY(parse_name());
- if (auto result = skip_whitespace(Required::Yes); !result.is_error()) {
- auto id_start = m_lexer.tell();
- if (auto id_result = parse_external_id(); !id_result.is_error()) {
- doctype.external_id = id_result.release_value();
- if (m_options.resolve_external_resource) {
- auto resource_result = m_options.resolve_external_resource(doctype.external_id->system_id, doctype.external_id->public_id);
- if (resource_result.is_error()) {
- return parse_error(
- id_start,
- ByteString::formatted("Failed to resolve external subset '{}': {}", doctype.external_id->system_id.system_literal, resource_result.error()));
- }
- StringView resolved_source = resource_result.value();
- TemporaryChange source { m_source, resolved_source };
- TemporaryChange lexer { m_lexer, LineTrackingLexer(m_source) };
- auto declarations = TRY(parse_external_subset());
- if (!m_lexer.is_eof()) {
- return parse_error(
- m_lexer.tell(),
- ByteString::formatted("Failed to resolve external subset '{}': garbage after declarations", doctype.external_id->system_id.system_literal));
- }
- doctype.markup_declarations.extend(move(declarations));
- }
- }
- }
- TRY(skip_whitespace(Required::No));
- if (m_lexer.consume_specific('[')) {
- auto internal_subset = TRY(parse_internal_subset());
- TRY(expect("]"sv));
- TRY(skip_whitespace());
- doctype.markup_declarations.extend(internal_subset);
- }
- TRY(expect(">"sv));
- rollback.disarm();
- m_doctype = move(doctype);
- return {};
- }
- // 3.39. element, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-element
- ErrorOr<void, ParseError> Parser::parse_element()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // element ::= EmptyElemTag
- // | STag content ETag
- if (auto result = parse_empty_element_tag(); !result.is_error()) {
- append_node(result.release_value());
- leave_node();
- rollback.disarm();
- return {};
- }
- auto start_tag = TRY(parse_start_tag());
- auto& node = *start_tag;
- auto& tag = node.content.get<Node::Element>();
- append_node(move(start_tag));
- ScopeGuard quit {
- [&] {
- leave_node();
- }
- };
- TRY(parse_content());
- auto tag_location = m_lexer.tell();
- auto closing_name = TRY(parse_end_tag());
- // Well-formedness constraint: The Name in an element's end-tag MUST match the element type in the start-tag.
- if (m_options.treat_errors_as_fatal && closing_name != tag.name)
- return parse_error(tag_location, "Invalid closing tag");
- rollback.disarm();
- return {};
- }
- // 3.1.44. EmptyElemTag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EmptyElemTag
- ErrorOr<NonnullOwnPtr<Node>, ParseError> Parser::parse_empty_element_tag()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
- auto tag_start = m_lexer.tell();
- TRY(expect("<"sv));
- auto accept = accept_rule();
- auto name = TRY(parse_name());
- HashMap<Name, ByteString> attributes;
- while (true) {
- if (auto result = skip_whitespace(Required::Yes); result.is_error())
- break;
- if (auto result = parse_attribute(); !result.is_error()) {
- auto attribute = result.release_value();
- attributes.set(move(attribute.name), move(attribute.value));
- } else {
- break;
- }
- }
- TRY(skip_whitespace());
- TRY(expect("/>"sv));
- rollback.disarm();
- return make<Node>(m_lexer.position_for(tag_start), Node::Element { move(name), move(attributes), {} });
- }
- // 3.1.41. Attribute, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Attribute
- ErrorOr<Attribute, ParseError> Parser::parse_attribute()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // Attribute ::= Name Eq AttValue
- auto name = TRY(parse_name());
- auto accept = accept_rule();
- TRY(parse_eq());
- auto value = TRY(parse_attribute_value());
- rollback.disarm();
- return Attribute {
- move(name),
- move(value),
- };
- }
- // 2.3.10. AttValue, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttValue
- ErrorOr<ByteString, ParseError> Parser::parse_attribute_value()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // AttValue ::= '"' ([^<&"] | Reference)* '"'
- // | "'" ([^<&'] | Reference)* "'"
- auto quote = TRY(expect(is_any_of("'\""sv), "one of ' or \""sv));
- auto accept = accept_rule();
- auto text = TRY(parse_attribute_value_inner(quote));
- TRY(expect(quote));
- rollback.disarm();
- return text;
- }
- ErrorOr<ByteString, ParseError> Parser::parse_attribute_value_inner(StringView disallow)
- {
- StringBuilder builder;
- while (true) {
- if (m_lexer.next_is(is_any_of(disallow)) || m_lexer.is_eof())
- break;
- if (m_lexer.next_is('<')) {
- // Not allowed, return a nice error to make it easier to debug.
- return parse_error(m_lexer.tell(), "Unescaped '<' not allowed in attribute values");
- }
- if (m_lexer.next_is('&')) {
- auto reference = TRY(parse_reference());
- if (auto* char_reference = reference.get_pointer<ByteString>())
- builder.append(*char_reference);
- else
- builder.append(TRY(resolve_reference(reference.get<EntityReference>(), ReferencePlacement::AttributeValue)));
- } else {
- builder.append(m_lexer.consume());
- }
- }
- return builder.to_byte_string();
- }
- // Char ::= [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
- constexpr static auto s_characters = ranges_for_search<Range(0x1, 0xd7ff), Range(0xe000, 0xfffd), Range(0x10000, 0x10ffff)>();
- // 4.1.67. Reference, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Reference
- ErrorOr<Variant<Parser::EntityReference, ByteString>, ParseError> Parser::parse_reference()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // Reference ::= EntityRef | CharRef
- // 4.1.68. EntityRef
- // EntityRef ::= '&' Name ';'
- // 4.1.66. CharRef
- // CharRef ::= '&#' [0-9]+ ';'
- // | '&#x' [0-9a-fA-F]+ ';'
- auto reference_start = m_lexer.tell();
- TRY(expect("&"sv));
- auto accept = accept_rule();
- auto name_result = parse_name();
- if (name_result.is_error()) {
- TRY(expect("#"sv));
- Optional<u32> code_point;
- if (m_lexer.consume_specific('x')) {
- auto hex = TRY(expect_many(
- ranges_for_search<Range('0', '9'), Range('a', 'f'), Range('A', 'F')>(),
- "any of [0-9a-fA-F]"sv));
- code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(hex);
- } else {
- auto decimal = TRY(expect_many(
- ranges_for_search<Range('0', '9')>(),
- "any of [0-9]"sv));
- code_point = decimal.to_number<u32>();
- }
- if (!code_point.has_value() || !s_characters.contains(*code_point))
- return parse_error(reference_start, "Invalid character reference");
- TRY(expect(";"sv));
- StringBuilder builder;
- builder.append_code_point(*code_point);
- rollback.disarm();
- return builder.to_byte_string();
- }
- auto name = name_result.release_value();
- TRY(expect(";"sv));
- rollback.disarm();
- return EntityReference { move(name) };
- }
- // 3.1.40 STag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-STag
- ErrorOr<NonnullOwnPtr<Node>, ParseError> Parser::parse_start_tag()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // STag ::= '<' Name (S Attribute)* S? '>'
- auto tag_start = m_lexer.tell();
- TRY(expect("<"sv));
- auto accept = accept_rule();
- auto name = TRY(parse_name());
- HashMap<Name, ByteString> attributes;
- while (true) {
- if (auto result = skip_whitespace(Required::Yes); result.is_error())
- break;
- if (auto result = parse_attribute(); !result.is_error()) {
- auto attribute = result.release_value();
- attributes.set(move(attribute.name), move(attribute.value));
- } else {
- break;
- }
- }
- TRY(skip_whitespace());
- TRY(expect(">"sv));
- rollback.disarm();
- return make<Node>(m_lexer.position_for(tag_start), Node::Element { move(name), move(attributes), {} });
- }
- // 3.1.42 ETag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-ETag
- ErrorOr<Name, ParseError> Parser::parse_end_tag()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // ETag ::= '</' Name S? '>'
- TRY(expect("</"sv));
- auto accept = accept_rule();
- auto name = TRY(parse_name());
- TRY(skip_whitespace());
- TRY(expect(">"sv));
- rollback.disarm();
- return name;
- }
- // 3.1.43 content, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-content
- ErrorOr<void, ParseError> Parser::parse_content()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
- auto content_start = m_lexer.tell();
- if (auto result = parse_char_data(); !result.is_error())
- append_text(result.release_value(), m_lexer.position_for(content_start));
- while (true) {
- auto node_start = m_lexer.tell();
- if (auto result = parse_element(); !result.is_error())
- goto try_char_data;
- if (auto result = parse_reference(); !result.is_error()) {
- auto reference = result.release_value();
- auto reference_offset = m_lexer.position_for(node_start);
- if (auto char_reference = reference.get_pointer<ByteString>())
- append_text(*char_reference, reference_offset);
- else
- append_text(TRY(resolve_reference(reference.get<EntityReference>(), ReferencePlacement::Content)), reference_offset);
- goto try_char_data;
- }
- if (auto result = parse_cdata_section(); !result.is_error()) {
- if (m_options.preserve_cdata)
- append_text(result.release_value(), m_lexer.position_for(node_start));
- goto try_char_data;
- }
- if (auto result = parse_processing_instruction(); !result.is_error())
- goto try_char_data;
- if (auto result = parse_comment(); !result.is_error())
- goto try_char_data;
- break;
- try_char_data:;
- if (auto result = parse_char_data(); !result.is_error())
- append_text(result.release_value(), m_lexer.position_for(node_start));
- }
- rollback.disarm();
- return {};
- }
- // 2.4.14 CharData, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-CharData
- ErrorOr<StringView, ParseError> Parser::parse_char_data()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
- auto cend_state = 0; // 1: ], 2: ], 3: >
- auto text = m_lexer.consume_while([&](auto ch) {
- if (ch == '<' || ch == '&' || cend_state == 3)
- return false;
- switch (cend_state) {
- case 0:
- case 1:
- if (ch == ']')
- cend_state++;
- else
- cend_state = 0;
- return true;
- case 2:
- if (ch == '>') {
- cend_state++;
- return true;
- }
- cend_state = 0;
- return true;
- default:
- VERIFY_NOT_REACHED();
- }
- });
- if (cend_state == 3) {
- m_lexer.retreat(3);
- text = text.substring_view(0, text.length() - 3);
- }
- rollback.disarm();
- return text;
- }
- // 2.8.28b intSubset, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-intSubset
- ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_internal_subset()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- Vector<MarkupDeclaration> declarations;
- // intSubset ::= (markupdecl | DeclSep)*
- while (true) {
- if (auto result = parse_markup_declaration(); !result.is_error()) {
- auto maybe_declaration = result.release_value();
- if (maybe_declaration.has_value())
- declarations.append(maybe_declaration.release_value());
- continue;
- }
- if (auto result = parse_declaration_separator(); !result.is_error()) {
- // The markup declarations may be made up in whole or in part of the replacement text of parameter entities.
- // The replacement text of a parameter entity reference in a DeclSep MUST match the production extSubsetDecl.
- auto maybe_replacement_text = result.release_value();
- if (maybe_replacement_text.has_value()) {
- TemporaryChange<StringView> source { m_source, maybe_replacement_text.value() };
- TemporaryChange lexer { m_lexer, LineTrackingLexer { m_source } };
- auto contained_declarations = TRY(parse_external_subset_declaration());
- declarations.extend(move(contained_declarations));
- }
- continue;
- }
- break;
- }
- rollback.disarm();
- return declarations;
- }
- // 2.8.29 markupdecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-markupdecl
- ErrorOr<Optional<MarkupDeclaration>, ParseError> Parser::parse_markup_declaration()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment
- if (auto result = parse_element_declaration(); !result.is_error()) {
- rollback.disarm();
- return MarkupDeclaration { result.release_value() };
- }
- if (auto result = parse_attribute_list_declaration(); !result.is_error()) {
- rollback.disarm();
- return MarkupDeclaration { result.release_value() };
- }
- if (auto result = parse_entity_declaration(); !result.is_error()) {
- rollback.disarm();
- return MarkupDeclaration { result.release_value() };
- }
- if (auto result = parse_notation_declaration(); !result.is_error()) {
- rollback.disarm();
- return MarkupDeclaration { result.release_value() };
- }
- if (auto result = parse_processing_instruction(); !result.is_error()) {
- rollback.disarm();
- return Optional<MarkupDeclaration> {};
- }
- if (auto result = parse_comment(); !result.is_error()) {
- rollback.disarm();
- return Optional<MarkupDeclaration> {};
- }
- return parse_error(m_lexer.tell(), "Expected one of elementdecl, attlistdecl, entitydecl, notationdecl, PI or comment");
- }
- // 2.8.28a DeclSep, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-DeclSep
- ErrorOr<Optional<ByteString>, ParseError> Parser::parse_declaration_separator()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // DeclSep ::= PEReference | S
- if (auto name = parse_parameter_entity_reference(); !name.is_error()) {
- rollback.disarm();
- // FIXME: Resolve this PEReference.
- return "";
- }
- if (auto result = skip_whitespace(Required::Yes); !result.is_error()) {
- rollback.disarm();
- return Optional<ByteString> {};
- }
- return parse_error(m_lexer.tell(), "Expected either whitespace, or a PEReference");
- }
- // 4.1.69 PEReference, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PEReference
- ErrorOr<Name, ParseError> Parser::parse_parameter_entity_reference()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // PEReference ::= '%' Name ';'
- TRY(expect("%"sv));
- auto accept = accept_rule();
- auto name = TRY(parse_name());
- TRY(expect(";"sv));
- rollback.disarm();
- return name;
- }
- // 3.2.46 elementdecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-elementdecl
- ErrorOr<ElementDeclaration, ParseError> Parser::parse_element_declaration()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // FIXME: Apparently both name _and_ contentspec here are allowed to be PEReferences,
- // but the grammar does not allow that, figure this out.
- // elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'
- TRY(expect("<!ELEMENT"sv));
- auto accept = accept_rule();
- TRY(skip_whitespace(Required::Yes));
- auto name = TRY(parse_name());
- TRY(skip_whitespace(Required::Yes));
- auto spec = TRY(parse_content_spec());
- TRY(expect(">"sv));
- rollback.disarm();
- return ElementDeclaration {
- move(name),
- move(spec),
- };
- }
- // 3.3.52 AttlistDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttlistDecl
- ErrorOr<AttributeListDeclaration, ParseError> Parser::parse_attribute_list_declaration()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- AttributeListDeclaration declaration;
- // AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
- TRY(expect("<!ATTLIST"sv));
- auto accept = accept_rule();
- TRY(skip_whitespace(Required::Yes));
- declaration.type = TRY(parse_name());
- while (true) {
- if (auto result = parse_attribute_definition(); !result.is_error())
- declaration.attributes.append(result.release_value());
- else
- break;
- }
- TRY(skip_whitespace());
- TRY(expect(">"sv));
- rollback.disarm();
- return declaration;
- }
- // 3.3.53 AttDef, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttDef
- ErrorOr<AttributeListDeclaration::Definition, ParseError> Parser::parse_attribute_definition()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- Optional<AttributeListDeclaration::Type> type;
- Optional<AttributeListDeclaration::Default> default_;
- // AttDef ::= S Name S AttType S DefaultDecl
- TRY(skip_whitespace(Required::Yes));
- auto name = TRY(parse_name());
- auto accept = accept_rule();
- TRY(skip_whitespace(Required::Yes));
- // AttType ::= StringType | TokenizedType | EnumeratedType
- // StringType ::= 'CDATA'
- // TokenizedType ::= 'ID'
- // | 'IDREF'
- // | 'IDREFS'
- // | 'ENTITY'
- // | 'ENTITIES'
- // | 'NMTOKEN'
- // | 'NMTOKENS'
- // EnumeratedType ::= NotationType | Enumeration
- // NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
- // Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
- if (m_lexer.consume_specific("CDATA"sv)) {
- type = AttributeListDeclaration::StringType::CData;
- } else if (m_lexer.consume_specific("IDREFS"sv)) {
- type = AttributeListDeclaration::TokenizedType::IDRefs;
- } else if (m_lexer.consume_specific("IDREF"sv)) {
- type = AttributeListDeclaration::TokenizedType::IDRef;
- } else if (m_lexer.consume_specific("ID"sv)) {
- type = AttributeListDeclaration::TokenizedType::ID;
- } else if (m_lexer.consume_specific("ENTITIES"sv)) {
- type = AttributeListDeclaration::TokenizedType::Entities;
- } else if (m_lexer.consume_specific("ENTITY"sv)) {
- type = AttributeListDeclaration::TokenizedType::Entity;
- } else if (m_lexer.consume_specific("NMTOKENS"sv)) {
- type = AttributeListDeclaration::TokenizedType::NMTokens;
- } else if (m_lexer.consume_specific("NMTOKEN"sv)) {
- type = AttributeListDeclaration::TokenizedType::NMToken;
- } else if (m_lexer.consume_specific("NOTATION"sv)) {
- HashTable<Name> names;
- TRY(skip_whitespace(Required::Yes));
- TRY(expect("("sv));
- TRY(skip_whitespace());
- names.set(TRY(parse_name()));
- while (true) {
- TRY(skip_whitespace());
- if (auto result = expect("|"sv); result.is_error())
- break;
- TRY(skip_whitespace());
- names.set(TRY(parse_name()));
- }
- TRY(skip_whitespace());
- TRY(expect(")"sv));
- type = AttributeListDeclaration::NotationType { move(names) };
- } else {
- HashTable<ByteString> names;
- TRY(expect("("sv));
- TRY(skip_whitespace());
- names.set(TRY(parse_nm_token()));
- while (true) {
- TRY(skip_whitespace());
- if (auto result = expect("|"sv); result.is_error())
- break;
- TRY(skip_whitespace());
- names.set(TRY(parse_nm_token()));
- }
- TRY(skip_whitespace());
- TRY(expect(")"sv));
- type = AttributeListDeclaration::Enumeration { move(names) };
- }
- TRY(skip_whitespace(Required::Yes));
- // DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
- // | (('#FIXED' S)? AttValue)
- if (m_lexer.consume_specific("#REQUIRED"sv)) {
- default_ = AttributeListDeclaration::Required {};
- } else if (m_lexer.consume_specific("#IMPLIED"sv)) {
- default_ = AttributeListDeclaration::Implied {};
- } else {
- bool fixed = false;
- if (m_lexer.consume_specific("#FIXED"sv)) {
- TRY(skip_whitespace(Required::Yes));
- fixed = true;
- }
- auto value = TRY(parse_attribute_value());
- if (fixed)
- default_ = AttributeListDeclaration::Fixed { move(value) };
- else
- default_ = AttributeListDeclaration::DefaultValue { move(value) };
- }
- rollback.disarm();
- return AttributeListDeclaration::Definition {
- move(name),
- type.release_value(),
- default_.release_value(),
- };
- }
- // 2.3.7 Nmtoken, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Nmtoken
- ErrorOr<StringView, ParseError> Parser::parse_nm_token()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // Nmtoken ::= (NameChar)+
- auto token = TRY(expect_many(s_name_characters, "a NameChar"sv));
- rollback.disarm();
- return token;
- }
- // 4.7.82 NotationDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#Notations
- ErrorOr<NotationDeclaration, ParseError> Parser::parse_notation_declaration()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- Variant<ExternalID, PublicID, Empty> notation;
- // NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>'
- TRY(expect("<!NOTATION"sv));
- auto accept = accept_rule();
- TRY(skip_whitespace(Required::Yes));
- auto name = TRY(parse_name());
- TRY(skip_whitespace(Required::Yes));
- if (auto result = parse_external_id(); !result.is_error())
- notation = result.release_value();
- else
- notation = TRY(parse_public_id());
- TRY(expect(">"sv));
- rollback.disarm();
- return NotationDeclaration {
- move(name),
- move(notation).downcast<ExternalID, PublicID>(),
- };
- }
- // 3.2.46 contentspec, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-contentspec
- ErrorOr<ElementDeclaration::ContentSpec, ParseError> Parser::parse_content_spec()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- Optional<ElementDeclaration::ContentSpec> content_spec;
- // contentspec ::= 'EMPTY' | 'ANY' | Mixed | children
- if (m_lexer.consume_specific("EMPTY"sv)) {
- content_spec = ElementDeclaration::Empty {};
- } else if (m_lexer.consume_specific("ANY"sv)) {
- content_spec = ElementDeclaration::Any {};
- } else {
- TRY(expect("("sv));
- TRY(skip_whitespace());
- if (m_lexer.consume_specific("#PCDATA"sv)) {
- HashTable<Name> names;
- // Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*'
- // | '(' S? '#PCDATA' S? ')'
- TRY(skip_whitespace());
- if (m_lexer.consume_specific(")*"sv)) {
- content_spec = ElementDeclaration::Mixed { .types = {}, .many = true };
- } else if (m_lexer.consume_specific(')')) {
- content_spec = ElementDeclaration::Mixed { .types = {}, .many = false };
- } else {
- while (true) {
- TRY(skip_whitespace());
- if (!m_lexer.consume_specific('|'))
- break;
- TRY(skip_whitespace());
- if (auto result = parse_name(); !result.is_error())
- names.set(result.release_value());
- else
- return parse_error(m_lexer.tell(), "Expected a Name");
- }
- TRY(skip_whitespace());
- TRY(expect(")*"sv));
- content_spec = ElementDeclaration::Mixed { .types = move(names), .many = true };
- }
- } else {
- while (!m_lexer.next_is('('))
- m_lexer.retreat();
- // children ::= (choice | seq) ('?' | '*' | '+')?
- // cp ::= (Name | choice | seq) ('?' | '*' | '+')?
- // choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')'
- // seq ::= '(' S? cp ( S? ',' S? cp )* S? ')'
- Function<ErrorOr<ElementDeclaration::Children::Choice, ParseError>()> parse_choice;
- Function<ErrorOr<ElementDeclaration::Children::Sequence, ParseError>()> parse_sequence;
- auto parse_cp_init = [&]() -> ErrorOr<Variant<Name, ElementDeclaration::Children::Choice, ElementDeclaration::Children::Sequence>, ParseError> {
- if (auto result = parse_name(); !result.is_error())
- return result.release_value();
- if (auto result = parse_choice(); !result.is_error())
- return result.release_value();
- return TRY(parse_sequence());
- };
- auto parse_qualifier = [&]() -> ElementDeclaration::Children::Qualifier {
- ElementDeclaration::Children::Qualifier qualifier { ElementDeclaration::Children::Qualifier::ExactlyOnce };
- if (m_lexer.consume_specific('?'))
- qualifier = ElementDeclaration::Children::Qualifier::Optional;
- else if (m_lexer.consume_specific('*'))
- qualifier = ElementDeclaration::Children::Qualifier::Any;
- else if (m_lexer.consume_specific('+'))
- qualifier = ElementDeclaration::Children::Qualifier::OneOrMore;
- return qualifier;
- };
- auto parse_cp = [&]() -> ErrorOr<ElementDeclaration::Children::Entry, ParseError> {
- auto sub_entry = TRY(parse_cp_init());
- auto qualifier = parse_qualifier();
- return ElementDeclaration::Children::Entry {
- move(sub_entry),
- qualifier,
- };
- };
- parse_choice = [&]() -> ErrorOr<ElementDeclaration::Children::Choice, ParseError> {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- TRY(expect("("sv));
- auto accept = accept_rule();
- TRY(skip_whitespace());
- Vector<ElementDeclaration::Children::Entry> choices;
- choices.append(TRY(parse_cp()));
- while (true) {
- TRY(skip_whitespace());
- if (!m_lexer.consume_specific('|'))
- break;
- TRY(skip_whitespace());
- choices.append(TRY(parse_cp()));
- }
- TRY(expect(")"sv));
- if (choices.size() < 2)
- return parse_error(m_lexer.tell(), "Expected more than one choice");
- TRY(skip_whitespace());
- auto qualifier = parse_qualifier();
- rollback.disarm();
- return ElementDeclaration::Children::Choice {
- move(choices),
- qualifier,
- };
- };
- parse_sequence = [&]() -> ErrorOr<ElementDeclaration::Children::Sequence, ParseError> {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- TRY(expect("("sv));
- auto accept = accept_rule();
- TRY(skip_whitespace());
- Vector<ElementDeclaration::Children::Entry> entries;
- entries.append(TRY(parse_cp()));
- while (true) {
- TRY(skip_whitespace());
- if (!m_lexer.consume_specific(','))
- break;
- TRY(skip_whitespace());
- entries.append(TRY(parse_cp()));
- }
- TRY(expect(")"sv));
- TRY(skip_whitespace());
- auto qualifier = parse_qualifier();
- rollback.disarm();
- return ElementDeclaration::Children::Sequence {
- move(entries),
- qualifier,
- };
- };
- if (auto result = parse_choice(); !result.is_error()) {
- auto qualifier = parse_qualifier();
- content_spec = ElementDeclaration::Children {
- result.release_value(),
- qualifier,
- };
- } else {
- auto sequence = TRY(parse_sequence());
- auto qualifier = parse_qualifier();
- content_spec = ElementDeclaration::Children {
- move(sequence),
- qualifier,
- };
- }
- }
- }
- rollback.disarm();
- return content_spec.release_value();
- }
- // 2.8.31 extSubsetDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-extSubsetDecl
- ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_external_subset_declaration()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- Vector<MarkupDeclaration> declarations;
- // extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep )*
- while (true) {
- if (auto result = parse_markup_declaration(); !result.is_error()) {
- if (result.value().has_value())
- declarations.append(result.release_value().release_value());
- continue;
- }
- // FIXME: conditionalSect
- if (auto result = parse_declaration_separator(); !result.is_error())
- continue;
- break;
- }
- rollback.disarm();
- return declarations;
- }
- // 4.2.70 EntityDecl, https://www.w3.org/TR/xml/#NT-EntityDecl
- ErrorOr<EntityDeclaration, ParseError> Parser::parse_entity_declaration()
- {
- // EntityDecl ::= GEDecl | PEDecl
- if (auto result = parse_general_entity_declaration(); !result.is_error())
- return result;
- return parse_parameter_entity_declaration();
- }
- // 4.2.71 GEDecl, https://www.w3.org/TR/xml/#NT-GEDecl
- ErrorOr<EntityDeclaration, ParseError> Parser::parse_general_entity_declaration()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- Variant<ByteString, EntityDefinition, Empty> definition;
- // GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
- TRY(expect("<!ENTITY"sv));
- auto accept = accept_rule();
- TRY(skip_whitespace(Required::Yes));
- auto name = TRY(parse_name());
- TRY(skip_whitespace(Required::Yes));
- // EntityDef ::= EntityValue | (ExternalID NDataDecl?)
- if (auto result = parse_entity_value(); !result.is_error()) {
- definition = result.release_value();
- } else {
- auto external_id = TRY(parse_external_id());
- Optional<Name> notation;
- if (auto notation_result = parse_notation_data_declaration(); !notation_result.is_error())
- notation = notation_result.release_value();
- definition = EntityDefinition {
- move(external_id),
- move(notation),
- };
- }
- TRY(skip_whitespace());
- TRY(expect(">"sv));
- rollback.disarm();
- return GEDeclaration {
- move(name),
- move(definition).downcast<ByteString, EntityDefinition>(),
- };
- }
- // 4.2.72 PEDecl, https://www.w3.org/TR/xml/#NT-PEDecl
- ErrorOr<EntityDeclaration, ParseError> Parser::parse_parameter_entity_declaration()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- Variant<ByteString, ExternalID, Empty> definition;
- // PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
- TRY(expect("<!ENTITY"sv));
- auto accept = accept_rule();
- TRY(skip_whitespace(Required::Yes));
- TRY(expect("%"sv));
- TRY(skip_whitespace(Required::Yes));
- auto name = TRY(parse_name());
- TRY(skip_whitespace(Required::Yes));
- // PEDef ::= EntityValue | ExternalID
- if (auto result = parse_entity_value(); !result.is_error())
- definition = result.release_value();
- else
- definition = TRY(parse_external_id());
- TRY(skip_whitespace());
- TRY(expect(">"sv));
- rollback.disarm();
- return PEDeclaration {
- move(name),
- move(definition).downcast<ByteString, ExternalID>(),
- };
- }
- // 4.7.83 PublicID, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PublicID
- ErrorOr<PublicID, ParseError> Parser::parse_public_id()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // PublicID ::= 'PUBLIC' S PubidLiteral
- TRY(expect("PUBLIC"sv));
- auto accept = accept_rule();
- TRY(skip_whitespace(Required::Yes));
- auto text = TRY(parse_public_id_literal());
- rollback.disarm();
- return PublicID {
- text,
- };
- }
- constexpr static auto s_public_id_characters = set_to_search<StringSet("\x20\x0d\x0a-'()+,./:=?;!*#@$_%")>().unify(ranges_for_search<Range('a', 'z'), Range('A', 'Z'), Range('0', '9')>());
- // 2.3.12, PubidLiteral, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidLiteral
- ErrorOr<StringView, ParseError> Parser::parse_public_id_literal()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
- auto quote = TRY(expect(is_any_of("'\""sv), "any of ' or \""sv));
- auto accept = accept_rule();
- auto id = TRY(expect_many(
- [q = quote[0]](auto x) {
- return (q == '\'' ? x != '\'' : true) && s_public_id_characters.contains(x);
- },
- "a PubidChar"sv));
- TRY(expect(quote));
- rollback.disarm();
- return id;
- }
- // 2.3.11 SystemLiteral, https://www.w3.org/TR/xml/#NT-SystemLiteral
- ErrorOr<StringView, ParseError> Parser::parse_system_id_literal()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
- auto quote = TRY(expect(is_any_of("'\""sv), "any of ' or \""sv));
- auto accept = accept_rule();
- auto id = TRY(expect_many(is_not_any_of(quote), "not a quote"sv));
- TRY(expect(quote));
- rollback.disarm();
- return id;
- }
- // 4.2.75 ExternalID, https://www.w3.org/TR/xml/#NT-ExternalID
- ErrorOr<ExternalID, ParseError> Parser::parse_external_id()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // ExternalID ::= 'SYSTEM' S SystemLiteral
- // | 'PUBLIC' S PubidLiteral S SystemLiteral
- Optional<PublicID> public_id;
- SystemID system_id;
- if (m_lexer.consume_specific("SYSTEM"sv)) {
- auto accept = accept_rule();
- TRY(skip_whitespace(Required::Yes));
- system_id = SystemID { TRY(parse_system_id_literal()) };
- } else {
- TRY(expect("PUBLIC"sv));
- auto accept = accept_rule();
- TRY(skip_whitespace(Required::Yes));
- public_id = PublicID { TRY(parse_public_id_literal()) };
- TRY(skip_whitespace(Required::Yes));
- system_id = SystemID { TRY(parse_system_id_literal()) };
- }
- rollback.disarm();
- return ExternalID {
- move(public_id),
- move(system_id),
- };
- }
- // 4.2.2.76 NDataDecl, https://www.w3.org/TR/xml/#NT-NDataDecl
- ErrorOr<Name, ParseError> Parser::parse_notation_data_declaration()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // NDataDecl ::= S 'NDATA' S Name
- TRY(skip_whitespace(Required::Yes));
- auto accept = accept_rule();
- TRY(expect("NDATA"sv));
- TRY(skip_whitespace(Required::Yes));
- auto name = TRY(parse_name());
- rollback.disarm();
- return name;
- }
- // 2.3.9 EntityValue, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityValue
- ErrorOr<ByteString, ParseError> Parser::parse_entity_value()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- StringBuilder builder;
- // EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
- // | "'" ([^%&'] | PEReference | Reference)* "'"
- auto quote = TRY(expect(is_any_of("'\""sv), "any of ' or \""sv));
- auto accept = accept_rule();
- while (true) {
- if (m_lexer.is_eof())
- break;
- if (m_lexer.next_is(quote))
- break;
- if (m_lexer.next_is('%')) {
- auto start = m_lexer.tell();
- TRY(parse_parameter_entity_reference());
- builder.append(m_source.substring_view(start, m_lexer.tell() - start));
- continue;
- }
- if (m_lexer.next_is('&')) {
- auto start = m_lexer.tell();
- TRY(parse_reference());
- builder.append(m_source.substring_view(start, m_lexer.tell() - start));
- continue;
- }
- builder.append(m_lexer.consume());
- }
- TRY(expect(quote));
- rollback.disarm();
- return builder.to_byte_string();
- }
- // 2.7.18 CDSect, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-CDSect
- ErrorOr<StringView, ParseError> Parser::parse_cdata_section()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // CDSect ::= CDStart CData CDEnd
- // CDStart ::= '<![CDATA['
- // CData ::= (Char* - (Char* ']]>' Char*))
- // CDEnd ::= ']]>'
- TRY(expect("<![CDATA["sv));
- auto accept = accept_rule();
- auto section_start = m_lexer.tell();
- while (!m_lexer.next_is("]]>")) {
- if (m_lexer.is_eof())
- break;
- m_lexer.ignore();
- }
- auto section_end = m_lexer.tell();
- TRY(expect("]]>"sv));
- rollback.disarm();
- return m_source.substring_view(section_start, section_end - section_start);
- }
- // 2.8.30 extSubset, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-extSubset
- ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_external_subset()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // extSubset ::= TextDecl? extSubsetDecl
- (void)parse_text_declaration();
- auto result = TRY(parse_external_subset_declaration());
- rollback.disarm();
- return result;
- }
- // 4.3.1.77 TextDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-TextDecl
- ErrorOr<void, ParseError> Parser::parse_text_declaration()
- {
- auto rollback = rollback_point();
- auto rule = enter_rule();
- // TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
- TRY(expect("<?xml"sv));
- auto accept = accept_rule();
- (void)parse_version_info();
- TRY(parse_encoding_decl());
- TRY(skip_whitespace());
- TRY(expect("?>"sv));
- rollback.disarm();
- return {};
- }
- ErrorOr<ByteString, ParseError> Parser::resolve_reference(EntityReference const& reference, ReferencePlacement placement)
- {
- static HashTable<Name> reference_lookup {};
- if (reference_lookup.contains(reference.name))
- return parse_error(m_lexer.tell(), ByteString::formatted("Invalid recursive definition for '{}'", reference.name));
- reference_lookup.set(reference.name);
- ScopeGuard remove_lookup {
- [&] {
- reference_lookup.remove(reference.name);
- }
- };
- Optional<ByteString> resolved;
- if (m_doctype.has_value()) {
- // FIXME: Split these up and resolve them ahead of time.
- for (auto& declaration : m_doctype->markup_declarations) {
- auto entity = declaration.get_pointer<EntityDeclaration>();
- if (!entity)
- continue;
- auto ge_declaration = entity->get_pointer<GEDeclaration>();
- if (!ge_declaration)
- continue;
- if (ge_declaration->name != reference.name)
- continue;
- TRY(ge_declaration->definition.visit(
- [&](ByteString const& definition) -> ErrorOr<void, ParseError> {
- resolved = definition;
- return {};
- },
- [&](EntityDefinition const& definition) -> ErrorOr<void, ParseError> {
- if (placement == ReferencePlacement::AttributeValue)
- return parse_error(m_lexer.tell(), ByteString::formatted("Attribute references external entity '{}'", reference.name));
- if (definition.notation.has_value())
- return parse_error(0u, ByteString::formatted("Entity reference to unparsed entity '{}'", reference.name));
- if (!m_options.resolve_external_resource)
- return parse_error(0u, ByteString::formatted("Failed to resolve external entity '{}'", reference.name));
- auto result = m_options.resolve_external_resource(definition.id.system_id, definition.id.public_id);
- if (result.is_error())
- return parse_error(0u, ByteString::formatted("Failed to resolve external entity '{}': {}", reference.name, result.error()));
- resolved = result.release_value();
- return {};
- }));
- break;
- }
- }
- if (!resolved.has_value()) {
- if (reference.name == "amp")
- return "&";
- if (reference.name == "lt")
- return "<";
- if (reference.name == "gt")
- return ">";
- if (reference.name == "apos")
- return "'";
- if (reference.name == "quot")
- return "\"";
- return parse_error(0u, ByteString::formatted("Reference to undeclared entity '{}'", reference.name));
- }
- StringView resolved_source = *resolved;
- TemporaryChange source { m_source, resolved_source };
- TemporaryChange lexer { m_lexer, LineTrackingLexer(m_source) };
- switch (placement) {
- case ReferencePlacement::AttributeValue:
- return TRY(parse_attribute_value_inner(""sv));
- case ReferencePlacement::Content:
- TRY(parse_content());
- return "";
- default:
- VERIFY_NOT_REACHED();
- }
- }
- }
|