Parser.cpp 58 KB


  1. /*
  2. * Copyright (c) 2022, Ali Mohammad Pur <mpfard@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <LibXML/DOM/Document.h>
  7. #include <LibXML/Parser/Parser.h>
  8. struct Range {
  9. consteval Range(u32 start, u32 end)
  10. : start(start)
  11. , end(end)
  12. {
  13. }
  14. u32 start;
  15. u32 end;
  16. };
  17. template<auto... ranges>
  18. struct ranges_for_search {
  19. auto contains(u32 value) const
  20. {
  21. return ((value >= ranges.start && value <= ranges.end) || ...);
  22. }
  23. bool operator()(u32 value) const
  24. {
  25. return contains(value);
  26. }
  27. template<auto... ranges_to_include>
  28. consteval auto with() const
  29. {
  30. return ranges_for_search<ranges..., ranges_to_include...>();
  31. }
  32. template<auto... ranges_to_include>
  33. consteval auto unify(ranges_for_search<ranges_to_include...> const&) const
  34. {
  35. return ranges_for_search<ranges..., ranges_to_include...>();
  36. }
  37. };
  38. template<size_t Count, typename Element>
  39. struct StringSet {
  40. consteval StringSet(Element const (&entries)[Count])
  41. {
  42. for (size_t i = 0; i < Count - 1; ++i)
  43. elements[i] = entries[i];
  44. }
  45. consteval auto operator[](size_t i) const { return elements[i]; }
  46. Element elements[Count - 1];
  47. };
  48. template<StringSet chars>
  49. consteval static auto set_to_search()
  50. {
  51. return ([&]<auto... Ix>(IndexSequence<Ix...>) {
  52. return ranges_for_search<Range(chars[Ix], chars[Ix])...>();
  53. }(MakeIndexSequence<array_size(chars.elements)>()));
  54. }
  55. namespace XML {
  56. size_t Parser::s_debug_indent_level { 0 };
  57. void Parser::append_node(NonnullOwnPtr<Node> node)
  58. {
  59. if (m_entered_node) {
  60. auto& entered_element = m_entered_node->content.get<Node::Element>();
  61. entered_element.children.append(move(node));
  62. enter_node(*entered_element.children.last());
  63. } else {
  64. m_root_node = move(node);
  65. enter_node(*m_root_node);
  66. }
  67. }
  68. void Parser::append_text(StringView text, LineTrackingLexer::Position position)
  69. {
  70. if (m_listener) {
  71. m_listener->text(text);
  72. return;
  73. }
  74. if (!m_entered_node) {
  75. Node::Text node;
  76. node.builder.append(text);
  77. m_root_node = make<Node>(position, move(node));
  78. return;
  79. }
  80. m_entered_node->content.visit(
  81. [&](Node::Element& node) {
  82. if (!node.children.is_empty()) {
  83. auto* text_node = node.children.last()->content.get_pointer<Node::Text>();
  84. if (text_node) {
  85. text_node->builder.append(text);
  86. return;
  87. }
  88. }
  89. Node::Text text_node;
  90. text_node.builder.append(text);
  91. node.children.append(make<Node>(position, move(text_node), m_entered_node));
  92. },
  93. [&](auto&) {
  94. // Can't enter a text or comment node.
  95. VERIFY_NOT_REACHED();
  96. });
  97. }
  98. void Parser::append_comment(StringView text, LineTrackingLexer::Position position)
  99. {
  100. if (m_listener) {
  101. m_listener->comment(text);
  102. return;
  103. }
  104. // If there's no node to attach this to, drop it on the floor.
  105. // This can happen to comments in the prolog.
  106. if (!m_entered_node)
  107. return;
  108. m_entered_node->content.visit(
  109. [&](Node::Element& node) {
  110. node.children.append(make<Node>(position, Node::Comment { text }, m_entered_node));
  111. },
  112. [&](auto&) {
  113. // Can't enter a text or comment node.
  114. VERIFY_NOT_REACHED();
  115. });
  116. }
  117. void Parser::enter_node(Node& node)
  118. {
  119. if (m_listener) {
  120. auto& element = node.content.get<Node::Element>();
  121. m_listener->element_start(element.name, element.attributes);
  122. }
  123. if (&node != m_root_node.ptr())
  124. node.parent = m_entered_node;
  125. m_entered_node = &node;
  126. }
  127. void Parser::leave_node()
  128. {
  129. if (m_listener) {
  130. auto& element = m_entered_node->content.get<Node::Element>();
  131. m_listener->element_end(element.name);
  132. }
  133. m_entered_node = m_entered_node->parent;
  134. }
  135. ErrorOr<Document, ParseError> Parser::parse()
  136. {
  137. if (auto result = parse_internal(); result.is_error()) {
  138. if (m_parse_errors.is_empty())
  139. return result.release_error();
  140. return m_parse_errors.take_first();
  141. }
  142. return Document {
  143. m_root_node.release_nonnull(),
  144. move(m_doctype),
  145. move(m_processing_instructions),
  146. m_version,
  147. };
  148. }
  149. ErrorOr<void, ParseError> Parser::parse_with_listener(Listener& listener)
  150. {
  151. m_listener = &listener;
  152. ScopeGuard unset_listener { [this] { m_listener = nullptr; } };
  153. m_listener->set_source(m_source);
  154. m_listener->document_start();
  155. auto result = parse_internal();
  156. if (result.is_error())
  157. m_listener->error(result.error());
  158. m_listener->document_end();
  159. m_root_node.clear();
  160. return result;
  161. }
  162. // 2.3.3. S, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-S
  163. ErrorOr<void, ParseError> Parser::skip_whitespace(Required required)
  164. {
  165. auto rollback = rollback_point();
  166. auto rule = enter_rule();
  167. // S ::= (#x20 | #x9 | #xD | #xA)+
  168. auto matched = m_lexer.consume_while(is_any_of("\x20\x09\x0d\x0a"sv));
  169. if (required == Required::Yes && matched.is_empty())
  170. return parse_error(m_lexer.tell(), "Expected whitespace");
  171. rollback.disarm();
  172. return {};
  173. }
  174. // 2.2.a. RestrictedChar, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-RestrictedChar
  175. constexpr static auto s_restricted_characters = ranges_for_search<Range(0x1, 0x8), Range(0xb, 0xc), Range(0xe, 0x1f), Range(0x7f, 0x84), Range(0x86, 0x9f)>();
  176. // 2.1.1. Document, https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-well-formed
  177. ErrorOr<void, ParseError> Parser::parse_internal()
  178. {
  179. auto rule = enter_rule();
  180. // document ::= ( prolog element Misc* ) - ( Char* RestrictedChar Char* )
  181. TRY(parse_prolog());
  182. TRY(parse_element());
  183. while (true) {
  184. if (auto result = parse_misc(); result.is_error())
  185. break;
  186. }
  187. auto matched_source = m_source.substring_view(0, m_lexer.tell());
  188. if (auto it = find_if(matched_source.begin(), matched_source.end(), s_restricted_characters); !it.is_end()) {
  189. return parse_error(
  190. it.index(),
  191. ByteString::formatted("Invalid character #{:x} used in document", *it));
  192. }
  193. if (!m_lexer.is_eof())
  194. return parse_error(m_lexer.tell(), "Garbage after document");
  195. return {};
  196. }
  197. ErrorOr<void, ParseError> Parser::expect(StringView expected)
  198. {
  199. auto rollback = rollback_point();
  200. if (!m_lexer.consume_specific(expected)) {
  201. if (m_options.treat_errors_as_fatal)
  202. return parse_error(m_lexer.tell(), ByteString::formatted("Expected '{}'", expected));
  203. }
  204. rollback.disarm();
  205. return {};
  206. }
  207. template<typename Pred>
  208. requires(IsCallableWithArguments<Pred, bool, char>) ErrorOr<StringView, ParseError> Parser::expect(Pred predicate, StringView description)
  209. {
  210. auto rollback = rollback_point();
  211. auto start = m_lexer.tell();
  212. if (!m_lexer.next_is(predicate)) {
  213. if (m_options.treat_errors_as_fatal)
  214. return parse_error(m_lexer.tell(), ByteString::formatted("Expected {}", description));
  215. }
  216. m_lexer.ignore();
  217. rollback.disarm();
  218. return m_source.substring_view(start, m_lexer.tell() - start);
  219. }
  220. template<typename Pred>
  221. requires(IsCallableWithArguments<Pred, bool, char>) ErrorOr<StringView, ParseError> Parser::expect_many(Pred predicate, StringView description)
  222. {
  223. auto rollback = rollback_point();
  224. auto start = m_lexer.tell();
  225. while (m_lexer.next_is(predicate)) {
  226. if (m_lexer.is_eof())
  227. break;
  228. m_lexer.ignore();
  229. }
  230. if (m_lexer.tell() == start) {
  231. if (m_options.treat_errors_as_fatal) {
  232. return parse_error(m_lexer.tell(), ByteString::formatted("Expected {}", description));
  233. }
  234. }
  235. rollback.disarm();
  236. return m_source.substring_view(start, m_lexer.tell() - start);
  237. }
  238. // 2.8.22. Prolog, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-prolog
  239. ErrorOr<void, ParseError> Parser::parse_prolog()
  240. {
  241. auto rollback = rollback_point();
  242. auto rule = enter_rule();
  243. // prolog ::= XMLDecl Misc* (doctypedecl Misc*)?
  244. // The following is valid in XML 1.0.
  245. // prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
  246. if (auto result = parse_xml_decl(); result.is_error()) {
  247. m_version = Version::Version10;
  248. m_in_compatibility_mode = true;
  249. }
  250. auto accept = accept_rule();
  251. while (true) {
  252. if (auto result = parse_misc(); result.is_error())
  253. break;
  254. }
  255. if (auto result = parse_doctype_decl(); !result.is_error()) {
  256. while (true) {
  257. if (auto result = parse_misc(); result.is_error())
  258. break;
  259. }
  260. }
  261. rollback.disarm();
  262. return {};
  263. }
  264. // 2.8.23. XMLDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-XMLDecl
  265. ErrorOr<void, ParseError> Parser::parse_xml_decl()
  266. {
  267. auto rollback = rollback_point();
  268. auto rule = enter_rule();
  269. // XMLDecl::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
  270. TRY(expect("<?xml"sv));
  271. auto accept = accept_rule();
  272. TRY(parse_version_info());
  273. (void)parse_encoding_decl();
  274. (void)parse_standalone_document_decl();
  275. TRY(skip_whitespace());
  276. TRY(expect("?>"sv));
  277. rollback.disarm();
  278. return {};
  279. }
  280. // 2.8.24. VersionInfo, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-VersionInfo
  281. ErrorOr<void, ParseError> Parser::parse_version_info()
  282. {
  283. auto rollback = rollback_point();
  284. auto rule = enter_rule();
  285. // VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
  286. TRY(skip_whitespace(Required::Yes));
  287. TRY(expect("version"sv));
  288. auto accept = accept_rule();
  289. TRY(parse_eq());
  290. TRY(expect(is_any_of("'\""sv), "one of ' or \""sv));
  291. m_lexer.retreat();
  292. auto version_string = m_lexer.consume_quoted_string();
  293. if (version_string == "1.0") {
  294. // FIXME: Compatibility mode, figure out which rules are different in XML 1.0.
  295. m_version = Version::Version10;
  296. m_in_compatibility_mode = true;
  297. } else {
  298. if (version_string != "1.1" && m_options.treat_errors_as_fatal)
  299. return parse_error(m_lexer.tell(), ByteString::formatted("Expected '1.1', found '{}'", version_string));
  300. }
  301. m_version = Version::Version11;
  302. rollback.disarm();
  303. return {};
  304. }
  305. // 2.8.25. Eq, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Eq
  306. ErrorOr<void, ParseError> Parser::parse_eq()
  307. {
  308. auto rollback = rollback_point();
  309. auto rule = enter_rule();
  310. // Eq ::= S? '=' S?
  311. auto accept = accept_rule();
  312. TRY(skip_whitespace());
  313. TRY(expect("="sv));
  314. TRY(skip_whitespace());
  315. rollback.disarm();
  316. return {};
  317. }
  318. // 4.3.3.80. EncodingDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EncodingDecl
  319. ErrorOr<void, ParseError> Parser::parse_encoding_decl()
  320. {
  321. auto rollback = rollback_point();
  322. auto rule = enter_rule();
  323. // EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )
  324. TRY(skip_whitespace(Required::Yes));
  325. TRY(expect("encoding"sv));
  326. auto accept = accept_rule();
  327. TRY(parse_eq());
  328. TRY(expect(is_any_of("'\""sv), "one of ' or \""sv));
  329. m_lexer.retreat();
  330. // FIXME: Actually do something with this encoding.
  331. m_encoding = m_lexer.consume_quoted_string();
  332. rollback.disarm();
  333. return {};
  334. }
  335. // 2.9.32 SDDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-rmd
  336. ErrorOr<void, ParseError> Parser::parse_standalone_document_decl()
  337. {
  338. auto rollback = rollback_point();
  339. auto rule = enter_rule();
  340. // SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
  341. TRY(skip_whitespace(Required::Yes));
  342. TRY(expect("standalone"sv));
  343. auto accept = accept_rule();
  344. TRY(parse_eq());
  345. TRY(expect(is_any_of("'\""sv), "one of ' or \""sv));
  346. m_lexer.retreat();
  347. auto value = m_lexer.consume_quoted_string();
  348. if (!value.is_one_of("yes", "no"))
  349. return parse_error(m_lexer.tell() - value.length(), "Expected one of 'yes' or 'no'");
  350. m_standalone = value == "yes";
  351. rollback.disarm();
  352. return {};
  353. }
  354. // 2.8.27. Misc, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Misc
  355. ErrorOr<void, ParseError> Parser::parse_misc()
  356. {
  357. auto rollback = rollback_point();
  358. auto rule = enter_rule();
  359. // Misc ::= Comment | PI | S
  360. if (auto result = parse_comment(); !result.is_error()) {
  361. rollback.disarm();
  362. return {};
  363. }
  364. if (auto result = parse_processing_instruction(); !result.is_error()) {
  365. rollback.disarm();
  366. return {};
  367. }
  368. if (auto result = skip_whitespace(Required::Yes); !result.is_error()) {
  369. rollback.disarm();
  370. return {};
  371. }
  372. return parse_error(m_lexer.tell(), "Expected a match for 'Misc', but found none");
  373. }
  374. // 2.5.15 Comment, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Comment
  375. ErrorOr<void, ParseError> Parser::parse_comment()
  376. {
  377. auto rollback = rollback_point();
  378. auto rule = enter_rule();
  379. // Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
  380. auto comment_start = m_lexer.tell();
  381. TRY(expect("<!--"sv));
  382. auto accept = accept_rule();
  383. bool last_seen_a_dash = false;
  384. // FIXME: This should disallow surrogate blocks
  385. auto text = m_lexer.consume_while([&](auto ch) {
  386. if (ch != '-') {
  387. last_seen_a_dash = false;
  388. return true;
  389. }
  390. if (last_seen_a_dash)
  391. return false;
  392. last_seen_a_dash = true;
  393. return true;
  394. });
  395. if (last_seen_a_dash) {
  396. m_lexer.retreat();
  397. text = text.substring_view(0, text.length() - 1);
  398. }
  399. TRY(expect("-->"sv));
  400. if (m_options.preserve_comments)
  401. append_comment(text, m_lexer.position_for(comment_start));
  402. rollback.disarm();
  403. return {};
  404. }
  405. // 2.6.16 PI, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PI
  406. ErrorOr<void, ParseError> Parser::parse_processing_instruction()
  407. {
  408. auto rollback = rollback_point();
  409. auto rule = enter_rule();
  410. // PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
  411. TRY(expect("<?"sv));
  412. auto accept = accept_rule();
  413. auto target = TRY(parse_processing_instruction_target());
  414. ByteString data;
  415. if (auto result = skip_whitespace(Required::Yes); !result.is_error())
  416. data = m_lexer.consume_until("?>");
  417. TRY(expect("?>"sv));
  418. m_processing_instructions.set(target, data);
  419. rollback.disarm();
  420. return {};
  421. }
  422. // 2.6.17. PITarget, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
  423. ErrorOr<Name, ParseError> Parser::parse_processing_instruction_target()
  424. {
  425. auto rollback = rollback_point();
  426. auto rule = enter_rule();
  427. // PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
  428. auto target = TRY(parse_name());
  429. auto accept = accept_rule();
  430. if (target.equals_ignoring_ascii_case("xml"sv) && m_options.treat_errors_as_fatal) {
  431. return parse_error(
  432. m_lexer.tell() - target.length(),
  433. "Use of the reserved 'xml' name for processing instruction target name is disallowed");
  434. }
  435. rollback.disarm();
  436. return target;
  437. }
  438. // NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
  439. constexpr static auto s_name_start_characters = ranges_for_search<Range(':', ':'), Range('A', 'Z'), Range('_', '_'), Range('a', 'z'), Range(0xc0, 0xd6), Range(0xd8, 0xf6), Range(0xf8, 0x2ff), Range(0x370, 0x37d), Range(0x37f, 0x1fff), Range(0x200c, 0x200d), Range(0x2070, 0x218f), Range(0x2c00, 0x2fef), Range(0x3001, 0xd7ff), Range(0xf900, 0xfdcf), Range(0xfdf0, 0xfffd), Range(0x10000, 0xeffff)> {};
  440. // NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
  441. constexpr static auto s_name_characters = s_name_start_characters.with<Range('-', '-'), Range('.', '.'), Range('0', '9'), Range(0xb7, 0xb7), Range(0x0300, 0x036f), Range(0x203f, 0x2040)>();
  442. // 2.3.5. Name, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Name
  443. ErrorOr<Name, ParseError> Parser::parse_name()
  444. {
  445. auto rollback = rollback_point();
  446. auto rule = enter_rule();
  447. // Name ::= NameStartChar (NameChar)*
  448. auto start = TRY(expect(s_name_start_characters, "a NameStartChar"sv));
  449. auto accept = accept_rule();
  450. auto rest = m_lexer.consume_while(s_name_characters);
  451. StringBuilder builder;
  452. builder.append(start);
  453. builder.append(rest);
  454. rollback.disarm();
  455. return builder.to_byte_string();
  456. }
  457. // 2.8.28. doctypedecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-doctypedecl
  458. ErrorOr<void, ParseError> Parser::parse_doctype_decl()
  459. {
  460. auto rollback = rollback_point();
  461. auto rule = enter_rule();
  462. Doctype doctype;
  463. // doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
  464. TRY(expect("<!DOCTYPE"sv));
  465. auto accept = accept_rule();
  466. TRY(skip_whitespace(Required::Yes));
  467. doctype.type = TRY(parse_name());
  468. if (auto result = skip_whitespace(Required::Yes); !result.is_error()) {
  469. auto id_start = m_lexer.tell();
  470. if (auto id_result = parse_external_id(); !id_result.is_error()) {
  471. doctype.external_id = id_result.release_value();
  472. if (m_options.resolve_external_resource) {
  473. auto resource_result = m_options.resolve_external_resource(doctype.external_id->system_id, doctype.external_id->public_id);
  474. if (resource_result.is_error()) {
  475. return parse_error(
  476. id_start,
  477. ByteString::formatted("Failed to resolve external subset '{}': {}", doctype.external_id->system_id.system_literal, resource_result.error()));
  478. }
  479. StringView resolved_source = resource_result.value();
  480. TemporaryChange source { m_source, resolved_source };
  481. TemporaryChange lexer { m_lexer, LineTrackingLexer(m_source) };
  482. auto declarations = TRY(parse_external_subset());
  483. if (!m_lexer.is_eof()) {
  484. return parse_error(
  485. m_lexer.tell(),
  486. ByteString::formatted("Failed to resolve external subset '{}': garbage after declarations", doctype.external_id->system_id.system_literal));
  487. }
  488. doctype.markup_declarations.extend(move(declarations));
  489. }
  490. }
  491. }
  492. TRY(skip_whitespace(Required::No));
  493. if (m_lexer.consume_specific('[')) {
  494. auto internal_subset = TRY(parse_internal_subset());
  495. TRY(expect("]"sv));
  496. TRY(skip_whitespace());
  497. doctype.markup_declarations.extend(internal_subset);
  498. }
  499. TRY(expect(">"sv));
  500. rollback.disarm();
  501. m_doctype = move(doctype);
  502. return {};
  503. }
  504. // 3.39. element, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-element
  505. ErrorOr<void, ParseError> Parser::parse_element()
  506. {
  507. auto rollback = rollback_point();
  508. auto rule = enter_rule();
  509. // element ::= EmptyElemTag
  510. // | STag content ETag
  511. if (auto result = parse_empty_element_tag(); !result.is_error()) {
  512. append_node(result.release_value());
  513. leave_node();
  514. rollback.disarm();
  515. return {};
  516. }
  517. auto start_tag = TRY(parse_start_tag());
  518. auto& node = *start_tag;
  519. auto& tag = node.content.get<Node::Element>();
  520. append_node(move(start_tag));
  521. ScopeGuard quit {
  522. [&] {
  523. leave_node();
  524. }
  525. };
  526. TRY(parse_content());
  527. auto tag_location = m_lexer.tell();
  528. auto closing_name = TRY(parse_end_tag());
  529. // Well-formedness constraint: The Name in an element's end-tag MUST match the element type in the start-tag.
  530. if (m_options.treat_errors_as_fatal && closing_name != tag.name)
  531. return parse_error(tag_location, "Invalid closing tag");
  532. rollback.disarm();
  533. return {};
  534. }
  535. // 3.1.44. EmptyElemTag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EmptyElemTag
  536. ErrorOr<NonnullOwnPtr<Node>, ParseError> Parser::parse_empty_element_tag()
  537. {
  538. auto rollback = rollback_point();
  539. auto rule = enter_rule();
  540. // EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
  541. auto tag_start = m_lexer.tell();
  542. TRY(expect("<"sv));
  543. auto accept = accept_rule();
  544. auto name = TRY(parse_name());
  545. HashMap<Name, ByteString> attributes;
  546. while (true) {
  547. if (auto result = skip_whitespace(Required::Yes); result.is_error())
  548. break;
  549. if (auto result = parse_attribute(); !result.is_error()) {
  550. auto attribute = result.release_value();
  551. attributes.set(move(attribute.name), move(attribute.value));
  552. } else {
  553. break;
  554. }
  555. }
  556. TRY(skip_whitespace());
  557. TRY(expect("/>"sv));
  558. rollback.disarm();
  559. return make<Node>(m_lexer.position_for(tag_start), Node::Element { move(name), move(attributes), {} });
  560. }
  561. // 3.1.41. Attribute, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Attribute
  562. ErrorOr<Attribute, ParseError> Parser::parse_attribute()
  563. {
  564. auto rollback = rollback_point();
  565. auto rule = enter_rule();
  566. // Attribute ::= Name Eq AttValue
  567. auto name = TRY(parse_name());
  568. auto accept = accept_rule();
  569. TRY(parse_eq());
  570. auto value = TRY(parse_attribute_value());
  571. rollback.disarm();
  572. return Attribute {
  573. move(name),
  574. move(value),
  575. };
  576. }
  577. // 2.3.10. AttValue, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttValue
  578. ErrorOr<ByteString, ParseError> Parser::parse_attribute_value()
  579. {
  580. auto rollback = rollback_point();
  581. auto rule = enter_rule();
  582. // AttValue ::= '"' ([^<&"] | Reference)* '"'
  583. // | "'" ([^<&'] | Reference)* "'"
  584. auto quote = TRY(expect(is_any_of("'\""sv), "one of ' or \""sv));
  585. auto accept = accept_rule();
  586. auto text = TRY(parse_attribute_value_inner(quote));
  587. TRY(expect(quote));
  588. rollback.disarm();
  589. return text;
  590. }
  591. ErrorOr<ByteString, ParseError> Parser::parse_attribute_value_inner(StringView disallow)
  592. {
  593. StringBuilder builder;
  594. while (true) {
  595. if (m_lexer.next_is(is_any_of(disallow)) || m_lexer.is_eof())
  596. break;
  597. if (m_lexer.next_is('<')) {
  598. // Not allowed, return a nice error to make it easier to debug.
  599. return parse_error(m_lexer.tell(), "Unescaped '<' not allowed in attribute values");
  600. }
  601. if (m_lexer.next_is('&')) {
  602. auto reference = TRY(parse_reference());
  603. if (auto* char_reference = reference.get_pointer<ByteString>())
  604. builder.append(*char_reference);
  605. else
  606. builder.append(TRY(resolve_reference(reference.get<EntityReference>(), ReferencePlacement::AttributeValue)));
  607. } else {
  608. builder.append(m_lexer.consume());
  609. }
  610. }
  611. return builder.to_byte_string();
  612. }
  613. // Char ::= [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
  614. constexpr static auto s_characters = ranges_for_search<Range(0x1, 0xd7ff), Range(0xe000, 0xfffd), Range(0x10000, 0x10ffff)>();
  615. // 4.1.67. Reference, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Reference
  616. ErrorOr<Variant<Parser::EntityReference, ByteString>, ParseError> Parser::parse_reference()
  617. {
  618. auto rollback = rollback_point();
  619. auto rule = enter_rule();
  620. // Reference ::= EntityRef | CharRef
  621. // 4.1.68. EntityRef
  622. // EntityRef ::= '&' Name ';'
  623. // 4.1.66. CharRef
  624. // CharRef ::= '&#' [0-9]+ ';'
  625. // | '&#x' [0-9a-fA-F]+ ';'
  626. auto reference_start = m_lexer.tell();
  627. TRY(expect("&"sv));
  628. auto accept = accept_rule();
  629. auto name_result = parse_name();
  630. if (name_result.is_error()) {
  631. TRY(expect("#"sv));
  632. Optional<u32> code_point;
  633. if (m_lexer.consume_specific('x')) {
  634. auto hex = TRY(expect_many(
  635. ranges_for_search<Range('0', '9'), Range('a', 'f'), Range('A', 'F')>(),
  636. "any of [0-9a-fA-F]"sv));
  637. code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(hex);
  638. } else {
  639. auto decimal = TRY(expect_many(
  640. ranges_for_search<Range('0', '9')>(),
  641. "any of [0-9]"sv));
  642. code_point = decimal.to_number<u32>();
  643. }
  644. if (!code_point.has_value() || !s_characters.contains(*code_point))
  645. return parse_error(reference_start, "Invalid character reference");
  646. TRY(expect(";"sv));
  647. StringBuilder builder;
  648. builder.append_code_point(*code_point);
  649. rollback.disarm();
  650. return builder.to_byte_string();
  651. }
  652. auto name = name_result.release_value();
  653. TRY(expect(";"sv));
  654. rollback.disarm();
  655. return EntityReference { move(name) };
  656. }
  657. // 3.1.40 STag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-STag
  658. ErrorOr<NonnullOwnPtr<Node>, ParseError> Parser::parse_start_tag()
  659. {
  660. auto rollback = rollback_point();
  661. auto rule = enter_rule();
  662. // STag ::= '<' Name (S Attribute)* S? '>'
  663. auto tag_start = m_lexer.tell();
  664. TRY(expect("<"sv));
  665. auto accept = accept_rule();
  666. auto name = TRY(parse_name());
  667. HashMap<Name, ByteString> attributes;
  668. while (true) {
  669. if (auto result = skip_whitespace(Required::Yes); result.is_error())
  670. break;
  671. if (auto result = parse_attribute(); !result.is_error()) {
  672. auto attribute = result.release_value();
  673. attributes.set(move(attribute.name), move(attribute.value));
  674. } else {
  675. break;
  676. }
  677. }
  678. TRY(skip_whitespace());
  679. TRY(expect(">"sv));
  680. rollback.disarm();
  681. return make<Node>(m_lexer.position_for(tag_start), Node::Element { move(name), move(attributes), {} });
  682. }
  683. // 3.1.42 ETag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-ETag
  684. ErrorOr<Name, ParseError> Parser::parse_end_tag()
  685. {
  686. auto rollback = rollback_point();
  687. auto rule = enter_rule();
  688. // ETag ::= '</' Name S? '>'
  689. TRY(expect("</"sv));
  690. auto accept = accept_rule();
  691. auto name = TRY(parse_name());
  692. TRY(skip_whitespace());
  693. TRY(expect(">"sv));
  694. rollback.disarm();
  695. return name;
  696. }
  697. // 3.1.43 content, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-content
  698. ErrorOr<void, ParseError> Parser::parse_content()
  699. {
  700. auto rollback = rollback_point();
  701. auto rule = enter_rule();
  702. // content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
  703. auto content_start = m_lexer.tell();
  704. if (auto result = parse_char_data(); !result.is_error())
  705. append_text(result.release_value(), m_lexer.position_for(content_start));
  706. while (true) {
  707. auto node_start = m_lexer.tell();
  708. if (auto result = parse_element(); !result.is_error())
  709. goto try_char_data;
  710. if (auto result = parse_reference(); !result.is_error()) {
  711. auto reference = result.release_value();
  712. auto reference_offset = m_lexer.position_for(node_start);
  713. if (auto char_reference = reference.get_pointer<ByteString>())
  714. append_text(*char_reference, reference_offset);
  715. else
  716. append_text(TRY(resolve_reference(reference.get<EntityReference>(), ReferencePlacement::Content)), reference_offset);
  717. goto try_char_data;
  718. }
  719. if (auto result = parse_cdata_section(); !result.is_error()) {
  720. if (m_options.preserve_cdata)
  721. append_text(result.release_value(), m_lexer.position_for(node_start));
  722. goto try_char_data;
  723. }
  724. if (auto result = parse_processing_instruction(); !result.is_error())
  725. goto try_char_data;
  726. if (auto result = parse_comment(); !result.is_error())
  727. goto try_char_data;
  728. break;
  729. try_char_data:;
  730. if (auto result = parse_char_data(); !result.is_error())
  731. append_text(result.release_value(), m_lexer.position_for(node_start));
  732. }
  733. rollback.disarm();
  734. return {};
  735. }
  736. // 2.4.14 CharData, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-CharData
  737. ErrorOr<StringView, ParseError> Parser::parse_char_data()
  738. {
  739. auto rollback = rollback_point();
  740. auto rule = enter_rule();
  741. // CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
  742. auto cend_state = 0; // 1: ], 2: ], 3: >
  743. auto text = m_lexer.consume_while([&](auto ch) {
  744. if (ch == '<' || ch == '&' || cend_state == 3)
  745. return false;
  746. switch (cend_state) {
  747. case 0:
  748. case 1:
  749. if (ch == ']')
  750. cend_state++;
  751. else
  752. cend_state = 0;
  753. return true;
  754. case 2:
  755. if (ch == '>') {
  756. cend_state++;
  757. return true;
  758. }
  759. cend_state = 0;
  760. return true;
  761. default:
  762. VERIFY_NOT_REACHED();
  763. }
  764. });
  765. if (cend_state == 3) {
  766. m_lexer.retreat(3);
  767. text = text.substring_view(0, text.length() - 3);
  768. }
  769. rollback.disarm();
  770. return text;
  771. }
  772. // 2.8.28b intSubset, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-intSubset
  773. ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_internal_subset()
  774. {
  775. auto rollback = rollback_point();
  776. auto rule = enter_rule();
  777. Vector<MarkupDeclaration> declarations;
  778. // intSubset ::= (markupdecl | DeclSep)*
  779. while (true) {
  780. if (auto result = parse_markup_declaration(); !result.is_error()) {
  781. auto maybe_declaration = result.release_value();
  782. if (maybe_declaration.has_value())
  783. declarations.append(maybe_declaration.release_value());
  784. continue;
  785. }
  786. if (auto result = parse_declaration_separator(); !result.is_error()) {
  787. // The markup declarations may be made up in whole or in part of the replacement text of parameter entities.
  788. // The replacement text of a parameter entity reference in a DeclSep MUST match the production extSubsetDecl.
  789. auto maybe_replacement_text = result.release_value();
  790. if (maybe_replacement_text.has_value()) {
  791. TemporaryChange<StringView> source { m_source, maybe_replacement_text.value() };
  792. TemporaryChange lexer { m_lexer, LineTrackingLexer { m_source } };
  793. auto contained_declarations = TRY(parse_external_subset_declaration());
  794. declarations.extend(move(contained_declarations));
  795. }
  796. continue;
  797. }
  798. break;
  799. }
  800. rollback.disarm();
  801. return declarations;
  802. }
  803. // 2.8.29 markupdecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-markupdecl
  804. ErrorOr<Optional<MarkupDeclaration>, ParseError> Parser::parse_markup_declaration()
  805. {
  806. auto rollback = rollback_point();
  807. auto rule = enter_rule();
  808. // markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment
  809. if (auto result = parse_element_declaration(); !result.is_error()) {
  810. rollback.disarm();
  811. return MarkupDeclaration { result.release_value() };
  812. }
  813. if (auto result = parse_attribute_list_declaration(); !result.is_error()) {
  814. rollback.disarm();
  815. return MarkupDeclaration { result.release_value() };
  816. }
  817. if (auto result = parse_entity_declaration(); !result.is_error()) {
  818. rollback.disarm();
  819. return MarkupDeclaration { result.release_value() };
  820. }
  821. if (auto result = parse_notation_declaration(); !result.is_error()) {
  822. rollback.disarm();
  823. return MarkupDeclaration { result.release_value() };
  824. }
  825. if (auto result = parse_processing_instruction(); !result.is_error()) {
  826. rollback.disarm();
  827. return Optional<MarkupDeclaration> {};
  828. }
  829. if (auto result = parse_comment(); !result.is_error()) {
  830. rollback.disarm();
  831. return Optional<MarkupDeclaration> {};
  832. }
  833. return parse_error(m_lexer.tell(), "Expected one of elementdecl, attlistdecl, entitydecl, notationdecl, PI or comment");
  834. }
  835. // 2.8.28a DeclSep, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-DeclSep
  836. ErrorOr<Optional<ByteString>, ParseError> Parser::parse_declaration_separator()
  837. {
  838. auto rollback = rollback_point();
  839. auto rule = enter_rule();
  840. // DeclSep ::= PEReference | S
  841. if (auto name = parse_parameter_entity_reference(); !name.is_error()) {
  842. rollback.disarm();
  843. // FIXME: Resolve this PEReference.
  844. return "";
  845. }
  846. if (auto result = skip_whitespace(Required::Yes); !result.is_error()) {
  847. rollback.disarm();
  848. return Optional<ByteString> {};
  849. }
  850. return parse_error(m_lexer.tell(), "Expected either whitespace, or a PEReference");
  851. }
  852. // 4.1.69 PEReference, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PEReference
  853. ErrorOr<Name, ParseError> Parser::parse_parameter_entity_reference()
  854. {
  855. auto rollback = rollback_point();
  856. auto rule = enter_rule();
  857. // PEReference ::= '%' Name ';'
  858. TRY(expect("%"sv));
  859. auto accept = accept_rule();
  860. auto name = TRY(parse_name());
  861. TRY(expect(";"sv));
  862. rollback.disarm();
  863. return name;
  864. }
  865. // 3.2.46 elementdecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-elementdecl
  866. ErrorOr<ElementDeclaration, ParseError> Parser::parse_element_declaration()
  867. {
  868. auto rollback = rollback_point();
  869. auto rule = enter_rule();
  870. // FIXME: Apparently both name _and_ contentspec here are allowed to be PEReferences,
  871. // but the grammar does not allow that, figure this out.
  872. // elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'
  873. TRY(expect("<!ELEMENT"sv));
  874. auto accept = accept_rule();
  875. TRY(skip_whitespace(Required::Yes));
  876. auto name = TRY(parse_name());
  877. TRY(skip_whitespace(Required::Yes));
  878. auto spec = TRY(parse_content_spec());
  879. TRY(expect(">"sv));
  880. rollback.disarm();
  881. return ElementDeclaration {
  882. move(name),
  883. move(spec),
  884. };
  885. }
  886. // 3.3.52 AttlistDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttlistDecl
  887. ErrorOr<AttributeListDeclaration, ParseError> Parser::parse_attribute_list_declaration()
  888. {
  889. auto rollback = rollback_point();
  890. auto rule = enter_rule();
  891. AttributeListDeclaration declaration;
  892. // AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
  893. TRY(expect("<!ATTLIST"sv));
  894. auto accept = accept_rule();
  895. TRY(skip_whitespace(Required::Yes));
  896. declaration.type = TRY(parse_name());
  897. while (true) {
  898. if (auto result = parse_attribute_definition(); !result.is_error())
  899. declaration.attributes.append(result.release_value());
  900. else
  901. break;
  902. }
  903. TRY(skip_whitespace());
  904. TRY(expect(">"sv));
  905. rollback.disarm();
  906. return declaration;
  907. }
  908. // 3.3.53 AttDef, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttDef
  909. ErrorOr<AttributeListDeclaration::Definition, ParseError> Parser::parse_attribute_definition()
  910. {
  911. auto rollback = rollback_point();
  912. auto rule = enter_rule();
  913. Optional<AttributeListDeclaration::Type> type;
  914. Optional<AttributeListDeclaration::Default> default_;
  915. // AttDef ::= S Name S AttType S DefaultDecl
  916. TRY(skip_whitespace(Required::Yes));
  917. auto name = TRY(parse_name());
  918. auto accept = accept_rule();
  919. TRY(skip_whitespace(Required::Yes));
  920. // AttType ::= StringType | TokenizedType | EnumeratedType
  921. // StringType ::= 'CDATA'
  922. // TokenizedType ::= 'ID'
  923. // | 'IDREF'
  924. // | 'IDREFS'
  925. // | 'ENTITY'
  926. // | 'ENTITIES'
  927. // | 'NMTOKEN'
  928. // | 'NMTOKENS'
  929. // EnumeratedType ::= NotationType | Enumeration
  930. // NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
  931. // Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
  932. if (m_lexer.consume_specific("CDATA"sv)) {
  933. type = AttributeListDeclaration::StringType::CData;
  934. } else if (m_lexer.consume_specific("IDREFS"sv)) {
  935. type = AttributeListDeclaration::TokenizedType::IDRefs;
  936. } else if (m_lexer.consume_specific("IDREF"sv)) {
  937. type = AttributeListDeclaration::TokenizedType::IDRef;
  938. } else if (m_lexer.consume_specific("ID"sv)) {
  939. type = AttributeListDeclaration::TokenizedType::ID;
  940. } else if (m_lexer.consume_specific("ENTITIES"sv)) {
  941. type = AttributeListDeclaration::TokenizedType::Entities;
  942. } else if (m_lexer.consume_specific("ENTITY"sv)) {
  943. type = AttributeListDeclaration::TokenizedType::Entity;
  944. } else if (m_lexer.consume_specific("NMTOKENS"sv)) {
  945. type = AttributeListDeclaration::TokenizedType::NMTokens;
  946. } else if (m_lexer.consume_specific("NMTOKEN"sv)) {
  947. type = AttributeListDeclaration::TokenizedType::NMToken;
  948. } else if (m_lexer.consume_specific("NOTATION"sv)) {
  949. HashTable<Name> names;
  950. TRY(skip_whitespace(Required::Yes));
  951. TRY(expect("("sv));
  952. TRY(skip_whitespace());
  953. names.set(TRY(parse_name()));
  954. while (true) {
  955. TRY(skip_whitespace());
  956. if (auto result = expect("|"sv); result.is_error())
  957. break;
  958. TRY(skip_whitespace());
  959. names.set(TRY(parse_name()));
  960. }
  961. TRY(skip_whitespace());
  962. TRY(expect(")"sv));
  963. type = AttributeListDeclaration::NotationType { move(names) };
  964. } else {
  965. HashTable<ByteString> names;
  966. TRY(expect("("sv));
  967. TRY(skip_whitespace());
  968. names.set(TRY(parse_nm_token()));
  969. while (true) {
  970. TRY(skip_whitespace());
  971. if (auto result = expect("|"sv); result.is_error())
  972. break;
  973. TRY(skip_whitespace());
  974. names.set(TRY(parse_nm_token()));
  975. }
  976. TRY(skip_whitespace());
  977. TRY(expect(")"sv));
  978. type = AttributeListDeclaration::Enumeration { move(names) };
  979. }
  980. TRY(skip_whitespace(Required::Yes));
  981. // DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
  982. // | (('#FIXED' S)? AttValue)
  983. if (m_lexer.consume_specific("#REQUIRED"sv)) {
  984. default_ = AttributeListDeclaration::Required {};
  985. } else if (m_lexer.consume_specific("#IMPLIED"sv)) {
  986. default_ = AttributeListDeclaration::Implied {};
  987. } else {
  988. bool fixed = false;
  989. if (m_lexer.consume_specific("#FIXED"sv)) {
  990. TRY(skip_whitespace(Required::Yes));
  991. fixed = true;
  992. }
  993. auto value = TRY(parse_attribute_value());
  994. if (fixed)
  995. default_ = AttributeListDeclaration::Fixed { move(value) };
  996. else
  997. default_ = AttributeListDeclaration::DefaultValue { move(value) };
  998. }
  999. rollback.disarm();
  1000. return AttributeListDeclaration::Definition {
  1001. move(name),
  1002. type.release_value(),
  1003. default_.release_value(),
  1004. };
  1005. }
  1006. // 2.3.7 Nmtoken, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Nmtoken
  1007. ErrorOr<StringView, ParseError> Parser::parse_nm_token()
  1008. {
  1009. auto rollback = rollback_point();
  1010. auto rule = enter_rule();
  1011. // Nmtoken ::= (NameChar)+
  1012. auto token = TRY(expect_many(s_name_characters, "a NameChar"sv));
  1013. rollback.disarm();
  1014. return token;
  1015. }
  1016. // 4.7.82 NotationDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#Notations
  1017. ErrorOr<NotationDeclaration, ParseError> Parser::parse_notation_declaration()
  1018. {
  1019. auto rollback = rollback_point();
  1020. auto rule = enter_rule();
  1021. Variant<ExternalID, PublicID, Empty> notation;
  1022. // NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>'
  1023. TRY(expect("<!NOTATION"sv));
  1024. auto accept = accept_rule();
  1025. TRY(skip_whitespace(Required::Yes));
  1026. auto name = TRY(parse_name());
  1027. TRY(skip_whitespace(Required::Yes));
  1028. if (auto result = parse_external_id(); !result.is_error())
  1029. notation = result.release_value();
  1030. else
  1031. notation = TRY(parse_public_id());
  1032. TRY(expect(">"sv));
  1033. rollback.disarm();
  1034. return NotationDeclaration {
  1035. move(name),
  1036. move(notation).downcast<ExternalID, PublicID>(),
  1037. };
  1038. }
  1039. // 3.2.46 contentspec, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-contentspec
  1040. ErrorOr<ElementDeclaration::ContentSpec, ParseError> Parser::parse_content_spec()
  1041. {
  1042. auto rollback = rollback_point();
  1043. auto rule = enter_rule();
  1044. Optional<ElementDeclaration::ContentSpec> content_spec;
  1045. // contentspec ::= 'EMPTY' | 'ANY' | Mixed | children
  1046. if (m_lexer.consume_specific("EMPTY"sv)) {
  1047. content_spec = ElementDeclaration::Empty {};
  1048. } else if (m_lexer.consume_specific("ANY"sv)) {
  1049. content_spec = ElementDeclaration::Any {};
  1050. } else {
  1051. TRY(expect("("sv));
  1052. TRY(skip_whitespace());
  1053. if (m_lexer.consume_specific("#PCDATA"sv)) {
  1054. HashTable<Name> names;
  1055. // Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*'
  1056. // | '(' S? '#PCDATA' S? ')'
  1057. TRY(skip_whitespace());
  1058. if (m_lexer.consume_specific(")*"sv)) {
  1059. content_spec = ElementDeclaration::Mixed { .types = {}, .many = true };
  1060. } else if (m_lexer.consume_specific(')')) {
  1061. content_spec = ElementDeclaration::Mixed { .types = {}, .many = false };
  1062. } else {
  1063. while (true) {
  1064. TRY(skip_whitespace());
  1065. if (!m_lexer.consume_specific('|'))
  1066. break;
  1067. TRY(skip_whitespace());
  1068. if (auto result = parse_name(); !result.is_error())
  1069. names.set(result.release_value());
  1070. else
  1071. return parse_error(m_lexer.tell(), "Expected a Name");
  1072. }
  1073. TRY(skip_whitespace());
  1074. TRY(expect(")*"sv));
  1075. content_spec = ElementDeclaration::Mixed { .types = move(names), .many = true };
  1076. }
  1077. } else {
  1078. while (!m_lexer.next_is('('))
  1079. m_lexer.retreat();
  1080. // children ::= (choice | seq) ('?' | '*' | '+')?
  1081. // cp ::= (Name | choice | seq) ('?' | '*' | '+')?
  1082. // choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')'
  1083. // seq ::= '(' S? cp ( S? ',' S? cp )* S? ')'
  1084. Function<ErrorOr<ElementDeclaration::Children::Choice, ParseError>()> parse_choice;
  1085. Function<ErrorOr<ElementDeclaration::Children::Sequence, ParseError>()> parse_sequence;
  1086. auto parse_cp_init = [&]() -> ErrorOr<Variant<Name, ElementDeclaration::Children::Choice, ElementDeclaration::Children::Sequence>, ParseError> {
  1087. if (auto result = parse_name(); !result.is_error())
  1088. return result.release_value();
  1089. if (auto result = parse_choice(); !result.is_error())
  1090. return result.release_value();
  1091. return TRY(parse_sequence());
  1092. };
  1093. auto parse_qualifier = [&]() -> ElementDeclaration::Children::Qualifier {
  1094. ElementDeclaration::Children::Qualifier qualifier { ElementDeclaration::Children::Qualifier::ExactlyOnce };
  1095. if (m_lexer.consume_specific('?'))
  1096. qualifier = ElementDeclaration::Children::Qualifier::Optional;
  1097. else if (m_lexer.consume_specific('*'))
  1098. qualifier = ElementDeclaration::Children::Qualifier::Any;
  1099. else if (m_lexer.consume_specific('+'))
  1100. qualifier = ElementDeclaration::Children::Qualifier::OneOrMore;
  1101. return qualifier;
  1102. };
  1103. auto parse_cp = [&]() -> ErrorOr<ElementDeclaration::Children::Entry, ParseError> {
  1104. auto sub_entry = TRY(parse_cp_init());
  1105. auto qualifier = parse_qualifier();
  1106. return ElementDeclaration::Children::Entry {
  1107. move(sub_entry),
  1108. qualifier,
  1109. };
  1110. };
  1111. parse_choice = [&]() -> ErrorOr<ElementDeclaration::Children::Choice, ParseError> {
  1112. auto rollback = rollback_point();
  1113. auto rule = enter_rule();
  1114. TRY(expect("("sv));
  1115. auto accept = accept_rule();
  1116. TRY(skip_whitespace());
  1117. Vector<ElementDeclaration::Children::Entry> choices;
  1118. choices.append(TRY(parse_cp()));
  1119. while (true) {
  1120. TRY(skip_whitespace());
  1121. if (!m_lexer.consume_specific('|'))
  1122. break;
  1123. TRY(skip_whitespace());
  1124. choices.append(TRY(parse_cp()));
  1125. }
  1126. TRY(expect(")"sv));
  1127. if (choices.size() < 2)
  1128. return parse_error(m_lexer.tell(), "Expected more than one choice");
  1129. TRY(skip_whitespace());
  1130. auto qualifier = parse_qualifier();
  1131. rollback.disarm();
  1132. return ElementDeclaration::Children::Choice {
  1133. move(choices),
  1134. qualifier,
  1135. };
  1136. };
  1137. parse_sequence = [&]() -> ErrorOr<ElementDeclaration::Children::Sequence, ParseError> {
  1138. auto rollback = rollback_point();
  1139. auto rule = enter_rule();
  1140. TRY(expect("("sv));
  1141. auto accept = accept_rule();
  1142. TRY(skip_whitespace());
  1143. Vector<ElementDeclaration::Children::Entry> entries;
  1144. entries.append(TRY(parse_cp()));
  1145. while (true) {
  1146. TRY(skip_whitespace());
  1147. if (!m_lexer.consume_specific(','))
  1148. break;
  1149. TRY(skip_whitespace());
  1150. entries.append(TRY(parse_cp()));
  1151. }
  1152. TRY(expect(")"sv));
  1153. TRY(skip_whitespace());
  1154. auto qualifier = parse_qualifier();
  1155. rollback.disarm();
  1156. return ElementDeclaration::Children::Sequence {
  1157. move(entries),
  1158. qualifier,
  1159. };
  1160. };
  1161. if (auto result = parse_choice(); !result.is_error()) {
  1162. auto qualifier = parse_qualifier();
  1163. content_spec = ElementDeclaration::Children {
  1164. result.release_value(),
  1165. qualifier,
  1166. };
  1167. } else {
  1168. auto sequence = TRY(parse_sequence());
  1169. auto qualifier = parse_qualifier();
  1170. content_spec = ElementDeclaration::Children {
  1171. move(sequence),
  1172. qualifier,
  1173. };
  1174. }
  1175. }
  1176. }
  1177. rollback.disarm();
  1178. return content_spec.release_value();
  1179. }
  1180. // 2.8.31 extSubsetDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-extSubsetDecl
  1181. ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_external_subset_declaration()
  1182. {
  1183. auto rollback = rollback_point();
  1184. auto rule = enter_rule();
  1185. Vector<MarkupDeclaration> declarations;
  1186. // extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep )*
  1187. while (true) {
  1188. if (auto result = parse_markup_declaration(); !result.is_error()) {
  1189. if (result.value().has_value())
  1190. declarations.append(result.release_value().release_value());
  1191. continue;
  1192. }
  1193. // FIXME: conditionalSect
  1194. if (auto result = parse_declaration_separator(); !result.is_error())
  1195. continue;
  1196. break;
  1197. }
  1198. rollback.disarm();
  1199. return declarations;
  1200. }
  1201. // 4.2.70 EntityDecl, https://www.w3.org/TR/xml/#NT-EntityDecl
  1202. ErrorOr<EntityDeclaration, ParseError> Parser::parse_entity_declaration()
  1203. {
  1204. // EntityDecl ::= GEDecl | PEDecl
  1205. if (auto result = parse_general_entity_declaration(); !result.is_error())
  1206. return result;
  1207. return parse_parameter_entity_declaration();
  1208. }
  1209. // 4.2.71 GEDecl, https://www.w3.org/TR/xml/#NT-GEDecl
  1210. ErrorOr<EntityDeclaration, ParseError> Parser::parse_general_entity_declaration()
  1211. {
  1212. auto rollback = rollback_point();
  1213. auto rule = enter_rule();
  1214. Variant<ByteString, EntityDefinition, Empty> definition;
  1215. // GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
  1216. TRY(expect("<!ENTITY"sv));
  1217. auto accept = accept_rule();
  1218. TRY(skip_whitespace(Required::Yes));
  1219. auto name = TRY(parse_name());
  1220. TRY(skip_whitespace(Required::Yes));
  1221. // EntityDef ::= EntityValue | (ExternalID NDataDecl?)
  1222. if (auto result = parse_entity_value(); !result.is_error()) {
  1223. definition = result.release_value();
  1224. } else {
  1225. auto external_id = TRY(parse_external_id());
  1226. Optional<Name> notation;
  1227. if (auto notation_result = parse_notation_data_declaration(); !notation_result.is_error())
  1228. notation = notation_result.release_value();
  1229. definition = EntityDefinition {
  1230. move(external_id),
  1231. move(notation),
  1232. };
  1233. }
  1234. TRY(skip_whitespace());
  1235. TRY(expect(">"sv));
  1236. rollback.disarm();
  1237. return GEDeclaration {
  1238. move(name),
  1239. move(definition).downcast<ByteString, EntityDefinition>(),
  1240. };
  1241. }
  1242. // 4.2.72 PEDecl, https://www.w3.org/TR/xml/#NT-PEDecl
  1243. ErrorOr<EntityDeclaration, ParseError> Parser::parse_parameter_entity_declaration()
  1244. {
  1245. auto rollback = rollback_point();
  1246. auto rule = enter_rule();
  1247. Variant<ByteString, ExternalID, Empty> definition;
  1248. // PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
  1249. TRY(expect("<!ENTITY"sv));
  1250. auto accept = accept_rule();
  1251. TRY(skip_whitespace(Required::Yes));
  1252. TRY(expect("%"sv));
  1253. TRY(skip_whitespace(Required::Yes));
  1254. auto name = TRY(parse_name());
  1255. TRY(skip_whitespace(Required::Yes));
  1256. // PEDef ::= EntityValue | ExternalID
  1257. if (auto result = parse_entity_value(); !result.is_error())
  1258. definition = result.release_value();
  1259. else
  1260. definition = TRY(parse_external_id());
  1261. TRY(skip_whitespace());
  1262. TRY(expect(">"sv));
  1263. rollback.disarm();
  1264. return PEDeclaration {
  1265. move(name),
  1266. move(definition).downcast<ByteString, ExternalID>(),
  1267. };
  1268. }
  1269. // 4.7.83 PublicID, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PublicID
  1270. ErrorOr<PublicID, ParseError> Parser::parse_public_id()
  1271. {
  1272. auto rollback = rollback_point();
  1273. auto rule = enter_rule();
  1274. // PublicID ::= 'PUBLIC' S PubidLiteral
  1275. TRY(expect("PUBLIC"sv));
  1276. auto accept = accept_rule();
  1277. TRY(skip_whitespace(Required::Yes));
  1278. auto text = TRY(parse_public_id_literal());
  1279. rollback.disarm();
  1280. return PublicID {
  1281. text,
  1282. };
  1283. }
  1284. constexpr static auto s_public_id_characters = set_to_search<StringSet("\x20\x0d\x0a-'()+,./:=?;!*#@$_%")>().unify(ranges_for_search<Range('a', 'z'), Range('A', 'Z'), Range('0', '9')>());
  1285. // 2.3.12, PubidLiteral, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidLiteral
  1286. ErrorOr<StringView, ParseError> Parser::parse_public_id_literal()
  1287. {
  1288. auto rollback = rollback_point();
  1289. auto rule = enter_rule();
  1290. // PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
  1291. auto quote = TRY(expect(is_any_of("'\""sv), "any of ' or \""sv));
  1292. auto accept = accept_rule();
  1293. auto id = TRY(expect_many(
  1294. [q = quote[0]](auto x) {
  1295. return (q == '\'' ? x != '\'' : true) && s_public_id_characters.contains(x);
  1296. },
  1297. "a PubidChar"sv));
  1298. TRY(expect(quote));
  1299. rollback.disarm();
  1300. return id;
  1301. }
  1302. // 2.3.11 SystemLiteral, https://www.w3.org/TR/xml/#NT-SystemLiteral
  1303. ErrorOr<StringView, ParseError> Parser::parse_system_id_literal()
  1304. {
  1305. auto rollback = rollback_point();
  1306. auto rule = enter_rule();
  1307. // SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
  1308. auto quote = TRY(expect(is_any_of("'\""sv), "any of ' or \""sv));
  1309. auto accept = accept_rule();
  1310. auto id = TRY(expect_many(is_not_any_of(quote), "not a quote"sv));
  1311. TRY(expect(quote));
  1312. rollback.disarm();
  1313. return id;
  1314. }
  1315. // 4.2.75 ExternalID, https://www.w3.org/TR/xml/#NT-ExternalID
  1316. ErrorOr<ExternalID, ParseError> Parser::parse_external_id()
  1317. {
  1318. auto rollback = rollback_point();
  1319. auto rule = enter_rule();
  1320. // ExternalID ::= 'SYSTEM' S SystemLiteral
  1321. // | 'PUBLIC' S PubidLiteral S SystemLiteral
  1322. Optional<PublicID> public_id;
  1323. SystemID system_id;
  1324. if (m_lexer.consume_specific("SYSTEM"sv)) {
  1325. auto accept = accept_rule();
  1326. TRY(skip_whitespace(Required::Yes));
  1327. system_id = SystemID { TRY(parse_system_id_literal()) };
  1328. } else {
  1329. TRY(expect("PUBLIC"sv));
  1330. auto accept = accept_rule();
  1331. TRY(skip_whitespace(Required::Yes));
  1332. public_id = PublicID { TRY(parse_public_id_literal()) };
  1333. TRY(skip_whitespace(Required::Yes));
  1334. system_id = SystemID { TRY(parse_system_id_literal()) };
  1335. }
  1336. rollback.disarm();
  1337. return ExternalID {
  1338. move(public_id),
  1339. move(system_id),
  1340. };
  1341. }
  1342. // 4.2.2.76 NDataDecl, https://www.w3.org/TR/xml/#NT-NDataDecl
  1343. ErrorOr<Name, ParseError> Parser::parse_notation_data_declaration()
  1344. {
  1345. auto rollback = rollback_point();
  1346. auto rule = enter_rule();
  1347. // NDataDecl ::= S 'NDATA' S Name
  1348. TRY(skip_whitespace(Required::Yes));
  1349. auto accept = accept_rule();
  1350. TRY(expect("NDATA"sv));
  1351. TRY(skip_whitespace(Required::Yes));
  1352. auto name = TRY(parse_name());
  1353. rollback.disarm();
  1354. return name;
  1355. }
  1356. // 2.3.9 EntityValue, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityValue
  1357. ErrorOr<ByteString, ParseError> Parser::parse_entity_value()
  1358. {
  1359. auto rollback = rollback_point();
  1360. auto rule = enter_rule();
  1361. StringBuilder builder;
  1362. // EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
  1363. // | "'" ([^%&'] | PEReference | Reference)* "'"
  1364. auto quote = TRY(expect(is_any_of("'\""sv), "any of ' or \""sv));
  1365. auto accept = accept_rule();
  1366. while (true) {
  1367. if (m_lexer.is_eof())
  1368. break;
  1369. if (m_lexer.next_is(quote))
  1370. break;
  1371. if (m_lexer.next_is('%')) {
  1372. auto start = m_lexer.tell();
  1373. TRY(parse_parameter_entity_reference());
  1374. builder.append(m_source.substring_view(start, m_lexer.tell() - start));
  1375. continue;
  1376. }
  1377. if (m_lexer.next_is('&')) {
  1378. auto start = m_lexer.tell();
  1379. TRY(parse_reference());
  1380. builder.append(m_source.substring_view(start, m_lexer.tell() - start));
  1381. continue;
  1382. }
  1383. builder.append(m_lexer.consume());
  1384. }
  1385. TRY(expect(quote));
  1386. rollback.disarm();
  1387. return builder.to_byte_string();
  1388. }
  1389. // 2.7.18 CDSect, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-CDSect
  1390. ErrorOr<StringView, ParseError> Parser::parse_cdata_section()
  1391. {
  1392. auto rollback = rollback_point();
  1393. auto rule = enter_rule();
  1394. // CDSect ::= CDStart CData CDEnd
  1395. // CDStart ::= '<![CDATA['
  1396. // CData ::= (Char* - (Char* ']]>' Char*))
  1397. // CDEnd ::= ']]>'
  1398. TRY(expect("<![CDATA["sv));
  1399. auto accept = accept_rule();
  1400. auto section_start = m_lexer.tell();
  1401. while (!m_lexer.next_is("]]>")) {
  1402. if (m_lexer.is_eof())
  1403. break;
  1404. m_lexer.ignore();
  1405. }
  1406. auto section_end = m_lexer.tell();
  1407. TRY(expect("]]>"sv));
  1408. rollback.disarm();
  1409. return m_source.substring_view(section_start, section_end - section_start);
  1410. }
  1411. // 2.8.30 extSubset, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-extSubset
  1412. ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_external_subset()
  1413. {
  1414. auto rollback = rollback_point();
  1415. auto rule = enter_rule();
  1416. // extSubset ::= TextDecl? extSubsetDecl
  1417. (void)parse_text_declaration();
  1418. auto result = TRY(parse_external_subset_declaration());
  1419. rollback.disarm();
  1420. return result;
  1421. }
  1422. // 4.3.1.77 TextDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-TextDecl
  1423. ErrorOr<void, ParseError> Parser::parse_text_declaration()
  1424. {
  1425. auto rollback = rollback_point();
  1426. auto rule = enter_rule();
  1427. // TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
  1428. TRY(expect("<?xml"sv));
  1429. auto accept = accept_rule();
  1430. (void)parse_version_info();
  1431. TRY(parse_encoding_decl());
  1432. TRY(skip_whitespace());
  1433. TRY(expect("?>"sv));
  1434. rollback.disarm();
  1435. return {};
  1436. }
  1437. ErrorOr<ByteString, ParseError> Parser::resolve_reference(EntityReference const& reference, ReferencePlacement placement)
  1438. {
  1439. static HashTable<Name> reference_lookup {};
  1440. if (reference_lookup.contains(reference.name))
  1441. return parse_error(m_lexer.tell(), ByteString::formatted("Invalid recursive definition for '{}'", reference.name));
  1442. reference_lookup.set(reference.name);
  1443. ScopeGuard remove_lookup {
  1444. [&] {
  1445. reference_lookup.remove(reference.name);
  1446. }
  1447. };
  1448. Optional<ByteString> resolved;
  1449. if (m_doctype.has_value()) {
  1450. // FIXME: Split these up and resolve them ahead of time.
  1451. for (auto& declaration : m_doctype->markup_declarations) {
  1452. auto entity = declaration.get_pointer<EntityDeclaration>();
  1453. if (!entity)
  1454. continue;
  1455. auto ge_declaration = entity->get_pointer<GEDeclaration>();
  1456. if (!ge_declaration)
  1457. continue;
  1458. if (ge_declaration->name != reference.name)
  1459. continue;
  1460. TRY(ge_declaration->definition.visit(
  1461. [&](ByteString const& definition) -> ErrorOr<void, ParseError> {
  1462. resolved = definition;
  1463. return {};
  1464. },
  1465. [&](EntityDefinition const& definition) -> ErrorOr<void, ParseError> {
  1466. if (placement == ReferencePlacement::AttributeValue)
  1467. return parse_error(m_lexer.tell(), ByteString::formatted("Attribute references external entity '{}'", reference.name));
  1468. if (definition.notation.has_value())
  1469. return parse_error(0u, ByteString::formatted("Entity reference to unparsed entity '{}'", reference.name));
  1470. if (!m_options.resolve_external_resource)
  1471. return parse_error(0u, ByteString::formatted("Failed to resolve external entity '{}'", reference.name));
  1472. auto result = m_options.resolve_external_resource(definition.id.system_id, definition.id.public_id);
  1473. if (result.is_error())
  1474. return parse_error(0u, ByteString::formatted("Failed to resolve external entity '{}': {}", reference.name, result.error()));
  1475. resolved = result.release_value();
  1476. return {};
  1477. }));
  1478. break;
  1479. }
  1480. }
  1481. if (!resolved.has_value()) {
  1482. if (reference.name == "amp")
  1483. return "&";
  1484. if (reference.name == "lt")
  1485. return "<";
  1486. if (reference.name == "gt")
  1487. return ">";
  1488. if (reference.name == "apos")
  1489. return "'";
  1490. if (reference.name == "quot")
  1491. return "\"";
  1492. return parse_error(0u, ByteString::formatted("Reference to undeclared entity '{}'", reference.name));
  1493. }
  1494. StringView resolved_source = *resolved;
  1495. TemporaryChange source { m_source, resolved_source };
  1496. TemporaryChange lexer { m_lexer, LineTrackingLexer(m_source) };
  1497. switch (placement) {
  1498. case ReferencePlacement::AttributeValue:
  1499. return TRY(parse_attribute_value_inner(""sv));
  1500. case ReferencePlacement::Content:
  1501. TRY(parse_content());
  1502. return "";
  1503. default:
  1504. VERIFY_NOT_REACHED();
  1505. }
  1506. }
  1507. }