Parser.cpp 57 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780
  1. /*
  2. * Copyright (c) 2022, Ali Mohammad Pur <mpfard@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <LibXML/DOM/Document.h>
  7. #include <LibXML/Parser/Parser.h>
  8. struct Range {
  9. consteval Range(u32 start, u32 end)
  10. : start(start)
  11. , end(end)
  12. {
  13. }
  14. u32 start;
  15. u32 end;
  16. };
  17. template<auto... ranges>
  18. struct ranges_for_search {
  19. auto contains(u32 value) const
  20. {
  21. return ((value >= ranges.start && value <= ranges.end) || ...);
  22. }
  23. bool operator()(u32 value) const
  24. {
  25. return contains(value);
  26. }
  27. template<auto... ranges_to_include>
  28. consteval auto with() const
  29. {
  30. return ranges_for_search<ranges..., ranges_to_include...>();
  31. }
  32. template<auto... ranges_to_include>
  33. consteval auto unify(ranges_for_search<ranges_to_include...> const&) const
  34. {
  35. return ranges_for_search<ranges..., ranges_to_include...>();
  36. }
  37. };
  38. template<size_t Count, typename Element>
  39. struct StringSet {
  40. consteval StringSet(Element const (&entries)[Count])
  41. {
  42. for (size_t i = 0; i < Count - 1; ++i)
  43. elements[i] = entries[i];
  44. }
  45. consteval auto operator[](size_t i) const { return elements[i]; }
  46. Element elements[Count - 1];
  47. };
  48. template<StringSet chars>
  49. consteval static auto set_to_search()
  50. {
  51. return ([&]<auto... Ix>(IndexSequence<Ix...>) {
  52. return ranges_for_search<Range(chars[Ix], chars[Ix])...>();
  53. }(MakeIndexSequence<array_size(chars.elements)>()));
  54. }
  55. namespace XML {
  56. size_t Parser::s_debug_indent_level { 0 };
  57. void Parser::append_node(NonnullOwnPtr<Node> node)
  58. {
  59. if (m_entered_node) {
  60. m_entered_node->content.get<Node::Element>().children.append(move(node));
  61. } else {
  62. m_root_node = move(node);
  63. m_entered_node = m_root_node.ptr();
  64. }
  65. }
  66. void Parser::append_text(String text)
  67. {
  68. if (m_listener) {
  69. m_listener->text(text);
  70. return;
  71. }
  72. if (!m_entered_node) {
  73. Node::Text node;
  74. node.builder.append(text);
  75. m_root_node = make<Node>(move(node));
  76. return;
  77. }
  78. m_entered_node->content.visit(
  79. [&](Node::Element& node) {
  80. if (!node.children.is_empty()) {
  81. auto* text_node = node.children.last().content.get_pointer<Node::Text>();
  82. if (text_node) {
  83. text_node->builder.append(text);
  84. return;
  85. }
  86. }
  87. Node::Text text_node;
  88. text_node.builder.append(text);
  89. node.children.append(make<Node>(move(text_node)));
  90. },
  91. [&](auto&) {
  92. // Can't enter a text or comment node.
  93. VERIFY_NOT_REACHED();
  94. });
  95. }
  96. void Parser::append_comment(String text)
  97. {
  98. if (m_listener) {
  99. m_listener->comment(text);
  100. return;
  101. }
  102. // If there's no node to attach this to, drop it on the floor.
  103. // This can happen to comments in the prolog.
  104. if (!m_entered_node)
  105. return;
  106. m_entered_node->content.visit(
  107. [&](Node::Element& node) {
  108. node.children.append(make<Node>(Node::Comment { move(text) }));
  109. },
  110. [&](auto&) {
  111. // Can't enter a text or comment node.
  112. VERIFY_NOT_REACHED();
  113. });
  114. }
  115. void Parser::enter_node(Node& node)
  116. {
  117. if (m_listener) {
  118. auto& element = node.content.get<Node::Element>();
  119. m_listener->element_start(element.name, element.attributes);
  120. }
  121. if (&node != m_root_node.ptr())
  122. node.parent = m_entered_node;
  123. m_entered_node = &node;
  124. }
  125. void Parser::leave_node()
  126. {
  127. if (m_listener) {
  128. auto& element = m_entered_node->content.get<Node::Element>();
  129. m_listener->element_end(element.name);
  130. }
  131. m_entered_node = m_entered_node->parent;
  132. }
  133. ErrorOr<Document, ParseError> Parser::parse()
  134. {
  135. if (auto result = parse_internal(); result.is_error()) {
  136. if (m_parse_errors.is_empty())
  137. return result.release_error();
  138. return m_parse_errors.take_first();
  139. }
  140. return Document {
  141. m_root_node.release_nonnull(),
  142. move(m_doctype),
  143. move(m_processing_instructions),
  144. m_version,
  145. };
  146. }
  147. ErrorOr<void, ParseError> Parser::parse_with_listener(Listener& listener)
  148. {
  149. m_listener = &listener;
  150. ScopeGuard unset_listener { [this] { m_listener = nullptr; } };
  151. m_listener->document_start();
  152. auto result = parse_internal();
  153. if (result.is_error())
  154. m_listener->error(result.error());
  155. m_listener->document_end();
  156. m_root_node.clear();
  157. return result;
  158. }
  159. // 2.3.3. S, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-S
  160. ErrorOr<void, ParseError> Parser::skip_whitespace(Required required)
  161. {
  162. auto rollback = rollback_point();
  163. auto rule = enter_rule();
  164. // S ::= (#x20 | #x9 | #xD | #xA)+
  165. auto matched = m_lexer.consume_while(is_any_of("\x20\x09\x0d\x0a"));
  166. if (required == Required::Yes && matched.is_empty())
  167. return parse_error(m_lexer.tell(), "Expected whitespace");
  168. rollback.disarm();
  169. return {};
  170. }
  171. // 2.2.a. RestrictedChar, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-RestrictedChar
  172. constexpr static auto s_restricted_characters = ranges_for_search<Range(0x1, 0x8), Range(0xb, 0xc), Range(0xe, 0x1f), Range(0x7f, 0x84), Range(0x86, 0x9f)>();
  173. // 2.1.1. Document, https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-well-formed
  174. ErrorOr<void, ParseError> Parser::parse_internal()
  175. {
  176. auto rule = enter_rule();
  177. // document ::= ( prolog element Misc* ) - ( Char* RestrictedChar Char* )
  178. TRY(parse_prolog());
  179. TRY(parse_element());
  180. while (true) {
  181. if (auto result = parse_misc(); result.is_error())
  182. break;
  183. }
  184. auto matched_source = m_source.substring_view(0, m_lexer.tell());
  185. if (auto it = find_if(matched_source.begin(), matched_source.end(), s_restricted_characters); !it.is_end()) {
  186. return parse_error(
  187. it.index(),
  188. String::formatted("Invalid character #{:x} used in document", *it));
  189. }
  190. if (!m_lexer.is_eof())
  191. return parse_error(m_lexer.tell(), "Garbage after document");
  192. return {};
  193. }
  194. ErrorOr<void, ParseError> Parser::expect(StringView expected)
  195. {
  196. auto rollback = rollback_point();
  197. if (!m_lexer.consume_specific(expected)) {
  198. if (m_options.treat_errors_as_fatal)
  199. return parse_error(m_lexer.tell(), String::formatted("Expected '{}'", expected));
  200. }
  201. rollback.disarm();
  202. return {};
  203. }
  204. template<typename Pred>
  205. requires(IsCallableWithArguments<Pred, char>) ErrorOr<StringView, ParseError> Parser::expect(Pred predicate, StringView description)
  206. {
  207. auto rollback = rollback_point();
  208. auto start = m_lexer.tell();
  209. if (!m_lexer.next_is(predicate)) {
  210. if (m_options.treat_errors_as_fatal)
  211. return parse_error(m_lexer.tell(), String::formatted("Expected {}", description));
  212. }
  213. m_lexer.ignore();
  214. rollback.disarm();
  215. return m_source.substring_view(start, m_lexer.tell() - start);
  216. }
  217. template<typename Pred>
  218. requires(IsCallableWithArguments<Pred, char>) ErrorOr<StringView, ParseError> Parser::expect_many(Pred predicate, StringView description)
  219. {
  220. auto rollback = rollback_point();
  221. auto start = m_lexer.tell();
  222. while (m_lexer.next_is(predicate)) {
  223. if (m_lexer.is_eof())
  224. break;
  225. m_lexer.ignore();
  226. }
  227. if (m_lexer.tell() == start) {
  228. if (m_options.treat_errors_as_fatal) {
  229. return parse_error(m_lexer.tell(), String::formatted("Expected {}", description));
  230. }
  231. }
  232. rollback.disarm();
  233. return m_source.substring_view(start, m_lexer.tell() - start);
  234. }
  235. // 2.8.22. Prolog, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-prolog
  236. ErrorOr<void, ParseError> Parser::parse_prolog()
  237. {
  238. auto rollback = rollback_point();
  239. auto rule = enter_rule();
  240. // prolog ::= XMLDecl Misc* (doctypedecl Misc*)?
  241. // The following is valid in XML 1.0.
  242. // prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
  243. if (auto result = parse_xml_decl(); result.is_error()) {
  244. m_version = Version::Version10;
  245. m_in_compatibility_mode = true;
  246. }
  247. auto accept = accept_rule();
  248. while (true) {
  249. if (auto result = parse_misc(); result.is_error())
  250. break;
  251. }
  252. if (auto result = parse_doctype_decl(); !result.is_error()) {
  253. while (true) {
  254. if (auto result = parse_misc(); result.is_error())
  255. break;
  256. }
  257. }
  258. rollback.disarm();
  259. return {};
  260. }
  261. // 2.8.23. XMLDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-XMLDecl
  262. ErrorOr<void, ParseError> Parser::parse_xml_decl()
  263. {
  264. auto rollback = rollback_point();
  265. auto rule = enter_rule();
  266. // XMLDecl::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
  267. TRY(expect("<?xml"));
  268. auto accept = accept_rule();
  269. TRY(parse_version_info());
  270. (void)parse_encoding_decl();
  271. (void)parse_standalone_document_decl();
  272. TRY(skip_whitespace());
  273. TRY(expect("?>"));
  274. rollback.disarm();
  275. return {};
  276. }
  277. // 2.8.24. VersionInfo, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-VersionInfo
  278. ErrorOr<void, ParseError> Parser::parse_version_info()
  279. {
  280. auto rollback = rollback_point();
  281. auto rule = enter_rule();
  282. // VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
  283. TRY(skip_whitespace(Required::Yes));
  284. TRY(expect("version"));
  285. auto accept = accept_rule();
  286. TRY(parse_eq());
  287. TRY(expect(is_any_of("'\""), "one of ' or \""));
  288. m_lexer.retreat();
  289. auto version_string = m_lexer.consume_quoted_string();
  290. if (version_string == "1.0") {
  291. // FIXME: Compatibility mode, figure out which rules are different in XML 1.0.
  292. m_version = Version::Version10;
  293. m_in_compatibility_mode = true;
  294. } else {
  295. if (version_string != "1.1" && m_options.treat_errors_as_fatal)
  296. return parse_error(m_lexer.tell(), String::formatted("Expected '1.1', found '{}'", version_string));
  297. }
  298. m_version = Version::Version11;
  299. rollback.disarm();
  300. return {};
  301. }
  302. // 2.8.25. Eq, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Eq
  303. ErrorOr<void, ParseError> Parser::parse_eq()
  304. {
  305. auto rollback = rollback_point();
  306. auto rule = enter_rule();
  307. // Eq ::= S? '=' S?
  308. auto accept = accept_rule();
  309. TRY(skip_whitespace());
  310. TRY(expect("="));
  311. TRY(skip_whitespace());
  312. rollback.disarm();
  313. return {};
  314. }
  315. // 4.3.3.80. EncodingDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EncodingDecl
  316. ErrorOr<void, ParseError> Parser::parse_encoding_decl()
  317. {
  318. auto rollback = rollback_point();
  319. auto rule = enter_rule();
  320. // EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )
  321. TRY(skip_whitespace(Required::Yes));
  322. TRY(expect("encoding"));
  323. auto accept = accept_rule();
  324. TRY(parse_eq());
  325. TRY(expect(is_any_of("'\""), "one of ' or \""));
  326. m_lexer.retreat();
  327. // FIXME: Actually do something with this encoding.
  328. m_encoding = m_lexer.consume_quoted_string();
  329. rollback.disarm();
  330. return {};
  331. }
  332. // 2.9.32 SDDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-rmd
  333. ErrorOr<void, ParseError> Parser::parse_standalone_document_decl()
  334. {
  335. auto rollback = rollback_point();
  336. auto rule = enter_rule();
  337. // SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
  338. TRY(skip_whitespace(Required::Yes));
  339. TRY(expect("standalone"));
  340. auto accept = accept_rule();
  341. TRY(expect(is_any_of("'\""), "one of ' or \""));
  342. m_lexer.retreat();
  343. auto value = m_lexer.consume_quoted_string();
  344. if (!value.is_one_of("yes", "no"))
  345. return parse_error(m_lexer.tell() - value.length(), "Expected one of 'yes' or 'no'");
  346. m_standalone = value == "yes";
  347. rollback.disarm();
  348. return {};
  349. }
  350. // 2.8.27. Misc, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Misc
  351. ErrorOr<void, ParseError> Parser::parse_misc()
  352. {
  353. auto rollback = rollback_point();
  354. auto rule = enter_rule();
  355. // Misc ::= Comment | PI | S
  356. if (auto result = parse_comment(); !result.is_error()) {
  357. rollback.disarm();
  358. return {};
  359. }
  360. if (auto result = parse_processing_instruction(); !result.is_error()) {
  361. rollback.disarm();
  362. return {};
  363. }
  364. if (auto result = skip_whitespace(Required::Yes); !result.is_error()) {
  365. rollback.disarm();
  366. return {};
  367. }
  368. return parse_error(m_lexer.tell(), "Expected a match for 'Misc', but found none");
  369. }
  370. // 2.5.15 Comment, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Comment
  371. ErrorOr<void, ParseError> Parser::parse_comment()
  372. {
  373. auto rollback = rollback_point();
  374. auto rule = enter_rule();
  375. // Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
  376. TRY(expect("<!--"));
  377. auto accept = accept_rule();
  378. bool last_seen_a_dash = false;
  379. // FIXME: This should disallow surrogate blocks
  380. auto text = m_lexer.consume_while([&](auto ch) {
  381. if (ch != '-') {
  382. last_seen_a_dash = false;
  383. return true;
  384. }
  385. if (last_seen_a_dash)
  386. return false;
  387. last_seen_a_dash = true;
  388. return true;
  389. });
  390. if (last_seen_a_dash) {
  391. m_lexer.retreat();
  392. text = text.substring_view(0, text.length() - 1);
  393. }
  394. TRY(expect("-->"));
  395. if (m_options.preserve_comments)
  396. append_comment(text);
  397. rollback.disarm();
  398. return {};
  399. }
  400. // 2.6.16 PI, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PI
  401. ErrorOr<void, ParseError> Parser::parse_processing_instruction()
  402. {
  403. auto rollback = rollback_point();
  404. auto rule = enter_rule();
  405. // PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
  406. TRY(expect("<?"));
  407. auto accept = accept_rule();
  408. auto target = TRY(parse_processing_instruction_target());
  409. String data;
  410. if (auto result = skip_whitespace(Required::Yes); !result.is_error())
  411. data = m_lexer.consume_until("?>");
  412. TRY(expect("?>"));
  413. m_processing_instructions.set(target, data);
  414. rollback.disarm();
  415. return {};
  416. }
  417. // 2.6.17. PITarget, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
  418. ErrorOr<Name, ParseError> Parser::parse_processing_instruction_target()
  419. {
  420. auto rollback = rollback_point();
  421. auto rule = enter_rule();
  422. // PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
  423. auto target = TRY(parse_name());
  424. auto accept = accept_rule();
  425. if (target.equals_ignoring_case("xml") && m_options.treat_errors_as_fatal) {
  426. return parse_error(
  427. m_lexer.tell() - target.length(),
  428. "Use of the reserved 'xml' name for processing instruction target name is disallowed");
  429. }
  430. rollback.disarm();
  431. return target;
  432. }
  433. // NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
  434. constexpr static auto s_name_start_characters = ranges_for_search<Range(':', ':'), Range('A', 'Z'), Range('_', '_'), Range('a', 'z'), Range(0xc0, 0xd6), Range(0xd8, 0xf6), Range(0xf8, 0x2ff), Range(0x370, 0x37d), Range(0x37f, 0x1fff), Range(0x200c, 0x200d), Range(0x2070, 0x218f), Range(0x2c00, 0x2fef), Range(0x3001, 0xd7ff), Range(0xf900, 0xfdcf), Range(0xfdf0, 0xfffd), Range(0x10000, 0xeffff)> {};
  435. // NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
  436. constexpr static auto s_name_characters = s_name_start_characters.with<Range('-', '-'), Range('.', '.'), Range('0', '9'), Range(0xb7, 0xb7), Range(0x0300, 0x036f), Range(0x203f, 0x2040)>();
  437. // 2.3.5. Name, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Name
  438. ErrorOr<Name, ParseError> Parser::parse_name()
  439. {
  440. auto rollback = rollback_point();
  441. auto rule = enter_rule();
  442. // Name ::= NameStartChar (NameChar)*
  443. auto start = TRY(expect(s_name_start_characters, "a NameStartChar"));
  444. auto accept = accept_rule();
  445. auto rest = m_lexer.consume_while(s_name_characters);
  446. StringBuilder builder;
  447. builder.append(start);
  448. builder.append(rest);
  449. rollback.disarm();
  450. return builder.to_string();
  451. }
  452. // 2.8.28. doctypedecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-doctypedecl
  453. ErrorOr<void, ParseError> Parser::parse_doctype_decl()
  454. {
  455. auto rollback = rollback_point();
  456. auto rule = enter_rule();
  457. Doctype doctype;
  458. // doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
  459. TRY(expect("<!DOCTYPE"));
  460. auto accept = accept_rule();
  461. TRY(skip_whitespace(Required::Yes));
  462. doctype.type = TRY(parse_name());
  463. if (auto result = skip_whitespace(Required::Yes); !result.is_error()) {
  464. auto id_start = m_lexer.tell();
  465. if (auto id_result = parse_external_id(); !id_result.is_error()) {
  466. doctype.external_id = id_result.release_value();
  467. if (m_options.resolve_external_resource) {
  468. auto resource_result = m_options.resolve_external_resource(doctype.external_id->system_id, doctype.external_id->public_id);
  469. if (resource_result.is_error()) {
  470. return parse_error(
  471. id_start,
  472. String::formatted("Failed to resolve external subset '{}': {}", doctype.external_id->system_id.system_literal, resource_result.error()));
  473. }
  474. StringView resolved_source = resource_result.value();
  475. TemporaryChange source { m_source, resolved_source };
  476. TemporaryChange lexer { m_lexer, GenericLexer(m_source) };
  477. auto declarations = TRY(parse_external_subset());
  478. if (!m_lexer.is_eof()) {
  479. return parse_error(
  480. m_lexer.tell(),
  481. String::formatted("Failed to resolve external subset '{}': garbage after declarations", doctype.external_id->system_id.system_literal));
  482. }
  483. doctype.markup_declarations.extend(move(declarations));
  484. }
  485. }
  486. }
  487. TRY(skip_whitespace(Required::No));
  488. if (m_lexer.consume_specific('[')) {
  489. auto internal_subset = TRY(parse_internal_subset());
  490. TRY(expect("]"));
  491. TRY(skip_whitespace());
  492. doctype.markup_declarations.extend(internal_subset);
  493. }
  494. TRY(expect(">"));
  495. rollback.disarm();
  496. m_doctype = move(doctype);
  497. return {};
  498. }
  499. // 3.39. element, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-element
  500. ErrorOr<void, ParseError> Parser::parse_element()
  501. {
  502. auto rollback = rollback_point();
  503. auto rule = enter_rule();
  504. // element ::= EmptyElemTag
  505. // | STag content ETag
  506. if (auto result = parse_empty_element_tag(); !result.is_error()) {
  507. append_node(result.release_value());
  508. rollback.disarm();
  509. return {};
  510. }
  511. auto start_tag = TRY(parse_start_tag());
  512. auto& node = *start_tag;
  513. auto& tag = node.content.get<Node::Element>();
  514. append_node(move(start_tag));
  515. enter_node(node);
  516. ScopeGuard quit {
  517. [&] {
  518. leave_node();
  519. }
  520. };
  521. TRY(parse_content());
  522. auto tag_location = m_lexer.tell();
  523. auto closing_name = TRY(parse_end_tag());
  524. // Well-formedness constraint: The Name in an element's end-tag MUST match the element type in the start-tag.
  525. if (m_options.treat_errors_as_fatal && closing_name != tag.name)
  526. return parse_error(tag_location, "Invalid closing tag");
  527. rollback.disarm();
  528. return {};
  529. }
  530. // 3.1.44. EmptyElemTag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EmptyElemTag
  531. ErrorOr<NonnullOwnPtr<Node>, ParseError> Parser::parse_empty_element_tag()
  532. {
  533. auto rollback = rollback_point();
  534. auto rule = enter_rule();
  535. // EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
  536. TRY(expect("<"));
  537. auto accept = accept_rule();
  538. auto name = TRY(parse_name());
  539. HashMap<Name, String> attributes;
  540. while (true) {
  541. if (auto result = skip_whitespace(Required::Yes); result.is_error())
  542. break;
  543. if (auto result = parse_attribute(); !result.is_error()) {
  544. auto attribute = result.release_value();
  545. attributes.set(move(attribute.name), move(attribute.value));
  546. } else {
  547. break;
  548. }
  549. }
  550. TRY(skip_whitespace());
  551. TRY(expect("/>"));
  552. rollback.disarm();
  553. return make<Node>(Node::Element { move(name), move(attributes), {} });
  554. }
  555. // 3.1.41. Attribute, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Attribute
  556. ErrorOr<Attribute, ParseError> Parser::parse_attribute()
  557. {
  558. auto rollback = rollback_point();
  559. auto rule = enter_rule();
  560. // Attribute ::= Name Eq AttValue
  561. auto name = TRY(parse_name());
  562. auto accept = accept_rule();
  563. TRY(parse_eq());
  564. auto value = TRY(parse_attribute_value());
  565. rollback.disarm();
  566. return Attribute {
  567. move(name),
  568. move(value),
  569. };
  570. }
  571. // 2.3.10. AttValue, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttValue
  572. ErrorOr<String, ParseError> Parser::parse_attribute_value()
  573. {
  574. auto rollback = rollback_point();
  575. auto rule = enter_rule();
  576. // AttValue ::= '"' ([^<&"] | Reference)* '"'
  577. // | "'" ([^<&'] | Reference)* "'"
  578. auto quote = TRY(expect(is_any_of("'\""), "one of ' or \""));
  579. auto accept = accept_rule();
  580. auto text = TRY(parse_attribute_value_inner(quote));
  581. TRY(expect(quote));
  582. rollback.disarm();
  583. return text;
  584. }
  585. ErrorOr<String, ParseError> Parser::parse_attribute_value_inner(StringView disallow)
  586. {
  587. StringBuilder builder;
  588. while (true) {
  589. if (m_lexer.next_is(is_any_of(disallow)) || m_lexer.is_eof())
  590. break;
  591. if (m_lexer.next_is('<')) {
  592. // Not allowed, return a nice error to make it easier to debug.
  593. return parse_error(m_lexer.tell(), "Unescaped '<' not allowed in attribute values");
  594. }
  595. if (m_lexer.next_is('&')) {
  596. auto reference = TRY(parse_reference());
  597. if (auto* char_reference = reference.get_pointer<String>())
  598. builder.append(*char_reference);
  599. else
  600. builder.append(TRY(resolve_reference(reference.get<EntityReference>(), ReferencePlacement::AttributeValue)));
  601. } else {
  602. builder.append(m_lexer.consume());
  603. }
  604. }
  605. return builder.to_string();
  606. }
  607. // Char ::= [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
  608. constexpr static auto s_characters = ranges_for_search<Range(0x1, 0xd7ff), Range(0xe000, 0xfffd), Range(0x10000, 0x10ffff)>();
  609. // 4.1.67. Reference, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Reference
  610. ErrorOr<Variant<Parser::EntityReference, String>, ParseError> Parser::parse_reference()
  611. {
  612. auto rollback = rollback_point();
  613. auto rule = enter_rule();
  614. // Reference ::= EntityRef | CharRef
  615. // 4.1.68. EntityRef
  616. // EntityRef ::= '&' Name ';'
  617. // 4.1.66. CharRef
  618. // CharRef ::= '&#' [0-9]+ ';'
  619. // | '&#x' [0-9a-fA-F]+ ';'
  620. auto reference_start = m_lexer.tell();
  621. TRY(expect("&"));
  622. auto accept = accept_rule();
  623. auto name_result = parse_name();
  624. if (name_result.is_error()) {
  625. TRY(expect("#"));
  626. u32 code_point;
  627. if (m_lexer.consume_specific('x')) {
  628. auto hex = TRY(expect_many(
  629. ranges_for_search<Range('0', '9'), Range('a', 'f'), Range('A', 'F')>(),
  630. "any of [0-9a-fA-F]"));
  631. code_point = *AK::StringUtils::convert_to_uint_from_hex<u32>(hex);
  632. } else {
  633. auto decimal = TRY(expect_many(
  634. ranges_for_search<Range('0', '9')>(),
  635. "any of [0-9]"));
  636. code_point = *decimal.to_uint<u32>();
  637. }
  638. if (!s_characters.contains(code_point))
  639. return parse_error(reference_start, "Invalid character reference");
  640. TRY(expect(";"));
  641. StringBuilder builder;
  642. builder.append_code_point(code_point);
  643. rollback.disarm();
  644. return builder.to_string();
  645. }
  646. auto name = name_result.release_value();
  647. TRY(expect(";"));
  648. rollback.disarm();
  649. return EntityReference { move(name) };
  650. }
  651. // 3.1.40 STag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-STag
  652. ErrorOr<NonnullOwnPtr<Node>, ParseError> Parser::parse_start_tag()
  653. {
  654. auto rollback = rollback_point();
  655. auto rule = enter_rule();
  656. // STag ::= '<' Name (S Attribute)* S? '>'
  657. TRY(expect("<"));
  658. auto accept = accept_rule();
  659. auto name = TRY(parse_name());
  660. HashMap<Name, String> attributes;
  661. while (true) {
  662. if (auto result = skip_whitespace(Required::Yes); result.is_error())
  663. break;
  664. if (auto result = parse_attribute(); !result.is_error()) {
  665. auto attribute = result.release_value();
  666. attributes.set(move(attribute.name), move(attribute.value));
  667. } else {
  668. break;
  669. }
  670. }
  671. TRY(skip_whitespace());
  672. TRY(expect(">"));
  673. rollback.disarm();
  674. return make<Node>(Node::Element { move(name), move(attributes), {} });
  675. }
  676. // 3.1.42 ETag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-ETag
  677. ErrorOr<Name, ParseError> Parser::parse_end_tag()
  678. {
  679. auto rollback = rollback_point();
  680. auto rule = enter_rule();
  681. // ETag ::= '</' Name S? '>'
  682. TRY(expect("</"));
  683. auto accept = accept_rule();
  684. auto name = TRY(parse_name());
  685. TRY(skip_whitespace());
  686. TRY(expect(">"));
  687. rollback.disarm();
  688. return name;
  689. }
  690. // 3.1.42 content, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-content
  691. ErrorOr<void, ParseError> Parser::parse_content()
  692. {
  693. auto rollback = rollback_point();
  694. auto rule = enter_rule();
  695. // content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
  696. if (auto result = parse_char_data(); !result.is_error())
  697. append_text(result.release_value());
  698. while (true) {
  699. if (auto result = parse_element(); !result.is_error())
  700. goto try_char_data;
  701. if (auto result = parse_reference(); !result.is_error()) {
  702. auto reference = result.release_value();
  703. if (auto char_reference = reference.get_pointer<String>())
  704. append_text(*char_reference);
  705. else
  706. TRY(resolve_reference(reference.get<EntityReference>(), ReferencePlacement::Content));
  707. goto try_char_data;
  708. }
  709. if (auto result = parse_cdata_section(); !result.is_error()) {
  710. if (m_options.preserve_cdata)
  711. append_text(result.release_value());
  712. goto try_char_data;
  713. }
  714. if (auto result = parse_processing_instruction(); !result.is_error())
  715. goto try_char_data;
  716. if (auto result = parse_comment(); !result.is_error())
  717. goto try_char_data;
  718. break;
  719. try_char_data:;
  720. if (auto result = parse_char_data(); !result.is_error())
  721. append_text(result.release_value());
  722. }
  723. rollback.disarm();
  724. return {};
  725. }
  726. // 2.4.14 CharData, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-CharData
  727. ErrorOr<StringView, ParseError> Parser::parse_char_data()
  728. {
  729. auto rollback = rollback_point();
  730. auto rule = enter_rule();
  731. // CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
  732. auto cend_state = 0; // 1: ], 2: ], 3: >
  733. auto text = m_lexer.consume_while([&](auto ch) {
  734. if (ch == '<' || ch == '&' || cend_state == 3)
  735. return false;
  736. switch (cend_state) {
  737. case 0:
  738. case 1:
  739. if (ch == ']')
  740. cend_state++;
  741. else
  742. cend_state = 0;
  743. return true;
  744. case 2:
  745. if (ch == '>') {
  746. cend_state++;
  747. return true;
  748. }
  749. cend_state = 0;
  750. return true;
  751. default:
  752. VERIFY_NOT_REACHED();
  753. }
  754. });
  755. if (cend_state == 3) {
  756. m_lexer.retreat(3);
  757. text = text.substring_view(0, text.length() - 3);
  758. }
  759. rollback.disarm();
  760. return text;
  761. }
  762. // 2.8.28b intSubset, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-intSubset
  763. ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_internal_subset()
  764. {
  765. auto rollback = rollback_point();
  766. auto rule = enter_rule();
  767. Vector<MarkupDeclaration> declarations;
  768. // intSubset ::= (markupdecl | DeclSep)*
  769. while (true) {
  770. if (auto result = parse_markup_declaration(); !result.is_error()) {
  771. auto maybe_declaration = result.release_value();
  772. if (maybe_declaration.has_value())
  773. declarations.append(maybe_declaration.release_value());
  774. continue;
  775. }
  776. if (auto result = parse_declaration_separator(); !result.is_error()) {
  777. // The markup declarations may be made up in whole or in part of the replacement text of parameter entities.
  778. // The replacement text of a parameter entity reference in a DeclSep MUST match the production extSubsetDecl.
  779. auto maybe_replacement_text = result.release_value();
  780. if (maybe_replacement_text.has_value()) {
  781. TemporaryChange<StringView> source { m_source, maybe_replacement_text.value() };
  782. TemporaryChange lexer { m_lexer, GenericLexer { m_source } };
  783. auto contained_declarations = TRY(parse_external_subset_declaration());
  784. declarations.extend(move(contained_declarations));
  785. }
  786. continue;
  787. }
  788. break;
  789. }
  790. rollback.disarm();
  791. return declarations;
  792. }
  793. // 2.8.29 markupdecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-markupdecl
  794. ErrorOr<Optional<MarkupDeclaration>, ParseError> Parser::parse_markup_declaration()
  795. {
  796. auto rollback = rollback_point();
  797. auto rule = enter_rule();
  798. // markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment
  799. if (auto result = parse_element_declaration(); !result.is_error()) {
  800. rollback.disarm();
  801. return MarkupDeclaration { result.release_value() };
  802. }
  803. if (auto result = parse_attribute_list_declaration(); !result.is_error()) {
  804. rollback.disarm();
  805. return MarkupDeclaration { result.release_value() };
  806. }
  807. if (auto result = parse_entity_declaration(); !result.is_error()) {
  808. rollback.disarm();
  809. return MarkupDeclaration { result.release_value() };
  810. }
  811. if (auto result = parse_notation_declaration(); !result.is_error()) {
  812. rollback.disarm();
  813. return MarkupDeclaration { result.release_value() };
  814. }
  815. if (auto result = parse_processing_instruction(); !result.is_error()) {
  816. rollback.disarm();
  817. return Optional<MarkupDeclaration> {};
  818. }
  819. if (auto result = parse_comment(); !result.is_error()) {
  820. rollback.disarm();
  821. return Optional<MarkupDeclaration> {};
  822. }
  823. return parse_error(m_lexer.tell(), "Expected one of elementdecl, attlistdecl, entitydecl, notationdecl, PI or comment");
  824. }
  825. // 2.8.28a DeclSep, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-DeclSep
  826. ErrorOr<Optional<String>, ParseError> Parser::parse_declaration_separator()
  827. {
  828. auto rollback = rollback_point();
  829. auto rule = enter_rule();
  830. // DeclSep ::= PEReference | S
  831. if (auto name = parse_parameter_entity_reference(); !name.is_error()) {
  832. rollback.disarm();
  833. // FIXME: Resolve this PEReference.
  834. return "";
  835. }
  836. if (auto result = skip_whitespace(Required::Yes); !result.is_error()) {
  837. rollback.disarm();
  838. return Optional<String> {};
  839. }
  840. return parse_error(m_lexer.tell(), "Expected either whitespace, or a PEReference");
  841. }
  842. // 4.1.69 PEReference, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PEReference
  843. ErrorOr<Name, ParseError> Parser::parse_parameter_entity_reference()
  844. {
  845. auto rollback = rollback_point();
  846. auto rule = enter_rule();
  847. // PEReference ::= '%' Name ';'
  848. TRY(expect("%"));
  849. auto accept = accept_rule();
  850. auto name = TRY(parse_name());
  851. TRY(expect(";"));
  852. rollback.disarm();
  853. return name;
  854. }
  855. // 3.2.46 elementdecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-elementdecl
  856. ErrorOr<ElementDeclaration, ParseError> Parser::parse_element_declaration()
  857. {
  858. auto rollback = rollback_point();
  859. auto rule = enter_rule();
  860. // FIXME: Apparently both name _and_ contentspec here are allowed to be PEReferences,
  861. // but the grammar does not allow that, figure this out.
  862. // elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'
  863. TRY(expect("<!ELEMENT"));
  864. auto accept = accept_rule();
  865. TRY(skip_whitespace(Required::Yes));
  866. auto name = TRY(parse_name());
  867. TRY(skip_whitespace(Required::Yes));
  868. auto spec = TRY(parse_content_spec());
  869. TRY(expect(">"));
  870. rollback.disarm();
  871. return ElementDeclaration {
  872. move(name),
  873. move(spec),
  874. };
  875. }
  876. // 3.3.52 AttlistDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttlistDecl
  877. ErrorOr<AttributeListDeclaration, ParseError> Parser::parse_attribute_list_declaration()
  878. {
  879. auto rollback = rollback_point();
  880. auto rule = enter_rule();
  881. AttributeListDeclaration declaration;
  882. // AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
  883. TRY(expect("<!ATTLIST"));
  884. auto accept = accept_rule();
  885. TRY(skip_whitespace(Required::Yes));
  886. declaration.type = TRY(parse_name());
  887. while (true) {
  888. if (auto result = parse_attribute_definition(); !result.is_error())
  889. declaration.attributes.append(result.release_value());
  890. else
  891. break;
  892. }
  893. TRY(skip_whitespace());
  894. TRY(expect(">"));
  895. rollback.disarm();
  896. return declaration;
  897. }
  898. // 3.3.53 AttDef, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttDef
  899. ErrorOr<AttributeListDeclaration::Definition, ParseError> Parser::parse_attribute_definition()
  900. {
  901. auto rollback = rollback_point();
  902. auto rule = enter_rule();
  903. Optional<AttributeListDeclaration::Type> type;
  904. Optional<AttributeListDeclaration::Default> default_;
  905. // AttDef ::= S Name S AttType S DefaultDecl
  906. TRY(skip_whitespace(Required::Yes));
  907. auto name = TRY(parse_name());
  908. auto accept = accept_rule();
  909. TRY(skip_whitespace(Required::Yes));
  910. // AttType ::= StringType | TokenizedType | EnumeratedType
  911. // StringType ::= 'CDATA'
  912. // TokenizedType ::= 'ID'
  913. // | 'IDREF'
  914. // | 'IDREFS'
  915. // | 'ENTITY'
  916. // | 'ENTITIES'
  917. // | 'NMTOKEN'
  918. // | 'NMTOKENS'
  919. // EnumeratedType ::= NotationType | Enumeration
  920. // NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
  921. // Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
  922. if (m_lexer.consume_specific("CDATA")) {
  923. type = AttributeListDeclaration::StringType::CData;
  924. } else if (m_lexer.consume_specific("IDREFS")) {
  925. type = AttributeListDeclaration::TokenizedType::IDRefs;
  926. } else if (m_lexer.consume_specific("IDREF")) {
  927. type = AttributeListDeclaration::TokenizedType::IDRef;
  928. } else if (m_lexer.consume_specific("ID")) {
  929. type = AttributeListDeclaration::TokenizedType::ID;
  930. } else if (m_lexer.consume_specific("ENTITIES")) {
  931. type = AttributeListDeclaration::TokenizedType::Entities;
  932. } else if (m_lexer.consume_specific("ENTITY")) {
  933. type = AttributeListDeclaration::TokenizedType::Entity;
  934. } else if (m_lexer.consume_specific("NMTOKENS")) {
  935. type = AttributeListDeclaration::TokenizedType::NMTokens;
  936. } else if (m_lexer.consume_specific("NMTOKEN")) {
  937. type = AttributeListDeclaration::TokenizedType::NMToken;
  938. } else if (m_lexer.consume_specific("NOTATION")) {
  939. HashTable<Name> names;
  940. TRY(skip_whitespace(Required::Yes));
  941. TRY(expect("("));
  942. TRY(skip_whitespace());
  943. names.set(TRY(parse_name()));
  944. while (true) {
  945. TRY(skip_whitespace());
  946. if (auto result = expect("|"); result.is_error())
  947. break;
  948. TRY(skip_whitespace());
  949. names.set(TRY(parse_name()));
  950. }
  951. TRY(skip_whitespace());
  952. TRY(expect(")"));
  953. type = AttributeListDeclaration::NotationType { move(names) };
  954. } else {
  955. HashTable<String> names;
  956. TRY(expect("("));
  957. TRY(skip_whitespace());
  958. names.set(TRY(parse_nm_token()));
  959. while (true) {
  960. TRY(skip_whitespace());
  961. if (auto result = expect("|"); result.is_error())
  962. break;
  963. TRY(skip_whitespace());
  964. names.set(TRY(parse_nm_token()));
  965. }
  966. TRY(skip_whitespace());
  967. TRY(expect(")"));
  968. type = AttributeListDeclaration::Enumeration { move(names) };
  969. }
  970. TRY(skip_whitespace(Required::Yes));
  971. // DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
  972. // | (('#FIXED' S)? AttValue)
  973. if (m_lexer.consume_specific("#REQUIRED")) {
  974. default_ = AttributeListDeclaration::Required {};
  975. } else if (m_lexer.consume_specific("#IMPLIED")) {
  976. default_ = AttributeListDeclaration::Implied {};
  977. } else {
  978. bool fixed = false;
  979. if (m_lexer.consume_specific("#FIXED")) {
  980. TRY(skip_whitespace(Required::Yes));
  981. fixed = true;
  982. }
  983. auto value = TRY(parse_attribute_value());
  984. if (fixed)
  985. default_ = AttributeListDeclaration::Fixed { move(value) };
  986. else
  987. default_ = AttributeListDeclaration::DefaultValue { move(value) };
  988. }
  989. rollback.disarm();
  990. return AttributeListDeclaration::Definition {
  991. move(name),
  992. type.release_value(),
  993. default_.release_value(),
  994. };
  995. }
  996. // 2.3.7 Nmtoken, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Nmtoken
  997. ErrorOr<StringView, ParseError> Parser::parse_nm_token()
  998. {
  999. auto rollback = rollback_point();
  1000. auto rule = enter_rule();
  1001. // Nmtoken ::= (NameChar)+
  1002. auto token = TRY(expect_many(s_name_characters, "a NameChar"));
  1003. rollback.disarm();
  1004. return token;
  1005. }
  1006. // 4.7.82 NotationDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#Notations
  1007. ErrorOr<NotationDeclaration, ParseError> Parser::parse_notation_declaration()
  1008. {
  1009. auto rollback = rollback_point();
  1010. auto rule = enter_rule();
  1011. Variant<ExternalID, PublicID, Empty> notation;
  1012. // NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>'
  1013. TRY(expect("<!NOTATION"));
  1014. auto accept = accept_rule();
  1015. TRY(skip_whitespace(Required::Yes));
  1016. auto name = TRY(parse_name());
  1017. TRY(skip_whitespace(Required::Yes));
  1018. if (auto result = parse_external_id(); !result.is_error())
  1019. notation = result.release_value();
  1020. else
  1021. notation = TRY(parse_public_id());
  1022. TRY(expect(">"));
  1023. rollback.disarm();
  1024. return NotationDeclaration {
  1025. move(name),
  1026. move(notation).downcast<ExternalID, PublicID>(),
  1027. };
  1028. }
  1029. // 3.2.46 contentspec, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-contentspec
  1030. ErrorOr<ElementDeclaration::ContentSpec, ParseError> Parser::parse_content_spec()
  1031. {
  1032. auto rollback = rollback_point();
  1033. auto rule = enter_rule();
  1034. Optional<ElementDeclaration::ContentSpec> content_spec;
  1035. // contentspec ::= 'EMPTY' | 'ANY' | Mixed | children
  1036. if (m_lexer.consume_specific("EMPTY")) {
  1037. content_spec = ElementDeclaration::Empty {};
  1038. } else if (m_lexer.consume_specific("ANY")) {
  1039. content_spec = ElementDeclaration::Any {};
  1040. } else {
  1041. TRY(expect("("));
  1042. TRY(skip_whitespace());
  1043. if (m_lexer.consume_specific("#PCDATA")) {
  1044. HashTable<Name> names;
  1045. // Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*'
  1046. // | '(' S? '#PCDATA' S? ')'
  1047. TRY(skip_whitespace());
  1048. if (m_lexer.consume_specific(")*")) {
  1049. content_spec = ElementDeclaration::Mixed { .types = {}, .many = true };
  1050. } else if (m_lexer.consume_specific(')')) {
  1051. content_spec = ElementDeclaration::Mixed { .types = {}, .many = false };
  1052. } else {
  1053. while (true) {
  1054. TRY(skip_whitespace());
  1055. if (!m_lexer.consume_specific('|'))
  1056. break;
  1057. TRY(skip_whitespace());
  1058. if (auto result = parse_name(); !result.is_error())
  1059. names.set(result.release_value());
  1060. else
  1061. return parse_error(m_lexer.tell(), "Expected a Name");
  1062. }
  1063. TRY(skip_whitespace());
  1064. TRY(expect(")*"));
  1065. content_spec = ElementDeclaration::Mixed { .types = move(names), .many = true };
  1066. }
  1067. } else {
  1068. while (!m_lexer.next_is('('))
  1069. m_lexer.retreat();
  1070. // children ::= (choice | seq) ('?' | '*' | '+')?
  1071. // cp ::= (Name | choice | seq) ('?' | '*' | '+')?
  1072. // choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')'
  1073. // seq ::= '(' S? cp ( S? ',' S? cp )* S? ')'
  1074. Function<ErrorOr<ElementDeclaration::Children::Choice, ParseError>()> parse_choice;
  1075. Function<ErrorOr<ElementDeclaration::Children::Sequence, ParseError>()> parse_sequence;
  1076. auto parse_cp_init = [&]() -> ErrorOr<Variant<Name, ElementDeclaration::Children::Choice, ElementDeclaration::Children::Sequence>, ParseError> {
  1077. if (auto result = parse_name(); !result.is_error())
  1078. return result.release_value();
  1079. if (auto result = parse_choice(); !result.is_error())
  1080. return result.release_value();
  1081. return TRY(parse_sequence());
  1082. };
  1083. auto parse_qualifier = [&]() -> ElementDeclaration::Children::Qualifier {
  1084. ElementDeclaration::Children::Qualifier qualifier { ElementDeclaration::Children::Qualifier::ExactlyOnce };
  1085. if (m_lexer.consume_specific('?'))
  1086. qualifier = ElementDeclaration::Children::Qualifier::Optional;
  1087. else if (m_lexer.consume_specific('*'))
  1088. qualifier = ElementDeclaration::Children::Qualifier::Any;
  1089. else if (m_lexer.consume_specific('+'))
  1090. qualifier = ElementDeclaration::Children::Qualifier::OneOrMore;
  1091. return qualifier;
  1092. };
  1093. auto parse_cp = [&]() -> ErrorOr<ElementDeclaration::Children::Entry, ParseError> {
  1094. auto sub_entry = TRY(parse_cp_init());
  1095. auto qualifier = parse_qualifier();
  1096. return ElementDeclaration::Children::Entry {
  1097. move(sub_entry),
  1098. qualifier,
  1099. };
  1100. };
  1101. parse_choice = [&]() -> ErrorOr<ElementDeclaration::Children::Choice, ParseError> {
  1102. auto rollback = rollback_point();
  1103. auto rule = enter_rule();
  1104. TRY(expect("("));
  1105. auto accept = accept_rule();
  1106. TRY(skip_whitespace());
  1107. Vector<ElementDeclaration::Children::Entry> choices;
  1108. choices.append(TRY(parse_cp()));
  1109. while (true) {
  1110. TRY(skip_whitespace());
  1111. if (!m_lexer.consume_specific('|'))
  1112. break;
  1113. TRY(skip_whitespace());
  1114. choices.append(TRY(parse_cp()));
  1115. }
  1116. TRY(expect(")"));
  1117. if (choices.size() < 2)
  1118. return parse_error(m_lexer.tell(), "Expected more than one choice");
  1119. TRY(skip_whitespace());
  1120. auto qualifier = parse_qualifier();
  1121. rollback.disarm();
  1122. return ElementDeclaration::Children::Choice {
  1123. move(choices),
  1124. qualifier,
  1125. };
  1126. };
  1127. parse_sequence = [&]() -> ErrorOr<ElementDeclaration::Children::Sequence, ParseError> {
  1128. auto rollback = rollback_point();
  1129. auto rule = enter_rule();
  1130. TRY(expect("("));
  1131. auto accept = accept_rule();
  1132. TRY(skip_whitespace());
  1133. Vector<ElementDeclaration::Children::Entry> entries;
  1134. entries.append(TRY(parse_cp()));
  1135. while (true) {
  1136. TRY(skip_whitespace());
  1137. if (!m_lexer.consume_specific(','))
  1138. break;
  1139. TRY(skip_whitespace());
  1140. entries.append(TRY(parse_cp()));
  1141. }
  1142. TRY(expect(")"));
  1143. TRY(skip_whitespace());
  1144. auto qualifier = parse_qualifier();
  1145. rollback.disarm();
  1146. return ElementDeclaration::Children::Sequence {
  1147. move(entries),
  1148. qualifier,
  1149. };
  1150. };
  1151. if (auto result = parse_choice(); !result.is_error()) {
  1152. auto qualifier = parse_qualifier();
  1153. content_spec = ElementDeclaration::Children {
  1154. result.release_value(),
  1155. qualifier,
  1156. };
  1157. } else {
  1158. auto sequence = TRY(parse_sequence());
  1159. auto qualifier = parse_qualifier();
  1160. content_spec = ElementDeclaration::Children {
  1161. move(sequence),
  1162. qualifier,
  1163. };
  1164. }
  1165. }
  1166. }
  1167. rollback.disarm();
  1168. return content_spec.release_value();
  1169. }
  1170. // 2.8.31 extSubsetDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-extSubsetDecl
  1171. ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_external_subset_declaration()
  1172. {
  1173. auto rollback = rollback_point();
  1174. auto rule = enter_rule();
  1175. Vector<MarkupDeclaration> declarations;
  1176. // extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep )*
  1177. while (true) {
  1178. if (auto result = parse_markup_declaration(); !result.is_error()) {
  1179. if (result.value().has_value())
  1180. declarations.append(result.release_value().release_value());
  1181. continue;
  1182. }
  1183. // FIXME: conditionalSect
  1184. if (auto result = parse_declaration_separator(); !result.is_error())
  1185. continue;
  1186. break;
  1187. }
  1188. rollback.disarm();
  1189. return declarations;
  1190. }
  1191. // 4.2.70 EntityDecl, https://www.w3.org/TR/xml/#NT-EntityDecl
  1192. ErrorOr<EntityDeclaration, ParseError> Parser::parse_entity_declaration()
  1193. {
  1194. // EntityDecl ::= GEDecl | PEDecl
  1195. if (auto result = parse_general_entity_declaration(); !result.is_error())
  1196. return result;
  1197. return parse_parameter_entity_declaration();
  1198. }
  1199. // 4.2.71 GEDecl, https://www.w3.org/TR/xml/#NT-GEDecl
  1200. ErrorOr<EntityDeclaration, ParseError> Parser::parse_general_entity_declaration()
  1201. {
  1202. auto rollback = rollback_point();
  1203. auto rule = enter_rule();
  1204. Variant<String, EntityDefinition, Empty> definition;
  1205. // GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
  1206. TRY(expect("<!ENTITY"));
  1207. auto accept = accept_rule();
  1208. TRY(skip_whitespace(Required::Yes));
  1209. auto name = TRY(parse_name());
  1210. TRY(skip_whitespace(Required::Yes));
  1211. // EntityDef ::= EntityValue | (ExternalID NDataDecl?)
  1212. if (auto result = parse_entity_value(); !result.is_error()) {
  1213. definition = result.release_value();
  1214. } else {
  1215. auto external_id = TRY(parse_external_id());
  1216. Optional<Name> notation;
  1217. if (auto notation_result = parse_notation_data_declaration(); !notation_result.is_error())
  1218. notation = notation_result.release_value();
  1219. definition = EntityDefinition {
  1220. move(external_id),
  1221. move(notation),
  1222. };
  1223. }
  1224. TRY(skip_whitespace());
  1225. TRY(expect(">"));
  1226. rollback.disarm();
  1227. return GEDeclaration {
  1228. move(name),
  1229. move(definition).downcast<String, EntityDefinition>(),
  1230. };
  1231. }
  1232. // 4.2.72 PEDecl, https://www.w3.org/TR/xml/#NT-PEDecl
  1233. ErrorOr<EntityDeclaration, ParseError> Parser::parse_parameter_entity_declaration()
  1234. {
  1235. auto rollback = rollback_point();
  1236. auto rule = enter_rule();
  1237. Variant<String, ExternalID, Empty> definition;
  1238. // PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
  1239. TRY(expect("<!ENTITY"));
  1240. auto accept = accept_rule();
  1241. TRY(skip_whitespace(Required::Yes));
  1242. TRY(expect("%"));
  1243. TRY(skip_whitespace(Required::Yes));
  1244. auto name = TRY(parse_name());
  1245. TRY(skip_whitespace(Required::Yes));
  1246. // PEDef ::= EntityValue | ExternalID
  1247. if (auto result = parse_entity_value(); !result.is_error())
  1248. definition = result.release_value();
  1249. else
  1250. definition = TRY(parse_external_id());
  1251. TRY(skip_whitespace());
  1252. TRY(expect(">"));
  1253. rollback.disarm();
  1254. return PEDeclaration {
  1255. move(name),
  1256. move(definition).downcast<String, ExternalID>(),
  1257. };
  1258. }
  1259. // 4.7.83 PublicID, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PublicID
  1260. ErrorOr<PublicID, ParseError> Parser::parse_public_id()
  1261. {
  1262. auto rollback = rollback_point();
  1263. auto rule = enter_rule();
  1264. // PublicID ::= 'PUBLIC' S PubidLiteral
  1265. TRY(expect("PUBLIC"));
  1266. auto accept = accept_rule();
  1267. TRY(skip_whitespace(Required::Yes));
  1268. auto text = TRY(parse_public_id_literal());
  1269. rollback.disarm();
  1270. return PublicID {
  1271. text,
  1272. };
  1273. }
  1274. constexpr static auto s_public_id_characters = set_to_search<StringSet("\x20\x0d\x0a-'()+,./:=?;!*#@$_%")>().unify(ranges_for_search<Range('a', 'z'), Range('A', 'Z'), Range('0', '9')>());
  1275. // 2.3.12, PubidLiteral, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidLiteral
  1276. ErrorOr<StringView, ParseError> Parser::parse_public_id_literal()
  1277. {
  1278. auto rollback = rollback_point();
  1279. auto rule = enter_rule();
  1280. // PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
  1281. auto quote = TRY(expect(is_any_of("'\""), "any of ' or \""));
  1282. auto accept = accept_rule();
  1283. auto id = TRY(expect_many(
  1284. [q = quote[0]](auto x) {
  1285. return (q == '\'' ? x != '\'' : true) && s_public_id_characters.contains(x);
  1286. },
  1287. "a PubidChar"));
  1288. TRY(expect(quote));
  1289. rollback.disarm();
  1290. return id;
  1291. }
  1292. // 2.3.11 SystemLiteral, https://www.w3.org/TR/xml/#NT-SystemLiteral
  1293. ErrorOr<StringView, ParseError> Parser::parse_system_id_literal()
  1294. {
  1295. auto rollback = rollback_point();
  1296. auto rule = enter_rule();
  1297. // SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
  1298. auto quote = TRY(expect(is_any_of("'\""), "any of ' or \""));
  1299. auto accept = accept_rule();
  1300. auto id = TRY(expect_many(is_not_any_of(quote), "not a quote"));
  1301. TRY(expect(quote));
  1302. rollback.disarm();
  1303. return id;
  1304. }
  1305. // 4.2.75 ExternalID, https://www.w3.org/TR/xml/#NT-ExternalID
  1306. ErrorOr<ExternalID, ParseError> Parser::parse_external_id()
  1307. {
  1308. auto rollback = rollback_point();
  1309. auto rule = enter_rule();
  1310. // ExternalID ::= 'SYSTEM' S SystemLiteral
  1311. // | 'PUBLIC' S PubidLiteral S SystemLiteral
  1312. Optional<PublicID> public_id;
  1313. SystemID system_id;
  1314. if (m_lexer.consume_specific("SYSTEM")) {
  1315. auto accept = accept_rule();
  1316. TRY(skip_whitespace(Required::Yes));
  1317. system_id = SystemID { TRY(parse_system_id_literal()) };
  1318. } else {
  1319. TRY(expect("PUBLIC"));
  1320. auto accept = accept_rule();
  1321. TRY(skip_whitespace(Required::Yes));
  1322. public_id = PublicID { TRY(parse_public_id_literal()) };
  1323. TRY(skip_whitespace(Required::Yes));
  1324. system_id = SystemID { TRY(parse_system_id_literal()) };
  1325. }
  1326. rollback.disarm();
  1327. return ExternalID {
  1328. move(public_id),
  1329. move(system_id),
  1330. };
  1331. }
  1332. // 4.2.2.76 NDataDecl, https://www.w3.org/TR/xml/#NT-NDataDecl
  1333. ErrorOr<Name, ParseError> Parser::parse_notation_data_declaration()
  1334. {
  1335. auto rollback = rollback_point();
  1336. auto rule = enter_rule();
  1337. // NDataDecl ::= S 'NDATA' S Name
  1338. TRY(skip_whitespace(Required::Yes));
  1339. auto accept = accept_rule();
  1340. TRY(expect("NDATA"));
  1341. TRY(skip_whitespace(Required::Yes));
  1342. auto name = TRY(parse_name());
  1343. rollback.disarm();
  1344. return name;
  1345. }
  1346. // 2.3.9 EntityValue, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityValue
  1347. ErrorOr<String, ParseError> Parser::parse_entity_value()
  1348. {
  1349. auto rollback = rollback_point();
  1350. auto rule = enter_rule();
  1351. StringBuilder builder;
  1352. // EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
  1353. // | "'" ([^%&'] | PEReference | Reference)* "'"
  1354. auto quote = TRY(expect(is_any_of("'\""), "any of ' or \""));
  1355. auto accept = accept_rule();
  1356. while (true) {
  1357. if (m_lexer.is_eof())
  1358. break;
  1359. if (m_lexer.next_is(quote))
  1360. break;
  1361. if (m_lexer.next_is('%')) {
  1362. auto start = m_lexer.tell();
  1363. TRY(parse_parameter_entity_reference());
  1364. builder.append(m_source.substring_view(start, m_lexer.tell() - start));
  1365. continue;
  1366. }
  1367. if (m_lexer.next_is('&')) {
  1368. auto start = m_lexer.tell();
  1369. TRY(parse_reference());
  1370. builder.append(m_source.substring_view(start, m_lexer.tell() - start));
  1371. continue;
  1372. }
  1373. builder.append(m_lexer.consume());
  1374. }
  1375. TRY(expect(quote));
  1376. rollback.disarm();
  1377. return builder.to_string();
  1378. }
  1379. // 2.7.18 CDSect, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-CDSect
  1380. ErrorOr<StringView, ParseError> Parser::parse_cdata_section()
  1381. {
  1382. auto rollback = rollback_point();
  1383. auto rule = enter_rule();
  1384. // CDSect ::= CDStart CData CDEnd
  1385. // CDStart ::= '<![CDATA['
  1386. // CData ::= (Char* - (Char* ']]>' Char*))
  1387. // CDEnd ::= ']]>'
  1388. TRY(expect("<![CDATA["));
  1389. auto accept = accept_rule();
  1390. auto section_start = m_lexer.tell();
  1391. while (!m_lexer.next_is("]]>")) {
  1392. if (m_lexer.is_eof())
  1393. break;
  1394. m_lexer.ignore();
  1395. }
  1396. auto section_end = m_lexer.tell();
  1397. TRY(expect("]]>"));
  1398. rollback.disarm();
  1399. return m_source.substring_view(section_start, section_end - section_start);
  1400. }
  1401. // 2.8.30 extSubset, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-extSubset
  1402. ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_external_subset()
  1403. {
  1404. auto rollback = rollback_point();
  1405. auto rule = enter_rule();
  1406. // extSubset ::= TextDecl? extSubsetDecl
  1407. (void)parse_text_declaration();
  1408. auto result = TRY(parse_external_subset_declaration());
  1409. rollback.disarm();
  1410. return result;
  1411. }
  1412. // 4.3.1.77 TextDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-TextDecl
  1413. ErrorOr<void, ParseError> Parser::parse_text_declaration()
  1414. {
  1415. auto rollback = rollback_point();
  1416. auto rule = enter_rule();
  1417. // TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
  1418. TRY(expect("<?xml"));
  1419. auto accept = accept_rule();
  1420. (void)parse_version_info();
  1421. TRY(parse_encoding_decl());
  1422. TRY(skip_whitespace());
  1423. TRY(expect("?>"));
  1424. rollback.disarm();
  1425. return {};
  1426. }
  1427. ErrorOr<String, ParseError> Parser::resolve_reference(EntityReference const& reference, ReferencePlacement placement)
  1428. {
  1429. static HashTable<Name> reference_lookup {};
  1430. if (reference_lookup.contains(reference.name))
  1431. return parse_error(m_lexer.tell(), String::formatted("Invalid recursive definition for '{}'", reference.name));
  1432. reference_lookup.set(reference.name);
  1433. ScopeGuard remove_lookup {
  1434. [&] {
  1435. reference_lookup.remove(reference.name);
  1436. }
  1437. };
  1438. Optional<String> resolved;
  1439. if (m_doctype.has_value()) {
  1440. // FIXME: Split these up and resolve them ahead of time.
  1441. for (auto& declaration : m_doctype->markup_declarations) {
  1442. auto entity = declaration.get_pointer<EntityDeclaration>();
  1443. if (!entity)
  1444. continue;
  1445. auto ge_declaration = entity->get_pointer<GEDeclaration>();
  1446. if (!ge_declaration)
  1447. continue;
  1448. if (ge_declaration->name != reference.name)
  1449. continue;
  1450. TRY(ge_declaration->definition.visit(
  1451. [&](String const& definition) -> ErrorOr<void, ParseError> {
  1452. resolved = definition;
  1453. return {};
  1454. },
  1455. [&](EntityDefinition const& definition) -> ErrorOr<void, ParseError> {
  1456. if (placement == ReferencePlacement::AttributeValue)
  1457. return parse_error(m_lexer.tell(), String::formatted("Attribute references external entity '{}'", reference.name));
  1458. if (definition.notation.has_value())
  1459. return parse_error(0u, String::formatted("Entity reference to unparsed entity '{}'", reference.name));
  1460. if (!m_options.resolve_external_resource)
  1461. return parse_error(0u, String::formatted("Failed to resolve external entity '{}'", reference.name));
  1462. auto result = m_options.resolve_external_resource(definition.id.system_id, definition.id.public_id);
  1463. if (result.is_error())
  1464. return parse_error(0u, String::formatted("Failed to resolve external entity '{}': {}", reference.name, result.error()));
  1465. resolved = result.release_value();
  1466. return {};
  1467. }));
  1468. break;
  1469. }
  1470. }
  1471. if (!resolved.has_value()) {
  1472. if (reference.name == "amp")
  1473. return "&";
  1474. if (reference.name == "lt")
  1475. return "<";
  1476. if (reference.name == "gt")
  1477. return ">";
  1478. if (reference.name == "apos")
  1479. return "'";
  1480. if (reference.name == "quot")
  1481. return "\"";
  1482. return parse_error(0u, String::formatted("Reference to undeclared entity '{}'", reference.name));
  1483. }
  1484. StringView resolved_source = *resolved;
  1485. TemporaryChange source { m_source, resolved_source };
  1486. TemporaryChange lexer { m_lexer, GenericLexer(m_source) };
  1487. switch (placement) {
  1488. case ReferencePlacement::AttributeValue:
  1489. return TRY(parse_attribute_value_inner(""));
  1490. case ReferencePlacement::Content:
  1491. TRY(parse_content());
  1492. return "";
  1493. default:
  1494. VERIFY_NOT_REACHED();
  1495. }
  1496. }
  1497. }