Parser.cpp 58 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781
  1. /*
  2. * Copyright (c) 2022, Ali Mohammad Pur <mpfard@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <LibXML/DOM/Document.h>
  7. #include <LibXML/Parser/Parser.h>
  8. struct Range {
  9. consteval Range(u32 start, u32 end)
  10. : start(start)
  11. , end(end)
  12. {
  13. }
  14. u32 start;
  15. u32 end;
  16. };
  17. template<auto... ranges>
  18. struct ranges_for_search {
  19. auto contains(u32 value) const
  20. {
  21. return ((value >= ranges.start && value <= ranges.end) || ...);
  22. }
  23. bool operator()(u32 value) const
  24. {
  25. return contains(value);
  26. }
  27. template<auto... ranges_to_include>
  28. consteval auto with() const
  29. {
  30. return ranges_for_search<ranges..., ranges_to_include...>();
  31. }
  32. template<auto... ranges_to_include>
  33. consteval auto unify(ranges_for_search<ranges_to_include...> const&) const
  34. {
  35. return ranges_for_search<ranges..., ranges_to_include...>();
  36. }
  37. };
  38. template<size_t Count, typename Element>
  39. struct StringSet {
  40. consteval StringSet(Element const (&entries)[Count])
  41. {
  42. for (size_t i = 0; i < Count - 1; ++i)
  43. elements[i] = entries[i];
  44. }
  45. consteval auto operator[](size_t i) const { return elements[i]; }
  46. Element elements[Count - 1];
  47. };
  48. template<StringSet chars>
  49. consteval static auto set_to_search()
  50. {
  51. return ([&]<auto... Ix>(IndexSequence<Ix...>) {
  52. return ranges_for_search<Range(chars[Ix], chars[Ix])...>();
  53. }(MakeIndexSequence<array_size(chars.elements)>()));
  54. }
  55. namespace XML {
  56. size_t Parser::s_debug_indent_level { 0 };
  57. void Parser::append_node(NonnullOwnPtr<Node> node)
  58. {
  59. if (m_entered_node) {
  60. m_entered_node->content.get<Node::Element>().children.append(move(node));
  61. } else {
  62. m_root_node = move(node);
  63. m_entered_node = m_root_node.ptr();
  64. }
  65. }
  66. void Parser::append_text(StringView text)
  67. {
  68. if (m_listener) {
  69. m_listener->text(text);
  70. return;
  71. }
  72. if (!m_entered_node) {
  73. Node::Text node;
  74. node.builder.append(text);
  75. m_root_node = make<Node>(move(node));
  76. return;
  77. }
  78. m_entered_node->content.visit(
  79. [&](Node::Element& node) {
  80. if (!node.children.is_empty()) {
  81. auto* text_node = node.children.last()->content.get_pointer<Node::Text>();
  82. if (text_node) {
  83. text_node->builder.append(text);
  84. return;
  85. }
  86. }
  87. Node::Text text_node;
  88. text_node.builder.append(text);
  89. node.children.append(make<Node>(move(text_node)));
  90. },
  91. [&](auto&) {
  92. // Can't enter a text or comment node.
  93. VERIFY_NOT_REACHED();
  94. });
  95. }
  96. void Parser::append_comment(StringView text)
  97. {
  98. if (m_listener) {
  99. m_listener->comment(text);
  100. return;
  101. }
  102. // If there's no node to attach this to, drop it on the floor.
  103. // This can happen to comments in the prolog.
  104. if (!m_entered_node)
  105. return;
  106. m_entered_node->content.visit(
  107. [&](Node::Element& node) {
  108. node.children.append(make<Node>(Node::Comment { text }));
  109. },
  110. [&](auto&) {
  111. // Can't enter a text or comment node.
  112. VERIFY_NOT_REACHED();
  113. });
  114. }
  115. void Parser::enter_node(Node& node)
  116. {
  117. if (m_listener) {
  118. auto& element = node.content.get<Node::Element>();
  119. m_listener->element_start(element.name, element.attributes);
  120. }
  121. if (&node != m_root_node.ptr())
  122. node.parent = m_entered_node;
  123. m_entered_node = &node;
  124. }
  125. void Parser::leave_node()
  126. {
  127. if (m_listener) {
  128. auto& element = m_entered_node->content.get<Node::Element>();
  129. m_listener->element_end(element.name);
  130. }
  131. m_entered_node = m_entered_node->parent;
  132. }
  133. ErrorOr<Document, ParseError> Parser::parse()
  134. {
  135. if (auto result = parse_internal(); result.is_error()) {
  136. if (m_parse_errors.is_empty())
  137. return result.release_error();
  138. return m_parse_errors.take_first();
  139. }
  140. return Document {
  141. m_root_node.release_nonnull(),
  142. move(m_doctype),
  143. move(m_processing_instructions),
  144. m_version,
  145. };
  146. }
  147. ErrorOr<void, ParseError> Parser::parse_with_listener(Listener& listener)
  148. {
  149. m_listener = &listener;
  150. ScopeGuard unset_listener { [this] { m_listener = nullptr; } };
  151. m_listener->set_source(m_source);
  152. m_listener->document_start();
  153. auto result = parse_internal();
  154. if (result.is_error())
  155. m_listener->error(result.error());
  156. m_listener->document_end();
  157. m_root_node.clear();
  158. return result;
  159. }
  160. // 2.3.3. S, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-S
  161. ErrorOr<void, ParseError> Parser::skip_whitespace(Required required)
  162. {
  163. auto rollback = rollback_point();
  164. auto rule = enter_rule();
  165. // S ::= (#x20 | #x9 | #xD | #xA)+
  166. auto matched = m_lexer.consume_while(is_any_of("\x20\x09\x0d\x0a"sv));
  167. if (required == Required::Yes && matched.is_empty())
  168. return parse_error(m_lexer.tell(), "Expected whitespace");
  169. rollback.disarm();
  170. return {};
  171. }
  172. // 2.2.a. RestrictedChar, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-RestrictedChar
  173. constexpr static auto s_restricted_characters = ranges_for_search<Range(0x1, 0x8), Range(0xb, 0xc), Range(0xe, 0x1f), Range(0x7f, 0x84), Range(0x86, 0x9f)>();
  174. // 2.1.1. Document, https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-well-formed
  175. ErrorOr<void, ParseError> Parser::parse_internal()
  176. {
  177. auto rule = enter_rule();
  178. // document ::= ( prolog element Misc* ) - ( Char* RestrictedChar Char* )
  179. TRY(parse_prolog());
  180. TRY(parse_element());
  181. while (true) {
  182. if (auto result = parse_misc(); result.is_error())
  183. break;
  184. }
  185. auto matched_source = m_source.substring_view(0, m_lexer.tell());
  186. if (auto it = find_if(matched_source.begin(), matched_source.end(), s_restricted_characters); !it.is_end()) {
  187. return parse_error(
  188. it.index(),
  189. DeprecatedString::formatted("Invalid character #{:x} used in document", *it));
  190. }
  191. if (!m_lexer.is_eof())
  192. return parse_error(m_lexer.tell(), "Garbage after document");
  193. return {};
  194. }
  195. ErrorOr<void, ParseError> Parser::expect(StringView expected)
  196. {
  197. auto rollback = rollback_point();
  198. if (!m_lexer.consume_specific(expected)) {
  199. if (m_options.treat_errors_as_fatal)
  200. return parse_error(m_lexer.tell(), DeprecatedString::formatted("Expected '{}'", expected));
  201. }
  202. rollback.disarm();
  203. return {};
  204. }
  205. template<typename Pred>
  206. requires(IsCallableWithArguments<Pred, bool, char>) ErrorOr<StringView, ParseError> Parser::expect(Pred predicate, StringView description)
  207. {
  208. auto rollback = rollback_point();
  209. auto start = m_lexer.tell();
  210. if (!m_lexer.next_is(predicate)) {
  211. if (m_options.treat_errors_as_fatal)
  212. return parse_error(m_lexer.tell(), DeprecatedString::formatted("Expected {}", description));
  213. }
  214. m_lexer.ignore();
  215. rollback.disarm();
  216. return m_source.substring_view(start, m_lexer.tell() - start);
  217. }
  218. template<typename Pred>
  219. requires(IsCallableWithArguments<Pred, bool, char>) ErrorOr<StringView, ParseError> Parser::expect_many(Pred predicate, StringView description)
  220. {
  221. auto rollback = rollback_point();
  222. auto start = m_lexer.tell();
  223. while (m_lexer.next_is(predicate)) {
  224. if (m_lexer.is_eof())
  225. break;
  226. m_lexer.ignore();
  227. }
  228. if (m_lexer.tell() == start) {
  229. if (m_options.treat_errors_as_fatal) {
  230. return parse_error(m_lexer.tell(), DeprecatedString::formatted("Expected {}", description));
  231. }
  232. }
  233. rollback.disarm();
  234. return m_source.substring_view(start, m_lexer.tell() - start);
  235. }
  236. // 2.8.22. Prolog, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-prolog
  237. ErrorOr<void, ParseError> Parser::parse_prolog()
  238. {
  239. auto rollback = rollback_point();
  240. auto rule = enter_rule();
  241. // prolog ::= XMLDecl Misc* (doctypedecl Misc*)?
  242. // The following is valid in XML 1.0.
  243. // prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
  244. if (auto result = parse_xml_decl(); result.is_error()) {
  245. m_version = Version::Version10;
  246. m_in_compatibility_mode = true;
  247. }
  248. auto accept = accept_rule();
  249. while (true) {
  250. if (auto result = parse_misc(); result.is_error())
  251. break;
  252. }
  253. if (auto result = parse_doctype_decl(); !result.is_error()) {
  254. while (true) {
  255. if (auto result = parse_misc(); result.is_error())
  256. break;
  257. }
  258. }
  259. rollback.disarm();
  260. return {};
  261. }
  262. // 2.8.23. XMLDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-XMLDecl
  263. ErrorOr<void, ParseError> Parser::parse_xml_decl()
  264. {
  265. auto rollback = rollback_point();
  266. auto rule = enter_rule();
  267. // XMLDecl::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
  268. TRY(expect("<?xml"sv));
  269. auto accept = accept_rule();
  270. TRY(parse_version_info());
  271. (void)parse_encoding_decl();
  272. (void)parse_standalone_document_decl();
  273. TRY(skip_whitespace());
  274. TRY(expect("?>"sv));
  275. rollback.disarm();
  276. return {};
  277. }
  278. // 2.8.24. VersionInfo, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-VersionInfo
  279. ErrorOr<void, ParseError> Parser::parse_version_info()
  280. {
  281. auto rollback = rollback_point();
  282. auto rule = enter_rule();
  283. // VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
  284. TRY(skip_whitespace(Required::Yes));
  285. TRY(expect("version"sv));
  286. auto accept = accept_rule();
  287. TRY(parse_eq());
  288. TRY(expect(is_any_of("'\""sv), "one of ' or \""sv));
  289. m_lexer.retreat();
  290. auto version_string = m_lexer.consume_quoted_string();
  291. if (version_string == "1.0") {
  292. // FIXME: Compatibility mode, figure out which rules are different in XML 1.0.
  293. m_version = Version::Version10;
  294. m_in_compatibility_mode = true;
  295. } else {
  296. if (version_string != "1.1" && m_options.treat_errors_as_fatal)
  297. return parse_error(m_lexer.tell(), DeprecatedString::formatted("Expected '1.1', found '{}'", version_string));
  298. }
  299. m_version = Version::Version11;
  300. rollback.disarm();
  301. return {};
  302. }
  303. // 2.8.25. Eq, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Eq
  304. ErrorOr<void, ParseError> Parser::parse_eq()
  305. {
  306. auto rollback = rollback_point();
  307. auto rule = enter_rule();
  308. // Eq ::= S? '=' S?
  309. auto accept = accept_rule();
  310. TRY(skip_whitespace());
  311. TRY(expect("="sv));
  312. TRY(skip_whitespace());
  313. rollback.disarm();
  314. return {};
  315. }
  316. // 4.3.3.80. EncodingDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EncodingDecl
  317. ErrorOr<void, ParseError> Parser::parse_encoding_decl()
  318. {
  319. auto rollback = rollback_point();
  320. auto rule = enter_rule();
  321. // EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )
  322. TRY(skip_whitespace(Required::Yes));
  323. TRY(expect("encoding"sv));
  324. auto accept = accept_rule();
  325. TRY(parse_eq());
  326. TRY(expect(is_any_of("'\""sv), "one of ' or \""sv));
  327. m_lexer.retreat();
  328. // FIXME: Actually do something with this encoding.
  329. m_encoding = m_lexer.consume_quoted_string();
  330. rollback.disarm();
  331. return {};
  332. }
  333. // 2.9.32 SDDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-rmd
  334. ErrorOr<void, ParseError> Parser::parse_standalone_document_decl()
  335. {
  336. auto rollback = rollback_point();
  337. auto rule = enter_rule();
  338. // SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
  339. TRY(skip_whitespace(Required::Yes));
  340. TRY(expect("standalone"sv));
  341. auto accept = accept_rule();
  342. TRY(expect(is_any_of("'\""sv), "one of ' or \""sv));
  343. m_lexer.retreat();
  344. auto value = m_lexer.consume_quoted_string();
  345. if (!value.is_one_of("yes", "no"))
  346. return parse_error(m_lexer.tell() - value.length(), "Expected one of 'yes' or 'no'");
  347. m_standalone = value == "yes";
  348. rollback.disarm();
  349. return {};
  350. }
  351. // 2.8.27. Misc, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Misc
  352. ErrorOr<void, ParseError> Parser::parse_misc()
  353. {
  354. auto rollback = rollback_point();
  355. auto rule = enter_rule();
  356. // Misc ::= Comment | PI | S
  357. if (auto result = parse_comment(); !result.is_error()) {
  358. rollback.disarm();
  359. return {};
  360. }
  361. if (auto result = parse_processing_instruction(); !result.is_error()) {
  362. rollback.disarm();
  363. return {};
  364. }
  365. if (auto result = skip_whitespace(Required::Yes); !result.is_error()) {
  366. rollback.disarm();
  367. return {};
  368. }
  369. return parse_error(m_lexer.tell(), "Expected a match for 'Misc', but found none");
  370. }
  371. // 2.5.15 Comment, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Comment
  372. ErrorOr<void, ParseError> Parser::parse_comment()
  373. {
  374. auto rollback = rollback_point();
  375. auto rule = enter_rule();
  376. // Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
  377. TRY(expect("<!--"sv));
  378. auto accept = accept_rule();
  379. bool last_seen_a_dash = false;
  380. // FIXME: This should disallow surrogate blocks
  381. auto text = m_lexer.consume_while([&](auto ch) {
  382. if (ch != '-') {
  383. last_seen_a_dash = false;
  384. return true;
  385. }
  386. if (last_seen_a_dash)
  387. return false;
  388. last_seen_a_dash = true;
  389. return true;
  390. });
  391. if (last_seen_a_dash) {
  392. m_lexer.retreat();
  393. text = text.substring_view(0, text.length() - 1);
  394. }
  395. TRY(expect("-->"sv));
  396. if (m_options.preserve_comments)
  397. append_comment(text);
  398. rollback.disarm();
  399. return {};
  400. }
  401. // 2.6.16 PI, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PI
  402. ErrorOr<void, ParseError> Parser::parse_processing_instruction()
  403. {
  404. auto rollback = rollback_point();
  405. auto rule = enter_rule();
  406. // PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
  407. TRY(expect("<?"sv));
  408. auto accept = accept_rule();
  409. auto target = TRY(parse_processing_instruction_target());
  410. DeprecatedString data;
  411. if (auto result = skip_whitespace(Required::Yes); !result.is_error())
  412. data = m_lexer.consume_until("?>");
  413. TRY(expect("?>"sv));
  414. m_processing_instructions.set(target, data);
  415. rollback.disarm();
  416. return {};
  417. }
  418. // 2.6.17. PITarget, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
  419. ErrorOr<Name, ParseError> Parser::parse_processing_instruction_target()
  420. {
  421. auto rollback = rollback_point();
  422. auto rule = enter_rule();
  423. // PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
  424. auto target = TRY(parse_name());
  425. auto accept = accept_rule();
  426. if (target.equals_ignoring_case("xml"sv) && m_options.treat_errors_as_fatal) {
  427. return parse_error(
  428. m_lexer.tell() - target.length(),
  429. "Use of the reserved 'xml' name for processing instruction target name is disallowed");
  430. }
  431. rollback.disarm();
  432. return target;
  433. }
  434. // NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
  435. constexpr static auto s_name_start_characters = ranges_for_search<Range(':', ':'), Range('A', 'Z'), Range('_', '_'), Range('a', 'z'), Range(0xc0, 0xd6), Range(0xd8, 0xf6), Range(0xf8, 0x2ff), Range(0x370, 0x37d), Range(0x37f, 0x1fff), Range(0x200c, 0x200d), Range(0x2070, 0x218f), Range(0x2c00, 0x2fef), Range(0x3001, 0xd7ff), Range(0xf900, 0xfdcf), Range(0xfdf0, 0xfffd), Range(0x10000, 0xeffff)> {};
  436. // NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
  437. constexpr static auto s_name_characters = s_name_start_characters.with<Range('-', '-'), Range('.', '.'), Range('0', '9'), Range(0xb7, 0xb7), Range(0x0300, 0x036f), Range(0x203f, 0x2040)>();
  438. // 2.3.5. Name, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Name
  439. ErrorOr<Name, ParseError> Parser::parse_name()
  440. {
  441. auto rollback = rollback_point();
  442. auto rule = enter_rule();
  443. // Name ::= NameStartChar (NameChar)*
  444. auto start = TRY(expect(s_name_start_characters, "a NameStartChar"sv));
  445. auto accept = accept_rule();
  446. auto rest = m_lexer.consume_while(s_name_characters);
  447. StringBuilder builder;
  448. builder.append(start);
  449. builder.append(rest);
  450. rollback.disarm();
  451. return builder.to_deprecated_string();
  452. }
  453. // 2.8.28. doctypedecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-doctypedecl
  454. ErrorOr<void, ParseError> Parser::parse_doctype_decl()
  455. {
  456. auto rollback = rollback_point();
  457. auto rule = enter_rule();
  458. Doctype doctype;
  459. // doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
  460. TRY(expect("<!DOCTYPE"sv));
  461. auto accept = accept_rule();
  462. TRY(skip_whitespace(Required::Yes));
  463. doctype.type = TRY(parse_name());
  464. if (auto result = skip_whitespace(Required::Yes); !result.is_error()) {
  465. auto id_start = m_lexer.tell();
  466. if (auto id_result = parse_external_id(); !id_result.is_error()) {
  467. doctype.external_id = id_result.release_value();
  468. if (m_options.resolve_external_resource) {
  469. auto resource_result = m_options.resolve_external_resource(doctype.external_id->system_id, doctype.external_id->public_id);
  470. if (resource_result.is_error()) {
  471. return parse_error(
  472. id_start,
  473. DeprecatedString::formatted("Failed to resolve external subset '{}': {}", doctype.external_id->system_id.system_literal, resource_result.error()));
  474. }
  475. StringView resolved_source = resource_result.value();
  476. TemporaryChange source { m_source, resolved_source };
  477. TemporaryChange lexer { m_lexer, GenericLexer(m_source) };
  478. auto declarations = TRY(parse_external_subset());
  479. if (!m_lexer.is_eof()) {
  480. return parse_error(
  481. m_lexer.tell(),
  482. DeprecatedString::formatted("Failed to resolve external subset '{}': garbage after declarations", doctype.external_id->system_id.system_literal));
  483. }
  484. doctype.markup_declarations.extend(move(declarations));
  485. }
  486. }
  487. }
  488. TRY(skip_whitespace(Required::No));
  489. if (m_lexer.consume_specific('[')) {
  490. auto internal_subset = TRY(parse_internal_subset());
  491. TRY(expect("]"sv));
  492. TRY(skip_whitespace());
  493. doctype.markup_declarations.extend(internal_subset);
  494. }
  495. TRY(expect(">"sv));
  496. rollback.disarm();
  497. m_doctype = move(doctype);
  498. return {};
  499. }
  500. // 3.39. element, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-element
  501. ErrorOr<void, ParseError> Parser::parse_element()
  502. {
  503. auto rollback = rollback_point();
  504. auto rule = enter_rule();
  505. // element ::= EmptyElemTag
  506. // | STag content ETag
  507. if (auto result = parse_empty_element_tag(); !result.is_error()) {
  508. append_node(result.release_value());
  509. rollback.disarm();
  510. return {};
  511. }
  512. auto start_tag = TRY(parse_start_tag());
  513. auto& node = *start_tag;
  514. auto& tag = node.content.get<Node::Element>();
  515. append_node(move(start_tag));
  516. enter_node(node);
  517. ScopeGuard quit {
  518. [&] {
  519. leave_node();
  520. }
  521. };
  522. TRY(parse_content());
  523. auto tag_location = m_lexer.tell();
  524. auto closing_name = TRY(parse_end_tag());
  525. // Well-formedness constraint: The Name in an element's end-tag MUST match the element type in the start-tag.
  526. if (m_options.treat_errors_as_fatal && closing_name != tag.name)
  527. return parse_error(tag_location, "Invalid closing tag");
  528. rollback.disarm();
  529. return {};
  530. }
  531. // 3.1.44. EmptyElemTag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EmptyElemTag
  532. ErrorOr<NonnullOwnPtr<Node>, ParseError> Parser::parse_empty_element_tag()
  533. {
  534. auto rollback = rollback_point();
  535. auto rule = enter_rule();
  536. // EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
  537. TRY(expect("<"sv));
  538. auto accept = accept_rule();
  539. auto name = TRY(parse_name());
  540. HashMap<Name, DeprecatedString> attributes;
  541. while (true) {
  542. if (auto result = skip_whitespace(Required::Yes); result.is_error())
  543. break;
  544. if (auto result = parse_attribute(); !result.is_error()) {
  545. auto attribute = result.release_value();
  546. attributes.set(move(attribute.name), move(attribute.value));
  547. } else {
  548. break;
  549. }
  550. }
  551. TRY(skip_whitespace());
  552. TRY(expect("/>"sv));
  553. rollback.disarm();
  554. return make<Node>(Node::Element { move(name), move(attributes), {} });
  555. }
  556. // 3.1.41. Attribute, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Attribute
  557. ErrorOr<Attribute, ParseError> Parser::parse_attribute()
  558. {
  559. auto rollback = rollback_point();
  560. auto rule = enter_rule();
  561. // Attribute ::= Name Eq AttValue
  562. auto name = TRY(parse_name());
  563. auto accept = accept_rule();
  564. TRY(parse_eq());
  565. auto value = TRY(parse_attribute_value());
  566. rollback.disarm();
  567. return Attribute {
  568. move(name),
  569. move(value),
  570. };
  571. }
  572. // 2.3.10. AttValue, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttValue
  573. ErrorOr<DeprecatedString, ParseError> Parser::parse_attribute_value()
  574. {
  575. auto rollback = rollback_point();
  576. auto rule = enter_rule();
  577. // AttValue ::= '"' ([^<&"] | Reference)* '"'
  578. // | "'" ([^<&'] | Reference)* "'"
  579. auto quote = TRY(expect(is_any_of("'\""sv), "one of ' or \""sv));
  580. auto accept = accept_rule();
  581. auto text = TRY(parse_attribute_value_inner(quote));
  582. TRY(expect(quote));
  583. rollback.disarm();
  584. return text;
  585. }
  586. ErrorOr<DeprecatedString, ParseError> Parser::parse_attribute_value_inner(StringView disallow)
  587. {
  588. StringBuilder builder;
  589. while (true) {
  590. if (m_lexer.next_is(is_any_of(disallow)) || m_lexer.is_eof())
  591. break;
  592. if (m_lexer.next_is('<')) {
  593. // Not allowed, return a nice error to make it easier to debug.
  594. return parse_error(m_lexer.tell(), "Unescaped '<' not allowed in attribute values");
  595. }
  596. if (m_lexer.next_is('&')) {
  597. auto reference = TRY(parse_reference());
  598. if (auto* char_reference = reference.get_pointer<DeprecatedString>())
  599. builder.append(*char_reference);
  600. else
  601. builder.append(TRY(resolve_reference(reference.get<EntityReference>(), ReferencePlacement::AttributeValue)));
  602. } else {
  603. builder.append(m_lexer.consume());
  604. }
  605. }
  606. return builder.to_deprecated_string();
  607. }
  608. // Char ::= [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
  609. constexpr static auto s_characters = ranges_for_search<Range(0x1, 0xd7ff), Range(0xe000, 0xfffd), Range(0x10000, 0x10ffff)>();
  610. // 4.1.67. Reference, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Reference
  611. ErrorOr<Variant<Parser::EntityReference, DeprecatedString>, ParseError> Parser::parse_reference()
  612. {
  613. auto rollback = rollback_point();
  614. auto rule = enter_rule();
  615. // Reference ::= EntityRef | CharRef
  616. // 4.1.68. EntityRef
  617. // EntityRef ::= '&' Name ';'
  618. // 4.1.66. CharRef
  619. // CharRef ::= '&#' [0-9]+ ';'
  620. // | '&#x' [0-9a-fA-F]+ ';'
  621. auto reference_start = m_lexer.tell();
  622. TRY(expect("&"sv));
  623. auto accept = accept_rule();
  624. auto name_result = parse_name();
  625. if (name_result.is_error()) {
  626. TRY(expect("#"sv));
  627. Optional<u32> code_point;
  628. if (m_lexer.consume_specific('x')) {
  629. auto hex = TRY(expect_many(
  630. ranges_for_search<Range('0', '9'), Range('a', 'f'), Range('A', 'F')>(),
  631. "any of [0-9a-fA-F]"sv));
  632. code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(hex);
  633. } else {
  634. auto decimal = TRY(expect_many(
  635. ranges_for_search<Range('0', '9')>(),
  636. "any of [0-9]"sv));
  637. code_point = decimal.to_uint<u32>();
  638. }
  639. if (!code_point.has_value() || !s_characters.contains(*code_point))
  640. return parse_error(reference_start, "Invalid character reference");
  641. TRY(expect(";"sv));
  642. StringBuilder builder;
  643. builder.append_code_point(*code_point);
  644. rollback.disarm();
  645. return builder.to_deprecated_string();
  646. }
  647. auto name = name_result.release_value();
  648. TRY(expect(";"sv));
  649. rollback.disarm();
  650. return EntityReference { move(name) };
  651. }
  652. // 3.1.40 STag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-STag
  653. ErrorOr<NonnullOwnPtr<Node>, ParseError> Parser::parse_start_tag()
  654. {
  655. auto rollback = rollback_point();
  656. auto rule = enter_rule();
  657. // STag ::= '<' Name (S Attribute)* S? '>'
  658. TRY(expect("<"sv));
  659. auto accept = accept_rule();
  660. auto name = TRY(parse_name());
  661. HashMap<Name, DeprecatedString> attributes;
  662. while (true) {
  663. if (auto result = skip_whitespace(Required::Yes); result.is_error())
  664. break;
  665. if (auto result = parse_attribute(); !result.is_error()) {
  666. auto attribute = result.release_value();
  667. attributes.set(move(attribute.name), move(attribute.value));
  668. } else {
  669. break;
  670. }
  671. }
  672. TRY(skip_whitespace());
  673. TRY(expect(">"sv));
  674. rollback.disarm();
  675. return make<Node>(Node::Element { move(name), move(attributes), {} });
  676. }
  677. // 3.1.42 ETag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-ETag
  678. ErrorOr<Name, ParseError> Parser::parse_end_tag()
  679. {
  680. auto rollback = rollback_point();
  681. auto rule = enter_rule();
  682. // ETag ::= '</' Name S? '>'
  683. TRY(expect("</"sv));
  684. auto accept = accept_rule();
  685. auto name = TRY(parse_name());
  686. TRY(skip_whitespace());
  687. TRY(expect(">"sv));
  688. rollback.disarm();
  689. return name;
  690. }
  691. // 3.1.42 content, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-content
  692. ErrorOr<void, ParseError> Parser::parse_content()
  693. {
  694. auto rollback = rollback_point();
  695. auto rule = enter_rule();
  696. // content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
  697. if (auto result = parse_char_data(); !result.is_error())
  698. append_text(result.release_value());
  699. while (true) {
  700. if (auto result = parse_element(); !result.is_error())
  701. goto try_char_data;
  702. if (auto result = parse_reference(); !result.is_error()) {
  703. auto reference = result.release_value();
  704. if (auto char_reference = reference.get_pointer<DeprecatedString>())
  705. append_text(*char_reference);
  706. else
  707. TRY(resolve_reference(reference.get<EntityReference>(), ReferencePlacement::Content));
  708. goto try_char_data;
  709. }
  710. if (auto result = parse_cdata_section(); !result.is_error()) {
  711. if (m_options.preserve_cdata)
  712. append_text(result.release_value());
  713. goto try_char_data;
  714. }
  715. if (auto result = parse_processing_instruction(); !result.is_error())
  716. goto try_char_data;
  717. if (auto result = parse_comment(); !result.is_error())
  718. goto try_char_data;
  719. break;
  720. try_char_data:;
  721. if (auto result = parse_char_data(); !result.is_error())
  722. append_text(result.release_value());
  723. }
  724. rollback.disarm();
  725. return {};
  726. }
  727. // 2.4.14 CharData, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-CharData
  728. ErrorOr<StringView, ParseError> Parser::parse_char_data()
  729. {
  730. auto rollback = rollback_point();
  731. auto rule = enter_rule();
  732. // CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
  733. auto cend_state = 0; // 1: ], 2: ], 3: >
  734. auto text = m_lexer.consume_while([&](auto ch) {
  735. if (ch == '<' || ch == '&' || cend_state == 3)
  736. return false;
  737. switch (cend_state) {
  738. case 0:
  739. case 1:
  740. if (ch == ']')
  741. cend_state++;
  742. else
  743. cend_state = 0;
  744. return true;
  745. case 2:
  746. if (ch == '>') {
  747. cend_state++;
  748. return true;
  749. }
  750. cend_state = 0;
  751. return true;
  752. default:
  753. VERIFY_NOT_REACHED();
  754. }
  755. });
  756. if (cend_state == 3) {
  757. m_lexer.retreat(3);
  758. text = text.substring_view(0, text.length() - 3);
  759. }
  760. rollback.disarm();
  761. return text;
  762. }
  763. // 2.8.28b intSubset, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-intSubset
  764. ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_internal_subset()
  765. {
  766. auto rollback = rollback_point();
  767. auto rule = enter_rule();
  768. Vector<MarkupDeclaration> declarations;
  769. // intSubset ::= (markupdecl | DeclSep)*
  770. while (true) {
  771. if (auto result = parse_markup_declaration(); !result.is_error()) {
  772. auto maybe_declaration = result.release_value();
  773. if (maybe_declaration.has_value())
  774. declarations.append(maybe_declaration.release_value());
  775. continue;
  776. }
  777. if (auto result = parse_declaration_separator(); !result.is_error()) {
  778. // The markup declarations may be made up in whole or in part of the replacement text of parameter entities.
  779. // The replacement text of a parameter entity reference in a DeclSep MUST match the production extSubsetDecl.
  780. auto maybe_replacement_text = result.release_value();
  781. if (maybe_replacement_text.has_value()) {
  782. TemporaryChange<StringView> source { m_source, maybe_replacement_text.value() };
  783. TemporaryChange lexer { m_lexer, GenericLexer { m_source } };
  784. auto contained_declarations = TRY(parse_external_subset_declaration());
  785. declarations.extend(move(contained_declarations));
  786. }
  787. continue;
  788. }
  789. break;
  790. }
  791. rollback.disarm();
  792. return declarations;
  793. }
  794. // 2.8.29 markupdecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-markupdecl
  795. ErrorOr<Optional<MarkupDeclaration>, ParseError> Parser::parse_markup_declaration()
  796. {
  797. auto rollback = rollback_point();
  798. auto rule = enter_rule();
  799. // markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment
  800. if (auto result = parse_element_declaration(); !result.is_error()) {
  801. rollback.disarm();
  802. return MarkupDeclaration { result.release_value() };
  803. }
  804. if (auto result = parse_attribute_list_declaration(); !result.is_error()) {
  805. rollback.disarm();
  806. return MarkupDeclaration { result.release_value() };
  807. }
  808. if (auto result = parse_entity_declaration(); !result.is_error()) {
  809. rollback.disarm();
  810. return MarkupDeclaration { result.release_value() };
  811. }
  812. if (auto result = parse_notation_declaration(); !result.is_error()) {
  813. rollback.disarm();
  814. return MarkupDeclaration { result.release_value() };
  815. }
  816. if (auto result = parse_processing_instruction(); !result.is_error()) {
  817. rollback.disarm();
  818. return Optional<MarkupDeclaration> {};
  819. }
  820. if (auto result = parse_comment(); !result.is_error()) {
  821. rollback.disarm();
  822. return Optional<MarkupDeclaration> {};
  823. }
  824. return parse_error(m_lexer.tell(), "Expected one of elementdecl, attlistdecl, entitydecl, notationdecl, PI or comment");
  825. }
  826. // 2.8.28a DeclSep, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-DeclSep
  827. ErrorOr<Optional<DeprecatedString>, ParseError> Parser::parse_declaration_separator()
  828. {
  829. auto rollback = rollback_point();
  830. auto rule = enter_rule();
  831. // DeclSep ::= PEReference | S
  832. if (auto name = parse_parameter_entity_reference(); !name.is_error()) {
  833. rollback.disarm();
  834. // FIXME: Resolve this PEReference.
  835. return "";
  836. }
  837. if (auto result = skip_whitespace(Required::Yes); !result.is_error()) {
  838. rollback.disarm();
  839. return Optional<DeprecatedString> {};
  840. }
  841. return parse_error(m_lexer.tell(), "Expected either whitespace, or a PEReference");
  842. }
  843. // 4.1.69 PEReference, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PEReference
  844. ErrorOr<Name, ParseError> Parser::parse_parameter_entity_reference()
  845. {
  846. auto rollback = rollback_point();
  847. auto rule = enter_rule();
  848. // PEReference ::= '%' Name ';'
  849. TRY(expect("%"sv));
  850. auto accept = accept_rule();
  851. auto name = TRY(parse_name());
  852. TRY(expect(";"sv));
  853. rollback.disarm();
  854. return name;
  855. }
  856. // 3.2.46 elementdecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-elementdecl
  857. ErrorOr<ElementDeclaration, ParseError> Parser::parse_element_declaration()
  858. {
  859. auto rollback = rollback_point();
  860. auto rule = enter_rule();
  861. // FIXME: Apparently both name _and_ contentspec here are allowed to be PEReferences,
  862. // but the grammar does not allow that, figure this out.
  863. // elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'
  864. TRY(expect("<!ELEMENT"sv));
  865. auto accept = accept_rule();
  866. TRY(skip_whitespace(Required::Yes));
  867. auto name = TRY(parse_name());
  868. TRY(skip_whitespace(Required::Yes));
  869. auto spec = TRY(parse_content_spec());
  870. TRY(expect(">"sv));
  871. rollback.disarm();
  872. return ElementDeclaration {
  873. move(name),
  874. move(spec),
  875. };
  876. }
  877. // 3.3.52 AttlistDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttlistDecl
  878. ErrorOr<AttributeListDeclaration, ParseError> Parser::parse_attribute_list_declaration()
  879. {
  880. auto rollback = rollback_point();
  881. auto rule = enter_rule();
  882. AttributeListDeclaration declaration;
  883. // AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
  884. TRY(expect("<!ATTLIST"sv));
  885. auto accept = accept_rule();
  886. TRY(skip_whitespace(Required::Yes));
  887. declaration.type = TRY(parse_name());
  888. while (true) {
  889. if (auto result = parse_attribute_definition(); !result.is_error())
  890. declaration.attributes.append(result.release_value());
  891. else
  892. break;
  893. }
  894. TRY(skip_whitespace());
  895. TRY(expect(">"sv));
  896. rollback.disarm();
  897. return declaration;
  898. }
  899. // 3.3.53 AttDef, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttDef
  900. ErrorOr<AttributeListDeclaration::Definition, ParseError> Parser::parse_attribute_definition()
  901. {
  902. auto rollback = rollback_point();
  903. auto rule = enter_rule();
  904. Optional<AttributeListDeclaration::Type> type;
  905. Optional<AttributeListDeclaration::Default> default_;
  906. // AttDef ::= S Name S AttType S DefaultDecl
  907. TRY(skip_whitespace(Required::Yes));
  908. auto name = TRY(parse_name());
  909. auto accept = accept_rule();
  910. TRY(skip_whitespace(Required::Yes));
  911. // AttType ::= StringType | TokenizedType | EnumeratedType
  912. // StringType ::= 'CDATA'
  913. // TokenizedType ::= 'ID'
  914. // | 'IDREF'
  915. // | 'IDREFS'
  916. // | 'ENTITY'
  917. // | 'ENTITIES'
  918. // | 'NMTOKEN'
  919. // | 'NMTOKENS'
  920. // EnumeratedType ::= NotationType | Enumeration
  921. // NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
  922. // Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
  923. if (m_lexer.consume_specific("CDATA")) {
  924. type = AttributeListDeclaration::StringType::CData;
  925. } else if (m_lexer.consume_specific("IDREFS")) {
  926. type = AttributeListDeclaration::TokenizedType::IDRefs;
  927. } else if (m_lexer.consume_specific("IDREF")) {
  928. type = AttributeListDeclaration::TokenizedType::IDRef;
  929. } else if (m_lexer.consume_specific("ID")) {
  930. type = AttributeListDeclaration::TokenizedType::ID;
  931. } else if (m_lexer.consume_specific("ENTITIES")) {
  932. type = AttributeListDeclaration::TokenizedType::Entities;
  933. } else if (m_lexer.consume_specific("ENTITY")) {
  934. type = AttributeListDeclaration::TokenizedType::Entity;
  935. } else if (m_lexer.consume_specific("NMTOKENS")) {
  936. type = AttributeListDeclaration::TokenizedType::NMTokens;
  937. } else if (m_lexer.consume_specific("NMTOKEN")) {
  938. type = AttributeListDeclaration::TokenizedType::NMToken;
  939. } else if (m_lexer.consume_specific("NOTATION")) {
  940. HashTable<Name> names;
  941. TRY(skip_whitespace(Required::Yes));
  942. TRY(expect("("sv));
  943. TRY(skip_whitespace());
  944. names.set(TRY(parse_name()));
  945. while (true) {
  946. TRY(skip_whitespace());
  947. if (auto result = expect("|"sv); result.is_error())
  948. break;
  949. TRY(skip_whitespace());
  950. names.set(TRY(parse_name()));
  951. }
  952. TRY(skip_whitespace());
  953. TRY(expect(")"sv));
  954. type = AttributeListDeclaration::NotationType { move(names) };
  955. } else {
  956. HashTable<DeprecatedString> names;
  957. TRY(expect("("sv));
  958. TRY(skip_whitespace());
  959. names.set(TRY(parse_nm_token()));
  960. while (true) {
  961. TRY(skip_whitespace());
  962. if (auto result = expect("|"sv); result.is_error())
  963. break;
  964. TRY(skip_whitespace());
  965. names.set(TRY(parse_nm_token()));
  966. }
  967. TRY(skip_whitespace());
  968. TRY(expect(")"sv));
  969. type = AttributeListDeclaration::Enumeration { move(names) };
  970. }
  971. TRY(skip_whitespace(Required::Yes));
  972. // DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
  973. // | (('#FIXED' S)? AttValue)
  974. if (m_lexer.consume_specific("#REQUIRED")) {
  975. default_ = AttributeListDeclaration::Required {};
  976. } else if (m_lexer.consume_specific("#IMPLIED")) {
  977. default_ = AttributeListDeclaration::Implied {};
  978. } else {
  979. bool fixed = false;
  980. if (m_lexer.consume_specific("#FIXED")) {
  981. TRY(skip_whitespace(Required::Yes));
  982. fixed = true;
  983. }
  984. auto value = TRY(parse_attribute_value());
  985. if (fixed)
  986. default_ = AttributeListDeclaration::Fixed { move(value) };
  987. else
  988. default_ = AttributeListDeclaration::DefaultValue { move(value) };
  989. }
  990. rollback.disarm();
  991. return AttributeListDeclaration::Definition {
  992. move(name),
  993. type.release_value(),
  994. default_.release_value(),
  995. };
  996. }
  997. // 2.3.7 Nmtoken, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Nmtoken
  998. ErrorOr<StringView, ParseError> Parser::parse_nm_token()
  999. {
  1000. auto rollback = rollback_point();
  1001. auto rule = enter_rule();
  1002. // Nmtoken ::= (NameChar)+
  1003. auto token = TRY(expect_many(s_name_characters, "a NameChar"sv));
  1004. rollback.disarm();
  1005. return token;
  1006. }
  1007. // 4.7.82 NotationDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#Notations
  1008. ErrorOr<NotationDeclaration, ParseError> Parser::parse_notation_declaration()
  1009. {
  1010. auto rollback = rollback_point();
  1011. auto rule = enter_rule();
  1012. Variant<ExternalID, PublicID, Empty> notation;
  1013. // NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>'
  1014. TRY(expect("<!NOTATION"sv));
  1015. auto accept = accept_rule();
  1016. TRY(skip_whitespace(Required::Yes));
  1017. auto name = TRY(parse_name());
  1018. TRY(skip_whitespace(Required::Yes));
  1019. if (auto result = parse_external_id(); !result.is_error())
  1020. notation = result.release_value();
  1021. else
  1022. notation = TRY(parse_public_id());
  1023. TRY(expect(">"sv));
  1024. rollback.disarm();
  1025. return NotationDeclaration {
  1026. move(name),
  1027. move(notation).downcast<ExternalID, PublicID>(),
  1028. };
  1029. }
  1030. // 3.2.46 contentspec, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-contentspec
  1031. ErrorOr<ElementDeclaration::ContentSpec, ParseError> Parser::parse_content_spec()
  1032. {
  1033. auto rollback = rollback_point();
  1034. auto rule = enter_rule();
  1035. Optional<ElementDeclaration::ContentSpec> content_spec;
  1036. // contentspec ::= 'EMPTY' | 'ANY' | Mixed | children
  1037. if (m_lexer.consume_specific("EMPTY")) {
  1038. content_spec = ElementDeclaration::Empty {};
  1039. } else if (m_lexer.consume_specific("ANY")) {
  1040. content_spec = ElementDeclaration::Any {};
  1041. } else {
  1042. TRY(expect("("sv));
  1043. TRY(skip_whitespace());
  1044. if (m_lexer.consume_specific("#PCDATA")) {
  1045. HashTable<Name> names;
  1046. // Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*'
  1047. // | '(' S? '#PCDATA' S? ')'
  1048. TRY(skip_whitespace());
  1049. if (m_lexer.consume_specific(")*")) {
  1050. content_spec = ElementDeclaration::Mixed { .types = {}, .many = true };
  1051. } else if (m_lexer.consume_specific(')')) {
  1052. content_spec = ElementDeclaration::Mixed { .types = {}, .many = false };
  1053. } else {
  1054. while (true) {
  1055. TRY(skip_whitespace());
  1056. if (!m_lexer.consume_specific('|'))
  1057. break;
  1058. TRY(skip_whitespace());
  1059. if (auto result = parse_name(); !result.is_error())
  1060. names.set(result.release_value());
  1061. else
  1062. return parse_error(m_lexer.tell(), "Expected a Name");
  1063. }
  1064. TRY(skip_whitespace());
  1065. TRY(expect(")*"sv));
  1066. content_spec = ElementDeclaration::Mixed { .types = move(names), .many = true };
  1067. }
  1068. } else {
  1069. while (!m_lexer.next_is('('))
  1070. m_lexer.retreat();
  1071. // children ::= (choice | seq) ('?' | '*' | '+')?
  1072. // cp ::= (Name | choice | seq) ('?' | '*' | '+')?
  1073. // choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')'
  1074. // seq ::= '(' S? cp ( S? ',' S? cp )* S? ')'
  1075. Function<ErrorOr<ElementDeclaration::Children::Choice, ParseError>()> parse_choice;
  1076. Function<ErrorOr<ElementDeclaration::Children::Sequence, ParseError>()> parse_sequence;
  1077. auto parse_cp_init = [&]() -> ErrorOr<Variant<Name, ElementDeclaration::Children::Choice, ElementDeclaration::Children::Sequence>, ParseError> {
  1078. if (auto result = parse_name(); !result.is_error())
  1079. return result.release_value();
  1080. if (auto result = parse_choice(); !result.is_error())
  1081. return result.release_value();
  1082. return TRY(parse_sequence());
  1083. };
  1084. auto parse_qualifier = [&]() -> ElementDeclaration::Children::Qualifier {
  1085. ElementDeclaration::Children::Qualifier qualifier { ElementDeclaration::Children::Qualifier::ExactlyOnce };
  1086. if (m_lexer.consume_specific('?'))
  1087. qualifier = ElementDeclaration::Children::Qualifier::Optional;
  1088. else if (m_lexer.consume_specific('*'))
  1089. qualifier = ElementDeclaration::Children::Qualifier::Any;
  1090. else if (m_lexer.consume_specific('+'))
  1091. qualifier = ElementDeclaration::Children::Qualifier::OneOrMore;
  1092. return qualifier;
  1093. };
  1094. auto parse_cp = [&]() -> ErrorOr<ElementDeclaration::Children::Entry, ParseError> {
  1095. auto sub_entry = TRY(parse_cp_init());
  1096. auto qualifier = parse_qualifier();
  1097. return ElementDeclaration::Children::Entry {
  1098. move(sub_entry),
  1099. qualifier,
  1100. };
  1101. };
  1102. parse_choice = [&]() -> ErrorOr<ElementDeclaration::Children::Choice, ParseError> {
  1103. auto rollback = rollback_point();
  1104. auto rule = enter_rule();
  1105. TRY(expect("("sv));
  1106. auto accept = accept_rule();
  1107. TRY(skip_whitespace());
  1108. Vector<ElementDeclaration::Children::Entry> choices;
  1109. choices.append(TRY(parse_cp()));
  1110. while (true) {
  1111. TRY(skip_whitespace());
  1112. if (!m_lexer.consume_specific('|'))
  1113. break;
  1114. TRY(skip_whitespace());
  1115. choices.append(TRY(parse_cp()));
  1116. }
  1117. TRY(expect(")"sv));
  1118. if (choices.size() < 2)
  1119. return parse_error(m_lexer.tell(), "Expected more than one choice");
  1120. TRY(skip_whitespace());
  1121. auto qualifier = parse_qualifier();
  1122. rollback.disarm();
  1123. return ElementDeclaration::Children::Choice {
  1124. move(choices),
  1125. qualifier,
  1126. };
  1127. };
  1128. parse_sequence = [&]() -> ErrorOr<ElementDeclaration::Children::Sequence, ParseError> {
  1129. auto rollback = rollback_point();
  1130. auto rule = enter_rule();
  1131. TRY(expect("("sv));
  1132. auto accept = accept_rule();
  1133. TRY(skip_whitespace());
  1134. Vector<ElementDeclaration::Children::Entry> entries;
  1135. entries.append(TRY(parse_cp()));
  1136. while (true) {
  1137. TRY(skip_whitespace());
  1138. if (!m_lexer.consume_specific(','))
  1139. break;
  1140. TRY(skip_whitespace());
  1141. entries.append(TRY(parse_cp()));
  1142. }
  1143. TRY(expect(")"sv));
  1144. TRY(skip_whitespace());
  1145. auto qualifier = parse_qualifier();
  1146. rollback.disarm();
  1147. return ElementDeclaration::Children::Sequence {
  1148. move(entries),
  1149. qualifier,
  1150. };
  1151. };
  1152. if (auto result = parse_choice(); !result.is_error()) {
  1153. auto qualifier = parse_qualifier();
  1154. content_spec = ElementDeclaration::Children {
  1155. result.release_value(),
  1156. qualifier,
  1157. };
  1158. } else {
  1159. auto sequence = TRY(parse_sequence());
  1160. auto qualifier = parse_qualifier();
  1161. content_spec = ElementDeclaration::Children {
  1162. move(sequence),
  1163. qualifier,
  1164. };
  1165. }
  1166. }
  1167. }
  1168. rollback.disarm();
  1169. return content_spec.release_value();
  1170. }
  1171. // 2.8.31 extSubsetDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-extSubsetDecl
  1172. ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_external_subset_declaration()
  1173. {
  1174. auto rollback = rollback_point();
  1175. auto rule = enter_rule();
  1176. Vector<MarkupDeclaration> declarations;
  1177. // extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep )*
  1178. while (true) {
  1179. if (auto result = parse_markup_declaration(); !result.is_error()) {
  1180. if (result.value().has_value())
  1181. declarations.append(result.release_value().release_value());
  1182. continue;
  1183. }
  1184. // FIXME: conditionalSect
  1185. if (auto result = parse_declaration_separator(); !result.is_error())
  1186. continue;
  1187. break;
  1188. }
  1189. rollback.disarm();
  1190. return declarations;
  1191. }
  1192. // 4.2.70 EntityDecl, https://www.w3.org/TR/xml/#NT-EntityDecl
  1193. ErrorOr<EntityDeclaration, ParseError> Parser::parse_entity_declaration()
  1194. {
  1195. // EntityDecl ::= GEDecl | PEDecl
  1196. if (auto result = parse_general_entity_declaration(); !result.is_error())
  1197. return result;
  1198. return parse_parameter_entity_declaration();
  1199. }
  1200. // 4.2.71 GEDecl, https://www.w3.org/TR/xml/#NT-GEDecl
  1201. ErrorOr<EntityDeclaration, ParseError> Parser::parse_general_entity_declaration()
  1202. {
  1203. auto rollback = rollback_point();
  1204. auto rule = enter_rule();
  1205. Variant<DeprecatedString, EntityDefinition, Empty> definition;
  1206. // GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
  1207. TRY(expect("<!ENTITY"sv));
  1208. auto accept = accept_rule();
  1209. TRY(skip_whitespace(Required::Yes));
  1210. auto name = TRY(parse_name());
  1211. TRY(skip_whitespace(Required::Yes));
  1212. // EntityDef ::= EntityValue | (ExternalID NDataDecl?)
  1213. if (auto result = parse_entity_value(); !result.is_error()) {
  1214. definition = result.release_value();
  1215. } else {
  1216. auto external_id = TRY(parse_external_id());
  1217. Optional<Name> notation;
  1218. if (auto notation_result = parse_notation_data_declaration(); !notation_result.is_error())
  1219. notation = notation_result.release_value();
  1220. definition = EntityDefinition {
  1221. move(external_id),
  1222. move(notation),
  1223. };
  1224. }
  1225. TRY(skip_whitespace());
  1226. TRY(expect(">"sv));
  1227. rollback.disarm();
  1228. return GEDeclaration {
  1229. move(name),
  1230. move(definition).downcast<DeprecatedString, EntityDefinition>(),
  1231. };
  1232. }
  1233. // 4.2.72 PEDecl, https://www.w3.org/TR/xml/#NT-PEDecl
  1234. ErrorOr<EntityDeclaration, ParseError> Parser::parse_parameter_entity_declaration()
  1235. {
  1236. auto rollback = rollback_point();
  1237. auto rule = enter_rule();
  1238. Variant<DeprecatedString, ExternalID, Empty> definition;
  1239. // PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
  1240. TRY(expect("<!ENTITY"sv));
  1241. auto accept = accept_rule();
  1242. TRY(skip_whitespace(Required::Yes));
  1243. TRY(expect("%"sv));
  1244. TRY(skip_whitespace(Required::Yes));
  1245. auto name = TRY(parse_name());
  1246. TRY(skip_whitespace(Required::Yes));
  1247. // PEDef ::= EntityValue | ExternalID
  1248. if (auto result = parse_entity_value(); !result.is_error())
  1249. definition = result.release_value();
  1250. else
  1251. definition = TRY(parse_external_id());
  1252. TRY(skip_whitespace());
  1253. TRY(expect(">"sv));
  1254. rollback.disarm();
  1255. return PEDeclaration {
  1256. move(name),
  1257. move(definition).downcast<DeprecatedString, ExternalID>(),
  1258. };
  1259. }
  1260. // 4.7.83 PublicID, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PublicID
  1261. ErrorOr<PublicID, ParseError> Parser::parse_public_id()
  1262. {
  1263. auto rollback = rollback_point();
  1264. auto rule = enter_rule();
  1265. // PublicID ::= 'PUBLIC' S PubidLiteral
  1266. TRY(expect("PUBLIC"sv));
  1267. auto accept = accept_rule();
  1268. TRY(skip_whitespace(Required::Yes));
  1269. auto text = TRY(parse_public_id_literal());
  1270. rollback.disarm();
  1271. return PublicID {
  1272. text,
  1273. };
  1274. }
// Characters accepted inside a public identifier literal: space, CR, LF, the listed
// punctuation characters, and ASCII letters and digits.
constexpr static auto s_public_id_characters = set_to_search<StringSet("\x20\x0d\x0a-'()+,./:=?;!*#@$_%")>().unify(ranges_for_search<Range('a', 'z'), Range('A', 'Z'), Range('0', '9')>());
  1276. // 2.3.12, PubidLiteral, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidLiteral
  1277. ErrorOr<StringView, ParseError> Parser::parse_public_id_literal()
  1278. {
  1279. auto rollback = rollback_point();
  1280. auto rule = enter_rule();
  1281. // PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
  1282. auto quote = TRY(expect(is_any_of("'\""sv), "any of ' or \""sv));
  1283. auto accept = accept_rule();
  1284. auto id = TRY(expect_many(
  1285. [q = quote[0]](auto x) {
  1286. return (q == '\'' ? x != '\'' : true) && s_public_id_characters.contains(x);
  1287. },
  1288. "a PubidChar"sv));
  1289. TRY(expect(quote));
  1290. rollback.disarm();
  1291. return id;
  1292. }
  1293. // 2.3.11 SystemLiteral, https://www.w3.org/TR/xml/#NT-SystemLiteral
  1294. ErrorOr<StringView, ParseError> Parser::parse_system_id_literal()
  1295. {
  1296. auto rollback = rollback_point();
  1297. auto rule = enter_rule();
  1298. // SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
  1299. auto quote = TRY(expect(is_any_of("'\""sv), "any of ' or \""sv));
  1300. auto accept = accept_rule();
  1301. auto id = TRY(expect_many(is_not_any_of(quote), "not a quote"sv));
  1302. TRY(expect(quote));
  1303. rollback.disarm();
  1304. return id;
  1305. }
  1306. // 4.2.75 ExternalID, https://www.w3.org/TR/xml/#NT-ExternalID
  1307. ErrorOr<ExternalID, ParseError> Parser::parse_external_id()
  1308. {
  1309. auto rollback = rollback_point();
  1310. auto rule = enter_rule();
  1311. // ExternalID ::= 'SYSTEM' S SystemLiteral
  1312. // | 'PUBLIC' S PubidLiteral S SystemLiteral
  1313. Optional<PublicID> public_id;
  1314. SystemID system_id;
  1315. if (m_lexer.consume_specific("SYSTEM")) {
  1316. auto accept = accept_rule();
  1317. TRY(skip_whitespace(Required::Yes));
  1318. system_id = SystemID { TRY(parse_system_id_literal()) };
  1319. } else {
  1320. TRY(expect("PUBLIC"sv));
  1321. auto accept = accept_rule();
  1322. TRY(skip_whitespace(Required::Yes));
  1323. public_id = PublicID { TRY(parse_public_id_literal()) };
  1324. TRY(skip_whitespace(Required::Yes));
  1325. system_id = SystemID { TRY(parse_system_id_literal()) };
  1326. }
  1327. rollback.disarm();
  1328. return ExternalID {
  1329. move(public_id),
  1330. move(system_id),
  1331. };
  1332. }
  1333. // 4.2.2.76 NDataDecl, https://www.w3.org/TR/xml/#NT-NDataDecl
  1334. ErrorOr<Name, ParseError> Parser::parse_notation_data_declaration()
  1335. {
  1336. auto rollback = rollback_point();
  1337. auto rule = enter_rule();
  1338. // NDataDecl ::= S 'NDATA' S Name
  1339. TRY(skip_whitespace(Required::Yes));
  1340. auto accept = accept_rule();
  1341. TRY(expect("NDATA"sv));
  1342. TRY(skip_whitespace(Required::Yes));
  1343. auto name = TRY(parse_name());
  1344. rollback.disarm();
  1345. return name;
  1346. }
  1347. // 2.3.9 EntityValue, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityValue
  1348. ErrorOr<DeprecatedString, ParseError> Parser::parse_entity_value()
  1349. {
  1350. auto rollback = rollback_point();
  1351. auto rule = enter_rule();
  1352. StringBuilder builder;
  1353. // EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
  1354. // | "'" ([^%&'] | PEReference | Reference)* "'"
  1355. auto quote = TRY(expect(is_any_of("'\""sv), "any of ' or \""sv));
  1356. auto accept = accept_rule();
  1357. while (true) {
  1358. if (m_lexer.is_eof())
  1359. break;
  1360. if (m_lexer.next_is(quote))
  1361. break;
  1362. if (m_lexer.next_is('%')) {
  1363. auto start = m_lexer.tell();
  1364. TRY(parse_parameter_entity_reference());
  1365. builder.append(m_source.substring_view(start, m_lexer.tell() - start));
  1366. continue;
  1367. }
  1368. if (m_lexer.next_is('&')) {
  1369. auto start = m_lexer.tell();
  1370. TRY(parse_reference());
  1371. builder.append(m_source.substring_view(start, m_lexer.tell() - start));
  1372. continue;
  1373. }
  1374. builder.append(m_lexer.consume());
  1375. }
  1376. TRY(expect(quote));
  1377. rollback.disarm();
  1378. return builder.to_deprecated_string();
  1379. }
  1380. // 2.7.18 CDSect, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-CDSect
  1381. ErrorOr<StringView, ParseError> Parser::parse_cdata_section()
  1382. {
  1383. auto rollback = rollback_point();
  1384. auto rule = enter_rule();
  1385. // CDSect ::= CDStart CData CDEnd
  1386. // CDStart ::= '<![CDATA['
  1387. // CData ::= (Char* - (Char* ']]>' Char*))
  1388. // CDEnd ::= ']]>'
  1389. TRY(expect("<![CDATA["sv));
  1390. auto accept = accept_rule();
  1391. auto section_start = m_lexer.tell();
  1392. while (!m_lexer.next_is("]]>")) {
  1393. if (m_lexer.is_eof())
  1394. break;
  1395. m_lexer.ignore();
  1396. }
  1397. auto section_end = m_lexer.tell();
  1398. TRY(expect("]]>"sv));
  1399. rollback.disarm();
  1400. return m_source.substring_view(section_start, section_end - section_start);
  1401. }
  1402. // 2.8.30 extSubset, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-extSubset
  1403. ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_external_subset()
  1404. {
  1405. auto rollback = rollback_point();
  1406. auto rule = enter_rule();
  1407. // extSubset ::= TextDecl? extSubsetDecl
  1408. (void)parse_text_declaration();
  1409. auto result = TRY(parse_external_subset_declaration());
  1410. rollback.disarm();
  1411. return result;
  1412. }
  1413. // 4.3.1.77 TextDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-TextDecl
  1414. ErrorOr<void, ParseError> Parser::parse_text_declaration()
  1415. {
  1416. auto rollback = rollback_point();
  1417. auto rule = enter_rule();
  1418. // TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
  1419. TRY(expect("<?xml"sv));
  1420. auto accept = accept_rule();
  1421. (void)parse_version_info();
  1422. TRY(parse_encoding_decl());
  1423. TRY(skip_whitespace());
  1424. TRY(expect("?>"sv));
  1425. rollback.disarm();
  1426. return {};
  1427. }
  1428. ErrorOr<DeprecatedString, ParseError> Parser::resolve_reference(EntityReference const& reference, ReferencePlacement placement)
  1429. {
  1430. static HashTable<Name> reference_lookup {};
  1431. if (reference_lookup.contains(reference.name))
  1432. return parse_error(m_lexer.tell(), DeprecatedString::formatted("Invalid recursive definition for '{}'", reference.name));
  1433. reference_lookup.set(reference.name);
  1434. ScopeGuard remove_lookup {
  1435. [&] {
  1436. reference_lookup.remove(reference.name);
  1437. }
  1438. };
  1439. Optional<DeprecatedString> resolved;
  1440. if (m_doctype.has_value()) {
  1441. // FIXME: Split these up and resolve them ahead of time.
  1442. for (auto& declaration : m_doctype->markup_declarations) {
  1443. auto entity = declaration.get_pointer<EntityDeclaration>();
  1444. if (!entity)
  1445. continue;
  1446. auto ge_declaration = entity->get_pointer<GEDeclaration>();
  1447. if (!ge_declaration)
  1448. continue;
  1449. if (ge_declaration->name != reference.name)
  1450. continue;
  1451. TRY(ge_declaration->definition.visit(
  1452. [&](DeprecatedString const& definition) -> ErrorOr<void, ParseError> {
  1453. resolved = definition;
  1454. return {};
  1455. },
  1456. [&](EntityDefinition const& definition) -> ErrorOr<void, ParseError> {
  1457. if (placement == ReferencePlacement::AttributeValue)
  1458. return parse_error(m_lexer.tell(), DeprecatedString::formatted("Attribute references external entity '{}'", reference.name));
  1459. if (definition.notation.has_value())
  1460. return parse_error(0u, DeprecatedString::formatted("Entity reference to unparsed entity '{}'", reference.name));
  1461. if (!m_options.resolve_external_resource)
  1462. return parse_error(0u, DeprecatedString::formatted("Failed to resolve external entity '{}'", reference.name));
  1463. auto result = m_options.resolve_external_resource(definition.id.system_id, definition.id.public_id);
  1464. if (result.is_error())
  1465. return parse_error(0u, DeprecatedString::formatted("Failed to resolve external entity '{}': {}", reference.name, result.error()));
  1466. resolved = result.release_value();
  1467. return {};
  1468. }));
  1469. break;
  1470. }
  1471. }
  1472. if (!resolved.has_value()) {
  1473. if (reference.name == "amp")
  1474. return "&";
  1475. if (reference.name == "lt")
  1476. return "<";
  1477. if (reference.name == "gt")
  1478. return ">";
  1479. if (reference.name == "apos")
  1480. return "'";
  1481. if (reference.name == "quot")
  1482. return "\"";
  1483. return parse_error(0u, DeprecatedString::formatted("Reference to undeclared entity '{}'", reference.name));
  1484. }
  1485. StringView resolved_source = *resolved;
  1486. TemporaryChange source { m_source, resolved_source };
  1487. TemporaryChange lexer { m_lexer, GenericLexer(m_source) };
  1488. switch (placement) {
  1489. case ReferencePlacement::AttributeValue:
  1490. return TRY(parse_attribute_value_inner(""sv));
  1491. case ReferencePlacement::Content:
  1492. TRY(parse_content());
  1493. return "";
  1494. default:
  1495. VERIFY_NOT_REACHED();
  1496. }
  1497. }
  1498. }