HTMLDocumentParser.cpp 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595
  1. /*
  2. * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. *
  8. * 1. Redistributions of source code must retain the above copyright notice, this
  9. * list of conditions and the following disclaimer.
  10. *
  11. * 2. Redistributions in binary form must reproduce the above copyright notice,
  12. * this list of conditions and the following disclaimer in the documentation
  13. * and/or other materials provided with the distribution.
  14. *
  15. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  16. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  19. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  21. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  22. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  23. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #include <AK/Utf32View.h>
  27. #include <LibWeb/DOM/Comment.h>
  28. #include <LibWeb/DOM/Document.h>
  29. #include <LibWeb/DOM/DocumentType.h>
  30. #include <LibWeb/DOM/ElementFactory.h>
  31. #include <LibWeb/DOM/HTMLFormElement.h>
  32. #include <LibWeb/DOM/HTMLHeadElement.h>
  33. #include <LibWeb/DOM/HTMLScriptElement.h>
  34. #include <LibWeb/DOM/Text.h>
  35. #include <LibWeb/Parser/HTMLDocumentParser.h>
  36. #include <LibWeb/Parser/HTMLToken.h>
  // Marker for code paths that are not implemented yet; expands to ASSERT_NOT_REACHED().
  // The do/while wrapper makes `TODO();` usable wherever a single statement is expected.
  #define TODO()                \
      do {                      \
          ASSERT_NOT_REACHED(); \
      } while (0)
  41. namespace Web {
  42. HTMLDocumentParser::HTMLDocumentParser(const StringView& input)
  43. : m_tokenizer(input)
  44. {
  45. }
  46. HTMLDocumentParser::~HTMLDocumentParser()
  47. {
  48. }
  49. void HTMLDocumentParser::run(const URL& url)
  50. {
  51. m_document = adopt(*new Document);
  52. m_document->set_url(url);
  53. for (;;) {
  54. auto optional_token = m_tokenizer.next_token();
  55. if (!optional_token.has_value())
  56. return;
  57. auto& token = optional_token.value();
  58. dbg() << "[" << insertion_mode_name() << "] " << token.to_string();
  59. process_using_the_rules_for(m_insertion_mode, token);
  60. }
  61. }
  62. void HTMLDocumentParser::process_using_the_rules_for(InsertionMode mode, HTMLToken& token)
  63. {
  64. switch (mode) {
  65. case InsertionMode::Initial:
  66. handle_initial(token);
  67. break;
  68. case InsertionMode::BeforeHTML:
  69. handle_before_html(token);
  70. break;
  71. case InsertionMode::BeforeHead:
  72. handle_before_head(token);
  73. break;
  74. case InsertionMode::InHead:
  75. handle_in_head(token);
  76. break;
  77. case InsertionMode::InHeadNoscript:
  78. handle_in_head_noscript(token);
  79. break;
  80. case InsertionMode::AfterHead:
  81. handle_after_head(token);
  82. break;
  83. case InsertionMode::InBody:
  84. handle_in_body(token);
  85. break;
  86. case InsertionMode::AfterBody:
  87. handle_after_body(token);
  88. break;
  89. case InsertionMode::AfterAfterBody:
  90. handle_after_after_body(token);
  91. break;
  92. case InsertionMode::Text:
  93. handle_text(token);
  94. break;
  95. default:
  96. ASSERT_NOT_REACHED();
  97. }
  98. }
  99. void HTMLDocumentParser::handle_initial(HTMLToken& token)
  100. {
  101. if (token.type() == HTMLToken::Type::DOCTYPE) {
  102. auto doctype = adopt(*new DocumentType(document()));
  103. doctype->set_name(token.m_doctype.name.to_string());
  104. document().append_child(move(doctype));
  105. m_insertion_mode = InsertionMode::BeforeHTML;
  106. return;
  107. }
  108. ASSERT_NOT_REACHED();
  109. }
  110. void HTMLDocumentParser::handle_before_html(HTMLToken& token)
  111. {
  112. if (token.is_character() && token.is_parser_whitespace()) {
  113. return;
  114. }
  115. if (token.is_start_tag() && token.tag_name() == "html") {
  116. auto element = create_element_for(token);
  117. document().append_child(element);
  118. m_stack_of_open_elements.push(move(element));
  119. m_insertion_mode = InsertionMode::BeforeHead;
  120. return;
  121. }
  122. ASSERT_NOT_REACHED();
  123. }
  124. Element& HTMLDocumentParser::current_node()
  125. {
  126. return m_stack_of_open_elements.current_node();
  127. }
  128. RefPtr<Node> HTMLDocumentParser::find_appropriate_place_for_inserting_node()
  129. {
  130. auto& target = current_node();
  131. if (m_foster_parenting) {
  132. ASSERT_NOT_REACHED();
  133. }
  134. return target;
  135. }
  136. NonnullRefPtr<Element> HTMLDocumentParser::create_element_for(HTMLToken& token)
  137. {
  138. auto element = create_element(document(), token.tag_name());
  139. for (auto& attribute : token.m_tag.attributes) {
  140. element->set_attribute(attribute.name_builder.to_string(), attribute.value_builder.to_string());
  141. }
  142. return element;
  143. }
  144. RefPtr<Element> HTMLDocumentParser::insert_html_element(HTMLToken& token)
  145. {
  146. auto adjusted_insertion_location = find_appropriate_place_for_inserting_node();
  147. auto element = create_element_for(token);
  148. // FIXME: Check if it's possible to insert `element` at `adjusted_insertion_location`
  149. adjusted_insertion_location->append_child(element);
  150. m_stack_of_open_elements.push(element);
  151. return element;
  152. }
  153. void HTMLDocumentParser::handle_before_head(HTMLToken& token)
  154. {
  155. if (token.is_character() && token.is_parser_whitespace()) {
  156. return;
  157. }
  158. if (token.is_start_tag() && token.tag_name() == "head") {
  159. auto element = insert_html_element(token);
  160. m_head_element = to<HTMLHeadElement>(element);
  161. m_insertion_mode = InsertionMode::InHead;
  162. return;
  163. }
  164. ASSERT_NOT_REACHED();
  165. }
  166. void HTMLDocumentParser::insert_comment(HTMLToken& token)
  167. {
  168. auto data = token.m_comment_or_character.data.to_string();
  169. auto adjusted_insertion_location = find_appropriate_place_for_inserting_node();
  170. adjusted_insertion_location->append_child(adopt(*new Comment(document(), data)));
  171. }
  172. void HTMLDocumentParser::handle_in_head(HTMLToken& token)
  173. {
  174. if (token.is_parser_whitespace()) {
  175. insert_character(token.codepoint());
  176. return;
  177. }
  178. if (token.is_comment()) {
  179. insert_comment(token);
  180. return;
  181. }
  182. if (token.is_start_tag() && token.tag_name() == "title") {
  183. insert_html_element(token);
  184. m_tokenizer.switch_to({}, HTMLTokenizer::State::RCDATA);
  185. m_original_insertion_mode = m_insertion_mode;
  186. m_insertion_mode = InsertionMode::Text;
  187. return;
  188. }
  189. if (token.is_start_tag() && ((token.tag_name() == "noscript" && m_scripting_enabled) || token.tag_name() == "noframes" || token.tag_name() == "style")) {
  190. parse_generic_raw_text_element(token);
  191. return;
  192. }
  193. if (token.is_start_tag() && token.tag_name() == "script") {
  194. auto adjusted_insertion_location = find_appropriate_place_for_inserting_node();
  195. auto element = create_element_for(token);
  196. auto& script_element = to<HTMLScriptElement>(*element);
  197. script_element.set_parser_document({}, document());
  198. script_element.set_non_blocking({}, false);
  199. if (m_parsing_fragment) {
  200. TODO();
  201. }
  202. if (m_invoked_via_document_write) {
  203. TODO();
  204. }
  205. adjusted_insertion_location->append_child(element, false);
  206. m_stack_of_open_elements.push(element);
  207. m_tokenizer.switch_to({}, HTMLTokenizer::State::ScriptData);
  208. m_original_insertion_mode = m_insertion_mode;
  209. m_insertion_mode = InsertionMode::Text;
  210. return;
  211. }
  212. if (token.is_start_tag() && token.tag_name() == "meta") {
  213. auto element = insert_html_element(token);
  214. m_stack_of_open_elements.pop();
  215. if (token.is_self_closing()) {
  216. ASSERT_NOT_REACHED();
  217. }
  218. return;
  219. }
  220. if (token.is_end_tag() && token.tag_name() == "head") {
  221. m_stack_of_open_elements.pop();
  222. m_insertion_mode = InsertionMode::AfterHead;
  223. return;
  224. }
  225. ASSERT_NOT_REACHED();
  226. }
  227. void HTMLDocumentParser::handle_in_head_noscript(HTMLToken&)
  228. {
  229. ASSERT_NOT_REACHED();
  230. }
  231. void HTMLDocumentParser::parse_generic_raw_text_element(HTMLToken& token)
  232. {
  233. insert_html_element(token);
  234. m_tokenizer.switch_to({}, HTMLTokenizer::State::RAWTEXT);
  235. m_original_insertion_mode = m_insertion_mode;
  236. m_insertion_mode = InsertionMode::Text;
  237. }
  238. void HTMLDocumentParser::insert_character(u32 data)
  239. {
  240. auto adjusted_insertion_location = find_appropriate_place_for_inserting_node();
  241. if (adjusted_insertion_location->is_document())
  242. return;
  243. if (adjusted_insertion_location->last_child() && adjusted_insertion_location->last_child()->is_text()) {
  244. auto& existing_text_node = to<Text>(*adjusted_insertion_location->last_child());
  245. StringBuilder builder;
  246. builder.append(existing_text_node.data());
  247. builder.append(Utf32View { &data, 1 });
  248. existing_text_node.set_data(builder.to_string());
  249. return;
  250. }
  251. StringBuilder builder;
  252. builder.append(Utf32View { &data, 1 });
  253. adjusted_insertion_location->append_child(adopt(*new Text(document(), builder.to_string())));
  254. }
  255. void HTMLDocumentParser::handle_after_head(HTMLToken& token)
  256. {
  257. if (token.is_character()) {
  258. if (token.is_parser_whitespace()) {
  259. insert_character(token.codepoint());
  260. return;
  261. }
  262. ASSERT_NOT_REACHED();
  263. }
  264. if (token.is_comment()) {
  265. ASSERT_NOT_REACHED();
  266. }
  267. if (token.is_doctype()) {
  268. ASSERT_NOT_REACHED();
  269. }
  270. if (token.is_start_tag() && token.tag_name() == "html") {
  271. ASSERT_NOT_REACHED();
  272. }
  273. if (token.is_start_tag() && token.tag_name() == "body") {
  274. insert_html_element(token);
  275. m_frameset_ok = false;
  276. m_insertion_mode = InsertionMode::InBody;
  277. return;
  278. }
  279. if (token.is_start_tag() && token.tag_name() == "frameset") {
  280. ASSERT_NOT_REACHED();
  281. }
  282. {
  283. Vector<String> names = { "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "template", "title" };
  284. if (token.is_start_tag() && names.contains_slow(token.tag_name())) {
  285. ASSERT_NOT_REACHED();
  286. }
  287. }
  288. if (token.is_end_tag() && token.tag_name() == "template") {
  289. ASSERT_NOT_REACHED();
  290. }
  291. if (token.is_end_tag() && (token.tag_name() == "body" || token.tag_name() == "html" || token.tag_name() == "br")) {
  292. goto AnythingElse;
  293. }
  294. if ((token.is_start_tag() && token.tag_name() == "head") || token.is_end_tag()) {
  295. ASSERT_NOT_REACHED();
  296. }
  297. AnythingElse:
  298. HTMLToken fake_body_token;
  299. fake_body_token.m_type = HTMLToken::Type::StartTag;
  300. fake_body_token.m_tag.tag_name.append("body");
  301. insert_html_element(fake_body_token);
  302. m_insertion_mode = InsertionMode::InBody;
  303. // FIXME: Reprocess the current token in InBody!
  304. }
  305. void HTMLDocumentParser::generate_implied_end_tags(const FlyString& exception)
  306. {
  307. static Vector<FlyString> names { "dd", "dt", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc" };
  308. while (current_node().tag_name() != exception && names.contains_slow(current_node().tag_name()))
  309. m_stack_of_open_elements.pop();
  310. }
  311. void HTMLDocumentParser::close_a_p_element()
  312. {
  313. generate_implied_end_tags("p");
  314. if (current_node().tag_name() != "p") {
  315. // Parse error.
  316. TODO();
  317. }
  318. for (;;) {
  319. auto popped_element = m_stack_of_open_elements.pop();
  320. if (popped_element->tag_name() == "p")
  321. break;
  322. }
  323. }
  324. void HTMLDocumentParser::handle_after_body(HTMLToken& token)
  325. {
  326. if (token.is_character() && token.is_parser_whitespace()) {
  327. process_using_the_rules_for(InsertionMode::InBody, token);
  328. return;
  329. }
  330. if (token.is_end_tag() && token.tag_name() == "html") {
  331. if (m_parsing_fragment) {
  332. ASSERT_NOT_REACHED();
  333. }
  334. m_insertion_mode = InsertionMode::AfterAfterBody;
  335. return;
  336. }
  337. ASSERT_NOT_REACHED();
  338. }
  339. void HTMLDocumentParser::handle_after_after_body(HTMLToken& token)
  340. {
  341. if (token.is_doctype() || token.is_parser_whitespace() || (token.is_start_tag() && token.tag_name() == "html")) {
  342. process_using_the_rules_for(InsertionMode::InBody, token);
  343. return;
  344. }
  345. if (token.is_end_of_file()) {
  346. dbg() << "Stop parsing! :^)";
  347. return;
  348. }
  349. ASSERT_NOT_REACHED();
  350. }
  351. void HTMLDocumentParser::reconstruct_the_active_formatting_elements()
  352. {
  353. if (m_list_of_active_formatting_elements.is_empty())
  354. return;
  355. ASSERT_NOT_REACHED();
  356. }
  357. void HTMLDocumentParser::handle_in_body(HTMLToken& token)
  358. {
  359. if (token.is_character()) {
  360. if (token.codepoint() == 0) {
  361. ASSERT_NOT_REACHED();
  362. }
  363. if (token.is_parser_whitespace()) {
  364. reconstruct_the_active_formatting_elements();
  365. insert_character(token.codepoint());
  366. return;
  367. }
  368. reconstruct_the_active_formatting_elements();
  369. insert_character(token.codepoint());
  370. m_frameset_ok = false;
  371. return;
  372. }
  373. if (token.is_end_tag() && token.tag_name() == "body") {
  374. if (!m_stack_of_open_elements.has_in_scope("body")) {
  375. ASSERT_NOT_REACHED();
  376. }
  377. // FIXME: Otherwise, if there is a node in the stack of open elements that is
  378. // not either a dd element, a dt element, an li element, an optgroup element,
  379. // an option element, a p element, an rb element, an rp element, an rt element,
  380. // an rtc element, a tbody element, a td element, a tfoot element, a th element,
  381. // a thead element, a tr element, the body element, or the html element,
  382. // then this is a parse error.
  383. m_insertion_mode = InsertionMode::AfterBody;
  384. return;
  385. }
  386. {
  387. static Vector<FlyString> names { "h1", "h2", "h3", "h4", "h5", "h6" };
  388. if (token.is_start_tag() && names.contains_slow(token.tag_name())) {
  389. if (m_stack_of_open_elements.has_in_button_scope("p"))
  390. close_a_p_element();
  391. if (names.contains_slow(current_node().tag_name())) {
  392. // FIXME: This is a parse error!
  393. TODO();
  394. }
  395. insert_html_element(token);
  396. return;
  397. }
  398. }
  399. {
  400. static Vector<FlyString> names { "h1", "h2", "h3", "h4", "h5", "h6" };
  401. if (token.is_end_tag() && names.contains_slow(token.tag_name())) {
  402. if (!m_stack_of_open_elements.has_in_scope("h1")
  403. && !m_stack_of_open_elements.has_in_scope("h2")
  404. && !m_stack_of_open_elements.has_in_scope("h3")
  405. && !m_stack_of_open_elements.has_in_scope("h4")
  406. && !m_stack_of_open_elements.has_in_scope("h5")
  407. && !m_stack_of_open_elements.has_in_scope("h6")) {
  408. TODO();
  409. }
  410. generate_implied_end_tags();
  411. if (current_node().tag_name() != token.tag_name()) {
  412. TODO();
  413. }
  414. for (;;) {
  415. auto popped_element = m_stack_of_open_elements.pop();
  416. if (popped_element->tag_name() == "h1"
  417. || popped_element->tag_name() == "h2"
  418. || popped_element->tag_name() == "h3"
  419. || popped_element->tag_name() == "h4"
  420. || popped_element->tag_name() == "h5"
  421. || popped_element->tag_name() == "h6") {
  422. break;
  423. }
  424. }
  425. return;
  426. }
  427. }
  428. if (token.is_end_tag() && token.tag_name() == "p") {
  429. if (!m_stack_of_open_elements.has_in_button_scope("p")) {
  430. TODO();
  431. }
  432. close_a_p_element();
  433. return;
  434. }
  435. {
  436. static Vector<FlyString> names { "b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u" };
  437. if (token.is_start_tag() && names.contains_slow(token.tag_name())) {
  438. reconstruct_the_active_formatting_elements();
  439. auto element = insert_html_element(token);
  440. m_list_of_active_formatting_elements.append(*element);
  441. return;
  442. }
  443. }
  444. {
  445. Vector<String> names { "address", "article", "aside", "blockquote", "center", "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p", "section", "summary", "ul" };
  446. if (token.is_start_tag() && names.contains_slow(token.tag_name())) {
  447. // FIXME: If the stack of open elements has a p element in button scope, then close a p element.
  448. insert_html_element(token);
  449. return;
  450. }
  451. if (token.is_end_tag() && names.contains_slow(token.tag_name())) {
  452. // FIXME: If the stack of open elements has a p element in button scope, then close a p element.
  453. if (!m_stack_of_open_elements.has_in_scope(token.tag_name())) {
  454. ASSERT_NOT_REACHED();
  455. }
  456. generate_implied_end_tags();
  457. if (current_node().tag_name() != token.tag_name()) {
  458. ASSERT_NOT_REACHED();
  459. }
  460. m_stack_of_open_elements.pop();
  461. return;
  462. }
  463. }
  464. if (token.is_start_tag()) {
  465. reconstruct_the_active_formatting_elements();
  466. insert_html_element(token);
  467. return;
  468. }
  469. ASSERT_NOT_REACHED();
  470. }
  471. void HTMLDocumentParser::increment_script_nesting_level()
  472. {
  473. ++m_script_nesting_level;
  474. }
  475. void HTMLDocumentParser::decrement_script_nesting_level()
  476. {
  477. ASSERT(m_script_nesting_level);
  478. --m_script_nesting_level;
  479. }
  480. void HTMLDocumentParser::handle_text(HTMLToken& token)
  481. {
  482. if (token.is_character()) {
  483. insert_character(token.codepoint());
  484. return;
  485. }
  486. if (token.is_end_tag() && token.tag_name() == "script") {
  487. NonnullRefPtr<HTMLScriptElement> script = to<HTMLScriptElement>(current_node());
  488. m_stack_of_open_elements.pop();
  489. m_insertion_mode = m_original_insertion_mode;
  490. // FIXME: Handle tokenizer insertion point stuff here.
  491. increment_script_nesting_level();
  492. script->prepare_script({});
  493. decrement_script_nesting_level();
  494. if (script_nesting_level() == 0)
  495. m_parser_pause_flag = false;
  496. // FIXME: Handle tokenizer insertion point stuff here too.
  497. return;
  498. }
  499. if (token.is_end_tag()) {
  500. m_stack_of_open_elements.pop();
  501. m_insertion_mode = m_original_insertion_mode;
  502. return;
  503. }
  504. ASSERT_NOT_REACHED();
  505. }
  506. const char* HTMLDocumentParser::insertion_mode_name() const
  507. {
  508. switch (m_insertion_mode) {
  509. #define __ENUMERATE_INSERTION_MODE(mode) \
  510. case InsertionMode::mode: \
  511. return #mode;
  512. ENUMERATE_INSERTION_MODES
  513. #undef __ENUMERATE_INSERTION_MODE
  514. }
  515. ASSERT_NOT_REACHED();
  516. }
  517. Document& HTMLDocumentParser::document()
  518. {
  519. return *m_document;
  520. }
  521. }