HTMLDocumentParser.cpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508
  1. /*
  2. * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. *
  8. * 1. Redistributions of source code must retain the above copyright notice, this
  9. * list of conditions and the following disclaimer.
  10. *
  11. * 2. Redistributions in binary form must reproduce the above copyright notice,
  12. * this list of conditions and the following disclaimer in the documentation
  13. * and/or other materials provided with the distribution.
  14. *
  15. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  16. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  19. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  21. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  22. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  23. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #include <AK/Utf32View.h>
  27. #include <LibWeb/DOM/Comment.h>
  28. #include <LibWeb/DOM/Document.h>
  29. #include <LibWeb/DOM/DocumentType.h>
  30. #include <LibWeb/DOM/ElementFactory.h>
  31. #include <LibWeb/DOM/HTMLFormElement.h>
  32. #include <LibWeb/DOM/HTMLHeadElement.h>
  33. #include <LibWeb/DOM/HTMLScriptElement.h>
  34. #include <LibWeb/DOM/Text.h>
  35. #include <LibWeb/Parser/HTMLDocumentParser.h>
  36. #include <LibWeb/Parser/HTMLToken.h>
  37. #define TODO() \
  38. do { \
  39. ASSERT_NOT_REACHED(); \
  40. } while (0)
  41. namespace Web {
  42. HTMLDocumentParser::HTMLDocumentParser(const StringView& input)
  43. : m_tokenizer(input)
  44. {
  45. }
  46. HTMLDocumentParser::~HTMLDocumentParser()
  47. {
  48. }
  49. void HTMLDocumentParser::run(const URL& url)
  50. {
  51. m_document = adopt(*new Document);
  52. m_document->set_url(url);
  53. for (;;) {
  54. auto optional_token = m_tokenizer.next_token();
  55. if (!optional_token.has_value())
  56. return;
  57. auto& token = optional_token.value();
  58. dbg() << "[" << insertion_mode_name() << "] " << token.to_string();
  59. process_using_the_rules_for(m_insertion_mode, token);
  60. }
  61. }
  62. void HTMLDocumentParser::process_using_the_rules_for(InsertionMode mode, HTMLToken& token)
  63. {
  64. switch (mode) {
  65. case InsertionMode::Initial:
  66. handle_initial(token);
  67. break;
  68. case InsertionMode::BeforeHTML:
  69. handle_before_html(token);
  70. break;
  71. case InsertionMode::BeforeHead:
  72. handle_before_head(token);
  73. break;
  74. case InsertionMode::InHead:
  75. handle_in_head(token);
  76. break;
  77. case InsertionMode::InHeadNoscript:
  78. handle_in_head_noscript(token);
  79. break;
  80. case InsertionMode::AfterHead:
  81. handle_after_head(token);
  82. break;
  83. case InsertionMode::InBody:
  84. handle_in_body(token);
  85. break;
  86. case InsertionMode::AfterBody:
  87. handle_after_body(token);
  88. break;
  89. case InsertionMode::AfterAfterBody:
  90. handle_after_after_body(token);
  91. break;
  92. case InsertionMode::Text:
  93. handle_text(token);
  94. break;
  95. default:
  96. ASSERT_NOT_REACHED();
  97. }
  98. }
  99. void HTMLDocumentParser::handle_initial(HTMLToken& token)
  100. {
  101. if (token.type() == HTMLToken::Type::DOCTYPE) {
  102. auto doctype = adopt(*new DocumentType(document()));
  103. doctype->set_name(token.m_doctype.name.to_string());
  104. document().append_child(move(doctype));
  105. m_insertion_mode = InsertionMode::BeforeHTML;
  106. return;
  107. }
  108. ASSERT_NOT_REACHED();
  109. }
  110. void HTMLDocumentParser::handle_before_html(HTMLToken& token)
  111. {
  112. if (token.is_character() && token.is_parser_whitespace()) {
  113. return;
  114. }
  115. if (token.is_start_tag() && token.tag_name() == "html") {
  116. auto element = create_element_for(token);
  117. document().append_child(element);
  118. m_stack_of_open_elements.push(move(element));
  119. m_insertion_mode = InsertionMode::BeforeHead;
  120. return;
  121. }
  122. ASSERT_NOT_REACHED();
  123. }
  124. Element& HTMLDocumentParser::current_node()
  125. {
  126. return m_stack_of_open_elements.current_node();
  127. }
  128. RefPtr<Node> HTMLDocumentParser::find_appropriate_place_for_inserting_node()
  129. {
  130. auto& target = current_node();
  131. if (m_foster_parenting) {
  132. ASSERT_NOT_REACHED();
  133. }
  134. return target;
  135. }
  136. NonnullRefPtr<Element> HTMLDocumentParser::create_element_for(HTMLToken& token)
  137. {
  138. auto element = create_element(document(), token.tag_name());
  139. for (auto& attribute : token.m_tag.attributes) {
  140. element->set_attribute(attribute.name_builder.to_string(), attribute.value_builder.to_string());
  141. }
  142. return element;
  143. }
  144. RefPtr<Element> HTMLDocumentParser::insert_html_element(HTMLToken& token)
  145. {
  146. auto adjusted_insertion_location = find_appropriate_place_for_inserting_node();
  147. auto element = create_element_for(token);
  148. // FIXME: Check if it's possible to insert `element` at `adjusted_insertion_location`
  149. adjusted_insertion_location->append_child(element);
  150. m_stack_of_open_elements.push(element);
  151. return element;
  152. }
  153. void HTMLDocumentParser::handle_before_head(HTMLToken& token)
  154. {
  155. if (token.is_character() && token.is_parser_whitespace()) {
  156. return;
  157. }
  158. if (token.is_start_tag() && token.tag_name() == "head") {
  159. auto element = insert_html_element(token);
  160. m_head_element = to<HTMLHeadElement>(element);
  161. m_insertion_mode = InsertionMode::InHead;
  162. return;
  163. }
  164. ASSERT_NOT_REACHED();
  165. }
  166. void HTMLDocumentParser::insert_comment(HTMLToken& token)
  167. {
  168. auto data = token.m_comment_or_character.data.to_string();
  169. auto adjusted_insertion_location = find_appropriate_place_for_inserting_node();
  170. adjusted_insertion_location->append_child(adopt(*new Comment(document(), data)));
  171. }
  172. void HTMLDocumentParser::handle_in_head(HTMLToken& token)
  173. {
  174. if (token.is_parser_whitespace()) {
  175. insert_character(token.codepoint());
  176. return;
  177. }
  178. if (token.is_comment()) {
  179. insert_comment(token);
  180. return;
  181. }
  182. if (token.is_start_tag() && token.tag_name() == "title") {
  183. insert_html_element(token);
  184. m_tokenizer.switch_to({}, HTMLTokenizer::State::RCDATA);
  185. m_original_insertion_mode = m_insertion_mode;
  186. m_insertion_mode = InsertionMode::Text;
  187. return;
  188. }
  189. if (token.is_start_tag() && ((token.tag_name() == "noscript" && m_scripting_enabled) || token.tag_name() == "noframes" || token.tag_name() == "style")) {
  190. parse_generic_raw_text_element(token);
  191. return;
  192. }
  193. if (token.is_start_tag() && token.tag_name() == "script") {
  194. auto adjusted_insertion_location = find_appropriate_place_for_inserting_node();
  195. auto element = create_element_for(token);
  196. auto& script_element = to<HTMLScriptElement>(*element);
  197. script_element.set_parser_document({}, document());
  198. script_element.set_non_blocking({}, false);
  199. if (m_parsing_fragment) {
  200. TODO();
  201. }
  202. if (m_invoked_via_document_write) {
  203. TODO();
  204. }
  205. adjusted_insertion_location->append_child(element, false);
  206. m_stack_of_open_elements.push(element);
  207. m_tokenizer.switch_to({}, HTMLTokenizer::State::ScriptData);
  208. m_original_insertion_mode = m_insertion_mode;
  209. m_insertion_mode = InsertionMode::Text;
  210. return;
  211. }
  212. if (token.is_start_tag() && token.tag_name() == "meta") {
  213. auto element = insert_html_element(token);
  214. m_stack_of_open_elements.pop();
  215. if (token.is_self_closing()) {
  216. ASSERT_NOT_REACHED();
  217. }
  218. return;
  219. }
  220. if (token.is_end_tag() && token.tag_name() == "head") {
  221. m_stack_of_open_elements.pop();
  222. m_insertion_mode = InsertionMode::AfterHead;
  223. return;
  224. }
  225. ASSERT_NOT_REACHED();
  226. }
  227. void HTMLDocumentParser::handle_in_head_noscript(HTMLToken&)
  228. {
  229. ASSERT_NOT_REACHED();
  230. }
  231. void HTMLDocumentParser::parse_generic_raw_text_element(HTMLToken& token)
  232. {
  233. insert_html_element(token);
  234. m_tokenizer.switch_to({}, HTMLTokenizer::State::RAWTEXT);
  235. m_original_insertion_mode = m_insertion_mode;
  236. m_insertion_mode = InsertionMode::Text;
  237. }
  238. void HTMLDocumentParser::insert_character(u32 data)
  239. {
  240. auto adjusted_insertion_location = find_appropriate_place_for_inserting_node();
  241. if (adjusted_insertion_location->is_document())
  242. return;
  243. if (adjusted_insertion_location->last_child() && adjusted_insertion_location->last_child()->is_text()) {
  244. auto& existing_text_node = to<Text>(*adjusted_insertion_location->last_child());
  245. StringBuilder builder;
  246. builder.append(existing_text_node.data());
  247. builder.append(Utf32View { &data, 1 });
  248. existing_text_node.set_data(builder.to_string());
  249. return;
  250. }
  251. StringBuilder builder;
  252. builder.append(Utf32View { &data, 1 });
  253. adjusted_insertion_location->append_child(adopt(*new Text(document(), builder.to_string())));
  254. }
  255. void HTMLDocumentParser::handle_after_head(HTMLToken& token)
  256. {
  257. if (token.is_character()) {
  258. if (token.is_parser_whitespace()) {
  259. insert_character(token.codepoint());
  260. return;
  261. }
  262. ASSERT_NOT_REACHED();
  263. }
  264. if (token.is_comment()) {
  265. ASSERT_NOT_REACHED();
  266. }
  267. if (token.is_doctype()) {
  268. ASSERT_NOT_REACHED();
  269. }
  270. if (token.is_start_tag() && token.tag_name() == "html") {
  271. ASSERT_NOT_REACHED();
  272. }
  273. if (token.is_start_tag() && token.tag_name() == "body") {
  274. insert_html_element(token);
  275. m_frameset_ok = false;
  276. m_insertion_mode = InsertionMode::InBody;
  277. return;
  278. }
  279. if (token.is_start_tag() && token.tag_name() == "frameset") {
  280. ASSERT_NOT_REACHED();
  281. }
  282. {
  283. Vector<String> names = { "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "template", "title" };
  284. if (token.is_start_tag() && names.contains_slow(token.tag_name())) {
  285. ASSERT_NOT_REACHED();
  286. }
  287. }
  288. if (token.is_end_tag() && token.tag_name() == "template") {
  289. ASSERT_NOT_REACHED();
  290. }
  291. if (token.is_end_tag() && (token.tag_name() == "body" || token.tag_name() == "html" || token.tag_name() == "br")) {
  292. goto AnythingElse;
  293. }
  294. if ((token.is_start_tag() && token.tag_name() == "head") || token.is_end_tag()) {
  295. ASSERT_NOT_REACHED();
  296. }
  297. AnythingElse:
  298. HTMLToken fake_body_token;
  299. fake_body_token.m_type = HTMLToken::Type::StartTag;
  300. fake_body_token.m_tag.tag_name.append("body");
  301. insert_html_element(fake_body_token);
  302. m_insertion_mode = InsertionMode::InBody;
  303. // FIXME: Reprocess the current token in InBody!
  304. }
  305. void HTMLDocumentParser::generate_implied_end_tags()
  306. {
  307. Vector<String> names { "dd", "dt", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc" };
  308. while (names.contains_slow(current_node().tag_name()))
  309. m_stack_of_open_elements.pop();
  310. }
  311. void HTMLDocumentParser::handle_after_body(HTMLToken& token)
  312. {
  313. if (token.is_character() && token.is_parser_whitespace()) {
  314. process_using_the_rules_for(InsertionMode::InBody, token);
  315. return;
  316. }
  317. if (token.is_end_tag() && token.tag_name() == "html") {
  318. if (m_parsing_fragment) {
  319. ASSERT_NOT_REACHED();
  320. }
  321. m_insertion_mode = InsertionMode::AfterAfterBody;
  322. return;
  323. }
  324. ASSERT_NOT_REACHED();
  325. }
  326. void HTMLDocumentParser::handle_after_after_body(HTMLToken& token)
  327. {
  328. if (token.is_doctype() || token.is_parser_whitespace() || (token.is_start_tag() && token.tag_name() == "html")) {
  329. process_using_the_rules_for(InsertionMode::InBody, token);
  330. return;
  331. }
  332. if (token.is_end_of_file()) {
  333. dbg() << "Stop parsing! :^)";
  334. return;
  335. }
  336. ASSERT_NOT_REACHED();
  337. }
  338. void HTMLDocumentParser::reconstruct_the_active_formatting_elements()
  339. {
  340. if (m_list_of_active_formatting_elements.is_empty())
  341. return;
  342. ASSERT_NOT_REACHED();
  343. }
  344. void HTMLDocumentParser::handle_in_body(HTMLToken& token)
  345. {
  346. if (token.is_character()) {
  347. if (token.codepoint() == 0) {
  348. ASSERT_NOT_REACHED();
  349. }
  350. if (token.is_parser_whitespace()) {
  351. reconstruct_the_active_formatting_elements();
  352. insert_character(token.codepoint());
  353. return;
  354. }
  355. }
  356. if (token.is_end_tag() && token.tag_name() == "body") {
  357. if (!m_stack_of_open_elements.has_in_scope("body")) {
  358. ASSERT_NOT_REACHED();
  359. }
  360. // FIXME: Otherwise, if there is a node in the stack of open elements that is
  361. // not either a dd element, a dt element, an li element, an optgroup element,
  362. // an option element, a p element, an rb element, an rp element, an rt element,
  363. // an rtc element, a tbody element, a td element, a tfoot element, a th element,
  364. // a thead element, a tr element, the body element, or the html element,
  365. // then this is a parse error.
  366. m_insertion_mode = InsertionMode::AfterBody;
  367. return;
  368. }
  369. {
  370. Vector<String> names { "address", "article", "aside", "blockquote", "center", "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p", "section", "summary", "ul" };
  371. if (token.is_start_tag() && names.contains_slow(token.tag_name())) {
  372. // FIXME: If the stack of open elements has a p element in button scope, then close a p element.
  373. insert_html_element(token);
  374. return;
  375. }
  376. if (token.is_end_tag() && names.contains_slow(token.tag_name())) {
  377. // FIXME: If the stack of open elements has a p element in button scope, then close a p element.
  378. if (!m_stack_of_open_elements.has_in_scope(token.tag_name())) {
  379. ASSERT_NOT_REACHED();
  380. }
  381. generate_implied_end_tags();
  382. if (current_node().tag_name() != token.tag_name()) {
  383. ASSERT_NOT_REACHED();
  384. }
  385. m_stack_of_open_elements.pop();
  386. return;
  387. }
  388. }
  389. ASSERT_NOT_REACHED();
  390. }
  391. void HTMLDocumentParser::increment_script_nesting_level()
  392. {
  393. ++m_script_nesting_level;
  394. }
  395. void HTMLDocumentParser::decrement_script_nesting_level()
  396. {
  397. ASSERT(m_script_nesting_level);
  398. --m_script_nesting_level;
  399. }
  400. void HTMLDocumentParser::handle_text(HTMLToken& token)
  401. {
  402. if (token.is_character()) {
  403. insert_character(token.codepoint());
  404. return;
  405. }
  406. if (token.is_end_tag() && token.tag_name() == "script") {
  407. NonnullRefPtr<HTMLScriptElement> script = to<HTMLScriptElement>(current_node());
  408. m_stack_of_open_elements.pop();
  409. m_insertion_mode = m_original_insertion_mode;
  410. // FIXME: Handle tokenizer insertion point stuff here.
  411. increment_script_nesting_level();
  412. script->prepare_script({});
  413. decrement_script_nesting_level();
  414. if (script_nesting_level() == 0)
  415. m_parser_pause_flag = false;
  416. // FIXME: Handle tokenizer insertion point stuff here too.
  417. return;
  418. }
  419. if (token.is_end_tag()) {
  420. m_stack_of_open_elements.pop();
  421. m_insertion_mode = m_original_insertion_mode;
  422. return;
  423. }
  424. ASSERT_NOT_REACHED();
  425. }
  426. const char* HTMLDocumentParser::insertion_mode_name() const
  427. {
  428. switch (m_insertion_mode) {
  429. #define __ENUMERATE_INSERTION_MODE(mode) \
  430. case InsertionMode::mode: \
  431. return #mode;
  432. ENUMERATE_INSERTION_MODES
  433. #undef __ENUMERATE_INSERTION_MODE
  434. }
  435. ASSERT_NOT_REACHED();
  436. }
  437. Document& HTMLDocumentParser::document()
  438. {
  439. return *m_document;
  440. }
  441. }