HTMLDocumentParser.cpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353
  /*
   * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
   * All rights reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions are met:
   *
   * 1. Redistributions of source code must retain the above copyright notice, this
   *    list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright notice,
   *    this list of conditions and the following disclaimer in the documentation
   *    and/or other materials provided with the distribution.
   *
   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
   * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
   * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   */
  26. #include <LibWeb/DOM/Document.h>
  27. #include <LibWeb/DOM/DocumentType.h>
  28. #include <LibWeb/DOM/ElementFactory.h>
  29. #include <LibWeb/DOM/HTMLFormElement.h>
  30. #include <LibWeb/DOM/HTMLHeadElement.h>
  31. #include <LibWeb/Parser/HTMLDocumentParser.h>
  32. #include <LibWeb/Parser/HTMLToken.h>
  33. namespace Web {
  34. HTMLDocumentParser::HTMLDocumentParser(const StringView& input)
  35. : m_tokenizer(input)
  36. {
  37. }
  38. HTMLDocumentParser::~HTMLDocumentParser()
  39. {
  40. }
  41. void HTMLDocumentParser::run()
  42. {
  43. m_document = adopt(*new Document);
  44. for (;;) {
  45. auto optional_token = m_tokenizer.next_token();
  46. if (!optional_token.has_value())
  47. return;
  48. auto& token = optional_token.value();
  49. dbg() << "[" << insertion_mode_name() << "] " << token.to_string();
  50. switch (m_insertion_mode) {
  51. case InsertionMode::Initial:
  52. handle_initial(token);
  53. break;
  54. case InsertionMode::BeforeHTML:
  55. handle_before_html(token);
  56. break;
  57. case InsertionMode::BeforeHead:
  58. handle_before_head(token);
  59. break;
  60. case InsertionMode::InHead:
  61. handle_in_head(token);
  62. break;
  63. case InsertionMode::InHeadNoscript:
  64. handle_in_head_noscript(token);
  65. break;
  66. case InsertionMode::AfterHead:
  67. handle_after_head(token);
  68. break;
  69. case InsertionMode::InBody:
  70. handle_in_body(token);
  71. break;
  72. case InsertionMode::AfterBody:
  73. handle_after_body(token);
  74. break;
  75. case InsertionMode::AfterAfterBody:
  76. handle_after_after_body(token);
  77. break;
  78. case InsertionMode::Text:
  79. handle_text(token);
  80. break;
  81. default:
  82. ASSERT_NOT_REACHED();
  83. }
  84. }
  85. }
  86. void HTMLDocumentParser::handle_initial(HTMLToken& token)
  87. {
  88. if (token.type() == HTMLToken::Type::DOCTYPE) {
  89. auto doctype = adopt(*new DocumentType(document()));
  90. doctype->set_name(token.m_doctype.name.to_string());
  91. document().append_child(move(doctype));
  92. m_insertion_mode = InsertionMode::BeforeHTML;
  93. return;
  94. }
  95. ASSERT_NOT_REACHED();
  96. }
  97. void HTMLDocumentParser::handle_before_html(HTMLToken& token)
  98. {
  99. if (token.is_start_tag() && token.tag_name() == "html") {
  100. auto element = create_element_for(token);
  101. document().append_child(element);
  102. m_stack_of_open_elements.append(element);
  103. m_insertion_mode = InsertionMode::BeforeHead;
  104. return;
  105. }
  106. ASSERT_NOT_REACHED();
  107. }
  108. NonnullRefPtr<Node> HTMLDocumentParser::current_node()
  109. {
  110. return m_stack_of_open_elements.last();
  111. }
  112. RefPtr<Node> HTMLDocumentParser::find_appropriate_place_for_inserting_node()
  113. {
  114. auto target = current_node();
  115. if (m_foster_parenting) {
  116. ASSERT_NOT_REACHED();
  117. }
  118. return target;
  119. }
  120. NonnullRefPtr<Element> HTMLDocumentParser::create_element_for(HTMLToken& token)
  121. {
  122. auto element = create_element(document(), token.tag_name());
  123. for (auto& attribute : token.m_tag.attributes) {
  124. element->set_attribute(attribute.name_builder.to_string(), attribute.value_builder.to_string());
  125. }
  126. return element;
  127. }
  128. RefPtr<Element> HTMLDocumentParser::insert_html_element(HTMLToken& token)
  129. {
  130. auto adjusted_insertion_location = find_appropriate_place_for_inserting_node();
  131. auto element = create_element_for(token);
  132. // FIXME: Check if it's possible to insert `element` at `adjusted_insertion_location`
  133. adjusted_insertion_location->append_child(element);
  134. m_stack_of_open_elements.append(element);
  135. return element;
  136. }
  137. void HTMLDocumentParser::handle_before_head(HTMLToken& token)
  138. {
  139. if (token.is_start_tag() && token.tag_name() == "head") {
  140. auto element = insert_html_element(token);
  141. m_head_element = to<HTMLHeadElement>(element);
  142. m_insertion_mode = InsertionMode::InHead;
  143. return;
  144. }
  145. ASSERT_NOT_REACHED();
  146. }
  147. void HTMLDocumentParser::handle_in_head(HTMLToken& token)
  148. {
  149. if (token.is_start_tag() && token.tag_name() == "meta") {
  150. auto element = insert_html_element(token);
  151. m_stack_of_open_elements.take_last();
  152. if (token.is_self_closing()) {
  153. ASSERT_NOT_REACHED();
  154. }
  155. return;
  156. }
  157. if (token.is_end_tag() && token.tag_name() == "head") {
  158. m_stack_of_open_elements.take_last();
  159. m_insertion_mode = InsertionMode::AfterHead;
  160. return;
  161. }
  162. ASSERT_NOT_REACHED();
  163. }
  164. void HTMLDocumentParser::handle_in_head_noscript(HTMLToken&)
  165. {
  166. ASSERT_NOT_REACHED();
  167. }
  168. void HTMLDocumentParser::handle_after_head(HTMLToken& token)
  169. {
  170. if (token.is_character()) {
  171. ASSERT_NOT_REACHED();
  172. }
  173. if (token.is_comment()) {
  174. ASSERT_NOT_REACHED();
  175. }
  176. if (token.is_doctype()) {
  177. ASSERT_NOT_REACHED();
  178. }
  179. if (token.is_start_tag() && token.tag_name() == "html") {
  180. ASSERT_NOT_REACHED();
  181. }
  182. if (token.is_start_tag() && token.tag_name() == "body") {
  183. insert_html_element(token);
  184. m_frameset_ok = false;
  185. m_insertion_mode = InsertionMode::InBody;
  186. return;
  187. }
  188. if (token.is_start_tag() && token.tag_name() == "frameset") {
  189. ASSERT_NOT_REACHED();
  190. }
  191. {
  192. Vector<String> names = { "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "template", "title" };
  193. if (token.is_end_tag() && names.contains_slow(token.tag_name())) {
  194. ASSERT_NOT_REACHED();
  195. }
  196. }
  197. if (token.is_end_tag() && token.tag_name() == "template") {
  198. ASSERT_NOT_REACHED();
  199. }
  200. if (token.is_end_tag() && (token.tag_name() == "body" || token.tag_name() == "html" || token.tag_name() == "br")) {
  201. goto AnythingElse;
  202. }
  203. if ((token.is_start_tag() && token.tag_name() == "head") || token.is_end_tag()) {
  204. ASSERT_NOT_REACHED();
  205. }
  206. AnythingElse:
  207. HTMLToken fake_body_token;
  208. fake_body_token.m_type = HTMLToken::Type::StartTag;
  209. fake_body_token.m_tag.tag_name.append("body");
  210. insert_html_element(fake_body_token);
  211. m_insertion_mode = InsertionMode::InBody;
  212. // FIXME: Reprocess the current token in InBody!
  213. }
  214. void HTMLDocumentParser::generate_implied_end_tags()
  215. {
  216. Vector<String> names { "dd", "dt", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc" };
  217. while (names.contains_slow(current_node()->tag_name()))
  218. m_stack_of_open_elements.take_last();
  219. }
  220. bool HTMLDocumentParser::stack_of_open_elements_has_element_with_tag_name_in_scope(const FlyString& tag_name)
  221. {
  222. Vector<String> list { "applet", "caption", "html", "table", "td", "th", "marquee", "object", "template" };
  223. for (ssize_t i = m_stack_of_open_elements.size() - 1; i >= 0; --i) {
  224. auto& node = m_stack_of_open_elements.at(i);
  225. if (node.tag_name() == tag_name)
  226. return true;
  227. if (list.contains_slow(node.tag_name()))
  228. return false;
  229. }
  230. ASSERT_NOT_REACHED();
  231. }
  232. void HTMLDocumentParser::handle_after_body(HTMLToken& token)
  233. {
  234. if (token.is_end_tag() && token.tag_name() == "html") {
  235. if (m_parsing_fragment) {
  236. ASSERT_NOT_REACHED();
  237. }
  238. m_insertion_mode = InsertionMode::AfterAfterBody;
  239. return;
  240. }
  241. ASSERT_NOT_REACHED();
  242. }
  243. void HTMLDocumentParser::handle_after_after_body(HTMLToken& token)
  244. {
  245. if (token.is_end_of_file()) {
  246. dbg() << "Stop parsing! :^)";
  247. return;
  248. }
  249. ASSERT_NOT_REACHED();
  250. }
  251. void HTMLDocumentParser::handle_in_body(HTMLToken& token)
  252. {
  253. if (token.is_end_tag() && token.tag_name() == "body") {
  254. if (!stack_of_open_elements_has_element_with_tag_name_in_scope("body")) {
  255. ASSERT_NOT_REACHED();
  256. }
  257. // FIXME: Otherwise, if there is a node in the stack of open elements that is
  258. // not either a dd element, a dt element, an li element, an optgroup element,
  259. // an option element, a p element, an rb element, an rp element, an rt element,
  260. // an rtc element, a tbody element, a td element, a tfoot element, a th element,
  261. // a thead element, a tr element, the body element, or the html element,
  262. // then this is a parse error.
  263. m_insertion_mode = InsertionMode::AfterBody;
  264. return;
  265. }
  266. {
  267. Vector<String> names { "address", "article", "aside", "blockquote", "center", "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p", "section", "summary", "ul" };
  268. if (token.is_start_tag() && names.contains_slow(token.tag_name())) {
  269. // FIXME: If the stack of open elements has a p element in button scope, then close a p element.
  270. insert_html_element(token);
  271. return;
  272. }
  273. if (token.is_end_tag() && names.contains_slow(token.tag_name())) {
  274. // FIXME: If the stack of open elements has a p element in button scope, then close a p element.
  275. if (!stack_of_open_elements_has_element_with_tag_name_in_scope(token.tag_name())) {
  276. ASSERT_NOT_REACHED();
  277. }
  278. generate_implied_end_tags();
  279. if (current_node()->tag_name() != token.tag_name()) {
  280. ASSERT_NOT_REACHED();
  281. }
  282. m_stack_of_open_elements.take_last();
  283. return;
  284. }
  285. }
  286. ASSERT_NOT_REACHED();
  287. }
  288. void HTMLDocumentParser::handle_text(HTMLToken&)
  289. {
  290. ASSERT_NOT_REACHED();
  291. }
  292. const char* HTMLDocumentParser::insertion_mode_name() const
  293. {
  294. switch (m_insertion_mode) {
  295. #define __ENUMERATE_INSERTION_MODE(mode) \
  296. case InsertionMode::mode: \
  297. return #mode;
  298. ENUMERATE_INSERTION_MODES
  299. #undef __ENUMERATE_INSERTION_MODE
  300. }
  301. ASSERT_NOT_REACHED();
  302. }
  303. Document& HTMLDocumentParser::document()
  304. {
  305. return *m_document;
  306. }
  307. }