/* * Copyright (c) 2020-2022, Andreas Kling * Copyright (c) 2021, Luke Wilde * Copyright (c) 2023, Shannon Booth * * SPDX-License-Identifier: BSD-2-Clause */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace Web::HTML { static inline void log_parse_error(SourceLocation const& location = SourceLocation::current()) { dbgln_if(HTML_PARSER_DEBUG, "Parse error! {}", location); } static Vector s_quirks_public_ids = { "+//Silmaril//dtd html Pro v0r11 19970101//", "-//AS//DTD HTML 3.0 asWedit + extensions//", "-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//", "-//IETF//DTD HTML 2.0 Level 1//", "-//IETF//DTD HTML 2.0 Level 2//", "-//IETF//DTD HTML 2.0 Strict Level 1//", "-//IETF//DTD HTML 2.0 Strict Level 2//", "-//IETF//DTD HTML 2.0 Strict//", "-//IETF//DTD HTML 2.0//", "-//IETF//DTD HTML 2.1E//", "-//IETF//DTD HTML 3.0//", "-//IETF//DTD HTML 3.2 Final//", "-//IETF//DTD HTML 3.2//", "-//IETF//DTD HTML 3//", "-//IETF//DTD HTML Level 0//", "-//IETF//DTD HTML Level 1//", "-//IETF//DTD HTML Level 2//", "-//IETF//DTD HTML Level 3//", "-//IETF//DTD HTML Strict Level 0//", "-//IETF//DTD HTML Strict Level 1//", "-//IETF//DTD HTML Strict Level 2//", "-//IETF//DTD HTML Strict Level 3//", "-//IETF//DTD HTML Strict//", "-//IETF//DTD HTML//", "-//Metrius//DTD Metrius Presentational//", "-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//", "-//Microsoft//DTD Internet Explorer 2.0 HTML//", "-//Microsoft//DTD Internet Explorer 2.0 Tables//", "-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//", "-//Microsoft//DTD Internet Explorer 3.0 HTML//", "-//Microsoft//DTD Internet Explorer 3.0 Tables//", "-//Netscape Comm. Corp.//DTD HTML//", "-//Netscape Comm. Corp.//DTD Strict HTML//", "-//O'Reilly and Associates//DTD HTML 2.0//", "-//O'Reilly and Associates//DTD HTML Extended 1.0//", "-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//", "-//SQ//DTD HTML 2.0 HoTMetaL + extensions//", "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::extensions to HTML 4.0//", "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::extensions to HTML 4.0//", "-//Spyglass//DTD HTML 2.0 Extended//", "-//Sun Microsystems Corp.//DTD HotJava HTML//", "-//Sun Microsystems Corp.//DTD HotJava Strict HTML//", "-//W3C//DTD HTML 3 1995-03-24//", "-//W3C//DTD HTML 3.2 Draft//", "-//W3C//DTD HTML 3.2 Final//", "-//W3C//DTD HTML 3.2//", "-//W3C//DTD HTML 3.2S Draft//", "-//W3C//DTD HTML 4.0 Frameset//", "-//W3C//DTD HTML 4.0 Transitional//", "-//W3C//DTD HTML Experimental 19960712//", "-//W3C//DTD HTML Experimental 970421//", "-//W3C//DTD W3 HTML//", "-//W3O//DTD W3 HTML 3.0//", "-//WebTechs//DTD Mozilla HTML 2.0//", "-//WebTechs//DTD Mozilla HTML//" }; // https://html.spec.whatwg.org/multipage/parsing.html#mathml-text-integration-point static bool is_mathml_text_integration_point(DOM::Element const&) { // FIXME: Implement. return false; } // https://html.spec.whatwg.org/multipage/parsing.html#html-integration-point static bool is_html_integration_point(DOM::Element const& element) { // A node is an HTML integration point if it is one of the following elements: // FIXME: A MathML annotation-xml element whose start tag token had an attribute with the name "encoding" whose value was an ASCII case-insensitive match for the string "text/html" // FIXME: A MathML annotation-xml element whose start tag token had an attribute with the name "encoding" whose value was an ASCII case-insensitive match for the string "application/xhtml+xml" // An SVG foreignObject element // An SVG desc element // An SVG title element if (element.tag_name().is_one_of(SVG::TagNames::foreignObject, SVG::TagNames::desc, SVG::TagNames::title)) return true; return false; } HTMLParser::HTMLParser(DOM::Document& document, StringView input, DeprecatedString const& encoding) : m_tokenizer(input, encoding) , m_scripting_enabled(document.is_scripting_enabled()) , m_document(JS::make_handle(document)) { m_tokenizer.set_parser({}, *this); m_document->set_parser({}, *this); auto standardized_encoding = TextCodec::get_standardized_encoding(encoding); VERIFY(standardized_encoding.has_value()); m_document->set_encoding(standardized_encoding.value()); } HTMLParser::HTMLParser(DOM::Document& document) : m_scripting_enabled(document.is_scripting_enabled()) , m_document(JS::make_handle(document)) { m_document->set_parser({}, *this); m_tokenizer.set_parser({}, *this); } HTMLParser::~HTMLParser() { } void HTMLParser::visit_edges(Cell::Visitor& visitor) { Base::visit_edges(visitor); visitor.visit(m_document); visitor.visit(m_head_element); visitor.visit(m_form_element); visitor.visit(m_context_element); visitor.visit(m_character_insertion_node); m_stack_of_open_elements.visit_edges(visitor); m_list_of_active_formatting_elements.visit_edges(visitor); } void HTMLParser::run() { for (;;) { // FIXME: Find a better way to say that we come from Document::close() and want to process EOF. if (!m_tokenizer.is_eof_inserted() && m_tokenizer.is_insertion_point_reached()) return; auto optional_token = m_tokenizer.next_token(); if (!optional_token.has_value()) break; auto& token = optional_token.value(); dbgln_if(HTML_PARSER_DEBUG, "[{}] {}", insertion_mode_name(), token.to_deprecated_string()); // https://html.spec.whatwg.org/multipage/parsing.html#tree-construction-dispatcher // As each token is emitted from the tokenizer, the user agent must follow the appropriate steps from the following list, known as the tree construction dispatcher: if (m_stack_of_open_elements.is_empty() || adjusted_current_node().namespace_() == Namespace::HTML || (is_html_integration_point(adjusted_current_node()) && (token.is_start_tag() || token.is_character())) || token.is_end_of_file()) { // -> If the stack of open elements is empty // -> If the adjusted current node is an element in the HTML namespace // FIXME: -> If the adjusted current node is a MathML text integration point and the token is a start tag whose tag name is neither "mglyph" nor "malignmark" // FIXME: -> If the adjusted current node is a MathML text integration point and the token is a character token // FIXME: -> If the adjusted current node is a MathML annotation-xml element and the token is a start tag whose tag name is "svg" // -> If the adjusted current node is an HTML integration point and the token is a start tag // -> If the adjusted current node is an HTML integration point and the token is a character token // -> If the token is an end-of-file token // Process the token according to the rules given in the section corresponding to the current insertion mode in HTML content. process_using_the_rules_for(m_insertion_mode, token); } else { // -> Otherwise // Process the token according to the rules given in the section for parsing tokens in foreign content. process_using_the_rules_for_foreign_content(token); } if (m_stop_parsing) { dbgln_if(HTML_PARSER_DEBUG, "Stop parsing{}! :^)", m_parsing_fragment ? " fragment" : ""); break; } } flush_character_insertions(); } void HTMLParser::run(const AK::URL& url) { m_document->set_url(url); m_document->set_source(m_tokenizer.source()); run(); the_end(); m_document->detach_parser({}); } // https://html.spec.whatwg.org/multipage/parsing.html#the-end void HTMLParser::the_end() { // Once the user agent stops parsing the document, the user agent must run the following steps: // The entirety of "the end" should be a no-op for HTML fragment parsers, because: // - the temporary document is not accessible, making the DOMContentLoaded event and "ready for post load tasks" do // nothing, making the parser not re-entrant from document.{open,write,close} and document.readyState inaccessible // - there is no Window associated with it and no associated browsing context with the temporary document (meaning // the Window load event is skipped and making the load timing info inaccessible) // - scripts are not able to be prepared, meaning the script queues are empty. // However, the unconditional "spin the event loop" invocations cause two issues: // - Microtask timing is changed, as "spin the event loop" performs an unconditional microtask checkpoint, causing // things to happen out of order. For example, YouTube sets the innerHTML of a