/* * Copyright (c) 2020-2024, Andreas Kling * Copyright (c) 2021, Luke Wilde * Copyright (c) 2023-2024, Shannon Booth * * SPDX-License-Identifier: BSD-2-Clause */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace Web::HTML { JS_DEFINE_ALLOCATOR(HTMLParser); static inline void log_parse_error(SourceLocation const& location = SourceLocation::current()) { dbgln_if(HTML_PARSER_DEBUG, "Parse error! {}", location); } static Vector const s_quirks_public_ids = { "+//Silmaril//dtd html Pro v0r11 19970101//"sv, "-//AS//DTD HTML 3.0 asWedit + extensions//"sv, "-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//"sv, "-//IETF//DTD HTML 2.0 Level 1//"sv, "-//IETF//DTD HTML 2.0 Level 2//"sv, "-//IETF//DTD HTML 2.0 Strict Level 1//"sv, "-//IETF//DTD HTML 2.0 Strict Level 2//"sv, "-//IETF//DTD HTML 2.0 Strict//"sv, "-//IETF//DTD HTML 2.0//"sv, "-//IETF//DTD HTML 2.1E//"sv, "-//IETF//DTD HTML 3.0//"sv, "-//IETF//DTD HTML 3.2 Final//"sv, "-//IETF//DTD HTML 3.2//"sv, "-//IETF//DTD HTML 3//"sv, "-//IETF//DTD HTML Level 0//"sv, "-//IETF//DTD HTML Level 1//"sv, "-//IETF//DTD HTML Level 2//"sv, "-//IETF//DTD HTML Level 3//"sv, "-//IETF//DTD HTML Strict Level 0//"sv, "-//IETF//DTD HTML Strict Level 1//"sv, "-//IETF//DTD HTML Strict Level 2//"sv, "-//IETF//DTD HTML Strict Level 3//"sv, "-//IETF//DTD HTML Strict//"sv, "-//IETF//DTD HTML//"sv, "-//Metrius//DTD Metrius Presentational//"sv, "-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//"sv, "-//Microsoft//DTD Internet Explorer 2.0 HTML//"sv, "-//Microsoft//DTD Internet Explorer 2.0 Tables//"sv, "-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//"sv, "-//Microsoft//DTD Internet Explorer 3.0 HTML//"sv, "-//Microsoft//DTD Internet Explorer 3.0 Tables//"sv, "-//Netscape Comm. Corp.//DTD HTML//"sv, "-//Netscape Comm. Corp.//DTD Strict HTML//"sv, "-//O'Reilly and Associates//DTD HTML 2.0//"sv, "-//O'Reilly and Associates//DTD HTML Extended 1.0//"sv, "-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//"sv, "-//SQ//DTD HTML 2.0 HoTMetaL + extensions//"sv, "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::extensions to HTML 4.0//"sv, "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::extensions to HTML 4.0//"sv, "-//Spyglass//DTD HTML 2.0 Extended//"sv, "-//Sun Microsystems Corp.//DTD HotJava HTML//"sv, "-//Sun Microsystems Corp.//DTD HotJava Strict HTML//"sv, "-//W3C//DTD HTML 3 1995-03-24//"sv, "-//W3C//DTD HTML 3.2 Draft//"sv, "-//W3C//DTD HTML 3.2 Final//"sv, "-//W3C//DTD HTML 3.2//"sv, "-//W3C//DTD HTML 3.2S Draft//"sv, "-//W3C//DTD HTML 4.0 Frameset//"sv, "-//W3C//DTD HTML 4.0 Transitional//"sv, "-//W3C//DTD HTML Experimental 19960712//"sv, "-//W3C//DTD HTML Experimental 970421//"sv, "-//W3C//DTD W3 HTML//"sv, "-//W3O//DTD W3 HTML 3.0//"sv, "-//WebTechs//DTD Mozilla HTML 2.0//"sv, "-//WebTechs//DTD Mozilla HTML//"sv, }; // https://html.spec.whatwg.org/multipage/parsing.html#mathml-text-integration-point static bool is_mathml_text_integration_point(DOM::Element const& element) { // A node is a MathML text integration point if it is one of the following elements: // - A MathML mi element // - A MathML mo element // - A MathML mn element // - A MathML ms element // - A MathML mtext element return element.local_name().is_one_of(MathML::TagNames::mi, MathML::TagNames::mo, MathML::TagNames::mn, MathML::TagNames::ms, MathML::TagNames::mtext); } // https://html.spec.whatwg.org/multipage/parsing.html#html-integration-point static bool is_html_integration_point(DOM::Element const& element) { // A node is an HTML integration point if it is one of the following elements: // - A MathML annotation-xml element whose start tag token had an attribute with the name "encoding" whose value was an ASCII case-insensitive match for the string "text/html" // - A MathML annotation-xml element whose start tag token had an attribute with the name "encoding" whose value was an ASCII case-insensitive match for the string "application/xhtml+xml" if (element.namespace_uri() == Namespace::MathML && element.local_name() == MathML::TagNames::annotation_xml) { auto encoding = element.attribute("encoding"_fly_string); if (encoding.has_value() && (encoding->equals_ignoring_ascii_case("text/html"sv) || encoding->equals_ignoring_ascii_case("application/xhtml+xml"sv))) return true; } // - An SVG foreignObject element // - An SVG desc element // - An SVG title element if (element.namespace_uri() == Namespace::SVG && element.local_name().is_one_of(SVG::TagNames::foreignObject, SVG::TagNames::desc, SVG::TagNames::title)) { return true; } return false; } HTMLParser::HTMLParser(DOM::Document& document, StringView input, StringView encoding) : m_tokenizer(input, encoding) , m_scripting_enabled(document.is_scripting_enabled()) , m_document(document) { m_tokenizer.set_parser({}, *this); m_document->set_parser({}, *this); auto standardized_encoding = TextCodec::get_standardized_encoding(encoding); VERIFY(standardized_encoding.has_value()); m_document->set_encoding(MUST(String::from_utf8(standardized_encoding.value()))); } HTMLParser::HTMLParser(DOM::Document& document) : m_scripting_enabled(document.is_scripting_enabled()) , m_document(document) { m_document->set_parser({}, *this); m_tokenizer.set_parser({}, *this); } HTMLParser::~HTMLParser() { } void HTMLParser::visit_edges(Cell::Visitor& visitor) { Base::visit_edges(visitor); visitor.visit(m_document); visitor.visit(m_head_element); visitor.visit(m_form_element); visitor.visit(m_context_element); visitor.visit(m_character_insertion_node); m_stack_of_open_elements.visit_edges(visitor); m_list_of_active_formatting_elements.visit_edges(visitor); } void HTMLParser::run(HTMLTokenizer::StopAtInsertionPoint stop_at_insertion_point) { for (;;) { // FIXME: Find a better way to say that we come from Document::close() and want to process EOF. if (!m_tokenizer.is_eof_inserted() && m_tokenizer.is_insertion_point_reached()) break; auto optional_token = m_tokenizer.next_token(stop_at_insertion_point); if (!optional_token.has_value()) break; auto& token = optional_token.value(); dbgln_if(HTML_PARSER_DEBUG, "[{}] {}", insertion_mode_name(), token.to_string()); // https://html.spec.whatwg.org/multipage/parsing.html#tree-construction-dispatcher // As each token is emitted from the tokenizer, the user agent must follow the appropriate steps from the following list, known as the tree construction dispatcher: if (m_stack_of_open_elements.is_empty() || adjusted_current_node()->namespace_uri() == Namespace::HTML || (is_mathml_text_integration_point(*adjusted_current_node()) && token.is_start_tag() && token.tag_name() != MathML::TagNames::mglyph && token.tag_name() != MathML::TagNames::malignmark) || (is_mathml_text_integration_point(*adjusted_current_node()) && token.is_character()) || (adjusted_current_node()->namespace_uri() == Namespace::MathML && adjusted_current_node()->local_name() == MathML::TagNames::annotation_xml && token.is_start_tag() && token.tag_name() == SVG::TagNames::svg) || (is_html_integration_point(*adjusted_current_node()) && (token.is_start_tag() || token.is_character())) || token.is_end_of_file()) { // -> If the stack of open elements is empty // -> If the adjusted current node is an element in the HTML namespace // -> If the adjusted current node is a MathML text integration point and the token is a start tag whose tag name is neither "mglyph" nor "malignmark" // -> If the adjusted current node is a MathML text integration point and the token is a character token // -> If the adjusted current node is a MathML annotation-xml element and the token is a start tag whose tag name is "svg" // -> If the adjusted current node is an HTML integration point and the token is a start tag // -> If the adjusted current node is an HTML integration point and the token is a character token // -> If the token is an end-of-file token // Process the token according to the rules given in the section corresponding to the current insertion mode in HTML content. process_using_the_rules_for(m_insertion_mode, token); } else { // -> Otherwise // Process the token according to the rules given in the section for parsing tokens in foreign content. process_using_the_rules_for_foreign_content(token); } if (m_stop_parsing) { dbgln_if(HTML_PARSER_DEBUG, "Stop parsing{}! :^)", m_parsing_fragment ? " fragment" : ""); break; } } flush_character_insertions(); } void HTMLParser::run(const URL::URL& url, HTMLTokenizer::StopAtInsertionPoint stop_at_insertion_point) { m_document->set_url(url); m_document->set_source(MUST(String::from_byte_string(m_tokenizer.source()))); run(stop_at_insertion_point); the_end(*m_document, this); m_document->detach_parser({}); } // https://html.spec.whatwg.org/multipage/parsing.html#the-end void HTMLParser::the_end(JS::NonnullGCPtr document, JS::GCPtr parser) { auto& heap = document->heap(); // Once the user agent stops parsing the document, the user agent must run the following steps: // NOTE: This is a static method because the spec sometimes wants us to "act as if the user agent had stopped // parsing document" which means running these steps without an HTML Parser. That makes it awkward to call, // but it's preferable to duplicating so much code. if (parser) VERIFY(document == parser->m_document); // The entirety of "the end" should be a no-op for HTML fragment parsers, because: // - the temporary document is not accessible, making the DOMContentLoaded event and "ready for post load tasks" do // nothing, making the parser not re-entrant from document.{open,write,close} and document.readyState inaccessible // - there is no Window associated with it and no associated browsing context with the temporary document (meaning // the Window load event is skipped and making the load timing info inaccessible) // - scripts are not able to be prepared, meaning the script queues are empty. // However, the unconditional "spin the event loop" invocations cause two issues: // - Microtask timing is changed, as "spin the event loop" performs an unconditional microtask checkpoint, causing // things to happen out of order. For example, YouTube sets the innerHTML of a