/*
 * Copyright (c) 2020, Andreas Kling
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

//#define PARSER_DEBUG

// NOTE(review): the include targets are missing in this copy of the file, and
// several template argument lists below (e.g. on Vector and RefPtr) are absent
// as well. It looks as if everything between angle brackets was stripped;
// restore these from the upstream file before building.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

// Reports a parse error by writing "Parse error! ", the enclosing function's
// pretty name and the current line number to dbg(). The macro only logs; it
// does not itself stop or alter parsing. The do/while(0) wrapper lets it be
// used like a single statement.
#define PARSE_ERROR()                                                         \
    do {                                                                      \
        dbg() << "Parse error! " << __PRETTY_FUNCTION__ << " @ " << __LINE__; \
    } while (0)

namespace Web {

// Public identifier prefixes that select QuirksMode::Yes: which_quirks_mode()
// returns QuirksMode::Yes when a DOCTYPE's public identifier starts with any
// of these strings, compared case-insensitively.
static Vector s_quirks_public_ids = {
    "+//Silmaril//dtd html Pro v0r11 19970101//",
    "-//AS//DTD HTML 3.0 asWedit + extensions//",
    "-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//",
    "-//IETF//DTD HTML 2.0 Level 1//",
    "-//IETF//DTD HTML 2.0 Level 2//",
    "-//IETF//DTD HTML 2.0 Strict Level 1//",
    "-//IETF//DTD HTML 2.0 Strict Level 2//",
    "-//IETF//DTD HTML 2.0 Strict//",
    "-//IETF//DTD HTML 2.0//",
    "-//IETF//DTD HTML 2.1E//",
    "-//IETF//DTD HTML 3.0//",
    "-//IETF//DTD HTML 3.2 Final//",
    "-//IETF//DTD HTML 3.2//",
    "-//IETF//DTD HTML 3//",
    "-//IETF//DTD HTML Level 0//",
    "-//IETF//DTD HTML Level 1//",
    "-//IETF//DTD HTML Level 2//",
    "-//IETF//DTD HTML Level 3//",
    "-//IETF//DTD HTML Strict Level 0//",
    "-//IETF//DTD HTML Strict Level 1//",
    "-//IETF//DTD HTML Strict Level 2//",
    "-//IETF//DTD HTML Strict Level 3//",
    "-//IETF//DTD HTML Strict//",
    "-//IETF//DTD HTML//",
    "-//Metrius//DTD Metrius Presentational//",
    "-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//",
    "-//Microsoft//DTD Internet Explorer 2.0 HTML//",
    "-//Microsoft//DTD Internet Explorer 2.0 Tables//",
    "-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//",
    "-//Microsoft//DTD Internet Explorer 3.0 HTML//",
    "-//Microsoft//DTD Internet Explorer 3.0 Tables//",
    "-//Netscape Comm. Corp.//DTD HTML//",
    "-//Netscape Comm. Corp.//DTD Strict HTML//",
    "-//O'Reilly and Associates//DTD HTML 2.0//",
    "-//O'Reilly and Associates//DTD HTML Extended 1.0//",
    "-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//",
    "-//SQ//DTD HTML 2.0 HoTMetaL + extensions//",
    "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::extensions to HTML 4.0//",
    "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::extensions to HTML 4.0//",
    "-//Spyglass//DTD HTML 2.0 Extended//",
    "-//Sun Microsystems Corp.//DTD HotJava HTML//",
    "-//Sun Microsystems Corp.//DTD HotJava Strict HTML//",
    "-//W3C//DTD HTML 3 1995-03-24//",
    "-//W3C//DTD HTML 3.2 Draft//",
    "-//W3C//DTD HTML 3.2 Final//",
    "-//W3C//DTD HTML 3.2//",
    "-//W3C//DTD HTML 3.2S Draft//",
    "-//W3C//DTD HTML 4.0 Frameset//",
    "-//W3C//DTD HTML 4.0 Transitional//",
    "-//W3C//DTD HTML Experimental 19960712//",
    "-//W3C//DTD HTML Experimental 970421//",
    "-//W3C//DTD W3 HTML//",
    "-//W3O//DTD W3 HTML 3.0//",
    "-//WebTechs//DTD Mozilla HTML 2.0//",
    "-//WebTechs//DTD Mozilla HTML//"
};

// Convenience entry point: builds an HTMLDocumentParser over `data` using
// `encoding`, runs it with `url`, and returns the parser's document.
RefPtr parse_html_document(const StringView& data, const URL& url, const String& encoding)
{
    HTMLDocumentParser parser(data, encoding);
    parser.run(url);
    return parser.document();
}

// Creates a parser whose tokenizer reads `input` with `encoding` and which
// parses into a newly allocated Document.
HTMLDocumentParser::HTMLDocumentParser(const StringView& input, const String& encoding)
    : m_tokenizer(input, encoding)
{
    m_document = adopt(*new Document);
}

// Creates a parser whose tokenizer reads `input` with `encoding` and which
// parses into the caller-supplied `existing_document` instead of a new one.
HTMLDocumentParser::HTMLDocumentParser(const StringView& input, const String& encoding, Document& existing_document)
    : m_tokenizer(input, encoding)
    , m_document(existing_document)
{
}

// Nothing to release explicitly here; the body is intentionally empty.
HTMLDocumentParser::~HTMLDocumentParser()
{
}

// Runs the parser to completion.
//
// Records `url` and the tokenizer's source on the document, then pulls tokens
// from the tokenizer one at a time and hands each to the handler for the
// current insertion mode. The loop ends when the tokenizer produces no more
// tokens or when m_stop_parsing has been set. Afterwards pending character
// insertions are flushed, the scripts queued for "when parsing has finished"
// are executed, a "DOMContentLoaded" event is dispatched on the document, and
// finally the scripts queued for "as soon as possible" are executed.
void HTMLDocumentParser::run(const URL& url)
{
    m_document->set_url(url);
    m_document->set_source(m_tokenizer.source());

    for (;;) {
        auto optional_token = m_tokenizer.next_token();
        // An empty optional means the tokenizer has nothing more to give us.
        if (!optional_token.has_value())
            break;
        auto& token = optional_token.value();

#ifdef PARSER_DEBUG
        dbg() << "[" << insertion_mode_name() << "] " << token.to_string();
#endif
        process_using_the_rules_for(m_insertion_mode, token);

        // Checked after every token so that parsing stops as soon as the flag is set.
        if (m_stop_parsing) {
#ifdef PARSER_DEBUG
            dbg() << "Stop parsing" << (m_parsing_fragment ? " fragment" : "") << "! :^)";
#endif
            break;
        }
    }

    flush_character_insertions();

    // "The end"

    auto scripts_to_execute_when_parsing_has_finished = m_document->take_scripts_to_execute_when_parsing_has_finished({});
    for (auto& script : scripts_to_execute_when_parsing_has_finished) {
        script.execute_script();
    }

    m_document->dispatch_event(Event::create("DOMContentLoaded"));

    auto scripts_to_execute_as_soon_as_possible = m_document->take_scripts_to_execute_as_soon_as_possible({});
    for (auto& script : scripts_to_execute_as_soon_as_possible) {
        script.execute_script();
    }
}

// Dispatches `token` to the handler function for the given insertion `mode`.
// Note that `mode` is passed explicitly rather than read from
// m_insertion_mode, so handlers can reprocess a token under another mode's
// rules. A mode without a case below trips ASSERT_NOT_REACHED().
void HTMLDocumentParser::process_using_the_rules_for(InsertionMode mode, HTMLToken& token)
{
    switch (mode) {
    case InsertionMode::Initial:
        handle_initial(token);
        break;
    case InsertionMode::BeforeHTML:
        handle_before_html(token);
        break;
    case InsertionMode::BeforeHead:
        handle_before_head(token);
        break;
    case InsertionMode::InHead:
        handle_in_head(token);
        break;
    case InsertionMode::InHeadNoscript:
        handle_in_head_noscript(token);
        break;
    case InsertionMode::AfterHead:
        handle_after_head(token);
        break;
    case InsertionMode::InBody:
        handle_in_body(token);
        break;
    case InsertionMode::AfterBody:
        handle_after_body(token);
        break;
    case InsertionMode::AfterAfterBody:
        handle_after_after_body(token);
        break;
    case InsertionMode::Text:
        handle_text(token);
        break;
    case InsertionMode::InTable:
        handle_in_table(token);
        break;
    case InsertionMode::InTableBody:
        handle_in_table_body(token);
        break;
    case InsertionMode::InRow:
        handle_in_row(token);
        break;
    case InsertionMode::InCell:
        handle_in_cell(token);
        break;
    case InsertionMode::InTableText:
        handle_in_table_text(token);
        break;
    case InsertionMode::InSelectInTable:
        handle_in_select_in_table(token);
        break;
    case InsertionMode::InSelect:
        handle_in_select(token);
        break;
    case InsertionMode::InCaption:
        handle_in_caption(token);
        break;
    case InsertionMode::InColumnGroup:
        handle_in_column_group(token);
        break;
    case InsertionMode::InTemplate:
        handle_in_template(token);
        break;
    case InsertionMode::InFrameset:
        handle_in_frameset(token);
        break;
    case InsertionMode::AfterFrameset:
        handle_after_frameset(token);
        break;
    case InsertionMode::AfterAfterFrameset:
        handle_after_after_frameset(token);
        break;
    default:
        ASSERT_NOT_REACHED();
    }
}

// Decides which QuirksMode a DOCTYPE token selects.
//
// Returns QuirksMode::Yes when the token's force_quirks flag is set, when its
// name is not "html", when its public or system identifier equals one of the
// specific strings checked below (ignoring case), when its public identifier
// starts with an entry of s_quirks_public_ids, or when the system identifier
// is missing and the public identifier starts with one of the two HTML 4.01
// Frameset/Transitional prefixes.
// Returns QuirksMode::Limited for the XHTML 1.0 Frameset/Transitional
// prefixes, and for the HTML 4.01 Frameset/Transitional prefixes when the
// system identifier is present.
// Returns QuirksMode::No otherwise. All prefix checks are case-insensitive.
QuirksMode HTMLDocumentParser::which_quirks_mode(const HTMLToken& doctype_token) const
{
    if (doctype_token.m_doctype.force_quirks)
        return QuirksMode::Yes;

    // NOTE: The tokenizer puts the name into lower case for us.
    if (doctype_token.m_doctype.name.to_string() != "html")
        return QuirksMode::Yes;

    auto public_identifier = doctype_token.m_doctype.public_identifier.to_string();
    auto system_identifier = doctype_token.m_doctype.system_identifier.to_string();

    // Exact (case-insensitive) identifier matches.
    if (public_identifier.equals_ignoring_case("-//W3O//DTD W3 HTML Strict 3.0//EN//"))
        return QuirksMode::Yes;

    if (public_identifier.equals_ignoring_case("-/W3C/DTD HTML 4.0 Transitional/EN"))
        return QuirksMode::Yes;

    if (public_identifier.equals_ignoring_case("HTML"))
        return QuirksMode::Yes;

    if (system_identifier.equals_ignoring_case("http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"))
        return QuirksMode::Yes;

    // Prefix matches against the table of known public identifiers.
    for (auto& public_id : s_quirks_public_ids) {
        if (public_identifier.starts_with(public_id, CaseSensitivity::CaseInsensitive))
            return QuirksMode::Yes;
    }

    // The HTML 4.01 Frameset/Transitional prefixes give QuirksMode::Yes only
    // when no system identifier was supplied; with one they give
    // QuirksMode::Limited (see the last block below).
    if (doctype_token.m_doctype.missing_system_identifier) {
        if (public_identifier.starts_with("-//W3C//DTD HTML 4.01 Frameset//", CaseSensitivity::CaseInsensitive))
            return QuirksMode::Yes;

        if (public_identifier.starts_with("-//W3C//DTD HTML 4.01 Transitional//", CaseSensitivity::CaseInsensitive))
            return QuirksMode::Yes;
    }

    if (public_identifier.starts_with("-//W3C//DTD XHTML 1.0 Frameset//", CaseSensitivity::CaseInsensitive))
        return QuirksMode::Limited;

    if (public_identifier.starts_with("-//W3C//DTD XHTML 1.0 Transitional//", CaseSensitivity::CaseInsensitive))
        return QuirksMode::Limited;

    if (!doctype_token.m_doctype.missing_system_identifier) {
        if (public_identifier.starts_with("-//W3C//DTD HTML 4.01 Frameset//", CaseSensitivity::CaseInsensitive))
            return QuirksMode::Limited;

        if (public_identifier.starts_with("-//W3C//DTD HTML 4.01 Transitional//", CaseSensitivity::CaseInsensitive))
            return QuirksMode::Limited;
    }

    return QuirksMode::No;
}

// Handler for InsertionMode::Initial.
//
// - Whitespace character tokens are ignored.
// - Comment tokens are appended to the document as Comment nodes.
// - A DOCTYPE token produces a DocumentType node (name, public id, system id)
//   appended to the document, sets the document's quirks mode from
//   which_quirks_mode(), and switches to InsertionMode::BeforeHTML.
// - Any other token is reported as a parse error, forces QuirksMode::Yes,
//   switches to InsertionMode::BeforeHTML and is reprocessed under that mode.
void HTMLDocumentParser::handle_initial(HTMLToken& token)
{
    if (token.is_character() && token.is_parser_whitespace()) {
        return;
    }

    if (token.is_comment()) {
        auto comment = adopt(*new Comment(document(), token.m_comment_or_character.data.to_string()));
        document().append_child(move(comment));
        return;
    }

    if (token.is_doctype()) {
        auto doctype = adopt(*new DocumentType(document()));
        doctype->set_name(token.m_doctype.name.to_string());
        doctype->set_public_id(token.m_doctype.public_identifier.to_string());
        doctype->set_system_id(token.m_doctype.system_identifier.to_string());
        document().append_child(move(doctype));
        document().set_quirks_mode(which_quirks_mode(token));
        m_insertion_mode = InsertionMode::BeforeHTML;
        return;
    }

    // No DOCTYPE seen before the first significant token.
    PARSE_ERROR();
    document().set_quirks_mode(QuirksMode::Yes);
    m_insertion_mode = InsertionMode::BeforeHTML;
    process_using_the_rules_for(InsertionMode::BeforeHTML, token);
}

// Handler for InsertionMode::BeforeHTML.
//
// - DOCTYPE tokens are a parse error and are ignored.
// - Comment tokens are appended to the document as Comment nodes.
// - Whitespace character tokens are ignored.
// - An <html> start tag creates an element for the token, appends it to the
//   document, pushes it onto the stack of open elements and switches to
//   InsertionMode::BeforeHead.
// - End tags named head, body, html or br take the "anything else" path.
// - Any other end tag is a parse error and is ignored.
// - Anything else: an html element is created without a token, appended and
//   pushed, the mode becomes InsertionMode::BeforeHead, and the current token
//   is reprocessed under that mode.
void HTMLDocumentParser::handle_before_html(HTMLToken& token)
{
    if (token.is_doctype()) {
        PARSE_ERROR();
        return;
    }

    if (token.is_comment()) {
        auto comment = adopt(*new Comment(document(), token.m_comment_or_character.data.to_string()));
        document().append_child(move(comment));
        return;
    }

    if (token.is_character() && token.is_parser_whitespace()) {
        return;
    }

    if (token.is_start_tag() && token.tag_name() == HTML::TagNames::html) {
        auto element = create_element_for(token);
        document().append_child(element);
        m_stack_of_open_elements.push(move(element));
        m_insertion_mode = InsertionMode::BeforeHead;
        return;
    }

    // These four end tags are not ignored; they are handled exactly like any
    // non-matching token, so jump over the generic end-tag check below.
    if (token.is_end_tag() && token.tag_name().is_one_of(HTML::TagNames::head, HTML::TagNames::body, HTML::TagNames::html, HTML::TagNames::br)) {
        goto AnythingElse;
    }

    if (token.is_end_tag()) {
        PARSE_ERROR();
        return;
    }

AnythingElse:
    auto element = create_element(document(), HTML::TagNames::html);
    document().append_child(element);
    m_stack_of_open_elements.push(element);
    // FIXME: If the Document is being loaded as part of navigation of a browsing context, then: run the application cache selection algorithm with no manifest, passing it the Document object.
    m_insertion_mode = InsertionMode::BeforeHead;
    process_using_the_rules_for(InsertionMode::BeforeHead, token);
    return;
}

// Returns the stack of open elements' current node.
Element& HTMLDocumentParser::current_node()
{
    return m_stack_of_open_elements.current_node();
}

// Returns the element stored at index `size() - 2` of the stack of open
// elements, i.e. the entry just before the last one.
// NOTE(review): with fewer than two open elements that index does not refer
// to a valid entry; callers presumably guarantee at least two - confirm.
Element& HTMLDocumentParser::node_before_current_node()
{
    return m_stack_of_open_elements.elements().at(m_stack_of_open_elements.elements().size() - 2);
}

HTMLDocumentParser::AdjustedInsertionLocation HTMLDocumentParser::find_appropriate_place_for_inserting_node()
{
    auto& target = current_node();
    if (m_foster_parenting && target.local_name().is_one_of(HTML::TagNames::table, HTML::TagNames::tbody, HTML::TagNames::tfoot, HTML::TagNames::thead, HTML::TagNames::tr)) {
        // FIXME: There's a bunch of steps for