/*
 * Copyright (c) 2020, Andreas Kling
 * Copyright (c) 2022, Linus Groh
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#pragma once

// NOTE(review): in this copy of the file the include targets below, and the
// template argument lists further down (on Optional, Badge, GC::Ptr, Vector and
// Queue), are missing. They look like they were stripped by whatever produced
// this copy -- restore them from the upstream file before building.
#include
#include
#include
#include
#include
#include
#include
#include

namespace Web::HTML {

// X-macro listing the name of every tokenizer state exactly once.
// A user defines __ENUMERATE_TOKENIZER_STATE(state), expands
// ENUMERATE_TOKENIZER_STATES, and then #undefs the helper again. Within this
// file that is done twice: to generate the enumerators of HTMLTokenizer::State
// and to generate the cases of HTMLTokenizer::state_name(). Adding a state here
// therefore updates both in lockstep.
// The names appear to mirror the tokenizer states of the HTML Standard
// (TODO confirm against the spec when adding or renaming entries).
// Do not put `//` comments between the entries: each line ends in a line
// continuation, which would be swallowed by the comment.
#define ENUMERATE_TOKENIZER_STATES \
    __ENUMERATE_TOKENIZER_STATE(Data) \
    __ENUMERATE_TOKENIZER_STATE(RCDATA) \
    __ENUMERATE_TOKENIZER_STATE(RAWTEXT) \
    __ENUMERATE_TOKENIZER_STATE(ScriptData) \
    __ENUMERATE_TOKENIZER_STATE(PLAINTEXT) \
    __ENUMERATE_TOKENIZER_STATE(TagOpen) \
    __ENUMERATE_TOKENIZER_STATE(EndTagOpen) \
    __ENUMERATE_TOKENIZER_STATE(TagName) \
    __ENUMERATE_TOKENIZER_STATE(RCDATALessThanSign) \
    __ENUMERATE_TOKENIZER_STATE(RCDATAEndTagOpen) \
    __ENUMERATE_TOKENIZER_STATE(RCDATAEndTagName) \
    __ENUMERATE_TOKENIZER_STATE(RAWTEXTLessThanSign) \
    __ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagOpen) \
    __ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagName) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataLessThanSign) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagOpen) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagName) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStart) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStartDash) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscaped) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDash) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDashDash) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedLessThanSign) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagOpen) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagName) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeStart) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscaped) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDash) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDashDash) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedLessThanSign) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeEnd) \
    __ENUMERATE_TOKENIZER_STATE(BeforeAttributeName) \
    __ENUMERATE_TOKENIZER_STATE(AttributeName) \
    __ENUMERATE_TOKENIZER_STATE(AfterAttributeName) \
    __ENUMERATE_TOKENIZER_STATE(BeforeAttributeValue) \
    __ENUMERATE_TOKENIZER_STATE(AttributeValueDoubleQuoted) \
    __ENUMERATE_TOKENIZER_STATE(AttributeValueSingleQuoted) \
    __ENUMERATE_TOKENIZER_STATE(AttributeValueUnquoted) \
    __ENUMERATE_TOKENIZER_STATE(AfterAttributeValueQuoted) \
    __ENUMERATE_TOKENIZER_STATE(SelfClosingStartTag) \
    __ENUMERATE_TOKENIZER_STATE(BogusComment) \
    __ENUMERATE_TOKENIZER_STATE(MarkupDeclarationOpen) \
    __ENUMERATE_TOKENIZER_STATE(CommentStart) \
    __ENUMERATE_TOKENIZER_STATE(CommentStartDash) \
    __ENUMERATE_TOKENIZER_STATE(Comment) \
    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSign) \
    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBang) \
    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDash) \
    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDashDash) \
    __ENUMERATE_TOKENIZER_STATE(CommentEndDash) \
    __ENUMERATE_TOKENIZER_STATE(CommentEnd) \
    __ENUMERATE_TOKENIZER_STATE(CommentEndBang) \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPE) \
    __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEName) \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPEName) \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEName) \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicKeyword) \
    __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEPublicIdentifier) \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierDoubleQuoted) \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierSingleQuoted) \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicIdentifier) \
    __ENUMERATE_TOKENIZER_STATE(BetweenDOCTYPEPublicAndSystemIdentifiers) \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemKeyword) \
    __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPESystemIdentifier) \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierDoubleQuoted) \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierSingleQuoted) \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemIdentifier) \
    __ENUMERATE_TOKENIZER_STATE(BogusDOCTYPE) \
    __ENUMERATE_TOKENIZER_STATE(CDATASection) \
    __ENUMERATE_TOKENIZER_STATE(CDATASectionBracket) \
    __ENUMERATE_TOKENIZER_STATE(CDATASectionEnd) \
    __ENUMERATE_TOKENIZER_STATE(CharacterReference) \
    __ENUMERATE_TOKENIZER_STATE(NamedCharacterReference) \
    __ENUMERATE_TOKENIZER_STATE(AmbiguousAmpersand) \
    __ENUMERATE_TOKENIZER_STATE(NumericCharacterReference) \
    __ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReferenceStart) \
    __ENUMERATE_TOKENIZER_STATE(DecimalCharacterReferenceStart) \
    __ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReference) \
    __ENUMERATE_TOKENIZER_STATE(DecimalCharacterReference) \
    __ENUMERATE_TOKENIZER_STATE(NumericCharacterReferenceEnd)

// State-machine tokenizer that hands out HTML tokens one at a time via
// next_token(). Only the interface and a handful of inline accessors live in
// this header; the state machine itself is defined out of line.
class HTMLTokenizer {
public:
    // Constructs a tokenizer without input.
    explicit HTMLTokenizer();
    // Constructs a tokenizer over `input`, interpreted using `encoding`
    // (presumably decoded into m_decoded_input -- the definition is not in this
    // header, TODO confirm).
    explicit HTMLTokenizer(StringView input, ByteString const& encoding);

    // One enumerator per entry of ENUMERATE_TOKENIZER_STATES, in list order, so
    // `Data` is the first enumerator.
    enum class State {
#define __ENUMERATE_TOKENIZER_STATE(state) state,
        ENUMERATE_TOKENIZER_STATES
#undef __ENUMERATE_TOKENIZER_STATE
    };

    // Selector passed to next_token(); its precise effect is implemented out of
    // line. Judging by the name it makes tokenization stop once the insertion
    // point is reached (TODO confirm).
    enum class StopAtInsertionPoint {
        No,
        Yes,
    };

    // Returns the next token, or an empty Optional when none is produced.
    // Defaults to not stopping at the insertion point.
    Optional next_token(StopAtInsertionPoint = StopAtInsertionPoint::No);

    // Remembers `parser` (stored as a pointer in m_parser). The unnamed Badge
    // parameter restricts who may call this.
    void set_parser(Badge, HTMLParser& parser) { m_parser = &parser; }

    // Badge-restricted state switch; defined out of line.
    void switch_to(Badge, State new_state);
    // Unrestricted state switch: a plain assignment to m_state and nothing else.
    void switch_to(State new_state)
    {
        m_state = new_state;
    }

    // Setter/getter for the m_blocked flag. How the flag affects tokenization is
    // decided in the out-of-line implementation.
    void set_blocked(bool b) { m_blocked = b; }
    bool is_blocked() const { return m_blocked; }

    // Returns m_decoded_input by value.
    ByteString source() const { return m_decoded_input; }

    // Input-stream manipulation; all three are defined out of line.
    void insert_input_at_insertion_point(StringView input);
    void insert_eof();
    bool is_eof_inserted();

    // True while an insertion point is currently defined.
    bool is_insertion_point_defined() const { return m_insertion_point.defined; }
    // True when an insertion point is defined and the offset of the current
    // iterator within m_utf8_view is at or past the recorded position.
    bool is_insertion_point_reached()
    {
        return m_insertion_point.defined && m_utf8_view.iterator_offset(m_utf8_iterator) >= m_insertion_point.position;
    }
    // Marks the insertion point as undefined; the stored position is left as is.
    void undefine_insertion_point() { m_insertion_point.defined = false; }
    // Saves the current insertion point into the single backup slot
    // m_old_insertion_point; a second store overwrites the first.
    void store_insertion_point() { m_old_insertion_point = m_insertion_point; }
    // Replaces the insertion point with the value last saved by
    // store_insertion_point() (or the default-constructed one if never stored).
    void restore_insertion_point() { m_insertion_point = m_old_insertion_point; }
    // Defines the insertion point at the current iterator offset.
    void update_insertion_point()
    {
        m_insertion_point.defined = true;
        m_insertion_point.position = m_utf8_view.iterator_offset(m_utf8_iterator);
    }

    // This permanently cuts off the tokenizer input stream.
    // (Inline, this only raises the m_aborted flag; nothing in this header
    // clears it again.)
    void abort() { m_aborted = true; }

private:
    // Input cursor helpers; defined out of line.
    void skip(size_t count);
    Optional next_code_point();
    Optional peek_code_point(size_t offset) const;
    // Matching is case-sensitive unless the caller says otherwise.
    bool consume_next_if_match(StringView, CaseSensitivity = CaseSensitivity::CaseSensitive);
    void create_new_token(HTMLToken::Type);
    bool current_end_tag_token_is_appropriate() const;
    String consume_current_builder();

    // Returns the enumerator's name as a C string literal (produced by
    // stringizing the X-macro argument). The switch covers every enumerator, so
    // VERIFY_NOT_REACHED() is only hit for a value outside the enum.
    static char const* state_name(State state)
    {
        switch (state) {
#define __ENUMERATE_TOKENIZER_STATE(state) \
    case State::state:                     \
        return #state;
            ENUMERATE_TOKENIZER_STATES
#undef __ENUMERATE_TOKENIZER_STATE
        };
        VERIFY_NOT_REACHED();
    }

    // Hooks and helpers used by the out-of-line state machine.
    void will_emit(HTMLToken&);
    void will_switch_to(State);
    void will_reconsume_in(State);

    bool consumed_as_part_of_an_attribute() const;

    void restore_to(Utf8CodePointIterator const& new_iterator);
    HTMLToken::Position nth_last_position(size_t n = 0);

    // Set through set_parser(); empty until then.
    GC::Ptr m_parser;

    // Current state, and the state to go back to (both start out as Data).
    State m_state { State::Data };
    State m_return_state { State::Data };

    Vector m_temporary_buffer;

    // The text returned by source().
    ByteString m_decoded_input;

    // A position in the input plus a flag saying whether it is meaningful.
    // `position` is compared against m_utf8_view.iterator_offset(...), so it is
    // expressed in whatever unit that function returns.
    struct InsertionPoint {
        size_t position { 0 };
        bool defined { false };
    };
    // Current insertion point, and the backup used by store_/restore_insertion_point().
    InsertionPoint m_insertion_point {};
    InsertionPoint m_old_insertion_point {};

    // View over the input and the cursors into it. m_prev_utf8_iterator
    // presumably trails m_utf8_iterator by one step (TODO confirm in the
    // implementation).
    Utf8View m_utf8_view;
    Utf8CodePointIterator m_utf8_iterator;
    Utf8CodePointIterator m_prev_utf8_iterator;

    // Token under construction and the scratch builder that feeds it.
    HTMLToken m_current_token;
    StringBuilder m_current_builder;

    Optional m_last_emitted_start_tag_name;

    bool m_explicit_eof_inserted { false };
    bool m_has_emitted_eof { false };

    Queue m_queued_tokens;

    u32 m_character_reference_code { 0 };

    // Backing flags for set_blocked()/is_blocked() and abort().
    bool m_blocked { false };
    bool m_aborted { false };

    Vector m_source_positions;
};

}