/*
 * Copyright (c) 2020, Andreas Kling
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#pragma once

// NOTE(review): the targets of the seven #include directives below are absent
// in this view of the file (presumably lost during extraction, along with the
// template arguments noted further down) — restore them from the upstream
// header before building.
#include
#include
#include
#include
#include
#include
#include

namespace Web::HTML {

// X-macro listing every tokenizer state exactly once.
//
// A user defines __ENUMERATE_TOKENIZER_STATE(name) to whatever should be
// generated per state, expands ENUMERATE_TOKENIZER_STATES, and then #undefs
// the helper again. This header expands the list twice: once to produce the
// enumerators of HTMLTokenizer::State and once to produce the cases of
// HTMLTokenizer::state_name().
//
// The order of the entries is the declaration order of the generated
// enumerators, so reordering entries changes their underlying values.
//
// NOTE(review): the names look like the tokenizer states of the HTML
// Standard's tokenization section — confirm before relying on that mapping.
#define ENUMERATE_TOKENIZER_STATES \
    __ENUMERATE_TOKENIZER_STATE(Data) \
    __ENUMERATE_TOKENIZER_STATE(RCDATA) \
    __ENUMERATE_TOKENIZER_STATE(RAWTEXT) \
    __ENUMERATE_TOKENIZER_STATE(ScriptData) \
    __ENUMERATE_TOKENIZER_STATE(PLAINTEXT) \
    __ENUMERATE_TOKENIZER_STATE(TagOpen) \
    __ENUMERATE_TOKENIZER_STATE(EndTagOpen) \
    __ENUMERATE_TOKENIZER_STATE(TagName) \
    __ENUMERATE_TOKENIZER_STATE(RCDATALessThanSign) \
    __ENUMERATE_TOKENIZER_STATE(RCDATAEndTagOpen) \
    __ENUMERATE_TOKENIZER_STATE(RCDATAEndTagName) \
    __ENUMERATE_TOKENIZER_STATE(RAWTEXTLessThanSign) \
    __ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagOpen) \
    __ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagName) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataLessThanSign) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagOpen) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagName) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStart) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStartDash) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscaped) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDash) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDashDash) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedLessThanSign) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagOpen) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagName) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeStart) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscaped) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDash) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDashDash) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedLessThanSign) \
    __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeEnd) \
    __ENUMERATE_TOKENIZER_STATE(BeforeAttributeName) \
    __ENUMERATE_TOKENIZER_STATE(AttributeName) \
    __ENUMERATE_TOKENIZER_STATE(AfterAttributeName) \
    __ENUMERATE_TOKENIZER_STATE(BeforeAttributeValue) \
    __ENUMERATE_TOKENIZER_STATE(AttributeValueDoubleQuoted) \
    __ENUMERATE_TOKENIZER_STATE(AttributeValueSingleQuoted) \
    __ENUMERATE_TOKENIZER_STATE(AttributeValueUnquoted) \
    __ENUMERATE_TOKENIZER_STATE(AfterAttributeValueQuoted) \
    __ENUMERATE_TOKENIZER_STATE(SelfClosingStartTag) \
    __ENUMERATE_TOKENIZER_STATE(BogusComment) \
    __ENUMERATE_TOKENIZER_STATE(MarkupDeclarationOpen) \
    __ENUMERATE_TOKENIZER_STATE(CommentStart) \
    __ENUMERATE_TOKENIZER_STATE(CommentStartDash) \
    __ENUMERATE_TOKENIZER_STATE(Comment) \
    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSign) \
    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBang) \
    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDash) \
    __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDashDash) \
    __ENUMERATE_TOKENIZER_STATE(CommentEndDash) \
    __ENUMERATE_TOKENIZER_STATE(CommentEnd) \
    __ENUMERATE_TOKENIZER_STATE(CommentEndBang) \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPE) \
    __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEName) \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPEName) \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEName) \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicKeyword) \
    __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEPublicIdentifier) \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierDoubleQuoted) \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierSingleQuoted) \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicIdentifier) \
    __ENUMERATE_TOKENIZER_STATE(BetweenDOCTYPEPublicAndSystemIdentifiers) \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemKeyword) \
    __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPESystemIdentifier) \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierDoubleQuoted) \
    __ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierSingleQuoted) \
    __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemIdentifier) \
    __ENUMERATE_TOKENIZER_STATE(BogusDOCTYPE) \
    __ENUMERATE_TOKENIZER_STATE(CDATASection) \
    __ENUMERATE_TOKENIZER_STATE(CDATASectionBracket) \
    __ENUMERATE_TOKENIZER_STATE(CDATASectionEnd) \
    __ENUMERATE_TOKENIZER_STATE(CharacterReference) \
    __ENUMERATE_TOKENIZER_STATE(NamedCharacterReference) \
    __ENUMERATE_TOKENIZER_STATE(AmbiguousAmpersand) \
    __ENUMERATE_TOKENIZER_STATE(NumericCharacterReference) \
    __ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReferenceStart) \
    __ENUMERATE_TOKENIZER_STATE(DecimalCharacterReferenceStart) \
    __ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReference) \
    __ENUMERATE_TOKENIZER_STATE(DecimalCharacterReference) \
    __ENUMERATE_TOKENIZER_STATE(NumericCharacterReferenceEnd)

// State-machine tokenizer interface. Only the inline members are implemented
// in this header; every other member function is merely declared here and is
// defined out of line.
//
// NOTE(review): `Optional`, `Badge`, `Vector` and `Queue` appear below without
// template arguments. That is presumably an artifact of how this view of the
// file was extracted — confirm the element types against the upstream header.
class HTMLTokenizer {
public:
    // Constructs a tokenizer from the given input and encoding name.
    // NOTE(review): presumably `input` is decoded using `encoding` into
    // m_decoded_input — the constructor body is not visible here; confirm.
    explicit HTMLTokenizer(StringView input, String const& encoding);

    // One enumerator per entry of ENUMERATE_TOKENIZER_STATES, in list order.
    enum class State {
#define __ENUMERATE_TOKENIZER_STATE(state) state,
        ENUMERATE_TOKENIZER_STATES
#undef __ENUMERATE_TOKENIZER_STATE
    };

    // Declared here, defined out of line.
    // NOTE(review): presumably yields the next token, or an empty Optional
    // when there is nothing (more) to produce — confirm in the definition.
    Optional next_token();

    // Overload taking a Badge as its first parameter; declared here, defined
    // out of line.
    void switch_to(Badge, State new_state);

    // Overload without a Badge: the inline body only assigns m_state and does
    // nothing else (in particular it does not call will_switch_to()).
    void switch_to(State new_state)
    {
        m_state = new_state;
    }

    // Setter / getter for the m_blocked flag.
    void set_blocked(bool b) { m_blocked = b; }
    bool is_blocked() const { return m_blocked; }

    // Returns m_decoded_input by value.
    String source() const { return m_decoded_input; }

private:
    // Input-consumption helpers; declared here, defined out of line.
    void skip(size_t count);
    Optional next_code_point();
    Optional peek_code_point(size_t offset) const;
    // The comparison is case-sensitive unless the caller passes a different
    // CaseSensitivity.
    bool consume_next_if_match(StringView, CaseSensitivity = CaseSensitivity::CaseSensitive);

    // Token-construction helpers; declared here, defined out of line.
    void create_new_token(HTMLToken::Type);
    bool current_end_tag_token_is_appropriate() const;
    String consume_current_builder();

    // Returns the enumerator name of `state` as a C string. The cases are
    // generated from ENUMERATE_TOKENIZER_STATES using the preprocessor's
    // stringizing operator, so the returned text is exactly the identifier
    // used in the list. If `state` matches no case, control reaches
    // VERIFY_NOT_REACHED().
    static char const* state_name(State state)
    {
        switch (state) {
#define __ENUMERATE_TOKENIZER_STATE(state) \
    case State::state: \
        return #state;
            ENUMERATE_TOKENIZER_STATES
#undef __ENUMERATE_TOKENIZER_STATE
        };
        VERIFY_NOT_REACHED();
    }

    // Hooks named after emit / switch / reconsume events; declared here,
    // defined out of line.
    // NOTE(review): presumably invoked just before the corresponding action —
    // confirm at the call sites.
    void will_emit(HTMLToken&);
    void will_switch_to(State);
    void will_reconsume_in(State);

    bool consumed_as_part_of_an_attribute() const;

    void restore_to(Utf8CodePointIterator const& new_iterator);
    // `n` defaults to 0.
    HTMLToken::Position nth_last_position(size_t n = 0);

    // Current state; starts out as State::Data.
    State m_state { State::Data };
    // Also starts out as State::Data.
    // NOTE(review): presumably the state to go back to after a
    // character-reference sub-state finishes — confirm in the implementation.
    State m_return_state { State::Data };

    Vector m_temporary_buffer;

    // Keep the declaration order of the next four members: non-static data
    // members are initialised in declaration order.
    // NOTE(review): presumably the view and iterators refer into
    // m_decoded_input, which would make this order load-bearing — confirm in
    // the constructor.
    String m_decoded_input;
    Utf8View m_utf8_view;
    Utf8CodePointIterator m_utf8_iterator;
    Utf8CodePointIterator m_prev_utf8_iterator;

    HTMLToken m_current_token;
    StringBuilder m_current_builder;

    Optional m_last_emitted_start_tag_name;

    // Starts out false.
    bool m_has_emitted_eof { false };

    Queue m_queued_tokens;

    // Starts out as 0.
    u32 m_character_reference_code { 0 };

    // Backing field for set_blocked() / is_blocked(); starts out false.
    bool m_blocked { false };

    Vector m_source_positions;
};

}