123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239 |
- /*
- * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
- * Copyright (c) 2021, Max Wipfli <max.wipfli@serenityos.org>
- *
- * SPDX-License-Identifier: BSD-2-Clause
- */
- #pragma once
- #include <AK/FlyString.h>
- #include <AK/String.h>
- #include <AK/Types.h>
- #include <AK/Utf8View.h>
- #include <AK/Vector.h>
- namespace Web::HTML {
- class HTMLToken {
- friend class HTMLDocumentParser;
- friend class HTMLTokenizer;
- public:
- enum class Type {
- Invalid,
- DOCTYPE,
- StartTag,
- EndTag,
- Comment,
- Character,
- EndOfFile,
- };
- struct Position {
- size_t line { 0 };
- size_t column { 0 };
- };
- struct Attribute {
- String prefix;
- String local_name;
- String namespace_;
- String value;
- Position name_start_position;
- Position value_start_position;
- Position name_end_position;
- Position value_end_position;
- };
- static HTMLToken make_character(u32 code_point)
- {
- HTMLToken token;
- token.m_type = Type::Character;
- token.set_code_point(code_point);
- return token;
- }
- static HTMLToken make_start_tag(FlyString const& tag_name)
- {
- HTMLToken token;
- token.m_type = Type::StartTag;
- token.m_tag.tag_name = tag_name;
- return token;
- }
- bool is_doctype() const { return m_type == Type::DOCTYPE; }
- bool is_start_tag() const { return m_type == Type::StartTag; }
- bool is_end_tag() const { return m_type == Type::EndTag; }
- bool is_comment() const { return m_type == Type::Comment; }
- bool is_character() const { return m_type == Type::Character; }
- bool is_end_of_file() const { return m_type == Type::EndOfFile; }
- u32 code_point() const
- {
- VERIFY(is_character());
- Utf8View view(m_comment_or_character.data);
- VERIFY(view.length() == 1);
- return *view.begin();
- }
- bool is_parser_whitespace() const
- {
- // NOTE: The parser considers '\r' to be whitespace, while the tokenizer does not.
- if (!is_character())
- return false;
- switch (code_point()) {
- case '\t':
- case '\n':
- case '\f':
- case '\r':
- case ' ':
- return true;
- default:
- return false;
- }
- }
- void set_code_point(u32 code_point)
- {
- VERIFY(is_character());
- StringBuilder builder;
- builder.append_code_point(code_point);
- m_comment_or_character.data = builder.to_string();
- }
- String const& comment() const
- {
- VERIFY(is_comment());
- return m_comment_or_character.data;
- }
- void set_comment(String comment)
- {
- VERIFY(is_comment());
- m_comment_or_character.data = move(comment);
- }
- String tag_name() const
- {
- VERIFY(is_start_tag() || is_end_tag());
- return m_tag.tag_name;
- }
- bool is_self_closing() const
- {
- VERIFY(is_start_tag() || is_end_tag());
- return m_tag.self_closing;
- }
- bool has_acknowledged_self_closing_flag() const
- {
- VERIFY(is_self_closing());
- return m_tag.self_closing_acknowledged;
- }
- void acknowledge_self_closing_flag_if_set()
- {
- if (is_self_closing())
- m_tag.self_closing_acknowledged = true;
- }
- StringView attribute(FlyString const& attribute_name)
- {
- VERIFY(is_start_tag() || is_end_tag());
- for (auto& attribute : m_tag.attributes) {
- if (attribute_name == attribute.local_name)
- return attribute.value;
- }
- return {};
- }
- bool has_attribute(FlyString const& attribute_name)
- {
- return !attribute(attribute_name).is_null();
- }
- void adjust_tag_name(FlyString const& old_name, FlyString const& new_name)
- {
- VERIFY(is_start_tag() || is_end_tag());
- if (old_name == m_tag.tag_name)
- m_tag.tag_name = new_name;
- }
- void adjust_attribute_name(FlyString const& old_name, FlyString const& new_name)
- {
- VERIFY(is_start_tag() || is_end_tag());
- for (auto& attribute : m_tag.attributes) {
- if (old_name == attribute.local_name) {
- attribute.local_name = new_name;
- }
- }
- }
- void adjust_foreign_attribute(FlyString const& old_name, FlyString const& prefix, FlyString const& local_name, FlyString const& namespace_)
- {
- VERIFY(is_start_tag() || is_end_tag());
- for (auto& attribute : m_tag.attributes) {
- if (old_name == attribute.local_name) {
- attribute.prefix = prefix;
- attribute.local_name = local_name;
- attribute.namespace_ = namespace_;
- }
- }
- }
- void drop_attributes()
- {
- VERIFY(is_start_tag() || is_end_tag());
- m_tag.attributes.clear();
- }
- Type type() const { return m_type; }
- String to_string() const;
- Position const& start_position() const { return m_start_position; }
- Position const& end_position() const { return m_end_position; }
- Vector<Attribute> const& attributes() const
- {
- VERIFY(is_start_tag() || is_end_tag());
- return m_tag.attributes;
- }
- private:
- Type m_type { Type::Invalid };
- // Type::DOCTYPE
- struct {
- // NOTE: "Missing" is a distinct state from the empty string.
- String name;
- bool missing_name { true };
- String public_identifier;
- bool missing_public_identifier { true };
- String system_identifier;
- bool missing_system_identifier { true };
- bool force_quirks { false };
- } m_doctype;
- // Type::StartTag
- // Type::EndTag
- struct {
- String tag_name;
- bool self_closing { false };
- bool self_closing_acknowledged { false };
- Vector<Attribute> attributes;
- } m_tag;
- // Type::Comment
- // Type::Character
- struct {
- String data;
- } m_comment_or_character;
- Position m_start_position;
- Position m_end_position;
- };
- }
|