HTMLToken.h 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221
  1. /*
  2. * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #pragma once
  7. #include <AK/FlyString.h>
  8. #include <AK/String.h>
  9. #include <AK/Types.h>
  10. #include <AK/Utf8View.h>
  11. #include <AK/Vector.h>
  12. namespace Web::HTML {
  13. class HTMLToken {
  14. friend class HTMLDocumentParser;
  15. friend class HTMLTokenizer;
  16. public:
  17. enum class Type {
  18. Invalid,
  19. DOCTYPE,
  20. StartTag,
  21. EndTag,
  22. Comment,
  23. Character,
  24. EndOfFile,
  25. };
  26. static HTMLToken make_character(u32 code_point)
  27. {
  28. HTMLToken token;
  29. token.m_type = Type::Character;
  30. StringBuilder builder;
  31. // FIXME: This narrows code_point to char, should this be append_code_point() instead?
  32. builder.append(code_point);
  33. token.m_comment_or_character.data = builder.to_string();
  34. return token;
  35. }
  36. static HTMLToken make_start_tag(FlyString const& tag_name)
  37. {
  38. HTMLToken token;
  39. token.m_type = Type::StartTag;
  40. token.m_tag.tag_name = tag_name;
  41. return token;
  42. }
  43. bool is_doctype() const { return m_type == Type::DOCTYPE; }
  44. bool is_start_tag() const { return m_type == Type::StartTag; }
  45. bool is_end_tag() const { return m_type == Type::EndTag; }
  46. bool is_comment() const { return m_type == Type::Comment; }
  47. bool is_character() const { return m_type == Type::Character; }
  48. bool is_end_of_file() const { return m_type == Type::EndOfFile; }
  49. u32 code_point() const
  50. {
  51. VERIFY(is_character());
  52. Utf8View view(m_comment_or_character.data);
  53. VERIFY(view.length() == 1);
  54. return *view.begin();
  55. }
  56. bool is_parser_whitespace() const
  57. {
  58. // NOTE: The parser considers '\r' to be whitespace, while the tokenizer does not.
  59. if (!is_character())
  60. return false;
  61. switch (code_point()) {
  62. case '\t':
  63. case '\n':
  64. case '\f':
  65. case '\r':
  66. case ' ':
  67. return true;
  68. default:
  69. return false;
  70. }
  71. }
  72. String tag_name() const
  73. {
  74. VERIFY(is_start_tag() || is_end_tag());
  75. return m_tag.tag_name;
  76. }
  77. bool is_self_closing() const
  78. {
  79. VERIFY(is_start_tag() || is_end_tag());
  80. return m_tag.self_closing;
  81. }
  82. bool has_acknowledged_self_closing_flag() const
  83. {
  84. VERIFY(is_self_closing());
  85. return m_tag.self_closing_acknowledged;
  86. }
  87. void acknowledge_self_closing_flag_if_set()
  88. {
  89. if (is_self_closing())
  90. m_tag.self_closing_acknowledged = true;
  91. }
  92. StringView attribute(FlyString const& attribute_name)
  93. {
  94. VERIFY(is_start_tag() || is_end_tag());
  95. for (auto& attribute : m_tag.attributes) {
  96. if (attribute_name == attribute.local_name)
  97. return attribute.value;
  98. }
  99. return {};
  100. }
  101. bool has_attribute(FlyString const& attribute_name)
  102. {
  103. return !attribute(attribute_name).is_null();
  104. }
  105. void adjust_tag_name(FlyString const& old_name, FlyString const& new_name)
  106. {
  107. VERIFY(is_start_tag() || is_end_tag());
  108. if (old_name == m_tag.tag_name)
  109. m_tag.tag_name = new_name;
  110. }
  111. void adjust_attribute_name(FlyString const& old_name, FlyString const& new_name)
  112. {
  113. VERIFY(is_start_tag() || is_end_tag());
  114. for (auto& attribute : m_tag.attributes) {
  115. if (old_name == attribute.local_name) {
  116. attribute.local_name = new_name;
  117. }
  118. }
  119. }
  120. void adjust_foreign_attribute(FlyString const& old_name, FlyString const& prefix, FlyString const& local_name, FlyString const& namespace_)
  121. {
  122. VERIFY(is_start_tag() || is_end_tag());
  123. for (auto& attribute : m_tag.attributes) {
  124. if (old_name == attribute.local_name) {
  125. attribute.prefix = prefix;
  126. attribute.local_name = local_name;
  127. attribute.namespace_ = namespace_;
  128. }
  129. }
  130. }
  131. void drop_attributes()
  132. {
  133. VERIFY(is_start_tag() || is_end_tag());
  134. m_tag.attributes.clear();
  135. }
  136. Type type() const { return m_type; }
  137. String to_string() const;
  138. auto const& start_position() const { return m_start_position; }
  139. auto const& end_position() const { return m_end_position; }
  140. auto const& attributes() const
  141. {
  142. VERIFY(is_start_tag() || is_end_tag());
  143. return m_tag.attributes;
  144. }
  145. private:
  146. struct Position {
  147. size_t line { 0 };
  148. size_t column { 0 };
  149. };
  150. struct AttributeBuilder {
  151. String prefix;
  152. String local_name;
  153. String namespace_;
  154. String value;
  155. Position name_start_position;
  156. Position value_start_position;
  157. Position name_end_position;
  158. Position value_end_position;
  159. };
  160. Type m_type { Type::Invalid };
  161. // Type::DOCTYPE
  162. struct {
  163. // NOTE: "Missing" is a distinct state from the empty string.
  164. String name;
  165. bool missing_name { true };
  166. String public_identifier;
  167. bool missing_public_identifier { true };
  168. String system_identifier;
  169. bool missing_system_identifier { true };
  170. bool force_quirks { false };
  171. } m_doctype;
  172. // Type::StartTag
  173. // Type::EndTag
  174. struct {
  175. String tag_name;
  176. bool self_closing { false };
  177. bool self_closing_acknowledged { false };
  178. Vector<AttributeBuilder> attributes;
  179. } m_tag;
  180. // Type::Comment
  181. // Type::Character
  182. struct {
  183. String data;
  184. } m_comment_or_character;
  185. Position m_start_position;
  186. Position m_end_position;
  187. };
  188. }