HTMLToken.h 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251
  1. /*
  2. * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
  3. * Copyright (c) 2021, Max Wipfli <max.wipfli@serenityos.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #pragma once
  8. #include <AK/FlyString.h>
  9. #include <AK/String.h>
  10. #include <AK/Types.h>
  11. #include <AK/Utf8View.h>
  12. #include <AK/Vector.h>
  13. namespace Web::HTML {
  14. class HTMLToken {
  15. friend class HTMLDocumentParser;
  16. friend class HTMLTokenizer;
  17. public:
  18. enum class Type {
  19. Invalid,
  20. DOCTYPE,
  21. StartTag,
  22. EndTag,
  23. Comment,
  24. Character,
  25. EndOfFile,
  26. };
  27. struct Position {
  28. size_t line { 0 };
  29. size_t column { 0 };
  30. };
  31. struct Attribute {
  32. String prefix;
  33. String local_name;
  34. String namespace_;
  35. String value;
  36. Position name_start_position;
  37. Position value_start_position;
  38. Position name_end_position;
  39. Position value_end_position;
  40. };
  41. static HTMLToken make_character(u32 code_point)
  42. {
  43. HTMLToken token;
  44. token.m_type = Type::Character;
  45. token.set_code_point(code_point);
  46. return token;
  47. }
  48. static HTMLToken make_start_tag(FlyString const& tag_name)
  49. {
  50. HTMLToken token;
  51. token.m_type = Type::StartTag;
  52. token.set_tag_name(tag_name);
  53. return token;
  54. }
  55. bool is_doctype() const { return m_type == Type::DOCTYPE; }
  56. bool is_start_tag() const { return m_type == Type::StartTag; }
  57. bool is_end_tag() const { return m_type == Type::EndTag; }
  58. bool is_comment() const { return m_type == Type::Comment; }
  59. bool is_character() const { return m_type == Type::Character; }
  60. bool is_end_of_file() const { return m_type == Type::EndOfFile; }
  61. u32 code_point() const
  62. {
  63. VERIFY(is_character());
  64. Utf8View view(m_comment_or_character.data);
  65. VERIFY(view.length() == 1);
  66. return *view.begin();
  67. }
  68. bool is_parser_whitespace() const
  69. {
  70. // NOTE: The parser considers '\r' to be whitespace, while the tokenizer does not.
  71. if (!is_character())
  72. return false;
  73. switch (code_point()) {
  74. case '\t':
  75. case '\n':
  76. case '\f':
  77. case '\r':
  78. case ' ':
  79. return true;
  80. default:
  81. return false;
  82. }
  83. }
  84. void set_code_point(u32 code_point)
  85. {
  86. VERIFY(is_character());
  87. StringBuilder builder;
  88. builder.append_code_point(code_point);
  89. m_comment_or_character.data = builder.to_string();
  90. }
  91. String const& comment() const
  92. {
  93. VERIFY(is_comment());
  94. return m_comment_or_character.data;
  95. }
  96. void set_comment(String comment)
  97. {
  98. VERIFY(is_comment());
  99. m_comment_or_character.data = move(comment);
  100. }
  101. String const& tag_name() const
  102. {
  103. VERIFY(is_start_tag() || is_end_tag());
  104. return m_tag.tag_name;
  105. }
  106. void set_tag_name(String name)
  107. {
  108. VERIFY(is_start_tag() || is_end_tag());
  109. m_tag.tag_name = move(name);
  110. }
  111. bool is_self_closing() const
  112. {
  113. VERIFY(is_start_tag() || is_end_tag());
  114. return m_tag.self_closing;
  115. }
  116. void set_self_closing(bool self_closing)
  117. {
  118. VERIFY(is_start_tag() || is_end_tag());
  119. m_tag.self_closing = self_closing;
  120. }
  121. bool has_acknowledged_self_closing_flag() const
  122. {
  123. VERIFY(is_self_closing());
  124. return m_tag.self_closing_acknowledged;
  125. }
  126. void acknowledge_self_closing_flag_if_set()
  127. {
  128. if (is_self_closing())
  129. m_tag.self_closing_acknowledged = true;
  130. }
  131. StringView attribute(FlyString const& attribute_name)
  132. {
  133. VERIFY(is_start_tag() || is_end_tag());
  134. for (auto& attribute : m_tag.attributes) {
  135. if (attribute_name == attribute.local_name)
  136. return attribute.value;
  137. }
  138. return {};
  139. }
  140. bool has_attribute(FlyString const& attribute_name)
  141. {
  142. return !attribute(attribute_name).is_null();
  143. }
  144. void adjust_tag_name(FlyString const& old_name, FlyString const& new_name)
  145. {
  146. VERIFY(is_start_tag() || is_end_tag());
  147. if (old_name == tag_name())
  148. set_tag_name(new_name);
  149. }
  150. void adjust_attribute_name(FlyString const& old_name, FlyString const& new_name)
  151. {
  152. VERIFY(is_start_tag() || is_end_tag());
  153. for (auto& attribute : m_tag.attributes) {
  154. if (old_name == attribute.local_name) {
  155. attribute.local_name = new_name;
  156. }
  157. }
  158. }
  159. void adjust_foreign_attribute(FlyString const& old_name, FlyString const& prefix, FlyString const& local_name, FlyString const& namespace_)
  160. {
  161. VERIFY(is_start_tag() || is_end_tag());
  162. for (auto& attribute : m_tag.attributes) {
  163. if (old_name == attribute.local_name) {
  164. attribute.prefix = prefix;
  165. attribute.local_name = local_name;
  166. attribute.namespace_ = namespace_;
  167. }
  168. }
  169. }
  170. void drop_attributes()
  171. {
  172. VERIFY(is_start_tag() || is_end_tag());
  173. m_tag.attributes.clear();
  174. }
  175. Type type() const { return m_type; }
  176. String to_string() const;
  177. Position const& start_position() const { return m_start_position; }
  178. Position const& end_position() const { return m_end_position; }
  179. Vector<Attribute> const& attributes() const
  180. {
  181. VERIFY(is_start_tag() || is_end_tag());
  182. return m_tag.attributes;
  183. }
  184. private:
  185. Type m_type { Type::Invalid };
  186. // Type::DOCTYPE
  187. struct {
  188. // NOTE: "Missing" is a distinct state from the empty string.
  189. String name;
  190. bool missing_name { true };
  191. String public_identifier;
  192. bool missing_public_identifier { true };
  193. String system_identifier;
  194. bool missing_system_identifier { true };
  195. bool force_quirks { false };
  196. } m_doctype;
  197. // Type::StartTag
  198. // Type::EndTag
  199. struct {
  200. String tag_name;
  201. bool self_closing { false };
  202. bool self_closing_acknowledged { false };
  203. Vector<Attribute> attributes;
  204. } m_tag;
  205. // Type::Comment
  206. // Type::Character
  207. struct {
  208. String data;
  209. } m_comment_or_character;
  210. Position m_start_position;
  211. Position m_end_position;
  212. };
  213. }