HTMLToken.h 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310
  1. /*
  2. * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
  3. * Copyright (c) 2021, Max Wipfli <max.wipfli@serenityos.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #pragma once
  8. #include <AK/FlyString.h>
  9. #include <AK/Function.h>
  10. #include <AK/String.h>
  11. #include <AK/Types.h>
  12. #include <AK/Utf8View.h>
  13. #include <AK/Vector.h>
  14. namespace Web::HTML {
  15. class HTMLToken {
  16. friend class HTMLDocumentParser;
  17. friend class HTMLTokenizer;
  18. public:
  19. enum class Type {
  20. Invalid,
  21. DOCTYPE,
  22. StartTag,
  23. EndTag,
  24. Comment,
  25. Character,
  26. EndOfFile,
  27. };
  28. struct Position {
  29. size_t line { 0 };
  30. size_t column { 0 };
  31. };
  32. struct Attribute {
  33. String prefix;
  34. String local_name;
  35. String namespace_;
  36. String value;
  37. Position name_start_position;
  38. Position value_start_position;
  39. Position name_end_position;
  40. Position value_end_position;
  41. };
  42. struct DoctypeData {
  43. // NOTE: "Missing" is a distinct state from the empty string.
  44. String name;
  45. String public_identifier;
  46. String system_identifier;
  47. bool missing_name { true };
  48. bool missing_public_identifier { true };
  49. bool missing_system_identifier { true };
  50. bool force_quirks { false };
  51. };
  52. static HTMLToken make_character(u32 code_point)
  53. {
  54. HTMLToken token;
  55. token.m_type = Type::Character;
  56. token.set_code_point(code_point);
  57. return token;
  58. }
  59. static HTMLToken make_start_tag(FlyString const& tag_name)
  60. {
  61. HTMLToken token;
  62. token.m_type = Type::StartTag;
  63. token.set_tag_name(tag_name);
  64. return token;
  65. }
  66. bool is_doctype() const { return m_type == Type::DOCTYPE; }
  67. bool is_start_tag() const { return m_type == Type::StartTag; }
  68. bool is_end_tag() const { return m_type == Type::EndTag; }
  69. bool is_comment() const { return m_type == Type::Comment; }
  70. bool is_character() const { return m_type == Type::Character; }
  71. bool is_end_of_file() const { return m_type == Type::EndOfFile; }
  72. u32 code_point() const
  73. {
  74. VERIFY(is_character());
  75. Utf8View view(m_comment_or_character.data);
  76. VERIFY(view.length() == 1);
  77. return *view.begin();
  78. }
  79. bool is_parser_whitespace() const
  80. {
  81. // NOTE: The parser considers '\r' to be whitespace, while the tokenizer does not.
  82. if (!is_character())
  83. return false;
  84. switch (code_point()) {
  85. case '\t':
  86. case '\n':
  87. case '\f':
  88. case '\r':
  89. case ' ':
  90. return true;
  91. default:
  92. return false;
  93. }
  94. }
  95. void set_code_point(u32 code_point)
  96. {
  97. VERIFY(is_character());
  98. StringBuilder builder;
  99. builder.append_code_point(code_point);
  100. m_comment_or_character.data = builder.to_string();
  101. }
  102. String const& comment() const
  103. {
  104. VERIFY(is_comment());
  105. return m_comment_or_character.data;
  106. }
  107. void set_comment(String comment)
  108. {
  109. VERIFY(is_comment());
  110. m_comment_or_character.data = move(comment);
  111. }
  112. String const& tag_name() const
  113. {
  114. VERIFY(is_start_tag() || is_end_tag());
  115. return m_tag.tag_name;
  116. }
  117. void set_tag_name(String name)
  118. {
  119. VERIFY(is_start_tag() || is_end_tag());
  120. m_tag.tag_name = move(name);
  121. }
  122. bool is_self_closing() const
  123. {
  124. VERIFY(is_start_tag() || is_end_tag());
  125. return m_tag.self_closing;
  126. }
  127. void set_self_closing(bool self_closing)
  128. {
  129. VERIFY(is_start_tag() || is_end_tag());
  130. m_tag.self_closing = self_closing;
  131. }
  132. bool has_acknowledged_self_closing_flag() const
  133. {
  134. VERIFY(is_self_closing());
  135. return m_tag.self_closing_acknowledged;
  136. }
  137. void acknowledge_self_closing_flag_if_set()
  138. {
  139. if (is_self_closing())
  140. m_tag.self_closing_acknowledged = true;
  141. }
  142. bool has_attributes() const
  143. {
  144. VERIFY(is_start_tag() || is_end_tag());
  145. return !m_tag.attributes.is_empty();
  146. }
  147. size_t attribute_count() const
  148. {
  149. VERIFY(is_start_tag() || is_end_tag());
  150. return m_tag.attributes.size();
  151. }
  152. void add_attribute(Attribute attribute)
  153. {
  154. VERIFY(is_start_tag() || is_end_tag());
  155. m_tag.attributes.append(move(attribute));
  156. }
  157. Attribute const& last_attribute() const
  158. {
  159. VERIFY(is_start_tag() || is_end_tag());
  160. VERIFY(!m_tag.attributes.is_empty());
  161. return m_tag.attributes.last();
  162. }
  163. Attribute& last_attribute()
  164. {
  165. VERIFY(is_start_tag() || is_end_tag());
  166. VERIFY(!m_tag.attributes.is_empty());
  167. return m_tag.attributes.last();
  168. }
  169. void drop_attributes()
  170. {
  171. VERIFY(is_start_tag() || is_end_tag());
  172. m_tag.attributes.clear();
  173. }
  174. void for_each_attribute(Function<IterationDecision(Attribute const&)> callback) const
  175. {
  176. VERIFY(is_start_tag() || is_end_tag());
  177. for (auto& attribute : m_tag.attributes) {
  178. if (callback(attribute) == IterationDecision::Break)
  179. break;
  180. }
  181. }
  182. void for_each_attribute(Function<IterationDecision(Attribute&)> callback)
  183. {
  184. VERIFY(is_start_tag() || is_end_tag());
  185. for (auto& attribute : m_tag.attributes) {
  186. if (callback(attribute) == IterationDecision::Break)
  187. break;
  188. }
  189. }
  190. StringView attribute(FlyString const& attribute_name)
  191. {
  192. VERIFY(is_start_tag() || is_end_tag());
  193. for (auto& attribute : m_tag.attributes) {
  194. if (attribute_name == attribute.local_name)
  195. return attribute.value;
  196. }
  197. return {};
  198. }
  199. bool has_attribute(FlyString const& attribute_name)
  200. {
  201. return !attribute(attribute_name).is_null();
  202. }
  203. void adjust_tag_name(FlyString const& old_name, FlyString const& new_name)
  204. {
  205. VERIFY(is_start_tag() || is_end_tag());
  206. if (old_name == tag_name())
  207. set_tag_name(new_name);
  208. }
  209. void adjust_attribute_name(FlyString const& old_name, FlyString const& new_name)
  210. {
  211. VERIFY(is_start_tag() || is_end_tag());
  212. for_each_attribute([&](Attribute& attribute) {
  213. if (old_name == attribute.local_name)
  214. attribute.local_name = new_name;
  215. return IterationDecision::Continue;
  216. });
  217. }
  218. void adjust_foreign_attribute(FlyString const& old_name, FlyString const& prefix, FlyString const& local_name, FlyString const& namespace_)
  219. {
  220. VERIFY(is_start_tag() || is_end_tag());
  221. for_each_attribute([&](Attribute& attribute) {
  222. if (old_name == attribute.local_name) {
  223. attribute.prefix = prefix;
  224. attribute.local_name = local_name;
  225. attribute.namespace_ = namespace_;
  226. }
  227. return IterationDecision::Continue;
  228. });
  229. }
  230. DoctypeData const& doctype_data() const
  231. {
  232. VERIFY(is_doctype());
  233. return m_doctype;
  234. }
  235. DoctypeData& doctype_data()
  236. {
  237. VERIFY(is_doctype());
  238. return m_doctype;
  239. }
  240. Type type() const { return m_type; }
  241. String to_string() const;
  242. Position const& start_position() const { return m_start_position; }
  243. Position const& end_position() const { return m_end_position; }
  244. private:
  245. Type m_type { Type::Invalid };
  246. // Type::DOCTYPE
  247. DoctypeData m_doctype;
  248. // Type::StartTag
  249. // Type::EndTag
  250. struct {
  251. String tag_name;
  252. bool self_closing { false };
  253. bool self_closing_acknowledged { false };
  254. Vector<Attribute> attributes;
  255. } m_tag;
  256. // Type::Comment
  257. // Type::Character
  258. struct {
  259. String data;
  260. } m_comment_or_character;
  261. Position m_start_position;
  262. Position m_end_position;
  263. };
  264. }