HTMLToken.h 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315
  1. /*
  2. * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
  3. * Copyright (c) 2021, Max Wipfli <max.wipfli@serenityos.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #pragma once
  8. #include <AK/FlyString.h>
  9. #include <AK/Function.h>
  10. #include <AK/String.h>
  11. #include <AK/Types.h>
  12. #include <AK/Utf8View.h>
  13. #include <AK/Vector.h>
  14. namespace Web::HTML {
  15. class HTMLTokenizer;
  16. class HTMLToken {
  17. friend class HTMLDocumentParser;
  18. friend class HTMLTokenizer;
  19. public:
  20. enum class Type {
  21. Invalid,
  22. DOCTYPE,
  23. StartTag,
  24. EndTag,
  25. Comment,
  26. Character,
  27. EndOfFile,
  28. };
  29. struct Position {
  30. size_t line { 0 };
  31. size_t column { 0 };
  32. };
  33. struct Attribute {
  34. String prefix;
  35. String local_name;
  36. String namespace_;
  37. String value;
  38. Position name_start_position;
  39. Position value_start_position;
  40. Position name_end_position;
  41. Position value_end_position;
  42. };
  43. struct DoctypeData {
  44. // NOTE: "Missing" is a distinct state from the empty string.
  45. String name;
  46. String public_identifier;
  47. String system_identifier;
  48. bool missing_name { true };
  49. bool missing_public_identifier { true };
  50. bool missing_system_identifier { true };
  51. bool force_quirks { false };
  52. };
  53. static HTMLToken make_character(u32 code_point)
  54. {
  55. HTMLToken token;
  56. token.m_type = Type::Character;
  57. token.set_code_point(code_point);
  58. return token;
  59. }
  60. static HTMLToken make_start_tag(FlyString const& tag_name)
  61. {
  62. HTMLToken token;
  63. token.m_type = Type::StartTag;
  64. token.set_tag_name(tag_name);
  65. return token;
  66. }
  67. bool is_doctype() const { return m_type == Type::DOCTYPE; }
  68. bool is_start_tag() const { return m_type == Type::StartTag; }
  69. bool is_end_tag() const { return m_type == Type::EndTag; }
  70. bool is_comment() const { return m_type == Type::Comment; }
  71. bool is_character() const { return m_type == Type::Character; }
  72. bool is_end_of_file() const { return m_type == Type::EndOfFile; }
  73. u32 code_point() const
  74. {
  75. VERIFY(is_character());
  76. Utf8View view(m_comment_or_character.data);
  77. VERIFY(view.length() == 1);
  78. return *view.begin();
  79. }
  80. bool is_parser_whitespace() const
  81. {
  82. // NOTE: The parser considers '\r' to be whitespace, while the tokenizer does not.
  83. if (!is_character())
  84. return false;
  85. switch (code_point()) {
  86. case '\t':
  87. case '\n':
  88. case '\f':
  89. case '\r':
  90. case ' ':
  91. return true;
  92. default:
  93. return false;
  94. }
  95. }
  96. void set_code_point(u32 code_point)
  97. {
  98. VERIFY(is_character());
  99. StringBuilder builder;
  100. builder.append_code_point(code_point);
  101. m_comment_or_character.data = builder.to_string();
  102. }
  103. String const& comment() const
  104. {
  105. VERIFY(is_comment());
  106. return m_comment_or_character.data;
  107. }
  108. void set_comment(String comment)
  109. {
  110. VERIFY(is_comment());
  111. m_comment_or_character.data = move(comment);
  112. }
  113. String const& tag_name() const
  114. {
  115. VERIFY(is_start_tag() || is_end_tag());
  116. return m_tag.tag_name;
  117. }
  118. void set_tag_name(String name)
  119. {
  120. VERIFY(is_start_tag() || is_end_tag());
  121. m_tag.tag_name = move(name);
  122. }
  123. bool is_self_closing() const
  124. {
  125. VERIFY(is_start_tag() || is_end_tag());
  126. return m_tag.self_closing;
  127. }
  128. void set_self_closing(bool self_closing)
  129. {
  130. VERIFY(is_start_tag() || is_end_tag());
  131. m_tag.self_closing = self_closing;
  132. }
  133. bool has_acknowledged_self_closing_flag() const
  134. {
  135. VERIFY(is_self_closing());
  136. return m_tag.self_closing_acknowledged;
  137. }
  138. void acknowledge_self_closing_flag_if_set()
  139. {
  140. if (is_self_closing())
  141. m_tag.self_closing_acknowledged = true;
  142. }
  143. bool has_attributes() const
  144. {
  145. VERIFY(is_start_tag() || is_end_tag());
  146. return !m_tag.attributes.is_empty();
  147. }
  148. size_t attribute_count() const
  149. {
  150. VERIFY(is_start_tag() || is_end_tag());
  151. return m_tag.attributes.size();
  152. }
  153. void add_attribute(Attribute attribute)
  154. {
  155. VERIFY(is_start_tag() || is_end_tag());
  156. m_tag.attributes.append(move(attribute));
  157. }
  158. Attribute const& last_attribute() const
  159. {
  160. VERIFY(is_start_tag() || is_end_tag());
  161. VERIFY(!m_tag.attributes.is_empty());
  162. return m_tag.attributes.last();
  163. }
  164. Attribute& last_attribute()
  165. {
  166. VERIFY(is_start_tag() || is_end_tag());
  167. VERIFY(!m_tag.attributes.is_empty());
  168. return m_tag.attributes.last();
  169. }
  170. void drop_attributes()
  171. {
  172. VERIFY(is_start_tag() || is_end_tag());
  173. m_tag.attributes.clear();
  174. }
  175. void for_each_attribute(Function<IterationDecision(Attribute const&)> callback) const
  176. {
  177. VERIFY(is_start_tag() || is_end_tag());
  178. for (auto& attribute : m_tag.attributes) {
  179. if (callback(attribute) == IterationDecision::Break)
  180. break;
  181. }
  182. }
  183. void for_each_attribute(Function<IterationDecision(Attribute&)> callback)
  184. {
  185. VERIFY(is_start_tag() || is_end_tag());
  186. for (auto& attribute : m_tag.attributes) {
  187. if (callback(attribute) == IterationDecision::Break)
  188. break;
  189. }
  190. }
  191. StringView attribute(FlyString const& attribute_name)
  192. {
  193. VERIFY(is_start_tag() || is_end_tag());
  194. for (auto& attribute : m_tag.attributes) {
  195. if (attribute_name == attribute.local_name)
  196. return attribute.value;
  197. }
  198. return {};
  199. }
  200. bool has_attribute(FlyString const& attribute_name)
  201. {
  202. return !attribute(attribute_name).is_null();
  203. }
  204. void adjust_tag_name(FlyString const& old_name, FlyString const& new_name)
  205. {
  206. VERIFY(is_start_tag() || is_end_tag());
  207. if (old_name == tag_name())
  208. set_tag_name(new_name);
  209. }
  210. void adjust_attribute_name(FlyString const& old_name, FlyString const& new_name)
  211. {
  212. VERIFY(is_start_tag() || is_end_tag());
  213. for_each_attribute([&](Attribute& attribute) {
  214. if (old_name == attribute.local_name)
  215. attribute.local_name = new_name;
  216. return IterationDecision::Continue;
  217. });
  218. }
  219. void adjust_foreign_attribute(FlyString const& old_name, FlyString const& prefix, FlyString const& local_name, FlyString const& namespace_)
  220. {
  221. VERIFY(is_start_tag() || is_end_tag());
  222. for_each_attribute([&](Attribute& attribute) {
  223. if (old_name == attribute.local_name) {
  224. attribute.prefix = prefix;
  225. attribute.local_name = local_name;
  226. attribute.namespace_ = namespace_;
  227. }
  228. return IterationDecision::Continue;
  229. });
  230. }
  231. DoctypeData const& doctype_data() const
  232. {
  233. VERIFY(is_doctype());
  234. return m_doctype;
  235. }
  236. DoctypeData& doctype_data()
  237. {
  238. VERIFY(is_doctype());
  239. return m_doctype;
  240. }
  241. Type type() const { return m_type; }
  242. String to_string() const;
  243. Position const& start_position() const { return m_start_position; }
  244. Position const& end_position() const { return m_end_position; }
  245. void set_start_position(Badge<HTMLTokenizer>, Position start_position) { m_start_position = start_position; }
  246. void set_end_position(Badge<HTMLTokenizer>, Position end_position) { m_end_position = end_position; }
  247. private:
  248. Type m_type { Type::Invalid };
  249. // Type::DOCTYPE
  250. DoctypeData m_doctype;
  251. // Type::StartTag
  252. // Type::EndTag
  253. struct {
  254. String tag_name;
  255. bool self_closing { false };
  256. bool self_closing_acknowledged { false };
  257. Vector<Attribute> attributes;
  258. } m_tag;
  259. // Type::Comment
  260. // Type::Character
  261. struct {
  262. String data;
  263. } m_comment_or_character;
  264. Position m_start_position;
  265. Position m_end_position;
  266. };
  267. }