HTMLToken.h 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367
  1. /*
  2. * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
  3. * Copyright (c) 2021, Max Wipfli <max.wipfli@serenityos.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #pragma once
  8. #include <AK/DeprecatedFlyString.h>
  9. #include <AK/DeprecatedString.h>
  10. #include <AK/Function.h>
  11. #include <AK/OwnPtr.h>
  12. #include <AK/Types.h>
  13. #include <AK/Variant.h>
  14. #include <AK/Vector.h>
  15. namespace Web::HTML {
  16. class HTMLTokenizer;
  17. class HTMLToken {
  18. AK_MAKE_NONCOPYABLE(HTMLToken);
  19. AK_MAKE_DEFAULT_MOVABLE(HTMLToken);
  20. public:
  21. enum class Type : u8 {
  22. Invalid,
  23. DOCTYPE,
  24. StartTag,
  25. EndTag,
  26. Comment,
  27. Character,
  28. EndOfFile,
  29. };
  30. struct Position {
  31. size_t line { 0 };
  32. size_t column { 0 };
  33. size_t byte_offset { 0 };
  34. };
  35. struct Attribute {
  36. DeprecatedString prefix;
  37. DeprecatedString local_name { "" };
  38. DeprecatedString namespace_;
  39. DeprecatedString value { "" };
  40. Position name_start_position;
  41. Position value_start_position;
  42. Position name_end_position;
  43. Position value_end_position;
  44. };
  45. struct DoctypeData {
  46. // NOTE: "Missing" is a distinct state from the empty string.
  47. DeprecatedString name;
  48. DeprecatedString public_identifier;
  49. DeprecatedString system_identifier;
  50. bool missing_name { true };
  51. bool missing_public_identifier { true };
  52. bool missing_system_identifier { true };
  53. bool force_quirks { false };
  54. };
  55. static HTMLToken make_character(u32 code_point)
  56. {
  57. HTMLToken token { Type::Character };
  58. token.set_code_point(code_point);
  59. return token;
  60. }
  61. static HTMLToken make_start_tag(DeprecatedFlyString const& tag_name)
  62. {
  63. HTMLToken token { Type::StartTag };
  64. token.set_tag_name(tag_name);
  65. return token;
  66. }
  67. HTMLToken() = default;
  68. HTMLToken(Type type)
  69. : m_type(type)
  70. {
  71. switch (m_type) {
  72. case Type::Character:
  73. m_data.set(0u);
  74. break;
  75. case Type::DOCTYPE:
  76. m_data.set(OwnPtr<DoctypeData> {});
  77. break;
  78. case Type::StartTag:
  79. case Type::EndTag:
  80. m_data.set(OwnPtr<Vector<Attribute>>());
  81. break;
  82. default:
  83. break;
  84. }
  85. }
  86. bool is_doctype() const { return m_type == Type::DOCTYPE; }
  87. bool is_start_tag() const { return m_type == Type::StartTag; }
  88. bool is_end_tag() const { return m_type == Type::EndTag; }
  89. bool is_comment() const { return m_type == Type::Comment; }
  90. bool is_character() const { return m_type == Type::Character; }
  91. bool is_end_of_file() const { return m_type == Type::EndOfFile; }
  92. u32 code_point() const
  93. {
  94. VERIFY(is_character());
  95. return m_data.get<u32>();
  96. }
  97. bool is_parser_whitespace() const
  98. {
  99. // NOTE: The parser considers '\r' to be whitespace, while the tokenizer does not.
  100. if (!is_character())
  101. return false;
  102. switch (code_point()) {
  103. case '\t':
  104. case '\n':
  105. case '\f':
  106. case '\r':
  107. case ' ':
  108. return true;
  109. default:
  110. return false;
  111. }
  112. }
  113. void set_code_point(u32 code_point)
  114. {
  115. VERIFY(is_character());
  116. m_data.get<u32>() = code_point;
  117. }
  118. DeprecatedFlyString const& comment() const
  119. {
  120. VERIFY(is_comment());
  121. return m_string_data;
  122. }
  123. void set_comment(DeprecatedString comment)
  124. {
  125. VERIFY(is_comment());
  126. m_string_data = move(comment);
  127. }
  128. DeprecatedFlyString const& tag_name() const
  129. {
  130. VERIFY(is_start_tag() || is_end_tag());
  131. return m_string_data;
  132. }
  133. void set_tag_name(DeprecatedString name)
  134. {
  135. VERIFY(is_start_tag() || is_end_tag());
  136. m_string_data = move(name);
  137. }
  138. bool is_self_closing() const
  139. {
  140. VERIFY(is_start_tag() || is_end_tag());
  141. return m_tag_self_closing;
  142. }
  143. void set_self_closing(bool self_closing)
  144. {
  145. VERIFY(is_start_tag() || is_end_tag());
  146. m_tag_self_closing = self_closing;
  147. }
  148. bool has_acknowledged_self_closing_flag() const
  149. {
  150. VERIFY(is_self_closing());
  151. return m_tag_self_closing_acknowledged;
  152. }
  153. void acknowledge_self_closing_flag_if_set()
  154. {
  155. if (is_self_closing())
  156. m_tag_self_closing_acknowledged = true;
  157. }
  158. bool has_attributes() const
  159. {
  160. VERIFY(is_start_tag() || is_end_tag());
  161. auto* ptr = tag_attributes();
  162. return ptr && !ptr->is_empty();
  163. }
  164. size_t attribute_count() const
  165. {
  166. VERIFY(is_start_tag() || is_end_tag());
  167. if (auto* ptr = tag_attributes())
  168. return ptr->size();
  169. return 0;
  170. }
  171. void add_attribute(Attribute attribute)
  172. {
  173. VERIFY(is_start_tag() || is_end_tag());
  174. ensure_tag_attributes().append(move(attribute));
  175. }
  176. Attribute const& last_attribute() const
  177. {
  178. VERIFY(is_start_tag() || is_end_tag());
  179. VERIFY(has_attributes());
  180. return tag_attributes()->last();
  181. }
  182. Attribute& last_attribute()
  183. {
  184. VERIFY(is_start_tag() || is_end_tag());
  185. VERIFY(has_attributes());
  186. return tag_attributes()->last();
  187. }
  188. void drop_attributes()
  189. {
  190. VERIFY(is_start_tag() || is_end_tag());
  191. m_data.get<OwnPtr<Vector<Attribute>>>().clear();
  192. }
  193. void for_each_attribute(Function<IterationDecision(Attribute const&)> callback) const
  194. {
  195. VERIFY(is_start_tag() || is_end_tag());
  196. auto* ptr = tag_attributes();
  197. if (!ptr)
  198. return;
  199. for (auto& attribute : *ptr) {
  200. if (callback(attribute) == IterationDecision::Break)
  201. break;
  202. }
  203. }
  204. void for_each_attribute(Function<IterationDecision(Attribute&)> callback)
  205. {
  206. VERIFY(is_start_tag() || is_end_tag());
  207. auto* ptr = tag_attributes();
  208. if (!ptr)
  209. return;
  210. for (auto& attribute : *ptr) {
  211. if (callback(attribute) == IterationDecision::Break)
  212. break;
  213. }
  214. }
  215. StringView attribute(DeprecatedFlyString const& attribute_name) const
  216. {
  217. if (auto result = raw_attribute(attribute_name); result.has_value())
  218. return result->value;
  219. return {};
  220. }
  221. Optional<Attribute const&> raw_attribute(DeprecatedFlyString const& attribute_name) const
  222. {
  223. VERIFY(is_start_tag() || is_end_tag());
  224. auto* ptr = tag_attributes();
  225. if (!ptr)
  226. return {};
  227. for (auto& attribute : *ptr) {
  228. if (attribute_name == attribute.local_name)
  229. return attribute;
  230. }
  231. return {};
  232. }
  233. bool has_attribute(DeprecatedFlyString const& attribute_name)
  234. {
  235. return !attribute(attribute_name).is_null();
  236. }
  237. void adjust_tag_name(DeprecatedFlyString const& old_name, DeprecatedFlyString const& new_name)
  238. {
  239. VERIFY(is_start_tag() || is_end_tag());
  240. if (old_name == tag_name())
  241. set_tag_name(new_name);
  242. }
  243. void adjust_attribute_name(DeprecatedFlyString const& old_name, DeprecatedFlyString const& new_name)
  244. {
  245. VERIFY(is_start_tag() || is_end_tag());
  246. for_each_attribute([&](Attribute& attribute) {
  247. if (old_name == attribute.local_name)
  248. attribute.local_name = new_name;
  249. return IterationDecision::Continue;
  250. });
  251. }
  252. void adjust_foreign_attribute(DeprecatedFlyString const& old_name, DeprecatedFlyString const& prefix, DeprecatedFlyString const& local_name, DeprecatedFlyString const& namespace_)
  253. {
  254. VERIFY(is_start_tag() || is_end_tag());
  255. for_each_attribute([&](Attribute& attribute) {
  256. if (old_name == attribute.local_name) {
  257. attribute.prefix = prefix;
  258. attribute.local_name = local_name;
  259. attribute.namespace_ = namespace_;
  260. }
  261. return IterationDecision::Continue;
  262. });
  263. }
  264. DoctypeData const& doctype_data() const
  265. {
  266. VERIFY(is_doctype());
  267. auto* ptr = m_data.get<OwnPtr<DoctypeData>>().ptr();
  268. VERIFY(ptr);
  269. return *ptr;
  270. }
  271. DoctypeData& ensure_doctype_data()
  272. {
  273. VERIFY(is_doctype());
  274. auto& ptr = m_data.get<OwnPtr<DoctypeData>>();
  275. if (!ptr)
  276. ptr = make<DoctypeData>();
  277. return *ptr;
  278. }
  279. Type type() const { return m_type; }
  280. DeprecatedString to_deprecated_string() const;
  281. Position const& start_position() const { return m_start_position; }
  282. Position const& end_position() const { return m_end_position; }
  283. void set_start_position(Badge<HTMLTokenizer>, Position start_position) { m_start_position = start_position; }
  284. void set_end_position(Badge<HTMLTokenizer>, Position end_position) { m_end_position = end_position; }
  285. private:
  286. Vector<Attribute> const* tag_attributes() const
  287. {
  288. return m_data.get<OwnPtr<Vector<Attribute>>>().ptr();
  289. }
  290. Vector<Attribute>* tag_attributes()
  291. {
  292. return m_data.get<OwnPtr<Vector<Attribute>>>().ptr();
  293. }
  294. Vector<Attribute>& ensure_tag_attributes()
  295. {
  296. VERIFY(is_start_tag() || is_end_tag());
  297. auto& ptr = m_data.get<OwnPtr<Vector<Attribute>>>();
  298. if (!ptr)
  299. ptr = make<Vector<Attribute>>();
  300. return *ptr;
  301. }
  302. Type m_type { Type::Invalid };
  303. // Type::StartTag and Type::EndTag
  304. bool m_tag_self_closing { false };
  305. bool m_tag_self_closing_acknowledged { false };
  306. // Type::Comment (comment data), Type::StartTag and Type::EndTag (tag name)
  307. DeprecatedFlyString m_string_data;
  308. Variant<Empty, u32, OwnPtr<DoctypeData>, OwnPtr<Vector<Attribute>>> m_data {};
  309. Position m_start_position;
  310. Position m_end_position;
  311. };
  312. }