HTMLToken.h 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368
  1. /*
  2. * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
  3. * Copyright (c) 2021, Max Wipfli <max.wipfli@serenityos.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #pragma once
  8. #include <AK/DeprecatedFlyString.h>
  9. #include <AK/DeprecatedString.h>
  10. #include <AK/FlyString.h>
  11. #include <AK/Function.h>
  12. #include <AK/OwnPtr.h>
  13. #include <AK/Types.h>
  14. #include <AK/Variant.h>
  15. #include <AK/Vector.h>
  16. namespace Web::HTML {
  17. class HTMLTokenizer;
  18. class HTMLToken {
  19. AK_MAKE_NONCOPYABLE(HTMLToken);
  20. AK_MAKE_DEFAULT_MOVABLE(HTMLToken);
  21. public:
  22. enum class Type : u8 {
  23. Invalid,
  24. DOCTYPE,
  25. StartTag,
  26. EndTag,
  27. Comment,
  28. Character,
  29. EndOfFile,
  30. };
  31. struct Position {
  32. size_t line { 0 };
  33. size_t column { 0 };
  34. size_t byte_offset { 0 };
  35. };
  36. struct Attribute {
  37. DeprecatedString prefix;
  38. FlyString local_name;
  39. DeprecatedString namespace_;
  40. String value;
  41. Position name_start_position;
  42. Position value_start_position;
  43. Position name_end_position;
  44. Position value_end_position;
  45. };
  46. struct DoctypeData {
  47. // NOTE: "Missing" is a distinct state from the empty string.
  48. DeprecatedString name;
  49. DeprecatedString public_identifier;
  50. DeprecatedString system_identifier;
  51. bool missing_name { true };
  52. bool missing_public_identifier { true };
  53. bool missing_system_identifier { true };
  54. bool force_quirks { false };
  55. };
  56. static HTMLToken make_character(u32 code_point)
  57. {
  58. HTMLToken token { Type::Character };
  59. token.set_code_point(code_point);
  60. return token;
  61. }
  62. static HTMLToken make_start_tag(FlyString const& tag_name)
  63. {
  64. HTMLToken token { Type::StartTag };
  65. token.set_tag_name(tag_name.to_deprecated_fly_string());
  66. return token;
  67. }
  68. HTMLToken() = default;
  69. HTMLToken(Type type)
  70. : m_type(type)
  71. {
  72. switch (m_type) {
  73. case Type::Character:
  74. m_data.set(0u);
  75. break;
  76. case Type::DOCTYPE:
  77. m_data.set(OwnPtr<DoctypeData> {});
  78. break;
  79. case Type::StartTag:
  80. case Type::EndTag:
  81. m_data.set(OwnPtr<Vector<Attribute>>());
  82. break;
  83. default:
  84. break;
  85. }
  86. }
  87. bool is_doctype() const { return m_type == Type::DOCTYPE; }
  88. bool is_start_tag() const { return m_type == Type::StartTag; }
  89. bool is_end_tag() const { return m_type == Type::EndTag; }
  90. bool is_comment() const { return m_type == Type::Comment; }
  91. bool is_character() const { return m_type == Type::Character; }
  92. bool is_end_of_file() const { return m_type == Type::EndOfFile; }
  93. u32 code_point() const
  94. {
  95. VERIFY(is_character());
  96. return m_data.get<u32>();
  97. }
  98. bool is_parser_whitespace() const
  99. {
  100. // NOTE: The parser considers '\r' to be whitespace, while the tokenizer does not.
  101. if (!is_character())
  102. return false;
  103. switch (code_point()) {
  104. case '\t':
  105. case '\n':
  106. case '\f':
  107. case '\r':
  108. case ' ':
  109. return true;
  110. default:
  111. return false;
  112. }
  113. }
  114. void set_code_point(u32 code_point)
  115. {
  116. VERIFY(is_character());
  117. m_data.get<u32>() = code_point;
  118. }
  119. DeprecatedFlyString const& comment() const
  120. {
  121. VERIFY(is_comment());
  122. return m_string_data;
  123. }
  124. void set_comment(DeprecatedString comment)
  125. {
  126. VERIFY(is_comment());
  127. m_string_data = move(comment);
  128. }
  129. FlyString tag_name() const
  130. {
  131. VERIFY(is_start_tag() || is_end_tag());
  132. return MUST(FlyString::from_deprecated_fly_string(m_string_data));
  133. }
  134. void set_tag_name(DeprecatedString name)
  135. {
  136. VERIFY(is_start_tag() || is_end_tag());
  137. m_string_data = move(name);
  138. }
  139. bool is_self_closing() const
  140. {
  141. VERIFY(is_start_tag() || is_end_tag());
  142. return m_tag_self_closing;
  143. }
  144. void set_self_closing(bool self_closing)
  145. {
  146. VERIFY(is_start_tag() || is_end_tag());
  147. m_tag_self_closing = self_closing;
  148. }
  149. bool has_acknowledged_self_closing_flag() const
  150. {
  151. VERIFY(is_self_closing());
  152. return m_tag_self_closing_acknowledged;
  153. }
  154. void acknowledge_self_closing_flag_if_set()
  155. {
  156. if (is_self_closing())
  157. m_tag_self_closing_acknowledged = true;
  158. }
  159. bool has_attributes() const
  160. {
  161. VERIFY(is_start_tag() || is_end_tag());
  162. auto* ptr = tag_attributes();
  163. return ptr && !ptr->is_empty();
  164. }
  165. size_t attribute_count() const
  166. {
  167. VERIFY(is_start_tag() || is_end_tag());
  168. if (auto* ptr = tag_attributes())
  169. return ptr->size();
  170. return 0;
  171. }
  172. void add_attribute(Attribute attribute)
  173. {
  174. VERIFY(is_start_tag() || is_end_tag());
  175. ensure_tag_attributes().append(move(attribute));
  176. }
  177. Attribute const& last_attribute() const
  178. {
  179. VERIFY(is_start_tag() || is_end_tag());
  180. VERIFY(has_attributes());
  181. return tag_attributes()->last();
  182. }
  183. Attribute& last_attribute()
  184. {
  185. VERIFY(is_start_tag() || is_end_tag());
  186. VERIFY(has_attributes());
  187. return tag_attributes()->last();
  188. }
  189. void drop_attributes()
  190. {
  191. VERIFY(is_start_tag() || is_end_tag());
  192. m_data.get<OwnPtr<Vector<Attribute>>>().clear();
  193. }
  194. void for_each_attribute(Function<IterationDecision(Attribute const&)> callback) const
  195. {
  196. VERIFY(is_start_tag() || is_end_tag());
  197. auto* ptr = tag_attributes();
  198. if (!ptr)
  199. return;
  200. for (auto& attribute : *ptr) {
  201. if (callback(attribute) == IterationDecision::Break)
  202. break;
  203. }
  204. }
  205. void for_each_attribute(Function<IterationDecision(Attribute&)> callback)
  206. {
  207. VERIFY(is_start_tag() || is_end_tag());
  208. auto* ptr = tag_attributes();
  209. if (!ptr)
  210. return;
  211. for (auto& attribute : *ptr) {
  212. if (callback(attribute) == IterationDecision::Break)
  213. break;
  214. }
  215. }
  216. StringView attribute(FlyString const& attribute_name) const
  217. {
  218. if (auto result = raw_attribute(attribute_name); result.has_value())
  219. return result->value;
  220. return {};
  221. }
  222. Optional<Attribute const&> raw_attribute(FlyString const& attribute_name) const
  223. {
  224. VERIFY(is_start_tag() || is_end_tag());
  225. auto* ptr = tag_attributes();
  226. if (!ptr)
  227. return {};
  228. for (auto const& attribute : *ptr) {
  229. if (attribute_name == attribute.local_name)
  230. return attribute;
  231. }
  232. return {};
  233. }
  234. bool has_attribute(FlyString const& attribute_name) const
  235. {
  236. return !attribute(attribute_name).is_null();
  237. }
  238. void adjust_tag_name(DeprecatedFlyString const& old_name, DeprecatedFlyString const& new_name)
  239. {
  240. VERIFY(is_start_tag() || is_end_tag());
  241. if (old_name == tag_name().to_deprecated_fly_string())
  242. set_tag_name(new_name);
  243. }
  244. void adjust_attribute_name(FlyString const& old_name, FlyString const& new_name)
  245. {
  246. VERIFY(is_start_tag() || is_end_tag());
  247. for_each_attribute([&](Attribute& attribute) {
  248. if (old_name == attribute.local_name)
  249. attribute.local_name = new_name;
  250. return IterationDecision::Continue;
  251. });
  252. }
  253. void adjust_foreign_attribute(FlyString const& old_name, DeprecatedFlyString const& prefix, FlyString const& local_name, DeprecatedFlyString const& namespace_)
  254. {
  255. VERIFY(is_start_tag() || is_end_tag());
  256. for_each_attribute([&](Attribute& attribute) {
  257. if (old_name == attribute.local_name) {
  258. attribute.prefix = prefix;
  259. attribute.local_name = local_name;
  260. attribute.namespace_ = namespace_;
  261. }
  262. return IterationDecision::Continue;
  263. });
  264. }
  265. DoctypeData const& doctype_data() const
  266. {
  267. VERIFY(is_doctype());
  268. auto* ptr = m_data.get<OwnPtr<DoctypeData>>().ptr();
  269. VERIFY(ptr);
  270. return *ptr;
  271. }
  272. DoctypeData& ensure_doctype_data()
  273. {
  274. VERIFY(is_doctype());
  275. auto& ptr = m_data.get<OwnPtr<DoctypeData>>();
  276. if (!ptr)
  277. ptr = make<DoctypeData>();
  278. return *ptr;
  279. }
  280. Type type() const { return m_type; }
  281. DeprecatedString to_deprecated_string() const;
  282. Position const& start_position() const { return m_start_position; }
  283. Position const& end_position() const { return m_end_position; }
  284. void set_start_position(Badge<HTMLTokenizer>, Position start_position) { m_start_position = start_position; }
  285. void set_end_position(Badge<HTMLTokenizer>, Position end_position) { m_end_position = end_position; }
  286. private:
  287. Vector<Attribute> const* tag_attributes() const
  288. {
  289. return m_data.get<OwnPtr<Vector<Attribute>>>().ptr();
  290. }
  291. Vector<Attribute>* tag_attributes()
  292. {
  293. return m_data.get<OwnPtr<Vector<Attribute>>>().ptr();
  294. }
  295. Vector<Attribute>& ensure_tag_attributes()
  296. {
  297. VERIFY(is_start_tag() || is_end_tag());
  298. auto& ptr = m_data.get<OwnPtr<Vector<Attribute>>>();
  299. if (!ptr)
  300. ptr = make<Vector<Attribute>>();
  301. return *ptr;
  302. }
  303. Type m_type { Type::Invalid };
  304. // Type::StartTag and Type::EndTag
  305. bool m_tag_self_closing { false };
  306. bool m_tag_self_closing_acknowledged { false };
  307. // Type::Comment (comment data), Type::StartTag and Type::EndTag (tag name)
  308. DeprecatedFlyString m_string_data;
  309. Variant<Empty, u32, OwnPtr<DoctypeData>, OwnPtr<Vector<Attribute>>> m_data {};
  310. Position m_start_position;
  311. Position m_end_position;
  312. };
  313. }