HTMLTokenizer.h 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
  1. /*
  2. * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
  3. * Copyright (c) 2022, Linus Groh <linusg@serenityos.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #pragma once
  8. #include <AK/Queue.h>
  9. #include <AK/StringBuilder.h>
  10. #include <AK/StringView.h>
  11. #include <AK/Types.h>
  12. #include <AK/Utf8View.h>
  13. #include <LibJS/Heap/GCPtr.h>
  14. #include <LibWeb/Forward.h>
  15. #include <LibWeb/HTML/Parser/HTMLToken.h>
  16. namespace Web::HTML {
  // X-macro listing every tokenizer state exactly once, in declaration order.
  //
  // Usage: define __ENUMERATE_TOKENIZER_STATE(state) to whatever should be
  // generated per state, expand ENUMERATE_TOKENIZER_STATES, then #undef the
  // helper again. The order of the entries determines the numeric value of the
  // enumerators generated from this list, so new states should be added with
  // care. The state names appear to follow the tokenizer states of the HTML
  // Standard (NOTE(review): confirm against the spec when adding entries).
  #define ENUMERATE_TOKENIZER_STATES \
      __ENUMERATE_TOKENIZER_STATE(Data) \
      __ENUMERATE_TOKENIZER_STATE(RCDATA) \
      __ENUMERATE_TOKENIZER_STATE(RAWTEXT) \
      __ENUMERATE_TOKENIZER_STATE(ScriptData) \
      __ENUMERATE_TOKENIZER_STATE(PLAINTEXT) \
      __ENUMERATE_TOKENIZER_STATE(TagOpen) \
      __ENUMERATE_TOKENIZER_STATE(EndTagOpen) \
      __ENUMERATE_TOKENIZER_STATE(TagName) \
      __ENUMERATE_TOKENIZER_STATE(RCDATALessThanSign) \
      __ENUMERATE_TOKENIZER_STATE(RCDATAEndTagOpen) \
      __ENUMERATE_TOKENIZER_STATE(RCDATAEndTagName) \
      __ENUMERATE_TOKENIZER_STATE(RAWTEXTLessThanSign) \
      __ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagOpen) \
      __ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagName) \
      __ENUMERATE_TOKENIZER_STATE(ScriptDataLessThanSign) \
      __ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagOpen) \
      __ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagName) \
      __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStart) \
      __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStartDash) \
      __ENUMERATE_TOKENIZER_STATE(ScriptDataEscaped) \
      __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDash) \
      __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDashDash) \
      __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedLessThanSign) \
      __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagOpen) \
      __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagName) \
      __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeStart) \
      __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscaped) \
      __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDash) \
      __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDashDash) \
      __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedLessThanSign) \
      __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeEnd) \
      __ENUMERATE_TOKENIZER_STATE(BeforeAttributeName) \
      __ENUMERATE_TOKENIZER_STATE(AttributeName) \
      __ENUMERATE_TOKENIZER_STATE(AfterAttributeName) \
      __ENUMERATE_TOKENIZER_STATE(BeforeAttributeValue) \
      __ENUMERATE_TOKENIZER_STATE(AttributeValueDoubleQuoted) \
      __ENUMERATE_TOKENIZER_STATE(AttributeValueSingleQuoted) \
      __ENUMERATE_TOKENIZER_STATE(AttributeValueUnquoted) \
      __ENUMERATE_TOKENIZER_STATE(AfterAttributeValueQuoted) \
      __ENUMERATE_TOKENIZER_STATE(SelfClosingStartTag) \
      __ENUMERATE_TOKENIZER_STATE(BogusComment) \
      __ENUMERATE_TOKENIZER_STATE(MarkupDeclarationOpen) \
      __ENUMERATE_TOKENIZER_STATE(CommentStart) \
      __ENUMERATE_TOKENIZER_STATE(CommentStartDash) \
      __ENUMERATE_TOKENIZER_STATE(Comment) \
      __ENUMERATE_TOKENIZER_STATE(CommentLessThanSign) \
      __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBang) \
      __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDash) \
      __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDashDash) \
      __ENUMERATE_TOKENIZER_STATE(CommentEndDash) \
      __ENUMERATE_TOKENIZER_STATE(CommentEnd) \
      __ENUMERATE_TOKENIZER_STATE(CommentEndBang) \
      __ENUMERATE_TOKENIZER_STATE(DOCTYPE) \
      __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEName) \
      __ENUMERATE_TOKENIZER_STATE(DOCTYPEName) \
      __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEName) \
      __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicKeyword) \
      __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEPublicIdentifier) \
      __ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierDoubleQuoted) \
      __ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierSingleQuoted) \
      __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicIdentifier) \
      __ENUMERATE_TOKENIZER_STATE(BetweenDOCTYPEPublicAndSystemIdentifiers) \
      __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemKeyword) \
      __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPESystemIdentifier) \
      __ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierDoubleQuoted) \
      __ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierSingleQuoted) \
      __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemIdentifier) \
      __ENUMERATE_TOKENIZER_STATE(BogusDOCTYPE) \
      __ENUMERATE_TOKENIZER_STATE(CDATASection) \
      __ENUMERATE_TOKENIZER_STATE(CDATASectionBracket) \
      __ENUMERATE_TOKENIZER_STATE(CDATASectionEnd) \
      __ENUMERATE_TOKENIZER_STATE(CharacterReference) \
      __ENUMERATE_TOKENIZER_STATE(NamedCharacterReference) \
      __ENUMERATE_TOKENIZER_STATE(AmbiguousAmpersand) \
      __ENUMERATE_TOKENIZER_STATE(NumericCharacterReference) \
      __ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReferenceStart) \
      __ENUMERATE_TOKENIZER_STATE(DecimalCharacterReferenceStart) \
      __ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReference) \
      __ENUMERATE_TOKENIZER_STATE(DecimalCharacterReference) \
      __ENUMERATE_TOKENIZER_STATE(NumericCharacterReferenceEnd)
  98. class HTMLTokenizer {
  99. public:
  100. explicit HTMLTokenizer();
  101. explicit HTMLTokenizer(StringView input, DeprecatedString const& encoding);
  102. enum class State {
  103. #define __ENUMERATE_TOKENIZER_STATE(state) state,
  104. ENUMERATE_TOKENIZER_STATES
  105. #undef __ENUMERATE_TOKENIZER_STATE
  106. };
  107. Optional<HTMLToken> next_token();
  108. void set_parser(Badge<HTMLParser>, HTMLParser& parser) { m_parser = &parser; }
  109. void switch_to(Badge<HTMLParser>, State new_state);
  110. void switch_to(State new_state)
  111. {
  112. m_state = new_state;
  113. }
  114. void set_blocked(bool b) { m_blocked = b; }
  115. bool is_blocked() const { return m_blocked; }
  116. DeprecatedString source() const { return m_decoded_input; }
  117. void insert_input_at_insertion_point(StringView input);
  118. void insert_eof();
  119. bool is_eof_inserted();
  120. bool is_insertion_point_defined() const { return m_insertion_point.defined; }
  121. bool is_insertion_point_reached()
  122. {
  123. return m_insertion_point.defined && m_utf8_view.iterator_offset(m_utf8_iterator) >= m_insertion_point.position;
  124. }
  125. void undefine_insertion_point() { m_insertion_point.defined = false; }
  126. void store_insertion_point() { m_old_insertion_point = m_insertion_point; }
  127. void restore_insertion_point() { m_insertion_point = m_old_insertion_point; }
  128. void update_insertion_point()
  129. {
  130. m_insertion_point.defined = true;
  131. m_insertion_point.position = m_utf8_view.iterator_offset(m_utf8_iterator);
  132. }
  133. // This permanently cuts off the tokenizer input stream.
  134. void abort() { m_aborted = true; }
  135. private:
  136. void skip(size_t count);
  137. Optional<u32> next_code_point();
  138. Optional<u32> peek_code_point(size_t offset) const;
  139. bool consume_next_if_match(StringView, CaseSensitivity = CaseSensitivity::CaseSensitive);
  140. void create_new_token(HTMLToken::Type);
  141. bool current_end_tag_token_is_appropriate() const;
  142. DeprecatedString consume_current_builder();
  143. static char const* state_name(State state)
  144. {
  145. switch (state) {
  146. #define __ENUMERATE_TOKENIZER_STATE(state) \
  147. case State::state: \
  148. return #state;
  149. ENUMERATE_TOKENIZER_STATES
  150. #undef __ENUMERATE_TOKENIZER_STATE
  151. };
  152. VERIFY_NOT_REACHED();
  153. }
  154. void will_emit(HTMLToken&);
  155. void will_switch_to(State);
  156. void will_reconsume_in(State);
  157. bool consumed_as_part_of_an_attribute() const;
  158. void restore_to(Utf8CodePointIterator const& new_iterator);
  159. HTMLToken::Position nth_last_position(size_t n = 0);
  160. JS::GCPtr<HTMLParser> m_parser;
  161. State m_state { State::Data };
  162. State m_return_state { State::Data };
  163. Vector<u32> m_temporary_buffer;
  164. DeprecatedString m_decoded_input;
  165. struct InsertionPoint {
  166. size_t position { 0 };
  167. bool defined { false };
  168. };
  169. InsertionPoint m_insertion_point {};
  170. InsertionPoint m_old_insertion_point {};
  171. Utf8View m_utf8_view;
  172. Utf8CodePointIterator m_utf8_iterator;
  173. Utf8CodePointIterator m_prev_utf8_iterator;
  174. HTMLToken m_current_token;
  175. StringBuilder m_current_builder;
  176. Optional<DeprecatedString> m_last_emitted_start_tag_name;
  177. bool m_explicit_eof_inserted { false };
  178. bool m_has_emitted_eof { false };
  179. Queue<HTMLToken> m_queued_tokens;
  180. u32 m_character_reference_code { 0 };
  181. bool m_blocked { false };
  182. bool m_aborted { false };
  183. Vector<HTMLToken::Position> m_source_positions;
  184. };
  185. }