TestHTMLTokenizer.cpp 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
  1. /*
  2. * Copyright (c) 2021, Max Wipfli <max.wipfli@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <LibTest/TestCase.h>
  7. #include <LibCore/File.h>
  8. #include <LibWeb/HTML/Parser/HTMLTokenizer.h>
  9. using Tokenizer = Web::HTML::HTMLTokenizer;
  10. using Token = Web::HTML::HTMLToken;
  11. #define BEGIN_ENUMERATION(tokens) \
  12. auto current_token = (tokens).begin(); \
  13. [[maybe_unused]] Token* last_token;
  14. #define END_ENUMERATION() \
  15. EXPECT(current_token.is_end());
  16. #define NEXT_TOKEN() \
  17. last_token = &*current_token; \
  18. ++current_token;
  19. #define EXPECT_START_TAG_TOKEN(_tag_name) \
  20. EXPECT_EQ(current_token->type(), Token::Type::StartTag); \
  21. EXPECT_EQ(current_token->tag_name(), #_tag_name); \
  22. NEXT_TOKEN();
  23. #define EXPECT_END_TAG_TOKEN(_tag_name) \
  24. EXPECT_EQ(current_token->type(), Token::Type::EndTag); \
  25. EXPECT_EQ(current_token->tag_name(), #_tag_name); \
  26. NEXT_TOKEN();
  27. #define EXPECT_END_OF_FILE_TOKEN() \
  28. EXPECT_EQ(current_token->type(), Token::Type::EndOfFile); \
  29. NEXT_TOKEN();
  30. #define EXPECT_CHARACTER_TOKEN(character) \
  31. EXPECT_EQ(current_token->type(), Token::Type::Character); \
  32. EXPECT_EQ(current_token->code_point(), (u32)(character)); \
  33. NEXT_TOKEN();
  34. #define EXPECT_CHARACTER_TOKENS(string) \
  35. for (auto c : StringView(#string)) { \
  36. EXPECT_CHARACTER_TOKEN(c); \
  37. }
  38. #define EXPECT_COMMENT_TOKEN() \
  39. EXPECT_EQ(current_token->type(), Token::Type::Comment); \
  40. NEXT_TOKEN();
  41. #define EXPECT_DOCTYPE_TOKEN() \
  42. EXPECT_EQ(current_token->type(), Token::Type::DOCTYPE); \
  43. NEXT_TOKEN();
  44. #define EXPECT_TAG_TOKEN_ATTRIBUTE(name, value) \
  45. VERIFY(last_token); \
  46. EXPECT_EQ(last_token->attribute(#name), value);
  47. #define EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(count) \
  48. VERIFY(last_token); \
  49. EXPECT_EQ(last_token->attribute_count(), (size_t)(count));
  50. static Vector<Token> run_tokenizer(StringView input)
  51. {
  52. Vector<Token> tokens;
  53. Tokenizer tokenizer { input, "UTF-8"sv };
  54. while (true) {
  55. auto maybe_token = tokenizer.next_token();
  56. if (!maybe_token.has_value())
  57. break;
  58. tokens.append(maybe_token.release_value());
  59. }
  60. return tokens;
  61. }
  62. // FIXME: It's not very nice to rely on the format of HTMLToken::to_string() to stay the same.
  63. static u32 hash_tokens(Vector<Token> const& tokens)
  64. {
  65. StringBuilder builder;
  66. for (auto& token : tokens)
  67. builder.append(token.to_string());
  68. return (u32)builder.string_view().hash();
  69. }
  70. TEST_CASE(empty)
  71. {
  72. auto tokens = run_tokenizer("");
  73. BEGIN_ENUMERATION(tokens);
  74. EXPECT_END_OF_FILE_TOKEN();
  75. END_ENUMERATION();
  76. }
  77. TEST_CASE(basic)
  78. {
  79. auto tokens = run_tokenizer("<html><head></head><body></body></html>");
  80. BEGIN_ENUMERATION(tokens);
  81. EXPECT_START_TAG_TOKEN(html);
  82. EXPECT_START_TAG_TOKEN(head);
  83. EXPECT_END_TAG_TOKEN(head);
  84. EXPECT_START_TAG_TOKEN(body);
  85. EXPECT_END_TAG_TOKEN(body);
  86. EXPECT_END_TAG_TOKEN(html);
  87. EXPECT_END_OF_FILE_TOKEN();
  88. END_ENUMERATION();
  89. }
  90. TEST_CASE(basic_with_text)
  91. {
  92. auto tokens = run_tokenizer("<p>This is some text.</p>");
  93. BEGIN_ENUMERATION(tokens);
  94. EXPECT_START_TAG_TOKEN(p);
  95. EXPECT_CHARACTER_TOKENS(This is some text.);
  96. EXPECT_END_TAG_TOKEN(p);
  97. EXPECT_END_OF_FILE_TOKEN();
  98. END_ENUMERATION();
  99. }
  100. TEST_CASE(unquoted_attributes)
  101. {
  102. auto tokens = run_tokenizer("<p foo=bar>");
  103. BEGIN_ENUMERATION(tokens);
  104. EXPECT_START_TAG_TOKEN(p);
  105. EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1);
  106. EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
  107. EXPECT_END_OF_FILE_TOKEN();
  108. END_ENUMERATION();
  109. }
  110. TEST_CASE(single_quoted_attributes)
  111. {
  112. auto tokens = run_tokenizer("<p foo='bar'>");
  113. BEGIN_ENUMERATION(tokens);
  114. EXPECT_START_TAG_TOKEN(p);
  115. EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1);
  116. EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
  117. EXPECT_END_OF_FILE_TOKEN();
  118. END_ENUMERATION();
  119. }
  120. TEST_CASE(double_quoted_attributes)
  121. {
  122. auto tokens = run_tokenizer("<p foo=\"bar\">");
  123. BEGIN_ENUMERATION(tokens);
  124. EXPECT_START_TAG_TOKEN(p);
  125. EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1);
  126. EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
  127. EXPECT_END_OF_FILE_TOKEN();
  128. END_ENUMERATION();
  129. }
  130. TEST_CASE(multiple_attributes)
  131. {
  132. auto tokens = run_tokenizer("<p foo=\"bar\" baz=foobar foo2=\"bar2\">");
  133. BEGIN_ENUMERATION(tokens);
  134. EXPECT_START_TAG_TOKEN(p);
  135. EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(3);
  136. EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
  137. EXPECT_TAG_TOKEN_ATTRIBUTE(baz, "foobar");
  138. EXPECT_TAG_TOKEN_ATTRIBUTE(foo2, "bar2");
  139. EXPECT_END_OF_FILE_TOKEN();
  140. END_ENUMERATION();
  141. }
  142. TEST_CASE(character_reference_in_attribute)
  143. {
  144. auto tokens = run_tokenizer("<p foo=a&amp;b bar='a&#38;b' baz=\"a&#x26;b\">");
  145. BEGIN_ENUMERATION(tokens);
  146. EXPECT_START_TAG_TOKEN(p);
  147. EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(3);
  148. EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "a&b");
  149. EXPECT_TAG_TOKEN_ATTRIBUTE(bar, "a&b");
  150. EXPECT_TAG_TOKEN_ATTRIBUTE(baz, "a&b");
  151. EXPECT_END_OF_FILE_TOKEN();
  152. END_ENUMERATION();
  153. }
  154. TEST_CASE(comment)
  155. {
  156. auto tokens = run_tokenizer("<p><!-- This is a comment --></p>");
  157. BEGIN_ENUMERATION(tokens);
  158. EXPECT_START_TAG_TOKEN(p);
  159. EXPECT_COMMENT_TOKEN();
  160. EXPECT_END_TAG_TOKEN(p);
  161. EXPECT_END_OF_FILE_TOKEN();
  162. END_ENUMERATION();
  163. }
  164. TEST_CASE(doctype)
  165. {
  166. auto tokens = run_tokenizer("<!DOCTYPE html><html></html>");
  167. BEGIN_ENUMERATION(tokens);
  168. EXPECT_DOCTYPE_TOKEN();
  169. EXPECT_START_TAG_TOKEN(html);
  170. EXPECT_END_TAG_TOKEN(html);
  171. }
  172. // NOTE: This relies on the format of HTMLToken::to_string() staying the same.
  173. // If that changes, or something is added to the test HTML, the hash needs to be adjusted.
  174. TEST_CASE(regression)
  175. {
  176. auto file = Core::File::open("/usr/Tests/LibWeb/tokenizer-test.html", Core::OpenMode::ReadOnly);
  177. VERIFY(!file.is_error());
  178. auto file_contents = file.value()->read_all();
  179. auto tokens = run_tokenizer(file_contents);
  180. u32 hash = hash_tokens(tokens);
  181. EXPECT_EQ(hash, 710375345u);
  182. }