TestHTMLTokenizer.cpp 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. /*
  2. * Copyright (c) 2021, Max Wipfli <max.wipfli@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <LibTest/TestCase.h>
  7. #include <LibCore/File.h>
  8. #include <LibCore/Stream.h>
  9. #include <LibWeb/HTML/Parser/HTMLTokenizer.h>
  10. using Tokenizer = Web::HTML::HTMLTokenizer;
  11. using Token = Web::HTML::HTMLToken;
  12. #define BEGIN_ENUMERATION(tokens) \
  13. auto current_token = (tokens).begin(); \
  14. [[maybe_unused]] Token* last_token;
  15. #define END_ENUMERATION() \
  16. EXPECT(current_token.is_end());
  17. #define NEXT_TOKEN() \
  18. last_token = &*current_token; \
  19. ++current_token;
  20. #define EXPECT_START_TAG_TOKEN(_tag_name) \
  21. EXPECT_EQ(current_token->type(), Token::Type::StartTag); \
  22. EXPECT_EQ(current_token->tag_name(), #_tag_name); \
  23. NEXT_TOKEN();
  24. #define EXPECT_END_TAG_TOKEN(_tag_name) \
  25. EXPECT_EQ(current_token->type(), Token::Type::EndTag); \
  26. EXPECT_EQ(current_token->tag_name(), #_tag_name); \
  27. NEXT_TOKEN();
  28. #define EXPECT_END_OF_FILE_TOKEN() \
  29. EXPECT_EQ(current_token->type(), Token::Type::EndOfFile); \
  30. NEXT_TOKEN();
  31. #define EXPECT_CHARACTER_TOKEN(character) \
  32. EXPECT_EQ(current_token->type(), Token::Type::Character); \
  33. EXPECT_EQ(current_token->code_point(), (u32)(character)); \
  34. NEXT_TOKEN();
  35. #define EXPECT_CHARACTER_TOKENS(string) \
  36. for (auto c : #string##sv) { \
  37. EXPECT_CHARACTER_TOKEN(c); \
  38. }
  39. #define EXPECT_COMMENT_TOKEN() \
  40. EXPECT_EQ(current_token->type(), Token::Type::Comment); \
  41. NEXT_TOKEN();
  42. #define EXPECT_DOCTYPE_TOKEN() \
  43. EXPECT_EQ(current_token->type(), Token::Type::DOCTYPE); \
  44. NEXT_TOKEN();
  45. #define EXPECT_TAG_TOKEN_ATTRIBUTE(name, value) \
  46. VERIFY(last_token); \
  47. EXPECT_EQ(last_token->attribute(#name), value);
  48. #define EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(count) \
  49. VERIFY(last_token); \
  50. EXPECT_EQ(last_token->attribute_count(), (size_t)(count));
  51. static Vector<Token> run_tokenizer(StringView input)
  52. {
  53. Vector<Token> tokens;
  54. Tokenizer tokenizer { input, "UTF-8"sv };
  55. while (true) {
  56. auto maybe_token = tokenizer.next_token();
  57. if (!maybe_token.has_value())
  58. break;
  59. tokens.append(maybe_token.release_value());
  60. }
  61. return tokens;
  62. }
  63. // FIXME: It's not very nice to rely on the format of HTMLToken::to_string() to stay the same.
  64. static u32 hash_tokens(Vector<Token> const& tokens)
  65. {
  66. StringBuilder builder;
  67. for (auto& token : tokens)
  68. builder.append(token.to_deprecated_string());
  69. return (u32)builder.string_view().hash();
  70. }
  71. TEST_CASE(empty)
  72. {
  73. auto tokens = run_tokenizer(""sv);
  74. BEGIN_ENUMERATION(tokens);
  75. EXPECT_END_OF_FILE_TOKEN();
  76. END_ENUMERATION();
  77. }
  78. TEST_CASE(basic)
  79. {
  80. auto tokens = run_tokenizer("<html><head></head><body></body></html>"sv);
  81. BEGIN_ENUMERATION(tokens);
  82. EXPECT_START_TAG_TOKEN(html);
  83. EXPECT_START_TAG_TOKEN(head);
  84. EXPECT_END_TAG_TOKEN(head);
  85. EXPECT_START_TAG_TOKEN(body);
  86. EXPECT_END_TAG_TOKEN(body);
  87. EXPECT_END_TAG_TOKEN(html);
  88. EXPECT_END_OF_FILE_TOKEN();
  89. END_ENUMERATION();
  90. }
  91. TEST_CASE(basic_with_text)
  92. {
  93. auto tokens = run_tokenizer("<p>This is some text.</p>"sv);
  94. BEGIN_ENUMERATION(tokens);
  95. EXPECT_START_TAG_TOKEN(p);
  96. EXPECT_CHARACTER_TOKENS(This is some text.);
  97. EXPECT_END_TAG_TOKEN(p);
  98. EXPECT_END_OF_FILE_TOKEN();
  99. END_ENUMERATION();
  100. }
  101. TEST_CASE(unquoted_attributes)
  102. {
  103. auto tokens = run_tokenizer("<p foo=bar>"sv);
  104. BEGIN_ENUMERATION(tokens);
  105. EXPECT_START_TAG_TOKEN(p);
  106. EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1);
  107. EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
  108. EXPECT_END_OF_FILE_TOKEN();
  109. END_ENUMERATION();
  110. }
  111. TEST_CASE(single_quoted_attributes)
  112. {
  113. auto tokens = run_tokenizer("<p foo='bar'>"sv);
  114. BEGIN_ENUMERATION(tokens);
  115. EXPECT_START_TAG_TOKEN(p);
  116. EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1);
  117. EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
  118. EXPECT_END_OF_FILE_TOKEN();
  119. END_ENUMERATION();
  120. }
  121. TEST_CASE(double_quoted_attributes)
  122. {
  123. auto tokens = run_tokenizer("<p foo=\"bar\">"sv);
  124. BEGIN_ENUMERATION(tokens);
  125. EXPECT_START_TAG_TOKEN(p);
  126. EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1);
  127. EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
  128. EXPECT_END_OF_FILE_TOKEN();
  129. END_ENUMERATION();
  130. }
  131. TEST_CASE(multiple_attributes)
  132. {
  133. auto tokens = run_tokenizer("<p foo=\"bar\" baz=foobar foo2=\"bar2\">"sv);
  134. BEGIN_ENUMERATION(tokens);
  135. EXPECT_START_TAG_TOKEN(p);
  136. EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(3);
  137. EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
  138. EXPECT_TAG_TOKEN_ATTRIBUTE(baz, "foobar");
  139. EXPECT_TAG_TOKEN_ATTRIBUTE(foo2, "bar2");
  140. EXPECT_END_OF_FILE_TOKEN();
  141. END_ENUMERATION();
  142. }
  143. TEST_CASE(character_reference_in_attribute)
  144. {
  145. auto tokens = run_tokenizer("<p foo=a&amp;b bar='a&#38;b' baz=\"a&#x26;b\">"sv);
  146. BEGIN_ENUMERATION(tokens);
  147. EXPECT_START_TAG_TOKEN(p);
  148. EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(3);
  149. EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "a&b");
  150. EXPECT_TAG_TOKEN_ATTRIBUTE(bar, "a&b");
  151. EXPECT_TAG_TOKEN_ATTRIBUTE(baz, "a&b");
  152. EXPECT_END_OF_FILE_TOKEN();
  153. END_ENUMERATION();
  154. }
  155. TEST_CASE(comment)
  156. {
  157. auto tokens = run_tokenizer("<p><!-- This is a comment --></p>"sv);
  158. BEGIN_ENUMERATION(tokens);
  159. EXPECT_START_TAG_TOKEN(p);
  160. EXPECT_COMMENT_TOKEN();
  161. EXPECT_END_TAG_TOKEN(p);
  162. EXPECT_END_OF_FILE_TOKEN();
  163. END_ENUMERATION();
  164. }
  165. TEST_CASE(doctype)
  166. {
  167. auto tokens = run_tokenizer("<!DOCTYPE html><html></html>"sv);
  168. BEGIN_ENUMERATION(tokens);
  169. EXPECT_DOCTYPE_TOKEN();
  170. EXPECT_START_TAG_TOKEN(html);
  171. EXPECT_END_TAG_TOKEN(html);
  172. }
  173. // NOTE: This relies on the format of HTMLToken::to_string() staying the same.
  174. // If that changes, or something is added to the test HTML, the hash needs to be adjusted.
  175. TEST_CASE(regression)
  176. {
  177. // This makes sure that the tests will run both on target and in Lagom.
  178. #ifdef AK_OS_SERENITY
  179. StringView path = "/usr/Tests/LibWeb/tokenizer-test.html"sv;
  180. #else
  181. StringView path = "tokenizer-test.html"sv;
  182. #endif
  183. auto file = MUST(Core::File::open(path, Core::File::OpenMode::Read));
  184. auto file_size = MUST(file->size());
  185. auto content = MUST(ByteBuffer::create_uninitialized(file_size));
  186. MUST(file->read(content.bytes()));
  187. DeprecatedString file_contents { content.bytes() };
  188. auto tokens = run_tokenizer(file_contents);
  189. u32 hash = hash_tokens(tokens);
  190. EXPECT_EQ(hash, 710375345u);
  191. }