TestHTMLTokenizer.cpp 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223
  1. /*
  2. * Copyright (c) 2021, Max Wipfli <max.wipfli@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <LibTest/TestCase.h>
  7. #include <LibCore/File.h>
  8. #include <LibWeb/HTML/Parser/HTMLTokenizer.h>
  9. using Tokenizer = Web::HTML::HTMLTokenizer;
  10. using Token = Web::HTML::HTMLToken;
  // Token-stream enumeration scaffolding used by the EXPECT_*_TOKEN macros.
  //
  // BEGIN_ENUMERATION declares two locals in the calling scope:
  //   current_token - iterator over `tokens`, advanced by NEXT_TOKEN().
  //   last_token    - pointer to the most recently consumed token. It starts out null so that
  //                   the VERIFY(last_token) checks in the attribute macros never read an
  //                   indeterminate pointer when no token has been consumed yet.
  #define BEGIN_ENUMERATION(tokens)          \
      auto current_token = (tokens).begin(); \
      [[maybe_unused]] Token* last_token = nullptr;

  // Expects that the iterator has reached the end of the token list.
  #define END_ENUMERATION() \
      EXPECT(current_token.is_end());

  // Remembers the current token in last_token and advances the iterator.
  // Wrapped in do/while so the macro expands to exactly one statement.
  #define NEXT_TOKEN()                  \
      do {                              \
          last_token = &*current_token; \
          ++current_token;              \
      } while (false)
  // Shared body of EXPECT_START_TAG_TOKEN / EXPECT_END_TAG_TOKEN.
  // Checks the current token's type, tag name and start/end columns, then consumes it.
  // Wrapped in do/while so the expansion is a single statement.
  #define EXPECT_TAG_TOKEN_OF_TYPE(token_type, expected_name, start_column, end_column) \
      do {                                                                              \
          EXPECT_EQ(current_token->type(), Token::Type::token_type);                    \
          EXPECT_EQ(current_token->tag_name(), expected_name);                          \
          EXPECT_EQ(current_token->start_position().column, start_column);              \
          EXPECT_EQ(current_token->end_position().column, end_column);                  \
          NEXT_TOKEN();                                                                 \
      } while (false)

  // Expects a start tag named `_tag_name` (given as a bare identifier, it is stringified here)
  // whose start_position()/end_position() columns equal start_column/end_column.
  #define EXPECT_START_TAG_TOKEN(_tag_name, start_column, end_column) \
      EXPECT_TAG_TOKEN_OF_TYPE(StartTag, #_tag_name, start_column, end_column)

  // Same as EXPECT_START_TAG_TOKEN, but for an end tag.
  #define EXPECT_END_TAG_TOKEN(_tag_name, start_column, end_column) \
      EXPECT_TAG_TOKEN_OF_TYPE(EndTag, #_tag_name, start_column, end_column)
  // Expects the current token to be of type EndOfFile, then consumes it.
  // Wrapped in do/while so the expansion is a single statement.
  #define EXPECT_END_OF_FILE_TOKEN()                                \
      do {                                                          \
          EXPECT_EQ(current_token->type(), Token::Type::EndOfFile); \
          NEXT_TOKEN();                                             \
      } while (false)
  // Expects the current token to be a Character token whose code_point() equals `character`
  // (converted to u32), then consumes it.
  // Wrapped in do/while so the expansion is a single statement.
  #define EXPECT_CHARACTER_TOKEN(character)                                    \
      do {                                                                     \
          EXPECT_EQ(current_token->type(), Token::Type::Character);            \
          EXPECT_EQ(current_token->code_point(), static_cast<u32>(character)); \
          NEXT_TOKEN();                                                        \
      } while (false)

  // Expects one Character token per element of the stringified argument.
  // `string` is passed as bare text (no quotes). The preprocessor stringifies it, which
  // collapses every run of whitespace between tokens into a single space.
  // NOTE(review): elements are compared one by one against code_point(), so this presumably
  // only works for ASCII text - confirm before using it with multi-byte input.
  #define EXPECT_CHARACTER_TOKENS(string) \
      do {                                \
          for (auto c : #string##sv) {    \
              EXPECT_CHARACTER_TOKEN(c);  \
          }                               \
      } while (false)
  // Expects the current token to be of type Comment, then consumes it.
  // Only the type is checked, not the comment's contents.
  // Wrapped in do/while so the expansion is a single statement.
  #define EXPECT_COMMENT_TOKEN()                                  \
      do {                                                        \
          EXPECT_EQ(current_token->type(), Token::Type::Comment); \
          NEXT_TOKEN();                                           \
      } while (false)

  // Expects the current token to be of type DOCTYPE, then consumes it.
  // Only the type is checked, not the doctype's name or identifiers.
  #define EXPECT_DOCTYPE_TOKEN()                                  \
      do {                                                        \
          EXPECT_EQ(current_token->type(), Token::Type::DOCTYPE); \
          NEXT_TOKEN();                                           \
      } while (false)
  // Expects that the most recently consumed token (last_token) has an attribute `name`
  // (bare identifier, stringified here) whose value equals `value`.
  // VERIFY guards against use before any token has been consumed.
  // Wrapped in do/while so the expansion is a single statement.
  #define EXPECT_TAG_TOKEN_ATTRIBUTE(name, value)         \
      do {                                                \
          VERIFY(last_token);                             \
          EXPECT_EQ(last_token->attribute(#name), value); \
      } while (false)

  // Expects that the most recently consumed token (last_token) has exactly `count` attributes.
  #define EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(count)                               \
      do {                                                                      \
          VERIFY(last_token);                                                   \
          EXPECT_EQ(last_token->attribute_count(), static_cast<size_t>(count)); \
      } while (false)
  54. static Vector<Token> run_tokenizer(StringView input)
  55. {
  56. Vector<Token> tokens;
  57. Tokenizer tokenizer { input, "UTF-8"sv };
  58. while (true) {
  59. auto maybe_token = tokenizer.next_token();
  60. if (!maybe_token.has_value())
  61. break;
  62. tokens.append(maybe_token.release_value());
  63. }
  64. return tokens;
  65. }
  66. // FIXME: It's not very nice to rely on the format of HTMLToken::to_string() to stay the same.
  67. static u32 hash_tokens(Vector<Token> const& tokens)
  68. {
  69. StringBuilder builder;
  70. for (auto& token : tokens)
  71. builder.append(token.to_deprecated_string());
  72. return (u32)builder.string_view().hash();
  73. }
  74. TEST_CASE(empty)
  75. {
  76. auto tokens = run_tokenizer(""sv);
  77. BEGIN_ENUMERATION(tokens);
  78. EXPECT_END_OF_FILE_TOKEN();
  79. END_ENUMERATION();
  80. }
  81. TEST_CASE(basic)
  82. {
  83. auto tokens = run_tokenizer("<html><head></head><body></body></html>"sv);
  84. BEGIN_ENUMERATION(tokens);
  85. EXPECT_START_TAG_TOKEN(html, 1u, 5u);
  86. EXPECT_START_TAG_TOKEN(head, 7u, 11u);
  87. EXPECT_END_TAG_TOKEN(head, 14u, 18u);
  88. EXPECT_START_TAG_TOKEN(body, 20u, 24u);
  89. EXPECT_END_TAG_TOKEN(body, 27u, 31u);
  90. EXPECT_END_TAG_TOKEN(html, 34u, 38u);
  91. EXPECT_END_OF_FILE_TOKEN();
  92. END_ENUMERATION();
  93. }
  94. TEST_CASE(basic_with_text)
  95. {
  96. auto tokens = run_tokenizer("<p>This is some text.</p>"sv);
  97. BEGIN_ENUMERATION(tokens);
  98. EXPECT_START_TAG_TOKEN(p, 1u, 2u);
  99. EXPECT_CHARACTER_TOKENS(This is some text.);
  100. EXPECT_END_TAG_TOKEN(p, 23u, 24u);
  101. EXPECT_END_OF_FILE_TOKEN();
  102. END_ENUMERATION();
  103. }
  104. TEST_CASE(unquoted_attributes)
  105. {
  106. auto tokens = run_tokenizer("<p foo=bar>"sv);
  107. BEGIN_ENUMERATION(tokens);
  108. EXPECT_START_TAG_TOKEN(p, 1u, 10u);
  109. EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1);
  110. EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
  111. EXPECT_END_OF_FILE_TOKEN();
  112. END_ENUMERATION();
  113. }
  114. TEST_CASE(single_quoted_attributes)
  115. {
  116. auto tokens = run_tokenizer("<p foo='bar'>"sv);
  117. BEGIN_ENUMERATION(tokens);
  118. EXPECT_START_TAG_TOKEN(p, 1u, 12u);
  119. EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1);
  120. EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
  121. EXPECT_END_OF_FILE_TOKEN();
  122. END_ENUMERATION();
  123. }
  124. TEST_CASE(double_quoted_attributes)
  125. {
  126. auto tokens = run_tokenizer("<p foo=\"bar\">"sv);
  127. BEGIN_ENUMERATION(tokens);
  128. EXPECT_START_TAG_TOKEN(p, 1u, 12u);
  129. EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1);
  130. EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
  131. EXPECT_END_OF_FILE_TOKEN();
  132. END_ENUMERATION();
  133. }
  134. TEST_CASE(multiple_attributes)
  135. {
  136. auto tokens = run_tokenizer("<p foo=\"bar\" baz=foobar foo2=\"bar2\">"sv);
  137. BEGIN_ENUMERATION(tokens);
  138. EXPECT_START_TAG_TOKEN(p, 1u, 35u);
  139. EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(3);
  140. EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
  141. EXPECT_TAG_TOKEN_ATTRIBUTE(baz, "foobar");
  142. EXPECT_TAG_TOKEN_ATTRIBUTE(foo2, "bar2");
  143. EXPECT_END_OF_FILE_TOKEN();
  144. END_ENUMERATION();
  145. }
  146. TEST_CASE(character_reference_in_attribute)
  147. {
  148. auto tokens = run_tokenizer("<p foo=a&amp;b bar='a&#38;b' baz=\"a&#x26;b\">"sv);
  149. BEGIN_ENUMERATION(tokens);
  150. EXPECT_START_TAG_TOKEN(p, 1u, 43u);
  151. EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(3);
  152. EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "a&b");
  153. EXPECT_TAG_TOKEN_ATTRIBUTE(bar, "a&b");
  154. EXPECT_TAG_TOKEN_ATTRIBUTE(baz, "a&b");
  155. EXPECT_END_OF_FILE_TOKEN();
  156. END_ENUMERATION();
  157. }
  158. TEST_CASE(comment)
  159. {
  160. auto tokens = run_tokenizer("<p><!-- This is a comment --></p>"sv);
  161. BEGIN_ENUMERATION(tokens);
  162. EXPECT_START_TAG_TOKEN(p, 1u, 2u);
  163. EXPECT_COMMENT_TOKEN();
  164. EXPECT_END_TAG_TOKEN(p, 31u, 32u);
  165. EXPECT_END_OF_FILE_TOKEN();
  166. END_ENUMERATION();
  167. }
  168. TEST_CASE(doctype)
  169. {
  170. auto tokens = run_tokenizer("<!DOCTYPE html><html></html>"sv);
  171. BEGIN_ENUMERATION(tokens);
  172. EXPECT_DOCTYPE_TOKEN();
  173. EXPECT_START_TAG_TOKEN(html, 16u, 20u);
  174. EXPECT_END_TAG_TOKEN(html, 23u, 27u);
  175. }
  176. // NOTE: This relies on the format of HTMLToken::to_string() staying the same.
  177. // If that changes, or something is added to the test HTML, the hash needs to be adjusted.
  178. TEST_CASE(regression)
  179. {
  180. // This makes sure that the tests will run both on target and in Lagom.
  181. #ifdef AK_OS_SERENITY
  182. StringView path = "/usr/Tests/LibWeb/tokenizer-test.html"sv;
  183. #else
  184. StringView path = "tokenizer-test.html"sv;
  185. #endif
  186. auto file = MUST(Core::File::open(path, Core::File::OpenMode::Read));
  187. auto file_size = MUST(file->size());
  188. auto content = MUST(ByteBuffer::create_uninitialized(file_size));
  189. MUST(file->read_until_filled(content.bytes()));
  190. DeprecatedString file_contents { content.bytes() };
  191. auto tokens = run_tokenizer(file_contents);
  192. u32 hash = hash_tokens(tokens);
  193. EXPECT_EQ(hash, 3657343287u);
  194. }