/*
 * Copyright (c) 2021, Max Wipfli <max.wipfli@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <LibTest/TestCase.h>

#include <LibCore/File.h>
#include <LibWeb/HTML/Parser/HTMLTokenizer.h>

using Tokenizer = Web::HTML::HTMLTokenizer;
using Token = Web::HTML::HTMLToken;
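
// The helpers below walk the token list produced by run_tokenizer(): BEGIN_ENUMERATION()
// sets up the iterator and a pointer to the most recently consumed token, each EXPECT_*_TOKEN()
// macro checks the current token and then advances via NEXT_TOKEN(), and END_ENUMERATION()
// asserts that every token has been consumed.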

#define BEGIN_ENUMERATION(tokens)          \
    auto current_token = (tokens).begin(); \
    [[maybe_unused]] Token* last_token;

#define END_ENUMERATION() \
    EXPECT(current_token.is_end());

#define NEXT_TOKEN()              \
    last_token = &*current_token; \
    ++current_token;

#define EXPECT_START_TAG_TOKEN(_tag_name)                    \
    EXPECT_EQ(current_token->type(), Token::Type::StartTag); \
    EXPECT_EQ(current_token->tag_name(), #_tag_name);        \
    NEXT_TOKEN();

#define EXPECT_END_TAG_TOKEN(_tag_name)                    \
    EXPECT_EQ(current_token->type(), Token::Type::EndTag); \
    EXPECT_EQ(current_token->tag_name(), #_tag_name);      \
    NEXT_TOKEN();

#define EXPECT_END_OF_FILE_TOKEN()                            \
    EXPECT_EQ(current_token->type(), Token::Type::EndOfFile); \
    NEXT_TOKEN();

#define EXPECT_CHARACTER_TOKEN(character)                     \
    EXPECT_EQ(current_token->type(), Token::Type::Character); \
    EXPECT_EQ(current_token->code_point(), (u32)(character)); \
    NEXT_TOKEN();
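
// EXPECT_CHARACTER_TOKENS() stringizes its argument (so plain, unquoted text can be passed in)
// and expects one Character token per code point of that text.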

#define EXPECT_CHARACTER_TOKENS(string) \
    for (auto c : #string##sv) {        \
        EXPECT_CHARACTER_TOKEN(c);      \
    }

#define EXPECT_COMMENT_TOKEN()                              \
    EXPECT_EQ(current_token->type(), Token::Type::Comment); \
    NEXT_TOKEN();

#define EXPECT_DOCTYPE_TOKEN()                              \
    EXPECT_EQ(current_token->type(), Token::Type::DOCTYPE); \
    NEXT_TOKEN();
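
// The attribute checks operate on last_token, i.e. the tag token consumed by the preceding
// EXPECT_START_TAG_TOKEN()/EXPECT_END_TAG_TOKEN(), so they must come directly after one.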

#define EXPECT_TAG_TOKEN_ATTRIBUTE(name, value) \
    VERIFY(last_token);                         \
    EXPECT_EQ(last_token->attribute(#name), value);

#define EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(count) \
    VERIFY(last_token);                         \
    EXPECT_EQ(last_token->attribute_count(), (size_t)(count));
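
// Runs the tokenizer to completion on the given input and collects every token it produces,
// including the trailing EndOfFile token.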

static Vector<Token> run_tokenizer(StringView input)
{
    Vector<Token> tokens;
    Tokenizer tokenizer { input, "UTF-8"sv };
    while (true) {
        auto maybe_token = tokenizer.next_token();
        if (!maybe_token.has_value())
            break;
        tokens.append(maybe_token.release_value());
    }
    return tokens;
}
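
// Folds every token's serialized form into a single string and hashes it, so the regression
// test below can compare one u32 against a known-good value.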

// FIXME: It's not very nice to rely on the format of HTMLToken::to_deprecated_string() to stay the same.
static u32 hash_tokens(Vector<Token> const& tokens)
{
    StringBuilder builder;
    for (auto& token : tokens)
        builder.append(token.to_deprecated_string());
    return (u32)builder.string_view().hash();
}

TEST_CASE(empty)
{
    auto tokens = run_tokenizer(""sv);
    BEGIN_ENUMERATION(tokens);
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}

TEST_CASE(basic)
{
    auto tokens = run_tokenizer("<html><head></head><body></body></html>"sv);
    BEGIN_ENUMERATION(tokens);
    EXPECT_START_TAG_TOKEN(html);
    EXPECT_START_TAG_TOKEN(head);
    EXPECT_END_TAG_TOKEN(head);
    EXPECT_START_TAG_TOKEN(body);
    EXPECT_END_TAG_TOKEN(body);
    EXPECT_END_TAG_TOKEN(html);
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}

TEST_CASE(basic_with_text)
{
    auto tokens = run_tokenizer("<p>This is some text.</p>"sv);
    BEGIN_ENUMERATION(tokens);
    EXPECT_START_TAG_TOKEN(p);
    EXPECT_CHARACTER_TOKENS(This is some text.);
    EXPECT_END_TAG_TOKEN(p);
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}

TEST_CASE(unquoted_attributes)
{
    auto tokens = run_tokenizer("<p foo=bar>"sv);
    BEGIN_ENUMERATION(tokens);
    EXPECT_START_TAG_TOKEN(p);
    EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1);
    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}

TEST_CASE(single_quoted_attributes)
{
    auto tokens = run_tokenizer("<p foo='bar'>"sv);
    BEGIN_ENUMERATION(tokens);
    EXPECT_START_TAG_TOKEN(p);
    EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1);
    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}

TEST_CASE(double_quoted_attributes)
{
    auto tokens = run_tokenizer("<p foo=\"bar\">"sv);
    BEGIN_ENUMERATION(tokens);
    EXPECT_START_TAG_TOKEN(p);
    EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1);
    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}

TEST_CASE(multiple_attributes)
{
    auto tokens = run_tokenizer("<p foo=\"bar\" baz=foobar foo2=\"bar2\">"sv);
    BEGIN_ENUMERATION(tokens);
    EXPECT_START_TAG_TOKEN(p);
    EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(3);
    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
    EXPECT_TAG_TOKEN_ATTRIBUTE(baz, "foobar");
    EXPECT_TAG_TOKEN_ATTRIBUTE(foo2, "bar2");
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}

TEST_CASE(character_reference_in_attribute)
{
    auto tokens = run_tokenizer("<p foo=a&amp;b bar='a&amp;b' baz=\"a&amp;b\">"sv);
    BEGIN_ENUMERATION(tokens);
    EXPECT_START_TAG_TOKEN(p);
    EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(3);
    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "a&b");
    EXPECT_TAG_TOKEN_ATTRIBUTE(bar, "a&b");
    EXPECT_TAG_TOKEN_ATTRIBUTE(baz, "a&b");
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}

TEST_CASE(comment)
{
    auto tokens = run_tokenizer("<p><!-- This is a comment --></p>"sv);
    BEGIN_ENUMERATION(tokens);
    EXPECT_START_TAG_TOKEN(p);
    EXPECT_COMMENT_TOKEN();
    EXPECT_END_TAG_TOKEN(p);
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}

TEST_CASE(doctype)
{
    auto tokens = run_tokenizer("<!DOCTYPE html><html></html>"sv);
    BEGIN_ENUMERATION(tokens);
    EXPECT_DOCTYPE_TOKEN();
    EXPECT_START_TAG_TOKEN(html);
    EXPECT_END_TAG_TOKEN(html);
}

// NOTE: This relies on the format of HTMLToken::to_deprecated_string() staying the same.
//       If that changes, or something is added to the test HTML, the hash needs to be adjusted.
TEST_CASE(regression)
{
    // This makes sure that the tests will run both on target and in Lagom.
#ifdef AK_OS_SERENITY
    StringView path = "/usr/Tests/LibWeb/tokenizer-test.html"sv;
#else
    StringView path = "tokenizer-test.html"sv;
#endif

    auto file = MUST(Core::File::open(path, Core::File::OpenMode::Read));
    auto file_size = MUST(file->size());
    auto content = MUST(ByteBuffer::create_uninitialized(file_size));
    MUST(file->read_until_filled(content.bytes()));
    DeprecatedString file_contents { content.bytes() };
    auto tokens = run_tokenizer(file_contents);
    u32 hash = hash_tokens(tokens);
    EXPECT_EQ(hash, 710375345u);
}