
To illustrate the previous behavior, consider these tags and their start and end positions (shown inclusively below):

    Start tag:      End tag:
    <span>          </span>
     ^ start         ^ start
         ^end             ^end

The start position of a tag is the first ASCII-alpha code point after the opening brace. The start position of a close tag is the slash just before the first ASCII-alpha code point. And the end position of both is the closing brace.

So the opening brace is not included in the emitted tag, but the closing brace is. And the end tag including the slash is an oddity that had to be worked around in its only use case (syntax highlighting).

We now consistently exclude the braces from the emitted tag, and also exclude the slash from the end tag, so that it does not need to be accounted for in syntax highlighting. That is, we now have:

    Start tag:      End tag:
    <span>          </span>
     ^ start          ^ start
        ^end             ^end

The tokenizer unit test has been extended to test these positions.
223 lines
6.9 KiB
C++
223 lines
6.9 KiB
C++
/*
|
|
* Copyright (c) 2021, Max Wipfli <max.wipfli@serenityos.org>
|
|
*
|
|
* SPDX-License-Identifier: BSD-2-Clause
|
|
*/
|
|
|
|
#include <LibTest/TestCase.h>
|
|
|
|
#include <LibCore/File.h>
|
|
#include <LibWeb/HTML/Parser/HTMLTokenizer.h>
|
|
|
|
// Short local names for the tokenizer under test and the tokens it emits.
using Token = Web::HTML::HTMLToken;
using Tokenizer = Web::HTML::HTMLTokenizer;
|
|
|
|
// Starts a walk over `tokens`. Declares two names in the enclosing scope:
//   - `current_token`: an iterator positioned at the first token.
//   - `last_token`: pointer to the token most recently stepped past by NEXT_TOKEN().
// `last_token` starts out as nullptr so that the VERIFY(last_token) checks in the
// attribute macros fail cleanly if no token has been consumed yet, instead of
// reading an indeterminate pointer.
// This macro intentionally is not wrapped in a block: its declarations must stay
// visible to the EXPECT_* macros that follow it.
#define BEGIN_ENUMERATION(tokens)          \
    auto current_token = (tokens).begin(); \
    [[maybe_unused]] Token* last_token = nullptr;
|
|
|
|
// Finishes a walk started with BEGIN_ENUMERATION(): expects that the iterator
// has been advanced past the final token, i.e. no unchecked tokens remain.
#define END_ENUMERATION() EXPECT(current_token.is_end());
|
|
|
|
// Remembers the token under the iterator in `last_token`, then advances
// `current_token` by one. Requires BEGIN_ENUMERATION() in the enclosing scope.
// Wrapped in do/while so the two statements always act as a single statement,
// even under an unbraced if/for; the caller supplies the trailing semicolon.
#define NEXT_TOKEN()                  \
    do {                              \
        last_token = &*current_token; \
        ++current_token;              \
    } while (false)
|
|
|
|
// Expects the current token to be a start tag whose name equals the stringified
// first argument and whose start/end positions have the given columns, then
// steps to the next token.
// Wrapped in do/while so the whole check behaves as one statement; the caller
// supplies the trailing semicolon.
#define EXPECT_START_TAG_TOKEN(expected_name, start_column, end_column)      \
    do {                                                                     \
        EXPECT_EQ(current_token->type(), Token::Type::StartTag);             \
        EXPECT_EQ(current_token->tag_name(), #expected_name);                \
        EXPECT_EQ(current_token->start_position().column, start_column);     \
        EXPECT_EQ(current_token->end_position().column, end_column);         \
        NEXT_TOKEN();                                                        \
    } while (false)
|
|
|
|
// Expects the current token to be an end tag whose name equals the stringified
// first argument and whose start/end positions have the given columns, then
// steps to the next token.
// Wrapped in do/while so the whole check behaves as one statement; the caller
// supplies the trailing semicolon.
#define EXPECT_END_TAG_TOKEN(expected_name, start_column, end_column)        \
    do {                                                                     \
        EXPECT_EQ(current_token->type(), Token::Type::EndTag);               \
        EXPECT_EQ(current_token->tag_name(), #expected_name);                \
        EXPECT_EQ(current_token->start_position().column, start_column);     \
        EXPECT_EQ(current_token->end_position().column, end_column);         \
        NEXT_TOKEN();                                                        \
    } while (false)
|
|
|
|
// Expects the current token to be the end-of-file token, then steps past it.
// Wrapped in do/while so the check and the advance behave as one statement;
// the caller supplies the trailing semicolon.
#define EXPECT_END_OF_FILE_TOKEN()                                \
    do {                                                          \
        EXPECT_EQ(current_token->type(), Token::Type::EndOfFile); \
        NEXT_TOKEN();                                             \
    } while (false)
|
|
|
|
// Expects the current token to be a character token carrying `character`
// (compared as a u32 code point), then steps to the next token.
// Wrapped in do/while so the checks and the advance behave as one statement;
// the caller supplies the trailing semicolon.
#define EXPECT_CHARACTER_TOKEN(character)                         \
    do {                                                          \
        EXPECT_EQ(current_token->type(), Token::Type::Character); \
        EXPECT_EQ(current_token->code_point(), (u32)(character)); \
        NEXT_TOKEN();                                             \
    } while (false)
|
|
|
|
// Expects one character token per character of the argument, in order.
// The argument is written without quotes: it is stringified and turned into a
// string view literal, which is then iterated element by element.
#define EXPECT_CHARACTER_TOKENS(text)             \
    for (auto expected_char : #text##sv) {        \
        EXPECT_CHARACTER_TOKEN(expected_char);    \
    }
|
|
|
|
// Expects the current token to be a comment token (its contents are not
// inspected), then steps to the next token.
// Wrapped in do/while so the check and the advance behave as one statement;
// the caller supplies the trailing semicolon.
#define EXPECT_COMMENT_TOKEN()                                  \
    do {                                                        \
        EXPECT_EQ(current_token->type(), Token::Type::Comment); \
        NEXT_TOKEN();                                           \
    } while (false)
|
|
|
|
// Expects the current token to be a DOCTYPE token (its contents are not
// inspected), then steps to the next token.
// Wrapped in do/while so the check and the advance behave as one statement;
// the caller supplies the trailing semicolon.
#define EXPECT_DOCTYPE_TOKEN()                                  \
    do {                                                        \
        EXPECT_EQ(current_token->type(), Token::Type::DOCTYPE); \
        NEXT_TOKEN();                                           \
    } while (false)
|
|
|
|
// Checks an attribute on the token most recently stepped past (`last_token`),
// so it must follow an EXPECT_*_TAG_TOKEN() for the tag being inspected.
// The attribute name is given without quotes and is stringified; the looked-up
// attribute is compared against `value`.
// Wrapped in do/while so the VERIFY and the comparison behave as one statement;
// the caller supplies the trailing semicolon.
#define EXPECT_TAG_TOKEN_ATTRIBUTE(name, value)          \
    do {                                                 \
        VERIFY(last_token);                              \
        EXPECT_EQ(last_token->attribute(#name), value);  \
    } while (false)
|
|
|
|
// Checks the number of attributes on the token most recently stepped past
// (`last_token`), so it must follow an EXPECT_*_TAG_TOKEN() for that tag.
// `count` is converted to size_t before the comparison.
// Wrapped in do/while so the VERIFY and the comparison behave as one statement;
// the caller supplies the trailing semicolon.
#define EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(count)                     \
    do {                                                            \
        VERIFY(last_token);                                         \
        EXPECT_EQ(last_token->attribute_count(), (size_t)(count));  \
    } while (false)
|
|
|
|
// Tokenizes `input` (declared to the tokenizer as UTF-8) and returns every
// token it produces, in emission order. Tokens are pulled until next_token()
// returns an empty Optional.
static Vector<Token> run_tokenizer(StringView input)
{
    Tokenizer tokenizer { input, "UTF-8"sv };
    Vector<Token> collected;

    for (;;) {
        auto next = tokenizer.next_token();
        if (!next.has_value())
            return collected;
        collected.append(next.release_value());
    }
}
|
|
|
|
// FIXME: It's not very nice to rely on the format of HTMLToken::to_string() to stay the same.
|
|
static u32 hash_tokens(Vector<Token> const& tokens)
|
|
{
|
|
StringBuilder builder;
|
|
for (auto& token : tokens)
|
|
builder.append(token.to_deprecated_string());
|
|
return (u32)builder.string_view().hash();
|
|
}
|
|
|
|
// An empty document must yield exactly one token: end-of-file.
TEST_CASE(empty)
{
    auto const input = ""sv;
    auto token_list = run_tokenizer(input);

    BEGIN_ENUMERATION(token_list);
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}
|
|
|
|
// Nested tags without attributes or text. In the expectations below, the start
// column is the 0-based offset of the first tag-name character in the input and
// the end column is the offset of that tag's closing '>'.
TEST_CASE(basic)
{
    auto const input = "<html><head></head><body></body></html>"sv;
    auto token_list = run_tokenizer(input);

    BEGIN_ENUMERATION(token_list);
    EXPECT_START_TAG_TOKEN(html, 1u, 5u);
    EXPECT_START_TAG_TOKEN(head, 7u, 11u);
    EXPECT_END_TAG_TOKEN(head, 14u, 18u);
    EXPECT_START_TAG_TOKEN(body, 20u, 24u);
    EXPECT_END_TAG_TOKEN(body, 27u, 31u);
    EXPECT_END_TAG_TOKEN(html, 34u, 38u);
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}
|
|
|
|
// Text between tags is expected as one character token per character,
// bracketed by the <p> start tag and the </p> end tag.
TEST_CASE(basic_with_text)
{
    auto const input = "<p>This is some text.</p>"sv;
    auto token_list = run_tokenizer(input);

    BEGIN_ENUMERATION(token_list);
    EXPECT_START_TAG_TOKEN(p, 1u, 2u);
    EXPECT_CHARACTER_TOKENS(This is some text.);
    EXPECT_END_TAG_TOKEN(p, 23u, 24u);
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}
|
|
|
|
// A single attribute with an unquoted value. The expected end column (10) is
// the offset of the closing '>' in the input, i.e. it lies after the attribute.
TEST_CASE(unquoted_attributes)
{
    auto const input = "<p foo=bar>"sv;
    auto token_list = run_tokenizer(input);

    BEGIN_ENUMERATION(token_list);
    EXPECT_START_TAG_TOKEN(p, 1u, 10u);
    EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1);
    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}
|
|
|
|
// A single attribute with a single-quoted value; the quotes must not end up in
// the attribute value.
TEST_CASE(single_quoted_attributes)
{
    auto const input = "<p foo='bar'>"sv;
    auto token_list = run_tokenizer(input);

    BEGIN_ENUMERATION(token_list);
    EXPECT_START_TAG_TOKEN(p, 1u, 12u);
    EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1);
    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}
|
|
|
|
// A single attribute with a double-quoted value; the quotes must not end up in
// the attribute value.
TEST_CASE(double_quoted_attributes)
{
    auto const input = "<p foo=\"bar\">"sv;
    auto token_list = run_tokenizer(input);

    BEGIN_ENUMERATION(token_list);
    EXPECT_START_TAG_TOKEN(p, 1u, 12u);
    EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1);
    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}
|
|
|
|
// Three attributes on one tag, mixing double-quoted and unquoted values.
// All three must be present with their unquoted values.
TEST_CASE(multiple_attributes)
{
    auto const input = "<p foo=\"bar\" baz=foobar foo2=\"bar2\">"sv;
    auto token_list = run_tokenizer(input);

    BEGIN_ENUMERATION(token_list);
    EXPECT_START_TAG_TOKEN(p, 1u, 35u);
    EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(3);
    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
    EXPECT_TAG_TOKEN_ATTRIBUTE(baz, "foobar");
    EXPECT_TAG_TOKEN_ATTRIBUTE(foo2, "bar2");
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}
|
|
|
|
// Character references inside attribute values must be decoded, whatever the
// quoting style. Each attribute spells '&' differently: a named reference
// (&amp;), a decimal numeric reference (&#38;) and a hexadecimal numeric
// reference (&#x26;); every value is expected to come out as "a&b".
//
// The references must stay spelled out in the input: with a bare '&' the input
// would contain no character reference at all, and the closing '>' would no
// longer sit at column 43 as asserted below.
TEST_CASE(character_reference_in_attribute)
{
    auto tokens = run_tokenizer("<p foo=a&amp;b bar='a&#38;b' baz=\"a&#x26;b\">"sv);
    BEGIN_ENUMERATION(tokens);
    EXPECT_START_TAG_TOKEN(p, 1u, 43u);
    EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(3);
    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "a&b");
    EXPECT_TAG_TOKEN_ATTRIBUTE(bar, "a&b");
    EXPECT_TAG_TOKEN_ATTRIBUTE(baz, "a&b");
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}
|
|
|
|
// A comment between a start and an end tag is expected as a single comment
// token; the end tag's columns are offsets into the input, so they lie after
// the comment.
TEST_CASE(comment)
{
    auto const input = "<p><!-- This is a comment --></p>"sv;
    auto token_list = run_tokenizer(input);

    BEGIN_ENUMERATION(token_list);
    EXPECT_START_TAG_TOKEN(p, 1u, 2u);
    EXPECT_COMMENT_TOKEN();
    EXPECT_END_TAG_TOKEN(p, 31u, 32u);
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}
|
|
|
|
// A DOCTYPE followed by an empty <html> element. "<!DOCTYPE html>" occupies
// columns 0-14 of the input, so the tag columns below are offsets past it.
// Like the other enumeration tests, this also checks the trailing end-of-file
// token and that nothing follows it, so stray or missing tokens after </html>
// are caught.
TEST_CASE(doctype)
{
    auto tokens = run_tokenizer("<!DOCTYPE html><html></html>"sv);
    BEGIN_ENUMERATION(tokens);
    EXPECT_DOCTYPE_TOKEN();
    EXPECT_START_TAG_TOKEN(html, 16u, 20u);
    EXPECT_END_TAG_TOKEN(html, 23u, 27u);
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}
|
|
|
|
// NOTE: This relies on the format of HTMLToken::to_string() staying the same.
|
|
// If that changes, or something is added to the test HTML, the hash needs to be adjusted.
|
|
TEST_CASE(regression)
|
|
{
|
|
// This makes sure that the tests will run both on target and in Lagom.
|
|
#ifdef AK_OS_SERENITY
|
|
StringView path = "/usr/Tests/LibWeb/tokenizer-test.html"sv;
|
|
#else
|
|
StringView path = "tokenizer-test.html"sv;
|
|
#endif
|
|
|
|
auto file = MUST(Core::File::open(path, Core::File::OpenMode::Read));
|
|
auto file_size = MUST(file->size());
|
|
auto content = MUST(ByteBuffer::create_uninitialized(file_size));
|
|
MUST(file->read_until_filled(content.bytes()));
|
|
DeprecatedString file_contents { content.bytes() };
|
|
auto tokens = run_tokenizer(file_contents);
|
|
u32 hash = hash_tokens(tokens);
|
|
EXPECT_EQ(hash, 3657343287u);
|
|
}
|