LibJS: Add a specific test for invalid unicode characters in the lexer

Also fixes the lexer making substrings past the end of the source
when we overran the source length.
davidot 2021-10-03 13:10:35 +02:00 committed by Andreas Kling
parent 962298b040
commit ac2c3a73b1
Notes: sideshowbarker 2024-07-18 03:08:40 +09:00
4 changed files with 83 additions and 1 deletion
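
The second paragraph of the commit message refers to the lexer advancing its position past the end of the source on repeated end-of-file consumes, which later produced out-of-bounds substrings. Below is a minimal standalone sketch of that failure mode and the clamping idea; it uses a made-up TinyLexer, not the real LibJS lexer.

#include <cassert>
#include <cstddef>
#include <string_view>

struct TinyLexer {
    std::string_view source;
    std::size_t position { 0 };
    bool eof { false };

    void consume()
    {
        if (position >= source.length()) {
            eof = true;
            // The fix, in spirit: clamp to one past the end instead of incrementing
            // without bound, so repeated consumes at EOF cannot push the position
            // arbitrarily far past the source.
            position = source.length() + 1;
            return;
        }
        ++position;
    }

    // Token text is taken from a start offset up to the current position; the clamp
    // keeps the substring in bounds even when position is the one-past-the-end sentinel.
    std::string_view text_from(std::size_t start) const
    {
        std::size_t end = position < source.length() ? position : source.length();
        return source.substr(start, end - start);
    }
};

int main()
{
    TinyLexer lexer { "ab" };
    for (int i = 0; i < 10; ++i)
        lexer.consume(); // consuming repeatedly past EOF must stay harmless
    assert(lexer.eof);
    assert(lexer.text_from(0) == "ab");
    return 0;
}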

Meta/Lagom/CMakeLists.txt

@@ -528,6 +528,9 @@ if (BUILD_LAGOM)
     )
     set_tests_properties(JS PROPERTIES ENVIRONMENT SERENITY_SOURCE_DIR=${SERENITY_PROJECT_ROOT})
+    # test-invalid-unicode-js
+    lagom_test(../../Tests/LibJS/test-invalid-unicode-js.cpp LIBS LagomJS)
     # Markdown
     include(commonmark_spec)
     file(GLOB LIBMARKDOWN_TEST_SOURCES CONFIGURE_DEPENDS "../../Tests/LibMarkdown/*.cpp")

Tests/LibJS/CMakeLists.txt

@@ -1,2 +1,5 @@
 serenity_testjs_test(test-js.cpp test-js)
 install(TARGETS test-js RUNTIME DESTINATION bin OPTIONAL)
+serenity_test(test-invalid-unicode-js.cpp LibJS LIBS LibJS)

Tests/LibJS/test-invalid-unicode-js.cpp (new file)

@@ -0,0 +1,76 @@
/*
 * Copyright (c) 2021, David Tuin <davidot@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <LibJS/Parser.h>
#include <LibTest/TestCase.h>

TEST_CASE(invalid_unicode_only)
{
    char const* code = "\xEA\xFD";
    auto lexer = JS::Lexer(code);
    auto token = lexer.next();
    EXPECT_EQ(token.type(), JS::TokenType::Invalid);
    // After this we can get as many eof tokens as we like.
    for (auto i = 0; i < 10; i++) {
        auto eof_token = lexer.next();
        EXPECT_EQ(eof_token.type(), JS::TokenType::Eof);
    }
}

TEST_CASE(long_invalid_unicode)
{
    char const* code = "\xF7";
    auto lexer = JS::Lexer(code);
    auto token = lexer.next();
    EXPECT_EQ(token.type(), JS::TokenType::Invalid);
    // After this we can get as many eof tokens as we like.
    for (auto i = 0; i < 10; i++) {
        auto eof_token = lexer.next();
        EXPECT_EQ(eof_token.type(), JS::TokenType::Eof);
    }
}

TEST_CASE(invalid_unicode_and_valid_code)
{
    char const* code = "\xEA\xFDthrow 1;";
    auto lexer = JS::Lexer(code);
    auto invalid_token = lexer.next();
    EXPECT_EQ(invalid_token.type(), JS::TokenType::Invalid);
    // 0xEA is the lead byte of a three-byte UTF-8 sequence, so it also consumes the 't'
    // (see the lead-byte sketch after this file).
    auto token_after = lexer.next();
    EXPECT_EQ(token_after.value(), "hrow");
}

TEST_CASE(long_invalid_unicode_and_valid_code)
{
    char const* code = "\xF7throw 1;";
    auto lexer = JS::Lexer(code);
    auto invalid_token = lexer.next();
    EXPECT_EQ(invalid_token.type(), JS::TokenType::Invalid);
    // 0xF7 is the lead byte of a four-byte UTF-8 sequence, so it also consumes 'thr'
    // (see the lead-byte sketch after this file).
    auto token_after = lexer.next();
    EXPECT_EQ(token_after.value(), "ow");
}

TEST_CASE(invalid_unicode_after_valid_code_and_before_eof)
{
    char const* code = "let \xEA\xFD;";
    auto lexer = JS::Lexer(code);
    auto let_token = lexer.next();
    EXPECT_EQ(let_token.type(), JS::TokenType::Let);
    auto invalid_token = lexer.next();
    EXPECT_EQ(invalid_token.type(), JS::TokenType::Invalid);
    // It should still get the valid trivia in front.
    EXPECT_EQ(invalid_token.trivia(), " ");
    // After this we can get as many eof tokens as we like.
    for (auto i = 0; i < 10; i++) {
        auto eof_token = lexer.next();
        EXPECT_EQ(eof_token.type(), JS::TokenType::Eof);
    }
}
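
The comments in the tests above rely on the standard UTF-8 rule that a lead byte's high bits determine how many bytes the sequence spans. As a small standalone illustration (not part of this commit; utf8_sequence_length is a hypothetical helper), the mapping the tests assume looks roughly like this:

// Illustrative only: maps a UTF-8 lead byte to the length of the sequence it starts.
#include <cassert>
#include <cstdint>

static int utf8_sequence_length(std::uint8_t lead_byte)
{
    if ((lead_byte & 0x80) == 0x00)
        return 1; // 0xxxxxxx: single byte (ASCII)
    if ((lead_byte & 0xE0) == 0xC0)
        return 2; // 110xxxxx: two-byte sequence
    if ((lead_byte & 0xF0) == 0xE0)
        return 3; // 1110xxxx: three-byte sequence (0xEA matches this pattern)
    if ((lead_byte & 0xF8) == 0xF0)
        return 4; // 11110xxx: four-byte sequence (0xF7 matches this pattern)
    return 0;     // stray continuation byte or otherwise invalid lead byte
}

int main()
{
    assert(utf8_sequence_length(0xEA) == 3); // "\xEA\xFD" plus the following 't' get consumed
    assert(utf8_sequence_length(0xF7) == 4); // "\xF7" plus the following "thr" get consumed
    return 0;
}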

Userland/Libraries/LibJS/Lexer.cpp

@@ -145,7 +145,7 @@ void Lexer::consume()
         return false;
     m_eof = true;
     m_current_char = '\0';
-    m_position++;
+    m_position = m_source.length() + 1;
     m_line_column++;
     return true;
 };