mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-25 09:00:22 +00:00
LibJS: Support all line terminators (LF, CR, LS, PS)
https://tc39.es/ecma262/#sec-line-terminators
This commit is contained in:
parent
5043c4a3e5
commit
15642874f3
Notes:
sideshowbarker
2024-07-19 01:49:17 +09:00
Author: https://github.com/linusg Commit: https://github.com/SerenityOS/serenity/commit/15642874f39 Pull-request: https://github.com/SerenityOS/serenity/pull/3815 Reviewed-by: https://github.com/alimpfard
8 changed files with 161 additions and 29 deletions
|
@ -60,7 +60,7 @@ if (ALL_THE_DEBUG_MACROS)
|
|||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DICMP_DEBUG -DICO_DEBUG -DImage_DEBUG -DIMAGE_DECODER_CLIENT_DEBUG -DIMAGE_DECODER_DEBUG -DIMAGE_LOADER_DEBUG -DINTERPRETER_DEBUG -DINTERRUPT_DEBUG -DIOAPIC_DEBUG -DIPC_DEBUG -DIPV4_DEBUG -DIPV4_SOCKET_DEBUG -DIRC_DEBUG -DIRQ_DEBUG")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DJOB_DEBUG")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DKEYBOARD_DEBUG -DKEYBOARD_SHORTCUTS_DEBUG -DKMALLOC_DEBUG_LARGE_ALLOCATIONS")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLoader_DEBUG")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLEXER_DEBUG -DLoader_DEBUG")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMALLOC_DEBUG -DMASTERPTY_DEBUG -DMBR_DEBUG -DMEMORY_DEBUG -DMENU_DEBUG -DMINIMIZE_ANIMATION_DEBUG -DMM_DEBUG -DMOVE_DEBUG -DMULTIPROCESSOR_DEBUG")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNETWORK_TASK_DEBUG -DNT_DEBUG")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DOBJECT_DEBUG -DOCCLUSIONS_DEBUG -DOFFD_DEBUG")
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*
|
||||
* Copyright (c) 2020, Stephan Unverwerth <s.unverwerth@gmx.de>
|
||||
* Copyright (c) 2020, Linus Groh <mail@linusgroh.de>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
|
@ -30,6 +31,8 @@
|
|||
#include <ctype.h>
|
||||
#include <stdio.h>
|
||||
|
||||
//#define LEXER_DEBUG
|
||||
|
||||
namespace JS {
|
||||
|
||||
HashMap<String, TokenType> Lexer::s_keywords;
|
||||
|
@ -152,19 +155,57 @@ Lexer::Lexer(StringView source)
|
|||
|
||||
void Lexer::consume()
|
||||
{
|
||||
if (m_position > m_source.length())
|
||||
return;
|
||||
|
||||
if (m_position == m_source.length()) {
|
||||
auto did_reach_eof = [this] {
|
||||
if (m_position != m_source.length())
|
||||
return false;
|
||||
m_position++;
|
||||
m_line_column++;
|
||||
m_current_char = EOF;
|
||||
return true;
|
||||
};
|
||||
|
||||
if (m_position > m_source.length())
|
||||
return;
|
||||
|
||||
if (did_reach_eof())
|
||||
return;
|
||||
|
||||
if (is_line_terminator()) {
|
||||
#ifdef LEXER_DEBUG
|
||||
String type;
|
||||
if (m_current_char == '\n')
|
||||
type = "LINE FEED";
|
||||
else if (m_current_char == '\r')
|
||||
type = "CARRIAGE RETURN";
|
||||
else if (m_source[m_position + 1] == (char)0xa8)
|
||||
type = "LINE SEPARATOR";
|
||||
else
|
||||
type = "PARAGRAPH SEPARATOR";
|
||||
dbg() << "Found a line terminator: " << type;
|
||||
#endif
|
||||
// This is a three-char line terminator, we need to increase m_position some more.
|
||||
// We might reach EOF and need to check again.
|
||||
if (m_current_char != '\n' && m_current_char != '\r') {
|
||||
m_position += 2;
|
||||
if (did_reach_eof())
|
||||
return;
|
||||
}
|
||||
|
||||
if (m_current_char == '\n') {
|
||||
// If the previous character is \r and the current one \n we already updated line number
|
||||
// and column - don't do it again. From https://tc39.es/ecma262/#sec-line-terminators:
|
||||
// The sequence <CR><LF> is commonly used as a line terminator.
|
||||
// It should be considered a single SourceCharacter for the purpose of reporting line numbers.
|
||||
auto second_char_of_crlf = m_position > 1 && m_source[m_position - 2] == '\r' && m_current_char == '\n';
|
||||
|
||||
if (!second_char_of_crlf) {
|
||||
m_line_number++;
|
||||
m_line_column = 1;
|
||||
#ifdef LEXER_DEBUG
|
||||
dbg() << "Incremented line number, now at: line " << m_line_number << ", column 1";
|
||||
} else {
|
||||
dbg() << "Previous was CR, this is LF - not incrementing line number again.";
|
||||
#endif
|
||||
}
|
||||
} else {
|
||||
m_line_column++;
|
||||
}
|
||||
|
@ -259,6 +300,17 @@ bool Lexer::is_eof() const
|
|||
return m_current_char == EOF;
|
||||
}
|
||||
|
||||
// Returns true when the lexer is currently positioned on an ECMAScript line terminator:
// LF, CR, or the multi-byte sequences LINE_SEPARATOR / PARAGRAPH_SEPARATOR.
// NOTE(review): reads m_source at m_position - 1, so this assumes m_position >= 1
// (i.e. at least one character has been consumed) - TODO confirm against the constructor.
bool Lexer::is_line_terminator() const
|
||||
{
|
||||
// Single-byte terminators can be decided from the current character alone.
if (m_current_char == '\n' || m_current_char == '\r')
|
||||
return true;
|
||||
// Only look for a three-byte terminator if three bytes starting at m_position - 1
// actually fit inside the source.
if (m_position + 1 < m_source.length()) {
|
||||
// View of the three bytes starting at m_position - 1, compared as a whole
// against the LS / PS byte sequences.
auto three_chars_view = m_source.substring_view(m_position - 1, 3);
|
||||
return (three_chars_view == LINE_SEPARATOR) || (three_chars_view == PARAGRAPH_SEPARATOR);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool Lexer::is_identifier_start() const
|
||||
{
|
||||
return isalpha(m_current_char) || m_current_char == '_' || m_current_char == '$';
|
||||
|
@ -314,15 +366,15 @@ Token Lexer::next()
|
|||
if (!in_template || m_template_states.last().in_expr) {
|
||||
// consume whitespace and comments
|
||||
while (true) {
|
||||
if (isspace(m_current_char)) {
|
||||
if (isspace(m_current_char) || is_line_terminator()) {
|
||||
do {
|
||||
consume();
|
||||
} while (isspace(m_current_char));
|
||||
} while (isspace(m_current_char) || is_line_terminator());
|
||||
} else if (is_line_comment_start()) {
|
||||
consume();
|
||||
do {
|
||||
consume();
|
||||
} while (!is_eof() && m_current_char != '\n');
|
||||
} while (!is_eof() && !is_line_terminator());
|
||||
} else if (is_block_comment_start()) {
|
||||
consume();
|
||||
do {
|
||||
|
@ -449,7 +501,8 @@ Token Lexer::next()
|
|||
} else if (m_current_char == '"' || m_current_char == '\'') {
|
||||
char stop_char = m_current_char;
|
||||
consume();
|
||||
while (m_current_char != stop_char && m_current_char != '\n' && !is_eof()) {
|
||||
// Note: LS/PS line terminators are allowed in string literals.
|
||||
while (m_current_char != stop_char && m_current_char != '\r' && m_current_char != '\n' && !is_eof()) {
|
||||
if (m_current_char == '\\') {
|
||||
consume();
|
||||
}
|
||||
|
@ -500,10 +553,7 @@ Token Lexer::next()
|
|||
|
||||
bool found_three_char_token = false;
|
||||
if (!found_four_char_token && m_position + 1 < m_source.length()) {
|
||||
char second_char = m_source[m_position];
|
||||
char third_char = m_source[m_position + 1];
|
||||
char three_chars[] { (char)m_current_char, second_char, third_char, 0 };
|
||||
StringView three_chars_view { three_chars };
|
||||
auto three_chars_view = m_source.substring_view(m_position - 1, 3);
|
||||
auto it = s_three_char_tokens.find(three_chars_view.hash(), [&](auto& entry) { return entry.key == three_chars_view; });
|
||||
if (it != s_three_char_tokens.end()) {
|
||||
found_three_char_token = true;
|
||||
|
@ -516,9 +566,7 @@ Token Lexer::next()
|
|||
|
||||
bool found_two_char_token = false;
|
||||
if (!found_four_char_token && !found_three_char_token && m_position < m_source.length()) {
|
||||
char second_char = m_source[m_position];
|
||||
char two_chars[] { (char)m_current_char, second_char, 0 };
|
||||
StringView two_chars_view = { two_chars };
|
||||
auto two_chars_view = m_source.substring_view(m_position - 1, 2);
|
||||
auto it = s_two_char_tokens.find(two_chars_view.hash(), [&](auto& entry) { return entry.key == two_chars_view; });
|
||||
if (it != s_two_char_tokens.end()) {
|
||||
found_two_char_token = true;
|
||||
|
@ -559,6 +607,15 @@ Token Lexer::next()
|
|||
value_start_line_number,
|
||||
value_start_column_number);
|
||||
|
||||
#ifdef LEXER_DEBUG
|
||||
dbg() << "------------------------------";
|
||||
dbg() << "Token: " << m_current_token.name();
|
||||
dbg() << "Trivia: _" << m_current_token.trivia() << "_";
|
||||
dbg() << "Value: _" << m_current_token.value() << "_";
|
||||
dbg() << "Line: " << m_current_token.line_number() << ", Column: " << m_current_token.line_column();
|
||||
dbg() << "------------------------------";
|
||||
#endif
|
||||
|
||||
return m_current_token;
|
||||
}
|
||||
|
||||
|
|
|
@ -47,6 +47,7 @@ private:
|
|||
bool consume_hexadecimal_number();
|
||||
bool consume_binary_number();
|
||||
bool is_eof() const;
|
||||
bool is_line_terminator() const;
|
||||
bool is_identifier_start() const;
|
||||
bool is_identifier_middle() const;
|
||||
bool is_line_comment_start() const;
|
||||
|
@ -61,7 +62,7 @@ private:
|
|||
StringView m_source;
|
||||
size_t m_position { 0 };
|
||||
Token m_current_token;
|
||||
int m_current_char { 0 };
|
||||
char m_current_char { 0 };
|
||||
size_t m_line_number { 1 };
|
||||
size_t m_line_column { 0 };
|
||||
|
||||
|
|
|
@ -356,7 +356,7 @@ RefPtr<FunctionExpression> Parser::try_parse_arrow_function_expression(bool expe
|
|||
}
|
||||
// If there's a newline between the closing paren and arrow it's not a valid arrow function,
|
||||
// ASI should kick in instead (it'll then fail with "Unexpected token Arrow")
|
||||
if (m_parser_state.m_current_token.trivia().contains('\n'))
|
||||
if (m_parser_state.m_current_token.trivia_contains_line_terminator())
|
||||
return nullptr;
|
||||
if (!match(TokenType::Arrow))
|
||||
return nullptr;
|
||||
|
@ -1174,7 +1174,7 @@ NonnullRefPtr<ReturnStatement> Parser::parse_return_statement()
|
|||
consume(TokenType::Return);
|
||||
|
||||
// Automatic semicolon insertion: terminate statement when return is followed by newline
|
||||
if (m_parser_state.m_current_token.trivia().contains('\n'))
|
||||
if (m_parser_state.m_current_token.trivia_contains_line_terminator())
|
||||
return create_ast_node<ReturnStatement>(nullptr);
|
||||
|
||||
if (match_expression()) {
|
||||
|
@ -1356,7 +1356,7 @@ NonnullRefPtr<ThrowStatement> Parser::parse_throw_statement()
|
|||
consume(TokenType::Throw);
|
||||
|
||||
// Automatic semicolon insertion: terminate statement when throw is followed by newline
|
||||
if (m_parser_state.m_current_token.trivia().contains('\n')) {
|
||||
if (m_parser_state.m_current_token.trivia_contains_line_terminator()) {
|
||||
syntax_error("No line break is allowed between 'throw' and its expression");
|
||||
return create_ast_node<ThrowStatement>(create_ast_node<ErrorExpression>());
|
||||
}
|
||||
|
@ -1373,7 +1373,7 @@ NonnullRefPtr<BreakStatement> Parser::parse_break_statement()
|
|||
if (match(TokenType::Semicolon)) {
|
||||
consume();
|
||||
} else {
|
||||
if (match(TokenType::Identifier) && !m_parser_state.m_current_token.trivia().contains('\n')) {
|
||||
if (match(TokenType::Identifier) && !m_parser_state.m_current_token.trivia_contains_line_terminator()) {
|
||||
target_label = consume().value();
|
||||
if (!m_parser_state.m_labels_in_scope.contains(target_label))
|
||||
syntax_error(String::formatted("Label '{}' not found", target_label));
|
||||
|
@ -1398,7 +1398,7 @@ NonnullRefPtr<ContinueStatement> Parser::parse_continue_statement()
|
|||
consume();
|
||||
return create_ast_node<ContinueStatement>(target_label);
|
||||
}
|
||||
if (match(TokenType::Identifier) && !m_parser_state.m_current_token.trivia().contains('\n')) {
|
||||
if (match(TokenType::Identifier) && !m_parser_state.m_current_token.trivia_contains_line_terminator()) {
|
||||
target_label = consume().value();
|
||||
if (!m_parser_state.m_labels_in_scope.contains(target_label))
|
||||
syntax_error(String::formatted("Label '{}' not found", target_label));
|
||||
|
@ -1799,7 +1799,7 @@ void Parser::consume_or_insert_semicolon()
|
|||
}
|
||||
// Insert semicolon if...
|
||||
// ...token is preceded by one or more newlines
|
||||
if (m_parser_state.m_current_token.trivia().contains('\n'))
|
||||
if (m_parser_state.m_current_token.trivia_contains_line_terminator())
|
||||
return;
|
||||
// ...token is a closing curly brace
|
||||
if (match(TokenType::CurlyClose))
|
||||
|
|
|
@ -114,8 +114,15 @@ public:
|
|||
{
|
||||
if (line == 0 || column == 0)
|
||||
return {};
|
||||
// We need to modify the source to match what the lexer considers one line - normalizing
|
||||
// line terminators to \n is easier than splitting using all different LT characters.
|
||||
String source_string { source };
|
||||
source_string.replace("\r\n", "\n");
|
||||
source_string.replace("\r", "\n");
|
||||
source_string.replace(LINE_SEPARATOR, "\n");
|
||||
source_string.replace(PARAGRAPH_SEPARATOR, "\n");
|
||||
StringBuilder builder;
|
||||
builder.append(source.split_view('\n', true)[line - 1]);
|
||||
builder.append(source_string.split_view('\n', true)[line - 1]);
|
||||
builder.append('\n');
|
||||
for (size_t i = 0; i < column - 1; ++i)
|
||||
builder.append(spacer);
|
||||
|
|
53
Libraries/LibJS/Tests/parser-line-terminators.js
Normal file
53
Libraries/LibJS/Tests/parser-line-terminators.js
Normal file
|
@ -0,0 +1,53 @@
|
|||
/*
|
||||
These tests deliberately produce syntax errors to check what line the parser thinks we're on.
|
||||
|
||||
⚠ PLEASE MAKE SURE TO NOT LET YOUR EDITOR REMOVE THE LS/PS LINE TERMINATORS!
|
||||
*/
|
||||
|
||||
// Two LF characters precede the invalid "@" token, so the syntax error
// is expected to be reported at line 3, column 1.
test("LINE FEED is a line terminator", () => {
|
||||
expect(() => {
|
||||
Function("\n\n@");
|
||||
}).toThrowWithMessage(SyntaxError, "line: 3, column: 1");
|
||||
});
|
||||
|
||||
// Two lone CR characters (not followed by LF) precede the invalid "@" token;
// each must count as its own line break, putting the error at line 3, column 1.
test("CARRIAGE RETURN is a line terminator", () => {
|
||||
expect(() => {
|
||||
Function("\r\r@");
|
||||
}).toThrowWithMessage(SyntaxError, "line: 3, column: 1");
|
||||
});
|
||||
|
||||
test("LINE SEPARATOR is a line terminator", () => {
|
||||
expect(() => {
|
||||
Function(`
@`);
|
||||
}).toThrowWithMessage(SyntaxError, "line: 3, column: 1");
|
||||
});
|
||||
|
||||
test("PARAGRAPH SEPARATOR is a line terminator", () => {
|
||||
expect(() => {
|
||||
Function(`
@`);
|
||||
}).toThrowWithMessage(SyntaxError, "line: 3, column: 1");
|
||||
});
|
||||
|
||||
// Each "\r\n" pair must advance the line number only once: two pairs before
// the invalid "@" token put the error at line 3 (not line 5), column 1.
test("CR LF is counted as only one line terminator", () => {
|
||||
expect(() => {
|
||||
Function("\r\n\r\n@");
|
||||
}).toThrowWithMessage(SyntaxError, "line: 3, column: 1");
|
||||
});
|
||||
|
||||
test("LF/CR are not allowed in string literal", () => {
|
||||
expect(() => {
|
||||
Function(`"
|
||||
"`);
|
||||
}).toThrowWithMessage(SyntaxError, "Unexpected token UnterminatedStringLiteral");
|
||||
});
|
||||
|
||||
test("LS/PS are allowed in string literal", () => {
|
||||
expect(`"
"`).toEval();
|
||||
expect(`"
"`).toEval();
|
||||
});
|
||||
|
||||
test("line terminators can be mixed (but please don't)", () => {
|
||||
expect(() => {
|
||||
Function(`\r\
\r\n
\n\r@`);
|
||||
}).toThrowWithMessage(SyntaxError, "line: 7, column: 1");
|
||||
});
|
|
@ -262,4 +262,9 @@ bool Token::is_identifier_name() const
|
|||
|| m_type == TokenType::Yield;
|
||||
}
|
||||
|
||||
// Returns true if this token's trivia (m_trivia) contains at least one line terminator:
// LF, CR, LINE_SEPARATOR or PARAGRAPH_SEPARATOR.
bool Token::trivia_contains_line_terminator() const
|
||||
{
|
||||
// Checks LF and CR as single characters, LS and PS as their multi-byte sequences.
return m_trivia.contains('\n') || m_trivia.contains('\r') || m_trivia.contains(LINE_SEPARATOR) || m_trivia.contains(PARAGRAPH_SEPARATOR);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -31,6 +31,14 @@
|
|||
|
||||
namespace JS {
|
||||
|
||||
// U+2028 LINE SEPARATOR
// Stored as its three-byte UTF-8 encoding (E2 80 A8) followed by a 0 byte.
// NOTE(review): presumably the trailing 0 lets StringView determine the length
// from the C string - TODO confirm against the StringView constructor used here.
constexpr const char line_separator_chars[] { (char)0xe2, (char)0x80, (char)0xa8, 0 };
|
||||
constexpr const StringView LINE_SEPARATOR { line_separator_chars };
|
||||
|
||||
// U+2029 PARAGRAPH SEPARATOR
// Stored as its three-byte UTF-8 encoding (E2 80 A9) followed by a 0 byte;
// differs from LINE_SEPARATOR only in the last encoded byte.
constexpr const char paragraph_separator_chars[] { (char)0xe2, (char)0x80, (char)0xa9, 0 };
|
||||
constexpr const StringView PARAGRAPH_SEPARATOR { paragraph_separator_chars };
|
||||
|
||||
#define ENUMERATE_JS_TOKENS \
|
||||
__ENUMERATE_JS_TOKEN(Ampersand, Operator) \
|
||||
__ENUMERATE_JS_TOKEN(AmpersandEquals, Operator) \
|
||||
|
@ -204,6 +212,7 @@ public:
|
|||
String string_value(StringValueStatus& status) const;
|
||||
|
||||
bool is_identifier_name() const;
|
||||
bool trivia_contains_line_terminator() const;
|
||||
|
||||
private:
|
||||
TokenType m_type;
|
||||
|
|
Loading…
Reference in a new issue