From 15642874f39579b9cbcdf97b82ca8b8844279e55 Mon Sep 17 00:00:00 2001 From: Linus Groh Date: Wed, 21 Oct 2020 22:16:45 +0100 Subject: [PATCH] LibJS: Support all line terminators (LF, CR, LS, PS) https://tc39.es/ecma262/#sec-line-terminators --- CMakeLists.txt | 2 +- Libraries/LibJS/Lexer.cpp | 97 +++++++++++++++---- Libraries/LibJS/Lexer.h | 3 +- Libraries/LibJS/Parser.cpp | 12 +-- Libraries/LibJS/Parser.h | 9 +- .../LibJS/Tests/parser-line-terminators.js | 53 ++++++++++ Libraries/LibJS/Token.cpp | 5 + Libraries/LibJS/Token.h | 9 ++ 8 files changed, 161 insertions(+), 29 deletions(-) create mode 100644 Libraries/LibJS/Tests/parser-line-terminators.js diff --git a/CMakeLists.txt b/CMakeLists.txt index 59b6d14b17c..ab6a2e445e0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,7 +60,7 @@ if (ALL_THE_DEBUG_MACROS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DICMP_DEBUG -DICO_DEBUG -DImage_DEBUG -DIMAGE_DECODER_CLIENT_DEBUG -DIMAGE_DECODER_DEBUG -DIMAGE_LOADER_DEBUG -DINTERPRETER_DEBUG -DINTERRUPT_DEBUG -DIOAPIC_DEBUG -DIPC_DEBUG -DIPV4_DEBUG -DIPV4_SOCKET_DEBUG -DIRC_DEBUG -DIRQ_DEBUG") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DJOB_DEBUG") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DKEYBOARD_DEBUG -DKEYBOARD_SHORTCUTS_DEBUG -DKMALLOC_DEBUG_LARGE_ALLOCATIONS") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLoader_DEBUG") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLEXER_DEBUG -DLoader_DEBUG") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMALLOC_DEBUG -DMASTERPTY_DEBUG -DMBR_DEBUG -DMEMORY_DEBUG -DMENU_DEBUG -DMINIMIZE_ANIMATION_DEBUG -DMM_DEBUG -DMOVE_DEBUG -DMULTIPROCESSOR_DEBUG") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNETWORK_TASK_DEBUG -DNT_DEBUG") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DOBJECT_DEBUG -DOCCLUSIONS_DEBUG -DOFFD_DEBUG") diff --git a/Libraries/LibJS/Lexer.cpp b/Libraries/LibJS/Lexer.cpp index d7e6516b63d..d61e1a184ee 100644 --- a/Libraries/LibJS/Lexer.cpp +++ b/Libraries/LibJS/Lexer.cpp @@ -1,5 +1,6 @@ /* * Copyright (c) 2020, Stephan Unverwerth + * 
Copyright (c) 2020, Linus Groh * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -30,6 +31,8 @@ #include #include +//#define LEXER_DEBUG + namespace JS { HashMap Lexer::s_keywords; @@ -152,19 +155,57 @@ Lexer::Lexer(StringView source) void Lexer::consume() { - if (m_position > m_source.length()) - return; - - if (m_position == m_source.length()) { + auto did_reach_eof = [this] { + if (m_position != m_source.length()) + return false; m_position++; m_line_column++; m_current_char = EOF; - return; - } + return true; + }; - if (m_current_char == '\n') { - m_line_number++; - m_line_column = 1; + if (m_position > m_source.length()) + return; + + if (did_reach_eof()) + return; + + if (is_line_terminator()) { +#ifdef LEXER_DEBUG + String type; + if (m_current_char == '\n') + type = "LINE FEED"; + else if (m_current_char == '\r') + type = "CARRIAGE RETURN"; + else if (m_source[m_position + 1] == (char)0xa8) + type = "LINE SEPARATOR"; + else + type = "PARAGRAPH SEPARATOR"; + dbg() << "Found a line terminator: " << type; +#endif + // This is a three-char line terminator, we need to increase m_position some more. + // We might reach EOF and need to check again. + if (m_current_char != '\n' && m_current_char != '\r') { + m_position += 2; + if (did_reach_eof()) + return; + } + + // If the previous character is \r and the current one \n we already updated line number + // and column - don't do it again. From https://tc39.es/ecma262/#sec-line-terminators: + // The sequence <CR><LF> is commonly used as a line terminator. + // It should be considered a single SourceCharacter for the purpose of reporting line numbers. 
+ auto second_char_of_crlf = m_position > 1 && m_source[m_position - 2] == '\r' && m_current_char == '\n'; + + if (!second_char_of_crlf) { + m_line_number++; + m_line_column = 1; +#ifdef LEXER_DEBUG + dbg() << "Incremented line number, now at: line " << m_line_number << ", column 1"; + } else { + dbg() << "Previous was CR, this is LF - not incrementing line number again."; +#endif + } } else { m_line_column++; } @@ -259,6 +300,17 @@ bool Lexer::is_eof() const return m_current_char == EOF; } +bool Lexer::is_line_terminator() const +{ + if (m_current_char == '\n' || m_current_char == '\r') + return true; + if (m_position + 1 < m_source.length()) { + auto three_chars_view = m_source.substring_view(m_position - 1, 3); + return (three_chars_view == LINE_SEPARATOR) || (three_chars_view == PARAGRAPH_SEPARATOR); + } + return false; +} + bool Lexer::is_identifier_start() const { return isalpha(m_current_char) || m_current_char == '_' || m_current_char == '$'; @@ -314,15 +366,15 @@ Token Lexer::next() if (!in_template || m_template_states.last().in_expr) { // consume whitespace and comments while (true) { - if (isspace(m_current_char)) { + if (isspace(m_current_char) || is_line_terminator()) { do { consume(); - } while (isspace(m_current_char)); + } while (isspace(m_current_char) || is_line_terminator()); } else if (is_line_comment_start()) { consume(); do { consume(); - } while (!is_eof() && m_current_char != '\n'); + } while (!is_eof() && !is_line_terminator()); } else if (is_block_comment_start()) { consume(); do { @@ -449,7 +501,8 @@ Token Lexer::next() } else if (m_current_char == '"' || m_current_char == '\'') { char stop_char = m_current_char; consume(); - while (m_current_char != stop_char && m_current_char != '\n' && !is_eof()) { + // Note: LS/PS line terminators are allowed in string literals. 
+ while (m_current_char != stop_char && m_current_char != '\r' && m_current_char != '\n' && !is_eof()) { if (m_current_char == '\\') { consume(); } @@ -500,10 +553,7 @@ Token Lexer::next() bool found_three_char_token = false; if (!found_four_char_token && m_position + 1 < m_source.length()) { - char second_char = m_source[m_position]; - char third_char = m_source[m_position + 1]; - char three_chars[] { (char)m_current_char, second_char, third_char, 0 }; - StringView three_chars_view { three_chars }; + auto three_chars_view = m_source.substring_view(m_position - 1, 3); auto it = s_three_char_tokens.find(three_chars_view.hash(), [&](auto& entry) { return entry.key == three_chars_view; }); if (it != s_three_char_tokens.end()) { found_three_char_token = true; @@ -516,9 +566,7 @@ Token Lexer::next() bool found_two_char_token = false; if (!found_four_char_token && !found_three_char_token && m_position < m_source.length()) { - char second_char = m_source[m_position]; - char two_chars[] { (char)m_current_char, second_char, 0 }; - StringView two_chars_view = { two_chars }; + auto two_chars_view = m_source.substring_view(m_position - 1, 2); auto it = s_two_char_tokens.find(two_chars_view.hash(), [&](auto& entry) { return entry.key == two_chars_view; }); if (it != s_two_char_tokens.end()) { found_two_char_token = true; @@ -559,6 +607,15 @@ Token Lexer::next() value_start_line_number, value_start_column_number); +#ifdef LEXER_DEBUG + dbg() << "------------------------------"; + dbg() << "Token: " << m_current_token.name(); + dbg() << "Trivia: _" << m_current_token.trivia() << "_"; + dbg() << "Value: _" << m_current_token.value() << "_"; + dbg() << "Line: " << m_current_token.line_number() << ", Column: " << m_current_token.line_column(); + dbg() << "------------------------------"; +#endif + return m_current_token; } diff --git a/Libraries/LibJS/Lexer.h b/Libraries/LibJS/Lexer.h index 95d514e25d5..979ff24b57c 100644 --- a/Libraries/LibJS/Lexer.h +++ b/Libraries/LibJS/Lexer.h 
@@ -47,6 +47,7 @@ private: bool consume_hexadecimal_number(); bool consume_binary_number(); bool is_eof() const; + bool is_line_terminator() const; bool is_identifier_start() const; bool is_identifier_middle() const; bool is_line_comment_start() const; @@ -61,7 +62,7 @@ private: StringView m_source; size_t m_position { 0 }; Token m_current_token; - int m_current_char { 0 }; + char m_current_char { 0 }; size_t m_line_number { 1 }; size_t m_line_column { 0 }; diff --git a/Libraries/LibJS/Parser.cpp b/Libraries/LibJS/Parser.cpp index 3257c1f5249..61e67706828 100644 --- a/Libraries/LibJS/Parser.cpp +++ b/Libraries/LibJS/Parser.cpp @@ -356,7 +356,7 @@ RefPtr Parser::try_parse_arrow_function_expression(bool expe } // If there's a newline between the closing paren and arrow it's not a valid arrow function, // ASI should kick in instead (it'll then fail with "Unexpected token Arrow") - if (m_parser_state.m_current_token.trivia().contains('\n')) + if (m_parser_state.m_current_token.trivia_contains_line_terminator()) return nullptr; if (!match(TokenType::Arrow)) return nullptr; @@ -1174,7 +1174,7 @@ NonnullRefPtr Parser::parse_return_statement() consume(TokenType::Return); // Automatic semicolon insertion: terminate statement when return is followed by newline - if (m_parser_state.m_current_token.trivia().contains('\n')) + if (m_parser_state.m_current_token.trivia_contains_line_terminator()) return create_ast_node(nullptr); if (match_expression()) { @@ -1356,7 +1356,7 @@ NonnullRefPtr Parser::parse_throw_statement() consume(TokenType::Throw); // Automatic semicolon insertion: terminate statement when throw is followed by newline - if (m_parser_state.m_current_token.trivia().contains('\n')) { + if (m_parser_state.m_current_token.trivia_contains_line_terminator()) { syntax_error("No line break is allowed between 'throw' and its expression"); return create_ast_node(create_ast_node()); } @@ -1373,7 +1373,7 @@ NonnullRefPtr Parser::parse_break_statement() if 
(match(TokenType::Semicolon)) { consume(); } else { - if (match(TokenType::Identifier) && !m_parser_state.m_current_token.trivia().contains('\n')) { + if (match(TokenType::Identifier) && !m_parser_state.m_current_token.trivia_contains_line_terminator()) { target_label = consume().value(); if (!m_parser_state.m_labels_in_scope.contains(target_label)) syntax_error(String::formatted("Label '{}' not found", target_label)); @@ -1398,7 +1398,7 @@ NonnullRefPtr Parser::parse_continue_statement() consume(); return create_ast_node(target_label); } - if (match(TokenType::Identifier) && !m_parser_state.m_current_token.trivia().contains('\n')) { + if (match(TokenType::Identifier) && !m_parser_state.m_current_token.trivia_contains_line_terminator()) { target_label = consume().value(); if (!m_parser_state.m_labels_in_scope.contains(target_label)) syntax_error(String::formatted("Label '{}' not found", target_label)); @@ -1799,7 +1799,7 @@ void Parser::consume_or_insert_semicolon() } // Insert semicolon if... // ...token is preceded by one or more newlines - if (m_parser_state.m_current_token.trivia().contains('\n')) + if (m_parser_state.m_current_token.trivia_contains_line_terminator()) return; // ...token is a closing curly brace if (match(TokenType::CurlyClose)) diff --git a/Libraries/LibJS/Parser.h b/Libraries/LibJS/Parser.h index 208b9a4a4fe..2fd3c99b221 100644 --- a/Libraries/LibJS/Parser.h +++ b/Libraries/LibJS/Parser.h @@ -114,8 +114,15 @@ public: { if (line == 0 || column == 0) return {}; + // We need to modify the source to match what the lexer considers one line - normalizing + // line terminators to \n is easier than splitting using all different LT characters. 
+ String source_string { source }; + source_string.replace("\r\n", "\n"); + source_string.replace("\r", "\n"); + source_string.replace(LINE_SEPARATOR, "\n"); + source_string.replace(PARAGRAPH_SEPARATOR, "\n"); StringBuilder builder; - builder.append(source.split_view('\n', true)[line - 1]); + builder.append(source_string.split_view('\n', true)[line - 1]); builder.append('\n'); for (size_t i = 0; i < column - 1; ++i) builder.append(spacer); diff --git a/Libraries/LibJS/Tests/parser-line-terminators.js b/Libraries/LibJS/Tests/parser-line-terminators.js new file mode 100644 index 00000000000..11fb17cada8 --- /dev/null +++ b/Libraries/LibJS/Tests/parser-line-terminators.js @@ -0,0 +1,53 @@ +/* +These tests deliberately produce syntax errors to check what line the parser thinks we're on. + +⚠ PLEASE MAKE SURE TO NOT LET YOUR EDITOR REMOVE THE LS/PS LINE TERMINATORS! +*/ + +test("LINE FEED is a line terminator", () => { + expect(() => { + Function("\n\n@"); + }).toThrowWithMessage(SyntaxError, "line: 3, column: 1"); +}); + +test("CARRIAGE RETURN is a line terminator", () => { + expect(() => { + Function("\r\r@"); + }).toThrowWithMessage(SyntaxError, "line: 3, column: 1"); +}); + +test("LINE SEPARATOR is a line terminator", () => { + expect(() => { + Function(`

@`); + }).toThrowWithMessage(SyntaxError, "line: 3, column: 1"); +}); + +test("PARAGRAPH SEPARATOR is a line terminator", () => { + expect(() => { + Function(`

@`); + }).toThrowWithMessage(SyntaxError, "line: 3, column: 1"); +}); + +test("CR LF is counted as only one line terminator", () => { + expect(() => { + Function("\r\n\r\n@"); + }).toThrowWithMessage(SyntaxError, "line: 3, column: 1"); +}); + +test("LF/CR are not allowed in string literal", () => { + expect(() => { + Function(`" + "`); + }).toThrowWithMessage(SyntaxError, "Unexpected token UnterminatedStringLiteral"); +}); + +test("LS/PS are allowed in string literal", () => { + expect(`"
"`).toEval(); + expect(`"
"`).toEval(); +}); + +test("line terminators can be mixed (but please don't)", () => { + expect(() => { + Function(`\r\
\r\n
\n\r@`); + }).toThrowWithMessage(SyntaxError, "line: 7, column: 1"); +}); diff --git a/Libraries/LibJS/Token.cpp b/Libraries/LibJS/Token.cpp index c91051a611b..0921ee9e141 100644 --- a/Libraries/LibJS/Token.cpp +++ b/Libraries/LibJS/Token.cpp @@ -262,4 +262,9 @@ bool Token::is_identifier_name() const || m_type == TokenType::Yield; } +bool Token::trivia_contains_line_terminator() const +{ + return m_trivia.contains('\n') || m_trivia.contains('\r') || m_trivia.contains(LINE_SEPARATOR) || m_trivia.contains(PARAGRAPH_SEPARATOR); +} + } diff --git a/Libraries/LibJS/Token.h b/Libraries/LibJS/Token.h index fcbbfb4a064..3b967f97756 100644 --- a/Libraries/LibJS/Token.h +++ b/Libraries/LibJS/Token.h @@ -31,6 +31,14 @@ namespace JS { +// U+2028 LINE SEPARATOR +constexpr const char line_separator_chars[] { (char)0xe2, (char)0x80, (char)0xa8, 0 }; +constexpr const StringView LINE_SEPARATOR { line_separator_chars }; + +// U+2029 PARAGRAPH SEPARATOR +constexpr const char paragraph_separator_chars[] { (char)0xe2, (char)0x80, (char)0xa9, 0 }; +constexpr const StringView PARAGRAPH_SEPARATOR { paragraph_separator_chars }; + #define ENUMERATE_JS_TOKENS \ __ENUMERATE_JS_TOKEN(Ampersand, Operator) \ __ENUMERATE_JS_TOKEN(AmpersandEquals, Operator) \ @@ -204,6 +212,7 @@ public: String string_value(StringValueStatus& status) const; bool is_identifier_name() const; + bool trivia_contains_line_terminator() const; private: TokenType m_type;