
LibJS: Allow Unicode escape sequences in identifiers

For example, "property.br\u{6f}wn" should resolve to "property.brown".
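
A minimal sketch of that behavior, mirroring the added test file (the property name and value are the ones used there):

var foo = {};
foo.brown = 12389;
foo.br\u{6f}wn;                     // 12389 -- \u{6f} evaluates to "o", so this names the same property
foo.\u{62}\u{72}\u{6f}\u{77}\u{6e}; // 12389 -- the whole identifier may be written with escapes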

To support this behavior, this commit changes the Token class to hold
both the evaluated identifier name and a view into the original source
for the unevaluated name. There are some contexts in which identifiers
are not allowed to contain Unicode escape sequences; for example, export
statements of the form 'export {} from "foo.js"' forbid escapes in the
identifier "from".
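
A hedged sketch of that restriction (module context and the file name "./foo.js" are assumed purely for illustration):

export {} from "./foo.js";          // allowed: "from" is written literally
// export {} fr\u{6f}m "./foo.js";  // would be a SyntaxError: "from" may not contain escapes here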

The test file is added to .prettierignore because prettier would replace
all escaped Unicode sequences with their unescaped values.
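
For illustration, this is roughly what such a rewrite would look like (hypothetical before/after, based on the behavior described above):

// as written in the test
expect(foo.br\u{6f}wn).toBe(12389);
// after prettier unescapes the sequence, defeating the purpose of the test
expect(foo.brown).toBe(12389);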

Timothy Flynn, 4 years ago
commit 1259dc3623

+ 1 - 1
.prettierignore

@@ -1,3 +1,3 @@
 Base/home/anon/Source/js
 Userland/Libraries/LibJS/Tests/eval-aliasing.js
-
+Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js

+ 89 - 33
Userland/Libraries/LibJS/Lexer.cpp

@@ -8,6 +8,7 @@
 #include "Lexer.h"
 #include <AK/CharacterTypes.h>
 #include <AK/Debug.h>
+#include <AK/GenericLexer.h>
 #include <AK/HashMap.h>
 #include <AK/Utf8View.h>
 #include <LibUnicode/CharacterTypes.h>
@@ -350,6 +351,8 @@ u32 Lexer::current_code_point() const
     if (m_position == 0)
         return REPLACEMENT_CHARACTER;
     Utf8View utf_8_view { m_source.substring_view(m_position - 1) };
+    if (utf_8_view.is_empty())
+        return REPLACEMENT_CHARACTER;
     return *utf_8_view.begin();
 }
 
@@ -369,30 +372,60 @@ bool Lexer::is_whitespace() const
     return false;
 }
 
-bool Lexer::is_identifier_start() const
+Optional<u32> Lexer::is_unicode_escape(size_t& identifier_length) const
 {
-    if (!is_unicode_character())
-        return is_ascii_alpha(m_current_char) || m_current_char == '_' || m_current_char == '$';
-    auto code_point = current_code_point();
+    GenericLexer lexer(source().substring_view(m_position - 1));
+
+    if (auto code_point_or_error = lexer.consume_escaped_code_point(); !code_point_or_error.is_error()) {
+        identifier_length = lexer.tell();
+        return code_point_or_error.value();
+    }
+
+    return {};
+}
+
+Optional<u32> Lexer::is_identifier_start(size_t& identifier_length) const
+{
+    u32 code_point = current_code_point();
+    identifier_length = 1;
+
+    if (code_point == '\\') {
+        if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
+            code_point = *maybe_code_point;
+        else
+            return {};
+    }
+
+    if (is_ascii_alpha(code_point) || code_point == '_' || code_point == '$')
+        return code_point;
 
     static auto id_start_category = Unicode::property_from_string("ID_Start"sv);
-    if (id_start_category.has_value())
-        return Unicode::code_point_has_property(code_point, *id_start_category);
-    return false;
+    if (id_start_category.has_value() && Unicode::code_point_has_property(code_point, *id_start_category))
+        return code_point;
+
+    return {};
 }
 
-bool Lexer::is_identifier_middle() const
+Optional<u32> Lexer::is_identifier_middle(size_t& identifier_length) const
 {
-    if (!is_unicode_character())
-        return is_identifier_start() || is_ascii_digit(m_current_char);
-    auto code_point = current_code_point();
-    if (code_point == ZERO_WIDTH_NON_JOINER || code_point == ZERO_WIDTH_JOINER)
-        return true;
+    u32 code_point = current_code_point();
+    identifier_length = 1;
+
+    if (code_point == '\\') {
+        if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
+            code_point = *maybe_code_point;
+        else
+            return {};
+    }
+
+    if (is_ascii_alphanumeric(code_point) || (code_point == '$') || (code_point == ZERO_WIDTH_NON_JOINER) || (code_point == ZERO_WIDTH_JOINER))
+        return code_point;
 
     static auto id_continue_category = Unicode::property_from_string("ID_Continue"sv);
-    if (id_continue_category.has_value())
-        return Unicode::code_point_has_property(code_point, *id_continue_category);
-    return false;
+    if (id_continue_category.has_value() && Unicode::code_point_has_property(code_point, *id_continue_category))
+        return code_point;
+
+    return {};
 }
 
 bool Lexer::is_line_comment_start(bool line_has_token_yet) const
@@ -494,6 +527,9 @@ Token Lexer::next()
     // bunch of Invalid* tokens (bad numeric literals, unterminated comments etc.)
     String token_message;
 
+    Optional<FlyString> identifier;
+    size_t identifier_length = 0;
+
     if (m_current_token.type() == TokenType::RegexLiteral && !is_eof() && is_ascii_alpha(m_current_char) && !did_consume_whitespace_or_comments) {
         token_type = TokenType::RegexFlags;
         while (!is_eof() && is_ascii_alpha(m_current_char))
@@ -537,19 +573,26 @@ Token Lexer::next()
             else
                 token_type = TokenType::TemplateLiteralString;
         }
-    } else if (is_identifier_start()) {
+    } else if (auto code_point = is_identifier_start(identifier_length); code_point.has_value()) {
         // identifier or keyword
+        StringBuilder builder;
         do {
-            consume();
-        } while (is_identifier_middle());
+            builder.append_code_point(*code_point);
+            for (size_t i = 0; i < identifier_length; ++i)
+                consume();
+
+            code_point = is_identifier_middle(identifier_length);
+        } while (code_point.has_value());
 
-        StringView value = m_source.substring_view(value_start - 1, m_position - value_start);
-        auto it = s_keywords.find(value.hash(), [&](auto& entry) { return entry.key == value; });
-        if (it == s_keywords.end()) {
+        identifier = builder.build();
+        if (!m_parsed_identifiers.contains_slow(*identifier))
+            m_parsed_identifiers.append(*identifier);
+
+        auto it = s_keywords.find(identifier->hash(), [&](auto& entry) { return entry.key == identifier; });
+        if (it == s_keywords.end())
             token_type = TokenType::Identifier;
-        } else {
+        else
             token_type = it->value;
-        }
     } else if (is_numeric_literal_start()) {
         token_type = TokenType::NumericLiteral;
         bool is_invalid_numeric_literal = false;
@@ -708,15 +751,28 @@ Token Lexer::next()
         }
     }
 
-    m_current_token = Token(
-        token_type,
-        token_message,
-        m_source.substring_view(trivia_start - 1, value_start - trivia_start),
-        m_source.substring_view(value_start - 1, m_position - value_start),
-        m_filename,
-        value_start_line_number,
-        value_start_column_number,
-        m_position);
+    if (identifier.has_value()) {
+        m_current_token = Token(
+            token_type,
+            token_message,
+            m_source.substring_view(trivia_start - 1, value_start - trivia_start),
+            m_source.substring_view(value_start - 1, m_position - value_start),
+            identifier.release_value(),
+            m_filename,
+            value_start_line_number,
+            value_start_column_number,
+            m_position);
+    } else {
+        m_current_token = Token(
+            token_type,
+            token_message,
+            m_source.substring_view(trivia_start - 1, value_start - trivia_start),
+            m_source.substring_view(value_start - 1, m_position - value_start),
+            m_filename,
+            value_start_line_number,
+            value_start_column_number,
+            m_position);
+    }
 
     if constexpr (LEXER_DEBUG) {
         dbgln("------------------------------");

+ 7 - 2
Userland/Libraries/LibJS/Lexer.h

@@ -41,8 +41,9 @@ private:
     bool is_eof() const;
     bool is_line_terminator() const;
     bool is_whitespace() const;
-    bool is_identifier_start() const;
-    bool is_identifier_middle() const;
+    Optional<u32> is_unicode_escape(size_t& identifier_length) const;
+    Optional<u32> is_identifier_start(size_t& identifier_length) const;
+    Optional<u32> is_identifier_middle(size_t& identifier_length) const;
     bool is_line_comment_start(bool line_has_token_yet) const;
     bool is_block_comment_start() const;
     bool is_block_comment_end() const;
@@ -80,6 +81,10 @@ private:
     static HashMap<String, TokenType> s_three_char_tokens;
     static HashMap<String, TokenType> s_two_char_tokens;
     static HashMap<char, TokenType> s_single_char_tokens;
+
+    // Resolved identifiers must be kept alive for the duration of the parsing stage, otherwise
+    // the only references to these strings are deleted by the Token destructor.
+    Vector<FlyString> m_parsed_identifiers;
 };
 
 }

+ 10 - 7
Userland/Libraries/LibJS/Parser.cpp

@@ -210,7 +210,6 @@ constexpr OperatorPrecedenceTable g_operator_precedence;
 
 Parser::ParserState::ParserState(Lexer l, Program::Type program_type)
     : lexer(move(l))
-    , current_token(TokenType::Invalid, {}, {}, {}, {}, 0, 0, 0)
 {
     if (program_type == Program::Type::Module)
         lexer.disallow_html_comments();
@@ -680,7 +679,7 @@ NonnullRefPtr<ClassExpression> Parser::parse_class_expression(bool expect_class_
 
         if (match_property_key()) {
             StringView name;
-            if (!is_generator && m_state.current_token.value() == "static"sv) {
+            if (!is_generator && m_state.current_token.original_value() == "static"sv) {
                 if (match(TokenType::Identifier)) {
                     consume();
                     is_static = true;
@@ -2524,7 +2523,7 @@ NonnullRefPtr<Statement> Parser::parse_for_statement()
 {
     auto rule_start = push_start();
     auto match_for_in_of = [&]() {
-        return match(TokenType::In) || (match(TokenType::Identifier) && m_state.current_token.value() == "of");
+        return match(TokenType::In) || (match(TokenType::Identifier) && m_state.current_token.original_value() == "of");
     };
 
     consume(TokenType::For);
@@ -3019,7 +3018,7 @@ NonnullRefPtr<ImportStatement> Parser::parse_import_statement(Program& program)
     };
 
     auto match_as = [&] {
-        return match(TokenType::Identifier) && m_state.current_token.value() == "as"sv;
+        return match(TokenType::Identifier) && m_state.current_token.original_value() == "as"sv;
     };
 
     bool continue_parsing = true;
@@ -3134,11 +3133,15 @@ NonnullRefPtr<ExportStatement> Parser::parse_export_statement(Program& program)
         syntax_error("Cannot use export statement outside a module");
 
     auto match_as = [&] {
-        return match(TokenType::Identifier) && m_state.current_token.value() == "as"sv;
+        return match(TokenType::Identifier) && m_state.current_token.original_value() == "as"sv;
     };
 
     auto match_from = [&] {
-        return match(TokenType::Identifier) && m_state.current_token.value() == "from"sv;
+        return match(TokenType::Identifier) && m_state.current_token.original_value() == "from"sv;
+    };
+
+    auto match_default = [&] {
+        return match(TokenType::Default) && m_state.current_token.original_value() == "default"sv;
     };
 
     consume(TokenType::Export);
@@ -3158,7 +3161,7 @@ NonnullRefPtr<ExportStatement> Parser::parse_export_statement(Program& program)
 
     RefPtr<ASTNode> expression = {};
 
-    if (match(TokenType::Default)) {
+    if (match_default()) {
         auto default_position = position();
         consume(TokenType::Default);
 

+ 19 - 0
Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js

@@ -0,0 +1,19 @@
+test("basic escapes", () => {
+    var foo = {};
+    foo.brown = 12389;
+
+    expect(foo.brown).toBe(12389);
+    expect(foo.br\u006fwn).toBe(12389);
+    expect(foo.br\u{6f}wn).toBe(12389);
+    expect(foo.\u{62}\u{72}\u{6f}\u{77}\u{6e}).toBe(12389);
+});
+
+test("non-ascii escapes", () => {
+    var foo = {};
+    foo.𝓑𝓻𝓸𝔀𝓷 = 12389;
+
+    expect(foo.𝓑𝓻𝓸𝔀𝓷).toBe(12389);
+    expect(foo.𝓑𝓻\ud835\udcf8𝔀𝓷).toBe(12389);
+    expect(foo.𝓑𝓻\u{1d4f8}𝔀𝓷).toBe(12389);
+    expect(foo.\u{1d4d1}\u{1d4fb}\u{1d4f8}\u{1d500}\u{1d4f7}).toBe(12389);
+});

+ 4 - 4
Userland/Libraries/LibJS/Token.cpp

@@ -56,7 +56,7 @@ double Token::double_value() const
 
     StringBuilder builder;
 
-    for (auto ch : m_value) {
+    for (auto ch : value()) {
         if (ch == '_')
             continue;
         builder.append(ch);
@@ -75,7 +75,7 @@ double Token::double_value() const
             return static_cast<double>(strtoul(value_string.characters() + 2, nullptr, 2));
         } else if (is_ascii_digit(value_string[1])) {
             // also octal, but syntax error in strict mode
-            if (!m_value.contains('8') && !m_value.contains('9'))
+            if (!value().contains('8') && !value().contains('9'))
                 return static_cast<double>(strtoul(value_string.characters() + 1, nullptr, 8));
         }
     }
@@ -95,7 +95,7 @@ String Token::string_value(StringValueStatus& status) const
     VERIFY(type() == TokenType::StringLiteral || type() == TokenType::TemplateLiteralString);
 
     auto is_template = type() == TokenType::TemplateLiteralString;
-    GenericLexer lexer(is_template ? m_value : m_value.substring_view(1, m_value.length() - 2));
+    GenericLexer lexer(is_template ? value() : value().substring_view(1, value().length() - 2));
 
     auto encoding_failure = [&status](StringValueStatus parse_status) -> String {
         status = parse_status;
@@ -195,7 +195,7 @@ String Token::string_value(StringValueStatus& status) const
 bool Token::bool_value() const
 {
     VERIFY(type() == TokenType::BoolLiteral);
-    return m_value == "true";
+    return value() == "true";
 }
 
 bool Token::is_identifier_name() const

+ 32 - 6
Userland/Libraries/LibJS/Token.h

@@ -6,8 +6,10 @@
 
 #pragma once
 
+#include <AK/FlyString.h>
 #include <AK/String.h>
 #include <AK/StringView.h>
+#include <AK/Variant.h>
 
 namespace JS {
 
@@ -172,10 +174,13 @@ enum class TokenCategory {
 
 class Token {
 public:
+    Token() = default;
+
     Token(TokenType type, String message, StringView trivia, StringView value, StringView filename, size_t line_number, size_t line_column, size_t offset)
         : m_type(type)
         , m_message(message)
         , m_trivia(trivia)
+        , m_original_value(value)
         , m_value(value)
         , m_filename(filename)
         , m_line_number(line_number)
@@ -184,6 +189,19 @@ public:
     {
     }
 
+    Token(TokenType type, String message, StringView trivia, StringView original_value, FlyString value, StringView filename, size_t line_number, size_t line_column, size_t offset)
+        : m_type(type)
+        , m_message(message)
+        , m_trivia(trivia)
+        , m_original_value(original_value)
+        , m_value(move(value))
+        , m_filename(filename)
+        , m_line_number(line_number)
+        , m_line_column(line_column)
+        , m_offset(offset)
+    {
+    }
+
     TokenType type() const { return m_type; }
     TokenCategory category() const;
     static TokenCategory category(TokenType);
@@ -192,7 +210,14 @@ public:
 
     const String& message() const { return m_message; }
     const StringView& trivia() const { return m_trivia; }
-    const StringView& value() const { return m_value; }
+    const StringView& original_value() const { return m_original_value; }
+    StringView value() const
+    {
+        return m_value.visit(
+            [](StringView const& view) { return view; },
+            [](FlyString const& identifier) { return identifier.view(); },
+            [](Empty) -> StringView { VERIFY_NOT_REACHED(); });
+    }
     const StringView& filename() const { return m_filename; }
     size_t line_number() const { return m_line_number; }
     size_t line_column() const { return m_line_column; }
@@ -213,14 +238,15 @@ public:
     bool trivia_contains_line_terminator() const;
 
 private:
-    TokenType m_type;
+    TokenType m_type { TokenType::Invalid };
     String m_message;
     StringView m_trivia;
-    StringView m_value;
+    StringView m_original_value;
+    Variant<Empty, StringView, FlyString> m_value { Empty {} };
     StringView m_filename;
-    size_t m_line_number;
-    size_t m_line_column;
-    size_t m_offset;
+    size_t m_line_number { 0 };
+    size_t m_line_column { 0 };
+    size_t m_offset { 0 };
 };
 
 }