LibJS: Allow Unicode escape sequences in identifiers
For example, "property.br\u{64}wn" should resolve to "property.brown". To support this behavior, this commit changes the Token class to hold both the evaluated identifier name and a view into the original source for the unevaluated name. There are some contexts in which identifiers are not allowed to contain Unicode escape sequences; for example, export statements of the form 'export {} from "foo.js"' forbid escapes in the identifier "from". The test file is added to .prettierignore because prettier would otherwise replace all escaped Unicode sequences with their unescaped values, defeating the purpose of the test.
This commit is contained in:
parent
c5b5c779ff
commit
1259dc3623
Notes:
sideshowbarker
2024-07-18 05:28:09 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/SerenityOS/serenity/commit/1259dc36237 Pull-request: https://github.com/SerenityOS/serenity/pull/9499 Reviewed-by: https://github.com/alimpfard Reviewed-by: https://github.com/davidot ✅
7 changed files with 163 additions and 54 deletions
|
@ -1,3 +1,3 @@
|
|||
Base/home/anon/Source/js
|
||||
Userland/Libraries/LibJS/Tests/eval-aliasing.js
|
||||
|
||||
Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
#include "Lexer.h"
|
||||
#include <AK/CharacterTypes.h>
|
||||
#include <AK/Debug.h>
|
||||
#include <AK/GenericLexer.h>
|
||||
#include <AK/HashMap.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <LibUnicode/CharacterTypes.h>
|
||||
|
@ -350,6 +351,8 @@ u32 Lexer::current_code_point() const
|
|||
if (m_position == 0)
|
||||
return REPLACEMENT_CHARACTER;
|
||||
Utf8View utf_8_view { m_source.substring_view(m_position - 1) };
|
||||
if (utf_8_view.is_empty())
|
||||
return REPLACEMENT_CHARACTER;
|
||||
return *utf_8_view.begin();
|
||||
}
|
||||
|
||||
|
@ -369,30 +372,60 @@ bool Lexer::is_whitespace() const
|
|||
return false;
|
||||
}
|
||||
|
||||
bool Lexer::is_identifier_start() const
|
||||
Optional<u32> Lexer::is_unicode_escape(size_t& identifier_length) const
|
||||
{
|
||||
if (!is_unicode_character())
|
||||
return is_ascii_alpha(m_current_char) || m_current_char == '_' || m_current_char == '$';
|
||||
auto code_point = current_code_point();
|
||||
GenericLexer lexer(source().substring_view(m_position - 1));
|
||||
|
||||
static auto id_start_category = Unicode::property_from_string("ID_Start"sv);
|
||||
if (id_start_category.has_value())
|
||||
return Unicode::code_point_has_property(code_point, *id_start_category);
|
||||
return false;
|
||||
if (auto code_point_or_error = lexer.consume_escaped_code_point(); !code_point_or_error.is_error()) {
|
||||
identifier_length = lexer.tell();
|
||||
return code_point_or_error.value();
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
bool Lexer::is_identifier_middle() const
|
||||
Optional<u32> Lexer::is_identifier_start(size_t& identifier_length) const
|
||||
{
|
||||
if (!is_unicode_character())
|
||||
return is_identifier_start() || is_ascii_digit(m_current_char);
|
||||
auto code_point = current_code_point();
|
||||
if (code_point == ZERO_WIDTH_NON_JOINER || code_point == ZERO_WIDTH_JOINER)
|
||||
return true;
|
||||
u32 code_point = current_code_point();
|
||||
identifier_length = 1;
|
||||
|
||||
if (code_point == '\\') {
|
||||
if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
|
||||
code_point = *maybe_code_point;
|
||||
else
|
||||
return {};
|
||||
}
|
||||
|
||||
if (is_ascii_alpha(code_point) || code_point == '_' || code_point == '$')
|
||||
return code_point;
|
||||
|
||||
static auto id_start_category = Unicode::property_from_string("ID_Start"sv);
|
||||
if (id_start_category.has_value() && Unicode::code_point_has_property(code_point, *id_start_category))
|
||||
return code_point;
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
Optional<u32> Lexer::is_identifier_middle(size_t& identifier_length) const
|
||||
{
|
||||
u32 code_point = current_code_point();
|
||||
identifier_length = 1;
|
||||
|
||||
if (code_point == '\\') {
|
||||
if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
|
||||
code_point = *maybe_code_point;
|
||||
else
|
||||
return {};
|
||||
}
|
||||
|
||||
if (is_ascii_alphanumeric(code_point) || (code_point == '$') || (code_point == ZERO_WIDTH_NON_JOINER) || (code_point == ZERO_WIDTH_JOINER))
|
||||
return code_point;
|
||||
|
||||
static auto id_continue_category = Unicode::property_from_string("ID_Continue"sv);
|
||||
if (id_continue_category.has_value())
|
||||
return Unicode::code_point_has_property(code_point, *id_continue_category);
|
||||
return false;
|
||||
if (id_continue_category.has_value() && Unicode::code_point_has_property(code_point, *id_continue_category))
|
||||
return code_point;
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
bool Lexer::is_line_comment_start(bool line_has_token_yet) const
|
||||
|
@ -494,6 +527,9 @@ Token Lexer::next()
|
|||
// bunch of Invalid* tokens (bad numeric literals, unterminated comments etc.)
|
||||
String token_message;
|
||||
|
||||
Optional<FlyString> identifier;
|
||||
size_t identifier_length = 0;
|
||||
|
||||
if (m_current_token.type() == TokenType::RegexLiteral && !is_eof() && is_ascii_alpha(m_current_char) && !did_consume_whitespace_or_comments) {
|
||||
token_type = TokenType::RegexFlags;
|
||||
while (!is_eof() && is_ascii_alpha(m_current_char))
|
||||
|
@ -537,19 +573,26 @@ Token Lexer::next()
|
|||
else
|
||||
token_type = TokenType::TemplateLiteralString;
|
||||
}
|
||||
} else if (is_identifier_start()) {
|
||||
} else if (auto code_point = is_identifier_start(identifier_length); code_point.has_value()) {
|
||||
// identifier or keyword
|
||||
StringBuilder builder;
|
||||
do {
|
||||
consume();
|
||||
} while (is_identifier_middle());
|
||||
builder.append_code_point(*code_point);
|
||||
for (size_t i = 0; i < identifier_length; ++i)
|
||||
consume();
|
||||
|
||||
StringView value = m_source.substring_view(value_start - 1, m_position - value_start);
|
||||
auto it = s_keywords.find(value.hash(), [&](auto& entry) { return entry.key == value; });
|
||||
if (it == s_keywords.end()) {
|
||||
code_point = is_identifier_middle(identifier_length);
|
||||
} while (code_point.has_value());
|
||||
|
||||
identifier = builder.build();
|
||||
if (!m_parsed_identifiers.contains_slow(*identifier))
|
||||
m_parsed_identifiers.append(*identifier);
|
||||
|
||||
auto it = s_keywords.find(identifier->hash(), [&](auto& entry) { return entry.key == identifier; });
|
||||
if (it == s_keywords.end())
|
||||
token_type = TokenType::Identifier;
|
||||
} else {
|
||||
else
|
||||
token_type = it->value;
|
||||
}
|
||||
} else if (is_numeric_literal_start()) {
|
||||
token_type = TokenType::NumericLiteral;
|
||||
bool is_invalid_numeric_literal = false;
|
||||
|
@ -708,15 +751,28 @@ Token Lexer::next()
|
|||
}
|
||||
}
|
||||
|
||||
m_current_token = Token(
|
||||
token_type,
|
||||
token_message,
|
||||
m_source.substring_view(trivia_start - 1, value_start - trivia_start),
|
||||
m_source.substring_view(value_start - 1, m_position - value_start),
|
||||
m_filename,
|
||||
value_start_line_number,
|
||||
value_start_column_number,
|
||||
m_position);
|
||||
if (identifier.has_value()) {
|
||||
m_current_token = Token(
|
||||
token_type,
|
||||
token_message,
|
||||
m_source.substring_view(trivia_start - 1, value_start - trivia_start),
|
||||
m_source.substring_view(value_start - 1, m_position - value_start),
|
||||
identifier.release_value(),
|
||||
m_filename,
|
||||
value_start_line_number,
|
||||
value_start_column_number,
|
||||
m_position);
|
||||
} else {
|
||||
m_current_token = Token(
|
||||
token_type,
|
||||
token_message,
|
||||
m_source.substring_view(trivia_start - 1, value_start - trivia_start),
|
||||
m_source.substring_view(value_start - 1, m_position - value_start),
|
||||
m_filename,
|
||||
value_start_line_number,
|
||||
value_start_column_number,
|
||||
m_position);
|
||||
}
|
||||
|
||||
if constexpr (LEXER_DEBUG) {
|
||||
dbgln("------------------------------");
|
||||
|
|
|
@ -41,8 +41,9 @@ private:
|
|||
bool is_eof() const;
|
||||
bool is_line_terminator() const;
|
||||
bool is_whitespace() const;
|
||||
bool is_identifier_start() const;
|
||||
bool is_identifier_middle() const;
|
||||
Optional<u32> is_unicode_escape(size_t& identifier_length) const;
|
||||
Optional<u32> is_identifier_start(size_t& identifier_length) const;
|
||||
Optional<u32> is_identifier_middle(size_t& identifier_length) const;
|
||||
bool is_line_comment_start(bool line_has_token_yet) const;
|
||||
bool is_block_comment_start() const;
|
||||
bool is_block_comment_end() const;
|
||||
|
@ -80,6 +81,10 @@ private:
|
|||
static HashMap<String, TokenType> s_three_char_tokens;
|
||||
static HashMap<String, TokenType> s_two_char_tokens;
|
||||
static HashMap<char, TokenType> s_single_char_tokens;
|
||||
|
||||
// Resolved identifiers must be kept alive for the duration of the parsing stage, otherwise
|
||||
// the only references to these strings are deleted by the Token destructor.
|
||||
Vector<FlyString> m_parsed_identifiers;
|
||||
};
|
||||
|
||||
}
|
||||
|
|
|
@ -210,7 +210,6 @@ constexpr OperatorPrecedenceTable g_operator_precedence;
|
|||
|
||||
Parser::ParserState::ParserState(Lexer l, Program::Type program_type)
|
||||
: lexer(move(l))
|
||||
, current_token(TokenType::Invalid, {}, {}, {}, {}, 0, 0, 0)
|
||||
{
|
||||
if (program_type == Program::Type::Module)
|
||||
lexer.disallow_html_comments();
|
||||
|
@ -680,7 +679,7 @@ NonnullRefPtr<ClassExpression> Parser::parse_class_expression(bool expect_class_
|
|||
|
||||
if (match_property_key()) {
|
||||
StringView name;
|
||||
if (!is_generator && m_state.current_token.value() == "static"sv) {
|
||||
if (!is_generator && m_state.current_token.original_value() == "static"sv) {
|
||||
if (match(TokenType::Identifier)) {
|
||||
consume();
|
||||
is_static = true;
|
||||
|
@ -2524,7 +2523,7 @@ NonnullRefPtr<Statement> Parser::parse_for_statement()
|
|||
{
|
||||
auto rule_start = push_start();
|
||||
auto match_for_in_of = [&]() {
|
||||
return match(TokenType::In) || (match(TokenType::Identifier) && m_state.current_token.value() == "of");
|
||||
return match(TokenType::In) || (match(TokenType::Identifier) && m_state.current_token.original_value() == "of");
|
||||
};
|
||||
|
||||
consume(TokenType::For);
|
||||
|
@ -3019,7 +3018,7 @@ NonnullRefPtr<ImportStatement> Parser::parse_import_statement(Program& program)
|
|||
};
|
||||
|
||||
auto match_as = [&] {
|
||||
return match(TokenType::Identifier) && m_state.current_token.value() == "as"sv;
|
||||
return match(TokenType::Identifier) && m_state.current_token.original_value() == "as"sv;
|
||||
};
|
||||
|
||||
bool continue_parsing = true;
|
||||
|
@ -3134,11 +3133,15 @@ NonnullRefPtr<ExportStatement> Parser::parse_export_statement(Program& program)
|
|||
syntax_error("Cannot use export statement outside a module");
|
||||
|
||||
auto match_as = [&] {
|
||||
return match(TokenType::Identifier) && m_state.current_token.value() == "as"sv;
|
||||
return match(TokenType::Identifier) && m_state.current_token.original_value() == "as"sv;
|
||||
};
|
||||
|
||||
auto match_from = [&] {
|
||||
return match(TokenType::Identifier) && m_state.current_token.value() == "from"sv;
|
||||
return match(TokenType::Identifier) && m_state.current_token.original_value() == "from"sv;
|
||||
};
|
||||
|
||||
auto match_default = [&] {
|
||||
return match(TokenType::Default) && m_state.current_token.original_value() == "default"sv;
|
||||
};
|
||||
|
||||
consume(TokenType::Export);
|
||||
|
@ -3158,7 +3161,7 @@ NonnullRefPtr<ExportStatement> Parser::parse_export_statement(Program& program)
|
|||
|
||||
RefPtr<ASTNode> expression = {};
|
||||
|
||||
if (match(TokenType::Default)) {
|
||||
if (match_default()) {
|
||||
auto default_position = position();
|
||||
consume(TokenType::Default);
|
||||
|
||||
|
|
19
Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js
Normal file
19
Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js
Normal file
|
@ -0,0 +1,19 @@
|
|||
test("basic escapes", () => {
|
||||
var foo = {};
|
||||
foo.brown = 12389;
|
||||
|
||||
expect(foo.brown).toBe(12389);
|
||||
expect(foo.br\u006fwn).toBe(12389);
|
||||
expect(foo.br\u{6f}wn).toBe(12389);
|
||||
expect(foo.\u{62}\u{72}\u{6f}\u{77}\u{6e}).toBe(12389);
|
||||
});
|
||||
|
||||
test("non-ascii escapes", () => {
|
||||
var foo = {};
|
||||
foo.𝓑𝓻𝓸𝔀𝓷 = 12389;
|
||||
|
||||
expect(foo.𝓑𝓻𝓸𝔀𝓷).toBe(12389);
|
||||
expect(foo.𝓑𝓻\ud835\udcf8𝔀𝓷).toBe(12389);
|
||||
expect(foo.𝓑𝓻\u{1d4f8}𝔀𝓷).toBe(12389);
|
||||
expect(foo.\u{1d4d1}\u{1d4fb}\u{1d4f8}\u{1d500}\u{1d4f7}).toBe(12389);
|
||||
});
|
|
@ -56,7 +56,7 @@ double Token::double_value() const
|
|||
|
||||
StringBuilder builder;
|
||||
|
||||
for (auto ch : m_value) {
|
||||
for (auto ch : value()) {
|
||||
if (ch == '_')
|
||||
continue;
|
||||
builder.append(ch);
|
||||
|
@ -75,7 +75,7 @@ double Token::double_value() const
|
|||
return static_cast<double>(strtoul(value_string.characters() + 2, nullptr, 2));
|
||||
} else if (is_ascii_digit(value_string[1])) {
|
||||
// also octal, but syntax error in strict mode
|
||||
if (!m_value.contains('8') && !m_value.contains('9'))
|
||||
if (!value().contains('8') && !value().contains('9'))
|
||||
return static_cast<double>(strtoul(value_string.characters() + 1, nullptr, 8));
|
||||
}
|
||||
}
|
||||
|
@ -95,7 +95,7 @@ String Token::string_value(StringValueStatus& status) const
|
|||
VERIFY(type() == TokenType::StringLiteral || type() == TokenType::TemplateLiteralString);
|
||||
|
||||
auto is_template = type() == TokenType::TemplateLiteralString;
|
||||
GenericLexer lexer(is_template ? m_value : m_value.substring_view(1, m_value.length() - 2));
|
||||
GenericLexer lexer(is_template ? value() : value().substring_view(1, value().length() - 2));
|
||||
|
||||
auto encoding_failure = [&status](StringValueStatus parse_status) -> String {
|
||||
status = parse_status;
|
||||
|
@ -195,7 +195,7 @@ String Token::string_value(StringValueStatus& status) const
|
|||
bool Token::bool_value() const
|
||||
{
|
||||
VERIFY(type() == TokenType::BoolLiteral);
|
||||
return m_value == "true";
|
||||
return value() == "true";
|
||||
}
|
||||
|
||||
bool Token::is_identifier_name() const
|
||||
|
|
|
@ -6,8 +6,10 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include <AK/FlyString.h>
|
||||
#include <AK/String.h>
|
||||
#include <AK/StringView.h>
|
||||
#include <AK/Variant.h>
|
||||
|
||||
namespace JS {
|
||||
|
||||
|
@ -172,10 +174,13 @@ enum class TokenCategory {
|
|||
|
||||
class Token {
|
||||
public:
|
||||
Token() = default;
|
||||
|
||||
Token(TokenType type, String message, StringView trivia, StringView value, StringView filename, size_t line_number, size_t line_column, size_t offset)
|
||||
: m_type(type)
|
||||
, m_message(message)
|
||||
, m_trivia(trivia)
|
||||
, m_original_value(value)
|
||||
, m_value(value)
|
||||
, m_filename(filename)
|
||||
, m_line_number(line_number)
|
||||
|
@ -184,6 +189,19 @@ public:
|
|||
{
|
||||
}
|
||||
|
||||
Token(TokenType type, String message, StringView trivia, StringView original_value, FlyString value, StringView filename, size_t line_number, size_t line_column, size_t offset)
|
||||
: m_type(type)
|
||||
, m_message(message)
|
||||
, m_trivia(trivia)
|
||||
, m_original_value(original_value)
|
||||
, m_value(move(value))
|
||||
, m_filename(filename)
|
||||
, m_line_number(line_number)
|
||||
, m_line_column(line_column)
|
||||
, m_offset(offset)
|
||||
{
|
||||
}
|
||||
|
||||
TokenType type() const { return m_type; }
|
||||
TokenCategory category() const;
|
||||
static TokenCategory category(TokenType);
|
||||
|
@ -192,7 +210,14 @@ public:
|
|||
|
||||
const String& message() const { return m_message; }
|
||||
const StringView& trivia() const { return m_trivia; }
|
||||
const StringView& value() const { return m_value; }
|
||||
const StringView& original_value() const { return m_original_value; }
|
||||
StringView value() const
|
||||
{
|
||||
return m_value.visit(
|
||||
[](StringView const& view) { return view; },
|
||||
[](FlyString const& identifier) { return identifier.view(); },
|
||||
[](Empty) -> StringView { VERIFY_NOT_REACHED(); });
|
||||
}
|
||||
const StringView& filename() const { return m_filename; }
|
||||
size_t line_number() const { return m_line_number; }
|
||||
size_t line_column() const { return m_line_column; }
|
||||
|
@ -213,14 +238,15 @@ public:
|
|||
bool trivia_contains_line_terminator() const;
|
||||
|
||||
private:
|
||||
TokenType m_type;
|
||||
TokenType m_type { TokenType::Invalid };
|
||||
String m_message;
|
||||
StringView m_trivia;
|
||||
StringView m_value;
|
||||
StringView m_original_value;
|
||||
Variant<Empty, StringView, FlyString> m_value { Empty {} };
|
||||
StringView m_filename;
|
||||
size_t m_line_number;
|
||||
size_t m_line_column;
|
||||
size_t m_offset;
|
||||
size_t m_line_number { 0 };
|
||||
size_t m_line_column { 0 };
|
||||
size_t m_offset { 0 };
|
||||
};
|
||||
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue