ladybird/Userland/Libraries/LibJS/Lexer.cpp
davidot 56c425eec1 LibJS: Detect invalid unicode and stop lexing at that point
Previously we might swallow an invalid unicode code point, which would skip valid
ASCII characters. This could be dangerous as we might skip a '"' and thus
not close a string where we should.
This might have been exploitable as it would not have been clear what
code gets executed when looking at a script.

Another approach to this would be simply replacing all invalid
characters with the replacement character (this is what v8 does). But
our lexer and parser are currently not set up for such a change.
2021-12-29 16:57:23 +01:00

932 lines
34 KiB
C++

/*
* Copyright (c) 2020, Stephan Unverwerth <s.unverwerth@serenityos.org>
* Copyright (c) 2020-2021, Linus Groh <linusg@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include "Lexer.h"
#include <AK/CharacterTypes.h>
#include <AK/Debug.h>
#include <AK/GenericLexer.h>
#include <AK/HashMap.h>
#include <AK/Utf8View.h>
#include <LibUnicode/CharacterTypes.h>
#include <stdio.h>
namespace JS {
// Definitions of the static lookup tables of Lexer. Each table is filled by the
// Lexer constructor the first time it finds that table empty.
// Keyword (and true/false/null) spelling -> token type.
HashMap<FlyString, TokenType> Lexer::s_keywords;
// Three-character punctuator spelling -> token type.
HashMap<String, TokenType> Lexer::s_three_char_tokens;
// Two-character punctuator spelling -> token type.
HashMap<String, TokenType> Lexer::s_two_char_tokens;
// Single-character punctuator -> token type.
HashMap<char, TokenType> Lexer::s_single_char_tokens;
// Creates a lexer over `source`, reporting positions relative to `filename`,
// `line_number` and `line_column`. The static token tables are filled on first
// use (whenever one is found empty), and the first source character is consumed
// so that m_current_char is primed before next() is called.
Lexer::Lexer(StringView source, StringView filename, size_t line_number, size_t line_column)
    : m_source(source)
    , m_current_token(TokenType::Eof, {}, StringView(nullptr), StringView(nullptr), filename, 0, 0, 0)
    , m_filename(filename)
    , m_line_number(line_number)
    , m_line_column(line_column)
    , m_parsed_identifiers(adopt_ref(*new ParsedIdentifiers))
{
    // Row types for the static initialisation tables below.
    struct SpelledToken {
        char const* spelling;
        TokenType type;
    };
    struct CharToken {
        char character;
        TokenType type;
    };

    if (s_keywords.is_empty()) {
        static constexpr SpelledToken keyword_table[] = {
            { "async", TokenType::Async },
            { "await", TokenType::Await },
            { "break", TokenType::Break },
            { "case", TokenType::Case },
            { "catch", TokenType::Catch },
            { "class", TokenType::Class },
            { "const", TokenType::Const },
            { "continue", TokenType::Continue },
            { "debugger", TokenType::Debugger },
            { "default", TokenType::Default },
            { "delete", TokenType::Delete },
            { "do", TokenType::Do },
            { "else", TokenType::Else },
            { "enum", TokenType::Enum },
            { "export", TokenType::Export },
            { "extends", TokenType::Extends },
            { "false", TokenType::BoolLiteral },
            { "finally", TokenType::Finally },
            { "for", TokenType::For },
            { "function", TokenType::Function },
            { "if", TokenType::If },
            { "import", TokenType::Import },
            { "in", TokenType::In },
            { "instanceof", TokenType::Instanceof },
            { "let", TokenType::Let },
            { "new", TokenType::New },
            { "null", TokenType::NullLiteral },
            { "return", TokenType::Return },
            { "super", TokenType::Super },
            { "switch", TokenType::Switch },
            { "this", TokenType::This },
            { "throw", TokenType::Throw },
            { "true", TokenType::BoolLiteral },
            { "try", TokenType::Try },
            { "typeof", TokenType::Typeof },
            { "var", TokenType::Var },
            { "void", TokenType::Void },
            { "while", TokenType::While },
            { "with", TokenType::With },
            { "yield", TokenType::Yield },
        };
        for (auto const& row : keyword_table)
            s_keywords.set(row.spelling, row.type);
    }

    if (s_three_char_tokens.is_empty()) {
        static constexpr SpelledToken three_char_table[] = {
            { "===", TokenType::EqualsEqualsEquals },
            { "!==", TokenType::ExclamationMarkEqualsEquals },
            { "**=", TokenType::DoubleAsteriskEquals },
            { "<<=", TokenType::ShiftLeftEquals },
            { ">>=", TokenType::ShiftRightEquals },
            { "&&=", TokenType::DoubleAmpersandEquals },
            { "||=", TokenType::DoublePipeEquals },
            { "\?\?=", TokenType::DoubleQuestionMarkEquals },
            { ">>>", TokenType::UnsignedShiftRight },
            { "...", TokenType::TripleDot },
        };
        for (auto const& row : three_char_table)
            s_three_char_tokens.set(row.spelling, row.type);
    }

    if (s_two_char_tokens.is_empty()) {
        static constexpr SpelledToken two_char_table[] = {
            { "=>", TokenType::Arrow },
            { "+=", TokenType::PlusEquals },
            { "-=", TokenType::MinusEquals },
            { "*=", TokenType::AsteriskEquals },
            { "/=", TokenType::SlashEquals },
            { "%=", TokenType::PercentEquals },
            { "&=", TokenType::AmpersandEquals },
            { "|=", TokenType::PipeEquals },
            { "^=", TokenType::CaretEquals },
            { "&&", TokenType::DoubleAmpersand },
            { "||", TokenType::DoublePipe },
            { "??", TokenType::DoubleQuestionMark },
            { "**", TokenType::DoubleAsterisk },
            { "==", TokenType::EqualsEquals },
            { "<=", TokenType::LessThanEquals },
            { ">=", TokenType::GreaterThanEquals },
            { "!=", TokenType::ExclamationMarkEquals },
            { "--", TokenType::MinusMinus },
            { "++", TokenType::PlusPlus },
            { "<<", TokenType::ShiftLeft },
            { ">>", TokenType::ShiftRight },
            { "?.", TokenType::QuestionMarkPeriod },
        };
        for (auto const& row : two_char_table)
            s_two_char_tokens.set(row.spelling, row.type);
    }

    if (s_single_char_tokens.is_empty()) {
        static constexpr CharToken single_char_table[] = {
            { '&', TokenType::Ampersand },
            { '*', TokenType::Asterisk },
            { '[', TokenType::BracketOpen },
            { ']', TokenType::BracketClose },
            { '^', TokenType::Caret },
            { ':', TokenType::Colon },
            { ',', TokenType::Comma },
            { '{', TokenType::CurlyOpen },
            { '}', TokenType::CurlyClose },
            { '=', TokenType::Equals },
            { '!', TokenType::ExclamationMark },
            { '-', TokenType::Minus },
            { '(', TokenType::ParenOpen },
            { ')', TokenType::ParenClose },
            { '%', TokenType::Percent },
            { '.', TokenType::Period },
            { '|', TokenType::Pipe },
            { '+', TokenType::Plus },
            { '?', TokenType::QuestionMark },
            { ';', TokenType::Semicolon },
            { '/', TokenType::Slash },
            { '~', TokenType::Tilde },
            { '<', TokenType::LessThan },
            { '>', TokenType::GreaterThan },
        };
        for (auto const& row : single_char_table)
            s_single_char_tokens.set(row.character, row.type);
    }

    // Prime m_current_char with the first character of the source.
    consume();
}
// Advances the lexer past the current character (m_current_char, which lives at
// m_source[m_position - 1]) and loads the next one.
//
// - Updates m_line_number / m_line_column, treating <CR><LF> as one line break.
// - Skips the continuation bytes of multi-byte UTF-8 sequences.
// - On malformed UTF-8, records the position in m_hit_invalid_unicode and jumps
//   to EOF so that no further characters are lexed.
// - At the end of the source, sets m_eof, sets m_current_char to '\0' and parks
//   m_position at length + 1; later calls are no-ops.
void Lexer::consume()
{
    // Switches the lexer into its EOF state if m_position is at or past the end.
    auto did_reach_eof = [this] {
        if (m_position < m_source.length())
            return false;
        m_eof = true;
        m_current_char = '\0';
        m_position = m_source.length() + 1;
        m_line_column++;
        return true;
    };

    // Already parked in the EOF state by an earlier call.
    if (m_position > m_source.length())
        return;

    if (did_reach_eof())
        return;

    if (is_line_terminator()) {
        if constexpr (LEXER_DEBUG) {
            String type;
            if (m_current_char == '\n')
                type = "LINE FEED";
            else if (m_current_char == '\r')
                type = "CARRIAGE RETURN";
            else if (m_source[m_position + 1] == (char)0xa8)
                type = "LINE SEPARATOR";
            else
                type = "PARAGRAPH SEPARATOR";
            dbgln("Found a line terminator: {}", type);
        }
        // This is a three-char line terminator, we need to increase m_position some more.
        // We might reach EOF and need to check again.
        if (m_current_char != '\n' && m_current_char != '\r') {
            m_position += 2;
            if (did_reach_eof())
                return;
        }

        // If the previous character is \r and the current one \n we already updated line number
        // and column - don't do it again. From https://tc39.es/ecma262/#sec-line-terminators:
        //   The sequence <CR><LF> is commonly used as a line terminator.
        //   It should be considered a single SourceCharacter for the purpose of reporting line numbers.
        auto second_char_of_crlf = m_position > 1 && m_source[m_position - 2] == '\r' && m_current_char == '\n';

        if (!second_char_of_crlf) {
            m_line_number++;
            m_line_column = 1;
            dbgln_if(LEXER_DEBUG, "Incremented line number, now at: line {}, column 1", m_line_number);
        } else {
            dbgln_if(LEXER_DEBUG, "Previous was CR, this is LF - not incrementing line number again.");
        }
    } else if (is_unicode_character()) {
        // Derive the encoded length from the UTF-8 lead byte.
        size_t char_size = 1;
        if ((m_current_char & 64) == 0) {
            // 0b10xxxxxx: a continuation byte cannot start a sequence.
            m_hit_invalid_unicode = m_position;
        } else if ((m_current_char & 32) == 0) {
            char_size = 2;
        } else if ((m_current_char & 16) == 0) {
            char_size = 3;
        } else if ((m_current_char & 8) == 0) {
            char_size = 4;
        } else {
            // 0b11111xxx: not a valid UTF-8 lead byte. Without this branch such a
            // byte was silently accepted as a one-byte character.
            m_hit_invalid_unicode = m_position;
        }

        VERIFY(char_size >= 1);
        --char_size;

        // Every remaining byte of the sequence must exist and look like 0b10xxxxxx.
        for (size_t i = m_position; i < m_position + char_size; i++) {
            if (i >= m_source.length() || (m_source[i] & 0b11000000) != 0b10000000) {
                m_hit_invalid_unicode = m_position;
                break;
            }
        }

        // Stop lexing entirely on invalid input; otherwise skip the continuation bytes.
        if (m_hit_invalid_unicode.has_value())
            m_position = m_source.length();
        else
            m_position += char_size;

        if (did_reach_eof())
            return;

        m_line_column++;
    } else {
        m_line_column++;
    }

    m_current_char = m_source[m_position++];
}
// Consumes a run of decimal digits, allowing '_' separators that are directly
// followed by another digit. Returns false (consuming nothing) when the current
// character is not a decimal digit.
bool Lexer::consume_decimal_number()
{
    if (is_ascii_digit(m_current_char)) {
        do {
            consume();
        } while (is_ascii_digit(m_current_char) || match_numeric_literal_separator_followed_by(is_ascii_digit));
        return true;
    }
    return false;
}
// Consumes the current character unconditionally (the callers in this file
// invoke this while it is 'e' or 'E'), then an optional sign, then the digits.
// Returns false when no digit follows.
bool Lexer::consume_exponent()
{
    consume();

    bool const has_sign = m_current_char == '-' || m_current_char == '+';
    if (has_sign)
        consume();

    // consume_decimal_number() itself rejects a non-digit start.
    return consume_decimal_number();
}
// True for the ASCII characters '0' through '7'.
static constexpr bool is_octal_digit(char ch)
{
    return !(ch < '0' || ch > '7');
}
// Consumes the current character unconditionally, then a run of octal digits
// with optional '_' separators. Returns false when no octal digit follows.
bool Lexer::consume_octal_number()
{
    consume();

    if (is_octal_digit(m_current_char)) {
        do {
            consume();
        } while (is_octal_digit(m_current_char) || match_numeric_literal_separator_followed_by(is_octal_digit));
        return true;
    }
    return false;
}
// Consumes the current character unconditionally, then a run of hexadecimal
// digits with optional '_' separators. Returns false when no hex digit follows.
bool Lexer::consume_hexadecimal_number()
{
    consume();

    if (is_ascii_hex_digit(m_current_char)) {
        do {
            consume();
        } while (is_ascii_hex_digit(m_current_char) || match_numeric_literal_separator_followed_by(is_ascii_hex_digit));
        return true;
    }
    return false;
}
// True for the ASCII characters '0' and '1'.
static constexpr bool is_binary_digit(char ch)
{
    switch (ch) {
    case '0':
    case '1':
        return true;
    default:
        return false;
    }
}
// Consumes the current character unconditionally, then a run of binary digits
// with optional '_' separators. Returns false when no binary digit follows.
bool Lexer::consume_binary_number()
{
    consume();

    if (is_binary_digit(m_current_char)) {
        do {
            consume();
        } while (is_binary_digit(m_current_char) || match_numeric_literal_separator_followed_by(is_binary_digit));
        return true;
    }
    return false;
}
// True when the current character is '_' and the next source character exists
// and satisfies `callback`.
template<typename Callback>
bool Lexer::match_numeric_literal_separator_followed_by(Callback callback) const
{
    bool const has_lookahead = m_position < m_source.length();
    return has_lookahead && m_current_char == '_' && callback(m_source[m_position]);
}
// True when the current character is `a` and the next source character is `b`.
bool Lexer::match(char a, char b) const
{
    bool const has_lookahead = m_position < m_source.length();
    return has_lookahead && m_current_char == a && m_source[m_position] == b;
}
// True when the current character and the two following source characters
// spell `a`, `b`, `c`.
bool Lexer::match(char a, char b, char c) const
{
    bool const has_lookahead = m_position + 1 < m_source.length();
    if (!has_lookahead)
        return false;
    if (m_current_char != a)
        return false;
    return m_source[m_position] == b && m_source[m_position + 1] == c;
}
// True when the current character and the three following source characters
// spell `a`, `b`, `c`, `d`.
bool Lexer::match(char a, char b, char c, char d) const
{
    bool const has_lookahead = m_position + 2 < m_source.length();
    if (!has_lookahead)
        return false;
    if (m_current_char != a)
        return false;
    return m_source[m_position] == b
        && m_source[m_position + 1] == c
        && m_source[m_position + 2] == d;
}
// Returns the m_eof flag, which consume() sets once it runs out of source.
bool Lexer::is_eof() const
{
return m_eof;
}
// True when the current character is LF, CR, or the start of a UTF-8 encoded
// LINE SEPARATOR / PARAGRAPH SEPARATOR code point.
ALWAYS_INLINE bool Lexer::is_line_terminator() const
{
    bool const is_ascii_line_break = m_current_char == '\n' || m_current_char == '\r';
    if (is_ascii_line_break)
        return true;

    if (is_unicode_character()) {
        u32 const code_point = current_code_point();
        return code_point == LINE_SEPARATOR || code_point == PARAGRAPH_SEPARATOR;
    }
    return false;
}
// True when the high bit of the current byte is set, i.e. it is not plain ASCII.
ALWAYS_INLINE bool Lexer::is_unicode_character() const
{
    return (m_current_char & 0x80) != 0;
}
// Decodes the code point that starts at the current character
// (m_source[m_position - 1]). Yields U+FFFD when nothing has been consumed yet
// or when no source remains from that position.
u32 Lexer::current_code_point() const
{
    static constexpr const u32 REPLACEMENT_CHARACTER = 0xFFFD;

    if (m_position != 0) {
        Utf8View remaining_source { m_source.substring_view(m_position - 1) };
        if (!remaining_source.is_empty())
            return *remaining_source.begin();
    }
    return REPLACEMENT_CHARACTER;
}
bool Lexer::is_whitespace() const
{
if (is_ascii_space(m_current_char))
return true;
if (!is_unicode_character())
return false;
auto code_point = current_code_point();
if (code_point == NO_BREAK_SPACE || code_point == ZERO_WIDTH_NO_BREAK_SPACE)
return true;
static auto space_separator_category = Unicode::general_category_from_string("Space_Separator"sv);
if (space_separator_category.has_value())
return Unicode::code_point_has_general_category(code_point, *space_separator_category);
return false;
}
// UnicodeEscapeSequence :: https://tc39.es/ecma262/#prod-UnicodeEscapeSequence
//          u Hex4Digits
//          u{ CodePoint }
//
// Tries to read an escaped code point starting at the current character. On
// success returns it and stores the number of source characters it spans in
// `identifier_length`; on failure returns an empty Optional and leaves
// `identifier_length` untouched.
Optional<u32> Lexer::is_identifier_unicode_escape(size_t& identifier_length) const
{
    GenericLexer escape_lexer(source().substring_view(m_position - 1));

    auto code_point_or_error = escape_lexer.consume_escaped_code_point(false);
    if (code_point_or_error.is_error())
        return {};

    identifier_length = escape_lexer.tell();
    return code_point_or_error.value();
}
// IdentifierStart :: https://tc39.es/ecma262/#prod-IdentifierStart
//          UnicodeIDStart
//          $
//          _
//          \ UnicodeEscapeSequence
//
// Returns the code point at the current position if it may start an identifier
// (resolving a unicode escape first), otherwise an empty Optional.
// `identifier_length` is set to 1, or to the length of the escape sequence.
Optional<u32> Lexer::is_identifier_start(size_t& identifier_length) const
{
    u32 code_point = current_code_point();
    identifier_length = 1;

    if (code_point == '\\') {
        auto escaped_code_point = is_identifier_unicode_escape(identifier_length);
        if (!escaped_code_point.has_value())
            return {};
        code_point = *escaped_code_point;
    }

    bool const is_ascii_start = is_ascii_alpha(code_point) || code_point == '_' || code_point == '$';
    if (is_ascii_start)
        return code_point;

    // Optimization: the first codepoint with the ID_Start property after A-Za-z is outside the
    // ASCII range (0x00AA), so we can skip code_point_has_property() for any ASCII characters.
    if (is_ascii(code_point))
        return {};

    static auto id_start_category = Unicode::property_from_string("ID_Start"sv);
    if (!id_start_category.has_value())
        return {};
    if (!Unicode::code_point_has_property(code_point, *id_start_category))
        return {};
    return code_point;
}
// IdentifierPart :: https://tc39.es/ecma262/#prod-IdentifierPart
//          UnicodeIDContinue
//          $
//          \ UnicodeEscapeSequence
//          <ZWNJ>
//          <ZWJ>
//
// Returns the code point at the current position if it may continue an
// identifier (resolving a unicode escape first), otherwise an empty Optional.
// `identifier_length` is set to 1, or to the length of the escape sequence.
Optional<u32> Lexer::is_identifier_middle(size_t& identifier_length) const
{
    u32 code_point = current_code_point();
    identifier_length = 1;

    if (code_point == '\\') {
        auto escaped_code_point = is_identifier_unicode_escape(identifier_length);
        if (!escaped_code_point.has_value())
            return {};
        code_point = *escaped_code_point;
    }

    bool const is_joiner = code_point == ZERO_WIDTH_NON_JOINER || code_point == ZERO_WIDTH_JOINER;
    if (is_ascii_alphanumeric(code_point) || code_point == '$' || is_joiner || code_point == '_')
        return code_point;

    // Optimization: the first codepoint with the ID_Continue property after A-Za-z0-9_ is outside the
    // ASCII range (0x00AA), so we can skip code_point_has_property() for any ASCII characters.
    if (is_ascii(code_point))
        return {};

    static auto id_continue_category = Unicode::property_from_string("ID_Continue"sv);
    if (!id_continue_category.has_value())
        return {};
    if (!Unicode::code_point_has_property(code_point, *id_continue_category))
        return {};
    return code_point;
}
// True when the current position begins something lexed as a single-line
// comment: "//", the HTML-like "<!--" and "-->" forms (only when
// m_allow_html_comments is set), or a "#!" hashbang at the very start.
bool Lexer::is_line_comment_start(bool line_has_token_yet) const
{
    if (match('/', '/'))
        return true;

    if (m_allow_html_comments) {
        if (match('<', '!', '-', '-'))
            return true;
        // "-->" is considered a line comment start if the current line is only whitespace and/or
        // other block comment(s); or in other words: the current line does not have a token or
        // ongoing line comment yet
        if (!line_has_token_yet && match('-', '-', '>'))
            return true;
    }

    // https://tc39.es/proposal-hashbang/out.html#sec-updated-syntax
    return m_position == 1 && match('#', '!');
}
// True when the current and next characters are "/*".
bool Lexer::is_block_comment_start() const
{
return match('/', '*');
}
// True when the current and next characters are "*/".
bool Lexer::is_block_comment_end() const
{
return match('*', '/');
}
// True when the current character is a decimal digit, or a '.' that is
// directly followed by a decimal digit.
bool Lexer::is_numeric_literal_start() const
{
    if (is_ascii_digit(m_current_char))
        return true;
    if (m_current_char != '.')
        return false;
    return m_position < m_source.length() && is_ascii_digit(m_source[m_position]);
}
// Decides from the type of the previously produced token (m_current_token)
// whether a following '/' is treated as a division operator rather than the
// start of a regex literal.
bool Lexer::slash_means_division() const
{
    switch (m_current_token.type()) {
    case TokenType::BigIntLiteral:
    case TokenType::BoolLiteral:
    case TokenType::BracketClose:
    case TokenType::CurlyClose:
    case TokenType::Identifier:
    case TokenType::In:
    case TokenType::Instanceof:
    case TokenType::MinusMinus:
    case TokenType::NullLiteral:
    case TokenType::NumericLiteral:
    case TokenType::ParenClose:
    case TokenType::PlusPlus:
    case TokenType::PrivateIdentifier:
    case TokenType::RegexLiteral:
    case TokenType::StringLiteral:
    case TokenType::TemplateLiteralEnd:
    case TokenType::This:
        return true;
    default:
        return false;
    }
}
// Lexes and returns the next token, also storing it in m_current_token.
//
// Leading whitespace and comments are skipped (unless we are inside the string
// part of a template literal) and recorded as the token's trivia. The token
// kind is then chosen from the current character and lexer state: regex flags,
// template literal pieces, private names, identifiers/keywords, numeric
// literals, string literals, regex literals, EOF, and finally punctuators
// (longest match first). If consume() detected invalid UTF-8 while lexing, an
// Invalid token describing that is produced instead and the lexer is at EOF.
//
// Changes compared to the previous implementation:
//  - Fraction digits after "0." accept numeric separators (e.g. 0.1_2), just
//    like fractions after a non-zero integer part already did.
//  - An escaped backslash inside a template string is skipped as a unit, so a
//    template ending in "\\" no longer swallows its closing backtick.
Token Lexer::next()
{
    size_t trivia_start = m_position;
    auto in_template = !m_template_states.is_empty();
    bool line_has_token_yet = m_line_column > 1;
    bool unterminated_comment = false;

    if (!in_template || m_template_states.last().in_expr) {
        // consume whitespace and comments
        while (true) {
            if (is_line_terminator()) {
                line_has_token_yet = false;
                do {
                    consume();
                } while (is_line_terminator());
            } else if (is_whitespace()) {
                do {
                    consume();
                } while (is_whitespace());
            } else if (is_line_comment_start(line_has_token_yet)) {
                consume();
                do {
                    consume();
                } while (!is_eof() && !is_line_terminator());
            } else if (is_block_comment_start()) {
                size_t start_line_number = m_line_number;
                consume();
                do {
                    consume();
                } while (!is_eof() && !is_block_comment_end());
                if (is_eof())
                    unterminated_comment = true;
                consume(); // consume *
                if (is_eof())
                    unterminated_comment = true;
                consume(); // consume /

                // A block comment spanning lines starts a fresh line for "-->" detection.
                if (start_line_number != m_line_number)
                    line_has_token_yet = false;
            } else {
                break;
            }
        }
    }

    size_t value_start = m_position;
    size_t value_start_line_number = m_line_number;
    size_t value_start_column_number = m_line_column;
    auto token_type = TokenType::Invalid;
    auto did_consume_whitespace_or_comments = trivia_start != value_start;
    // This is being used to communicate info about invalid tokens to the parser, which then
    // can turn that into more specific error messages - instead of us having to make up a
    // bunch of Invalid* tokens (bad numeric literals, unterminated comments etc.)
    String token_message;

    Optional<FlyString> identifier;
    size_t identifier_length = 0;

    if (m_current_token.type() == TokenType::RegexLiteral && !is_eof() && is_ascii_alpha(m_current_char) && !did_consume_whitespace_or_comments) {
        // Letters directly after a regex literal are its flags.
        token_type = TokenType::RegexFlags;
        while (!is_eof() && is_ascii_alpha(m_current_char))
            consume();
    } else if (m_current_char == '`') {
        consume();

        if (!in_template) {
            token_type = TokenType::TemplateLiteralStart;
            m_template_states.append({ false, 0 });
        } else {
            if (m_template_states.last().in_expr) {
                // A nested template literal inside a ${...} expression.
                m_template_states.append({ false, 0 });
                token_type = TokenType::TemplateLiteralStart;
            } else {
                m_template_states.take_last();
                token_type = TokenType::TemplateLiteralEnd;
            }
        }
    } else if (in_template && m_template_states.last().in_expr && m_template_states.last().open_bracket_count == 0 && m_current_char == '}') {
        consume();
        token_type = TokenType::TemplateLiteralExprEnd;
        m_template_states.last().in_expr = false;
    } else if (in_template && !m_template_states.last().in_expr) {
        if (is_eof()) {
            token_type = TokenType::UnterminatedTemplateLiteral;
            m_template_states.take_last();
        } else if (match('$', '{')) {
            token_type = TokenType::TemplateLiteralExprStart;
            consume();
            consume();
            m_template_states.last().in_expr = true;
        } else {
            while (!match('$', '{') && m_current_char != '`' && !is_eof()) {
                // Skip escaped '$', '`' and '\' as a unit. Handling "\\" here matters:
                // otherwise the second backslash of "\\" would be seen as escaping a
                // following '`' or '$'.
                if (match('\\', '$') || match('\\', '`') || match('\\', '\\'))
                    consume();
                consume();
            }
            if (is_eof() && !m_template_states.is_empty())
                token_type = TokenType::UnterminatedTemplateLiteral;
            else
                token_type = TokenType::TemplateLiteralString;
        }
    } else if (m_current_char == '#') {
        // Note: This has some duplicated code with the identifier lexing below
        consume();
        auto code_point = is_identifier_start(identifier_length);
        if (code_point.has_value()) {
            StringBuilder builder;
            builder.append_code_point('#');
            do {
                builder.append_code_point(*code_point);
                for (size_t i = 0; i < identifier_length; ++i)
                    consume();

                code_point = is_identifier_middle(identifier_length);
            } while (code_point.has_value());

            identifier = builder.string_view();
            token_type = TokenType::PrivateIdentifier;

            m_parsed_identifiers->identifiers.set(*identifier);
        } else {
            token_type = TokenType::Invalid;
            token_message = "Start of private name '#' but not followed by valid identifier";
        }
    } else if (auto code_point = is_identifier_start(identifier_length); code_point.has_value()) {
        bool has_escaped_character = false;
        // identifier or keyword
        StringBuilder builder;
        do {
            builder.append_code_point(*code_point);
            for (size_t i = 0; i < identifier_length; ++i)
                consume();

            // A unicode escape spans more than one source character.
            has_escaped_character |= identifier_length > 1;

            code_point = is_identifier_middle(identifier_length);
        } while (code_point.has_value());

        identifier = builder.string_view();
        m_parsed_identifiers->identifiers.set(*identifier);

        auto it = s_keywords.find(identifier->hash(), [&](auto& entry) { return entry.key == identifier; });
        if (it == s_keywords.end())
            token_type = TokenType::Identifier;
        else
            token_type = has_escaped_character ? TokenType::EscapedKeyword : it->value;
    } else if (is_numeric_literal_start()) {
        token_type = TokenType::NumericLiteral;
        bool is_invalid_numeric_literal = false;
        if (m_current_char == '0') {
            consume();
            if (m_current_char == '.') {
                // decimal
                consume();
                // Fraction digits may contain numeric separators (e.g. 0.1_2), the same
                // way they are accepted after a non-zero integer part below.
                while (is_ascii_digit(m_current_char) || match_numeric_literal_separator_followed_by(is_ascii_digit))
                    consume();
                if (m_current_char == 'e' || m_current_char == 'E')
                    is_invalid_numeric_literal = !consume_exponent();
            } else if (m_current_char == 'e' || m_current_char == 'E') {
                is_invalid_numeric_literal = !consume_exponent();
            } else if (m_current_char == 'o' || m_current_char == 'O') {
                // octal
                is_invalid_numeric_literal = !consume_octal_number();
                if (m_current_char == 'n') {
                    consume();
                    token_type = TokenType::BigIntLiteral;
                }
            } else if (m_current_char == 'b' || m_current_char == 'B') {
                // binary
                is_invalid_numeric_literal = !consume_binary_number();
                if (m_current_char == 'n') {
                    consume();
                    token_type = TokenType::BigIntLiteral;
                }
            } else if (m_current_char == 'x' || m_current_char == 'X') {
                // hexadecimal
                is_invalid_numeric_literal = !consume_hexadecimal_number();
                if (m_current_char == 'n') {
                    consume();
                    token_type = TokenType::BigIntLiteral;
                }
            } else if (m_current_char == 'n') {
                consume();
                token_type = TokenType::BigIntLiteral;
            } else if (is_ascii_digit(m_current_char)) {
                // octal without '0o' prefix. Forbidden in 'strict mode'
                do {
                    consume();
                } while (is_ascii_digit(m_current_char));
            }
        } else {
            // 1...9 or period
            while (is_ascii_digit(m_current_char) || match_numeric_literal_separator_followed_by(is_ascii_digit))
                consume();
            if (m_current_char == 'n') {
                consume();
                token_type = TokenType::BigIntLiteral;
            } else {
                if (m_current_char == '.') {
                    consume();
                    // A separator may not directly follow the decimal point.
                    if (m_current_char == '_')
                        is_invalid_numeric_literal = true;

                    while (is_ascii_digit(m_current_char) || match_numeric_literal_separator_followed_by(is_ascii_digit)) {
                        consume();
                    }
                }
                if (m_current_char == 'e' || m_current_char == 'E')
                    is_invalid_numeric_literal = is_invalid_numeric_literal || !consume_exponent();
            }
        }
        if (is_invalid_numeric_literal) {
            token_type = TokenType::Invalid;
            token_message = "Invalid numeric literal";
        }
    } else if (m_current_char == '"' || m_current_char == '\'') {
        char stop_char = m_current_char;
        consume();
        // Note: LS/PS line terminators are allowed in string literals.
        while (m_current_char != stop_char && m_current_char != '\r' && m_current_char != '\n' && !is_eof()) {
            if (m_current_char == '\\') {
                consume();
                // An escaped <CR><LF> pair counts as one escaped character.
                if (m_current_char == '\r' && m_position < m_source.length() && m_source[m_position] == '\n') {
                    consume();
                }
            }
            consume();
        }
        if (m_current_char != stop_char) {
            token_type = TokenType::UnterminatedStringLiteral;
        } else {
            consume();
            token_type = TokenType::StringLiteral;
        }
    } else if (m_current_char == '/' && !slash_means_division()) {
        consume();
        token_type = consume_regex_literal();
    } else if (m_eof) {
        if (unterminated_comment) {
            token_type = TokenType::Invalid;
            token_message = "Unterminated multi-line comment";
        } else {
            token_type = TokenType::Eof;
        }
    } else {
        // There is only one four-char operator: >>>=
        bool found_four_char_token = false;
        if (match('>', '>', '>', '=')) {
            found_four_char_token = true;
            consume();
            consume();
            consume();
            consume();
            token_type = TokenType::UnsignedShiftRightEquals;
        }

        bool found_three_char_token = false;
        if (!found_four_char_token && m_position + 1 < m_source.length()) {
            auto three_chars_view = m_source.substring_view(m_position - 1, 3);
            auto it = s_three_char_tokens.find(three_chars_view.hash(), [&](auto& entry) { return entry.key == three_chars_view; });
            if (it != s_three_char_tokens.end()) {
                found_three_char_token = true;
                consume();
                consume();
                consume();
                token_type = it->value;
            }
        }

        bool found_two_char_token = false;
        if (!found_four_char_token && !found_three_char_token && m_position < m_source.length()) {
            auto two_chars_view = m_source.substring_view(m_position - 1, 2);
            auto it = s_two_char_tokens.find(two_chars_view.hash(), [&](auto& entry) { return entry.key == two_chars_view; });
            if (it != s_two_char_tokens.end()) {
                // OptionalChainingPunctuator :: ?. [lookahead ∉ DecimalDigit]
                if (!(it->value == TokenType::QuestionMarkPeriod && m_position + 1 < m_source.length() && is_ascii_digit(m_source[m_position + 1]))) {
                    found_two_char_token = true;
                    consume();
                    consume();
                    token_type = it->value;
                }
            }
        }

        bool found_one_char_token = false;
        if (!found_four_char_token && !found_three_char_token && !found_two_char_token) {
            auto it = s_single_char_tokens.find(m_current_char);
            if (it != s_single_char_tokens.end()) {
                found_one_char_token = true;
                consume();
                token_type = it->value;
            }
        }

        // Nothing matched: emit the character as an Invalid token.
        if (!found_four_char_token && !found_three_char_token && !found_two_char_token && !found_one_char_token) {
            consume();
            token_type = TokenType::Invalid;
        }
    }

    // Track brace nesting inside ${...} so that only the matching '}' ends the expression.
    if (!m_template_states.is_empty() && m_template_states.last().in_expr) {
        if (token_type == TokenType::CurlyOpen) {
            m_template_states.last().open_bracket_count++;
        } else if (token_type == TokenType::CurlyClose) {
            m_template_states.last().open_bracket_count--;
        }
    }

    if (m_hit_invalid_unicode.has_value()) {
        value_start = m_hit_invalid_unicode.value() - 1;
        m_current_token = Token(TokenType::Invalid, "Invalid unicode codepoint in source",
            "", // Since the invalid unicode can occur anywhere in the current token the trivia is not correct
            m_source.substring_view(value_start + 1, min(4u, m_source.length() - value_start - 2)),
            m_filename,
            m_line_number,
            m_line_column - 1,
            m_position);
        m_hit_invalid_unicode.clear();
        // Do not produce any further tokens.
        VERIFY(is_eof());
    } else {
        m_current_token = Token(
            token_type,
            token_message,
            m_source.substring_view(trivia_start - 1, value_start - trivia_start),
            m_source.substring_view(value_start - 1, m_position - value_start),
            m_filename,
            value_start_line_number,
            value_start_column_number,
            m_position);
    }

    if (identifier.has_value())
        m_current_token.set_identifier_value(identifier.release_value());

    if constexpr (LEXER_DEBUG) {
        dbgln("------------------------------");
        dbgln("Token: {}", m_current_token.name());
        dbgln("Trivia: _{}_", m_current_token.trivia());
        dbgln("Value: _{}_", m_current_token.value());
        dbgln("Line: {}, Column: {}", m_current_token.line_number(), m_current_token.line_column());
        dbgln("------------------------------");
    }

    return m_current_token;
}
// Re-lexes the current Slash or SlashEquals token as the start of a regex
// literal and returns the replacement token. For SlashEquals the lexer first
// steps back one character so that the '=' becomes part of the regex body.
Token Lexer::force_slash_as_regex()
{
    auto const previous_type = m_current_token.type();
    VERIFY(previous_type == TokenType::Slash || previous_type == TokenType::SlashEquals);

    VERIFY(m_position > 0);
    size_t value_start = m_position - 1;

    if (previous_type == TokenType::SlashEquals) {
        // Rewind onto the '=' that was consumed as part of "/=".
        VERIFY(m_source[value_start - 1] == '=');
        --value_start;
        --m_position;
        m_current_char = '=';
    }

    TokenType const regex_token_type = consume_regex_literal();

    // Trivia and position are inherited from the token being replaced.
    m_current_token = Token(
        regex_token_type,
        "",
        m_current_token.trivia(),
        m_source.substring_view(value_start - 1, m_position - value_start),
        m_filename,
        m_current_token.line_number(),
        m_current_token.line_column(),
        m_position);

    if constexpr (LEXER_DEBUG) {
        dbgln("------------------------------");
        dbgln("Token: {}", m_current_token.name());
        dbgln("Trivia: _{}_", m_current_token.trivia());
        dbgln("Value: _{}_", m_current_token.value());
        dbgln("Line: {}, Column: {}", m_current_token.line_number(), m_current_token.line_column());
        dbgln("------------------------------");
    }

    return m_current_token;
}
// Consumes the body of a regex literal, starting just after the opening '/',
// up to and including the closing '/'.
//
// A '/' inside a character class ([...]) does not terminate the literal.
// Returns RegexLiteral when a closing '/' was found, and
// UnterminatedRegexLiteral when a line terminator or EOF was reached first.
TokenType Lexer::consume_regex_literal()
{
    while (!is_eof()) {
        if (is_line_terminator() || (!m_regex_is_in_character_class && m_current_char == '/'))
            break;

        if (m_current_char == '[')
            m_regex_is_in_character_class = true;
        else if (m_current_char == ']')
            m_regex_is_in_character_class = false;

        // Skip the backslash of an escaped delimiter so that the escaped character
        // consumed below is not interpreted as a delimiter.
        if (match('\\', '/') || match('\\', '[') || match('\\', '\\') || (m_regex_is_in_character_class && match('\\', ']')))
            consume();
        consume();
    }

    if (m_current_char == '/') {
        consume();
        return TokenType::RegexLiteral;
    }

    // The literal may have ended (line terminator / EOF) inside a character class.
    // Reset the flag so that the next regex literal does not start out "in a class".
    m_regex_is_in_character_class = false;
    return TokenType::UnterminatedRegexLiteral;
}
}