2020-03-11 18:27:43 +00:00
|
|
|
/*
|
2021-05-29 10:38:28 +00:00
|
|
|
* Copyright (c) 2020, Stephan Unverwerth <s.unverwerth@serenityos.org>
|
2021-04-22 20:51:19 +00:00
|
|
|
* Copyright (c) 2020-2021, Linus Groh <linusg@serenityos.org>
|
2020-03-11 18:27:43 +00:00
|
|
|
*
|
2021-04-22 08:24:48 +00:00
|
|
|
* SPDX-License-Identifier: BSD-2-Clause
|
2020-03-11 18:27:43 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include "Lexer.h"
|
2021-06-13 08:47:09 +00:00
|
|
|
#include <AK/CharacterTypes.h>
|
2021-01-24 14:28:26 +00:00
|
|
|
#include <AK/Debug.h>
|
2021-08-18 20:34:25 +00:00
|
|
|
#include <AK/GenericLexer.h>
|
2020-03-11 18:27:43 +00:00
|
|
|
#include <AK/HashMap.h>
|
2021-08-14 15:07:47 +00:00
|
|
|
#include <AK/Utf8View.h>
|
|
|
|
#include <LibUnicode/CharacterTypes.h>
|
2020-03-11 18:27:43 +00:00
|
|
|
#include <stdio.h>
|
|
|
|
|
|
|
|
namespace JS {
|
|
|
|
|
|
|
|
// Definitions of the lexer's static lookup tables, shared by all Lexer instances.
// Each table is filled by the Lexer constructor the first time it finds the table empty:
//   s_keywords           - keyword / literal-keyword spelling -> token type
//   s_three_char_tokens  - three-character punctuator -> token type
//   s_two_char_tokens    - two-character punctuator -> token type
//   s_single_char_tokens - single-character punctuator -> token type
HashMap<String, TokenType> Lexer::s_keywords;
|
2020-03-12 12:11:33 +00:00
|
|
|
HashMap<String, TokenType> Lexer::s_three_char_tokens;
|
2020-03-11 18:27:43 +00:00
|
|
|
HashMap<String, TokenType> Lexer::s_two_char_tokens;
|
|
|
|
HashMap<char, TokenType> Lexer::s_single_char_tokens;
|
|
|
|
|
2021-02-28 09:42:34 +00:00
|
|
|
// Creates a lexer over `source`. `filename`, `line_number` and `line_column` seed the position
// bookkeeping. The first construction also fills the static lookup tables (each one only if it
// is still empty), and finally consume() is called once to load the first source character.
Lexer::Lexer(StringView source, StringView filename, size_t line_number, size_t line_column)
    : m_source(source)
    , m_current_token(TokenType::Eof, {}, StringView(nullptr), StringView(nullptr), filename, 0, 0, 0)
    , m_filename(filename)
    , m_line_number(line_number)
    , m_line_column(line_column)
{
    // Row types for the tables below: spelling (or single character) plus the token it maps to.
    struct TextEntry {
        char const* text;
        TokenType type;
    };
    struct CharEntry {
        char character;
        TokenType type;
    };

    if (s_keywords.is_empty()) {
        static constexpr TextEntry keywords[] = {
            { "await", TokenType::Await },
            { "break", TokenType::Break },
            { "case", TokenType::Case },
            { "catch", TokenType::Catch },
            { "class", TokenType::Class },
            { "const", TokenType::Const },
            { "continue", TokenType::Continue },
            { "debugger", TokenType::Debugger },
            { "default", TokenType::Default },
            { "delete", TokenType::Delete },
            { "do", TokenType::Do },
            { "else", TokenType::Else },
            { "enum", TokenType::Enum },
            { "export", TokenType::Export },
            { "extends", TokenType::Extends },
            { "false", TokenType::BoolLiteral },
            { "finally", TokenType::Finally },
            { "for", TokenType::For },
            { "function", TokenType::Function },
            { "if", TokenType::If },
            { "import", TokenType::Import },
            { "in", TokenType::In },
            { "instanceof", TokenType::Instanceof },
            { "let", TokenType::Let },
            { "new", TokenType::New },
            { "null", TokenType::NullLiteral },
            { "return", TokenType::Return },
            { "super", TokenType::Super },
            { "switch", TokenType::Switch },
            { "this", TokenType::This },
            { "throw", TokenType::Throw },
            { "true", TokenType::BoolLiteral },
            { "try", TokenType::Try },
            { "typeof", TokenType::Typeof },
            { "var", TokenType::Var },
            { "void", TokenType::Void },
            { "while", TokenType::While },
            { "with", TokenType::With },
            { "yield", TokenType::Yield },
        };
        for (auto const& keyword : keywords)
            s_keywords.set(keyword.text, keyword.type);
    }

    if (s_three_char_tokens.is_empty()) {
        static constexpr TextEntry three_char_tokens[] = {
            { "===", TokenType::EqualsEqualsEquals },
            { "!==", TokenType::ExclamationMarkEqualsEquals },
            { "**=", TokenType::DoubleAsteriskEquals },
            { "<<=", TokenType::ShiftLeftEquals },
            { ">>=", TokenType::ShiftRightEquals },
            { "&&=", TokenType::DoubleAmpersandEquals },
            { "||=", TokenType::DoublePipeEquals },
            // The question marks are escaped so the literal cannot be read as a trigraph.
            { "\?\?=", TokenType::DoubleQuestionMarkEquals },
            { ">>>", TokenType::UnsignedShiftRight },
            { "...", TokenType::TripleDot },
        };
        for (auto const& token : three_char_tokens)
            s_three_char_tokens.set(token.text, token.type);
    }

    if (s_two_char_tokens.is_empty()) {
        static constexpr TextEntry two_char_tokens[] = {
            { "=>", TokenType::Arrow },
            { "+=", TokenType::PlusEquals },
            { "-=", TokenType::MinusEquals },
            { "*=", TokenType::AsteriskEquals },
            { "/=", TokenType::SlashEquals },
            { "%=", TokenType::PercentEquals },
            { "&=", TokenType::AmpersandEquals },
            { "|=", TokenType::PipeEquals },
            { "^=", TokenType::CaretEquals },
            { "&&", TokenType::DoubleAmpersand },
            { "||", TokenType::DoublePipe },
            { "??", TokenType::DoubleQuestionMark },
            { "**", TokenType::DoubleAsterisk },
            { "==", TokenType::EqualsEquals },
            { "<=", TokenType::LessThanEquals },
            { ">=", TokenType::GreaterThanEquals },
            { "!=", TokenType::ExclamationMarkEquals },
            { "--", TokenType::MinusMinus },
            { "++", TokenType::PlusPlus },
            { "<<", TokenType::ShiftLeft },
            { ">>", TokenType::ShiftRight },
            { "?.", TokenType::QuestionMarkPeriod },
        };
        for (auto const& token : two_char_tokens)
            s_two_char_tokens.set(token.text, token.type);
    }

    if (s_single_char_tokens.is_empty()) {
        static constexpr CharEntry single_char_tokens[] = {
            { '&', TokenType::Ampersand },
            { '*', TokenType::Asterisk },
            { '[', TokenType::BracketOpen },
            { ']', TokenType::BracketClose },
            { '^', TokenType::Caret },
            { ':', TokenType::Colon },
            { ',', TokenType::Comma },
            { '{', TokenType::CurlyOpen },
            { '}', TokenType::CurlyClose },
            { '=', TokenType::Equals },
            { '!', TokenType::ExclamationMark },
            { '-', TokenType::Minus },
            { '(', TokenType::ParenOpen },
            { ')', TokenType::ParenClose },
            { '%', TokenType::Percent },
            { '.', TokenType::Period },
            { '|', TokenType::Pipe },
            { '+', TokenType::Plus },
            { '?', TokenType::QuestionMark },
            { ';', TokenType::Semicolon },
            { '/', TokenType::Slash },
            { '~', TokenType::Tilde },
            { '<', TokenType::LessThan },
            { '>', TokenType::GreaterThan },
        };
        for (auto const& token : single_char_tokens)
            s_single_char_tokens.set(token.character, token.type);
    }

    // Load the first character of the source.
    consume();
}
|
|
|
|
|
|
|
|
void Lexer::consume()
|
|
|
|
{
|
2020-10-21 21:16:45 +00:00
|
|
|
auto did_reach_eof = [this] {
|
|
|
|
if (m_position != m_source.length())
|
|
|
|
return false;
|
2021-06-13 07:15:00 +00:00
|
|
|
m_eof = true;
|
|
|
|
m_current_char = '\0';
|
2020-05-26 17:00:30 +00:00
|
|
|
m_position++;
|
|
|
|
m_line_column++;
|
2020-10-21 21:16:45 +00:00
|
|
|
return true;
|
|
|
|
};
|
|
|
|
|
|
|
|
if (m_position > m_source.length())
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (did_reach_eof())
|
2020-03-11 18:27:43 +00:00
|
|
|
return;
|
|
|
|
|
2020-10-21 21:16:45 +00:00
|
|
|
if (is_line_terminator()) {
|
2021-04-18 16:13:27 +00:00
|
|
|
if constexpr (LEXER_DEBUG) {
|
|
|
|
String type;
|
|
|
|
if (m_current_char == '\n')
|
|
|
|
type = "LINE FEED";
|
|
|
|
else if (m_current_char == '\r')
|
|
|
|
type = "CARRIAGE RETURN";
|
|
|
|
else if (m_source[m_position + 1] == (char)0xa8)
|
|
|
|
type = "LINE SEPARATOR";
|
|
|
|
else
|
|
|
|
type = "PARAGRAPH SEPARATOR";
|
|
|
|
dbgln("Found a line terminator: {}", type);
|
|
|
|
}
|
2020-10-21 21:16:45 +00:00
|
|
|
// This is a three-char line terminator, we need to increase m_position some more.
|
|
|
|
// We might reach EOF and need to check again.
|
|
|
|
if (m_current_char != '\n' && m_current_char != '\r') {
|
|
|
|
m_position += 2;
|
|
|
|
if (did_reach_eof())
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If the previous character is \r and the current one \n we already updated line number
|
|
|
|
// and column - don't do it again. From https://tc39.es/ecma262/#sec-line-terminators:
|
|
|
|
// The sequence <CR><LF> is commonly used as a line terminator.
|
|
|
|
// It should be considered a single SourceCharacter for the purpose of reporting line numbers.
|
|
|
|
auto second_char_of_crlf = m_position > 1 && m_source[m_position - 2] == '\r' && m_current_char == '\n';
|
|
|
|
|
|
|
|
if (!second_char_of_crlf) {
|
|
|
|
m_line_number++;
|
|
|
|
m_line_column = 1;
|
2021-04-18 16:13:27 +00:00
|
|
|
dbgln_if(LEXER_DEBUG, "Incremented line number, now at: line {}, column 1", m_line_number);
|
2020-10-21 21:16:45 +00:00
|
|
|
} else {
|
2021-04-18 16:13:27 +00:00
|
|
|
dbgln_if(LEXER_DEBUG, "Previous was CR, this is LF - not incrementing line number again.");
|
2020-10-21 21:16:45 +00:00
|
|
|
}
|
2021-08-14 15:07:47 +00:00
|
|
|
} else if (is_unicode_character()) {
|
|
|
|
size_t char_size = 1;
|
|
|
|
if ((m_current_char & 64) == 0) {
|
|
|
|
// invalid char
|
|
|
|
} else if ((m_current_char & 32) == 0) {
|
|
|
|
char_size = 2;
|
|
|
|
} else if ((m_current_char & 16) == 0) {
|
|
|
|
char_size = 3;
|
|
|
|
} else if ((m_current_char & 8) == 0) {
|
|
|
|
char_size = 4;
|
|
|
|
}
|
|
|
|
|
|
|
|
VERIFY(char_size > 1);
|
|
|
|
--char_size;
|
|
|
|
|
|
|
|
m_position += char_size;
|
|
|
|
if (did_reach_eof())
|
|
|
|
return;
|
|
|
|
|
|
|
|
m_line_column++;
|
2020-04-05 09:34:03 +00:00
|
|
|
} else {
|
|
|
|
m_line_column++;
|
|
|
|
}
|
|
|
|
|
2020-03-11 18:27:43 +00:00
|
|
|
m_current_char = m_source[m_position++];
|
|
|
|
}
|
|
|
|
|
2021-06-26 14:30:05 +00:00
|
|
|
bool Lexer::consume_decimal_number()
|
|
|
|
{
|
|
|
|
if (!is_ascii_digit(m_current_char))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
while (is_ascii_digit(m_current_char) || match_numeric_literal_separator_followed_by(is_ascii_digit)) {
|
|
|
|
consume();
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2020-10-18 13:32:50 +00:00
|
|
|
bool Lexer::consume_exponent()
|
2020-04-05 12:20:58 +00:00
|
|
|
{
|
|
|
|
consume();
|
|
|
|
if (m_current_char == '-' || m_current_char == '+')
|
|
|
|
consume();
|
2020-10-18 13:32:50 +00:00
|
|
|
|
2021-06-13 08:47:09 +00:00
|
|
|
if (!is_ascii_digit(m_current_char))
|
2020-10-18 13:32:50 +00:00
|
|
|
return false;
|
|
|
|
|
2021-06-26 14:30:05 +00:00
|
|
|
return consume_decimal_number();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns true if `ch` is one of the characters '0' through '7'.
static constexpr bool is_octal_digit(char ch)
{
    if (ch < '0')
        return false;
    return ch <= '7';
}
|
|
|
|
|
|
|
|
bool Lexer::consume_octal_number()
|
|
|
|
{
|
|
|
|
consume();
|
2021-06-26 14:30:05 +00:00
|
|
|
if (!is_octal_digit(m_current_char))
|
2020-10-18 13:32:50 +00:00
|
|
|
return false;
|
|
|
|
|
2021-06-26 14:30:05 +00:00
|
|
|
while (is_octal_digit(m_current_char) || match_numeric_literal_separator_followed_by(is_octal_digit))
|
2020-10-18 13:32:50 +00:00
|
|
|
consume();
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Lexer::consume_hexadecimal_number()
|
|
|
|
{
|
|
|
|
consume();
|
2021-06-13 08:47:09 +00:00
|
|
|
if (!is_ascii_hex_digit(m_current_char))
|
2020-10-18 13:32:50 +00:00
|
|
|
return false;
|
|
|
|
|
2021-06-26 14:30:05 +00:00
|
|
|
while (is_ascii_hex_digit(m_current_char) || match_numeric_literal_separator_followed_by(is_ascii_hex_digit))
|
2020-10-18 13:32:50 +00:00
|
|
|
consume();
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2021-06-26 14:30:05 +00:00
|
|
|
// Returns true if `ch` is the character '0' or '1'.
static constexpr bool is_binary_digit(char ch)
{
    switch (ch) {
    case '0':
    case '1':
        return true;
    default:
        return false;
    }
}
|
|
|
|
|
2020-10-18 13:32:50 +00:00
|
|
|
bool Lexer::consume_binary_number()
|
|
|
|
{
|
|
|
|
consume();
|
2021-06-26 14:30:05 +00:00
|
|
|
if (!is_binary_digit(m_current_char))
|
2020-10-18 13:32:50 +00:00
|
|
|
return false;
|
|
|
|
|
2021-06-26 14:30:05 +00:00
|
|
|
while (is_binary_digit(m_current_char) || match_numeric_literal_separator_followed_by(is_binary_digit))
|
2020-10-18 13:32:50 +00:00
|
|
|
consume();
|
|
|
|
|
|
|
|
return true;
|
2020-04-05 12:20:58 +00:00
|
|
|
}
|
|
|
|
|
2021-06-26 14:30:05 +00:00
|
|
|
template<typename Callback>
|
|
|
|
bool Lexer::match_numeric_literal_separator_followed_by(Callback callback) const
|
|
|
|
{
|
|
|
|
if (m_position >= m_source.length())
|
|
|
|
return false;
|
|
|
|
return m_current_char == '_'
|
|
|
|
&& callback(m_source[m_position]);
|
|
|
|
}
|
|
|
|
|
2020-04-13 17:50:58 +00:00
|
|
|
bool Lexer::match(char a, char b) const
|
|
|
|
{
|
|
|
|
if (m_position >= m_source.length())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return m_current_char == a
|
|
|
|
&& m_source[m_position] == b;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Lexer::match(char a, char b, char c) const
|
|
|
|
{
|
|
|
|
if (m_position + 1 >= m_source.length())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return m_current_char == a
|
|
|
|
&& m_source[m_position] == b
|
|
|
|
&& m_source[m_position + 1] == c;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Lexer::match(char a, char b, char c, char d) const
|
|
|
|
{
|
|
|
|
if (m_position + 2 >= m_source.length())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return m_current_char == a
|
|
|
|
&& m_source[m_position] == b
|
|
|
|
&& m_source[m_position + 1] == c
|
|
|
|
&& m_source[m_position + 2] == d;
|
|
|
|
}
|
|
|
|
|
2020-03-11 18:27:43 +00:00
|
|
|
bool Lexer::is_eof() const
|
|
|
|
{
|
2021-06-13 07:15:00 +00:00
|
|
|
return m_eof;
|
2020-03-11 18:27:43 +00:00
|
|
|
}
|
|
|
|
|
2020-10-21 21:16:45 +00:00
|
|
|
bool Lexer::is_line_terminator() const
|
|
|
|
{
|
|
|
|
if (m_current_char == '\n' || m_current_char == '\r')
|
|
|
|
return true;
|
2021-08-14 15:07:47 +00:00
|
|
|
if (!is_unicode_character())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
auto code_point = current_code_point();
|
|
|
|
return code_point == LINE_SEPARATOR || code_point == PARAGRAPH_SEPARATOR;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Lexer::is_unicode_character() const
|
|
|
|
{
|
|
|
|
return (m_current_char & 128) != 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Decodes the UTF-8 code point that starts at the current character (source index
// m_position - 1). Returns U+FFFD if nothing has been consumed yet (m_position == 0) or if no
// source bytes remain at that index.
u32 Lexer::current_code_point() const
{
    static constexpr u32 replacement_character = 0xFFFD;

    if (m_position != 0) {
        Utf8View remaining_source { m_source.substring_view(m_position - 1) };
        if (!remaining_source.is_empty())
            return *remaining_source.begin();
    }
    return replacement_character;
}
|
|
|
|
|
|
|
|
bool Lexer::is_whitespace() const
|
|
|
|
{
|
|
|
|
if (is_ascii_space(m_current_char))
|
|
|
|
return true;
|
|
|
|
if (!is_unicode_character())
|
|
|
|
return false;
|
|
|
|
auto code_point = current_code_point();
|
|
|
|
if (code_point == NO_BREAK_SPACE)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
static auto space_separator_category = Unicode::general_category_from_string("Space_Separator"sv);
|
|
|
|
if (space_separator_category.has_value())
|
|
|
|
return Unicode::code_point_has_general_category(code_point, *space_separator_category);
|
2020-10-21 21:16:45 +00:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2021-08-18 20:34:25 +00:00
|
|
|
// Tries to read an escaped code point starting at the current character (source index
// m_position - 1). On success, returns the code point and stores the number of characters the
// escape occupies in `identifier_length`; on failure returns an empty Optional and leaves
// `identifier_length` untouched. The lexer's own position is not changed.
Optional<u32> Lexer::is_unicode_escape(size_t& identifier_length) const
{
    GenericLexer escape_lexer(source().substring_view(m_position - 1));

    auto code_point_or_error = escape_lexer.consume_escaped_code_point();
    if (code_point_or_error.is_error())
        return {};

    identifier_length = escape_lexer.tell();
    return code_point_or_error.value();
}
|
|
|
|
|
|
|
|
// Checks whether the current position can start an identifier. Accepts an ASCII letter, '_',
// '$', or a code point with the Unicode "ID_Start" property; a '\' is first resolved through
// is_unicode_escape() and the escaped code point is tested instead.
// Returns the accepted code point, or an empty Optional otherwise. `identifier_length` is set
// to 1, or to the length of the escape when one was read.
Optional<u32> Lexer::is_identifier_start(size_t& identifier_length) const
{
    u32 code_point = current_code_point();
    identifier_length = 1;

    if (code_point == '\\') {
        auto escaped_code_point = is_unicode_escape(identifier_length);
        if (!escaped_code_point.has_value())
            return {};
        code_point = *escaped_code_point;
    }

    // Fast path for the ASCII characters allowed at the start of an identifier.
    auto const is_ascii_start = is_ascii_alpha(code_point) || code_point == '_' || code_point == '$';
    if (is_ascii_start)
        return code_point;

    static auto id_start_category = Unicode::property_from_string("ID_Start"sv);
    if (!id_start_category.has_value())
        return {};
    if (Unicode::code_point_has_property(code_point, *id_start_category))
        return code_point;

    return {};
}
|
|
|
|
|
2021-08-18 20:34:25 +00:00
|
|
|
// Checks whether the current position can continue an identifier. Accepts an ASCII letter or
// digit, '_', '$', ZERO WIDTH NON-JOINER, ZERO WIDTH JOINER, or a code point with the Unicode
// "ID_Continue" property; a '\' is first resolved through is_unicode_escape() and the escaped
// code point is tested instead.
// Returns the accepted code point, or an empty Optional otherwise. `identifier_length` is set
// to 1, or to the length of the escape when one was read.
Optional<u32> Lexer::is_identifier_middle(size_t& identifier_length) const
{
    u32 code_point = current_code_point();
    identifier_length = 1;

    if (code_point == '\\') {
        if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
            code_point = *maybe_code_point;
        else
            return {};
    }

    // '_' (U+005F) has the ID_Continue property, so accepting it here gives the same answer as
    // the lookup below. The explicit check matches is_identifier_start() and keeps identifiers
    // such as "foo_bar" working when the ID_Continue property cannot be resolved.
    if (is_ascii_alphanumeric(code_point) || (code_point == '_') || (code_point == '$') || (code_point == ZERO_WIDTH_NON_JOINER) || (code_point == ZERO_WIDTH_JOINER))
        return code_point;

    static auto id_continue_category = Unicode::property_from_string("ID_Continue"sv);
    if (id_continue_category.has_value() && Unicode::code_point_has_property(code_point, *id_continue_category))
        return code_point;

    return {};
}
|
|
|
|
|
2020-10-29 17:55:24 +00:00
|
|
|
bool Lexer::is_line_comment_start(bool line_has_token_yet) const
|
2020-03-11 18:27:43 +00:00
|
|
|
{
|
2020-10-29 17:55:24 +00:00
|
|
|
return match('/', '/')
|
2021-08-14 15:30:37 +00:00
|
|
|
|| (m_allow_html_comments && match('<', '!', '-', '-'))
|
2020-10-29 17:55:24 +00:00
|
|
|
// "-->" is considered a line comment start if the current line is only whitespace and/or
|
|
|
|
// other block comment(s); or in other words: the current line does not have a token or
|
|
|
|
// ongoing line comment yet
|
2021-08-14 15:30:37 +00:00
|
|
|
|| (m_allow_html_comments && !line_has_token_yet && match('-', '-', '>'))
|
2021-06-18 18:11:26 +00:00
|
|
|
// https://tc39.es/proposal-hashbang/out.html#sec-updated-syntax
|
|
|
|
|| (match('#', '!') && m_position == 1);
|
2020-03-11 18:27:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
bool Lexer::is_block_comment_start() const
|
|
|
|
{
|
2020-04-13 17:50:58 +00:00
|
|
|
return match('/', '*');
|
2020-03-11 18:27:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
bool Lexer::is_block_comment_end() const
|
|
|
|
{
|
2020-04-13 17:50:58 +00:00
|
|
|
return match('*', '/');
|
2020-03-11 18:27:43 +00:00
|
|
|
}
|
|
|
|
|
2020-04-05 12:20:58 +00:00
|
|
|
bool Lexer::is_numeric_literal_start() const
|
|
|
|
{
|
2021-06-13 08:47:09 +00:00
|
|
|
return is_ascii_digit(m_current_char) || (m_current_char == '.' && m_position < m_source.length() && is_ascii_digit(m_source[m_position]));
|
2020-04-05 12:20:58 +00:00
|
|
|
}
|
|
|
|
|
2020-06-03 23:05:49 +00:00
|
|
|
bool Lexer::slash_means_division() const
|
|
|
|
{
|
|
|
|
auto type = m_current_token.type();
|
2020-06-07 18:02:42 +00:00
|
|
|
return type == TokenType::BigIntLiteral
|
2020-08-29 17:40:28 +00:00
|
|
|
|| type == TokenType::BoolLiteral
|
|
|
|
|| type == TokenType::BracketClose
|
2021-07-29 21:28:28 +00:00
|
|
|
|| type == TokenType::CurlyClose
|
2020-08-29 17:40:28 +00:00
|
|
|
|| type == TokenType::Identifier
|
2021-07-29 21:28:28 +00:00
|
|
|
|| type == TokenType::In
|
|
|
|
|| type == TokenType::Instanceof
|
|
|
|
|| type == TokenType::MinusMinus
|
2020-08-29 17:40:28 +00:00
|
|
|
|| type == TokenType::NullLiteral
|
|
|
|
|| type == TokenType::NumericLiteral
|
|
|
|
|| type == TokenType::ParenClose
|
2021-07-29 21:28:28 +00:00
|
|
|
|| type == TokenType::PlusPlus
|
2020-08-29 17:40:28 +00:00
|
|
|
|| type == TokenType::RegexLiteral
|
|
|
|
|| type == TokenType::StringLiteral
|
|
|
|
|| type == TokenType::TemplateLiteralEnd
|
|
|
|
|| type == TokenType::This;
|
2020-06-03 23:05:49 +00:00
|
|
|
}
|
|
|
|
|
2020-03-11 18:27:43 +00:00
|
|
|
Token Lexer::next()
|
|
|
|
{
|
|
|
|
size_t trivia_start = m_position;
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
auto in_template = !m_template_states.is_empty();
|
2020-10-29 17:55:24 +00:00
|
|
|
bool line_has_token_yet = m_line_column > 1;
|
2020-10-26 20:10:50 +00:00
|
|
|
bool unterminated_comment = false;
|
2020-03-11 18:27:43 +00:00
|
|
|
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
if (!in_template || m_template_states.last().in_expr) {
|
|
|
|
// consume whitespace and comments
|
|
|
|
while (true) {
|
2020-10-29 17:55:24 +00:00
|
|
|
if (is_line_terminator()) {
|
|
|
|
line_has_token_yet = false;
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
do {
|
|
|
|
consume();
|
2020-10-29 17:55:24 +00:00
|
|
|
} while (is_line_terminator());
|
2021-08-14 15:07:47 +00:00
|
|
|
} else if (is_whitespace()) {
|
2020-10-29 17:55:24 +00:00
|
|
|
do {
|
|
|
|
consume();
|
2021-08-14 15:07:47 +00:00
|
|
|
} while (is_whitespace());
|
2020-10-29 17:55:24 +00:00
|
|
|
} else if (is_line_comment_start(line_has_token_yet)) {
|
2020-03-11 18:27:43 +00:00
|
|
|
consume();
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
do {
|
|
|
|
consume();
|
2020-10-21 21:16:45 +00:00
|
|
|
} while (!is_eof() && !is_line_terminator());
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
} else if (is_block_comment_start()) {
|
2020-03-11 18:27:43 +00:00
|
|
|
consume();
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
do {
|
|
|
|
consume();
|
|
|
|
} while (!is_eof() && !is_block_comment_end());
|
2020-10-26 20:10:50 +00:00
|
|
|
if (is_eof())
|
|
|
|
unterminated_comment = true;
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
consume(); // consume *
|
2020-10-26 20:10:50 +00:00
|
|
|
if (is_eof())
|
|
|
|
unterminated_comment = true;
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
consume(); // consume /
|
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
2020-03-11 18:27:43 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t value_start = m_position;
|
2020-08-29 17:40:28 +00:00
|
|
|
size_t value_start_line_number = m_line_number;
|
|
|
|
size_t value_start_column_number = m_line_column;
|
2020-03-12 12:11:33 +00:00
|
|
|
auto token_type = TokenType::Invalid;
|
2021-06-22 13:08:40 +00:00
|
|
|
auto did_consume_whitespace_or_comments = trivia_start != value_start;
|
2020-10-26 20:08:01 +00:00
|
|
|
// This is being used to communicate info about invalid tokens to the parser, which then
|
|
|
|
// can turn that into more specific error messages - instead of us having to make up a
|
|
|
|
// bunch of Invalid* tokens (bad numeric literals, unterminated comments etc.)
|
|
|
|
String token_message;
|
2020-03-11 18:27:43 +00:00
|
|
|
|
2021-08-18 20:34:25 +00:00
|
|
|
Optional<FlyString> identifier;
|
|
|
|
size_t identifier_length = 0;
|
|
|
|
|
2021-06-22 13:08:40 +00:00
|
|
|
if (m_current_token.type() == TokenType::RegexLiteral && !is_eof() && is_ascii_alpha(m_current_char) && !did_consume_whitespace_or_comments) {
|
2020-06-03 23:05:49 +00:00
|
|
|
token_type = TokenType::RegexFlags;
|
2021-06-13 08:47:09 +00:00
|
|
|
while (!is_eof() && is_ascii_alpha(m_current_char))
|
2020-06-03 23:05:49 +00:00
|
|
|
consume();
|
|
|
|
} else if (m_current_char == '`') {
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
consume();
|
|
|
|
|
|
|
|
if (!in_template) {
|
|
|
|
token_type = TokenType::TemplateLiteralStart;
|
|
|
|
m_template_states.append({ false, 0 });
|
|
|
|
} else {
|
|
|
|
if (m_template_states.last().in_expr) {
|
|
|
|
m_template_states.append({ false, 0 });
|
|
|
|
token_type = TokenType::TemplateLiteralStart;
|
|
|
|
} else {
|
|
|
|
m_template_states.take_last();
|
|
|
|
token_type = TokenType::TemplateLiteralEnd;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (in_template && m_template_states.last().in_expr && m_template_states.last().open_bracket_count == 0 && m_current_char == '}') {
|
|
|
|
consume();
|
|
|
|
token_type = TokenType::TemplateLiteralExprEnd;
|
|
|
|
m_template_states.last().in_expr = false;
|
|
|
|
} else if (in_template && !m_template_states.last().in_expr) {
|
|
|
|
if (is_eof()) {
|
|
|
|
token_type = TokenType::UnterminatedTemplateLiteral;
|
|
|
|
m_template_states.take_last();
|
|
|
|
} else if (match('$', '{')) {
|
|
|
|
token_type = TokenType::TemplateLiteralExprStart;
|
|
|
|
consume();
|
|
|
|
consume();
|
|
|
|
m_template_states.last().in_expr = true;
|
|
|
|
} else {
|
|
|
|
while (!match('$', '{') && m_current_char != '`' && !is_eof()) {
|
|
|
|
if (match('\\', '$') || match('\\', '`'))
|
|
|
|
consume();
|
|
|
|
consume();
|
|
|
|
}
|
2020-11-29 22:59:06 +00:00
|
|
|
if (is_eof() && !m_template_states.is_empty())
|
|
|
|
token_type = TokenType::UnterminatedTemplateLiteral;
|
|
|
|
else
|
|
|
|
token_type = TokenType::TemplateLiteralString;
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
}
|
2021-08-18 20:34:25 +00:00
|
|
|
} else if (auto code_point = is_identifier_start(identifier_length); code_point.has_value()) {
|
2020-03-11 18:27:43 +00:00
|
|
|
// identifier or keyword
|
2021-08-18 20:34:25 +00:00
|
|
|
StringBuilder builder;
|
2020-03-11 18:27:43 +00:00
|
|
|
do {
|
2021-08-18 20:34:25 +00:00
|
|
|
builder.append_code_point(*code_point);
|
|
|
|
for (size_t i = 0; i < identifier_length; ++i)
|
|
|
|
consume();
|
|
|
|
|
|
|
|
code_point = is_identifier_middle(identifier_length);
|
|
|
|
} while (code_point.has_value());
|
2020-03-11 18:27:43 +00:00
|
|
|
|
2021-08-18 20:34:25 +00:00
|
|
|
identifier = builder.build();
|
|
|
|
if (!m_parsed_identifiers.contains_slow(*identifier))
|
|
|
|
m_parsed_identifiers.append(*identifier);
|
|
|
|
|
|
|
|
auto it = s_keywords.find(identifier->hash(), [&](auto& entry) { return entry.key == identifier; });
|
|
|
|
if (it == s_keywords.end())
|
2020-03-11 18:27:43 +00:00
|
|
|
token_type = TokenType::Identifier;
|
2021-08-18 20:34:25 +00:00
|
|
|
else
|
2020-03-11 18:27:43 +00:00
|
|
|
token_type = it->value;
|
2020-04-05 12:20:58 +00:00
|
|
|
} else if (is_numeric_literal_start()) {
|
2020-06-06 00:14:10 +00:00
|
|
|
token_type = TokenType::NumericLiteral;
|
2020-10-18 13:32:50 +00:00
|
|
|
bool is_invalid_numeric_literal = false;
|
2020-04-05 12:20:58 +00:00
|
|
|
if (m_current_char == '0') {
|
2020-03-11 18:27:43 +00:00
|
|
|
consume();
|
2020-04-05 12:20:58 +00:00
|
|
|
if (m_current_char == '.') {
|
|
|
|
// decimal
|
|
|
|
consume();
|
2021-06-26 14:30:05 +00:00
|
|
|
while (is_ascii_digit(m_current_char) || match_numeric_literal_separator_followed_by(is_ascii_digit))
|
2020-04-05 12:20:58 +00:00
|
|
|
consume();
|
2020-06-06 00:14:10 +00:00
|
|
|
if (m_current_char == 'e' || m_current_char == 'E')
|
2020-10-18 13:32:50 +00:00
|
|
|
is_invalid_numeric_literal = !consume_exponent();
|
2020-04-05 12:20:58 +00:00
|
|
|
} else if (m_current_char == 'e' || m_current_char == 'E') {
|
2020-10-18 13:32:50 +00:00
|
|
|
is_invalid_numeric_literal = !consume_exponent();
|
2020-04-05 12:20:58 +00:00
|
|
|
} else if (m_current_char == 'o' || m_current_char == 'O') {
|
|
|
|
// octal
|
2020-10-18 13:32:50 +00:00
|
|
|
is_invalid_numeric_literal = !consume_octal_number();
|
2021-06-13 23:19:23 +00:00
|
|
|
if (m_current_char == 'n') {
|
|
|
|
consume();
|
|
|
|
token_type = TokenType::BigIntLiteral;
|
|
|
|
}
|
2020-04-05 12:20:58 +00:00
|
|
|
} else if (m_current_char == 'b' || m_current_char == 'B') {
|
|
|
|
// binary
|
2020-10-18 13:32:50 +00:00
|
|
|
is_invalid_numeric_literal = !consume_binary_number();
|
2021-06-13 23:19:23 +00:00
|
|
|
if (m_current_char == 'n') {
|
|
|
|
consume();
|
|
|
|
token_type = TokenType::BigIntLiteral;
|
|
|
|
}
|
2020-04-05 12:20:58 +00:00
|
|
|
} else if (m_current_char == 'x' || m_current_char == 'X') {
|
|
|
|
// hexadecimal
|
2020-10-18 13:32:50 +00:00
|
|
|
is_invalid_numeric_literal = !consume_hexadecimal_number();
|
2021-06-13 23:19:23 +00:00
|
|
|
if (m_current_char == 'n') {
|
|
|
|
consume();
|
|
|
|
token_type = TokenType::BigIntLiteral;
|
|
|
|
}
|
2020-06-06 00:14:10 +00:00
|
|
|
} else if (m_current_char == 'n') {
|
|
|
|
consume();
|
|
|
|
token_type = TokenType::BigIntLiteral;
|
2021-06-13 08:47:09 +00:00
|
|
|
} else if (is_ascii_digit(m_current_char)) {
|
2020-10-19 17:01:28 +00:00
|
|
|
// octal without '0o' prefix. Forbidden in 'strict mode'
|
2020-04-05 12:20:58 +00:00
|
|
|
do {
|
|
|
|
consume();
|
2021-06-26 14:30:05 +00:00
|
|
|
} while (is_ascii_digit(m_current_char) || match_numeric_literal_separator_followed_by(is_ascii_digit));
|
2020-04-05 12:20:58 +00:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// 1...9 or period
|
2021-06-26 14:30:05 +00:00
|
|
|
while (is_ascii_digit(m_current_char) || match_numeric_literal_separator_followed_by(is_ascii_digit))
|
2020-04-05 12:20:58 +00:00
|
|
|
consume();
|
2020-06-06 00:14:10 +00:00
|
|
|
if (m_current_char == 'n') {
|
2020-04-05 12:20:58 +00:00
|
|
|
consume();
|
2020-06-06 00:14:10 +00:00
|
|
|
token_type = TokenType::BigIntLiteral;
|
|
|
|
} else {
|
|
|
|
if (m_current_char == '.') {
|
2020-04-05 12:20:58 +00:00
|
|
|
consume();
|
2021-06-26 14:30:05 +00:00
|
|
|
while (is_ascii_digit(m_current_char) || match_numeric_literal_separator_followed_by(is_ascii_digit))
|
2020-06-06 00:14:10 +00:00
|
|
|
consume();
|
2020-04-05 12:20:58 +00:00
|
|
|
}
|
2020-06-06 00:14:10 +00:00
|
|
|
if (m_current_char == 'e' || m_current_char == 'E')
|
2020-10-18 13:32:50 +00:00
|
|
|
is_invalid_numeric_literal = !consume_exponent();
|
2020-04-05 12:20:58 +00:00
|
|
|
}
|
2020-03-11 18:27:43 +00:00
|
|
|
}
|
2020-10-26 20:13:48 +00:00
|
|
|
if (is_invalid_numeric_literal) {
|
2020-10-18 13:32:50 +00:00
|
|
|
token_type = TokenType::Invalid;
|
2020-10-26 20:13:48 +00:00
|
|
|
token_message = "Invalid numeric literal";
|
|
|
|
}
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
} else if (m_current_char == '"' || m_current_char == '\'') {
|
2020-03-14 10:45:25 +00:00
|
|
|
char stop_char = m_current_char;
|
2020-03-12 12:05:06 +00:00
|
|
|
consume();
|
2020-10-21 21:16:45 +00:00
|
|
|
// Note: LS/PS line terminators are allowed in string literals.
|
|
|
|
while (m_current_char != stop_char && m_current_char != '\r' && m_current_char != '\n' && !is_eof()) {
|
2020-03-14 10:45:25 +00:00
|
|
|
if (m_current_char == '\\') {
|
|
|
|
consume();
|
2021-07-24 22:50:14 +00:00
|
|
|
if (m_current_char == '\r' && m_position < m_source.length() && m_source[m_position] == '\n') {
|
|
|
|
consume();
|
|
|
|
}
|
2020-03-14 10:45:25 +00:00
|
|
|
}
|
2020-03-12 12:05:06 +00:00
|
|
|
consume();
|
|
|
|
}
|
2020-03-14 10:45:25 +00:00
|
|
|
if (m_current_char != stop_char) {
|
|
|
|
token_type = TokenType::UnterminatedStringLiteral;
|
|
|
|
} else {
|
|
|
|
consume();
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
token_type = TokenType::StringLiteral;
|
2020-03-14 10:45:25 +00:00
|
|
|
}
|
2020-06-08 02:32:25 +00:00
|
|
|
} else if (m_current_char == '/' && !slash_means_division()) {
|
|
|
|
consume();
|
2021-07-29 21:28:28 +00:00
|
|
|
token_type = consume_regex_literal();
|
2021-06-13 07:15:00 +00:00
|
|
|
} else if (m_eof) {
|
2020-10-26 20:10:50 +00:00
|
|
|
if (unterminated_comment) {
|
|
|
|
token_type = TokenType::Invalid;
|
|
|
|
token_message = "Unterminated multi-line comment";
|
|
|
|
} else {
|
|
|
|
token_type = TokenType::Eof;
|
|
|
|
}
|
2020-03-11 18:27:43 +00:00
|
|
|
} else {
|
2020-03-12 22:02:41 +00:00
|
|
|
// There is only one four-char operator: >>>=
|
|
|
|
bool found_four_char_token = false;
|
2020-04-13 17:50:58 +00:00
|
|
|
if (match('>', '>', '>', '=')) {
|
|
|
|
found_four_char_token = true;
|
|
|
|
consume();
|
|
|
|
consume();
|
|
|
|
consume();
|
|
|
|
consume();
|
|
|
|
token_type = TokenType::UnsignedShiftRightEquals;
|
2020-03-12 22:02:41 +00:00
|
|
|
}
|
|
|
|
|
2020-03-12 12:11:33 +00:00
|
|
|
bool found_three_char_token = false;
|
2020-03-12 22:02:41 +00:00
|
|
|
if (!found_four_char_token && m_position + 1 < m_source.length()) {
|
2020-10-21 21:16:45 +00:00
|
|
|
auto three_chars_view = m_source.substring_view(m_position - 1, 3);
|
2020-10-17 21:44:41 +00:00
|
|
|
auto it = s_three_char_tokens.find(three_chars_view.hash(), [&](auto& entry) { return entry.key == three_chars_view; });
|
2020-03-12 12:11:33 +00:00
|
|
|
if (it != s_three_char_tokens.end()) {
|
|
|
|
found_three_char_token = true;
|
|
|
|
consume();
|
|
|
|
consume();
|
|
|
|
consume();
|
|
|
|
token_type = it->value;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-03-11 18:27:43 +00:00
|
|
|
bool found_two_char_token = false;
|
2020-03-12 22:02:41 +00:00
|
|
|
if (!found_four_char_token && !found_three_char_token && m_position < m_source.length()) {
|
2020-10-21 21:16:45 +00:00
|
|
|
auto two_chars_view = m_source.substring_view(m_position - 1, 2);
|
2020-10-17 21:44:41 +00:00
|
|
|
auto it = s_two_char_tokens.find(two_chars_view.hash(), [&](auto& entry) { return entry.key == two_chars_view; });
|
2020-03-11 18:27:43 +00:00
|
|
|
if (it != s_two_char_tokens.end()) {
|
2020-12-14 13:05:24 +00:00
|
|
|
// OptionalChainingPunctuator :: ?. [lookahead ∉ DecimalDigit]
|
2021-06-13 08:47:09 +00:00
|
|
|
if (!(it->value == TokenType::QuestionMarkPeriod && m_position + 1 < m_source.length() && is_ascii_digit(m_source[m_position + 1]))) {
|
2020-12-14 13:05:24 +00:00
|
|
|
found_two_char_token = true;
|
|
|
|
consume();
|
|
|
|
consume();
|
|
|
|
token_type = it->value;
|
|
|
|
}
|
2020-03-11 18:27:43 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-03-12 12:11:33 +00:00
|
|
|
bool found_one_char_token = false;
|
2020-03-12 22:02:41 +00:00
|
|
|
if (!found_four_char_token && !found_three_char_token && !found_two_char_token) {
|
2020-03-11 18:27:43 +00:00
|
|
|
auto it = s_single_char_tokens.find(m_current_char);
|
|
|
|
if (it != s_single_char_tokens.end()) {
|
2020-03-12 12:11:33 +00:00
|
|
|
found_one_char_token = true;
|
2020-03-11 18:27:43 +00:00
|
|
|
consume();
|
|
|
|
token_type = it->value;
|
|
|
|
}
|
|
|
|
}
|
2020-03-12 12:11:33 +00:00
|
|
|
|
2020-03-12 22:02:41 +00:00
|
|
|
if (!found_four_char_token && !found_three_char_token && !found_two_char_token && !found_one_char_token) {
|
2020-03-12 12:11:33 +00:00
|
|
|
consume();
|
|
|
|
token_type = TokenType::Invalid;
|
|
|
|
}
|
2020-03-11 18:27:43 +00:00
|
|
|
}
|
|
|
|
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
if (!m_template_states.is_empty() && m_template_states.last().in_expr) {
|
|
|
|
if (token_type == TokenType::CurlyOpen) {
|
|
|
|
m_template_states.last().open_bracket_count++;
|
|
|
|
} else if (token_type == TokenType::CurlyClose) {
|
|
|
|
m_template_states.last().open_bracket_count--;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-08-18 20:34:25 +00:00
|
|
|
if (identifier.has_value()) {
|
|
|
|
m_current_token = Token(
|
|
|
|
token_type,
|
|
|
|
token_message,
|
|
|
|
m_source.substring_view(trivia_start - 1, value_start - trivia_start),
|
|
|
|
m_source.substring_view(value_start - 1, m_position - value_start),
|
|
|
|
identifier.release_value(),
|
|
|
|
m_filename,
|
|
|
|
value_start_line_number,
|
|
|
|
value_start_column_number,
|
|
|
|
m_position);
|
|
|
|
} else {
|
|
|
|
m_current_token = Token(
|
|
|
|
token_type,
|
|
|
|
token_message,
|
|
|
|
m_source.substring_view(trivia_start - 1, value_start - trivia_start),
|
|
|
|
m_source.substring_view(value_start - 1, m_position - value_start),
|
|
|
|
m_filename,
|
|
|
|
value_start_line_number,
|
|
|
|
value_start_column_number,
|
|
|
|
m_position);
|
|
|
|
}
|
2020-03-11 18:27:43 +00:00
|
|
|
|
2021-04-18 16:13:27 +00:00
|
|
|
if constexpr (LEXER_DEBUG) {
|
|
|
|
dbgln("------------------------------");
|
|
|
|
dbgln("Token: {}", m_current_token.name());
|
|
|
|
dbgln("Trivia: _{}_", m_current_token.trivia());
|
|
|
|
dbgln("Value: _{}_", m_current_token.value());
|
|
|
|
dbgln("Line: {}, Column: {}", m_current_token.line_number(), m_current_token.line_column());
|
|
|
|
dbgln("------------------------------");
|
|
|
|
}
|
2020-10-21 21:16:45 +00:00
|
|
|
|
2020-03-11 18:27:43 +00:00
|
|
|
return m_current_token;
|
|
|
|
}
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
|
2021-07-29 21:28:28 +00:00
|
|
|
// Re-lexes the current token as a regular expression literal.
//
// Precondition (enforced with VERIFY): the current token is either Slash or SlashEquals.
// For SlashEquals the lexer position is moved back by one character so that the '=' is
// scanned again as the first character of the regex body.
//
// The replacement token keeps the trivia, line number and line column of the token it
// replaces; its type is whatever consume_regex_literal() reports.
//
// Returns a copy of the new current token.
Token Lexer::force_slash_as_regex()
{
    VERIFY(m_current_token.type() == TokenType::Slash || m_current_token.type() == TokenType::SlashEquals);

    bool const is_slash_equals = m_current_token.type() == TokenType::SlashEquals;

    VERIFY(m_position > 0);
    size_t value_start = m_position - 1;

    if (is_slash_equals) {
        // Un-consume the '=' so it becomes part of the regex body.
        VERIFY(m_source[value_start - 1] == '=');
        value_start -= 1;
        m_position -= 1;
        m_current_char = '=';
    }

    auto const regex_type = consume_regex_literal();

    // Same "- 1" offset convention that next() uses when it slices a token value out of m_source.
    auto const regex_value = m_source.substring_view(value_start - 1, m_position - value_start);

    m_current_token = Token(
        regex_type,
        "",
        m_current_token.trivia(),
        regex_value,
        m_filename,
        m_current_token.line_number(),
        m_current_token.line_column(),
        m_position);

    if constexpr (LEXER_DEBUG) {
        dbgln("------------------------------");
        dbgln("Token: {}", m_current_token.name());
        dbgln("Trivia: _{}_", m_current_token.trivia());
        dbgln("Value: _{}_", m_current_token.value());
        dbgln("Line: {}, Column: {}", m_current_token.line_number(), m_current_token.line_column());
        dbgln("------------------------------");
    }

    return m_current_token;
}
|
|
|
|
|
|
|
|
// Consumes the body of a regular expression literal, including its closing '/'.
//
// Scanning stops at end of input, at a line terminator, or at a '/' that is seen while
// not inside a '[...]' character class. An unescaped '[' sets
// m_regex_is_in_character_class and an unescaped ']' clears it. A backslash followed by
// '/', '[' or '\' (or by ']' while inside a character class) is consumed together with
// the escaped character, so the escaped character neither ends the literal nor changes
// the character-class state.
//
// Returns TokenType::RegexLiteral when the scan stopped on '/' (which is then consumed),
// and TokenType::UnterminatedRegexLiteral otherwise.
//
// NOTE(review): m_regex_is_in_character_class is not reset in this function, so a literal
// that ends while still inside '[' leaves it set for the next call — confirm this is intended.
TokenType Lexer::consume_regex_literal()
{
    while (!is_eof()) {
        if (is_line_terminator() || (!m_regex_is_in_character_class && m_current_char == '/'))
            break;

        if (m_current_char == '[')
            m_regex_is_in_character_class = true;
        else if (m_current_char == ']')
            m_regex_is_in_character_class = false;

        // Skip the backslash of a recognised escape; the escaped character itself is
        // consumed by the unconditional consume() below.
        if (match('\\', '/') || match('\\', '[') || match('\\', '\\') || (m_regex_is_in_character_class && match('\\', ']')))
            consume();
        consume();
    }

    if (m_current_char == '/') {
        consume();
        return TokenType::RegexLiteral;
    }

    return TokenType::UnterminatedRegexLiteral;
}
|
|
|
|
|
2020-03-11 18:27:43 +00:00
|
|
|
}
|