2020-03-11 18:27:43 +00:00
|
|
|
/*
|
2021-05-29 10:38:28 +00:00
|
|
|
* Copyright (c) 2020, Stephan Unverwerth <s.unverwerth@serenityos.org>
|
2021-04-22 20:51:19 +00:00
|
|
|
* Copyright (c) 2020-2021, Linus Groh <linusg@serenityos.org>
|
2020-03-11 18:27:43 +00:00
|
|
|
*
|
2021-04-22 08:24:48 +00:00
|
|
|
* SPDX-License-Identifier: BSD-2-Clause
|
2020-03-11 18:27:43 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include "Lexer.h"
|
2021-06-13 08:47:09 +00:00
|
|
|
#include <AK/CharacterTypes.h>
|
2021-01-24 14:28:26 +00:00
|
|
|
#include <AK/Debug.h>
|
2020-03-11 18:27:43 +00:00
|
|
|
#include <AK/HashMap.h>
|
|
|
|
#include <stdio.h>
|
|
|
|
|
|
|
|
namespace JS {
|
|
|
|
|
|
|
|
// Storage for the Lexer's static token lookup tables. They start out empty; the Lexer
// constructor fills each one in the first time it finds it empty.

// Reserved word / literal text -> token type.
HashMap<String, TokenType> Lexer::s_keywords;
// Three-character punctuators -> token type.
HashMap<String, TokenType> Lexer::s_three_char_tokens;
// Two-character punctuators -> token type.
HashMap<String, TokenType> Lexer::s_two_char_tokens;
// Single-character punctuators -> token type.
HashMap<char, TokenType> Lexer::s_single_char_tokens;
|
|
|
|
|
2021-02-28 09:42:34 +00:00
|
|
|
// Creates a lexer over `source`.
//
// `filename`, `line_number` and `line_column` seed the position bookkeeping; the initial
// current token is an Eof placeholder carrying `filename`.
//
// Each static lookup table (keywords, three-/two-/single-character tokens) is populated
// here the first time it is found empty. Finally consume() is called once so that
// m_current_char holds the first character of the source.
Lexer::Lexer(StringView source, StringView filename, size_t line_number, size_t line_column)
    : m_source(source)
    , m_current_token(TokenType::Eof, {}, StringView(nullptr), StringView(nullptr), filename, 0, 0, 0)
    , m_filename(filename)
    , m_line_number(line_number)
    , m_line_column(line_column)
{
    // Row types for the initialization tables below.
    struct TextEntry {
        const char* text;
        TokenType type;
    };
    struct CharEntry {
        char character;
        TokenType type;
    };

    if (s_keywords.is_empty()) {
        static constexpr TextEntry keyword_table[] = {
            { "await", TokenType::Await },
            { "break", TokenType::Break },
            { "case", TokenType::Case },
            { "catch", TokenType::Catch },
            { "class", TokenType::Class },
            { "const", TokenType::Const },
            { "continue", TokenType::Continue },
            { "debugger", TokenType::Debugger },
            { "default", TokenType::Default },
            { "delete", TokenType::Delete },
            { "do", TokenType::Do },
            { "else", TokenType::Else },
            { "enum", TokenType::Enum },
            { "export", TokenType::Export },
            { "extends", TokenType::Extends },
            { "false", TokenType::BoolLiteral },
            { "finally", TokenType::Finally },
            { "for", TokenType::For },
            { "function", TokenType::Function },
            { "if", TokenType::If },
            { "import", TokenType::Import },
            { "in", TokenType::In },
            { "instanceof", TokenType::Instanceof },
            { "let", TokenType::Let },
            { "new", TokenType::New },
            { "null", TokenType::NullLiteral },
            { "return", TokenType::Return },
            { "super", TokenType::Super },
            { "switch", TokenType::Switch },
            { "this", TokenType::This },
            { "throw", TokenType::Throw },
            { "true", TokenType::BoolLiteral },
            { "try", TokenType::Try },
            { "typeof", TokenType::Typeof },
            { "var", TokenType::Var },
            { "void", TokenType::Void },
            { "while", TokenType::While },
            { "with", TokenType::With },
            { "yield", TokenType::Yield },
        };
        for (const auto& entry : keyword_table)
            s_keywords.set(entry.text, entry.type);
    }

    if (s_three_char_tokens.is_empty()) {
        static constexpr TextEntry three_char_table[] = {
            { "===", TokenType::EqualsEqualsEquals },
            { "!==", TokenType::ExclamationMarkEqualsEquals },
            { "**=", TokenType::DoubleAsteriskEquals },
            { "<<=", TokenType::ShiftLeftEquals },
            { ">>=", TokenType::ShiftRightEquals },
            { "&&=", TokenType::DoubleAmpersandEquals },
            { "||=", TokenType::DoublePipeEquals },
            // The question marks are escaped so the literal cannot be read as a trigraph.
            { "\?\?=", TokenType::DoubleQuestionMarkEquals },
            { ">>>", TokenType::UnsignedShiftRight },
            { "...", TokenType::TripleDot },
        };
        for (const auto& entry : three_char_table)
            s_three_char_tokens.set(entry.text, entry.type);
    }

    if (s_two_char_tokens.is_empty()) {
        static constexpr TextEntry two_char_table[] = {
            { "=>", TokenType::Arrow },
            { "+=", TokenType::PlusEquals },
            { "-=", TokenType::MinusEquals },
            { "*=", TokenType::AsteriskEquals },
            { "/=", TokenType::SlashEquals },
            { "%=", TokenType::PercentEquals },
            { "&=", TokenType::AmpersandEquals },
            { "|=", TokenType::PipeEquals },
            { "^=", TokenType::CaretEquals },
            { "&&", TokenType::DoubleAmpersand },
            { "||", TokenType::DoublePipe },
            { "??", TokenType::DoubleQuestionMark },
            { "**", TokenType::DoubleAsterisk },
            { "==", TokenType::EqualsEquals },
            { "<=", TokenType::LessThanEquals },
            { ">=", TokenType::GreaterThanEquals },
            { "!=", TokenType::ExclamationMarkEquals },
            { "--", TokenType::MinusMinus },
            { "++", TokenType::PlusPlus },
            { "<<", TokenType::ShiftLeft },
            { ">>", TokenType::ShiftRight },
            { "?.", TokenType::QuestionMarkPeriod },
        };
        for (const auto& entry : two_char_table)
            s_two_char_tokens.set(entry.text, entry.type);
    }

    if (s_single_char_tokens.is_empty()) {
        static constexpr CharEntry single_char_table[] = {
            { '&', TokenType::Ampersand },
            { '*', TokenType::Asterisk },
            { '[', TokenType::BracketOpen },
            { ']', TokenType::BracketClose },
            { '^', TokenType::Caret },
            { ':', TokenType::Colon },
            { ',', TokenType::Comma },
            { '{', TokenType::CurlyOpen },
            { '}', TokenType::CurlyClose },
            { '=', TokenType::Equals },
            { '!', TokenType::ExclamationMark },
            { '-', TokenType::Minus },
            { '(', TokenType::ParenOpen },
            { ')', TokenType::ParenClose },
            { '%', TokenType::Percent },
            { '.', TokenType::Period },
            { '|', TokenType::Pipe },
            { '+', TokenType::Plus },
            { '?', TokenType::QuestionMark },
            { ';', TokenType::Semicolon },
            { '/', TokenType::Slash },
            { '~', TokenType::Tilde },
            { '<', TokenType::LessThan },
            { '>', TokenType::GreaterThan },
        };
        for (const auto& entry : single_char_table)
            s_single_char_tokens.set(entry.character, entry.type);
    }

    // Load the first character of the source into m_current_char.
    consume();
}
|
|
|
|
|
|
|
|
void Lexer::consume()
|
|
|
|
{
|
2020-10-21 21:16:45 +00:00
|
|
|
auto did_reach_eof = [this] {
|
|
|
|
if (m_position != m_source.length())
|
|
|
|
return false;
|
2021-06-13 07:15:00 +00:00
|
|
|
m_eof = true;
|
|
|
|
m_current_char = '\0';
|
2020-05-26 17:00:30 +00:00
|
|
|
m_position++;
|
|
|
|
m_line_column++;
|
2020-10-21 21:16:45 +00:00
|
|
|
return true;
|
|
|
|
};
|
|
|
|
|
|
|
|
if (m_position > m_source.length())
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (did_reach_eof())
|
2020-03-11 18:27:43 +00:00
|
|
|
return;
|
|
|
|
|
2020-10-21 21:16:45 +00:00
|
|
|
if (is_line_terminator()) {
|
2021-04-18 16:13:27 +00:00
|
|
|
if constexpr (LEXER_DEBUG) {
|
|
|
|
String type;
|
|
|
|
if (m_current_char == '\n')
|
|
|
|
type = "LINE FEED";
|
|
|
|
else if (m_current_char == '\r')
|
|
|
|
type = "CARRIAGE RETURN";
|
|
|
|
else if (m_source[m_position + 1] == (char)0xa8)
|
|
|
|
type = "LINE SEPARATOR";
|
|
|
|
else
|
|
|
|
type = "PARAGRAPH SEPARATOR";
|
|
|
|
dbgln("Found a line terminator: {}", type);
|
|
|
|
}
|
2020-10-21 21:16:45 +00:00
|
|
|
// This is a three-char line terminator, we need to increase m_position some more.
|
|
|
|
// We might reach EOF and need to check again.
|
|
|
|
if (m_current_char != '\n' && m_current_char != '\r') {
|
|
|
|
m_position += 2;
|
|
|
|
if (did_reach_eof())
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If the previous character is \r and the current one \n we already updated line number
|
|
|
|
// and column - don't do it again. From https://tc39.es/ecma262/#sec-line-terminators:
|
|
|
|
// The sequence <CR><LF> is commonly used as a line terminator.
|
|
|
|
// It should be considered a single SourceCharacter for the purpose of reporting line numbers.
|
|
|
|
auto second_char_of_crlf = m_position > 1 && m_source[m_position - 2] == '\r' && m_current_char == '\n';
|
|
|
|
|
|
|
|
if (!second_char_of_crlf) {
|
|
|
|
m_line_number++;
|
|
|
|
m_line_column = 1;
|
2021-04-18 16:13:27 +00:00
|
|
|
dbgln_if(LEXER_DEBUG, "Incremented line number, now at: line {}, column 1", m_line_number);
|
2020-10-21 21:16:45 +00:00
|
|
|
} else {
|
2021-04-18 16:13:27 +00:00
|
|
|
dbgln_if(LEXER_DEBUG, "Previous was CR, this is LF - not incrementing line number again.");
|
2020-10-21 21:16:45 +00:00
|
|
|
}
|
2020-04-05 09:34:03 +00:00
|
|
|
} else {
|
|
|
|
m_line_column++;
|
|
|
|
}
|
|
|
|
|
2020-03-11 18:27:43 +00:00
|
|
|
m_current_char = m_source[m_position++];
|
|
|
|
}
|
|
|
|
|
2021-06-26 14:30:05 +00:00
|
|
|
bool Lexer::consume_decimal_number()
|
|
|
|
{
|
|
|
|
if (!is_ascii_digit(m_current_char))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
while (is_ascii_digit(m_current_char) || match_numeric_literal_separator_followed_by(is_ascii_digit)) {
|
|
|
|
consume();
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2020-10-18 13:32:50 +00:00
|
|
|
bool Lexer::consume_exponent()
|
2020-04-05 12:20:58 +00:00
|
|
|
{
|
|
|
|
consume();
|
|
|
|
if (m_current_char == '-' || m_current_char == '+')
|
|
|
|
consume();
|
2020-10-18 13:32:50 +00:00
|
|
|
|
2021-06-13 08:47:09 +00:00
|
|
|
if (!is_ascii_digit(m_current_char))
|
2020-10-18 13:32:50 +00:00
|
|
|
return false;
|
|
|
|
|
2021-06-26 14:30:05 +00:00
|
|
|
return consume_decimal_number();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns true if `ch` is one of the characters '0' through '7'.
static constexpr bool is_octal_digit(char ch)
{
    if (ch < '0')
        return false;
    return ch <= '7';
}
|
|
|
|
|
|
|
|
bool Lexer::consume_octal_number()
|
|
|
|
{
|
|
|
|
consume();
|
2021-06-26 14:30:05 +00:00
|
|
|
if (!is_octal_digit(m_current_char))
|
2020-10-18 13:32:50 +00:00
|
|
|
return false;
|
|
|
|
|
2021-06-26 14:30:05 +00:00
|
|
|
while (is_octal_digit(m_current_char) || match_numeric_literal_separator_followed_by(is_octal_digit))
|
2020-10-18 13:32:50 +00:00
|
|
|
consume();
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Lexer::consume_hexadecimal_number()
|
|
|
|
{
|
|
|
|
consume();
|
2021-06-13 08:47:09 +00:00
|
|
|
if (!is_ascii_hex_digit(m_current_char))
|
2020-10-18 13:32:50 +00:00
|
|
|
return false;
|
|
|
|
|
2021-06-26 14:30:05 +00:00
|
|
|
while (is_ascii_hex_digit(m_current_char) || match_numeric_literal_separator_followed_by(is_ascii_hex_digit))
|
2020-10-18 13:32:50 +00:00
|
|
|
consume();
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2021-06-26 14:30:05 +00:00
|
|
|
// Returns true if `ch` is the character '0' or '1'.
static constexpr bool is_binary_digit(char ch)
{
    switch (ch) {
    case '0':
    case '1':
        return true;
    default:
        return false;
    }
}
|
|
|
|
|
2020-10-18 13:32:50 +00:00
|
|
|
bool Lexer::consume_binary_number()
|
|
|
|
{
|
|
|
|
consume();
|
2021-06-26 14:30:05 +00:00
|
|
|
if (!is_binary_digit(m_current_char))
|
2020-10-18 13:32:50 +00:00
|
|
|
return false;
|
|
|
|
|
2021-06-26 14:30:05 +00:00
|
|
|
while (is_binary_digit(m_current_char) || match_numeric_literal_separator_followed_by(is_binary_digit))
|
2020-10-18 13:32:50 +00:00
|
|
|
consume();
|
|
|
|
|
|
|
|
return true;
|
2020-04-05 12:20:58 +00:00
|
|
|
}
|
|
|
|
|
2021-06-26 14:30:05 +00:00
|
|
|
template<typename Callback>
|
|
|
|
bool Lexer::match_numeric_literal_separator_followed_by(Callback callback) const
|
|
|
|
{
|
|
|
|
if (m_position >= m_source.length())
|
|
|
|
return false;
|
|
|
|
return m_current_char == '_'
|
|
|
|
&& callback(m_source[m_position]);
|
|
|
|
}
|
|
|
|
|
2020-04-13 17:50:58 +00:00
|
|
|
bool Lexer::match(char a, char b) const
|
|
|
|
{
|
|
|
|
if (m_position >= m_source.length())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return m_current_char == a
|
|
|
|
&& m_source[m_position] == b;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Lexer::match(char a, char b, char c) const
|
|
|
|
{
|
|
|
|
if (m_position + 1 >= m_source.length())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return m_current_char == a
|
|
|
|
&& m_source[m_position] == b
|
|
|
|
&& m_source[m_position + 1] == c;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Lexer::match(char a, char b, char c, char d) const
|
|
|
|
{
|
|
|
|
if (m_position + 2 >= m_source.length())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return m_current_char == a
|
|
|
|
&& m_source[m_position] == b
|
|
|
|
&& m_source[m_position + 1] == c
|
|
|
|
&& m_source[m_position + 2] == d;
|
|
|
|
}
|
|
|
|
|
2020-03-11 18:27:43 +00:00
|
|
|
bool Lexer::is_eof() const
|
|
|
|
{
|
2021-06-13 07:15:00 +00:00
|
|
|
return m_eof;
|
2020-03-11 18:27:43 +00:00
|
|
|
}
|
|
|
|
|
2020-10-21 21:16:45 +00:00
|
|
|
bool Lexer::is_line_terminator() const
|
|
|
|
{
|
|
|
|
if (m_current_char == '\n' || m_current_char == '\r')
|
|
|
|
return true;
|
2020-11-25 21:33:48 +00:00
|
|
|
if (m_position > 0 && m_position + 1 < m_source.length()) {
|
2020-10-21 21:16:45 +00:00
|
|
|
auto three_chars_view = m_source.substring_view(m_position - 1, 3);
|
|
|
|
return (three_chars_view == LINE_SEPARATOR) || (three_chars_view == PARAGRAPH_SEPARATOR);
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2020-03-11 18:27:43 +00:00
|
|
|
bool Lexer::is_identifier_start() const
|
|
|
|
{
|
2021-06-13 08:47:09 +00:00
|
|
|
return is_ascii_alpha(m_current_char) || m_current_char == '_' || m_current_char == '$';
|
2020-03-11 18:27:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
bool Lexer::is_identifier_middle() const
|
|
|
|
{
|
2021-06-13 08:47:09 +00:00
|
|
|
return is_identifier_start() || is_ascii_digit(m_current_char);
|
2020-03-11 18:27:43 +00:00
|
|
|
}
|
|
|
|
|
2020-10-29 17:55:24 +00:00
|
|
|
bool Lexer::is_line_comment_start(bool line_has_token_yet) const
|
2020-03-11 18:27:43 +00:00
|
|
|
{
|
2020-10-29 17:55:24 +00:00
|
|
|
return match('/', '/')
|
2021-08-14 15:30:37 +00:00
|
|
|
|| (m_allow_html_comments && match('<', '!', '-', '-'))
|
2020-10-29 17:55:24 +00:00
|
|
|
// "-->" is considered a line comment start if the current line is only whitespace and/or
|
|
|
|
// other block comment(s); or in other words: the current line does not have a token or
|
|
|
|
// ongoing line comment yet
|
2021-08-14 15:30:37 +00:00
|
|
|
|| (m_allow_html_comments && !line_has_token_yet && match('-', '-', '>'))
|
2021-06-18 18:11:26 +00:00
|
|
|
// https://tc39.es/proposal-hashbang/out.html#sec-updated-syntax
|
|
|
|
|| (match('#', '!') && m_position == 1);
|
2020-03-11 18:27:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
bool Lexer::is_block_comment_start() const
|
|
|
|
{
|
2020-04-13 17:50:58 +00:00
|
|
|
return match('/', '*');
|
2020-03-11 18:27:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
bool Lexer::is_block_comment_end() const
|
|
|
|
{
|
2020-04-13 17:50:58 +00:00
|
|
|
return match('*', '/');
|
2020-03-11 18:27:43 +00:00
|
|
|
}
|
|
|
|
|
2020-04-05 12:20:58 +00:00
|
|
|
bool Lexer::is_numeric_literal_start() const
|
|
|
|
{
|
2021-06-13 08:47:09 +00:00
|
|
|
return is_ascii_digit(m_current_char) || (m_current_char == '.' && m_position < m_source.length() && is_ascii_digit(m_source[m_position]));
|
2020-04-05 12:20:58 +00:00
|
|
|
}
|
|
|
|
|
2020-06-03 23:05:49 +00:00
|
|
|
bool Lexer::slash_means_division() const
|
|
|
|
{
|
|
|
|
auto type = m_current_token.type();
|
2020-06-07 18:02:42 +00:00
|
|
|
return type == TokenType::BigIntLiteral
|
2020-08-29 17:40:28 +00:00
|
|
|
|| type == TokenType::BoolLiteral
|
|
|
|
|| type == TokenType::BracketClose
|
|
|
|
|| type == TokenType::Identifier
|
|
|
|
|| type == TokenType::NullLiteral
|
|
|
|
|| type == TokenType::NumericLiteral
|
|
|
|
|| type == TokenType::ParenClose
|
|
|
|
|| type == TokenType::RegexLiteral
|
|
|
|
|| type == TokenType::StringLiteral
|
|
|
|
|| type == TokenType::TemplateLiteralEnd
|
|
|
|
|| type == TokenType::This;
|
2020-06-03 23:05:49 +00:00
|
|
|
}
|
|
|
|
|
2020-03-11 18:27:43 +00:00
|
|
|
Token Lexer::next()
|
|
|
|
{
|
|
|
|
size_t trivia_start = m_position;
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
auto in_template = !m_template_states.is_empty();
|
2020-10-29 17:55:24 +00:00
|
|
|
bool line_has_token_yet = m_line_column > 1;
|
2020-10-26 20:10:50 +00:00
|
|
|
bool unterminated_comment = false;
|
2020-03-11 18:27:43 +00:00
|
|
|
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
if (!in_template || m_template_states.last().in_expr) {
|
|
|
|
// consume whitespace and comments
|
|
|
|
while (true) {
|
2020-10-29 17:55:24 +00:00
|
|
|
if (is_line_terminator()) {
|
|
|
|
line_has_token_yet = false;
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
do {
|
|
|
|
consume();
|
2020-10-29 17:55:24 +00:00
|
|
|
} while (is_line_terminator());
|
2021-06-13 08:47:09 +00:00
|
|
|
} else if (is_ascii_space(m_current_char)) {
|
2020-10-29 17:55:24 +00:00
|
|
|
do {
|
|
|
|
consume();
|
2021-06-13 08:47:09 +00:00
|
|
|
} while (is_ascii_space(m_current_char));
|
2020-10-29 17:55:24 +00:00
|
|
|
} else if (is_line_comment_start(line_has_token_yet)) {
|
2020-03-11 18:27:43 +00:00
|
|
|
consume();
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
do {
|
|
|
|
consume();
|
2020-10-21 21:16:45 +00:00
|
|
|
} while (!is_eof() && !is_line_terminator());
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
} else if (is_block_comment_start()) {
|
2020-03-11 18:27:43 +00:00
|
|
|
consume();
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
do {
|
|
|
|
consume();
|
|
|
|
} while (!is_eof() && !is_block_comment_end());
|
2020-10-26 20:10:50 +00:00
|
|
|
if (is_eof())
|
|
|
|
unterminated_comment = true;
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
consume(); // consume *
|
2020-10-26 20:10:50 +00:00
|
|
|
if (is_eof())
|
|
|
|
unterminated_comment = true;
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
consume(); // consume /
|
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
2020-03-11 18:27:43 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t value_start = m_position;
|
2020-08-29 17:40:28 +00:00
|
|
|
size_t value_start_line_number = m_line_number;
|
|
|
|
size_t value_start_column_number = m_line_column;
|
2020-03-12 12:11:33 +00:00
|
|
|
auto token_type = TokenType::Invalid;
|
2021-06-22 13:08:40 +00:00
|
|
|
auto did_consume_whitespace_or_comments = trivia_start != value_start;
|
2020-10-26 20:08:01 +00:00
|
|
|
// This is being used to communicate info about invalid tokens to the parser, which then
|
|
|
|
// can turn that into more specific error messages - instead of us having to make up a
|
|
|
|
// bunch of Invalid* tokens (bad numeric literals, unterminated comments etc.)
|
|
|
|
String token_message;
|
2020-03-11 18:27:43 +00:00
|
|
|
|
2021-06-22 13:08:40 +00:00
|
|
|
if (m_current_token.type() == TokenType::RegexLiteral && !is_eof() && is_ascii_alpha(m_current_char) && !did_consume_whitespace_or_comments) {
|
2020-06-03 23:05:49 +00:00
|
|
|
token_type = TokenType::RegexFlags;
|
2021-06-13 08:47:09 +00:00
|
|
|
while (!is_eof() && is_ascii_alpha(m_current_char))
|
2020-06-03 23:05:49 +00:00
|
|
|
consume();
|
|
|
|
} else if (m_current_char == '`') {
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
consume();
|
|
|
|
|
|
|
|
if (!in_template) {
|
|
|
|
token_type = TokenType::TemplateLiteralStart;
|
|
|
|
m_template_states.append({ false, 0 });
|
|
|
|
} else {
|
|
|
|
if (m_template_states.last().in_expr) {
|
|
|
|
m_template_states.append({ false, 0 });
|
|
|
|
token_type = TokenType::TemplateLiteralStart;
|
|
|
|
} else {
|
|
|
|
m_template_states.take_last();
|
|
|
|
token_type = TokenType::TemplateLiteralEnd;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (in_template && m_template_states.last().in_expr && m_template_states.last().open_bracket_count == 0 && m_current_char == '}') {
|
|
|
|
consume();
|
|
|
|
token_type = TokenType::TemplateLiteralExprEnd;
|
|
|
|
m_template_states.last().in_expr = false;
|
|
|
|
} else if (in_template && !m_template_states.last().in_expr) {
|
|
|
|
if (is_eof()) {
|
|
|
|
token_type = TokenType::UnterminatedTemplateLiteral;
|
|
|
|
m_template_states.take_last();
|
|
|
|
} else if (match('$', '{')) {
|
|
|
|
token_type = TokenType::TemplateLiteralExprStart;
|
|
|
|
consume();
|
|
|
|
consume();
|
|
|
|
m_template_states.last().in_expr = true;
|
|
|
|
} else {
|
|
|
|
while (!match('$', '{') && m_current_char != '`' && !is_eof()) {
|
|
|
|
if (match('\\', '$') || match('\\', '`'))
|
|
|
|
consume();
|
|
|
|
consume();
|
|
|
|
}
|
2020-11-29 22:59:06 +00:00
|
|
|
if (is_eof() && !m_template_states.is_empty())
|
|
|
|
token_type = TokenType::UnterminatedTemplateLiteral;
|
|
|
|
else
|
|
|
|
token_type = TokenType::TemplateLiteralString;
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
}
|
|
|
|
} else if (is_identifier_start()) {
|
2020-03-11 18:27:43 +00:00
|
|
|
// identifier or keyword
|
|
|
|
do {
|
|
|
|
consume();
|
|
|
|
} while (is_identifier_middle());
|
|
|
|
|
|
|
|
StringView value = m_source.substring_view(value_start - 1, m_position - value_start);
|
2020-10-17 21:44:41 +00:00
|
|
|
auto it = s_keywords.find(value.hash(), [&](auto& entry) { return entry.key == value; });
|
2020-03-11 18:27:43 +00:00
|
|
|
if (it == s_keywords.end()) {
|
|
|
|
token_type = TokenType::Identifier;
|
|
|
|
} else {
|
|
|
|
token_type = it->value;
|
|
|
|
}
|
2020-04-05 12:20:58 +00:00
|
|
|
} else if (is_numeric_literal_start()) {
|
2020-06-06 00:14:10 +00:00
|
|
|
token_type = TokenType::NumericLiteral;
|
2020-10-18 13:32:50 +00:00
|
|
|
bool is_invalid_numeric_literal = false;
|
2020-04-05 12:20:58 +00:00
|
|
|
if (m_current_char == '0') {
|
2020-03-11 18:27:43 +00:00
|
|
|
consume();
|
2020-04-05 12:20:58 +00:00
|
|
|
if (m_current_char == '.') {
|
|
|
|
// decimal
|
|
|
|
consume();
|
2021-06-26 14:30:05 +00:00
|
|
|
while (is_ascii_digit(m_current_char) || match_numeric_literal_separator_followed_by(is_ascii_digit))
|
2020-04-05 12:20:58 +00:00
|
|
|
consume();
|
2020-06-06 00:14:10 +00:00
|
|
|
if (m_current_char == 'e' || m_current_char == 'E')
|
2020-10-18 13:32:50 +00:00
|
|
|
is_invalid_numeric_literal = !consume_exponent();
|
2020-04-05 12:20:58 +00:00
|
|
|
} else if (m_current_char == 'e' || m_current_char == 'E') {
|
2020-10-18 13:32:50 +00:00
|
|
|
is_invalid_numeric_literal = !consume_exponent();
|
2020-04-05 12:20:58 +00:00
|
|
|
} else if (m_current_char == 'o' || m_current_char == 'O') {
|
|
|
|
// octal
|
2020-10-18 13:32:50 +00:00
|
|
|
is_invalid_numeric_literal = !consume_octal_number();
|
2021-06-13 23:19:23 +00:00
|
|
|
if (m_current_char == 'n') {
|
|
|
|
consume();
|
|
|
|
token_type = TokenType::BigIntLiteral;
|
|
|
|
}
|
2020-04-05 12:20:58 +00:00
|
|
|
} else if (m_current_char == 'b' || m_current_char == 'B') {
|
|
|
|
// binary
|
2020-10-18 13:32:50 +00:00
|
|
|
is_invalid_numeric_literal = !consume_binary_number();
|
2021-06-13 23:19:23 +00:00
|
|
|
if (m_current_char == 'n') {
|
|
|
|
consume();
|
|
|
|
token_type = TokenType::BigIntLiteral;
|
|
|
|
}
|
2020-04-05 12:20:58 +00:00
|
|
|
} else if (m_current_char == 'x' || m_current_char == 'X') {
|
|
|
|
// hexadecimal
|
2020-10-18 13:32:50 +00:00
|
|
|
is_invalid_numeric_literal = !consume_hexadecimal_number();
|
2021-06-13 23:19:23 +00:00
|
|
|
if (m_current_char == 'n') {
|
|
|
|
consume();
|
|
|
|
token_type = TokenType::BigIntLiteral;
|
|
|
|
}
|
2020-06-06 00:14:10 +00:00
|
|
|
} else if (m_current_char == 'n') {
|
|
|
|
consume();
|
|
|
|
token_type = TokenType::BigIntLiteral;
|
2021-06-13 08:47:09 +00:00
|
|
|
} else if (is_ascii_digit(m_current_char)) {
|
2020-10-19 17:01:28 +00:00
|
|
|
// octal without '0o' prefix. Forbidden in 'strict mode'
|
2020-04-05 12:20:58 +00:00
|
|
|
do {
|
|
|
|
consume();
|
2021-06-26 14:30:05 +00:00
|
|
|
} while (is_ascii_digit(m_current_char) || match_numeric_literal_separator_followed_by(is_ascii_digit));
|
2020-04-05 12:20:58 +00:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// 1...9 or period
|
2021-06-26 14:30:05 +00:00
|
|
|
while (is_ascii_digit(m_current_char) || match_numeric_literal_separator_followed_by(is_ascii_digit))
|
2020-04-05 12:20:58 +00:00
|
|
|
consume();
|
2020-06-06 00:14:10 +00:00
|
|
|
if (m_current_char == 'n') {
|
2020-04-05 12:20:58 +00:00
|
|
|
consume();
|
2020-06-06 00:14:10 +00:00
|
|
|
token_type = TokenType::BigIntLiteral;
|
|
|
|
} else {
|
|
|
|
if (m_current_char == '.') {
|
2020-04-05 12:20:58 +00:00
|
|
|
consume();
|
2021-06-26 14:30:05 +00:00
|
|
|
while (is_ascii_digit(m_current_char) || match_numeric_literal_separator_followed_by(is_ascii_digit))
|
2020-06-06 00:14:10 +00:00
|
|
|
consume();
|
2020-04-05 12:20:58 +00:00
|
|
|
}
|
2020-06-06 00:14:10 +00:00
|
|
|
if (m_current_char == 'e' || m_current_char == 'E')
|
2020-10-18 13:32:50 +00:00
|
|
|
is_invalid_numeric_literal = !consume_exponent();
|
2020-04-05 12:20:58 +00:00
|
|
|
}
|
2020-03-11 18:27:43 +00:00
|
|
|
}
|
2020-10-26 20:13:48 +00:00
|
|
|
if (is_invalid_numeric_literal) {
|
2020-10-18 13:32:50 +00:00
|
|
|
token_type = TokenType::Invalid;
|
2020-10-26 20:13:48 +00:00
|
|
|
token_message = "Invalid numeric literal";
|
|
|
|
}
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
} else if (m_current_char == '"' || m_current_char == '\'') {
|
2020-03-14 10:45:25 +00:00
|
|
|
char stop_char = m_current_char;
|
2020-03-12 12:05:06 +00:00
|
|
|
consume();
|
2020-10-21 21:16:45 +00:00
|
|
|
// Note: LS/PS line terminators are allowed in string literals.
|
|
|
|
while (m_current_char != stop_char && m_current_char != '\r' && m_current_char != '\n' && !is_eof()) {
|
2020-03-14 10:45:25 +00:00
|
|
|
if (m_current_char == '\\') {
|
|
|
|
consume();
|
|
|
|
}
|
2020-03-12 12:05:06 +00:00
|
|
|
consume();
|
|
|
|
}
|
2020-03-14 10:45:25 +00:00
|
|
|
if (m_current_char != stop_char) {
|
|
|
|
token_type = TokenType::UnterminatedStringLiteral;
|
|
|
|
} else {
|
|
|
|
consume();
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
token_type = TokenType::StringLiteral;
|
2020-03-14 10:45:25 +00:00
|
|
|
}
|
2020-06-08 02:32:25 +00:00
|
|
|
} else if (m_current_char == '/' && !slash_means_division()) {
|
|
|
|
consume();
|
|
|
|
token_type = TokenType::RegexLiteral;
|
|
|
|
|
|
|
|
while (!is_eof()) {
|
|
|
|
if (m_current_char == '[') {
|
|
|
|
m_regex_is_in_character_class = true;
|
|
|
|
} else if (m_current_char == ']') {
|
|
|
|
m_regex_is_in_character_class = false;
|
|
|
|
} else if (!m_regex_is_in_character_class && m_current_char == '/') {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (match('\\', '/') || match('\\', '[') || match('\\', '\\') || (m_regex_is_in_character_class && match('\\', ']')))
|
|
|
|
consume();
|
|
|
|
consume();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (is_eof()) {
|
|
|
|
token_type = TokenType::UnterminatedRegexLiteral;
|
|
|
|
} else {
|
|
|
|
consume();
|
|
|
|
}
|
2021-06-13 07:15:00 +00:00
|
|
|
} else if (m_eof) {
|
2020-10-26 20:10:50 +00:00
|
|
|
if (unterminated_comment) {
|
|
|
|
token_type = TokenType::Invalid;
|
|
|
|
token_message = "Unterminated multi-line comment";
|
|
|
|
} else {
|
|
|
|
token_type = TokenType::Eof;
|
|
|
|
}
|
2020-03-11 18:27:43 +00:00
|
|
|
} else {
|
2020-03-12 22:02:41 +00:00
|
|
|
// There is only one four-char operator: >>>=
|
|
|
|
bool found_four_char_token = false;
|
2020-04-13 17:50:58 +00:00
|
|
|
if (match('>', '>', '>', '=')) {
|
|
|
|
found_four_char_token = true;
|
|
|
|
consume();
|
|
|
|
consume();
|
|
|
|
consume();
|
|
|
|
consume();
|
|
|
|
token_type = TokenType::UnsignedShiftRightEquals;
|
2020-03-12 22:02:41 +00:00
|
|
|
}
|
|
|
|
|
2020-03-12 12:11:33 +00:00
|
|
|
bool found_three_char_token = false;
|
2020-03-12 22:02:41 +00:00
|
|
|
if (!found_four_char_token && m_position + 1 < m_source.length()) {
|
2020-10-21 21:16:45 +00:00
|
|
|
auto three_chars_view = m_source.substring_view(m_position - 1, 3);
|
2020-10-17 21:44:41 +00:00
|
|
|
auto it = s_three_char_tokens.find(three_chars_view.hash(), [&](auto& entry) { return entry.key == three_chars_view; });
|
2020-03-12 12:11:33 +00:00
|
|
|
if (it != s_three_char_tokens.end()) {
|
|
|
|
found_three_char_token = true;
|
|
|
|
consume();
|
|
|
|
consume();
|
|
|
|
consume();
|
|
|
|
token_type = it->value;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-03-11 18:27:43 +00:00
|
|
|
bool found_two_char_token = false;
|
2020-03-12 22:02:41 +00:00
|
|
|
if (!found_four_char_token && !found_three_char_token && m_position < m_source.length()) {
|
2020-10-21 21:16:45 +00:00
|
|
|
auto two_chars_view = m_source.substring_view(m_position - 1, 2);
|
2020-10-17 21:44:41 +00:00
|
|
|
auto it = s_two_char_tokens.find(two_chars_view.hash(), [&](auto& entry) { return entry.key == two_chars_view; });
|
2020-03-11 18:27:43 +00:00
|
|
|
if (it != s_two_char_tokens.end()) {
|
2020-12-14 13:05:24 +00:00
|
|
|
// OptionalChainingPunctuator :: ?. [lookahead ∉ DecimalDigit]
|
2021-06-13 08:47:09 +00:00
|
|
|
if (!(it->value == TokenType::QuestionMarkPeriod && m_position + 1 < m_source.length() && is_ascii_digit(m_source[m_position + 1]))) {
|
2020-12-14 13:05:24 +00:00
|
|
|
found_two_char_token = true;
|
|
|
|
consume();
|
|
|
|
consume();
|
|
|
|
token_type = it->value;
|
|
|
|
}
|
2020-03-11 18:27:43 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-03-12 12:11:33 +00:00
|
|
|
bool found_one_char_token = false;
|
2020-03-12 22:02:41 +00:00
|
|
|
if (!found_four_char_token && !found_three_char_token && !found_two_char_token) {
|
2020-03-11 18:27:43 +00:00
|
|
|
auto it = s_single_char_tokens.find(m_current_char);
|
|
|
|
if (it != s_single_char_tokens.end()) {
|
2020-03-12 12:11:33 +00:00
|
|
|
found_one_char_token = true;
|
2020-03-11 18:27:43 +00:00
|
|
|
consume();
|
|
|
|
token_type = it->value;
|
|
|
|
}
|
|
|
|
}
|
2020-03-12 12:11:33 +00:00
|
|
|
|
2020-03-12 22:02:41 +00:00
|
|
|
if (!found_four_char_token && !found_three_char_token && !found_two_char_token && !found_one_char_token) {
|
2020-03-12 12:11:33 +00:00
|
|
|
consume();
|
|
|
|
token_type = TokenType::Invalid;
|
|
|
|
}
|
2020-03-11 18:27:43 +00:00
|
|
|
}
|
|
|
|
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
if (!m_template_states.is_empty() && m_template_states.last().in_expr) {
|
|
|
|
if (token_type == TokenType::CurlyOpen) {
|
|
|
|
m_template_states.last().open_bracket_count++;
|
|
|
|
} else if (token_type == TokenType::CurlyClose) {
|
|
|
|
m_template_states.last().open_bracket_count--;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-03-11 18:27:43 +00:00
|
|
|
m_current_token = Token(
|
|
|
|
token_type,
|
2020-10-26 20:08:01 +00:00
|
|
|
token_message,
|
2020-03-11 18:27:43 +00:00
|
|
|
m_source.substring_view(trivia_start - 1, value_start - trivia_start),
|
2020-04-05 09:34:03 +00:00
|
|
|
m_source.substring_view(value_start - 1, m_position - value_start),
|
2021-02-28 09:42:34 +00:00
|
|
|
m_filename,
|
2020-08-29 17:40:28 +00:00
|
|
|
value_start_line_number,
|
2021-07-10 20:46:17 +00:00
|
|
|
value_start_column_number,
|
|
|
|
m_position);
|
2020-03-11 18:27:43 +00:00
|
|
|
|
2021-04-18 16:13:27 +00:00
|
|
|
if constexpr (LEXER_DEBUG) {
|
|
|
|
dbgln("------------------------------");
|
|
|
|
dbgln("Token: {}", m_current_token.name());
|
|
|
|
dbgln("Trivia: _{}_", m_current_token.trivia());
|
|
|
|
dbgln("Value: _{}_", m_current_token.value());
|
|
|
|
dbgln("Line: {}, Column: {}", m_current_token.line_number(), m_current_token.line_column());
|
|
|
|
dbgln("------------------------------");
|
|
|
|
}
|
2020-10-21 21:16:45 +00:00
|
|
|
|
2020-03-11 18:27:43 +00:00
|
|
|
return m_current_token;
|
|
|
|
}
|
LibJS: Add template literals
Adds fully functioning template literals. Because template literals
contain expressions, most of the work has to be done in the Lexer rather
than the Parser. And because of the complexity of template literals
(expressions, nesting, escapes, etc), the Lexer needs to have some
template-related state.
When entering a new template literal, a TemplateLiteralStart token is
emitted. When inside a literal, all text will be parsed up until a '${'
or '`' (or EOF, but that's a syntax error) is seen, and then a
TemplateLiteralExprStart token is emitted. At this point, the Lexer
proceeds as normal, however it keeps track of the number of opening
and closing curly braces it has seen in order to determine the close
of the expression. Once it finds a matching curly brace for the '${',
a TemplateLiteralExprEnd token is emitted and the state is updated
accordingly.
When the Lexer is inside of a template literal, but not an expression,
and sees a '`', this must be the closing grave: a TemplateLiteralEnd
token is emitted.
The state required to correctly parse template strings consists of a
vector (for nesting) of two pieces of information: whether or not we
are in a template expression (as opposed to a template string); and
the count of the number of unmatched open curly braces we have seen
(only applicable if the Lexer is currently in a template expression).
TODO: Add support for template literal newlines in the JS REPL (this will
cause a syntax error currently):
> `foo
> bar`
'foo
bar'
2020-05-03 22:41:14 +00:00
|
|
|
|
2020-03-11 18:27:43 +00:00
|
|
|
}
|