Token.cpp 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287
  1. /*
  2. * Copyright (c) 2020, Stephan Unverwerth <s.unverwerth@serenityos.org>
  3. * Copyright (c) 2020-2021, Linus Groh <linusg@serenityos.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include "Token.h"
  8. #include <AK/Assertions.h>
  9. #include <AK/CharacterTypes.h>
  10. #include <AK/GenericLexer.h>
  11. #include <AK/StringBuilder.h>
  12. #include <AK/Utf16View.h>
  13. namespace JS {
  14. const char* Token::name(TokenType type)
  15. {
  16. switch (type) {
  17. #define __ENUMERATE_JS_TOKEN(type, category) \
  18. case TokenType::type: \
  19. return #type;
  20. ENUMERATE_JS_TOKENS
  21. #undef __ENUMERATE_JS_TOKEN
  22. default:
  23. VERIFY_NOT_REACHED();
  24. return "<Unknown>";
  25. }
  26. }
  27. const char* Token::name() const
  28. {
  29. return name(m_type);
  30. }
  31. TokenCategory Token::category(TokenType type)
  32. {
  33. switch (type) {
  34. #define __ENUMERATE_JS_TOKEN(type, category) \
  35. case TokenType::type: \
  36. return TokenCategory::category;
  37. ENUMERATE_JS_TOKENS
  38. #undef __ENUMERATE_JS_TOKEN
  39. default:
  40. VERIFY_NOT_REACHED();
  41. }
  42. }
  43. TokenCategory Token::category() const
  44. {
  45. return category(m_type);
  46. }
  47. double Token::double_value() const
  48. {
  49. VERIFY(type() == TokenType::NumericLiteral);
  50. StringBuilder builder;
  51. for (auto ch : m_value) {
  52. if (ch == '_')
  53. continue;
  54. builder.append(ch);
  55. }
  56. String value_string = builder.to_string();
  57. if (value_string[0] == '0' && value_string.length() >= 2) {
  58. if (value_string[1] == 'x' || value_string[1] == 'X') {
  59. // hexadecimal
  60. return static_cast<double>(strtoul(value_string.characters() + 2, nullptr, 16));
  61. } else if (value_string[1] == 'o' || value_string[1] == 'O') {
  62. // octal
  63. return static_cast<double>(strtoul(value_string.characters() + 2, nullptr, 8));
  64. } else if (value_string[1] == 'b' || value_string[1] == 'B') {
  65. // binary
  66. return static_cast<double>(strtoul(value_string.characters() + 2, nullptr, 2));
  67. } else if (is_ascii_digit(value_string[1])) {
  68. // also octal, but syntax error in strict mode
  69. if (!m_value.contains('8') && !m_value.contains('9'))
  70. return static_cast<double>(strtoul(value_string.characters() + 1, nullptr, 8));
  71. }
  72. }
  73. return strtod(value_string.characters(), nullptr);
  74. }
  75. static u32 hex2int(char x)
  76. {
  77. VERIFY(is_ascii_hex_digit(x));
  78. if (x >= '0' && x <= '9')
  79. return x - '0';
  80. return 10u + (to_ascii_lowercase(x) - 'a');
  81. }
  82. String Token::string_value(StringValueStatus& status) const
  83. {
  84. VERIFY(type() == TokenType::StringLiteral || type() == TokenType::TemplateLiteralString);
  85. auto is_template = type() == TokenType::TemplateLiteralString;
  86. GenericLexer lexer(is_template ? m_value : m_value.substring_view(1, m_value.length() - 2));
  87. auto encoding_failure = [&status](StringValueStatus parse_status) -> String {
  88. status = parse_status;
  89. return {};
  90. };
  91. auto decode_surrogate = [&lexer]() -> Optional<u16> {
  92. u16 surrogate = 0;
  93. for (int j = 0; j < 4; ++j) {
  94. if (!lexer.next_is(is_ascii_hex_digit))
  95. return {};
  96. surrogate = (surrogate << 4u) | hex2int(lexer.consume());
  97. }
  98. return surrogate;
  99. };
  100. StringBuilder builder;
  101. while (!lexer.is_eof()) {
  102. // No escape, consume one char and continue
  103. if (!lexer.next_is('\\')) {
  104. builder.append(lexer.consume());
  105. continue;
  106. }
  107. lexer.ignore();
  108. VERIFY(!lexer.is_eof());
  109. // Line continuation
  110. if (lexer.next_is('\n') || lexer.next_is('\r')) {
  111. lexer.ignore();
  112. continue;
  113. }
  114. // Line continuation
  115. if (lexer.next_is(LINE_SEPARATOR_STRING) || lexer.next_is(PARAGRAPH_SEPARATOR_STRING)) {
  116. lexer.ignore(3);
  117. continue;
  118. }
  119. // Null-byte escape
  120. if (lexer.next_is('0') && !is_ascii_digit(lexer.peek(1))) {
  121. lexer.ignore();
  122. builder.append('\0');
  123. continue;
  124. }
  125. // Hex escape
  126. if (lexer.next_is('x')) {
  127. lexer.ignore();
  128. if (!is_ascii_hex_digit(lexer.peek()) || !is_ascii_hex_digit(lexer.peek(1)))
  129. return encoding_failure(StringValueStatus::MalformedHexEscape);
  130. auto code_point = hex2int(lexer.consume()) * 16 + hex2int(lexer.consume());
  131. VERIFY(code_point <= 255);
  132. builder.append_code_point(code_point);
  133. continue;
  134. }
  135. // Unicode escape
  136. if (lexer.next_is('u')) {
  137. lexer.ignore();
  138. u32 code_point = 0;
  139. if (lexer.next_is('{')) {
  140. lexer.ignore();
  141. while (true) {
  142. if (!lexer.next_is(is_ascii_hex_digit))
  143. return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
  144. auto new_code_point = (code_point << 4u) | hex2int(lexer.consume());
  145. if (new_code_point < code_point)
  146. return encoding_failure(StringValueStatus::UnicodeEscapeOverflow);
  147. code_point = new_code_point;
  148. if (lexer.next_is('}'))
  149. break;
  150. }
  151. lexer.ignore();
  152. } else {
  153. auto high_surrogate = decode_surrogate();
  154. if (!high_surrogate.has_value())
  155. return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
  156. if (Utf16View::is_high_surrogate(*high_surrogate) && lexer.consume_specific("\\u"sv)) {
  157. auto low_surrogate = decode_surrogate();
  158. if (!low_surrogate.has_value())
  159. return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
  160. if (Utf16View::is_low_surrogate(*low_surrogate)) {
  161. code_point = Utf16View::decode_surrogate_pair(*high_surrogate, *low_surrogate);
  162. } else {
  163. builder.append_code_point(*high_surrogate);
  164. code_point = *low_surrogate;
  165. }
  166. } else {
  167. code_point = *high_surrogate;
  168. }
  169. }
  170. builder.append_code_point(code_point);
  171. continue;
  172. }
  173. // In non-strict mode LegacyOctalEscapeSequence is allowed in strings:
  174. // https://tc39.es/ecma262/#sec-additional-syntax-string-literals
  175. String octal_str;
  176. auto is_octal_digit = [](char ch) { return ch >= '0' && ch <= '7'; };
  177. auto is_zero_to_three = [](char ch) { return ch >= '0' && ch <= '3'; };
  178. auto is_four_to_seven = [](char ch) { return ch >= '4' && ch <= '7'; };
  179. // OctalDigit [lookahead ∉ OctalDigit]
  180. if (is_octal_digit(lexer.peek()) && !is_octal_digit(lexer.peek(1)))
  181. octal_str = lexer.consume(1);
  182. // ZeroToThree OctalDigit [lookahead ∉ OctalDigit]
  183. else if (is_zero_to_three(lexer.peek()) && is_octal_digit(lexer.peek(1)) && !is_octal_digit(lexer.peek(2)))
  184. octal_str = lexer.consume(2);
  185. // FourToSeven OctalDigit
  186. else if (is_four_to_seven(lexer.peek()) && is_octal_digit(lexer.peek(1)))
  187. octal_str = lexer.consume(2);
  188. // ZeroToThree OctalDigit OctalDigit
  189. else if (is_zero_to_three(lexer.peek()) && is_octal_digit(lexer.peek(1)) && is_octal_digit(lexer.peek(2)))
  190. octal_str = lexer.consume(3);
  191. if (!octal_str.is_null()) {
  192. status = StringValueStatus::LegacyOctalEscapeSequence;
  193. auto code_point = strtoul(octal_str.characters(), nullptr, 8);
  194. VERIFY(code_point <= 255);
  195. builder.append_code_point(code_point);
  196. continue;
  197. }
  198. lexer.retreat();
  199. builder.append(lexer.consume_escaped_character('\\', "b\bf\fn\nr\rt\tv\v"));
  200. }
  201. return builder.to_string();
  202. }
  203. bool Token::bool_value() const
  204. {
  205. VERIFY(type() == TokenType::BoolLiteral);
  206. return m_value == "true";
  207. }
  208. bool Token::is_identifier_name() const
  209. {
  210. // IdentifierNames are Identifiers + ReservedWords
  211. // The standard defines this reversed: Identifiers are IdentifierNames except reserved words
  212. // https://tc39.es/ecma262/#prod-Identifier
  213. return m_type == TokenType::Identifier
  214. || m_type == TokenType::Await
  215. || m_type == TokenType::BoolLiteral
  216. || m_type == TokenType::Break
  217. || m_type == TokenType::Case
  218. || m_type == TokenType::Catch
  219. || m_type == TokenType::Class
  220. || m_type == TokenType::Const
  221. || m_type == TokenType::Continue
  222. || m_type == TokenType::Debugger
  223. || m_type == TokenType::Default
  224. || m_type == TokenType::Delete
  225. || m_type == TokenType::Do
  226. || m_type == TokenType::Else
  227. || m_type == TokenType::Enum
  228. || m_type == TokenType::Export
  229. || m_type == TokenType::Extends
  230. || m_type == TokenType::Finally
  231. || m_type == TokenType::For
  232. || m_type == TokenType::Function
  233. || m_type == TokenType::If
  234. || m_type == TokenType::Import
  235. || m_type == TokenType::In
  236. || m_type == TokenType::Instanceof
  237. || m_type == TokenType::Let
  238. || m_type == TokenType::New
  239. || m_type == TokenType::NullLiteral
  240. || m_type == TokenType::Return
  241. || m_type == TokenType::Super
  242. || m_type == TokenType::Switch
  243. || m_type == TokenType::This
  244. || m_type == TokenType::Throw
  245. || m_type == TokenType::Try
  246. || m_type == TokenType::Typeof
  247. || m_type == TokenType::Var
  248. || m_type == TokenType::Void
  249. || m_type == TokenType::While
  250. || m_type == TokenType::With
  251. || m_type == TokenType::Yield;
  252. }
  253. bool Token::trivia_contains_line_terminator() const
  254. {
  255. return m_trivia.contains('\n') || m_trivia.contains('\r') || m_trivia.contains(LINE_SEPARATOR_STRING) || m_trivia.contains(PARAGRAPH_SEPARATOR_STRING);
  256. }
  257. }