Lexer.cpp 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787
  1. /*
  2. * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include "Lexer.h"
  7. #include <AK/CharacterTypes.h>
  8. #include <AK/HashTable.h>
  9. #include <AK/StdLibExtras.h>
  10. #include <AK/String.h>
  11. namespace Cpp {
  12. Lexer::Lexer(StringView const& input, size_t start_line)
  13. : m_input(input)
  14. , m_previous_position { start_line, 0 }
  15. , m_position { start_line, 0 }
  16. {
  17. }
  18. char Lexer::peek(size_t offset) const
  19. {
  20. if ((m_index + offset) >= m_input.length())
  21. return 0;
  22. return m_input[m_index + offset];
  23. }
  24. char Lexer::consume()
  25. {
  26. VERIFY(m_index < m_input.length());
  27. char ch = m_input[m_index++];
  28. m_previous_position = m_position;
  29. if (ch == '\n') {
  30. m_position.line++;
  31. m_position.column = 0;
  32. } else {
  33. m_position.column++;
  34. }
  35. return ch;
  36. }
  37. constexpr bool is_valid_first_character_of_identifier(char ch)
  38. {
  39. return is_ascii_alpha(ch) || ch == '_' || ch == '$';
  40. }
  41. constexpr bool is_valid_nonfirst_character_of_identifier(char ch)
  42. {
  43. return is_valid_first_character_of_identifier(ch) || is_ascii_digit(ch);
  44. }
  45. constexpr char const* s_known_keywords[] = {
  46. "alignas",
  47. "alignof",
  48. "and",
  49. "and_eq",
  50. "asm",
  51. "bitand",
  52. "bitor",
  53. "break",
  54. "case",
  55. "catch",
  56. "class",
  57. "compl",
  58. "const",
  59. "const_cast",
  60. "constexpr",
  61. "continue",
  62. "decltype",
  63. "default",
  64. "delete",
  65. "do",
  66. "dynamic_cast",
  67. "else",
  68. "enum",
  69. "explicit",
  70. "export",
  71. "extern",
  72. "false",
  73. "final",
  74. "for",
  75. "friend",
  76. "goto",
  77. "if",
  78. "inline",
  79. "mutable",
  80. "namespace",
  81. "new",
  82. "noexcept",
  83. "not",
  84. "not_eq",
  85. "nullptr",
  86. "operator",
  87. "or",
  88. "or_eq",
  89. "override",
  90. "private",
  91. "protected",
  92. "public",
  93. "register",
  94. "reinterpret_cast",
  95. "return",
  96. "signed",
  97. "sizeof",
  98. "static",
  99. "static_assert",
  100. "static_cast",
  101. "struct",
  102. "switch",
  103. "template",
  104. "this",
  105. "thread_local",
  106. "throw",
  107. "true",
  108. "try",
  109. "typedef",
  110. "typeid",
  111. "typename",
  112. "union",
  113. "using",
  114. "virtual",
  115. "volatile",
  116. "while",
  117. "xor",
  118. "xor_eq"
  119. };
  120. constexpr char const* s_known_types[] = {
  121. "ByteBuffer",
  122. "CircularDeque",
  123. "CircularQueue",
  124. "Deque",
  125. "DoublyLinkedList",
  126. "FileSystemPath",
  127. "Array",
  128. "Function",
  129. "HashMap",
  130. "HashTable",
  131. "IPv4Address",
  132. "IntrusiveList",
  133. "JsonArray",
  134. "JsonObject",
  135. "JsonValue",
  136. "MappedFile",
  137. "NetworkOrdered",
  138. "NonnullOwnPtr",
  139. "NonnullOwnPtrVector",
  140. "NonnullRefPtr",
  141. "NonnullRefPtrVector",
  142. "Optional",
  143. "OwnPtr",
  144. "RefPtr",
  145. "Result",
  146. "ScopeGuard",
  147. "SinglyLinkedList",
  148. "String",
  149. "StringBuilder",
  150. "StringImpl",
  151. "StringView",
  152. "Utf8View",
  153. "Vector",
  154. "WeakPtr",
  155. "auto",
  156. "bool",
  157. "char",
  158. "char16_t",
  159. "char32_t",
  160. "char8_t",
  161. "double",
  162. "float",
  163. "i16",
  164. "i32",
  165. "i64",
  166. "i8",
  167. "int",
  168. "int",
  169. "long",
  170. "short",
  171. "signed",
  172. "u16",
  173. "u32",
  174. "u64",
  175. "u8",
  176. "unsigned",
  177. "void",
  178. "wchar_t"
  179. };
  180. static bool is_keyword(StringView const& string)
  181. {
  182. static HashTable<String> keywords(array_size(s_known_keywords));
  183. if (keywords.is_empty()) {
  184. keywords.set_from(s_known_keywords);
  185. }
  186. return keywords.contains(string);
  187. }
  188. static bool is_known_type(StringView const& string)
  189. {
  190. static HashTable<String> types(array_size(s_known_types));
  191. if (types.is_empty()) {
  192. types.set_from(s_known_types);
  193. }
  194. return types.contains(string);
  195. }
  196. Vector<Token> Lexer::lex()
  197. {
  198. Vector<Token> tokens;
  199. size_t token_start_index = 0;
  200. Position token_start_position;
  201. auto emit_single_char_token = [&](auto type) {
  202. tokens.empend(type, m_position, m_position, m_input.substring_view(m_index, 1));
  203. consume();
  204. };
  205. auto begin_token = [&] {
  206. token_start_index = m_index;
  207. token_start_position = m_position;
  208. };
  209. auto commit_token = [&](auto type) {
  210. if (m_options.ignore_whitespace && type == Token::Type::Whitespace)
  211. return;
  212. tokens.empend(type, token_start_position, m_previous_position, m_input.substring_view(token_start_index, m_index - token_start_index));
  213. };
  214. auto emit_token_equals = [&](auto type, auto equals_type) {
  215. if (peek(1) == '=') {
  216. begin_token();
  217. consume();
  218. consume();
  219. commit_token(equals_type);
  220. return;
  221. }
  222. emit_single_char_token(type);
  223. };
  224. auto match_escape_sequence = [&]() -> size_t {
  225. switch (peek(1)) {
  226. case '\'':
  227. case '"':
  228. case '?':
  229. case '\\':
  230. case 'a':
  231. case 'b':
  232. case 'f':
  233. case 'n':
  234. case 'r':
  235. case 't':
  236. case 'v':
  237. return 2;
  238. case '0':
  239. case '1':
  240. case '2':
  241. case '3':
  242. case '4':
  243. case '5':
  244. case '6':
  245. case '7': {
  246. size_t octal_digits = 1;
  247. for (size_t i = 0; i < 2; ++i) {
  248. char next = peek(2 + i);
  249. if (next < '0' || next > '7')
  250. break;
  251. ++octal_digits;
  252. }
  253. return 1 + octal_digits;
  254. }
  255. case 'x': {
  256. size_t hex_digits = 0;
  257. while (is_ascii_hex_digit(peek(2 + hex_digits)))
  258. ++hex_digits;
  259. return 2 + hex_digits;
  260. }
  261. case 'u':
  262. case 'U': {
  263. bool is_unicode = true;
  264. size_t number_of_digits = peek(1) == 'u' ? 4 : 8;
  265. for (size_t i = 0; i < number_of_digits; ++i) {
  266. if (!is_ascii_hex_digit(peek(2 + i))) {
  267. is_unicode = false;
  268. break;
  269. }
  270. }
  271. return is_unicode ? 2 + number_of_digits : 0;
  272. }
  273. default:
  274. return 0;
  275. }
  276. };
  277. auto match_string_prefix = [&](char quote) -> size_t {
  278. if (peek() == quote)
  279. return 1;
  280. if (peek() == 'L' && peek(1) == quote)
  281. return 2;
  282. if (peek() == 'u') {
  283. if (peek(1) == quote)
  284. return 2;
  285. if (peek(1) == '8' && peek(2) == quote)
  286. return 3;
  287. }
  288. if (peek() == 'U' && peek(1) == quote)
  289. return 2;
  290. return 0;
  291. };
  292. while (m_index < m_input.length()) {
  293. auto ch = peek();
  294. if (is_ascii_space(ch)) {
  295. begin_token();
  296. while (is_ascii_space(peek()))
  297. consume();
  298. commit_token(Token::Type::Whitespace);
  299. continue;
  300. }
  301. if (ch == '(') {
  302. emit_single_char_token(Token::Type::LeftParen);
  303. continue;
  304. }
  305. if (ch == ')') {
  306. emit_single_char_token(Token::Type::RightParen);
  307. continue;
  308. }
  309. if (ch == '{') {
  310. emit_single_char_token(Token::Type::LeftCurly);
  311. continue;
  312. }
  313. if (ch == '}') {
  314. emit_single_char_token(Token::Type::RightCurly);
  315. continue;
  316. }
  317. if (ch == '[') {
  318. emit_single_char_token(Token::Type::LeftBracket);
  319. continue;
  320. }
  321. if (ch == ']') {
  322. emit_single_char_token(Token::Type::RightBracket);
  323. continue;
  324. }
  325. if (ch == '<') {
  326. begin_token();
  327. consume();
  328. if (peek() == '<') {
  329. consume();
  330. if (peek() == '=') {
  331. consume();
  332. commit_token(Token::Type::LessLessEquals);
  333. continue;
  334. }
  335. commit_token(Token::Type::LessLess);
  336. continue;
  337. }
  338. if (peek() == '=') {
  339. consume();
  340. commit_token(Token::Type::LessEquals);
  341. continue;
  342. }
  343. if (peek() == '>') {
  344. consume();
  345. commit_token(Token::Type::LessGreater);
  346. continue;
  347. }
  348. commit_token(Token::Type::Less);
  349. continue;
  350. }
  351. if (ch == '>') {
  352. begin_token();
  353. consume();
  354. if (peek() == '>') {
  355. consume();
  356. if (peek() == '=') {
  357. consume();
  358. commit_token(Token::Type::GreaterGreaterEquals);
  359. continue;
  360. }
  361. commit_token(Token::Type::GreaterGreater);
  362. continue;
  363. }
  364. if (peek() == '=') {
  365. consume();
  366. commit_token(Token::Type::GreaterEquals);
  367. continue;
  368. }
  369. commit_token(Token::Type::Greater);
  370. continue;
  371. }
  372. if (ch == ',') {
  373. emit_single_char_token(Token::Type::Comma);
  374. continue;
  375. }
  376. if (ch == '+') {
  377. begin_token();
  378. consume();
  379. if (peek() == '+') {
  380. consume();
  381. commit_token(Token::Type::PlusPlus);
  382. continue;
  383. }
  384. if (peek() == '=') {
  385. consume();
  386. commit_token(Token::Type::PlusEquals);
  387. continue;
  388. }
  389. commit_token(Token::Type::Plus);
  390. continue;
  391. }
  392. if (ch == '-') {
  393. begin_token();
  394. consume();
  395. if (peek() == '-') {
  396. consume();
  397. commit_token(Token::Type::MinusMinus);
  398. continue;
  399. }
  400. if (peek() == '=') {
  401. consume();
  402. commit_token(Token::Type::MinusEquals);
  403. continue;
  404. }
  405. if (peek() == '>') {
  406. consume();
  407. if (peek() == '*') {
  408. consume();
  409. commit_token(Token::Type::ArrowAsterisk);
  410. continue;
  411. }
  412. commit_token(Token::Type::Arrow);
  413. continue;
  414. }
  415. commit_token(Token::Type::Minus);
  416. continue;
  417. }
  418. if (ch == '*') {
  419. emit_token_equals(Token::Type::Asterisk, Token::Type::AsteriskEquals);
  420. continue;
  421. }
  422. if (ch == '%') {
  423. emit_token_equals(Token::Type::Percent, Token::Type::PercentEquals);
  424. continue;
  425. }
  426. if (ch == '^') {
  427. emit_token_equals(Token::Type::Caret, Token::Type::CaretEquals);
  428. continue;
  429. }
  430. if (ch == '!') {
  431. emit_token_equals(Token::Type::ExclamationMark, Token::Type::ExclamationMarkEquals);
  432. continue;
  433. }
  434. if (ch == '=') {
  435. emit_token_equals(Token::Type::Equals, Token::Type::EqualsEquals);
  436. continue;
  437. }
  438. if (ch == '&') {
  439. begin_token();
  440. consume();
  441. if (peek() == '&') {
  442. consume();
  443. commit_token(Token::Type::AndAnd);
  444. continue;
  445. }
  446. if (peek() == '=') {
  447. consume();
  448. commit_token(Token::Type::AndEquals);
  449. continue;
  450. }
  451. commit_token(Token::Type::And);
  452. continue;
  453. }
  454. if (ch == '|') {
  455. begin_token();
  456. consume();
  457. if (peek() == '|') {
  458. consume();
  459. commit_token(Token::Type::PipePipe);
  460. continue;
  461. }
  462. if (peek() == '=') {
  463. consume();
  464. commit_token(Token::Type::PipeEquals);
  465. continue;
  466. }
  467. commit_token(Token::Type::Pipe);
  468. continue;
  469. }
  470. if (ch == '~') {
  471. emit_single_char_token(Token::Type::Tilde);
  472. continue;
  473. }
  474. if (ch == '?') {
  475. emit_single_char_token(Token::Type::QuestionMark);
  476. continue;
  477. }
  478. if (ch == ':') {
  479. begin_token();
  480. consume();
  481. if (peek() == ':') {
  482. consume();
  483. if (peek() == '*') {
  484. consume();
  485. commit_token(Token::Type::ColonColonAsterisk);
  486. continue;
  487. }
  488. commit_token(Token::Type::ColonColon);
  489. continue;
  490. }
  491. commit_token(Token::Type::Colon);
  492. continue;
  493. }
  494. if (ch == ';') {
  495. emit_single_char_token(Token::Type::Semicolon);
  496. continue;
  497. }
  498. if (ch == '.') {
  499. begin_token();
  500. consume();
  501. if (peek() == '*') {
  502. consume();
  503. commit_token(Token::Type::DotAsterisk);
  504. continue;
  505. }
  506. commit_token(Token::Type::Dot);
  507. continue;
  508. }
  509. if (ch == '#') {
  510. begin_token();
  511. consume();
  512. if (is_valid_first_character_of_identifier(peek()))
  513. while (peek() && is_valid_nonfirst_character_of_identifier(peek()))
  514. consume();
  515. auto directive = StringView(m_input.characters_without_null_termination() + token_start_index, m_index - token_start_index);
  516. if (directive == "#include") {
  517. commit_token(Token::Type::IncludeStatement);
  518. if (is_ascii_space(peek())) {
  519. begin_token();
  520. do {
  521. consume();
  522. } while (is_ascii_space(peek()));
  523. commit_token(Token::Type::Whitespace);
  524. }
  525. begin_token();
  526. if (peek() == '<' || peek() == '"') {
  527. char closing = consume() == '<' ? '>' : '"';
  528. while (peek() && peek() != closing && peek() != '\n')
  529. consume();
  530. if (peek() && consume() == '\n') {
  531. commit_token(Token::Type::IncludePath);
  532. continue;
  533. }
  534. commit_token(Token::Type::IncludePath);
  535. begin_token();
  536. }
  537. } else {
  538. while (peek()) {
  539. if (peek() == '\\' && peek(1) == '\n') {
  540. consume();
  541. consume();
  542. } else if (peek() == '\n') {
  543. break;
  544. } else {
  545. consume();
  546. }
  547. }
  548. commit_token(Token::Type::PreprocessorStatement);
  549. }
  550. continue;
  551. }
  552. if (ch == '/' && peek(1) == '/') {
  553. begin_token();
  554. while (peek() && peek() != '\n')
  555. consume();
  556. commit_token(Token::Type::Comment);
  557. continue;
  558. }
  559. if (ch == '/' && peek(1) == '*') {
  560. begin_token();
  561. consume();
  562. consume();
  563. bool comment_block_ends = false;
  564. while (peek()) {
  565. if (peek() == '*' && peek(1) == '/') {
  566. comment_block_ends = true;
  567. break;
  568. }
  569. consume();
  570. }
  571. if (comment_block_ends) {
  572. consume();
  573. consume();
  574. }
  575. commit_token(Token::Type::Comment);
  576. continue;
  577. }
  578. if (ch == '/') {
  579. emit_token_equals(Token::Type::Slash, Token::Type::SlashEquals);
  580. continue;
  581. }
  582. if (size_t prefix = match_string_prefix('"'); prefix > 0) {
  583. begin_token();
  584. for (size_t i = 0; i < prefix; ++i)
  585. consume();
  586. while (peek()) {
  587. if (peek() == '\\') {
  588. if (size_t escape = match_escape_sequence(); escape > 0) {
  589. commit_token(Token::Type::DoubleQuotedString);
  590. begin_token();
  591. for (size_t i = 0; i < escape; ++i)
  592. consume();
  593. commit_token(Token::Type::EscapeSequence);
  594. begin_token();
  595. continue;
  596. }
  597. }
  598. // If string is not terminated - stop before EOF
  599. if (!peek(1))
  600. break;
  601. if (consume() == '"')
  602. break;
  603. }
  604. commit_token(Token::Type::DoubleQuotedString);
  605. continue;
  606. }
  607. if (size_t prefix = match_string_prefix('R'); prefix > 0 && peek(prefix) == '"') {
  608. begin_token();
  609. for (size_t i = 0; i < prefix + 1; ++i)
  610. consume();
  611. size_t prefix_start = m_index;
  612. while (peek() && peek() != '(')
  613. consume();
  614. StringView prefix_string = m_input.substring_view(prefix_start, m_index - prefix_start);
  615. while (peek()) {
  616. if (consume() == '"') {
  617. VERIFY(m_index >= prefix_string.length() + 2);
  618. VERIFY(m_input[m_index - 1] == '"');
  619. if (m_input[m_index - 1 - prefix_string.length() - 1] == ')') {
  620. StringView suffix_string = m_input.substring_view(m_index - 1 - prefix_string.length(), prefix_string.length());
  621. if (prefix_string == suffix_string)
  622. break;
  623. }
  624. }
  625. }
  626. commit_token(Token::Type::RawString);
  627. continue;
  628. }
  629. if (size_t prefix = match_string_prefix('\''); prefix > 0) {
  630. begin_token();
  631. for (size_t i = 0; i < prefix; ++i)
  632. consume();
  633. while (peek()) {
  634. if (peek() == '\\') {
  635. if (size_t escape = match_escape_sequence(); escape > 0) {
  636. commit_token(Token::Type::SingleQuotedString);
  637. begin_token();
  638. for (size_t i = 0; i < escape; ++i)
  639. consume();
  640. commit_token(Token::Type::EscapeSequence);
  641. begin_token();
  642. continue;
  643. }
  644. }
  645. if (consume() == '\'')
  646. break;
  647. }
  648. commit_token(Token::Type::SingleQuotedString);
  649. continue;
  650. }
  651. if (is_ascii_digit(ch) || (ch == '.' && is_ascii_digit(peek(1)))) {
  652. begin_token();
  653. consume();
  654. auto type = ch == '.' ? Token::Type::Float : Token::Type::Integer;
  655. bool is_hex = false;
  656. bool is_binary = false;
  657. auto match_exponent = [&]() -> size_t {
  658. char ch = peek();
  659. if (ch != 'e' && ch != 'E' && ch != 'p' && ch != 'P')
  660. return 0;
  661. type = Token::Type::Float;
  662. size_t length = 1;
  663. ch = peek(length);
  664. if (ch == '+' || ch == '-') {
  665. ++length;
  666. }
  667. for (ch = peek(length); is_ascii_digit(ch); ch = peek(length)) {
  668. ++length;
  669. }
  670. return length;
  671. };
  672. auto match_type_literal = [&]() -> size_t {
  673. size_t length = 0;
  674. for (;;) {
  675. char ch = peek(length);
  676. if ((ch == 'u' || ch == 'U') && type == Token::Type::Integer) {
  677. ++length;
  678. } else if ((ch == 'f' || ch == 'F') && !is_binary) {
  679. type = Token::Type::Float;
  680. ++length;
  681. } else if (ch == 'l' || ch == 'L') {
  682. ++length;
  683. } else
  684. return length;
  685. }
  686. };
  687. if (peek() == 'b' || peek() == 'B') {
  688. consume();
  689. is_binary = true;
  690. for (char ch = peek(); ch == '0' || ch == '1' || (ch == '\'' && peek(1) != '\''); ch = peek()) {
  691. consume();
  692. }
  693. } else {
  694. if (peek() == 'x' || peek() == 'X') {
  695. consume();
  696. is_hex = true;
  697. }
  698. for (char ch = peek(); (is_hex ? is_ascii_hex_digit(ch) : is_ascii_digit(ch)) || (ch == '\'' && peek(1) != '\'') || ch == '.'; ch = peek()) {
  699. if (ch == '.') {
  700. if (type == Token::Type::Integer) {
  701. type = Token::Type::Float;
  702. } else
  703. break;
  704. };
  705. consume();
  706. }
  707. }
  708. if (!is_binary) {
  709. size_t length = match_exponent();
  710. for (size_t i = 0; i < length; ++i)
  711. consume();
  712. }
  713. size_t length = match_type_literal();
  714. for (size_t i = 0; i < length; ++i)
  715. consume();
  716. commit_token(type);
  717. continue;
  718. }
  719. if (is_valid_first_character_of_identifier(ch)) {
  720. begin_token();
  721. while (peek() && is_valid_nonfirst_character_of_identifier(peek()))
  722. consume();
  723. auto token_view = StringView(m_input.characters_without_null_termination() + token_start_index, m_index - token_start_index);
  724. if (is_keyword(token_view))
  725. commit_token(Token::Type::Keyword);
  726. else if (is_known_type(token_view))
  727. commit_token(Token::Type::KnownType);
  728. else
  729. commit_token(Token::Type::Identifier);
  730. continue;
  731. }
  732. if (ch == '\\' && peek(1) == '\n') {
  733. consume();
  734. consume();
  735. continue;
  736. }
  737. dbgln("Unimplemented token character: {}", ch);
  738. emit_single_char_token(Token::Type::Unknown);
  739. }
  740. return tokens;
  741. }
  742. }