Lexer.cpp 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775
  1. /*
  2. * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include "Lexer.h"
  7. #include <AK/CharacterTypes.h>
  8. #include <AK/HashTable.h>
  9. #include <AK/StdLibExtras.h>
  10. #include <AK/String.h>
  11. namespace Cpp {
  12. Lexer::Lexer(StringView const& input)
  13. : m_input(input)
  14. {
  15. }
  16. char Lexer::peek(size_t offset) const
  17. {
  18. if ((m_index + offset) >= m_input.length())
  19. return 0;
  20. return m_input[m_index + offset];
  21. }
  22. char Lexer::consume()
  23. {
  24. VERIFY(m_index < m_input.length());
  25. char ch = m_input[m_index++];
  26. m_previous_position = m_position;
  27. if (ch == '\n') {
  28. m_position.line++;
  29. m_position.column = 0;
  30. } else {
  31. m_position.column++;
  32. }
  33. return ch;
  34. }
  35. constexpr bool is_valid_first_character_of_identifier(char ch)
  36. {
  37. return is_ascii_alpha(ch) || ch == '_' || ch == '$';
  38. }
  39. constexpr bool is_valid_nonfirst_character_of_identifier(char ch)
  40. {
  41. return is_valid_first_character_of_identifier(ch) || is_ascii_digit(ch);
  42. }
  43. constexpr char const* s_known_keywords[] = {
  44. "alignas",
  45. "alignof",
  46. "and",
  47. "and_eq",
  48. "asm",
  49. "bitand",
  50. "bitor",
  51. "break",
  52. "case",
  53. "catch",
  54. "class",
  55. "compl",
  56. "const",
  57. "const_cast",
  58. "constexpr",
  59. "continue",
  60. "decltype",
  61. "default",
  62. "delete",
  63. "do",
  64. "dynamic_cast",
  65. "else",
  66. "enum",
  67. "explicit",
  68. "export",
  69. "extern",
  70. "false",
  71. "final",
  72. "for",
  73. "friend",
  74. "goto",
  75. "if",
  76. "inline",
  77. "mutable",
  78. "namespace",
  79. "new",
  80. "noexcept",
  81. "not",
  82. "not_eq",
  83. "nullptr",
  84. "operator",
  85. "or",
  86. "or_eq",
  87. "override",
  88. "private",
  89. "protected",
  90. "public",
  91. "register",
  92. "reinterpret_cast",
  93. "return",
  94. "signed",
  95. "sizeof",
  96. "static",
  97. "static_assert",
  98. "static_cast",
  99. "struct",
  100. "switch",
  101. "template",
  102. "this",
  103. "thread_local",
  104. "throw",
  105. "true",
  106. "try",
  107. "typedef",
  108. "typeid",
  109. "typename",
  110. "union",
  111. "using",
  112. "virtual",
  113. "volatile",
  114. "while",
  115. "xor",
  116. "xor_eq"
  117. };
  118. constexpr char const* s_known_types[] = {
  119. "ByteBuffer",
  120. "CircularDeque",
  121. "CircularQueue",
  122. "Deque",
  123. "DoublyLinkedList",
  124. "FileSystemPath",
  125. "Array",
  126. "Function",
  127. "HashMap",
  128. "HashTable",
  129. "IPv4Address",
  130. "IntrusiveList",
  131. "JsonArray",
  132. "JsonObject",
  133. "JsonValue",
  134. "MappedFile",
  135. "NetworkOrdered",
  136. "NonnullOwnPtr",
  137. "NonnullOwnPtrVector",
  138. "NonnullRefPtr",
  139. "NonnullRefPtrVector",
  140. "Optional",
  141. "OwnPtr",
  142. "RefPtr",
  143. "Result",
  144. "ScopeGuard",
  145. "SinglyLinkedList",
  146. "String",
  147. "StringBuilder",
  148. "StringImpl",
  149. "StringView",
  150. "Utf8View",
  151. "Vector",
  152. "WeakPtr",
  153. "auto",
  154. "bool",
  155. "char",
  156. "char16_t",
  157. "char32_t",
  158. "char8_t",
  159. "double",
  160. "float",
  161. "i16",
  162. "i32",
  163. "i64",
  164. "i8",
  165. "int",
  166. "int",
  167. "long",
  168. "short",
  169. "signed",
  170. "u16",
  171. "u32",
  172. "u64",
  173. "u8",
  174. "unsigned",
  175. "void",
  176. "wchar_t"
  177. };
  178. static bool is_keyword(StringView const& string)
  179. {
  180. static HashTable<String> keywords(array_size(s_known_keywords));
  181. if (keywords.is_empty()) {
  182. keywords.set_from(s_known_keywords);
  183. }
  184. return keywords.contains(string);
  185. }
  186. static bool is_known_type(StringView const& string)
  187. {
  188. static HashTable<String> types(array_size(s_known_types));
  189. if (types.is_empty()) {
  190. types.set_from(s_known_types);
  191. }
  192. return types.contains(string);
  193. }
  194. Vector<Token> Lexer::lex()
  195. {
  196. Vector<Token> tokens;
  197. size_t token_start_index = 0;
  198. Position token_start_position;
  199. auto emit_single_char_token = [&](auto type) {
  200. tokens.empend(type, m_position, m_position, m_input.substring_view(m_index, 1));
  201. consume();
  202. };
  203. auto begin_token = [&] {
  204. token_start_index = m_index;
  205. token_start_position = m_position;
  206. };
  207. auto commit_token = [&](auto type) {
  208. tokens.empend(type, token_start_position, m_previous_position, m_input.substring_view(token_start_index, m_index - token_start_index));
  209. };
  210. auto emit_token_equals = [&](auto type, auto equals_type) {
  211. if (peek(1) == '=') {
  212. begin_token();
  213. consume();
  214. consume();
  215. commit_token(equals_type);
  216. return;
  217. }
  218. emit_single_char_token(type);
  219. };
  220. auto match_escape_sequence = [&]() -> size_t {
  221. switch (peek(1)) {
  222. case '\'':
  223. case '"':
  224. case '?':
  225. case '\\':
  226. case 'a':
  227. case 'b':
  228. case 'f':
  229. case 'n':
  230. case 'r':
  231. case 't':
  232. case 'v':
  233. return 2;
  234. case '0':
  235. case '1':
  236. case '2':
  237. case '3':
  238. case '4':
  239. case '5':
  240. case '6':
  241. case '7': {
  242. size_t octal_digits = 1;
  243. for (size_t i = 0; i < 2; ++i) {
  244. char next = peek(2 + i);
  245. if (next < '0' || next > '7')
  246. break;
  247. ++octal_digits;
  248. }
  249. return 1 + octal_digits;
  250. }
  251. case 'x': {
  252. size_t hex_digits = 0;
  253. while (is_ascii_hex_digit(peek(2 + hex_digits)))
  254. ++hex_digits;
  255. return 2 + hex_digits;
  256. }
  257. case 'u':
  258. case 'U': {
  259. bool is_unicode = true;
  260. size_t number_of_digits = peek(1) == 'u' ? 4 : 8;
  261. for (size_t i = 0; i < number_of_digits; ++i) {
  262. if (!is_ascii_hex_digit(peek(2 + i))) {
  263. is_unicode = false;
  264. break;
  265. }
  266. }
  267. return is_unicode ? 2 + number_of_digits : 0;
  268. }
  269. default:
  270. return 0;
  271. }
  272. };
  273. auto match_string_prefix = [&](char quote) -> size_t {
  274. if (peek() == quote)
  275. return 1;
  276. if (peek() == 'L' && peek(1) == quote)
  277. return 2;
  278. if (peek() == 'u') {
  279. if (peek(1) == quote)
  280. return 2;
  281. if (peek(1) == '8' && peek(2) == quote)
  282. return 3;
  283. }
  284. if (peek() == 'U' && peek(1) == quote)
  285. return 2;
  286. return 0;
  287. };
  288. while (m_index < m_input.length()) {
  289. auto ch = peek();
  290. if (is_ascii_space(ch)) {
  291. begin_token();
  292. while (is_ascii_space(peek()))
  293. consume();
  294. commit_token(Token::Type::Whitespace);
  295. continue;
  296. }
  297. if (ch == '(') {
  298. emit_single_char_token(Token::Type::LeftParen);
  299. continue;
  300. }
  301. if (ch == ')') {
  302. emit_single_char_token(Token::Type::RightParen);
  303. continue;
  304. }
  305. if (ch == '{') {
  306. emit_single_char_token(Token::Type::LeftCurly);
  307. continue;
  308. }
  309. if (ch == '}') {
  310. emit_single_char_token(Token::Type::RightCurly);
  311. continue;
  312. }
  313. if (ch == '[') {
  314. emit_single_char_token(Token::Type::LeftBracket);
  315. continue;
  316. }
  317. if (ch == ']') {
  318. emit_single_char_token(Token::Type::RightBracket);
  319. continue;
  320. }
  321. if (ch == '<') {
  322. begin_token();
  323. consume();
  324. if (peek() == '<') {
  325. consume();
  326. if (peek() == '=') {
  327. consume();
  328. commit_token(Token::Type::LessLessEquals);
  329. continue;
  330. }
  331. commit_token(Token::Type::LessLess);
  332. continue;
  333. }
  334. if (peek() == '=') {
  335. consume();
  336. commit_token(Token::Type::LessEquals);
  337. continue;
  338. }
  339. if (peek() == '>') {
  340. consume();
  341. commit_token(Token::Type::LessGreater);
  342. continue;
  343. }
  344. commit_token(Token::Type::Less);
  345. continue;
  346. }
  347. if (ch == '>') {
  348. begin_token();
  349. consume();
  350. if (peek() == '>') {
  351. consume();
  352. if (peek() == '=') {
  353. consume();
  354. commit_token(Token::Type::GreaterGreaterEquals);
  355. continue;
  356. }
  357. commit_token(Token::Type::GreaterGreater);
  358. continue;
  359. }
  360. if (peek() == '=') {
  361. consume();
  362. commit_token(Token::Type::GreaterEquals);
  363. continue;
  364. }
  365. commit_token(Token::Type::Greater);
  366. continue;
  367. }
  368. if (ch == ',') {
  369. emit_single_char_token(Token::Type::Comma);
  370. continue;
  371. }
  372. if (ch == '+') {
  373. begin_token();
  374. consume();
  375. if (peek() == '+') {
  376. consume();
  377. commit_token(Token::Type::PlusPlus);
  378. continue;
  379. }
  380. if (peek() == '=') {
  381. consume();
  382. commit_token(Token::Type::PlusEquals);
  383. continue;
  384. }
  385. commit_token(Token::Type::Plus);
  386. continue;
  387. }
  388. if (ch == '-') {
  389. begin_token();
  390. consume();
  391. if (peek() == '-') {
  392. consume();
  393. commit_token(Token::Type::MinusMinus);
  394. continue;
  395. }
  396. if (peek() == '=') {
  397. consume();
  398. commit_token(Token::Type::MinusEquals);
  399. continue;
  400. }
  401. if (peek() == '>') {
  402. consume();
  403. if (peek() == '*') {
  404. consume();
  405. commit_token(Token::Type::ArrowAsterisk);
  406. continue;
  407. }
  408. commit_token(Token::Type::Arrow);
  409. continue;
  410. }
  411. commit_token(Token::Type::Minus);
  412. continue;
  413. }
  414. if (ch == '*') {
  415. emit_token_equals(Token::Type::Asterisk, Token::Type::AsteriskEquals);
  416. continue;
  417. }
  418. if (ch == '%') {
  419. emit_token_equals(Token::Type::Percent, Token::Type::PercentEquals);
  420. continue;
  421. }
  422. if (ch == '^') {
  423. emit_token_equals(Token::Type::Caret, Token::Type::CaretEquals);
  424. continue;
  425. }
  426. if (ch == '!') {
  427. emit_token_equals(Token::Type::ExclamationMark, Token::Type::ExclamationMarkEquals);
  428. continue;
  429. }
  430. if (ch == '=') {
  431. emit_token_equals(Token::Type::Equals, Token::Type::EqualsEquals);
  432. continue;
  433. }
  434. if (ch == '&') {
  435. begin_token();
  436. consume();
  437. if (peek() == '&') {
  438. consume();
  439. commit_token(Token::Type::AndAnd);
  440. continue;
  441. }
  442. if (peek() == '=') {
  443. consume();
  444. commit_token(Token::Type::AndEquals);
  445. continue;
  446. }
  447. commit_token(Token::Type::And);
  448. continue;
  449. }
  450. if (ch == '|') {
  451. begin_token();
  452. consume();
  453. if (peek() == '|') {
  454. consume();
  455. commit_token(Token::Type::PipePipe);
  456. continue;
  457. }
  458. if (peek() == '=') {
  459. consume();
  460. commit_token(Token::Type::PipeEquals);
  461. continue;
  462. }
  463. commit_token(Token::Type::Pipe);
  464. continue;
  465. }
  466. if (ch == '~') {
  467. emit_single_char_token(Token::Type::Tilde);
  468. continue;
  469. }
  470. if (ch == '?') {
  471. emit_single_char_token(Token::Type::QuestionMark);
  472. continue;
  473. }
  474. if (ch == ':') {
  475. begin_token();
  476. consume();
  477. if (peek() == ':') {
  478. consume();
  479. if (peek() == '*') {
  480. consume();
  481. commit_token(Token::Type::ColonColonAsterisk);
  482. continue;
  483. }
  484. commit_token(Token::Type::ColonColon);
  485. continue;
  486. }
  487. commit_token(Token::Type::Colon);
  488. continue;
  489. }
  490. if (ch == ';') {
  491. emit_single_char_token(Token::Type::Semicolon);
  492. continue;
  493. }
  494. if (ch == '.') {
  495. begin_token();
  496. consume();
  497. if (peek() == '*') {
  498. consume();
  499. commit_token(Token::Type::DotAsterisk);
  500. continue;
  501. }
  502. commit_token(Token::Type::Dot);
  503. continue;
  504. }
  505. if (ch == '#') {
  506. begin_token();
  507. consume();
  508. if (is_valid_first_character_of_identifier(peek()))
  509. while (peek() && is_valid_nonfirst_character_of_identifier(peek()))
  510. consume();
  511. auto directive = StringView(m_input.characters_without_null_termination() + token_start_index, m_index - token_start_index);
  512. if (directive == "#include") {
  513. commit_token(Token::Type::IncludeStatement);
  514. if (is_ascii_space(peek())) {
  515. begin_token();
  516. do {
  517. consume();
  518. } while (is_ascii_space(peek()));
  519. commit_token(Token::Type::Whitespace);
  520. }
  521. begin_token();
  522. if (peek() == '<' || peek() == '"') {
  523. char closing = consume() == '<' ? '>' : '"';
  524. while (peek() && peek() != closing && peek() != '\n')
  525. consume();
  526. if (peek() && consume() == '\n') {
  527. commit_token(Token::Type::IncludePath);
  528. continue;
  529. }
  530. commit_token(Token::Type::IncludePath);
  531. begin_token();
  532. }
  533. } else {
  534. while (peek() && peek() != '\n')
  535. consume();
  536. commit_token(Token::Type::PreprocessorStatement);
  537. }
  538. continue;
  539. }
  540. if (ch == '/' && peek(1) == '/') {
  541. begin_token();
  542. while (peek() && peek() != '\n')
  543. consume();
  544. commit_token(Token::Type::Comment);
  545. continue;
  546. }
  547. if (ch == '/' && peek(1) == '*') {
  548. begin_token();
  549. consume();
  550. consume();
  551. bool comment_block_ends = false;
  552. while (peek()) {
  553. if (peek() == '*' && peek(1) == '/') {
  554. comment_block_ends = true;
  555. break;
  556. }
  557. consume();
  558. }
  559. if (comment_block_ends) {
  560. consume();
  561. consume();
  562. }
  563. commit_token(Token::Type::Comment);
  564. continue;
  565. }
  566. if (ch == '/') {
  567. emit_token_equals(Token::Type::Slash, Token::Type::SlashEquals);
  568. continue;
  569. }
  570. if (size_t prefix = match_string_prefix('"'); prefix > 0) {
  571. begin_token();
  572. for (size_t i = 0; i < prefix; ++i)
  573. consume();
  574. while (peek()) {
  575. if (peek() == '\\') {
  576. if (size_t escape = match_escape_sequence(); escape > 0) {
  577. commit_token(Token::Type::DoubleQuotedString);
  578. begin_token();
  579. for (size_t i = 0; i < escape; ++i)
  580. consume();
  581. commit_token(Token::Type::EscapeSequence);
  582. begin_token();
  583. continue;
  584. }
  585. }
  586. // If string is not terminated - stop before EOF
  587. if (!peek(1))
  588. break;
  589. if (consume() == '"')
  590. break;
  591. }
  592. commit_token(Token::Type::DoubleQuotedString);
  593. continue;
  594. }
  595. if (size_t prefix = match_string_prefix('R'); prefix > 0 && peek(prefix) == '"') {
  596. begin_token();
  597. for (size_t i = 0; i < prefix + 1; ++i)
  598. consume();
  599. size_t prefix_start = m_index;
  600. while (peek() && peek() != '(')
  601. consume();
  602. StringView prefix_string = m_input.substring_view(prefix_start, m_index - prefix_start);
  603. while (peek()) {
  604. if (consume() == '"') {
  605. VERIFY(m_index >= prefix_string.length() + 2);
  606. VERIFY(m_input[m_index - 1] == '"');
  607. if (m_input[m_index - 1 - prefix_string.length() - 1] == ')') {
  608. StringView suffix_string = m_input.substring_view(m_index - 1 - prefix_string.length(), prefix_string.length());
  609. if (prefix_string == suffix_string)
  610. break;
  611. }
  612. }
  613. }
  614. commit_token(Token::Type::RawString);
  615. continue;
  616. }
  617. if (size_t prefix = match_string_prefix('\''); prefix > 0) {
  618. begin_token();
  619. for (size_t i = 0; i < prefix; ++i)
  620. consume();
  621. while (peek()) {
  622. if (peek() == '\\') {
  623. if (size_t escape = match_escape_sequence(); escape > 0) {
  624. commit_token(Token::Type::SingleQuotedString);
  625. begin_token();
  626. for (size_t i = 0; i < escape; ++i)
  627. consume();
  628. commit_token(Token::Type::EscapeSequence);
  629. begin_token();
  630. continue;
  631. }
  632. }
  633. if (consume() == '\'')
  634. break;
  635. }
  636. commit_token(Token::Type::SingleQuotedString);
  637. continue;
  638. }
  639. if (is_ascii_digit(ch) || (ch == '.' && is_ascii_digit(peek(1)))) {
  640. begin_token();
  641. consume();
  642. auto type = ch == '.' ? Token::Type::Float : Token::Type::Integer;
  643. bool is_hex = false;
  644. bool is_binary = false;
  645. auto match_exponent = [&]() -> size_t {
  646. char ch = peek();
  647. if (ch != 'e' && ch != 'E' && ch != 'p' && ch != 'P')
  648. return 0;
  649. type = Token::Type::Float;
  650. size_t length = 1;
  651. ch = peek(length);
  652. if (ch == '+' || ch == '-') {
  653. ++length;
  654. }
  655. for (ch = peek(length); is_ascii_digit(ch); ch = peek(length)) {
  656. ++length;
  657. }
  658. return length;
  659. };
  660. auto match_type_literal = [&]() -> size_t {
  661. size_t length = 0;
  662. for (;;) {
  663. char ch = peek(length);
  664. if ((ch == 'u' || ch == 'U') && type == Token::Type::Integer) {
  665. ++length;
  666. } else if ((ch == 'f' || ch == 'F') && !is_binary) {
  667. type = Token::Type::Float;
  668. ++length;
  669. } else if (ch == 'l' || ch == 'L') {
  670. ++length;
  671. } else
  672. return length;
  673. }
  674. };
  675. if (peek() == 'b' || peek() == 'B') {
  676. consume();
  677. is_binary = true;
  678. for (char ch = peek(); ch == '0' || ch == '1' || (ch == '\'' && peek(1) != '\''); ch = peek()) {
  679. consume();
  680. }
  681. } else {
  682. if (peek() == 'x' || peek() == 'X') {
  683. consume();
  684. is_hex = true;
  685. }
  686. for (char ch = peek(); (is_hex ? is_ascii_hex_digit(ch) : is_ascii_digit(ch)) || (ch == '\'' && peek(1) != '\'') || ch == '.'; ch = peek()) {
  687. if (ch == '.') {
  688. if (type == Token::Type::Integer) {
  689. type = Token::Type::Float;
  690. } else
  691. break;
  692. };
  693. consume();
  694. }
  695. }
  696. if (!is_binary) {
  697. size_t length = match_exponent();
  698. for (size_t i = 0; i < length; ++i)
  699. consume();
  700. }
  701. size_t length = match_type_literal();
  702. for (size_t i = 0; i < length; ++i)
  703. consume();
  704. commit_token(type);
  705. continue;
  706. }
  707. if (is_valid_first_character_of_identifier(ch)) {
  708. begin_token();
  709. while (peek() && is_valid_nonfirst_character_of_identifier(peek()))
  710. consume();
  711. auto token_view = StringView(m_input.characters_without_null_termination() + token_start_index, m_index - token_start_index);
  712. if (is_keyword(token_view))
  713. commit_token(Token::Type::Keyword);
  714. else if (is_known_type(token_view))
  715. commit_token(Token::Type::KnownType);
  716. else
  717. commit_token(Token::Type::Identifier);
  718. continue;
  719. }
  720. if (ch == '\\' && peek(1) == '\n') {
  721. consume();
  722. consume();
  723. continue;
  724. }
  725. dbgln("Unimplemented token character: {}", ch);
  726. emit_single_char_token(Token::Type::Unknown);
  727. }
  728. return tokens;
  729. }
  730. }