Lexer.cpp 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776
  1. /*
  2. * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include "Lexer.h"
  7. #include <AK/CharacterTypes.h>
  8. #include <AK/HashTable.h>
  9. #include <AK/StdLibExtras.h>
  10. #include <AK/String.h>
  11. namespace Cpp {
  12. Lexer::Lexer(StringView const& input)
  13. : m_input(input)
  14. {
  15. }
  16. char Lexer::peek(size_t offset) const
  17. {
  18. if ((m_index + offset) >= m_input.length())
  19. return 0;
  20. return m_input[m_index + offset];
  21. }
  22. char Lexer::consume()
  23. {
  24. VERIFY(m_index < m_input.length());
  25. char ch = m_input[m_index++];
  26. m_previous_position = m_position;
  27. if (ch == '\n') {
  28. m_position.line++;
  29. m_position.column = 0;
  30. } else {
  31. m_position.column++;
  32. }
  33. return ch;
  34. }
  35. constexpr bool is_valid_first_character_of_identifier(char ch)
  36. {
  37. return is_ascii_alpha(ch) || ch == '_' || ch == '$';
  38. }
  39. constexpr bool is_valid_nonfirst_character_of_identifier(char ch)
  40. {
  41. return is_valid_first_character_of_identifier(ch) || is_ascii_digit(ch);
  42. }
  43. constexpr char const* s_known_keywords[] = {
  44. "alignas",
  45. "alignof",
  46. "and",
  47. "and_eq",
  48. "asm",
  49. "bitand",
  50. "bitor",
  51. "bool",
  52. "break",
  53. "case",
  54. "catch",
  55. "class",
  56. "compl",
  57. "const",
  58. "const_cast",
  59. "constexpr",
  60. "continue",
  61. "decltype",
  62. "default",
  63. "delete",
  64. "do",
  65. "dynamic_cast",
  66. "else",
  67. "enum",
  68. "explicit",
  69. "export",
  70. "extern",
  71. "false",
  72. "final",
  73. "for",
  74. "friend",
  75. "goto",
  76. "if",
  77. "inline",
  78. "mutable",
  79. "namespace",
  80. "new",
  81. "noexcept",
  82. "not",
  83. "not_eq",
  84. "nullptr",
  85. "operator",
  86. "or",
  87. "or_eq",
  88. "override",
  89. "private",
  90. "protected",
  91. "public",
  92. "register",
  93. "reinterpret_cast",
  94. "return",
  95. "signed",
  96. "sizeof",
  97. "static",
  98. "static_assert",
  99. "static_cast",
  100. "struct",
  101. "switch",
  102. "template",
  103. "this",
  104. "thread_local",
  105. "throw",
  106. "true",
  107. "try",
  108. "typedef",
  109. "typeid",
  110. "typename",
  111. "union",
  112. "using",
  113. "virtual",
  114. "volatile",
  115. "while",
  116. "xor",
  117. "xor_eq"
  118. };
  119. constexpr char const* s_known_types[] = {
  120. "ByteBuffer",
  121. "CircularDeque",
  122. "CircularQueue",
  123. "Deque",
  124. "DoublyLinkedList",
  125. "FileSystemPath",
  126. "Array",
  127. "Function",
  128. "HashMap",
  129. "HashTable",
  130. "IPv4Address",
  131. "InlineLinkedList",
  132. "IntrusiveList",
  133. "JsonArray",
  134. "JsonObject",
  135. "JsonValue",
  136. "MappedFile",
  137. "NetworkOrdered",
  138. "NonnullOwnPtr",
  139. "NonnullOwnPtrVector",
  140. "NonnullRefPtr",
  141. "NonnullRefPtrVector",
  142. "Optional",
  143. "OwnPtr",
  144. "RefPtr",
  145. "Result",
  146. "ScopeGuard",
  147. "SinglyLinkedList",
  148. "String",
  149. "StringBuilder",
  150. "StringImpl",
  151. "StringView",
  152. "Utf8View",
  153. "Vector",
  154. "WeakPtr",
  155. "auto",
  156. "char",
  157. "char16_t",
  158. "char32_t",
  159. "char8_t",
  160. "double",
  161. "float",
  162. "i16",
  163. "i32",
  164. "i64",
  165. "i8",
  166. "int",
  167. "int",
  168. "long",
  169. "short",
  170. "signed",
  171. "u16",
  172. "u32",
  173. "u64",
  174. "u8",
  175. "unsigned",
  176. "void",
  177. "wchar_t"
  178. };
  179. static bool is_keyword(StringView const& string)
  180. {
  181. static HashTable<String> keywords(array_size(s_known_keywords));
  182. if (keywords.is_empty()) {
  183. keywords.set_from(s_known_keywords);
  184. }
  185. return keywords.contains(string);
  186. }
  187. static bool is_known_type(StringView const& string)
  188. {
  189. static HashTable<String> types(array_size(s_known_types));
  190. if (types.is_empty()) {
  191. types.set_from(s_known_types);
  192. }
  193. return types.contains(string);
  194. }
  195. Vector<Token> Lexer::lex()
  196. {
  197. Vector<Token> tokens;
  198. size_t token_start_index = 0;
  199. Position token_start_position;
  200. auto emit_single_char_token = [&](auto type) {
  201. tokens.empend(type, m_position, m_position, m_input.substring_view(m_index, 1));
  202. consume();
  203. };
  204. auto begin_token = [&] {
  205. token_start_index = m_index;
  206. token_start_position = m_position;
  207. };
  208. auto commit_token = [&](auto type) {
  209. tokens.empend(type, token_start_position, m_previous_position, m_input.substring_view(token_start_index, m_index - token_start_index));
  210. };
  211. auto emit_token_equals = [&](auto type, auto equals_type) {
  212. if (peek(1) == '=') {
  213. begin_token();
  214. consume();
  215. consume();
  216. commit_token(equals_type);
  217. return;
  218. }
  219. emit_single_char_token(type);
  220. };
  221. auto match_escape_sequence = [&]() -> size_t {
  222. switch (peek(1)) {
  223. case '\'':
  224. case '"':
  225. case '?':
  226. case '\\':
  227. case 'a':
  228. case 'b':
  229. case 'f':
  230. case 'n':
  231. case 'r':
  232. case 't':
  233. case 'v':
  234. return 2;
  235. case '0':
  236. case '1':
  237. case '2':
  238. case '3':
  239. case '4':
  240. case '5':
  241. case '6':
  242. case '7': {
  243. size_t octal_digits = 1;
  244. for (size_t i = 0; i < 2; ++i) {
  245. char next = peek(2 + i);
  246. if (next < '0' || next > '7')
  247. break;
  248. ++octal_digits;
  249. }
  250. return 1 + octal_digits;
  251. }
  252. case 'x': {
  253. size_t hex_digits = 0;
  254. while (is_ascii_hex_digit(peek(2 + hex_digits)))
  255. ++hex_digits;
  256. return 2 + hex_digits;
  257. }
  258. case 'u':
  259. case 'U': {
  260. bool is_unicode = true;
  261. size_t number_of_digits = peek(1) == 'u' ? 4 : 8;
  262. for (size_t i = 0; i < number_of_digits; ++i) {
  263. if (!is_ascii_hex_digit(peek(2 + i))) {
  264. is_unicode = false;
  265. break;
  266. }
  267. }
  268. return is_unicode ? 2 + number_of_digits : 0;
  269. }
  270. default:
  271. return 0;
  272. }
  273. };
  274. auto match_string_prefix = [&](char quote) -> size_t {
  275. if (peek() == quote)
  276. return 1;
  277. if (peek() == 'L' && peek(1) == quote)
  278. return 2;
  279. if (peek() == 'u') {
  280. if (peek(1) == quote)
  281. return 2;
  282. if (peek(1) == '8' && peek(2) == quote)
  283. return 3;
  284. }
  285. if (peek() == 'U' && peek(1) == quote)
  286. return 2;
  287. return 0;
  288. };
  289. while (m_index < m_input.length()) {
  290. auto ch = peek();
  291. if (is_ascii_space(ch)) {
  292. begin_token();
  293. while (is_ascii_space(peek()))
  294. consume();
  295. commit_token(Token::Type::Whitespace);
  296. continue;
  297. }
  298. if (ch == '(') {
  299. emit_single_char_token(Token::Type::LeftParen);
  300. continue;
  301. }
  302. if (ch == ')') {
  303. emit_single_char_token(Token::Type::RightParen);
  304. continue;
  305. }
  306. if (ch == '{') {
  307. emit_single_char_token(Token::Type::LeftCurly);
  308. continue;
  309. }
  310. if (ch == '}') {
  311. emit_single_char_token(Token::Type::RightCurly);
  312. continue;
  313. }
  314. if (ch == '[') {
  315. emit_single_char_token(Token::Type::LeftBracket);
  316. continue;
  317. }
  318. if (ch == ']') {
  319. emit_single_char_token(Token::Type::RightBracket);
  320. continue;
  321. }
  322. if (ch == '<') {
  323. begin_token();
  324. consume();
  325. if (peek() == '<') {
  326. consume();
  327. if (peek() == '=') {
  328. consume();
  329. commit_token(Token::Type::LessLessEquals);
  330. continue;
  331. }
  332. commit_token(Token::Type::LessLess);
  333. continue;
  334. }
  335. if (peek() == '=') {
  336. consume();
  337. commit_token(Token::Type::LessEquals);
  338. continue;
  339. }
  340. if (peek() == '>') {
  341. consume();
  342. commit_token(Token::Type::LessGreater);
  343. continue;
  344. }
  345. commit_token(Token::Type::Less);
  346. continue;
  347. }
  348. if (ch == '>') {
  349. begin_token();
  350. consume();
  351. if (peek() == '>') {
  352. consume();
  353. if (peek() == '=') {
  354. consume();
  355. commit_token(Token::Type::GreaterGreaterEquals);
  356. continue;
  357. }
  358. commit_token(Token::Type::GreaterGreater);
  359. continue;
  360. }
  361. if (peek() == '=') {
  362. consume();
  363. commit_token(Token::Type::GreaterEquals);
  364. continue;
  365. }
  366. commit_token(Token::Type::Greater);
  367. continue;
  368. }
  369. if (ch == ',') {
  370. emit_single_char_token(Token::Type::Comma);
  371. continue;
  372. }
  373. if (ch == '+') {
  374. begin_token();
  375. consume();
  376. if (peek() == '+') {
  377. consume();
  378. commit_token(Token::Type::PlusPlus);
  379. continue;
  380. }
  381. if (peek() == '=') {
  382. consume();
  383. commit_token(Token::Type::PlusEquals);
  384. continue;
  385. }
  386. commit_token(Token::Type::Plus);
  387. continue;
  388. }
  389. if (ch == '-') {
  390. begin_token();
  391. consume();
  392. if (peek() == '-') {
  393. consume();
  394. commit_token(Token::Type::MinusMinus);
  395. continue;
  396. }
  397. if (peek() == '=') {
  398. consume();
  399. commit_token(Token::Type::MinusEquals);
  400. continue;
  401. }
  402. if (peek() == '>') {
  403. consume();
  404. if (peek() == '*') {
  405. consume();
  406. commit_token(Token::Type::ArrowAsterisk);
  407. continue;
  408. }
  409. commit_token(Token::Type::Arrow);
  410. continue;
  411. }
  412. commit_token(Token::Type::Minus);
  413. continue;
  414. }
  415. if (ch == '*') {
  416. emit_token_equals(Token::Type::Asterisk, Token::Type::AsteriskEquals);
  417. continue;
  418. }
  419. if (ch == '%') {
  420. emit_token_equals(Token::Type::Percent, Token::Type::PercentEquals);
  421. continue;
  422. }
  423. if (ch == '^') {
  424. emit_token_equals(Token::Type::Caret, Token::Type::CaretEquals);
  425. continue;
  426. }
  427. if (ch == '!') {
  428. emit_token_equals(Token::Type::ExclamationMark, Token::Type::ExclamationMarkEquals);
  429. continue;
  430. }
  431. if (ch == '=') {
  432. emit_token_equals(Token::Type::Equals, Token::Type::EqualsEquals);
  433. continue;
  434. }
  435. if (ch == '&') {
  436. begin_token();
  437. consume();
  438. if (peek() == '&') {
  439. consume();
  440. commit_token(Token::Type::AndAnd);
  441. continue;
  442. }
  443. if (peek() == '=') {
  444. consume();
  445. commit_token(Token::Type::AndEquals);
  446. continue;
  447. }
  448. commit_token(Token::Type::And);
  449. continue;
  450. }
  451. if (ch == '|') {
  452. begin_token();
  453. consume();
  454. if (peek() == '|') {
  455. consume();
  456. commit_token(Token::Type::PipePipe);
  457. continue;
  458. }
  459. if (peek() == '=') {
  460. consume();
  461. commit_token(Token::Type::PipeEquals);
  462. continue;
  463. }
  464. commit_token(Token::Type::Pipe);
  465. continue;
  466. }
  467. if (ch == '~') {
  468. emit_single_char_token(Token::Type::Tilde);
  469. continue;
  470. }
  471. if (ch == '?') {
  472. emit_single_char_token(Token::Type::QuestionMark);
  473. continue;
  474. }
  475. if (ch == ':') {
  476. begin_token();
  477. consume();
  478. if (peek() == ':') {
  479. consume();
  480. if (peek() == '*') {
  481. consume();
  482. commit_token(Token::Type::ColonColonAsterisk);
  483. continue;
  484. }
  485. commit_token(Token::Type::ColonColon);
  486. continue;
  487. }
  488. commit_token(Token::Type::Colon);
  489. continue;
  490. }
  491. if (ch == ';') {
  492. emit_single_char_token(Token::Type::Semicolon);
  493. continue;
  494. }
  495. if (ch == '.') {
  496. begin_token();
  497. consume();
  498. if (peek() == '*') {
  499. consume();
  500. commit_token(Token::Type::DotAsterisk);
  501. continue;
  502. }
  503. commit_token(Token::Type::Dot);
  504. continue;
  505. }
  506. if (ch == '#') {
  507. begin_token();
  508. consume();
  509. if (is_valid_first_character_of_identifier(peek()))
  510. while (peek() && is_valid_nonfirst_character_of_identifier(peek()))
  511. consume();
  512. auto directive = StringView(m_input.characters_without_null_termination() + token_start_index, m_index - token_start_index);
  513. if (directive == "#include") {
  514. commit_token(Token::Type::IncludeStatement);
  515. if (is_ascii_space(peek())) {
  516. begin_token();
  517. do {
  518. consume();
  519. } while (is_ascii_space(peek()));
  520. commit_token(Token::Type::Whitespace);
  521. }
  522. begin_token();
  523. if (peek() == '<' || peek() == '"') {
  524. char closing = consume() == '<' ? '>' : '"';
  525. while (peek() && peek() != closing && peek() != '\n')
  526. consume();
  527. if (peek() && consume() == '\n') {
  528. commit_token(Token::Type::IncludePath);
  529. continue;
  530. }
  531. commit_token(Token::Type::IncludePath);
  532. begin_token();
  533. }
  534. } else {
  535. while (peek() && peek() != '\n')
  536. consume();
  537. commit_token(Token::Type::PreprocessorStatement);
  538. }
  539. continue;
  540. }
  541. if (ch == '/' && peek(1) == '/') {
  542. begin_token();
  543. while (peek() && peek() != '\n')
  544. consume();
  545. commit_token(Token::Type::Comment);
  546. continue;
  547. }
  548. if (ch == '/' && peek(1) == '*') {
  549. begin_token();
  550. consume();
  551. consume();
  552. bool comment_block_ends = false;
  553. while (peek()) {
  554. if (peek() == '*' && peek(1) == '/') {
  555. comment_block_ends = true;
  556. break;
  557. }
  558. consume();
  559. }
  560. if (comment_block_ends) {
  561. consume();
  562. consume();
  563. }
  564. commit_token(Token::Type::Comment);
  565. continue;
  566. }
  567. if (ch == '/') {
  568. emit_token_equals(Token::Type::Slash, Token::Type::SlashEquals);
  569. continue;
  570. }
  571. if (size_t prefix = match_string_prefix('"'); prefix > 0) {
  572. begin_token();
  573. for (size_t i = 0; i < prefix; ++i)
  574. consume();
  575. while (peek()) {
  576. if (peek() == '\\') {
  577. if (size_t escape = match_escape_sequence(); escape > 0) {
  578. commit_token(Token::Type::DoubleQuotedString);
  579. begin_token();
  580. for (size_t i = 0; i < escape; ++i)
  581. consume();
  582. commit_token(Token::Type::EscapeSequence);
  583. begin_token();
  584. continue;
  585. }
  586. }
  587. // If string is not terminated - stop before EOF
  588. if (!peek(1))
  589. break;
  590. if (consume() == '"')
  591. break;
  592. }
  593. commit_token(Token::Type::DoubleQuotedString);
  594. continue;
  595. }
  596. if (size_t prefix = match_string_prefix('R'); prefix > 0 && peek(prefix) == '"') {
  597. begin_token();
  598. for (size_t i = 0; i < prefix + 1; ++i)
  599. consume();
  600. size_t prefix_start = m_index;
  601. while (peek() && peek() != '(')
  602. consume();
  603. StringView prefix_string = m_input.substring_view(prefix_start, m_index - prefix_start);
  604. while (peek()) {
  605. if (consume() == '"') {
  606. VERIFY(m_index >= prefix_string.length() + 2);
  607. VERIFY(m_input[m_index - 1] == '"');
  608. if (m_input[m_index - 1 - prefix_string.length() - 1] == ')') {
  609. StringView suffix_string = m_input.substring_view(m_index - 1 - prefix_string.length(), prefix_string.length());
  610. if (prefix_string == suffix_string)
  611. break;
  612. }
  613. }
  614. }
  615. commit_token(Token::Type::RawString);
  616. continue;
  617. }
  618. if (size_t prefix = match_string_prefix('\''); prefix > 0) {
  619. begin_token();
  620. for (size_t i = 0; i < prefix; ++i)
  621. consume();
  622. while (peek()) {
  623. if (peek() == '\\') {
  624. if (size_t escape = match_escape_sequence(); escape > 0) {
  625. commit_token(Token::Type::SingleQuotedString);
  626. begin_token();
  627. for (size_t i = 0; i < escape; ++i)
  628. consume();
  629. commit_token(Token::Type::EscapeSequence);
  630. begin_token();
  631. continue;
  632. }
  633. }
  634. if (consume() == '\'')
  635. break;
  636. }
  637. commit_token(Token::Type::SingleQuotedString);
  638. continue;
  639. }
  640. if (is_ascii_digit(ch) || (ch == '.' && is_ascii_digit(peek(1)))) {
  641. begin_token();
  642. consume();
  643. auto type = ch == '.' ? Token::Type::Float : Token::Type::Integer;
  644. bool is_hex = false;
  645. bool is_binary = false;
  646. auto match_exponent = [&]() -> size_t {
  647. char ch = peek();
  648. if (ch != 'e' && ch != 'E' && ch != 'p' && ch != 'P')
  649. return 0;
  650. type = Token::Type::Float;
  651. size_t length = 1;
  652. ch = peek(length);
  653. if (ch == '+' || ch == '-') {
  654. ++length;
  655. }
  656. for (ch = peek(length); is_ascii_digit(ch); ch = peek(length)) {
  657. ++length;
  658. }
  659. return length;
  660. };
  661. auto match_type_literal = [&]() -> size_t {
  662. size_t length = 0;
  663. for (;;) {
  664. char ch = peek(length);
  665. if ((ch == 'u' || ch == 'U') && type == Token::Type::Integer) {
  666. ++length;
  667. } else if ((ch == 'f' || ch == 'F') && !is_binary) {
  668. type = Token::Type::Float;
  669. ++length;
  670. } else if (ch == 'l' || ch == 'L') {
  671. ++length;
  672. } else
  673. return length;
  674. }
  675. };
  676. if (peek() == 'b' || peek() == 'B') {
  677. consume();
  678. is_binary = true;
  679. for (char ch = peek(); ch == '0' || ch == '1' || (ch == '\'' && peek(1) != '\''); ch = peek()) {
  680. consume();
  681. }
  682. } else {
  683. if (peek() == 'x' || peek() == 'X') {
  684. consume();
  685. is_hex = true;
  686. }
  687. for (char ch = peek(); (is_hex ? is_ascii_hex_digit(ch) : is_ascii_digit(ch)) || (ch == '\'' && peek(1) != '\'') || ch == '.'; ch = peek()) {
  688. if (ch == '.') {
  689. if (type == Token::Type::Integer) {
  690. type = Token::Type::Float;
  691. } else
  692. break;
  693. };
  694. consume();
  695. }
  696. }
  697. if (!is_binary) {
  698. size_t length = match_exponent();
  699. for (size_t i = 0; i < length; ++i)
  700. consume();
  701. }
  702. size_t length = match_type_literal();
  703. for (size_t i = 0; i < length; ++i)
  704. consume();
  705. commit_token(type);
  706. continue;
  707. }
  708. if (is_valid_first_character_of_identifier(ch)) {
  709. begin_token();
  710. while (peek() && is_valid_nonfirst_character_of_identifier(peek()))
  711. consume();
  712. auto token_view = StringView(m_input.characters_without_null_termination() + token_start_index, m_index - token_start_index);
  713. if (is_keyword(token_view))
  714. commit_token(Token::Type::Keyword);
  715. else if (is_known_type(token_view))
  716. commit_token(Token::Type::KnownType);
  717. else
  718. commit_token(Token::Type::Identifier);
  719. continue;
  720. }
  721. if (ch == '\\' && peek(1) == '\n') {
  722. consume();
  723. consume();
  724. continue;
  725. }
  726. dbgln("Unimplemented token character: {}", ch);
  727. emit_single_char_token(Token::Type::Unknown);
  728. }
  729. return tokens;
  730. }
  731. }