Lexer.cpp 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812
  1. /*
  2. * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include "Lexer.h"
  7. #include <AK/CharacterTypes.h>
  8. #include <AK/Function.h>
  9. #include <AK/HashTable.h>
  10. #include <AK/StdLibExtras.h>
  11. #include <AK/String.h>
  12. namespace Cpp {
  13. Lexer::Lexer(StringView input, size_t start_line)
  14. : m_input(input)
  15. , m_previous_position { start_line, 0 }
  16. , m_position { start_line, 0 }
  17. {
  18. }
  19. char Lexer::peek(size_t offset) const
  20. {
  21. if ((m_index + offset) >= m_input.length())
  22. return 0;
  23. return m_input[m_index + offset];
  24. }
  25. char Lexer::consume()
  26. {
  27. VERIFY(m_index < m_input.length());
  28. char ch = m_input[m_index++];
  29. m_previous_position = m_position;
  30. if (ch == '\n') {
  31. m_position.line++;
  32. m_position.column = 0;
  33. } else {
  34. m_position.column++;
  35. }
  36. return ch;
  37. }
  38. constexpr bool is_valid_first_character_of_identifier(char ch)
  39. {
  40. return is_ascii_alpha(ch) || ch == '_' || ch == '$';
  41. }
  42. constexpr bool is_valid_nonfirst_character_of_identifier(char ch)
  43. {
  44. return is_valid_first_character_of_identifier(ch) || is_ascii_digit(ch);
  45. }
  46. constexpr StringView s_known_keywords[] = {
  47. "alignas",
  48. "alignof",
  49. "and",
  50. "and_eq",
  51. "asm",
  52. "bitand",
  53. "bitor",
  54. "break",
  55. "case",
  56. "catch",
  57. "class",
  58. "compl",
  59. "const",
  60. "const_cast",
  61. "constexpr",
  62. "continue",
  63. "decltype",
  64. "default",
  65. "delete",
  66. "do",
  67. "dynamic_cast",
  68. "else",
  69. "enum",
  70. "explicit",
  71. "export",
  72. "extern",
  73. "false",
  74. "final",
  75. "for",
  76. "friend",
  77. "goto",
  78. "if",
  79. "inline",
  80. "mutable",
  81. "namespace",
  82. "new",
  83. "noexcept",
  84. "not",
  85. "not_eq",
  86. "nullptr",
  87. "operator",
  88. "or",
  89. "or_eq",
  90. "override",
  91. "private",
  92. "protected",
  93. "public",
  94. "register",
  95. "reinterpret_cast",
  96. "return",
  97. "signed",
  98. "sizeof",
  99. "static",
  100. "static_assert",
  101. "static_cast",
  102. "struct",
  103. "switch",
  104. "template",
  105. "this",
  106. "thread_local",
  107. "throw",
  108. "true",
  109. "try",
  110. "typedef",
  111. "typeid",
  112. "typename",
  113. "union",
  114. "using",
  115. "virtual",
  116. "volatile",
  117. "while",
  118. "xor",
  119. "xor_eq"
  120. };
  121. constexpr StringView s_known_types[] = {
  122. "Array",
  123. "Array",
  124. "Badge",
  125. "Bitmap",
  126. "ByteBuffer",
  127. "Bytes",
  128. "Checked",
  129. "CircularDeque",
  130. "CircularQueue",
  131. "Deque",
  132. "DoublyLinkedList",
  133. "Error",
  134. "ErrorOr",
  135. "FlyString",
  136. "Function",
  137. "HashMap",
  138. "HashTable",
  139. "IPv4Address",
  140. "IntrusiveList",
  141. "IntrusiveList",
  142. "JsonArray",
  143. "JsonObject",
  144. "JsonValue",
  145. "LexicalPath",
  146. "MappedFile",
  147. "NetworkOrdered",
  148. "NeverDestroyed",
  149. "NonnullOwnPtr",
  150. "NonnullOwnPtrVector",
  151. "NonnullRefPtr",
  152. "NonnullRefPtrVector",
  153. "Optional",
  154. "OwnPtr",
  155. "ReadonlyBytes",
  156. "RedBlackTree",
  157. "RefPtr",
  158. "Result",
  159. "ScopeGuard",
  160. "Singleton",
  161. "SinglyLinkedList",
  162. "Span",
  163. "String",
  164. "StringBuilder",
  165. "StringImpl",
  166. "StringView",
  167. "Utf8View",
  168. "Variant",
  169. "Vector",
  170. "WeakPtr",
  171. "auto",
  172. "bool",
  173. "char",
  174. "char16_t",
  175. "char32_t",
  176. "char8_t",
  177. "double",
  178. "float",
  179. "i16",
  180. "i32",
  181. "i64",
  182. "i8",
  183. "int",
  184. "int",
  185. "long",
  186. "short",
  187. "signed",
  188. "u16",
  189. "u32",
  190. "u64",
  191. "u8",
  192. "unsigned",
  193. "void",
  194. "wchar_t",
  195. };
  196. static bool is_keyword(StringView string)
  197. {
  198. static HashTable<String> keywords(array_size(s_known_keywords));
  199. if (keywords.is_empty()) {
  200. keywords.set_from(s_known_keywords);
  201. }
  202. return keywords.contains(string);
  203. }
  204. static bool is_known_type(StringView string)
  205. {
  206. static HashTable<String> types(array_size(s_known_types));
  207. if (types.is_empty()) {
  208. types.set_from(s_known_types);
  209. }
  210. return types.contains(string);
  211. }
  212. void Lexer::lex_impl(Function<void(Token)> callback)
  213. {
  214. size_t token_start_index = 0;
  215. Position token_start_position;
  216. auto emit_single_char_token = [&](auto type) {
  217. callback(Token(type, m_position, m_position, m_input.substring_view(m_index, 1)));
  218. consume();
  219. };
  220. auto begin_token = [&] {
  221. token_start_index = m_index;
  222. token_start_position = m_position;
  223. };
  224. auto commit_token = [&](auto type) {
  225. if (m_options.ignore_whitespace && type == Token::Type::Whitespace)
  226. return;
  227. callback(Token(type, token_start_position, m_previous_position, m_input.substring_view(token_start_index, m_index - token_start_index)));
  228. };
  229. auto emit_token_equals = [&](auto type, auto equals_type) {
  230. if (peek(1) == '=') {
  231. begin_token();
  232. consume();
  233. consume();
  234. commit_token(equals_type);
  235. return;
  236. }
  237. emit_single_char_token(type);
  238. };
  239. auto match_escape_sequence = [&]() -> size_t {
  240. switch (peek(1)) {
  241. case '\'':
  242. case '"':
  243. case '?':
  244. case '\\':
  245. case 'a':
  246. case 'b':
  247. case 'f':
  248. case 'n':
  249. case 'r':
  250. case 't':
  251. case 'v':
  252. return 2;
  253. case '0':
  254. case '1':
  255. case '2':
  256. case '3':
  257. case '4':
  258. case '5':
  259. case '6':
  260. case '7': {
  261. size_t octal_digits = 1;
  262. for (size_t i = 0; i < 2; ++i) {
  263. char next = peek(2 + i);
  264. if (next < '0' || next > '7')
  265. break;
  266. ++octal_digits;
  267. }
  268. return 1 + octal_digits;
  269. }
  270. case 'x': {
  271. size_t hex_digits = 0;
  272. while (is_ascii_hex_digit(peek(2 + hex_digits)))
  273. ++hex_digits;
  274. return 2 + hex_digits;
  275. }
  276. case 'u':
  277. case 'U': {
  278. bool is_unicode = true;
  279. size_t number_of_digits = peek(1) == 'u' ? 4 : 8;
  280. for (size_t i = 0; i < number_of_digits; ++i) {
  281. if (!is_ascii_hex_digit(peek(2 + i))) {
  282. is_unicode = false;
  283. break;
  284. }
  285. }
  286. return is_unicode ? 2 + number_of_digits : 0;
  287. }
  288. default:
  289. return 0;
  290. }
  291. };
  292. auto match_string_prefix = [&](char quote) -> size_t {
  293. if (peek() == quote)
  294. return 1;
  295. if (peek() == 'L' && peek(1) == quote)
  296. return 2;
  297. if (peek() == 'u') {
  298. if (peek(1) == quote)
  299. return 2;
  300. if (peek(1) == '8' && peek(2) == quote)
  301. return 3;
  302. }
  303. if (peek() == 'U' && peek(1) == quote)
  304. return 2;
  305. return 0;
  306. };
  307. while (m_index < m_input.length()) {
  308. auto ch = peek();
  309. if (is_ascii_space(ch)) {
  310. begin_token();
  311. while (is_ascii_space(peek()))
  312. consume();
  313. commit_token(Token::Type::Whitespace);
  314. continue;
  315. }
  316. if (ch == '(') {
  317. emit_single_char_token(Token::Type::LeftParen);
  318. continue;
  319. }
  320. if (ch == ')') {
  321. emit_single_char_token(Token::Type::RightParen);
  322. continue;
  323. }
  324. if (ch == '{') {
  325. emit_single_char_token(Token::Type::LeftCurly);
  326. continue;
  327. }
  328. if (ch == '}') {
  329. emit_single_char_token(Token::Type::RightCurly);
  330. continue;
  331. }
  332. if (ch == '[') {
  333. emit_single_char_token(Token::Type::LeftBracket);
  334. continue;
  335. }
  336. if (ch == ']') {
  337. emit_single_char_token(Token::Type::RightBracket);
  338. continue;
  339. }
  340. if (ch == '<') {
  341. begin_token();
  342. consume();
  343. if (peek() == '<') {
  344. consume();
  345. if (peek() == '=') {
  346. consume();
  347. commit_token(Token::Type::LessLessEquals);
  348. continue;
  349. }
  350. commit_token(Token::Type::LessLess);
  351. continue;
  352. }
  353. if (peek() == '=') {
  354. consume();
  355. commit_token(Token::Type::LessEquals);
  356. continue;
  357. }
  358. if (peek() == '>') {
  359. consume();
  360. commit_token(Token::Type::LessGreater);
  361. continue;
  362. }
  363. commit_token(Token::Type::Less);
  364. continue;
  365. }
  366. if (ch == '>') {
  367. begin_token();
  368. consume();
  369. if (peek() == '>') {
  370. consume();
  371. if (peek() == '=') {
  372. consume();
  373. commit_token(Token::Type::GreaterGreaterEquals);
  374. continue;
  375. }
  376. commit_token(Token::Type::GreaterGreater);
  377. continue;
  378. }
  379. if (peek() == '=') {
  380. consume();
  381. commit_token(Token::Type::GreaterEquals);
  382. continue;
  383. }
  384. commit_token(Token::Type::Greater);
  385. continue;
  386. }
  387. if (ch == ',') {
  388. emit_single_char_token(Token::Type::Comma);
  389. continue;
  390. }
  391. if (ch == '+') {
  392. begin_token();
  393. consume();
  394. if (peek() == '+') {
  395. consume();
  396. commit_token(Token::Type::PlusPlus);
  397. continue;
  398. }
  399. if (peek() == '=') {
  400. consume();
  401. commit_token(Token::Type::PlusEquals);
  402. continue;
  403. }
  404. commit_token(Token::Type::Plus);
  405. continue;
  406. }
  407. if (ch == '-') {
  408. begin_token();
  409. consume();
  410. if (peek() == '-') {
  411. consume();
  412. commit_token(Token::Type::MinusMinus);
  413. continue;
  414. }
  415. if (peek() == '=') {
  416. consume();
  417. commit_token(Token::Type::MinusEquals);
  418. continue;
  419. }
  420. if (peek() == '>') {
  421. consume();
  422. if (peek() == '*') {
  423. consume();
  424. commit_token(Token::Type::ArrowAsterisk);
  425. continue;
  426. }
  427. commit_token(Token::Type::Arrow);
  428. continue;
  429. }
  430. commit_token(Token::Type::Minus);
  431. continue;
  432. }
  433. if (ch == '*') {
  434. emit_token_equals(Token::Type::Asterisk, Token::Type::AsteriskEquals);
  435. continue;
  436. }
  437. if (ch == '%') {
  438. emit_token_equals(Token::Type::Percent, Token::Type::PercentEquals);
  439. continue;
  440. }
  441. if (ch == '^') {
  442. emit_token_equals(Token::Type::Caret, Token::Type::CaretEquals);
  443. continue;
  444. }
  445. if (ch == '!') {
  446. emit_token_equals(Token::Type::ExclamationMark, Token::Type::ExclamationMarkEquals);
  447. continue;
  448. }
  449. if (ch == '=') {
  450. emit_token_equals(Token::Type::Equals, Token::Type::EqualsEquals);
  451. continue;
  452. }
  453. if (ch == '&') {
  454. begin_token();
  455. consume();
  456. if (peek() == '&') {
  457. consume();
  458. commit_token(Token::Type::AndAnd);
  459. continue;
  460. }
  461. if (peek() == '=') {
  462. consume();
  463. commit_token(Token::Type::AndEquals);
  464. continue;
  465. }
  466. commit_token(Token::Type::And);
  467. continue;
  468. }
  469. if (ch == '|') {
  470. begin_token();
  471. consume();
  472. if (peek() == '|') {
  473. consume();
  474. commit_token(Token::Type::PipePipe);
  475. continue;
  476. }
  477. if (peek() == '=') {
  478. consume();
  479. commit_token(Token::Type::PipeEquals);
  480. continue;
  481. }
  482. commit_token(Token::Type::Pipe);
  483. continue;
  484. }
  485. if (ch == '~') {
  486. emit_single_char_token(Token::Type::Tilde);
  487. continue;
  488. }
  489. if (ch == '?') {
  490. emit_single_char_token(Token::Type::QuestionMark);
  491. continue;
  492. }
  493. if (ch == ':') {
  494. begin_token();
  495. consume();
  496. if (peek() == ':') {
  497. consume();
  498. if (peek() == '*') {
  499. consume();
  500. commit_token(Token::Type::ColonColonAsterisk);
  501. continue;
  502. }
  503. commit_token(Token::Type::ColonColon);
  504. continue;
  505. }
  506. commit_token(Token::Type::Colon);
  507. continue;
  508. }
  509. if (ch == ';') {
  510. emit_single_char_token(Token::Type::Semicolon);
  511. continue;
  512. }
  513. if (ch == '.') {
  514. begin_token();
  515. consume();
  516. if (peek() == '*') {
  517. consume();
  518. commit_token(Token::Type::DotAsterisk);
  519. continue;
  520. }
  521. commit_token(Token::Type::Dot);
  522. continue;
  523. }
  524. if (ch == '#') {
  525. begin_token();
  526. consume();
  527. while (AK::is_ascii_space(peek()))
  528. consume();
  529. size_t directive_start = m_index;
  530. if (is_valid_first_character_of_identifier(peek()))
  531. while (peek() && is_valid_nonfirst_character_of_identifier(peek()))
  532. consume();
  533. auto directive = StringView(m_input.characters_without_null_termination() + directive_start, m_index - directive_start);
  534. if (directive == "include"sv) {
  535. commit_token(Token::Type::IncludeStatement);
  536. if (is_ascii_space(peek())) {
  537. begin_token();
  538. do {
  539. consume();
  540. } while (is_ascii_space(peek()));
  541. commit_token(Token::Type::Whitespace);
  542. }
  543. begin_token();
  544. if (peek() == '<' || peek() == '"') {
  545. char closing = consume() == '<' ? '>' : '"';
  546. while (peek() && peek() != closing && peek() != '\n')
  547. consume();
  548. if (peek() && consume() == '\n') {
  549. commit_token(Token::Type::IncludePath);
  550. continue;
  551. }
  552. commit_token(Token::Type::IncludePath);
  553. begin_token();
  554. }
  555. } else {
  556. while (peek()) {
  557. if (peek() == '\\' && peek(1) == '\n') {
  558. consume();
  559. consume();
  560. } else if (peek() == '\n') {
  561. break;
  562. } else {
  563. consume();
  564. }
  565. }
  566. commit_token(Token::Type::PreprocessorStatement);
  567. }
  568. continue;
  569. }
  570. if (ch == '/' && peek(1) == '/') {
  571. begin_token();
  572. while (peek() && peek() != '\n')
  573. consume();
  574. commit_token(Token::Type::Comment);
  575. continue;
  576. }
  577. if (ch == '/' && peek(1) == '*') {
  578. begin_token();
  579. consume();
  580. consume();
  581. bool comment_block_ends = false;
  582. while (peek()) {
  583. if (peek() == '*' && peek(1) == '/') {
  584. comment_block_ends = true;
  585. break;
  586. }
  587. consume();
  588. }
  589. if (comment_block_ends) {
  590. consume();
  591. consume();
  592. }
  593. commit_token(Token::Type::Comment);
  594. continue;
  595. }
  596. if (ch == '/') {
  597. emit_token_equals(Token::Type::Slash, Token::Type::SlashEquals);
  598. continue;
  599. }
  600. if (size_t prefix = match_string_prefix('"'); prefix > 0) {
  601. begin_token();
  602. for (size_t i = 0; i < prefix; ++i)
  603. consume();
  604. while (peek()) {
  605. if (peek() == '\\') {
  606. if (size_t escape = match_escape_sequence(); escape > 0) {
  607. commit_token(Token::Type::DoubleQuotedString);
  608. begin_token();
  609. for (size_t i = 0; i < escape; ++i)
  610. consume();
  611. commit_token(Token::Type::EscapeSequence);
  612. begin_token();
  613. continue;
  614. }
  615. }
  616. // If string is not terminated - stop before EOF
  617. if (!peek(1))
  618. break;
  619. if (consume() == '"')
  620. break;
  621. }
  622. commit_token(Token::Type::DoubleQuotedString);
  623. continue;
  624. }
  625. if (size_t prefix = match_string_prefix('R'); prefix > 0 && peek(prefix) == '"') {
  626. begin_token();
  627. for (size_t i = 0; i < prefix + 1; ++i)
  628. consume();
  629. size_t prefix_start = m_index;
  630. while (peek() && peek() != '(')
  631. consume();
  632. StringView prefix_string = m_input.substring_view(prefix_start, m_index - prefix_start);
  633. while (peek()) {
  634. if (consume() == '"') {
  635. VERIFY(m_index >= prefix_string.length() + 2);
  636. VERIFY(m_input[m_index - 1] == '"');
  637. if (m_input[m_index - 1 - prefix_string.length() - 1] == ')') {
  638. StringView suffix_string = m_input.substring_view(m_index - 1 - prefix_string.length(), prefix_string.length());
  639. if (prefix_string == suffix_string)
  640. break;
  641. }
  642. }
  643. }
  644. commit_token(Token::Type::RawString);
  645. continue;
  646. }
  647. if (size_t prefix = match_string_prefix('\''); prefix > 0) {
  648. begin_token();
  649. for (size_t i = 0; i < prefix; ++i)
  650. consume();
  651. while (peek()) {
  652. if (peek() == '\\') {
  653. if (size_t escape = match_escape_sequence(); escape > 0) {
  654. commit_token(Token::Type::SingleQuotedString);
  655. begin_token();
  656. for (size_t i = 0; i < escape; ++i)
  657. consume();
  658. commit_token(Token::Type::EscapeSequence);
  659. begin_token();
  660. continue;
  661. }
  662. }
  663. if (consume() == '\'')
  664. break;
  665. }
  666. commit_token(Token::Type::SingleQuotedString);
  667. continue;
  668. }
  669. if (is_ascii_digit(ch) || (ch == '.' && is_ascii_digit(peek(1)))) {
  670. begin_token();
  671. consume();
  672. auto type = ch == '.' ? Token::Type::Float : Token::Type::Integer;
  673. bool is_hex = false;
  674. bool is_binary = false;
  675. auto match_exponent = [&]() -> size_t {
  676. char ch = peek();
  677. if (ch != 'e' && ch != 'E' && ch != 'p' && ch != 'P')
  678. return 0;
  679. type = Token::Type::Float;
  680. size_t length = 1;
  681. ch = peek(length);
  682. if (ch == '+' || ch == '-') {
  683. ++length;
  684. }
  685. for (ch = peek(length); is_ascii_digit(ch); ch = peek(length)) {
  686. ++length;
  687. }
  688. return length;
  689. };
  690. auto match_type_literal = [&]() -> size_t {
  691. size_t length = 0;
  692. for (;;) {
  693. char ch = peek(length);
  694. if ((ch == 'u' || ch == 'U') && type == Token::Type::Integer) {
  695. ++length;
  696. } else if ((ch == 'f' || ch == 'F') && !is_binary) {
  697. type = Token::Type::Float;
  698. ++length;
  699. } else if (ch == 'l' || ch == 'L') {
  700. ++length;
  701. } else
  702. return length;
  703. }
  704. };
  705. if (peek() == 'b' || peek() == 'B') {
  706. consume();
  707. is_binary = true;
  708. for (char ch = peek(); ch == '0' || ch == '1' || (ch == '\'' && peek(1) != '\''); ch = peek()) {
  709. consume();
  710. }
  711. } else {
  712. if (peek() == 'x' || peek() == 'X') {
  713. consume();
  714. is_hex = true;
  715. }
  716. for (char ch = peek(); (is_hex ? is_ascii_hex_digit(ch) : is_ascii_digit(ch)) || (ch == '\'' && peek(1) != '\'') || ch == '.'; ch = peek()) {
  717. if (ch == '.') {
  718. if (type == Token::Type::Integer) {
  719. type = Token::Type::Float;
  720. } else
  721. break;
  722. };
  723. consume();
  724. }
  725. }
  726. if (!is_binary) {
  727. size_t length = match_exponent();
  728. for (size_t i = 0; i < length; ++i)
  729. consume();
  730. }
  731. size_t length = match_type_literal();
  732. for (size_t i = 0; i < length; ++i)
  733. consume();
  734. commit_token(type);
  735. continue;
  736. }
  737. if (is_valid_first_character_of_identifier(ch)) {
  738. begin_token();
  739. while (peek() && is_valid_nonfirst_character_of_identifier(peek()))
  740. consume();
  741. auto token_view = StringView(m_input.characters_without_null_termination() + token_start_index, m_index - token_start_index);
  742. if (is_keyword(token_view))
  743. commit_token(Token::Type::Keyword);
  744. else if (is_known_type(token_view))
  745. commit_token(Token::Type::KnownType);
  746. else
  747. commit_token(Token::Type::Identifier);
  748. continue;
  749. }
  750. if (ch == '\\' && peek(1) == '\n') {
  751. consume();
  752. consume();
  753. continue;
  754. }
  755. dbgln("Unimplemented token character: {}", ch);
  756. emit_single_char_token(Token::Type::Unknown);
  757. }
  758. }
  759. Vector<Token> Lexer::lex()
  760. {
  761. Vector<Token> tokens;
  762. lex_impl([&](auto token) {
  763. tokens.append(move(token));
  764. });
  765. return tokens;
  766. }
  767. }