CppLexer.cpp 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192
  1. #include "CppLexer.h"
  2. #include <AK/LogStream.h>
  3. #include <ctype.h>
  4. CppLexer::CppLexer(const StringView& input)
  5. : m_input(input)
  6. {
  7. }
  8. char CppLexer::peek(int offset) const
  9. {
  10. if ((m_index + offset) >= m_input.length())
  11. return 0;
  12. return m_input[m_index + offset];
  13. }
  14. char CppLexer::consume()
  15. {
  16. ASSERT(m_index < m_input.length());
  17. char ch = m_input[m_index++];
  18. m_previous_position = m_position;
  19. if (ch == '\n') {
  20. m_position.line++;
  21. m_position.column = 0;
  22. } else {
  23. m_position.column++;
  24. }
  25. return ch;
  26. }
  // Returns true if `ch` may start an identifier: an alphabetic character
  // (as classified by isalpha()), an underscore, or a dollar sign.
  static bool is_valid_first_character_of_identifier(char ch)
  {
      // The <ctype.h> classifiers require a value representable as
      // unsigned char (or EOF); passing a negative plain char is undefined
      // behavior, so convert before classifying.
      return isalpha(static_cast<unsigned char>(ch)) || ch == '_' || ch == '$';
  }
  31. static bool is_valid_nonfirst_character_of_identifier(char ch)
  32. {
  33. return is_valid_first_character_of_identifier(ch) || isdigit(ch);
  34. }
  35. static bool is_keyword(const StringView& string)
  36. {
  37. if (string == "int" || string == "char" || string == "return")
  38. return true;
  39. return false;
  40. }
  41. Vector<CppToken> CppLexer::lex()
  42. {
  43. Vector<CppToken> tokens;
  44. int token_start_index = 0;
  45. CppPosition token_start_position;
  46. auto emit_token = [&](auto type) {
  47. CppToken token;
  48. token.m_type = type;
  49. token.m_start = m_position;
  50. token.m_end = m_position;
  51. tokens.append(token);
  52. consume();
  53. };
  54. auto begin_token = [&] {
  55. token_start_index = m_index;
  56. token_start_position = m_position;
  57. };
  58. auto commit_token = [&](auto type) {
  59. CppToken token;
  60. token.m_type = type;
  61. token.m_start = token_start_position;
  62. token.m_end = m_previous_position;
  63. tokens.append(token);
  64. };
  65. while (m_index < m_input.length()) {
  66. auto ch = peek();
  67. if (isspace(ch)) {
  68. begin_token();
  69. while (isspace(peek()))
  70. consume();
  71. commit_token(CppToken::Type::Whitespace);
  72. continue;
  73. }
  74. if (ch == '(') {
  75. emit_token(CppToken::Type::LeftParen);
  76. continue;
  77. }
  78. if (ch == ')') {
  79. emit_token(CppToken::Type::RightParen);
  80. continue;
  81. }
  82. if (ch == '{') {
  83. emit_token(CppToken::Type::LeftCurly);
  84. continue;
  85. }
  86. if (ch == '}') {
  87. emit_token(CppToken::Type::RightCurly);
  88. continue;
  89. }
  90. if (ch == '[') {
  91. emit_token(CppToken::Type::LeftBracket);
  92. continue;
  93. }
  94. if (ch == ']') {
  95. emit_token(CppToken::Type::RightBracket);
  96. continue;
  97. }
  98. if (ch == ',') {
  99. emit_token(CppToken::Type::Comma);
  100. continue;
  101. }
  102. if (ch == '*') {
  103. emit_token(CppToken::Type::Asterisk);
  104. continue;
  105. }
  106. if (ch == ';') {
  107. emit_token(CppToken::Type::Semicolon);
  108. continue;
  109. }
  110. if (ch == '#') {
  111. begin_token();
  112. while (peek() && peek() != '\n')
  113. consume();
  114. commit_token(CppToken::Type::PreprocessorStatement);
  115. continue;
  116. }
  117. if (ch == '/' && peek(1) == '/') {
  118. begin_token();
  119. while (peek() && peek() != '\n')
  120. consume();
  121. commit_token(CppToken::Type::Comment);
  122. continue;
  123. }
  124. if (ch == '/' && peek(1) == '*') {
  125. begin_token();
  126. consume();
  127. consume();
  128. while (peek()) {
  129. if (peek() == '*' && peek(1) == '/')
  130. break;
  131. consume();
  132. }
  133. consume();
  134. consume();
  135. emit_token(CppToken::Type::Comment);
  136. continue;
  137. }
  138. if (ch == '"') {
  139. begin_token();
  140. consume();
  141. while (peek()) {
  142. if (consume() == '"')
  143. break;
  144. }
  145. commit_token(CppToken::Type::DoubleQuotedString);
  146. continue;
  147. }
  148. if (ch == '\'') {
  149. begin_token();
  150. consume();
  151. while (peek()) {
  152. if (consume() == '\'')
  153. break;
  154. }
  155. commit_token(CppToken::Type::SingleQuotedString);
  156. continue;
  157. }
  158. if (isdigit(ch)) {
  159. begin_token();
  160. while (peek() && isdigit(peek())) {
  161. consume();
  162. }
  163. commit_token(CppToken::Type::Number);
  164. continue;
  165. }
  166. if (is_valid_first_character_of_identifier(ch)) {
  167. begin_token();
  168. while (peek() && is_valid_nonfirst_character_of_identifier(peek()))
  169. consume();
  170. auto token_view = StringView(m_input.characters_without_null_termination() + token_start_index, m_index - token_start_index);
  171. if (is_keyword(token_view))
  172. commit_token(CppToken::Type::Keyword);
  173. else
  174. commit_token(CppToken::Type::Identifier);
  175. continue;
  176. }
  177. dbg() << "Unimplemented token character: " << ch;
  178. ASSERT_NOT_REACHED();
  179. }
  180. return tokens;
  181. }