CppLexer.cpp 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279
  1. #include "CppLexer.h"
  2. #include <AK/HashTable.h>
  3. #include <AK/String.h>
  4. #include <ctype.h>
  5. CppLexer::CppLexer(const StringView& input)
  6. : m_input(input)
  7. {
  8. }
  9. char CppLexer::peek(int offset) const
  10. {
  11. if ((m_index + offset) >= m_input.length())
  12. return 0;
  13. return m_input[m_index + offset];
  14. }
  15. char CppLexer::consume()
  16. {
  17. ASSERT(m_index < m_input.length());
  18. char ch = m_input[m_index++];
  19. m_previous_position = m_position;
  20. if (ch == '\n') {
  21. m_position.line++;
  22. m_position.column = 0;
  23. } else {
  24. m_position.column++;
  25. }
  26. return ch;
  27. }
  28. static bool is_valid_first_character_of_identifier(char ch)
  29. {
  30. return isalpha(ch) || ch == '_' || ch == '$';
  31. }
  32. static bool is_valid_nonfirst_character_of_identifier(char ch)
  33. {
  34. return is_valid_first_character_of_identifier(ch) || isdigit(ch);
  35. }
  36. static bool is_keyword(const StringView& string)
  37. {
  38. static HashTable<String> keywords;
  39. if (keywords.is_empty()) {
  40. keywords.set("alignas");
  41. keywords.set("alignof");
  42. keywords.set("and");
  43. keywords.set("and_eq");
  44. keywords.set("asm");
  45. keywords.set("auto");
  46. keywords.set("bitand");
  47. keywords.set("bitor");
  48. keywords.set("bool");
  49. keywords.set("break");
  50. keywords.set("case");
  51. keywords.set("catch");
  52. keywords.set("char");
  53. keywords.set("char8_t");
  54. keywords.set("char16_t");
  55. keywords.set("char32_t");
  56. keywords.set("class");
  57. keywords.set("compl");
  58. keywords.set("const");
  59. keywords.set("constexpr");
  60. keywords.set("const_cast");
  61. keywords.set("continue");
  62. keywords.set("decltype");
  63. keywords.set("default");
  64. keywords.set("delete");
  65. keywords.set("do");
  66. keywords.set("double");
  67. keywords.set("dynamic_cast");
  68. keywords.set("else");
  69. keywords.set("enum");
  70. keywords.set("explicit");
  71. keywords.set("export");
  72. keywords.set("extern");
  73. keywords.set("false");
  74. keywords.set("float");
  75. keywords.set("for");
  76. keywords.set("friend");
  77. keywords.set("goto");
  78. keywords.set("if");
  79. keywords.set("inline");
  80. keywords.set("int");
  81. keywords.set("long");
  82. keywords.set("mutable");
  83. keywords.set("namespace");
  84. keywords.set("new");
  85. keywords.set("noexcept");
  86. keywords.set("not");
  87. keywords.set("not_eq");
  88. keywords.set("nullptr");
  89. keywords.set("operator");
  90. keywords.set("or");
  91. keywords.set("or_eq");
  92. keywords.set("private");
  93. keywords.set("protected");
  94. keywords.set("public");
  95. keywords.set("register");
  96. keywords.set("reinterpret_cast");
  97. keywords.set("return");
  98. keywords.set("short");
  99. keywords.set("signed");
  100. keywords.set("sizeof");
  101. keywords.set("static");
  102. keywords.set("static_assert");
  103. keywords.set("static_cast");
  104. keywords.set("struct");
  105. keywords.set("switch");
  106. keywords.set("template");
  107. keywords.set("this");
  108. keywords.set("thread_local");
  109. keywords.set("throw");
  110. keywords.set("true");
  111. keywords.set("try");
  112. keywords.set("typedef");
  113. keywords.set("typeid");
  114. keywords.set("typename");
  115. keywords.set("union");
  116. keywords.set("unsigned");
  117. keywords.set("using");
  118. keywords.set("virtual");
  119. keywords.set("void");
  120. keywords.set("volatile");
  121. keywords.set("wchar_t");
  122. keywords.set("while");
  123. keywords.set("xor");
  124. keywords.set("xor_eq");
  125. }
  126. return keywords.contains(string);
  127. }
  128. Vector<CppToken> CppLexer::lex()
  129. {
  130. Vector<CppToken> tokens;
  131. int token_start_index = 0;
  132. CppPosition token_start_position;
  133. auto emit_token = [&](auto type) {
  134. CppToken token;
  135. token.m_type = type;
  136. token.m_start = m_position;
  137. token.m_end = m_position;
  138. tokens.append(token);
  139. consume();
  140. };
  141. auto begin_token = [&] {
  142. token_start_index = m_index;
  143. token_start_position = m_position;
  144. };
  145. auto commit_token = [&](auto type) {
  146. CppToken token;
  147. token.m_type = type;
  148. token.m_start = token_start_position;
  149. token.m_end = m_previous_position;
  150. tokens.append(token);
  151. };
  152. while (m_index < m_input.length()) {
  153. auto ch = peek();
  154. if (isspace(ch)) {
  155. begin_token();
  156. while (isspace(peek()))
  157. consume();
  158. commit_token(CppToken::Type::Whitespace);
  159. continue;
  160. }
  161. if (ch == '(') {
  162. emit_token(CppToken::Type::LeftParen);
  163. continue;
  164. }
  165. if (ch == ')') {
  166. emit_token(CppToken::Type::RightParen);
  167. continue;
  168. }
  169. if (ch == '{') {
  170. emit_token(CppToken::Type::LeftCurly);
  171. continue;
  172. }
  173. if (ch == '}') {
  174. emit_token(CppToken::Type::RightCurly);
  175. continue;
  176. }
  177. if (ch == '[') {
  178. emit_token(CppToken::Type::LeftBracket);
  179. continue;
  180. }
  181. if (ch == ']') {
  182. emit_token(CppToken::Type::RightBracket);
  183. continue;
  184. }
  185. if (ch == ',') {
  186. emit_token(CppToken::Type::Comma);
  187. continue;
  188. }
  189. if (ch == '*') {
  190. emit_token(CppToken::Type::Asterisk);
  191. continue;
  192. }
  193. if (ch == ';') {
  194. emit_token(CppToken::Type::Semicolon);
  195. continue;
  196. }
  197. if (ch == '#') {
  198. begin_token();
  199. while (peek() && peek() != '\n')
  200. consume();
  201. commit_token(CppToken::Type::PreprocessorStatement);
  202. continue;
  203. }
  204. if (ch == '/' && peek(1) == '/') {
  205. begin_token();
  206. while (peek() && peek() != '\n')
  207. consume();
  208. commit_token(CppToken::Type::Comment);
  209. continue;
  210. }
  211. if (ch == '/' && peek(1) == '*') {
  212. begin_token();
  213. consume();
  214. consume();
  215. while (peek()) {
  216. if (peek() == '*' && peek(1) == '/')
  217. break;
  218. consume();
  219. }
  220. consume();
  221. consume();
  222. commit_token(CppToken::Type::Comment);
  223. continue;
  224. }
  225. if (ch == '"') {
  226. begin_token();
  227. consume();
  228. while (peek()) {
  229. if (consume() == '"')
  230. break;
  231. }
  232. commit_token(CppToken::Type::DoubleQuotedString);
  233. continue;
  234. }
  235. if (ch == '\'') {
  236. begin_token();
  237. consume();
  238. while (peek()) {
  239. if (consume() == '\'')
  240. break;
  241. }
  242. commit_token(CppToken::Type::SingleQuotedString);
  243. continue;
  244. }
  245. if (isdigit(ch)) {
  246. begin_token();
  247. while (peek() && isdigit(peek())) {
  248. consume();
  249. }
  250. commit_token(CppToken::Type::Number);
  251. continue;
  252. }
  253. if (is_valid_first_character_of_identifier(ch)) {
  254. begin_token();
  255. while (peek() && is_valid_nonfirst_character_of_identifier(peek()))
  256. consume();
  257. auto token_view = StringView(m_input.characters_without_null_termination() + token_start_index, m_index - token_start_index);
  258. if (is_keyword(token_view))
  259. commit_token(CppToken::Type::Keyword);
  260. else
  261. commit_token(CppToken::Type::Identifier);
  262. continue;
  263. }
  264. dbg() << "Unimplemented token character: " << ch;
  265. emit_token(CppToken::Type::Unknown);
  266. }
  267. return tokens;
  268. }