// CppLexer.cpp (9.5 KB)
  1. #include "CppLexer.h"
  2. #include <AK/HashTable.h>
  3. #include <AK/String.h>
  4. #include <ctype.h>
  5. CppLexer::CppLexer(const StringView& input)
  6. : m_input(input)
  7. {
  8. }
  9. char CppLexer::peek(int offset) const
  10. {
  11. if ((m_index + offset) >= m_input.length())
  12. return 0;
  13. return m_input[m_index + offset];
  14. }
  15. char CppLexer::consume()
  16. {
  17. ASSERT(m_index < m_input.length());
  18. char ch = m_input[m_index++];
  19. m_previous_position = m_position;
  20. if (ch == '\n') {
  21. m_position.line++;
  22. m_position.column = 0;
  23. } else {
  24. m_position.column++;
  25. }
  26. return ch;
  27. }
  // Returns true if `ch` may start an identifier: a letter (per isalpha()),
  // an underscore, or a dollar sign.
  static bool is_valid_first_character_of_identifier(char ch)
  {
      // The <ctype.h> classifiers have undefined behavior for negative values
      // other than EOF, and plain `char` may be signed. Convert to
      // unsigned char first so bytes >= 0x80 (e.g. UTF-8 sequences) are safe.
      return isalpha(static_cast<unsigned char>(ch)) || ch == '_' || ch == '$';
  }
  32. static bool is_valid_nonfirst_character_of_identifier(char ch)
  33. {
  34. return is_valid_first_character_of_identifier(ch) || isdigit(ch);
  35. }
  36. static bool is_keyword(const StringView& string)
  37. {
  38. static HashTable<String> keywords;
  39. if (keywords.is_empty()) {
  40. keywords.set("alignas");
  41. keywords.set("alignof");
  42. keywords.set("and");
  43. keywords.set("and_eq");
  44. keywords.set("asm");
  45. keywords.set("bitand");
  46. keywords.set("bitor");
  47. keywords.set("bool");
  48. keywords.set("break");
  49. keywords.set("case");
  50. keywords.set("catch");
  51. keywords.set("class");
  52. keywords.set("compl");
  53. keywords.set("const");
  54. keywords.set("const_cast");
  55. keywords.set("constexpr");
  56. keywords.set("continue");
  57. keywords.set("decltype");
  58. keywords.set("default");
  59. keywords.set("delete");
  60. keywords.set("do");
  61. keywords.set("dynamic_cast");
  62. keywords.set("else");
  63. keywords.set("enum");
  64. keywords.set("explicit");
  65. keywords.set("export");
  66. keywords.set("extern");
  67. keywords.set("false");
  68. keywords.set("final");
  69. keywords.set("for");
  70. keywords.set("friend");
  71. keywords.set("goto");
  72. keywords.set("if");
  73. keywords.set("inline");
  74. keywords.set("mutable");
  75. keywords.set("namespace");
  76. keywords.set("new");
  77. keywords.set("noexcept");
  78. keywords.set("not");
  79. keywords.set("not_eq");
  80. keywords.set("nullptr");
  81. keywords.set("operator");
  82. keywords.set("or");
  83. keywords.set("or_eq");
  84. keywords.set("override");
  85. keywords.set("private");
  86. keywords.set("protected");
  87. keywords.set("public");
  88. keywords.set("register");
  89. keywords.set("reinterpret_cast");
  90. keywords.set("return");
  91. keywords.set("signed");
  92. keywords.set("sizeof");
  93. keywords.set("static");
  94. keywords.set("static_assert");
  95. keywords.set("static_cast");
  96. keywords.set("struct");
  97. keywords.set("switch");
  98. keywords.set("template");
  99. keywords.set("this");
  100. keywords.set("thread_local");
  101. keywords.set("throw");
  102. keywords.set("true");
  103. keywords.set("try");
  104. keywords.set("typedef");
  105. keywords.set("typeid");
  106. keywords.set("typename");
  107. keywords.set("union");
  108. keywords.set("using");
  109. keywords.set("virtual");
  110. keywords.set("volatile");
  111. keywords.set("while");
  112. keywords.set("xor");
  113. keywords.set("xor_eq");
  114. }
  115. return keywords.contains(string);
  116. }
  117. static bool is_known_type(const StringView& string)
  118. {
  119. static HashTable<String> types;
  120. if (types.is_empty()) {
  121. types.set("ByteBuffer");
  122. types.set("CircularDeque");
  123. types.set("CircularQueue");
  124. types.set("Deque");
  125. types.set("DoublyLinkedList");
  126. types.set("FileSystemPath");
  127. types.set("FixedArray");
  128. types.set("Function");
  129. types.set("HashMap");
  130. types.set("HashTable");
  131. types.set("IPv4Address");
  132. types.set("InlineLinkedList");
  133. types.set("IntrusiveList");
  134. types.set("JsonArray");
  135. types.set("JsonObject");
  136. types.set("JsonValue");
  137. types.set("MappedFile");
  138. types.set("NetworkOrdered");
  139. types.set("NonnullOwnPtr");
  140. types.set("NonnullOwnPtrVector");
  141. types.set("NonnullRefPtr");
  142. types.set("NonnullRefPtrVector");
  143. types.set("Optional");
  144. types.set("OwnPtr");
  145. types.set("RefPtr");
  146. types.set("Result");
  147. types.set("ScopeGuard");
  148. types.set("SinglyLinkedList");
  149. types.set("String");
  150. types.set("StringBuilder");
  151. types.set("StringImpl");
  152. types.set("StringView");
  153. types.set("Utf8View");
  154. types.set("Vector");
  155. types.set("WeakPtr");
  156. types.set("auto");
  157. types.set("char");
  158. types.set("char16_t");
  159. types.set("char32_t");
  160. types.set("char8_t");
  161. types.set("double");
  162. types.set("float");
  163. types.set("i16");
  164. types.set("i32");
  165. types.set("i64");
  166. types.set("i8");
  167. types.set("int");
  168. types.set("int");
  169. types.set("long");
  170. types.set("short");
  171. types.set("signed");
  172. types.set("u16");
  173. types.set("u32");
  174. types.set("u64");
  175. types.set("u8");
  176. types.set("unsigned");
  177. types.set("void");
  178. types.set("wchar_t");
  179. }
  180. return types.contains(string);
  181. }
  182. Vector<CppToken> CppLexer::lex()
  183. {
  184. Vector<CppToken> tokens;
  185. int token_start_index = 0;
  186. CppPosition token_start_position;
  187. auto emit_token = [&](auto type) {
  188. CppToken token;
  189. token.m_type = type;
  190. token.m_start = m_position;
  191. token.m_end = m_position;
  192. tokens.append(token);
  193. consume();
  194. };
  195. auto begin_token = [&] {
  196. token_start_index = m_index;
  197. token_start_position = m_position;
  198. };
  199. auto commit_token = [&](auto type) {
  200. CppToken token;
  201. token.m_type = type;
  202. token.m_start = token_start_position;
  203. token.m_end = m_previous_position;
  204. tokens.append(token);
  205. };
  206. while (m_index < m_input.length()) {
  207. auto ch = peek();
  208. if (isspace(ch)) {
  209. begin_token();
  210. while (isspace(peek()))
  211. consume();
  212. commit_token(CppToken::Type::Whitespace);
  213. continue;
  214. }
  215. if (ch == '(') {
  216. emit_token(CppToken::Type::LeftParen);
  217. continue;
  218. }
  219. if (ch == ')') {
  220. emit_token(CppToken::Type::RightParen);
  221. continue;
  222. }
  223. if (ch == '{') {
  224. emit_token(CppToken::Type::LeftCurly);
  225. continue;
  226. }
  227. if (ch == '}') {
  228. emit_token(CppToken::Type::RightCurly);
  229. continue;
  230. }
  231. if (ch == '[') {
  232. emit_token(CppToken::Type::LeftBracket);
  233. continue;
  234. }
  235. if (ch == ']') {
  236. emit_token(CppToken::Type::RightBracket);
  237. continue;
  238. }
  239. if (ch == ',') {
  240. emit_token(CppToken::Type::Comma);
  241. continue;
  242. }
  243. if (ch == '*') {
  244. emit_token(CppToken::Type::Asterisk);
  245. continue;
  246. }
  247. if (ch == ';') {
  248. emit_token(CppToken::Type::Semicolon);
  249. continue;
  250. }
  251. if (ch == '#') {
  252. begin_token();
  253. while (peek() && peek() != '\n')
  254. consume();
  255. commit_token(CppToken::Type::PreprocessorStatement);
  256. continue;
  257. }
  258. if (ch == '/' && peek(1) == '/') {
  259. begin_token();
  260. while (peek() && peek() != '\n')
  261. consume();
  262. commit_token(CppToken::Type::Comment);
  263. continue;
  264. }
  265. if (ch == '/' && peek(1) == '*') {
  266. begin_token();
  267. consume();
  268. consume();
  269. while (peek()) {
  270. if (peek() == '*' && peek(1) == '/')
  271. break;
  272. consume();
  273. }
  274. consume();
  275. consume();
  276. commit_token(CppToken::Type::Comment);
  277. continue;
  278. }
  279. if (ch == '"') {
  280. begin_token();
  281. consume();
  282. while (peek()) {
  283. if (consume() == '"')
  284. break;
  285. }
  286. commit_token(CppToken::Type::DoubleQuotedString);
  287. continue;
  288. }
  289. if (ch == '\'') {
  290. begin_token();
  291. consume();
  292. while (peek()) {
  293. if (consume() == '\'')
  294. break;
  295. }
  296. commit_token(CppToken::Type::SingleQuotedString);
  297. continue;
  298. }
  299. if (isdigit(ch)) {
  300. begin_token();
  301. while (peek() && isdigit(peek())) {
  302. consume();
  303. }
  304. commit_token(CppToken::Type::Number);
  305. continue;
  306. }
  307. if (is_valid_first_character_of_identifier(ch)) {
  308. begin_token();
  309. while (peek() && is_valid_nonfirst_character_of_identifier(peek()))
  310. consume();
  311. auto token_view = StringView(m_input.characters_without_null_termination() + token_start_index, m_index - token_start_index);
  312. if (is_keyword(token_view))
  313. commit_token(CppToken::Type::Keyword);
  314. else if (is_known_type(token_view))
  315. commit_token(CppToken::Type::KnownType);
  316. else
  317. commit_token(CppToken::Type::Identifier);
  318. continue;
  319. }
  320. dbg() << "Unimplemented token character: " << ch;
  321. emit_token(CppToken::Type::Unknown);
  322. }
  323. return tokens;
  324. }