Lexer.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410
  1. /*
  2. * Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
  3. * Copyright (c) 2021, Jan de Visser <jan@de-visser.net>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include "Lexer.h"
  8. #include <AK/Debug.h>
  9. #include <ctype.h>
  10. namespace SQL::AST {
// Shared lexeme -> token-type lookup tables. They are static and populated
// lazily by the first Lexer constructed (see the constructor below).
HashMap<String, TokenType> Lexer::s_keywords;
HashMap<char, TokenType> Lexer::s_one_char_tokens;
HashMap<String, TokenType> Lexer::s_two_char_tokens;
Lexer::Lexer(StringView source)
    : m_source(source)
{
    // Populate the shared lookup tables on first construction only. Each
    // pass expands ENUMERATE_SQL_TOKENS with a filter: keywords go into
    // s_keywords, non-keyword lexemes are split by length into the
    // one-character and two-character token maps.
    if (s_keywords.is_empty()) {
#define __ENUMERATE_SQL_TOKEN(value, type, category) \
    if (TokenCategory::category == TokenCategory::Keyword) \
        s_keywords.set(value, TokenType::type);
        ENUMERATE_SQL_TOKENS
#undef __ENUMERATE_SQL_TOKEN
    }

    if (s_one_char_tokens.is_empty()) {
#define __ENUMERATE_SQL_TOKEN(value, type, category) \
    if (TokenCategory::category != TokenCategory::Keyword && StringView(value).length() == 1) \
        s_one_char_tokens.set(value[0], TokenType::type);
        ENUMERATE_SQL_TOKENS
#undef __ENUMERATE_SQL_TOKEN
    }

    if (s_two_char_tokens.is_empty()) {
#define __ENUMERATE_SQL_TOKEN(value, type, category) \
    if (TokenCategory::category != TokenCategory::Keyword && StringView(value).length() == 2) \
        s_two_char_tokens.set(value, TokenType::type);
        ENUMERATE_SQL_TOKENS
#undef __ENUMERATE_SQL_TOKEN
    }

    // Prime m_current_char with the first character of the source.
    consume();
}
Token Lexer::next()
{
    // Produces the next token from the source. Leading whitespace and
    // comments are skipped first; an unterminated block comment forces the
    // resulting token (including Eof) to be Invalid.
    bool found_invalid_comment = consume_whitespace_and_comments();

    // Record where the token's text begins, for position reporting.
    size_t value_start_line_number = m_line_number;
    size_t value_start_column_number = m_line_column;
    auto token_type = TokenType::Invalid;
    StringBuilder current_token;

    if (is_eof()) {
        token_type = found_invalid_comment ? TokenType::Invalid : TokenType::Eof;
    } else if (is_numeric_literal_start()) {
        token_type = TokenType::NumericLiteral;
        if (!consume_numeric_literal(current_token))
            token_type = TokenType::Invalid;
    } else if (is_string_literal_start()) {
        token_type = TokenType::StringLiteral;
        if (!consume_string_literal(current_token))
            token_type = TokenType::Invalid;
    } else if (is_quoted_identifier_start()) {
        token_type = TokenType::Identifier;
        if (!consume_quoted_identifier(current_token))
            token_type = TokenType::Invalid;
    } else if (is_blob_literal_start()) {
        token_type = TokenType::BlobLiteral;
        if (!consume_blob_literal(current_token))
            token_type = TokenType::Invalid;
    } else if (is_identifier_start()) {
        // Identifiers and keywords are uppercased while lexing, so keyword
        // lookup (and later comparisons) are case-insensitive.
        do {
            current_token.append((char)toupper(m_current_char));
            consume();
        } while (is_identifier_middle());

        if (auto it = s_keywords.find(current_token.string_view()); it != s_keywords.end()) {
            token_type = it->value;
        } else {
            token_type = TokenType::Identifier;
        }
    } else {
        // Punctuation: prefer a two-character token over a one-character
        // token. m_current_char is the character at m_position - 1, so the
        // two-character candidate starts there.
        bool found_two_char_token = false;
        if (m_position < m_source.length()) {
            if (auto it = s_two_char_tokens.find(m_source.substring_view(m_position - 1, 2)); it != s_two_char_tokens.end()) {
                found_two_char_token = true;
                token_type = it->value;
                consume(&current_token);
                consume(&current_token);
            }
        }

        bool found_one_char_token = false;
        if (!found_two_char_token) {
            if (auto it = s_one_char_tokens.find(m_current_char); it != s_one_char_tokens.end()) {
                found_one_char_token = true;
                token_type = it->value;
                consume(&current_token);
            }
        }

        // Unknown character: emit it as an Invalid token so the lexer
        // still makes progress.
        if (!found_two_char_token && !found_one_char_token) {
            token_type = TokenType::Invalid;
            consume(&current_token);
        }
    }

    Token token(token_type, current_token.build(),
        { value_start_line_number, value_start_column_number },
        { m_line_number, m_line_column });

    if constexpr (SQL_DEBUG) {
        dbgln("------------------------------");
        dbgln("Token: {}", token.name());
        dbgln("Value: {}", token.value());
        dbgln("Line: {}, Column: {}", token.start_position().line, token.start_position().column);
        dbgln("------------------------------");
    }

    return token;
}
  110. void Lexer::consume(StringBuilder* current_token)
  111. {
  112. auto did_reach_eof = [this] {
  113. if (m_position != m_source.length())
  114. return false;
  115. m_eof = true;
  116. m_current_char = '\0';
  117. ++m_line_column;
  118. ++m_position;
  119. return true;
  120. };
  121. if (current_token)
  122. current_token->append(m_current_char);
  123. if (m_position > m_source.length())
  124. return;
  125. if (did_reach_eof())
  126. return;
  127. if (is_line_break()) {
  128. ++m_line_number;
  129. m_line_column = 1;
  130. } else {
  131. ++m_line_column;
  132. }
  133. m_current_char = m_source[m_position++];
  134. }
  135. bool Lexer::consume_whitespace_and_comments()
  136. {
  137. bool found_invalid_comment = false;
  138. while (true) {
  139. if (isspace(m_current_char)) {
  140. do {
  141. consume();
  142. } while (isspace(m_current_char));
  143. } else if (is_line_comment_start()) {
  144. consume();
  145. do {
  146. consume();
  147. } while (!is_eof() && !is_line_break());
  148. } else if (is_block_comment_start()) {
  149. consume();
  150. do {
  151. consume();
  152. } while (!is_eof() && !is_block_comment_end());
  153. if (is_eof())
  154. found_invalid_comment = true;
  155. consume(); // consume *
  156. if (is_eof())
  157. found_invalid_comment = true;
  158. consume(); // consume /
  159. } else {
  160. break;
  161. }
  162. }
  163. return found_invalid_comment;
  164. }
  165. bool Lexer::consume_numeric_literal(StringBuilder& current_token)
  166. {
  167. // https://sqlite.org/syntax/numeric-literal.html
  168. bool is_valid_numeric_literal = true;
  169. if (m_current_char == '0') {
  170. consume(&current_token);
  171. if (m_current_char == '.') {
  172. consume(&current_token);
  173. while (isdigit(m_current_char))
  174. consume(&current_token);
  175. if (m_current_char == 'e' || m_current_char == 'E')
  176. is_valid_numeric_literal = consume_exponent(current_token);
  177. } else if (m_current_char == 'e' || m_current_char == 'E') {
  178. is_valid_numeric_literal = consume_exponent(current_token);
  179. } else if (m_current_char == 'x' || m_current_char == 'X') {
  180. is_valid_numeric_literal = consume_hexadecimal_number(current_token);
  181. } else if (isdigit(m_current_char)) {
  182. do {
  183. consume(&current_token);
  184. } while (isdigit(m_current_char));
  185. }
  186. } else {
  187. do {
  188. consume(&current_token);
  189. } while (isdigit(m_current_char));
  190. if (m_current_char == '.') {
  191. consume(&current_token);
  192. while (isdigit(m_current_char))
  193. consume(&current_token);
  194. }
  195. if (m_current_char == 'e' || m_current_char == 'E')
  196. is_valid_numeric_literal = consume_exponent(current_token);
  197. }
  198. return is_valid_numeric_literal;
  199. }
  200. bool Lexer::consume_string_literal(StringBuilder& current_token)
  201. {
  202. // https://sqlite.org/lang_expr.html - See "3. Literal Values (Constants)"
  203. bool is_valid_string_literal = true;
  204. // Skip the opening single quote:
  205. consume();
  206. while (!is_eof() && !is_string_literal_end()) {
  207. // If both the current character and the next one are single quotes,
  208. // consume one single quote into the current token, and drop the
  209. // other one on the floor:
  210. if (match('\'', '\''))
  211. consume();
  212. consume(&current_token);
  213. }
  214. if (is_eof())
  215. is_valid_string_literal = false;
  216. // Drop the closing quote on the floor:
  217. consume();
  218. return is_valid_string_literal;
  219. }
  220. bool Lexer::consume_quoted_identifier(StringBuilder& current_token)
  221. {
  222. // I have not found a reference to the syntax for identifiers in the
  223. // SQLite documentation, but PostgreSQL has this:
  224. // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS
  225. bool is_valid_identifier = true;
  226. // Skip the opening double quote:
  227. consume();
  228. while (!is_eof() && !is_quoted_identifier_end()) {
  229. // If both the current character and the next one are double quotes,
  230. // consume one single quote into the current token, and drop the
  231. // other one on the floor:
  232. if (match('"', '"'))
  233. consume();
  234. consume(&current_token);
  235. }
  236. if (is_eof())
  237. is_valid_identifier = false;
  238. // Drop the closing double quote on the floor:
  239. consume();
  240. return is_valid_identifier;
  241. }
  242. bool Lexer::consume_blob_literal(StringBuilder& current_token)
  243. {
  244. // https://sqlite.org/lang_expr.html - See "3. Literal Values (Constants)"
  245. // Skip starting 'X'/'x' character:
  246. consume();
  247. if (!consume_string_literal(current_token))
  248. return false;
  249. for (auto ix = 0u; ix < current_token.length(); ix++) {
  250. if (!isxdigit(current_token.string_view()[ix]))
  251. return false;
  252. }
  253. return true;
  254. }
bool Lexer::consume_exponent(StringBuilder& current_token)
{
    // Consumes the exponent suffix of a numeric literal: the 'e'/'E'
    // marker, an optional sign, then digits. Returns false when no digits
    // follow; note the marker/sign already consumed stay in current_token.
    consume(&current_token);
    if (m_current_char == '-' || m_current_char == '+')
        consume(&current_token);
    if (!isdigit(m_current_char))
        return false;

    // FIXME This code results in the string "1e" being rejected as a
    //       malformed numeric literal. We do however accept "1a" which
    //       is inconsistent. We have to decide what we want to do:
    //       - Be like `SQLite` and reject both "1a" and "1e" because we
    //         require a space between the two tokens. This is pretty invasive;
    //         we would have to decide where all spaces are required and fix
    //         the lexer accordingly.
    //       - Be like `PostgreSQL` and accept both "1e" and "1a" as two
    //         separate tokens, and accept "1e3" as a single token. This would
    //         would require pushing back the "e" we lexed here, terminate the
    //         numeric literal, and re-process the "e" as the first char of
    //         a new token.
    while (isdigit(m_current_char)) {
        consume(&current_token);
    }
    return true;
}
  279. bool Lexer::consume_hexadecimal_number(StringBuilder& current_token)
  280. {
  281. consume(&current_token);
  282. if (!isxdigit(m_current_char))
  283. return false;
  284. while (isxdigit(m_current_char))
  285. consume(&current_token);
  286. return true;
  287. }
  288. bool Lexer::match(char a, char b) const
  289. {
  290. if (m_position >= m_source.length())
  291. return false;
  292. return m_current_char == a && m_source[m_position] == b;
  293. }
  294. bool Lexer::is_identifier_start() const
  295. {
  296. return isalpha(m_current_char) || m_current_char == '_';
  297. }
  298. bool Lexer::is_identifier_middle() const
  299. {
  300. return is_identifier_start() || isdigit(m_current_char);
  301. }
  302. bool Lexer::is_numeric_literal_start() const
  303. {
  304. return isdigit(m_current_char) || (m_current_char == '.' && m_position < m_source.length() && isdigit(m_source[m_position]));
  305. }
// A single quote opens a string literal.
bool Lexer::is_string_literal_start() const
{
    return m_current_char == '\'';
}
  310. bool Lexer::is_string_literal_end() const
  311. {
  312. return m_current_char == '\'' && !(m_position < m_source.length() && m_source[m_position] == '\'');
  313. }
// A double quote opens a quoted identifier.
bool Lexer::is_quoted_identifier_start() const
{
    return m_current_char == '"';
}
  318. bool Lexer::is_quoted_identifier_end() const
  319. {
  320. return m_current_char == '"' && !(m_position < m_source.length() && m_source[m_position] == '"');
  321. }
  322. bool Lexer::is_blob_literal_start() const
  323. {
  324. return match('x', '\'') || match('X', '\'');
  325. }
// "--" starts a comment that runs to the end of the line.
bool Lexer::is_line_comment_start() const
{
    return match('-', '-');
}
// "/*" starts a block comment terminated by "*/".
bool Lexer::is_block_comment_start() const
{
    return match('/', '*');
}
// "*/" terminates a block comment.
bool Lexer::is_block_comment_end() const
{
    return match('*', '/');
}
// Only '\n' is treated as a line break for line/column bookkeeping.
bool Lexer::is_line_break() const
{
    return m_current_char == '\n';
}
// True once consume() has stepped past the last source character.
bool Lexer::is_eof() const
{
    return m_eof;
}
  346. }