RegexLexer.cpp 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include "RegexLexer.h"
  7. #include <AK/Assertions.h>
  8. #include <AK/Debug.h>
  9. #include <stdio.h>
  10. namespace regex {
  11. const char* Token::name(const TokenType type)
  12. {
  13. switch (type) {
  14. #define __ENUMERATE_REGEX_TOKEN(x) \
  15. case TokenType::x: \
  16. return #x;
  17. ENUMERATE_REGEX_TOKENS
  18. #undef __ENUMERATE_REGEX_TOKEN
  19. default:
  20. VERIFY_NOT_REACHED();
  21. return "<Unknown>";
  22. }
  23. }
  24. const char* Token::name() const
  25. {
  26. return name(m_type);
  27. }
  28. Lexer::Lexer(const StringView source)
  29. : m_source(source)
  30. {
  31. }
  32. ALWAYS_INLINE char Lexer::peek(size_t offset) const
  33. {
  34. if ((m_position + offset) >= m_source.length())
  35. return EOF;
  36. return m_source[m_position + offset];
  37. }
  38. void Lexer::back(size_t offset)
  39. {
  40. if (offset == m_position + 1)
  41. offset = m_position; // 'position == 0' occurs twice.
  42. VERIFY(offset <= m_position);
  43. if (!offset)
  44. return;
  45. m_position -= offset;
  46. m_previous_position = (m_position > 0) ? m_position - 1 : 0;
  47. m_current_char = m_source[m_position];
  48. }
  49. ALWAYS_INLINE void Lexer::consume()
  50. {
  51. m_previous_position = m_position;
  52. if (m_position >= m_source.length()) {
  53. m_position = m_source.length() + 1;
  54. m_current_char = EOF;
  55. return;
  56. }
  57. m_current_char = m_source[m_position++];
  58. }
  59. void Lexer::reset()
  60. {
  61. m_position = 0;
  62. m_current_token = { TokenType::Eof, 0, StringView(nullptr) };
  63. m_current_char = 0;
  64. m_previous_position = 0;
  65. }
  66. bool Lexer::try_skip(char c)
  67. {
  68. if (peek() != c)
  69. return false;
  70. consume();
  71. return true;
  72. }
  73. char Lexer::skip()
  74. {
  75. auto c = peek();
  76. consume();
  77. return c;
  78. }
  79. Token Lexer::next()
  80. {
  81. size_t token_start_position;
  82. auto begin_token = [&] {
  83. token_start_position = m_position;
  84. };
  85. auto commit_token = [&](auto type) -> Token& {
  86. VERIFY(token_start_position + m_previous_position - token_start_position + 1 <= m_source.length());
  87. auto substring = m_source.substring_view(token_start_position, m_previous_position - token_start_position + 1);
  88. m_current_token = Token(type, token_start_position, substring);
  89. return m_current_token;
  90. };
  91. auto emit_token = [&](auto type) -> Token& {
  92. m_current_token = Token(type, m_position, m_source.substring_view(m_position, 1));
  93. consume();
  94. return m_current_token;
  95. };
  96. auto match_escape_sequence = [&]() -> size_t {
  97. switch (peek(1)) {
  98. case '^':
  99. case '.':
  100. case '[':
  101. case ']':
  102. case '$':
  103. case '(':
  104. case ')':
  105. case '|':
  106. case '*':
  107. case '+':
  108. case '?':
  109. case '{':
  110. case '\\':
  111. return 2;
  112. default:
  113. if constexpr (REGEX_DEBUG)
  114. fprintf(stderr, "[LEXER] Found invalid escape sequence: \\%c (the parser will have to deal with this!)\n", peek(1));
  115. return 0;
  116. }
  117. };
  118. while (m_position <= m_source.length()) {
  119. auto ch = peek();
  120. if (ch == '(')
  121. return emit_token(TokenType::LeftParen);
  122. if (ch == ')')
  123. return emit_token(TokenType::RightParen);
  124. if (ch == '{')
  125. return emit_token(TokenType::LeftCurly);
  126. if (ch == '}')
  127. return emit_token(TokenType::RightCurly);
  128. if (ch == '[')
  129. return emit_token(TokenType::LeftBracket);
  130. if (ch == ']')
  131. return emit_token(TokenType::RightBracket);
  132. if (ch == '.')
  133. return emit_token(TokenType::Period);
  134. if (ch == '*')
  135. return emit_token(TokenType::Asterisk);
  136. if (ch == '+')
  137. return emit_token(TokenType::Plus);
  138. if (ch == '$')
  139. return emit_token(TokenType::Dollar);
  140. if (ch == '^')
  141. return emit_token(TokenType::Circumflex);
  142. if (ch == '|')
  143. return emit_token(TokenType::Pipe);
  144. if (ch == '?')
  145. return emit_token(TokenType::Questionmark);
  146. if (ch == ',')
  147. return emit_token(TokenType::Comma);
  148. if (ch == '/')
  149. return emit_token(TokenType::Slash);
  150. if (ch == '=')
  151. return emit_token(TokenType::EqualSign);
  152. if (ch == ':')
  153. return emit_token(TokenType::Colon);
  154. if (ch == '-')
  155. return emit_token(TokenType::HyphenMinus);
  156. if (ch == '\\') {
  157. size_t escape = match_escape_sequence();
  158. if (escape > 0) {
  159. begin_token();
  160. for (size_t i = 0; i < escape; ++i)
  161. consume();
  162. return commit_token(TokenType::EscapeSequence);
  163. }
  164. }
  165. if (ch == EOF)
  166. break;
  167. return emit_token(TokenType::Char);
  168. }
  169. return Token(TokenType::Eof, m_position, nullptr);
  170. }
  171. }