RegexLexer.cpp 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include "RegexLexer.h"
  7. #include <AK/Assertions.h>
  8. #include <AK/Debug.h>
  9. #include <AK/Format.h>
  10. #include <stdio.h>
  11. namespace regex {
  12. const char* Token::name(const TokenType type)
  13. {
  14. switch (type) {
  15. #define __ENUMERATE_REGEX_TOKEN(x) \
  16. case TokenType::x: \
  17. return #x;
  18. ENUMERATE_REGEX_TOKENS
  19. #undef __ENUMERATE_REGEX_TOKEN
  20. default:
  21. VERIFY_NOT_REACHED();
  22. return "<Unknown>";
  23. }
  24. }
  25. const char* Token::name() const
  26. {
  27. return name(m_type);
  28. }
  29. Lexer::Lexer(const StringView source)
  30. : m_source(source)
  31. {
  32. }
  33. ALWAYS_INLINE char Lexer::peek(size_t offset) const
  34. {
  35. if ((m_position + offset) >= m_source.length())
  36. return EOF;
  37. return m_source[m_position + offset];
  38. }
  39. void Lexer::back(size_t offset)
  40. {
  41. if (offset == m_position + 1)
  42. offset = m_position; // 'position == 0' occurs twice.
  43. VERIFY(offset <= m_position);
  44. if (!offset)
  45. return;
  46. m_position -= offset;
  47. m_previous_position = (m_position > 0) ? m_position - 1 : 0;
  48. m_current_char = m_source[m_position];
  49. }
  50. ALWAYS_INLINE void Lexer::consume()
  51. {
  52. m_previous_position = m_position;
  53. if (m_position >= m_source.length()) {
  54. m_position = m_source.length() + 1;
  55. m_current_char = EOF;
  56. return;
  57. }
  58. m_current_char = m_source[m_position++];
  59. }
  60. void Lexer::reset()
  61. {
  62. m_position = 0;
  63. m_current_token = { TokenType::Eof, 0, StringView(nullptr) };
  64. m_current_char = 0;
  65. m_previous_position = 0;
  66. }
  67. bool Lexer::try_skip(char c)
  68. {
  69. if (peek() != c)
  70. return false;
  71. consume();
  72. return true;
  73. }
  74. char Lexer::skip()
  75. {
  76. auto c = peek();
  77. consume();
  78. return c;
  79. }
  80. Token Lexer::next()
  81. {
  82. size_t token_start_position;
  83. auto begin_token = [&] {
  84. token_start_position = m_position;
  85. };
  86. auto commit_token = [&](auto type) -> Token& {
  87. VERIFY(token_start_position + m_previous_position - token_start_position + 1 <= m_source.length());
  88. auto substring = m_source.substring_view(token_start_position, m_previous_position - token_start_position + 1);
  89. m_current_token = Token(type, token_start_position, substring);
  90. return m_current_token;
  91. };
  92. auto emit_token = [&](auto type) -> Token& {
  93. m_current_token = Token(type, m_position, m_source.substring_view(m_position, 1));
  94. consume();
  95. return m_current_token;
  96. };
  97. auto match_escape_sequence = [&]() -> size_t {
  98. switch (peek(1)) {
  99. case '^':
  100. case '.':
  101. case '[':
  102. case ']':
  103. case '$':
  104. case '(':
  105. case ')':
  106. case '|':
  107. case '*':
  108. case '+':
  109. case '?':
  110. case '{':
  111. case '\\':
  112. return 2;
  113. default:
  114. dbgln_if(REGEX_DEBUG, "[LEXER] Found invalid escape sequence: \\{:c} (the parser will have to deal with this!)", peek(1));
  115. return 0;
  116. }
  117. };
  118. while (m_position <= m_source.length()) {
  119. auto ch = peek();
  120. if (ch == '(')
  121. return emit_token(TokenType::LeftParen);
  122. if (ch == ')')
  123. return emit_token(TokenType::RightParen);
  124. if (ch == '{')
  125. return emit_token(TokenType::LeftCurly);
  126. if (ch == '}')
  127. return emit_token(TokenType::RightCurly);
  128. if (ch == '[')
  129. return emit_token(TokenType::LeftBracket);
  130. if (ch == ']')
  131. return emit_token(TokenType::RightBracket);
  132. if (ch == '.')
  133. return emit_token(TokenType::Period);
  134. if (ch == '*')
  135. return emit_token(TokenType::Asterisk);
  136. if (ch == '+')
  137. return emit_token(TokenType::Plus);
  138. if (ch == '$')
  139. return emit_token(TokenType::Dollar);
  140. if (ch == '^')
  141. return emit_token(TokenType::Circumflex);
  142. if (ch == '|')
  143. return emit_token(TokenType::Pipe);
  144. if (ch == '?')
  145. return emit_token(TokenType::Questionmark);
  146. if (ch == ',')
  147. return emit_token(TokenType::Comma);
  148. if (ch == '/')
  149. return emit_token(TokenType::Slash);
  150. if (ch == '=')
  151. return emit_token(TokenType::EqualSign);
  152. if (ch == ':')
  153. return emit_token(TokenType::Colon);
  154. if (ch == '-')
  155. return emit_token(TokenType::HyphenMinus);
  156. if (ch == '\\') {
  157. size_t escape = match_escape_sequence();
  158. if (escape > 0) {
  159. begin_token();
  160. for (size_t i = 0; i < escape; ++i)
  161. consume();
  162. return commit_token(TokenType::EscapeSequence);
  163. }
  164. }
  165. if (ch == EOF)
  166. break;
  167. return emit_token(TokenType::Char);
  168. }
  169. return Token(TokenType::Eof, m_position, nullptr);
  170. }
  171. }