RegexLexer.cpp 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include "RegexLexer.h"
  7. #include <AK/Assertions.h>
  8. #include <AK/Debug.h>
  9. #include <AK/Format.h>
  10. #include <stdio.h>
  11. namespace regex {
  12. const char* Token::name(const TokenType type)
  13. {
  14. switch (type) {
  15. #define __ENUMERATE_REGEX_TOKEN(x) \
  16. case TokenType::x: \
  17. return #x;
  18. ENUMERATE_REGEX_TOKENS
  19. #undef __ENUMERATE_REGEX_TOKEN
  20. default:
  21. VERIFY_NOT_REACHED();
  22. return "<Unknown>";
  23. }
  24. }
  25. const char* Token::name() const
  26. {
  27. return name(m_type);
  28. }
  29. Lexer::Lexer(const StringView source)
  30. : m_source(source)
  31. {
  32. }
  33. ALWAYS_INLINE int Lexer::peek(size_t offset) const
  34. {
  35. if ((m_position + offset) >= m_source.length())
  36. return EOF;
  37. return (unsigned char)m_source[m_position + offset];
  38. }
  39. void Lexer::back(size_t offset)
  40. {
  41. if (offset == m_position + 1)
  42. offset = m_position; // 'position == 0' occurs twice.
  43. VERIFY(offset <= m_position);
  44. if (!offset)
  45. return;
  46. m_position -= offset;
  47. m_previous_position = (m_position > 0) ? m_position - 1 : 0;
  48. m_current_char = m_source[m_position];
  49. }
  50. ALWAYS_INLINE void Lexer::consume()
  51. {
  52. m_previous_position = m_position;
  53. if (m_position >= m_source.length()) {
  54. m_position = m_source.length() + 1;
  55. m_current_char = EOF;
  56. return;
  57. }
  58. m_current_char = m_source[m_position++];
  59. }
  60. void Lexer::reset()
  61. {
  62. m_position = 0;
  63. m_current_token = { TokenType::Eof, 0, StringView(nullptr) };
  64. m_current_char = 0;
  65. m_previous_position = 0;
  66. }
  67. bool Lexer::try_skip(char c)
  68. {
  69. if (peek() != c)
  70. return false;
  71. consume();
  72. return true;
  73. }
  74. char Lexer::skip()
  75. {
  76. auto c = peek();
  77. consume();
  78. VERIFY(c != EOF);
  79. return c;
  80. }
  81. Token Lexer::next()
  82. {
  83. size_t token_start_position;
  84. auto begin_token = [&] {
  85. token_start_position = m_position;
  86. };
  87. auto commit_token = [&](auto type) -> Token& {
  88. VERIFY(token_start_position + m_previous_position - token_start_position + 1 <= m_source.length());
  89. auto substring = m_source.substring_view(token_start_position, m_previous_position - token_start_position + 1);
  90. m_current_token = Token(type, token_start_position, substring);
  91. return m_current_token;
  92. };
  93. auto emit_token = [&](auto type) -> Token& {
  94. m_current_token = Token(type, m_position, m_source.substring_view(m_position, 1));
  95. consume();
  96. return m_current_token;
  97. };
  98. auto match_escape_sequence = [&]() -> size_t {
  99. switch (peek(1)) {
  100. case '^':
  101. case '.':
  102. case '[':
  103. case ']':
  104. case '$':
  105. case '(':
  106. case ')':
  107. case '|':
  108. case '*':
  109. case '+':
  110. case '?':
  111. case '{':
  112. case '\\':
  113. return 2;
  114. default:
  115. dbgln_if(REGEX_DEBUG, "[LEXER] Found invalid escape sequence: \\{:c} (the parser will have to deal with this!)", peek(1));
  116. return 0;
  117. }
  118. };
  119. while (m_position <= m_source.length()) {
  120. auto ch = peek();
  121. if (ch == '(')
  122. return emit_token(TokenType::LeftParen);
  123. if (ch == ')')
  124. return emit_token(TokenType::RightParen);
  125. if (ch == '{')
  126. return emit_token(TokenType::LeftCurly);
  127. if (ch == '}')
  128. return emit_token(TokenType::RightCurly);
  129. if (ch == '[')
  130. return emit_token(TokenType::LeftBracket);
  131. if (ch == ']')
  132. return emit_token(TokenType::RightBracket);
  133. if (ch == '.')
  134. return emit_token(TokenType::Period);
  135. if (ch == '*')
  136. return emit_token(TokenType::Asterisk);
  137. if (ch == '+')
  138. return emit_token(TokenType::Plus);
  139. if (ch == '$')
  140. return emit_token(TokenType::Dollar);
  141. if (ch == '^')
  142. return emit_token(TokenType::Circumflex);
  143. if (ch == '|')
  144. return emit_token(TokenType::Pipe);
  145. if (ch == '?')
  146. return emit_token(TokenType::Questionmark);
  147. if (ch == ',')
  148. return emit_token(TokenType::Comma);
  149. if (ch == '/')
  150. return emit_token(TokenType::Slash);
  151. if (ch == '=')
  152. return emit_token(TokenType::EqualSign);
  153. if (ch == ':')
  154. return emit_token(TokenType::Colon);
  155. if (ch == '-')
  156. return emit_token(TokenType::HyphenMinus);
  157. if (ch == '\\') {
  158. size_t escape = match_escape_sequence();
  159. if (escape > 0) {
  160. begin_token();
  161. for (size_t i = 0; i < escape; ++i)
  162. consume();
  163. return commit_token(TokenType::EscapeSequence);
  164. }
  165. }
  166. if (ch == EOF)
  167. break;
  168. return emit_token(TokenType::Char);
  169. }
  170. return Token(TokenType::Eof, m_position, nullptr);
  171. }
  172. }