RegexLexer.cpp 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include "RegexLexer.h"
  7. #include <AK/Assertions.h>
  8. #include <AK/Debug.h>
  9. #include <AK/Format.h>
  10. #include <stdio.h>
  11. namespace regex {
  12. char const* Token::name(TokenType const type)
  13. {
  14. switch (type) {
  15. #define __ENUMERATE_REGEX_TOKEN(x) \
  16. case TokenType::x: \
  17. return #x;
  18. ENUMERATE_REGEX_TOKENS
  19. #undef __ENUMERATE_REGEX_TOKEN
  20. default:
  21. VERIFY_NOT_REACHED();
  22. return "<Unknown>";
  23. }
  24. }
  25. char const* Token::name() const
  26. {
  27. return name(m_type);
  28. }
  29. Lexer::Lexer()
  30. : GenericLexer(StringView {})
  31. {
  32. }
  33. Lexer::Lexer(StringView const source)
  34. : GenericLexer(source)
  35. {
  36. }
  37. void Lexer::back(size_t offset)
  38. {
  39. if (offset == m_index + 1)
  40. offset = m_index; // 'position == 0' occurs twice.
  41. VERIFY(offset <= m_index);
  42. if (!offset)
  43. return;
  44. m_index -= offset;
  45. m_previous_position = (m_index > 0) ? m_index - 1 : 0;
  46. }
  47. char Lexer::consume()
  48. {
  49. m_previous_position = m_index;
  50. return GenericLexer::consume();
  51. }
  52. void Lexer::reset()
  53. {
  54. m_index = 0;
  55. m_current_token = { TokenType::Eof, 0, {} };
  56. m_previous_position = 0;
  57. }
  58. Token Lexer::next()
  59. {
  60. size_t token_start_position;
  61. auto begin_token = [&] {
  62. token_start_position = m_index;
  63. };
  64. auto commit_token = [&](auto type) -> Token& {
  65. VERIFY(token_start_position + m_previous_position - token_start_position + 1 <= m_input.length());
  66. auto substring = m_input.substring_view(token_start_position, m_previous_position - token_start_position + 1);
  67. m_current_token = Token(type, token_start_position, substring);
  68. return m_current_token;
  69. };
  70. auto emit_token = [&](auto type) -> Token& {
  71. m_current_token = Token(type, m_index, m_input.substring_view(m_index, 1));
  72. consume();
  73. return m_current_token;
  74. };
  75. auto match_escape_sequence = [&]() -> size_t {
  76. switch (peek(1)) {
  77. case '^':
  78. case '.':
  79. case '[':
  80. case ']':
  81. case '$':
  82. case '(':
  83. case ')':
  84. case '|':
  85. case '*':
  86. case '+':
  87. case '?':
  88. case '{':
  89. case '\\':
  90. return 2;
  91. default:
  92. dbgln_if(REGEX_DEBUG, "[LEXER] Found invalid escape sequence: \\{:c} (the parser will have to deal with this!)", peek(1));
  93. return 0;
  94. }
  95. };
  96. while (m_index < m_input.length()) {
  97. auto ch = peek();
  98. if (ch == '(')
  99. return emit_token(TokenType::LeftParen);
  100. if (ch == ')')
  101. return emit_token(TokenType::RightParen);
  102. if (ch == '{')
  103. return emit_token(TokenType::LeftCurly);
  104. if (ch == '}')
  105. return emit_token(TokenType::RightCurly);
  106. if (ch == '[')
  107. return emit_token(TokenType::LeftBracket);
  108. if (ch == ']')
  109. return emit_token(TokenType::RightBracket);
  110. if (ch == '.')
  111. return emit_token(TokenType::Period);
  112. if (ch == '*')
  113. return emit_token(TokenType::Asterisk);
  114. if (ch == '+')
  115. return emit_token(TokenType::Plus);
  116. if (ch == '$')
  117. return emit_token(TokenType::Dollar);
  118. if (ch == '^')
  119. return emit_token(TokenType::Circumflex);
  120. if (ch == '|')
  121. return emit_token(TokenType::Pipe);
  122. if (ch == '?')
  123. return emit_token(TokenType::Questionmark);
  124. if (ch == ',')
  125. return emit_token(TokenType::Comma);
  126. if (ch == '/')
  127. return emit_token(TokenType::Slash);
  128. if (ch == '=')
  129. return emit_token(TokenType::EqualSign);
  130. if (ch == ':')
  131. return emit_token(TokenType::Colon);
  132. if (ch == '-')
  133. return emit_token(TokenType::HyphenMinus);
  134. if (ch == '\\') {
  135. size_t escape = match_escape_sequence();
  136. if (escape > 0) {
  137. begin_token();
  138. for (size_t i = 0; i < escape; ++i)
  139. consume();
  140. return commit_token(TokenType::EscapeSequence);
  141. }
  142. }
  143. return emit_token(TokenType::Char);
  144. }
  145. return Token(TokenType::Eof, m_index, {});
  146. }
  147. }