RegexLexer.cpp 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. *
  8. * 1. Redistributions of source code must retain the above copyright notice, this
  9. * list of conditions and the following disclaimer.
  10. *
  11. * 2. Redistributions in binary form must reproduce the above copyright notice,
  12. * this list of conditions and the following disclaimer in the documentation
  13. * and/or other materials provided with the distribution.
  14. *
  15. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  16. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  19. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  21. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  22. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  23. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #include "RegexLexer.h"
  27. #include <AK/Assertions.h>
  28. #include <AK/LogStream.h>
  29. #include <stdio.h>
  30. namespace regex {
  31. const char* Token::name(const TokenType type)
  32. {
  33. switch (type) {
  34. #define __ENUMERATE_REGEX_TOKEN(x) \
  35. case TokenType::x: \
  36. return #x;
  37. ENUMERATE_REGEX_TOKENS
  38. #undef __ENUMERATE_REGEX_TOKEN
  39. default:
  40. ASSERT_NOT_REACHED();
  41. return "<Unknown>";
  42. }
  43. }
  44. const char* Token::name() const
  45. {
  46. return name(m_type);
  47. }
  48. Lexer::Lexer(const StringView source)
  49. : m_source(source)
  50. {
  51. }
  52. ALWAYS_INLINE char Lexer::peek(size_t offset) const
  53. {
  54. if ((m_position + offset) >= m_source.length())
  55. return EOF;
  56. return m_source[m_position + offset];
  57. }
  58. void Lexer::back(size_t offset)
  59. {
  60. if (offset == m_position + 1)
  61. offset = m_position; // 'position == 0' occurs twice.
  62. ASSERT(offset <= m_position);
  63. if (!offset)
  64. return;
  65. m_position -= offset;
  66. m_previous_position = (m_position > 0) ? m_position - 1 : 0;
  67. m_current_char = m_source[m_position];
  68. }
  69. ALWAYS_INLINE void Lexer::consume()
  70. {
  71. m_previous_position = m_position;
  72. if (m_position >= m_source.length()) {
  73. m_position = m_source.length() + 1;
  74. m_current_char = EOF;
  75. return;
  76. }
  77. m_current_char = m_source[m_position++];
  78. }
  79. void Lexer::reset()
  80. {
  81. m_position = 0;
  82. m_current_token = { TokenType::Eof, 0, StringView(nullptr) };
  83. m_current_char = 0;
  84. m_previous_position = 0;
  85. }
  86. bool Lexer::try_skip(char c)
  87. {
  88. if (peek() != c)
  89. return false;
  90. consume();
  91. return true;
  92. }
  93. char Lexer::skip()
  94. {
  95. auto c = peek();
  96. consume();
  97. return c;
  98. }
  99. Token Lexer::next()
  100. {
  101. size_t token_start_position;
  102. auto begin_token = [&] {
  103. token_start_position = m_position;
  104. };
  105. auto commit_token = [&](auto type) -> Token& {
  106. ASSERT(token_start_position + m_previous_position - token_start_position + 1 <= m_source.length());
  107. auto substring = m_source.substring_view(token_start_position, m_previous_position - token_start_position + 1);
  108. m_current_token = Token(type, token_start_position, substring);
  109. return m_current_token;
  110. };
  111. auto emit_token = [&](auto type) -> Token& {
  112. m_current_token = Token(type, m_position, m_source.substring_view(m_position, 1));
  113. consume();
  114. return m_current_token;
  115. };
  116. auto match_escape_sequence = [&]() -> size_t {
  117. switch (peek(1)) {
  118. case '^':
  119. case '.':
  120. case '[':
  121. case ']':
  122. case '$':
  123. case '(':
  124. case ')':
  125. case '|':
  126. case '*':
  127. case '+':
  128. case '?':
  129. case '{':
  130. case '\\':
  131. return 2;
  132. default:
  133. #ifdef REGEX_DEBUG
  134. fprintf(stderr, "[LEXER] Found invalid escape sequence: \\%c (the parser will have to deal with this!)\n", peek(1));
  135. #endif
  136. return 0;
  137. }
  138. };
  139. while (m_position <= m_source.length()) {
  140. auto ch = peek();
  141. if (ch == '(')
  142. return emit_token(TokenType::LeftParen);
  143. if (ch == ')')
  144. return emit_token(TokenType::RightParen);
  145. if (ch == '{')
  146. return emit_token(TokenType::LeftCurly);
  147. if (ch == '}')
  148. return emit_token(TokenType::RightCurly);
  149. if (ch == '[')
  150. return emit_token(TokenType::LeftBracket);
  151. if (ch == ']')
  152. return emit_token(TokenType::RightBracket);
  153. if (ch == '.')
  154. return emit_token(TokenType::Period);
  155. if (ch == '*')
  156. return emit_token(TokenType::Asterisk);
  157. if (ch == '+')
  158. return emit_token(TokenType::Plus);
  159. if (ch == '$')
  160. return emit_token(TokenType::Dollar);
  161. if (ch == '^')
  162. return emit_token(TokenType::Circumflex);
  163. if (ch == '|')
  164. return emit_token(TokenType::Pipe);
  165. if (ch == '?')
  166. return emit_token(TokenType::Questionmark);
  167. if (ch == ',')
  168. return emit_token(TokenType::Comma);
  169. if (ch == '/')
  170. return emit_token(TokenType::Slash);
  171. if (ch == '=')
  172. return emit_token(TokenType::EqualSign);
  173. if (ch == ':')
  174. return emit_token(TokenType::Colon);
  175. if (ch == '-')
  176. return emit_token(TokenType::HyphenMinus);
  177. if (ch == '\\') {
  178. size_t escape = match_escape_sequence();
  179. if (escape > 0) {
  180. begin_token();
  181. for (size_t i = 0; i < escape; ++i)
  182. consume();
  183. return commit_token(TokenType::EscapeSequence);
  184. }
  185. }
  186. if (ch == EOF)
  187. break;
  188. return emit_token(TokenType::Char);
  189. }
  190. return Token(TokenType::Eof, m_position, nullptr);
  191. }
  192. }