GenericLexer.cpp 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. /*
  2. * Copyright (c) 2020, Benoit Lormeau <blormeau@outlook.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/Assertions.h>
  7. #include <AK/CharacterTypes.h>
  8. #include <AK/GenericLexer.h>
  9. #include <AK/String.h>
  10. #include <AK/StringBuilder.h>
  11. #include <AK/Utf16View.h>
  12. namespace AK {
  13. // Consume a number of characters
  14. StringView GenericLexer::consume(size_t count)
  15. {
  16. if (count == 0)
  17. return {};
  18. size_t start = m_index;
  19. size_t length = min(count, m_input.length() - m_index);
  20. m_index += length;
  21. return m_input.substring_view(start, length);
  22. }
  23. // Consume the rest of the input
  24. StringView GenericLexer::consume_all()
  25. {
  26. if (is_eof())
  27. return {};
  28. auto rest = m_input.substring_view(m_index, m_input.length() - m_index);
  29. m_index = m_input.length();
  30. return rest;
  31. }
  32. // Consume until a new line is found
  33. StringView GenericLexer::consume_line()
  34. {
  35. size_t start = m_index;
  36. while (!is_eof() && peek() != '\r' && peek() != '\n')
  37. m_index++;
  38. size_t length = m_index - start;
  39. consume_specific('\r');
  40. consume_specific('\n');
  41. if (length == 0)
  42. return {};
  43. return m_input.substring_view(start, length);
  44. }
  45. // Consume and return characters until `stop` is peek'd
  46. // The `stop` character is ignored, as it is user-defined
  47. StringView GenericLexer::consume_until(char stop)
  48. {
  49. size_t start = m_index;
  50. while (!is_eof() && peek() != stop)
  51. m_index++;
  52. size_t length = m_index - start;
  53. ignore();
  54. if (length == 0)
  55. return {};
  56. return m_input.substring_view(start, length);
  57. }
  58. // Consume and return characters until the string `stop` is found
  59. // The `stop` string is ignored, as it is user-defined
  60. StringView GenericLexer::consume_until(const char* stop)
  61. {
  62. size_t start = m_index;
  63. while (!is_eof() && !next_is(stop))
  64. m_index++;
  65. size_t length = m_index - start;
  66. ignore(__builtin_strlen(stop));
  67. if (length == 0)
  68. return {};
  69. return m_input.substring_view(start, length);
  70. }
  71. /*
  72. * Consume a string surrounded by single or double quotes. The returned
  73. * StringView does not include the quotes. An escape character can be provided
  74. * to capture the enclosing quotes. Please note that the escape character will
  75. * still be in the resulting StringView
  76. */
  77. StringView GenericLexer::consume_quoted_string(char escape_char)
  78. {
  79. if (!next_is(is_quote))
  80. return {};
  81. char quote_char = consume();
  82. size_t start = m_index;
  83. while (!is_eof()) {
  84. if (next_is(escape_char))
  85. m_index++;
  86. else if (next_is(quote_char))
  87. break;
  88. m_index++;
  89. }
  90. size_t length = m_index - start;
  91. if (peek() != quote_char) {
  92. // Restore the index in case the string is unterminated
  93. m_index = start - 1;
  94. return {};
  95. }
  96. // Ignore closing quote
  97. ignore();
  98. return m_input.substring_view(start, length);
  99. }
  100. String GenericLexer::consume_and_unescape_string(char escape_char)
  101. {
  102. auto view = consume_quoted_string(escape_char);
  103. if (view.is_null())
  104. return {};
  105. StringBuilder builder;
  106. for (size_t i = 0; i < view.length(); ++i)
  107. builder.append(consume_escaped_character(escape_char));
  108. return builder.to_string();
  109. }
  110. auto GenericLexer::consume_escaped_code_point(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError>
  111. {
  112. if (!consume_specific("\\u"sv))
  113. return UnicodeEscapeError::MalformedUnicodeEscape;
  114. if (next_is('{'))
  115. return decode_code_point();
  116. return decode_single_or_paired_surrogate(combine_surrogate_pairs);
  117. }
  118. auto GenericLexer::decode_code_point() -> Result<u32, UnicodeEscapeError>
  119. {
  120. bool starts_with_open_bracket = consume_specific('{');
  121. VERIFY(starts_with_open_bracket);
  122. u32 code_point = 0;
  123. while (true) {
  124. if (!next_is(is_ascii_hex_digit))
  125. return UnicodeEscapeError::MalformedUnicodeEscape;
  126. auto new_code_point = (code_point << 4u) | parse_ascii_hex_digit(consume());
  127. if (new_code_point < code_point)
  128. return UnicodeEscapeError::UnicodeEscapeOverflow;
  129. code_point = new_code_point;
  130. if (consume_specific('}'))
  131. break;
  132. }
  133. if (is_unicode(code_point))
  134. return code_point;
  135. return UnicodeEscapeError::UnicodeEscapeOverflow;
  136. }
  137. auto GenericLexer::decode_single_or_paired_surrogate(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError>
  138. {
  139. constexpr size_t surrogate_length = 4;
  140. auto decode_one_surrogate = [&]() -> Optional<u16> {
  141. u16 surrogate = 0;
  142. for (size_t i = 0; i < surrogate_length; ++i) {
  143. if (!next_is(is_ascii_hex_digit))
  144. return {};
  145. surrogate = (surrogate << 4u) | parse_ascii_hex_digit(consume());
  146. }
  147. return surrogate;
  148. };
  149. auto high_surrogate = decode_one_surrogate();
  150. if (!high_surrogate.has_value())
  151. return UnicodeEscapeError::MalformedUnicodeEscape;
  152. if (!Utf16View::is_high_surrogate(*high_surrogate))
  153. return *high_surrogate;
  154. if (!combine_surrogate_pairs || !consume_specific("\\u"sv))
  155. return *high_surrogate;
  156. auto low_surrogate = decode_one_surrogate();
  157. if (!low_surrogate.has_value())
  158. return UnicodeEscapeError::MalformedUnicodeEscape;
  159. if (Utf16View::is_low_surrogate(*low_surrogate))
  160. return Utf16View::decode_surrogate_pair(*high_surrogate, *low_surrogate);
  161. retreat(6);
  162. return *high_surrogate;
  163. }
  164. }