Lexer.cpp 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253
  1. /*
  2. * Copyright (c) 2023, Dan Klishch <danilklishch@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/NonnullOwnPtr.h>
  7. #include <LibXML/Parser/Parser.h>
  8. #include "Parser/Lexer.h"
  9. #include "Parser/SpecParser.h"
  10. #include "Parser/XMLUtils.h"
  11. namespace JSSpecCompiler {
  12. namespace {
  13. Optional<Token> consume_number(LineTrackingLexer& lexer, Location& location)
  14. {
  15. u64 start = lexer.tell();
  16. if (lexer.next_is('-'))
  17. lexer.consume(1);
  18. if (!lexer.next_is(is_ascii_digit)) {
  19. lexer.retreat(lexer.tell() - start);
  20. return {};
  21. }
  22. lexer.consume_while(is_ascii_digit);
  23. if (lexer.next_is('.')) {
  24. lexer.consume(1);
  25. if (lexer.consume_while(is_ascii_digit).length() == 0)
  26. lexer.retreat(1);
  27. }
  28. auto length = lexer.tell() - start;
  29. lexer.retreat(length);
  30. return { Token { TokenType::Number, lexer.consume(length), move(location) } };
  31. }
  32. bool can_end_word_token(char c)
  33. {
  34. return is_ascii_space(c) || ".,"sv.contains(c);
  35. }
  36. void tokenize_string(SpecificationParsingContext& ctx, XML::Node const* node, StringView view, Vector<Token>& tokens)
  37. {
  38. static constexpr struct {
  39. StringView text_to_match;
  40. TokenType token_type;
  41. } choices[] = {
  42. { "-"sv, TokenType::AmbiguousMinus },
  43. { "}"sv, TokenType::BraceClose },
  44. { "{"sv, TokenType::BraceOpen },
  45. { ":"sv, TokenType::Colon },
  46. { ","sv, TokenType::Comma },
  47. { "/"sv, TokenType::Division },
  48. { ". "sv, TokenType::Dot },
  49. { ".\n"sv, TokenType::Dot },
  50. { "="sv, TokenType::Equals },
  51. { "is equal to"sv, TokenType::Equals },
  52. { "!"sv, TokenType::ExclamationMark },
  53. { ">"sv, TokenType::Greater },
  54. { "is"sv, TokenType::Is },
  55. { "<"sv, TokenType::Less },
  56. { "»"sv, TokenType::ListEnd },
  57. { "«"sv, TokenType::ListStart },
  58. { "."sv, TokenType::MemberAccess },
  59. { "×"sv, TokenType::Multiplication },
  60. { "is not equal to"sv, TokenType::NotEquals },
  61. { "≠"sv, TokenType::NotEquals },
  62. { ")"sv, TokenType::ParenClose },
  63. { "("sv, TokenType::ParenOpen },
  64. { "+"sv, TokenType::Plus },
  65. { "?"sv, TokenType::QuestionMark },
  66. };
  67. LineTrackingLexer lexer(view, node->offset);
  68. while (!lexer.is_eof()) {
  69. lexer.ignore_while(is_ascii_space);
  70. // FIXME: This is incorrect since we count text offset after XML reference resolution. To do
  71. // this properly, we need support from XML::Parser.
  72. Location token_location = ctx.location_from_xml_offset(lexer.position_for(lexer.tell()));
  73. if (auto result = consume_number(lexer, token_location); result.has_value()) {
  74. tokens.append(result.release_value());
  75. continue;
  76. }
  77. bool matched = false;
  78. for (auto const& [text_to_match, token_type] : choices) {
  79. if (lexer.consume_specific(text_to_match)) {
  80. tokens.append({ token_type, ""sv, move(token_location) });
  81. matched = true;
  82. break;
  83. }
  84. }
  85. if (matched)
  86. continue;
  87. StringView word = lexer.consume_until(can_end_word_token);
  88. if (word.length())
  89. tokens.append({ TokenType::Word, word, move(token_location) });
  90. }
  91. }
  92. enum class TreeType {
  93. AlgorithmStep,
  94. NestedExpression,
  95. Header,
  96. };
  97. struct TokenizerState {
  98. Vector<Token> tokens;
  99. XML::Node const* substeps = nullptr;
  100. bool has_errors = false;
  101. };
  102. void tokenize_tree(SpecificationParsingContext& ctx, TokenizerState& state, XML::Node const* node, TreeType tree_type)
  103. {
  104. // FIXME: Use structured binding once macOS Lagom CI updates to Clang >= 16.
  105. auto& tokens = state.tokens;
  106. auto& substeps = state.substeps;
  107. auto& has_errors = state.has_errors;
  108. for (auto const& child : node->as_element().children) {
  109. if (has_errors)
  110. break;
  111. child->content.visit(
  112. [&](XML::Node::Element const& element) -> void {
  113. Location child_location = ctx.location_from_xml_offset(child->offset);
  114. auto report_error = [&]<typename... Parameters>(AK::CheckedFormatString<Parameters...>&& fmt, Parameters const&... parameters) {
  115. ctx.diag().error(child_location, move(fmt), parameters...);
  116. has_errors = true;
  117. };
  118. if (substeps) {
  119. report_error("substeps list must be the last child of algorithm step");
  120. return;
  121. }
  122. if (element.name == tag_var) {
  123. auto variable_name = get_text_contents(child);
  124. if (!variable_name.has_value())
  125. report_error("malformed <var> subtree, expected single text child node");
  126. tokens.append({ TokenType::Identifier, variable_name.value_or(""sv), move(child_location) });
  127. return;
  128. }
  129. if (element.name == tag_emu_val) {
  130. auto maybe_contents = get_text_contents(child);
  131. if (!maybe_contents.has_value())
  132. report_error("malformed <emu-val> subtree, expected single text child node");
  133. auto contents = maybe_contents.value_or(""sv);
  134. if (contents.length() >= 2 && contents.starts_with('"') && contents.ends_with('"'))
  135. tokens.append({ TokenType::String, contents.substring_view(1, contents.length() - 2), move(child_location) });
  136. else if (contents.is_one_of("undefined", "null", "this", "true", "false"))
  137. tokens.append({ TokenType::WellKnownValue, contents, move(child_location) });
  138. else
  139. tokens.append({ TokenType::Identifier, contents, move(child_location) });
  140. return;
  141. }
  142. if (element.name == tag_emu_xref) {
  143. auto identifier = get_single_child_with_tag(child, "a"sv).map([](XML::Node const* node) {
  144. return get_text_contents(node).value_or(""sv);
  145. });
  146. if (!identifier.has_value() || identifier.value().is_empty())
  147. report_error("malformed <emu-xref> subtree, expected <a> with nested single text node");
  148. tokens.append({ TokenType::Identifier, identifier.value_or(""sv), move(child_location) });
  149. return;
  150. }
  151. if (element.name == tag_sup) {
  152. tokens.append({ TokenType::Superscript, ""sv, move(child_location) });
  153. tokens.append({ TokenType::ParenOpen, ""sv, move(child_location) });
  154. tokenize_tree(ctx, state, child, TreeType::NestedExpression);
  155. tokens.append({ TokenType::ParenClose, ""sv, move(child_location) });
  156. return;
  157. }
  158. if (element.name == tag_emu_const) {
  159. auto maybe_contents = get_text_contents(child);
  160. if (!maybe_contents.has_value())
  161. report_error("malformed <emu-const> subtree, expected single text child node");
  162. tokens.append({ TokenType::Enumerator, maybe_contents.value_or(""sv), move(child_location) });
  163. return;
  164. }
  165. if (tree_type == TreeType::Header && element.name == tag_span) {
  166. auto element_class = get_attribute_by_name(child, attribute_class);
  167. if (element_class != class_secnum)
  168. report_error("expected <span> to have class='secnum' attribute");
  169. auto section_number = get_text_contents(child);
  170. if (!section_number.has_value())
  171. report_error("malformed section number span subtree, expected single text child node");
  172. tokens.append({ TokenType::SectionNumber, section_number.value_or(""sv), move(child_location) });
  173. return;
  174. }
  175. if (tree_type == TreeType::AlgorithmStep && element.name == tag_ol) {
  176. substeps = child;
  177. return;
  178. }
  179. report_error("<{}> should not be a child of algorithm step", element.name);
  180. },
  181. [&](XML::Node::Text const& text) {
  182. auto view = text.builder.string_view();
  183. if (substeps != nullptr && !contains_empty_text(child)) {
  184. ctx.diag().error(ctx.location_from_xml_offset(child->offset),
  185. "substeps list must be the last child of algorithm step");
  186. } else {
  187. tokenize_string(ctx, child, view, tokens);
  188. }
  189. },
  190. [&](auto const&) {});
  191. }
  192. if (tree_type == TreeType::AlgorithmStep && tokens.size() && tokens.last().type == TokenType::MemberAccess)
  193. tokens.last().type = TokenType::Dot;
  194. }
  195. }
  196. StepTokenizationResult tokenize_step(SpecificationParsingContext& ctx, XML::Node const* node)
  197. {
  198. TokenizerState state;
  199. tokenize_tree(ctx, state, node, TreeType::AlgorithmStep);
  200. return {
  201. .tokens = state.has_errors ? OptionalNone {} : Optional<Vector<Token>> { move(state.tokens) },
  202. .substeps = state.substeps,
  203. };
  204. }
  205. Optional<Vector<Token>> tokenize_header(SpecificationParsingContext& ctx, XML::Node const* node)
  206. {
  207. TokenizerState state;
  208. tokenize_tree(ctx, state, node, TreeType::Header);
  209. return state.has_errors ? OptionalNone {} : Optional<Vector<Token>> { state.tokens };
  210. }
  211. }