Lexer.cpp 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176
  1. /*
  2. * Copyright (c) 2023, Dan Klishch <danilklishch@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/NonnullOwnPtr.h>
  7. #include <LibXML/Parser/Parser.h>
  8. #include "Parser/Lexer.h"
  9. #include "Parser/SpecParser.h"
  10. #include "Parser/XMLUtils.h"
  11. namespace JSSpecCompiler {
  12. namespace {
  13. Optional<Token> consume_number(XML::LineTrackingLexer& lexer, XML::Node const* node, Location& location)
  14. {
  15. u64 start = lexer.tell();
  16. if (lexer.next_is('-'))
  17. lexer.consume(1);
  18. if (!lexer.next_is(is_ascii_digit)) {
  19. lexer.retreat(lexer.tell() - start);
  20. return {};
  21. }
  22. lexer.consume_while(is_ascii_digit);
  23. if (lexer.next_is('.')) {
  24. lexer.consume(1);
  25. if (lexer.consume_while(is_ascii_digit).length() == 0)
  26. lexer.retreat(1);
  27. }
  28. auto length = lexer.tell() - start;
  29. lexer.retreat(length);
  30. return { Token { TokenType::Number, lexer.consume(length), node, move(location) } };
  31. }
  32. bool can_end_word_token(char c)
  33. {
  34. return is_ascii_space(c) || ".,"sv.contains(c);
  35. }
  36. }
  37. ParseErrorOr<void> tokenize_string(SpecificationParsingContext& ctx, XML::Node const* node, StringView view, Vector<Token>& tokens)
  38. {
  39. static constexpr struct {
  40. StringView text_to_match;
  41. TokenType token_type;
  42. } choices[] = {
  43. { "-"sv, TokenType::AmbiguousMinus },
  44. { "}"sv, TokenType::BraceClose },
  45. { "{"sv, TokenType::BraceOpen },
  46. { ":"sv, TokenType::Colon },
  47. { ","sv, TokenType::Comma },
  48. { "/"sv, TokenType::Division },
  49. { ". "sv, TokenType::Dot },
  50. { ".\n"sv, TokenType::Dot },
  51. { "="sv, TokenType::Equals },
  52. { "is equal to"sv, TokenType::Equals },
  53. { "!"sv, TokenType::ExclamationMark },
  54. { ">"sv, TokenType::Greater },
  55. { "is"sv, TokenType::Is },
  56. { "<"sv, TokenType::Less },
  57. { "."sv, TokenType::MemberAccess },
  58. { "×"sv, TokenType::Multiplication },
  59. { "is not equal to"sv, TokenType::NotEquals },
  60. { "≠"sv, TokenType::NotEquals },
  61. { ")"sv, TokenType::ParenClose },
  62. { "("sv, TokenType::ParenOpen },
  63. { "+"sv, TokenType::Plus },
  64. };
  65. XML::LineTrackingLexer lexer(view, node->offset);
  66. while (!lexer.is_eof()) {
  67. lexer.ignore_while(is_ascii_space);
  68. // FIXME: This is incorrect since we count text offset after XML reference resolution. To do
  69. // this properly, we need support from XML::Parser.
  70. Location token_location = ctx.location_from_xml_offset(lexer.offset_for(lexer.tell()));
  71. if (auto result = consume_number(lexer, node, token_location); result.has_value()) {
  72. tokens.append(result.release_value());
  73. continue;
  74. }
  75. bool matched = false;
  76. for (auto const& [text_to_match, token_type] : choices) {
  77. if (lexer.consume_specific(text_to_match)) {
  78. tokens.append({ token_type, ""sv, node, move(token_location) });
  79. matched = true;
  80. break;
  81. }
  82. }
  83. if (matched)
  84. continue;
  85. StringView word = lexer.consume_until(can_end_word_token);
  86. if (word.length())
  87. tokens.append({ TokenType::Word, word, node, move(token_location) });
  88. }
  89. return {};
  90. }
  91. ParseErrorOr<TokenizeTreeResult> tokenize_tree(SpecificationParsingContext& ctx, XML::Node const* node, bool allow_substeps)
  92. {
  93. TokenizeTreeResult result;
  94. auto& tokens = result.tokens;
  95. for (auto const& child : node->as_element().children) {
  96. TRY(child->content.visit(
  97. [&](XML::Node::Element const& element) -> ParseErrorOr<void> {
  98. if (result.substeps != nullptr)
  99. return ParseError::create("Substeps list must be the last non-empty child"sv, child);
  100. Location child_location = ctx.location_from_xml_offset(child->offset);
  101. if (element.name == tag_var) {
  102. tokens.append({ TokenType::Identifier, TRY(get_text_contents(child)), child, move(child_location) });
  103. return {};
  104. }
  105. if (element.name == tag_span) {
  106. auto element_class = TRY(deprecated_get_attribute_by_name(child, attribute_class));
  107. if (element_class != class_secnum)
  108. return ParseError::create(String::formatted("Expected 'secnum' as a class name of <span>, but found '{}'", element_class), child);
  109. tokens.append({ TokenType::SectionNumber, TRY(get_text_contents(child)), child, move(child_location) });
  110. return {};
  111. }
  112. if (element.name == tag_emu_val) {
  113. auto contents = TRY(get_text_contents(child));
  114. if (contents.length() >= 2 && contents.starts_with('"') && contents.ends_with('"'))
  115. tokens.append({ TokenType::String, contents.substring_view(1, contents.length() - 2), child, move(child_location) });
  116. else if (contents == "undefined")
  117. tokens.append({ TokenType::Undefined, contents, child, move(child_location) });
  118. else
  119. tokens.append({ TokenType::Identifier, contents, child, move(child_location) });
  120. return {};
  121. }
  122. if (element.name == tag_emu_xref) {
  123. auto contents = TRY(get_text_contents(TRY(get_only_child(child, "a"sv))));
  124. tokens.append({ TokenType::Identifier, contents, child, move(child_location) });
  125. return {};
  126. }
  127. if (element.name == tag_ol) {
  128. if (!allow_substeps)
  129. return ParseError::create("Found nested list but substeps are not allowed"sv, child);
  130. result.substeps = child;
  131. return {};
  132. }
  133. return ParseError::create(String::formatted("Unexpected child element with tag {}", element.name), child);
  134. },
  135. [&](XML::Node::Text const& text) -> ParseErrorOr<void> {
  136. auto view = text.builder.string_view();
  137. if (result.substeps && !contains_empty_text(child))
  138. return ParseError::create("Substeps list must be the last non-empty child"sv, child);
  139. return tokenize_string(ctx, child, view, tokens);
  140. },
  141. move(ignore_comments)));
  142. }
  143. if (tokens.size() && tokens.last().type == TokenType::MemberAccess)
  144. tokens.last().type = TokenType::Dot;
  145. return result;
  146. }
  147. }