Lexer.cpp 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161
  1. /*
  2. * Copyright (c) 2023, Dan Klishch <danilklishch@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
#include <AK/CharacterTypes.h>
#include <AK/GenericLexer.h>
#include <AK/NonnullOwnPtr.h>

#include "Parser/Lexer.h"
#include "Parser/XMLUtils.h"
  10. namespace JSSpecCompiler {
  11. namespace {
  12. Optional<Token> consume_number(GenericLexer& lexer, XML::Node const* node)
  13. {
  14. u64 start = lexer.tell();
  15. if (lexer.next_is('-'))
  16. lexer.consume(1);
  17. if (!lexer.next_is(is_ascii_digit)) {
  18. lexer.retreat(lexer.tell() - start);
  19. return {};
  20. }
  21. lexer.consume_while(is_ascii_digit);
  22. if (lexer.next_is('.')) {
  23. lexer.consume(1);
  24. if (lexer.consume_while(is_ascii_digit).length() == 0)
  25. lexer.retreat(1);
  26. }
  27. auto length = lexer.tell() - start;
  28. lexer.retreat(length);
  29. return { Token { TokenType::Number, lexer.consume(length), node } };
  30. }
  31. bool can_end_word_token(char c)
  32. {
  33. return is_ascii_space(c) || ".,"sv.contains(c);
  34. }
  35. }
  36. ParseErrorOr<void> tokenize_string(XML::Node const* node, StringView view, Vector<Token>& tokens)
  37. {
  38. #define CONSUME_IF_NEXT(view, type) \
  39. if (lexer.next_is(view##sv)) { \
  40. size_t length = __builtin_strlen(view); \
  41. tokens.append({ TokenType::type, lexer.consume(length), node }); \
  42. continue; \
  43. }
  44. GenericLexer lexer(view);
  45. while (!lexer.is_eof()) {
  46. lexer.ignore_while(is_ascii_space);
  47. if (auto result = consume_number(lexer, node); result.has_value()) {
  48. tokens.append(result.release_value());
  49. continue;
  50. }
  51. CONSUME_IF_NEXT("(", ParenOpen);
  52. CONSUME_IF_NEXT(")", ParenClose);
  53. CONSUME_IF_NEXT("{", BraceOpen);
  54. CONSUME_IF_NEXT("}", BraceClose);
  55. CONSUME_IF_NEXT(",", Comma);
  56. CONSUME_IF_NEXT(". ", Dot);
  57. CONSUME_IF_NEXT(".\n", Dot);
  58. CONSUME_IF_NEXT(":", Colon);
  59. CONSUME_IF_NEXT(".", MemberAccess);
  60. CONSUME_IF_NEXT("<", Less);
  61. CONSUME_IF_NEXT(">", Greater);
  62. CONSUME_IF_NEXT("is not equal to", NotEquals);
  63. CONSUME_IF_NEXT("≠", NotEquals);
  64. CONSUME_IF_NEXT("is equal to", Equals);
  65. CONSUME_IF_NEXT("=", Equals);
  66. CONSUME_IF_NEXT("+", Plus);
  67. CONSUME_IF_NEXT("-", AmbiguousMinus);
  68. CONSUME_IF_NEXT("×", Multiplication);
  69. CONSUME_IF_NEXT("/", Division);
  70. CONSUME_IF_NEXT("!", ExclamationMark);
  71. CONSUME_IF_NEXT("is", Is);
  72. StringView word = lexer.consume_until(can_end_word_token);
  73. if (word.length())
  74. tokens.append({ TokenType::Word, word, node });
  75. }
  76. return {};
  77. #undef CONSUME_IF_NEXT
  78. }
  79. ParseErrorOr<TokenizeTreeResult> tokenize_tree(XML::Node const* node, bool allow_substeps)
  80. {
  81. TokenizeTreeResult result;
  82. auto& tokens = result.tokens;
  83. for (auto const& child : node->as_element().children) {
  84. TRY(child->content.visit(
  85. [&](XML::Node::Element const& element) -> ParseErrorOr<void> {
  86. if (result.substeps != nullptr)
  87. return ParseError::create("Substeps list must be the last non-empty child"sv, child);
  88. if (element.name == tag_var) {
  89. tokens.append({ TokenType::Identifier, TRY(get_text_contents(child)), child });
  90. return {};
  91. }
  92. if (element.name == tag_span) {
  93. auto element_class = TRY(get_attribute_by_name(child, attribute_class));
  94. if (element_class != class_secnum)
  95. return ParseError::create(String::formatted("Expected 'secnum' as a class name of <span>, but found '{}'", element_class), child);
  96. tokens.append({ TokenType::SectionNumber, TRY(get_text_contents(child)), child });
  97. return {};
  98. }
  99. if (element.name == tag_emu_val) {
  100. auto contents = TRY(get_text_contents(child));
  101. if (contents.length() >= 2 && contents.starts_with('"') && contents.ends_with('"'))
  102. tokens.append({ TokenType::String, contents.substring_view(1, contents.length() - 2), child });
  103. else if (contents == "undefined")
  104. tokens.append({ TokenType::Undefined, contents, child });
  105. else
  106. tokens.append({ TokenType::Identifier, contents, child });
  107. return {};
  108. }
  109. if (element.name == tag_emu_xref) {
  110. auto contents = TRY(get_text_contents(TRY(get_only_child(child, "a"sv))));
  111. tokens.append({ TokenType::Identifier, contents, child });
  112. return {};
  113. }
  114. if (element.name == tag_ol) {
  115. if (!allow_substeps)
  116. return ParseError::create("Found nested list but substeps are not allowed"sv, child);
  117. result.substeps = child;
  118. return {};
  119. }
  120. return ParseError::create(String::formatted("Unexpected child element with tag {}", element.name), child);
  121. },
  122. [&](XML::Node::Text const& text) -> ParseErrorOr<void> {
  123. auto view = text.builder.string_view();
  124. if (result.substeps && !contains_empty_text(child))
  125. return ParseError::create("Substeps list must be the last non-empty child"sv, child);
  126. return tokenize_string(child, view, tokens);
  127. },
  128. move(ignore_comments)));
  129. }
  130. if (tokens.size() && tokens.last().type == TokenType::MemberAccess)
  131. tokens.last().type = TokenType::Dot;
  132. return result;
  133. }
  134. }