/*
 * Copyright (c) 2023, Dan Klishch
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/CharacterTypes.h>
#include <AK/GenericLexer.h>

#include "Parser/Lexer.h"
#include "Parser/SpecParser.h"
#include "Parser/XMLUtils.h"

namespace JSSpecCompiler {

namespace {

// Consumes an optionally negative decimal number with an optional fractional part and
// returns it as a Number token, or returns an empty Optional without advancing the lexer.
Optional<Token> consume_number(LineTrackingLexer& lexer, Location& location)
{
    u64 start = lexer.tell();

    if (lexer.next_is('-'))
        lexer.consume(1);

    if (!lexer.next_is(is_ascii_digit)) {
        lexer.retreat(lexer.tell() - start);
        return {};
    }

    lexer.consume_while(is_ascii_digit);

    if (lexer.next_is('.')) {
        lexer.consume(1);
        if (lexer.consume_while(is_ascii_digit).length() == 0)
            lexer.retreat(1);
    }

    auto length = lexer.tell() - start;
    lexer.retreat(length);
    return { Token { TokenType::Number, lexer.consume(length), move(location) } };
}

bool can_end_word_token(char c)
{
    return is_ascii_space(c) || ".,"sv.contains(c);
}

// Splits a plain text run into punctuation/keyword tokens, numbers, and words.
void tokenize_string(SpecificationParsingContext& ctx, XML::Node const* node, StringView view, Vector<Token>& tokens)
{
    static constexpr struct {
        StringView text_to_match;
        TokenType token_type;
    } choices[] = {
        { "-"sv, TokenType::AmbiguousMinus },
        { "}"sv, TokenType::BraceClose },
        { "{"sv, TokenType::BraceOpen },
        { ":"sv, TokenType::Colon },
        { ","sv, TokenType::Comma },
        { "/"sv, TokenType::Division },
        { ". "sv, TokenType::Dot },
        { ".\n"sv, TokenType::Dot },
        { "="sv, TokenType::Equals },
        { "is equal to"sv, TokenType::Equals },
        { "!"sv, TokenType::ExclamationMark },
        { ">"sv, TokenType::Greater },
        { "is"sv, TokenType::Is },
        { "<"sv, TokenType::Less },
        { "."sv, TokenType::MemberAccess },
        { "×"sv, TokenType::Multiplication },
        { "is not equal to"sv, TokenType::NotEquals },
        { "≠"sv, TokenType::NotEquals },
        { ")"sv, TokenType::ParenClose },
        { "("sv, TokenType::ParenOpen },
        { "+"sv, TokenType::Plus },
        { "?"sv, TokenType::QuestionMark },
    };

    LineTrackingLexer lexer(view, node->offset);

    while (!lexer.is_eof()) {
        lexer.ignore_while(is_ascii_space);

        // FIXME: This is incorrect since we count text offset after XML reference resolution. To do
        //        this properly, we need support from XML::Parser.
        Location token_location = ctx.location_from_xml_offset(lexer.position_for(lexer.tell()));

        if (auto result = consume_number(lexer, token_location); result.has_value()) {
            tokens.append(result.release_value());
            continue;
        }

        bool matched = false;
        for (auto const& [text_to_match, token_type] : choices) {
            if (lexer.consume_specific(text_to_match)) {
                tokens.append({ token_type, ""sv, move(token_location) });
                matched = true;
                break;
            }
        }
        if (matched)
            continue;

        StringView word = lexer.consume_until(can_end_word_token);
        if (word.length())
            tokens.append({ TokenType::Word, word, move(token_location) });
    }
}

enum class TreeType {
    AlgorithmStep,
    NestedExpression,
    Header,
};

struct TokenizerState {
    Vector<Token> tokens;
    XML::Node const* substeps = nullptr;
    bool has_errors = false;
};

// Walks the children of an XML node, appending tokens for the elements and text runs it recognizes.
void tokenize_tree(SpecificationParsingContext& ctx, TokenizerState& state, XML::Node const* node, TreeType tree_type)
{
    // FIXME: Use structured binding once macOS Lagom CI updates to Clang >= 16.
    auto& tokens = state.tokens;
    auto& substeps = state.substeps;
    auto& has_errors = state.has_errors;

    for (auto const& child : node->as_element().children) {
        if (has_errors)
            break;

        child->content.visit(
            [&](XML::Node::Element const& element) -> void {
                Location child_location = ctx.location_from_xml_offset(child->offset);
                auto report_error = [&]<typename... Parameters>(AK::CheckedFormatString<Parameters...>&& fmt, Parameters const&... parameters) {
                    ctx.diag().error(child_location, move(fmt), parameters...);
                    has_errors = true;
                };

                if (substeps) {
                    report_error("substeps list must be the last child of algorithm step");
                    return;
                }

                if (element.name == tag_var) {
                    auto variable_name = get_text_contents(child);
                    if (!variable_name.has_value())
                        report_error("malformed <var> subtree, expected single text child node");

                    tokens.append({ TokenType::Identifier, variable_name.value_or(""sv), move(child_location) });
                    return;
                }

                if (element.name == tag_emu_val) {
                    auto maybe_contents = get_text_contents(child);
                    if (!maybe_contents.has_value())
                        report_error("malformed <emu-val> subtree, expected single text child node");

                    auto contents = maybe_contents.value_or(""sv);

                    if (contents.length() >= 2 && contents.starts_with('"') && contents.ends_with('"'))
                        tokens.append({ TokenType::String, contents.substring_view(1, contents.length() - 2), move(child_location) });
                    else if (contents.is_one_of("undefined", "null", "this", "true", "false"))
                        tokens.append({ TokenType::WellKnownValue, contents, move(child_location) });
                    else
                        tokens.append({ TokenType::Identifier, contents, move(child_location) });
                    return;
                }

                if (element.name == tag_emu_xref) {
                    auto identifier = get_single_child_with_tag(child, "a"sv).map([](XML::Node const* node) {
                        return get_text_contents(node).value_or(""sv);
                    });
                    if (!identifier.has_value() || identifier.value().is_empty())
                        report_error("malformed <emu-xref> subtree, expected <a> with nested single text node");

                    tokens.append({ TokenType::Identifier, identifier.value_or(""sv), move(child_location) });
                    return;
                }

                if (element.name == tag_sup) {
                    tokens.append({ TokenType::Superscript, ""sv, move(child_location) });
                    tokens.append({ TokenType::ParenOpen, ""sv, move(child_location) });
                    tokenize_tree(ctx, state, child, TreeType::NestedExpression);
                    tokens.append({ TokenType::ParenClose, ""sv, move(child_location) });
                    return;
                }

                if (tree_type == TreeType::Header && element.name == tag_span) {
                    auto element_class = get_attribute_by_name(child, attribute_class);
                    if (element_class != class_secnum)
                        report_error("expected <span> to have class='secnum' attribute");

                    auto section_number = get_text_contents(child);
                    if (!section_number.has_value())
                        report_error("malformed section number span subtree, expected single text child node");

                    tokens.append({ TokenType::SectionNumber, section_number.value_or(""sv), move(child_location) });
                    return;
                }

                if (tree_type == TreeType::AlgorithmStep && element.name == tag_ol) {
                    substeps = child;
                    return;
                }

                report_error("<{}> should not be a child of algorithm step", element.name);
            },
            [&](XML::Node::Text const& text) {
                auto view = text.builder.string_view();
                if (substeps != nullptr && !contains_empty_text(child)) {
                    ctx.diag().error(ctx.location_from_xml_offset(child->offset),
                        "substeps list must be the last child of algorithm step");
                } else {
                    tokenize_string(ctx, child, view, tokens);
                }
            },
            [&](auto const&) {});
    }

    if (tree_type == TreeType::AlgorithmStep && tokens.size() && tokens.last().type == TokenType::MemberAccess)
        tokens.last().type = TokenType::Dot;
}

}

StepTokenizationResult tokenize_step(SpecificationParsingContext& ctx, XML::Node const* node)
{
    TokenizerState state;
    tokenize_tree(ctx, state, node, TreeType::AlgorithmStep);
    return {
        .tokens = state.has_errors ? OptionalNone {} : Optional<Vector<Token>> { move(state.tokens) },
        .substeps = state.substeps,
    };
}

Optional<Vector<Token>> tokenize_header(SpecificationParsingContext& ctx, XML::Node const* node)
{
    TokenizerState state;
    tokenize_tree(ctx, state, node, TreeType::Header);
    return state.has_errors ? OptionalNone {} : Optional<Vector<Token>> { state.tokens };
}

}