ladybird/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/Lexer.cpp

/*
 * Copyright (c) 2023, Dan Klishch <danilklishch@gmail.com>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/NonnullOwnPtr.h>
#include <LibXML/Parser/Parser.h>

#include "Parser/Lexer.h"
#include "Parser/SpecParser.h"
#include "Parser/XMLUtils.h"

namespace JSSpecCompiler {
namespace {
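
// Attempts to consume a numeric literal: an optional leading '-', one or more digits,
// and an optional fractional part. On success returns a Number token covering the
// consumed text; otherwise the lexer position is restored and an empty Optional is returned.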
Optional<Token> consume_number(LineTrackingLexer& lexer, Location& location)
{
u64 start = lexer.tell();
if (lexer.next_is('-'))
lexer.consume(1);
if (!lexer.next_is(is_ascii_digit)) {
lexer.retreat(lexer.tell() - start);
return {};
}
lexer.consume_while(is_ascii_digit);
if (lexer.next_is('.')) {
lexer.consume(1);
if (lexer.consume_while(is_ascii_digit).length() == 0)
lexer.retreat(1);
}
auto length = lexer.tell() - start;
lexer.retreat(length);
return { Token { TokenType::Number, lexer.consume(length), move(location) } };
}
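
// A word token ends at whitespace, '.' or ','.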
bool can_end_word_token(char c)
{
return is_ascii_space(c) || ".,"sv.contains(c);
}
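
// Splits a plain text fragment into tokens: numbers, the fixed character sequences and
// keywords listed in `choices` below, and free-form words in between.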
void tokenize_string(SpecificationParsingContext& ctx, XML::Node const* node, StringView view, Vector<Token>& tokens)
{
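// Fixed character sequences and keywords that map directly to token types. Entries are
// tried in declaration order; note that ". " and ".\n" precede "." so a sentence-ending
// period becomes Dot rather than MemberAccess.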
static constexpr struct {
StringView text_to_match;
TokenType token_type;
} choices[] = {
{ "-"sv, TokenType::AmbiguousMinus },
{ "}"sv, TokenType::BraceClose },
{ "{"sv, TokenType::BraceOpen },
{ ":"sv, TokenType::Colon },
{ ","sv, TokenType::Comma },
{ "/"sv, TokenType::Division },
{ ". "sv, TokenType::Dot },
{ ".\n"sv, TokenType::Dot },
{ "="sv, TokenType::Equals },
{ "is equal to"sv, TokenType::Equals },
{ "!"sv, TokenType::ExclamationMark },
{ ">"sv, TokenType::Greater },
{ "is"sv, TokenType::Is },
{ "<"sv, TokenType::Less },
{ "."sv, TokenType::MemberAccess },
{ "×"sv, TokenType::Multiplication },
{ "is not equal to"sv, TokenType::NotEquals },
{ ""sv, TokenType::NotEquals },
{ ")"sv, TokenType::ParenClose },
{ "("sv, TokenType::ParenOpen },
{ "+"sv, TokenType::Plus },
{ "?"sv, TokenType::QuestionMark },
};
LineTrackingLexer lexer(view, node->offset);
while (!lexer.is_eof()) {
lexer.ignore_while(is_ascii_space);
// FIXME: This is incorrect since we count text offset after XML reference resolution. To do
// this properly, we need support from XML::Parser.
Location token_location = ctx.location_from_xml_offset(lexer.position_for(lexer.tell()));
if (auto result = consume_number(lexer, token_location); result.has_value()) {
tokens.append(result.release_value());
continue;
}
bool matched = false;
for (auto const& [text_to_match, token_type] : choices) {
if (lexer.consume_specific(text_to_match)) {
tokens.append({ token_type, ""sv, move(token_location) });
matched = true;
break;
}
}
if (matched)
continue;
StringView word = lexer.consume_until(can_end_word_token);
if (word.length())
tokens.append({ TokenType::Word, word, move(token_location) });
}
}
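
// Distinguishes what is being tokenized: a section-number <span> is only accepted in
// headers, and a trailing <ol> substeps list only in algorithm steps.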
enum class TreeType {
AlgorithmStep,
NestedExpression,
Header,
};
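
// Accumulated tokenization output: the token stream, an optional trailing substeps <ol>,
// and whether any error has been reported so far.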
struct TokenizerState {
Vector<Token> tokens;
XML::Node const* substeps = nullptr;
bool has_errors = false;
};
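
// Walks the children of `node`, turning recognized elements (<var>, <emu-val>, <emu-xref>,
// <sup>, section-number <span>, substeps <ol>) and plain text into tokens. Errors are
// reported through the parsing context and recorded in `state.has_errors`.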
void tokenize_tree(SpecificationParsingContext& ctx, TokenizerState& state, XML::Node const* node, TreeType tree_type)
{
// FIXME: Use structured binding once macOS Lagom CI updates to Clang >= 16.
auto& tokens = state.tokens;
auto& substeps = state.substeps;
auto& has_errors = state.has_errors;
for (auto const& child : node->as_element().children) {
if (has_errors)
break;
child->content.visit(
[&](XML::Node::Element const& element) -> void {
Location child_location = ctx.location_from_xml_offset(child->offset);
auto report_error = [&]<typename... Parameters>(AK::CheckedFormatString<Parameters...>&& fmt, Parameters const&... parameters) {
ctx.diag().error(child_location, move(fmt), parameters...);
has_errors = true;
};
if (substeps) {
report_error("substeps list must be the last child of algorithm step");
return;
}
if (element.name == tag_var) {
auto variable_name = get_text_contents(child);
if (!variable_name.has_value())
report_error("malformed <var> subtree, expected single text child node");
tokens.append({ TokenType::Identifier, variable_name.value_or(""sv), move(child_location) });
return;
}
if (element.name == tag_emu_val) {
auto maybe_contents = get_text_contents(child);
if (!maybe_contents.has_value())
report_error("malformed <emu-val> subtree, expected single text child node");
auto contents = maybe_contents.value_or(""sv);
if (contents.length() >= 2 && contents.starts_with('"') && contents.ends_with('"'))
tokens.append({ TokenType::String, contents.substring_view(1, contents.length() - 2), move(child_location) });
else if (contents.is_one_of("undefined", "null", "this", "true", "false"))
tokens.append({ TokenType::WellKnownValue, contents, move(child_location) });
else
tokens.append({ TokenType::Identifier, contents, move(child_location) });
return;
}
if (element.name == tag_emu_xref) {
auto identifier = get_single_child_with_tag(child, "a"sv).map([](XML::Node const* node) {
return get_text_contents(node).value_or(""sv);
});
if (!identifier.has_value() || identifier.value().is_empty())
report_error("malformed <emu-xref> subtree, expected <a> with nested single text node");
tokens.append({ TokenType::Identifier, identifier.value_or(""sv), move(child_location) });
return;
}
if (element.name == tag_sup) {
tokens.append({ TokenType::Superscript, ""sv, move(child_location) });
tokens.append({ TokenType::ParenOpen, ""sv, move(child_location) });
tokenize_tree(ctx, state, child, TreeType::NestedExpression);
tokens.append({ TokenType::ParenClose, ""sv, move(child_location) });
return;
}
if (tree_type == TreeType::Header && element.name == tag_span) {
auto element_class = get_attribute_by_name(child, attribute_class);
if (element_class != class_secnum)
report_error("expected <span> to have class='secnum' attribute");
auto section_number = get_text_contents(child);
if (!section_number.has_value())
report_error("malformed section number span subtree, expected single text child node");
tokens.append({ TokenType::SectionNumber, section_number.value_or(""sv), move(child_location) });
return;
}
if (tree_type == TreeType::AlgorithmStep && element.name == tag_ol) {
substeps = child;
return;
}
report_error("<{}> should not be a child of algorithm step", element.name);
},
[&](XML::Node::Text const& text) {
auto view = text.builder.string_view();
if (substeps != nullptr && !contains_empty_text(child)) {
ctx.diag().error(ctx.location_from_xml_offset(child->offset),
"substeps list must be the last child of algorithm step");
} else {
tokenize_string(ctx, child, view, tokens);
}
},
[&](auto const&) {});
}
if (tree_type == TreeType::AlgorithmStep && tokens.size() && tokens.last().type == TokenType::MemberAccess)
tokens.last().type = TokenType::Dot;
}
}
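
// Tokenizes a single algorithm step. Tokens are withheld if any error was reported;
// a trailing <ol> substeps list, if present, is returned separately.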
StepTokenizationResult tokenize_step(SpecificationParsingContext& ctx, XML::Node const* node)
{
TokenizerState state;
tokenize_tree(ctx, state, node, TreeType::AlgorithmStep);
return {
.tokens = state.has_errors ? OptionalNone {} : Optional<Vector<Token>> { move(state.tokens) },
.substeps = state.substeps,
};
}
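
// Tokenizes a clause header subtree; returns no tokens if any error was reported.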
Optional<Vector<Token>> tokenize_header(SpecificationParsingContext& ctx, XML::Node const* node)
{
TokenizerState state;
tokenize_tree(ctx, state, node, TreeType::Header);
return state.has_errors ? OptionalNone {} : Optional<Vector<Token>> { state.tokens };
}
}