ladybird/Meta/Lagom/Tools/CodeGenerators/JSSpecCompiler/Parser/Lexer.cpp

/*
 * Copyright (c) 2023, Dan Klishch <danilklishch@gmail.com>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/NonnullOwnPtr.h>
#include <LibXML/Parser/Parser.h>

#include "Parser/Lexer.h"
#include "Parser/SpecParser.h"
#include "Parser/XMLUtils.h"

namespace JSSpecCompiler {
namespace {
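
// Attempts to consume a numeric literal: an optional leading '-', one or more digits,
// and an optional fractional part. On success returns a Number token covering the
// consumed text; otherwise the lexer position is restored and an empty Optional is returned.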
Optional<Token> consume_number(LineTrackingLexer& lexer, Location& location)
{
u64 start = lexer.tell();
if (lexer.next_is('-'))
lexer.consume(1);
if (!lexer.next_is(is_ascii_digit)) {
lexer.retreat(lexer.tell() - start);
return {};
}
lexer.consume_while(is_ascii_digit);
if (lexer.next_is('.')) {
lexer.consume(1);
if (lexer.consume_while(is_ascii_digit).length() == 0)
lexer.retreat(1);
}
auto length = lexer.tell() - start;
lexer.retreat(length);
return { Token { TokenType::Number, lexer.consume(length), move(location) } };
}
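
// A word token ends at whitespace, '.' or ','.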
bool can_end_word_token(char c)
{
return is_ascii_space(c) || ".,"sv.contains(c);
}
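
// Splits a plain text fragment into tokens: numbers, the fixed character sequences and
// keywords listed in `choices` below, and free-form words in between.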
void tokenize_string(SpecificationParsingContext& ctx, XML::Node const* node, StringView view, Vector<Token>& tokens)
{
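// Fixed character sequences and keywords that map directly to token types. Entries are
// tried in declaration order; note that ". " and ".\n" precede "." so a sentence-ending
// period becomes Dot rather than MemberAccess.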
static constexpr struct {
StringView text_to_match;
TokenType token_type;
} choices[] = {
{ "-"sv, TokenType::AmbiguousMinus },
{ "}"sv, TokenType::BraceClose },
{ "{"sv, TokenType::BraceOpen },
{ ":"sv, TokenType::Colon },
{ ","sv, TokenType::Comma },
{ "/"sv, TokenType::Division },
{ ". "sv, TokenType::Dot },
{ ".\n"sv, TokenType::Dot },
{ "="sv, TokenType::Equals },
{ "is equal to"sv, TokenType::Equals },
{ "!"sv, TokenType::ExclamationMark },
{ ">"sv, TokenType::Greater },
{ "is"sv, TokenType::Is },
{ "<"sv, TokenType::Less },
{ "."sv, TokenType::MemberAccess },
{ "×"sv, TokenType::Multiplication },
{ "is not equal to"sv, TokenType::NotEquals },
{ ""sv, TokenType::NotEquals },
{ ")"sv, TokenType::ParenClose },
{ "("sv, TokenType::ParenOpen },
{ "+"sv, TokenType::Plus },
{ "?"sv, TokenType::QuestionMark },
};
LineTrackingLexer lexer(view, node->offset);
while (!lexer.is_eof()) {
lexer.ignore_while(is_ascii_space);
// FIXME: This is incorrect since we count text offset after XML reference resolution. To do
// this properly, we need support from XML::Parser.
Location token_location = ctx.location_from_xml_offset(lexer.position_for(lexer.tell()));
if (auto result = consume_number(lexer, token_location); result.has_value()) {
tokens.append(result.release_value());
continue;
}
bool matched = false;
for (auto const& [text_to_match, token_type] : choices) {
if (lexer.consume_specific(text_to_match)) {
tokens.append({ token_type, ""sv, move(token_location) });
matched = true;
break;
}
}
if (matched)
continue;
StringView word = lexer.consume_until(can_end_word_token);
if (word.length())
tokens.append({ TokenType::Word, word, move(token_location) });
}
}
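
// Distinguishes what is being tokenized: a section-number <span> is only accepted in
// headers, and a trailing <ol> substeps list only in algorithm steps.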
enum class TreeType {
AlgorithmStep,
NestedExpression,
Header,
};
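
// Accumulated tokenization output: the token stream, an optional trailing substeps <ol>,
// and whether any error has been reported so far.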
struct TokenizerState {
Vector<Token> tokens;
XML::Node const* substeps = nullptr;
bool has_errors = false;
};
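
// Walks the children of `node`, turning recognized elements (<var>, <emu-val>, <emu-xref>,
// <sup>, section-number <span>, substeps <ol>) and plain text into tokens. Errors are
// reported through the parsing context and recorded in `state.has_errors`.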
void tokenize_tree(SpecificationParsingContext& ctx, TokenizerState& state, XML::Node const* node, TreeType tree_type)
{
// FIXME: Use structured binding once macOS Lagom CI updates to Clang >= 16.
auto& tokens = state.tokens;
auto& substeps = state.substeps;
auto& has_errors = state.has_errors;
for (auto const& child : node->as_element().children) {
if (has_errors)
break;
child->content.visit(
[&](XML::Node::Element const& element) -> void {
Location child_location = ctx.location_from_xml_offset(child->offset);
auto report_error = [&]<typename... Parameters>(AK::CheckedFormatString<Parameters...>&& fmt, Parameters const&... parameters) {
ctx.diag().error(child_location, move(fmt), parameters...);
has_errors = true;
};
if (substeps) {
report_error("substeps list must be the last child of algorithm step");
return;
}
if (element.name == tag_var) {
auto variable_name = get_text_contents(child);
if (!variable_name.has_value())
report_error("malformed <var> subtree, expected single text child node");
tokens.append({ TokenType::Identifier, variable_name.value_or(""sv), move(child_location) });
return;
}
if (element.name == tag_emu_val) {
auto maybe_contents = get_text_contents(child);
if (!maybe_contents.has_value())
report_error("malformed <emu-val> subtree, expected single text child node");
auto contents = maybe_contents.value_or(""sv);
if (contents.length() >= 2 && contents.starts_with('"') && contents.ends_with('"'))
tokens.append({ TokenType::String, contents.substring_view(1, contents.length() - 2), move(child_location) });
else if (contents.is_one_of("undefined", "null", "this", "true", "false"))
tokens.append({ TokenType::WellKnownValue, contents, move(child_location) });
else
tokens.append({ TokenType::Identifier, contents, move(child_location) });
return;
}
if (element.name == tag_emu_xref) {
auto identifier = get_single_child_with_tag(child, "a"sv).map([](XML::Node const* node) {
return get_text_contents(node).value_or(""sv);
});
if (!identifier.has_value() || identifier.value().is_empty())
report_error("malformed <emu-xref> subtree, expected <a> with nested single text node");
tokens.append({ TokenType::Identifier, identifier.value_or(""sv), move(child_location) });
return;
}
if (element.name == tag_sup) {
tokens.append({ TokenType::Superscript, ""sv, move(child_location) });
tokens.append({ TokenType::ParenOpen, ""sv, move(child_location) });
tokenize_tree(ctx, state, child, TreeType::NestedExpression);
tokens.append({ TokenType::ParenClose, ""sv, move(child_location) });
return;
}
if (tree_type == TreeType::Header && element.name == tag_span) {
auto element_class = get_attribute_by_name(child, attribute_class);
if (element_class != class_secnum)
report_error("expected <span> to have class='secnum' attribute");
auto section_number = get_text_contents(child);
if (!section_number.has_value())
report_error("malformed section number span subtree, expected single text child node");
tokens.append({ TokenType::SectionNumber, section_number.value_or(""sv), move(child_location) });
return;
}
if (tree_type == TreeType::AlgorithmStep && element.name == tag_ol) {
substeps = child;
return;
}
report_error("<{}> should not be a child of algorithm step", element.name);
},
[&](XML::Node::Text const& text) {
auto view = text.builder.string_view();
if (substeps != nullptr && !contains_empty_text(child)) {
ctx.diag().error(ctx.location_from_xml_offset(child->offset),
"substeps list must be the last child of algorithm step");
} else {
tokenize_string(ctx, child, view, tokens);
}
},
[&](auto const&) {});
}
if (tree_type == TreeType::AlgorithmStep && tokens.size() && tokens.last().type == TokenType::MemberAccess)
tokens.last().type = TokenType::Dot;
}
}
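
// Tokenizes a single algorithm step. Tokens are withheld if any error was reported;
// a trailing <ol> substeps list, if present, is returned separately.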
StepTokenizationResult tokenize_step(SpecificationParsingContext& ctx, XML::Node const* node)
{
TokenizerState state;
tokenize_tree(ctx, state, node, TreeType::AlgorithmStep);
return {
.tokens = state.has_errors ? OptionalNone {} : Optional<Vector<Token>> { move(state.tokens) },
.substeps = state.substeps,
};
}
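
// Tokenizes a clause header subtree; returns no tokens if any error was reported.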
Optional<Vector<Token>> tokenize_header(SpecificationParsingContext& ctx, XML::Node const* node)
{
TokenizerState state;
tokenize_tree(ctx, state, node, TreeType::Header);
return state.has_errors ? OptionalNone {} : Optional<Vector<Token>> { state.tokens };
}
}