瀏覽代碼

DevTools: Add StateMachineGenerator utility

This program turns a description of a state machine that takes its input
byte-by-byte into C++ code. The state machine is described in a custom
format as specified below:

```
// Comments are started by two slashes, and cause the rest of the line
// to be ignored

@name ExampleStateMachine // sets the name of the generated class
@namespace Test           // sets the namespace (optional)
@begin Begin              // sets the state the parser will start in

// The rest of the file contains one or more states and an optional
// @anywhere directive. Each of these is a curly bracket delimited set
// of state transitions. State transitions contain a selector, the
// literal "=>" and a (new_state, action) tuple. Examples:
//     0x0a => (Begin, PrintLine)
//     [0x00..0x1f] => (_, Warn)      // '_' means no change
//     [0x41..0x5a] => (BeginWord, _) // '_' means no action

// Rules common to all states. These take precedence over rules in the
// specific states.
@anywhere {
    0x0a         => (Begin, PrintLine)
    [0x00..0x1f] => (_, Warn)
}

Begin {
    [0x41..0x5a] => (Word, _)
    [0x61..0x7a] => (Word, _)
    // For missing values, the transition (_, _) is implied
}

Word {
    // The entry action is run when we transition to this state from a
    // *different* state. @anywhere can't have this
    @entry IncreaseWordCount
    0x09 => (Begin, _)
    0x20 => (Begin, _)

    // The exit action is run before we transition to any *other* state
    // from here. @anywhere can't have this
    @exit EndOfWord
}
```

The generated code consists of a single class which takes a
`Function<Action, u8>` as a parameter in its constructor. This gets
called whenever an action is to be done. This is because some input
might not produce an action, but others might produce up to 3 (exit,
state transition, entry). The actions allow us to build a more
advanced parser over the simple state machine.

The sole public method, `void advance(u8)`, handles the input
byte-by-byte, managing the state changes and requesting the appropriate
Action from the handler.

Internally, the state transitions are resolved via a lookup table. This
is a bit wasteful for more complex state machines, therefore the
generator is designed to be easily extendable with a switch-based
resolver; only the private `lookup_state_transition` method needs to be
re-implemented.

My goal for this tool is to use it for implementing a standard-compliant
ANSI escape sequence parser for LibVT, as described on
<https://vt100.net/emu/dec_ansi_parser>
Daniel Bertalan 4 年之前
父節點
當前提交
22195d965f

+ 1 - 0
CMakeLists.txt

@@ -103,6 +103,7 @@ include_directories(${CMAKE_BINARY_DIR})
 
 add_subdirectory(Meta/Lagom)
 add_subdirectory(Userland/DevTools/IPCCompiler)
+add_subdirectory(Userland/DevTools/StateMachineGenerator)
 add_subdirectory(Userland/Libraries/LibWeb/CodeGenerators)
 
 set(write_if_different ${CMAKE_SOURCE_DIR}/Meta/write-only-on-difference.sh)

+ 6 - 0
Userland/DevTools/StateMachineGenerator/CMakeLists.txt

@@ -0,0 +1,6 @@
+set(SOURCES
+    main.cpp
+)
+
+add_executable(StateMachineGenerator ${SOURCES})
+target_link_libraries(StateMachineGenerator LagomCore)

+ 495 - 0
Userland/DevTools/StateMachineGenerator/main.cpp

@@ -0,0 +1,495 @@
+/*
+ * Copyright (c) 2021, Daniel Bertalan <dani@danielbertalan.dev>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/GenericLexer.h>
+#include <AK/HashTable.h>
+#include <AK/NonnullOwnPtr.h>
+#include <AK/SourceGenerator.h>
+#include <AK/String.h>
+#include <AK/StringBuilder.h>
+#include <AK/Types.h>
+#include <LibCore/ArgsParser.h>
+#include <LibCore/File.h>
+#include <ctype.h>
+#include <string.h>
+
+struct Range {
+    int begin;
+    int end;
+};
+
+struct StateTransition {
+    Optional<String> new_state;
+    Optional<String> action;
+};
+
+struct MatchedAction {
+    Range range;
+    StateTransition action;
+};
+
+struct State {
+    String name;
+    Vector<MatchedAction> actions;
+    Optional<String> entry_action;
+    Optional<String> exit_action;
+};
+
+struct StateMachine {
+    String name;
+    String initial_state;
+    Vector<State> states;
+    Optional<State> anywhere;
+    Optional<String> namespaces;
+};
+
+static OwnPtr<StateMachine>
+parse_state_machine(StringView input)
+{
+    auto state_machine = make<StateMachine>();
+    GenericLexer lexer(input);
+
+    auto consume_whitespace = [&] {
+        bool consumed = true;
+        while (consumed) {
+            consumed = lexer.consume_while(isspace).length() > 0;
+            if (lexer.consume_specific("//")) {
+                lexer.consume_line();
+                consumed = true;
+            }
+        }
+    };
+
+    auto consume_identifier = [&] {
+        consume_whitespace();
+        return lexer.consume_while([](char c) { return isalnum(c) || c == '_'; });
+    };
+
+    auto get_hex_value = [&](char c) {
+        if (isdigit(c))
+            return c - '0';
+        else
+            return c - 'a' + 10;
+    };
+
+    auto consume_number = [&] {
+        int num = 0;
+        consume_whitespace();
+        if (lexer.consume_specific("0x")) {
+            auto hex_digits = lexer.consume_while([](char c) {
+                if (isdigit(c)) return true;
+            else {
+                c = tolower(c);
+                return (c >= 'a' && c <= 'f');
+              } });
+            for (auto c : hex_digits)
+                num = 16 * num + get_hex_value(c);
+        } else {
+            lexer.consume_specific('\'');
+            if (lexer.next_is('\\'))
+                num = (int)lexer.consume_escaped_character('\\');
+            else
+                num = lexer.consume_until('\'').to_int().value();
+            lexer.consume_specific('\'');
+        }
+        return num;
+    };
+
+    auto consume_condition = [&] {
+        Range condition;
+        consume_whitespace();
+        if (lexer.consume_specific('[')) {
+            consume_whitespace();
+            condition.begin = consume_number();
+            consume_whitespace();
+            lexer.consume_specific("..");
+            consume_whitespace();
+            condition.end = consume_number();
+            consume_whitespace();
+            lexer.consume_specific(']');
+        } else {
+            auto num = consume_number();
+            condition.begin = num;
+            condition.end = num;
+        }
+        return condition;
+    };
+
+    auto consume_action = [&]() {
+        StateTransition action;
+        consume_whitespace();
+        lexer.consume_specific("=>");
+        consume_whitespace();
+        lexer.consume_specific('(');
+        consume_whitespace();
+        if (!lexer.consume_specific("_"))
+            action.new_state = consume_identifier();
+        consume_whitespace();
+        lexer.consume_specific(',');
+        consume_whitespace();
+        if (!lexer.consume_specific("_"))
+            action.action = consume_identifier();
+        consume_whitespace();
+        lexer.consume_specific(')');
+        return action;
+    };
+
+    auto consume_state_description
+        = [&] {
+              State state;
+              consume_whitespace();
+              state.name = consume_identifier();
+              consume_whitespace();
+              consume_whitespace();
+              lexer.consume_specific('{');
+              for (;;) {
+                  consume_whitespace();
+                  if (lexer.consume_specific('}')) {
+                      break;
+                  }
+                  if (lexer.consume_specific("@entry")) {
+                      consume_whitespace();
+                      state.entry_action = consume_identifier();
+                  } else if (lexer.consume_specific("@exit")) {
+                      consume_whitespace();
+                      state.exit_action = consume_identifier();
+                  } else if (lexer.next_is('@')) {
+                      auto directive = consume_identifier().to_string();
+                      fprintf(stderr, "Unimplemented @ directive %s\n", directive.characters());
+                      exit(1);
+                  } else {
+                      MatchedAction matched_action;
+                      matched_action.range = consume_condition();
+                      matched_action.action = consume_action();
+                      state.actions.append(matched_action);
+                  }
+              }
+              return state;
+          };
+
+    while (!lexer.is_eof()) {
+        consume_whitespace();
+        if (lexer.is_eof())
+            break;
+        if (lexer.consume_specific("@namespace")) {
+            consume_whitespace();
+            state_machine->namespaces = lexer.consume_while([](char c) { return isalpha(c) || c == ':'; });
+        } else if (lexer.consume_specific("@begin")) {
+            consume_whitespace();
+            state_machine->initial_state = consume_identifier();
+        } else if (lexer.consume_specific("@name")) {
+            consume_whitespace();
+            state_machine->name = consume_identifier();
+        } else if (lexer.next_is("@anywhere")) {
+            lexer.consume_specific('@');
+            state_machine->anywhere = consume_state_description();
+        } else if (lexer.consume_specific('@')) {
+            auto directive = consume_identifier().to_string();
+            fprintf(stderr, "Unimplemented @ directive %s\n", directive.characters());
+            exit(1);
+        } else {
+            auto description = consume_state_description();
+            state_machine->states.append(description);
+        }
+    }
+
+    if (state_machine->initial_state.is_empty()) {
+        fprintf(stderr, "Missing @begin directive\n");
+        exit(1);
+    } else if (state_machine->name.is_empty()) {
+        fprintf(stderr, "Missing @name directive\n");
+        exit(1);
+    }
+
+    if (state_machine->anywhere.has_value()) {
+        state_machine->anywhere.value().name = "_Anywhere";
+    }
+    return state_machine;
+}
+
+void output_header(const StateMachine&, SourceGenerator&);
+void output_cpp(const StateMachine&, SourceGenerator&);
+
+int main(int argc, char** argv)
+{
+    Core::ArgsParser args_parser;
+    const char* path = nullptr;
+    bool header_mode = false;
+    args_parser.add_option(header_mode, "Generate .h file", "header", 'H');
+    args_parser.add_positional_argument(path, "Path to parser description", "input", Core::ArgsParser::Required::Yes);
+    args_parser.parse(argc, argv);
+
+    auto file_or_error = Core::File::open(path, Core::IODevice::ReadOnly);
+    if (file_or_error.is_error()) {
+        fprintf(stderr, "Cannot open %s\n", path);
+    }
+
+    auto content = file_or_error.value()->read_all();
+    auto state_machine = parse_state_machine(content);
+
+    StringBuilder builder;
+    SourceGenerator generator { builder };
+    if (header_mode)
+        output_header(*state_machine, generator);
+    else
+        output_cpp(*state_machine, generator);
+    outln("{}", generator.as_string_view());
+    return 0;
+}
+
+HashTable<String> actions(const StateMachine& machine)
+{
+    HashTable<String> table;
+
+    auto do_state = [&](const State& state) {
+        if (state.entry_action.has_value())
+            table.set(state.entry_action.value());
+        if (state.exit_action.has_value())
+            table.set(state.exit_action.value());
+        for (auto action : state.actions) {
+            if (action.action.action.has_value())
+                table.set(action.action.action.value());
+        }
+    };
+    for (auto state : machine.states) {
+        do_state(state);
+    }
+    if (machine.anywhere.has_value())
+        do_state(machine.anywhere.value());
+    return move(table);
+}
+
+void generate_lookup_table(const StateMachine& machine, SourceGenerator& generator)
+{
+    generator.append(R"~~~(
+    static constexpr StateTransition STATE_TRANSITION_TABLE[][256] = {
+)~~~");
+
+    auto generate_for_state = [&](const State& s) {
+        auto table_generator = generator.fork();
+        table_generator.set("active_state", s.name);
+        table_generator.append("/* @active_state@ */ { ");
+        VERIFY(!s.name.is_empty());
+        Vector<StateTransition> row;
+        for (int i = 0; i < 256; i++)
+            row.append({ s.name, "_Ignore" });
+        for (auto action : s.actions) {
+            for (int range_element = action.range.begin; range_element <= action.range.end; range_element++) {
+                row[range_element] = { action.action.new_state, action.action.action };
+            }
+        }
+        for (int i = 0; i < 256; ++i) {
+            auto cell_generator = table_generator.fork();
+            cell_generator.set("cell_new_state", row[i].new_state.value_or(s.name));
+            cell_generator.set("cell_action", row[i].action.value_or("_Ignore"));
+            cell_generator.append(" {State::@cell_new_state@, Action::@cell_action@}, ");
+        }
+        table_generator.append("},\n");
+    };
+    if (machine.anywhere.has_value()) {
+        generate_for_state(machine.anywhere.value());
+    }
+    for (auto s : machine.states) {
+        generate_for_state(s);
+    }
+    generator.append(R"~~~(
+    };
+)~~~");
+}
+
+void output_header(const StateMachine& machine, SourceGenerator& generator)
+{
+    generator.set("class_name", machine.name);
+    generator.set("initial_state", machine.initial_state);
+    generator.set("state_count", String::number(machine.states.size() + 1));
+
+    generator.append(R"~~~(
+#pragma once
+
+#include <AK/Function.h>
+#include <AK/Platform.h>
+#include <AK/Types.h>
+                     )~~~");
+    if (machine.namespaces.has_value()) {
+        generator.set("namespace", machine.namespaces.value());
+        generator.append(R"~~~(
+namespace @namespace@ {
+)~~~");
+    }
+    generator.append(R"~~~(
+class @class_name@ {
+public:
+    enum class Action : u8 {
+        _Ignore,
+)~~~");
+    for (auto a : actions(machine)) {
+        if (a.is_empty())
+            continue;
+        auto action_generator = generator.fork();
+        action_generator.set("action.name", a);
+        action_generator.append(R"~~~(
+        @action.name@,
+    )~~~");
+    }
+
+    generator.append(R"~~~(
+    }; // end Action
+
+    typedef Function<void(Action, u8)> Handler;
+
+    @class_name@(Handler);
+
+    void advance(u8);
+
+private:
+    enum class State : u8 {
+        _Anywhere,
+)~~~");
+
+    int largest_state_value = 0;
+    for (auto s : machine.states) {
+        auto state_generator = generator.fork();
+        state_generator.set("state.name", s.name);
+        largest_state_value++;
+        state_generator.append(R"~~~(
+        @state.name@,
+)~~~");
+    }
+    generator.append(R"~~~(
+    }; // end State
+
+    struct StateTransition {
+        State new_state;
+        Action action;
+    };
+
+    State m_state { State::@initial_state@ };
+
+    Handler m_handler;
+
+    StateTransition lookup_state_transition(u8);
+)~~~");
+
+    auto table_generator = generator.fork();
+    generate_lookup_table(machine, table_generator);
+    generator.append(R"~~~(
+}; // end @class_name@
+)~~~");
+
+    if (machine.namespaces.has_value()) {
+        generator.append(R"~~~(
+} // end namespace
+)~~~");
+    }
+}
+
+void output_cpp(const StateMachine& machine, SourceGenerator& generator)
+{
+    VERIFY(!machine.name.is_empty());
+    generator.set("class_name", machine.name);
+    generator.set("state_count", String::number(machine.states.size() + 1));
+
+    generator.append(R"~~~(
+#include "@class_name@.h"
+#include <AK/Function.h>
+#include <AK/Types.h>
+)~~~");
+    if (machine.namespaces.has_value()) {
+        generator.set("namespace", machine.namespaces.value());
+        generator.append(R"~~~(
+namespace @namespace@ {
+)~~~");
+    }
+    generator.append(R"~~~(
+@class_name@::@class_name@(Handler handler)
+    : m_handler(move(handler))
+{
+}
+
+ALWAYS_INLINE @class_name@::StateTransition @class_name@::lookup_state_transition(u8 byte)
+{
+    VERIFY((u8)m_state < @state_count@);
+    )~~~");
+    if (machine.anywhere.has_value()) {
+        generator.append(R"~~~(
+    auto anywhere_state = STATE_TRANSITION_TABLE[0][byte];
+    if (anywhere_state.new_state != @class_name@::State::_Anywhere || anywhere_state.action != @class_name@::Action::_Ignore)
+        return anywhere_state;
+    else
+)~~~");
+    }
+    generator.append(R"~~~(
+        return STATE_TRANSITION_TABLE[(u8)m_state][byte];
+}
+)~~~");
+
+    generator.append(R"~~~(
+
+void @class_name@::advance(u8 byte)
+{
+    auto next_state = lookup_state_transition(byte);
+    bool state_will_change = next_state.new_state != m_state && next_state.new_state != @class_name@::State::_Anywhere;
+
+    // only run exit directive if state is being changed
+    if (state_will_change)
+    {
+        switch (m_state)
+        {
+)~~~");
+    for (auto s : machine.states) {
+        auto state_generator = generator.fork();
+        if (s.exit_action.has_value()) {
+            state_generator.set("state_name", s.name);
+            state_generator.set("action", s.exit_action.value());
+            state_generator.append(R"~~~(
+        case @class_name@::State::@state_name@:
+            m_handler(Action::@action@, byte);
+            break;
+)~~~");
+        }
+    }
+    generator.append(R"~~~(
+        default:
+            break;
+        }
+    }
+
+    if (next_state.action != @class_name@::Action::_Ignore)
+        m_handler(next_state.action, byte);
+    m_state = next_state.new_state;
+
+    // only run entry directive if state is being changed
+    if (state_will_change)
+    {
+        switch (next_state.new_state)
+        {
+)~~~");
+    for (auto state : machine.states) {
+        auto state_generator = generator.fork();
+        if (state.entry_action.has_value()) {
+            state_generator.set("state_name", state.name);
+            state_generator.set("action", state.entry_action.value());
+            state_generator.append(R"~~~(
+        case @class_name@::State::@state_name@:
+            m_handler(Action::@action@, byte);
+            break;
+)~~~");
+        }
+    }
+    generator.append(R"~~~(
+        default:
+            break;
+        }
+    }
+}
+)~~~");
+
+    if (machine.namespaces.has_value()) {
+        generator.append(R"~~~(
+} // end namespace
+)~~~");
+    }
+}