
Shell: Add support for heredocs to the POSIX parser
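
This patch teaches the POSIX-mode lexer and parser to handle heredoc redirections (<< and <<-). As a rough sketch of what now parses (hypothetical driver code, not part of the commit; only the Parser API comes from Userland/Shell/PosixParser.h):

    // Minimal usage sketch (assumed Serenity userland context; not part of
    // this commit). The input exercises a dedenting, interpolating heredoc.
    #include <AK/StringView.h>
    #include <Shell/PosixParser.h>

    RefPtr<Shell::AST::Node> parse_example()
    {
        auto input = "cat <<-EOF\n\thello, $USER\n\tEOF\n"sv;
        Shell::Posix::Parser parser { input };
        return parser.parse(); // after this patch: an AST containing a Heredoc redirection
    }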

Ali Mohammad Pur, 2 years ago
parent commit
2881bb4c3a

+ 226 - 1
Userland/Shell/PosixLexer.cpp

@@ -23,8 +23,11 @@ static bool is_part_of_operator(StringView text, char ch)
 
 namespace Shell::Posix {
 
-Vector<Token> Lexer::batch_next()
+Vector<Token> Lexer::batch_next(Optional<Reduction> starting_reduction)
 {
+    if (starting_reduction.has_value())
+        m_next_reduction = *starting_reduction;
+
     for (; m_next_reduction != Reduction::None;) {
         auto result = reduce(m_next_reduction);
         m_next_reduction = result.next_reduction;
@@ -55,6 +58,18 @@ char Lexer::consume()
     return ch;
 }
 
+void Lexer::reconsume(StringView string)
+{
+    for (auto byte : string.bytes()) {
+        if (byte == '\n') {
+            m_state.position.end_line.line_number++;
+            m_state.position.end_line.line_column = 0;
+        } else {
+            m_state.position.end_line.line_column++;
+        }
+
+        m_state.position.end_offset++;
+    }
+}
+
 bool Lexer::consume_specific(char ch)
 {
     if (m_lexer.peek() == ch) {
@@ -95,6 +110,8 @@ Lexer::ReductionResult Lexer::reduce(Reduction reduction)
         return reduce_command_or_arithmetic_substitution_expansion();
     case Reduction::ExtendedParameterExpansion:
         return reduce_extended_parameter_expansion();
+    case Reduction::HeredocContents:
+        return reduce_heredoc_contents();
     }
 
     VERIFY_NOT_REACHED();
@@ -108,6 +125,91 @@ Lexer::ReductionResult Lexer::reduce_end()
     };
 }
 
+Lexer::HeredocKeyResult Lexer::process_heredoc_key(Token const& token)
+{
+    StringBuilder builder;
+    enum ParseState {
+        Free,
+        InDoubleQuotes,
+        InSingleQuotes,
+    };
+    Vector<ParseState, 4> parse_state;
+    parse_state.append(Free);
+    bool escaped = false;
+    bool had_a_single_quote_segment = false;
+
+    for (auto byte : token.value.bytes()) {
+        switch (parse_state.last()) {
+        case Free:
+            switch (byte) {
+            case '"':
+                if (escaped) {
+                    builder.append(byte);
+                    escaped = false;
+                } else {
+                    parse_state.append(InDoubleQuotes);
+                }
+                break;
+            case '\'':
+                if (escaped) {
+                    builder.append(byte);
+                    escaped = false;
+                } else {
+                    had_a_single_quote_segment = true;
+                    parse_state.append(InSingleQuotes);
+                }
+                break;
+            case '\\':
+                if (escaped) {
+                    builder.append(byte);
+                    escaped = false;
+                } else {
+                    escaped = true;
+                }
+                break;
+            default:
+                if (escaped) {
+                    builder.append('\\');
+                    escaped = false;
+                }
+                builder.append(byte);
+                break;
+            }
+            break;
+        case InDoubleQuotes:
+            if (!escaped && byte == '"') {
+                parse_state.take_last();
+                break;
+            }
+            if (escaped) {
+                if (byte != '"')
+                    builder.append('\\');
+                builder.append(byte);
+                break;
+            }
+            if (byte == '\\')
+                escaped = true;
+            else
+                builder.append(byte);
+            break;
+        case InSingleQuotes:
+            if (byte == '\'') {
+                parse_state.take_last();
+                break;
+            }
+            builder.append(byte);
+            break;
+        }
+    }
+
+    // NOTE: Not checking the final state as any garbage that even partially parses is allowed to be used as a key :/
+
+    return {
+        .key = builder.to_deprecated_string(),
+        .allow_interpolation = !had_a_single_quote_segment,
+    };
+}
+
 Lexer::ReductionResult Lexer::reduce_operator()
 {
     if (m_lexer.is_eof()) {
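
A note on process_heredoc_key above: it strips one level of quoting to recover the terminator key, and only a single-quoted segment turns interpolation off; a purely double-quoted key still leaves expansion enabled in this implementation. Illustrative expectations (my sketch, traced from the state machine, not test output from the patch):

    // Sketch of expected HeredocKeyResult values (illustrative):
    //   token value  EOF    -> { .key = "EOF", .allow_interpolation = true }
    //   token value  'EOF'  -> { .key = "EOF", .allow_interpolation = false }
    //   token value  "EOF"  -> { .key = "EOF", .allow_interpolation = true }
    auto [key, allow_interpolation] = Lexer::process_heredoc_key(token);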
@@ -142,8 +244,25 @@ Lexer::ReductionResult Lexer::reduce_operator()
         m_state.position.start_line = m_state.position.end_line;
     }
 
+    auto expect_heredoc_entry = !tokens.is_empty() && (tokens.last().type == Token::Type::DoubleLessDash || tokens.last().type == Token::Type::DoubleLess);
+
     auto result = reduce(Reduction::Start);
     tokens.extend(move(result.tokens));
+
+    while (expect_heredoc_entry && tokens.size() == 1) {
+        result = reduce(result.next_reduction);
+        tokens.extend(move(result.tokens));
+    }
+
+    if (expect_heredoc_entry && tokens.size() > 1) {
+        auto [key, interpolation] = process_heredoc_key(tokens[1]);
+        m_state.heredoc_entries.enqueue(HeredocEntry {
+            .key = key,
+            .allow_interpolation = interpolation,
+            .dedent = tokens[0].type == Token::Type::DoubleLessDash,
+        });
+    }
+
     return {
         .tokens = move(tokens),
         .next_reduction = result.next_reduction,
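
Because reduce_operator enqueues into a FIFO Queue, several heredocs opened on the same line are paired with their bodies in operator order. An illustrative input (not from the patch):

    // cat <<A <<B     -- reduce_operator enqueues entry "A", then "B"
    // first body
    // A
    // second body
    // B
    //
    // reduce_start (below) dequeues "A" for the first body and "B" for the
    // second, one entry per fresh line.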
@@ -160,6 +279,7 @@ Lexer::ReductionResult Lexer::reduce_comment()
     }
 
     if (consume() == '\n') {
+        m_state.on_new_line = true;
         return {
             .tokens = { Token::newline() },
             .next_reduction = Reduction::Start,
@@ -352,8 +472,64 @@ Lexer::ReductionResult Lexer::reduce_command_expansion()
     };
 }
 
+Lexer::ReductionResult Lexer::reduce_heredoc_contents()
+{
+    if (m_lexer.is_eof()) {
+        auto tokens = Token::maybe_from_state(m_state);
+        m_state.buffer.clear();
+        m_state.position.start_offset = m_state.position.end_offset;
+        m_state.position.start_line = m_state.position.end_line;
+
+        return {
+            .tokens = move(tokens),
+            .next_reduction = Reduction::End,
+        };
+    }
+
+    if (!m_state.escaping && consume_specific('\\')) {
+        m_state.escaping = true;
+        m_state.buffer.append('\\');
+        return {
+            .tokens = {},
+            .next_reduction = Reduction::HeredocContents,
+        };
+    }
+
+    if (!m_state.escaping && consume_specific('$')) {
+        m_state.buffer.append('$');
+        if (m_lexer.next_is("("))
+            m_state.expansions.empend(CommandExpansion { .command = StringBuilder {}, .range = range() });
+        else
+            m_state.expansions.empend(ParameterExpansion { .parameter = StringBuilder {}, .range = range() });
+
+        return {
+            .tokens = {},
+            .next_reduction = Reduction::Expansion,
+        };
+    }
+
+    if (!m_state.escaping && consume_specific('`')) {
+        m_state.buffer.append('`');
+        m_state.expansions.empend(CommandExpansion { .command = StringBuilder {}, .range = range() });
+        return {
+            .tokens = {},
+            .next_reduction = Reduction::CommandExpansion,
+        };
+    }
+
+    m_state.escaping = false;
+    m_state.buffer.append(consume());
+    return {
+        .tokens = {},
+        .next_reduction = Reduction::HeredocContents,
+    };
+}
+
 Lexer::ReductionResult Lexer::reduce_start()
 {
+    auto was_on_new_line = m_state.on_new_line;
+    m_state.on_new_line = false;
+
     if (m_lexer.is_eof()) {
         auto tokens = Token::maybe_from_state(m_state);
         m_state.buffer.clear();
@@ -366,6 +542,51 @@ Lexer::ReductionResult Lexer::reduce_start()
         };
     }
 
+    if (was_on_new_line && !m_state.heredoc_entries.is_empty()) {
+        auto const& entry = m_state.heredoc_entries.head();
+
+        auto start_index = m_lexer.tell();
+        Optional<size_t> end_index;
+
+        for (; !m_lexer.is_eof();) {
+            auto index = m_lexer.tell();
+            auto possible_end_index = m_lexer.tell();
+            if (m_lexer.consume_specific('\n')) {
+                if (entry.dedent)
+                    m_lexer.ignore_while(is_any_of("\t"sv));
+                if (m_lexer.consume_specific(entry.key.view())) {
+                    if (m_lexer.consume_specific('\n') || m_lexer.is_eof()) {
+                        end_index = possible_end_index;
+                        break;
+                    }
+                }
+            }
+            if (m_lexer.tell() == index)
+                m_lexer.ignore();
+        }
+
+        auto contents = m_lexer.input().substring_view(start_index, end_index.value_or(m_lexer.tell()) - start_index);
+        reconsume(contents);
+
+        m_state.buffer.clear();
+        m_state.buffer.append(contents);
+
+        auto token = Token::maybe_from_state(m_state).first();
+        token.relevant_heredoc_key = entry.key;
+        token.type = Token::Type::HeredocContents;
+
+        m_state.heredoc_entries.dequeue();
+
+        m_state.on_new_line = true;
+
+        m_state.buffer.clear();
+
+        return {
+            .tokens = { move(token) },
+            .next_reduction = Reduction::Start,
+        };
+    }
+
     if (m_state.escaping && consume_specific('\n')) {
         m_state.escaping = false;
 
@@ -391,6 +612,8 @@ Lexer::ReductionResult Lexer::reduce_start()
         auto tokens = Token::maybe_from_state(m_state);
         tokens.append(Token::newline());
 
+        m_state.on_new_line = true;
+
         m_state.buffer.clear();
         m_state.position.start_offset = m_state.position.end_offset;
         m_state.position.start_line = m_state.position.end_line;
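
With the on_new_line bookkeeping in place, the scan in reduce_start captures everything between the newline that follows the redirection line and the line holding the key, and emits it as a single HeredocContents token. Roughly (my sketch of the resulting stream):

    // Input:  "cat <<EOF\nhi there\nEOF\n"   (illustrative)
    // Tokens: Token("cat"), DoubleLess, Token("EOF"), Newline,
    //         HeredocContents("hi there\n"), ...
    // The terminator line itself produces no token; the token's
    // relevant_heredoc_key ("EOF") records which entry it satisfies.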
@@ -678,6 +901,8 @@ StringView Token::type_name() const
         return "Clobber"sv;
     case Type::Semicolon:
         return "Semicolon"sv;
+    case Type::HeredocContents:
+        return "HeredocContents"sv;
     case Type::AssignmentWord:
         return "AssignmentWord"sv;
     case Type::Bang:

+ 24 - 1
Userland/Shell/PosixLexer.h

@@ -8,6 +8,7 @@
 
 #include <AK/DeprecatedString.h>
 #include <AK/GenericLexer.h>
+#include <AK/Queue.h>
 #include <AK/Variant.h>
 #include <AK/Vector.h>
 #include <Shell/AST.h>
@@ -29,6 +30,9 @@ enum class Reduction {
     ParameterExpansion,
     CommandOrArithmeticSubstitutionExpansion,
     ExtendedParameterExpansion,
+
+    // Separate rule, not used by the main flow.
+    HeredocContents,
 };
 
 struct ExpansionRange {
@@ -177,6 +181,12 @@ struct ResolvedCommandExpansion {
 
 using ResolvedExpansion = Variant<ResolvedParameterExpansion, ResolvedCommandExpansion>;
 
+struct HeredocEntry {
+    DeprecatedString key;
+    bool allow_interpolation;
+    bool dedent;
+};
+
 struct State {
     StringBuilder buffer {};
     Reduction previous_reduction { Reduction::Start };
@@ -194,6 +204,8 @@ struct State {
         },
     };
     Vector<Expansion> expansions {};
+    Queue<HeredocEntry> heredoc_entries {};
+    bool on_new_line { true };
 };
 
 struct Token {
@@ -219,6 +231,7 @@ struct Token {
         DoubleLessDash,
         Clobber,
         Semicolon,
+        HeredocContents,
 
         // Not produced by this lexer, but generated in later stages.
         AssignmentWord,
@@ -249,6 +262,7 @@ struct Token {
     Vector<Expansion> expansions;
     Vector<ResolvedExpansion> resolved_expansions {};
     StringView original_text;
+    Optional<DeprecatedString> relevant_heredoc_key {};
     bool could_be_start_of_a_simple_command { false };
 
     static Vector<Token> maybe_from_state(State const& state)
@@ -378,7 +392,14 @@ public:
     {
     }
 
-    Vector<Token> batch_next();
+    Vector<Token> batch_next(Optional<Reduction> starting_reduction = {});
+
+    struct HeredocKeyResult {
+        DeprecatedString key;
+        bool allow_interpolation;
+    };
+
+    static HeredocKeyResult process_heredoc_key(Token const&);
 
 private:
     struct ReductionResult {
@@ -400,9 +421,11 @@ private:
     ReductionResult reduce_parameter_expansion();
     ReductionResult reduce_command_or_arithmetic_substitution_expansion();
     ReductionResult reduce_extended_parameter_expansion();
+    ReductionResult reduce_heredoc_contents();
 
     char consume();
     bool consume_specific(char);
+    void reconsume(StringView);
     ExpansionRange range(ssize_t offset = 0) const;
 
     GenericLexer m_lexer;
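
The lexer-side state added by this patch is small: a FIFO of pending HeredocEntry records plus the on_new_line flag that tells reduce_start it may be looking at the first line of a body. For a redirection like <<-'KEY', the queued entry would be (illustrative values):

    // Entry enqueued for: cat <<-'KEY'
    HeredocEntry entry {
        .key = "KEY",
        .allow_interpolation = false, // single-quoted key disables expansion
        .dedent = true,               // <<- permits tab-indented terminators
    };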

+ 73 - 12
Userland/Shell/PosixParser.cpp

@@ -9,6 +9,11 @@
 #include <AK/StringUtils.h>
 #include <Shell/PosixParser.h>
 
+static Shell::AST::Position empty_position()
+{
+    return { 0, 0, { 0, 0 }, { 0, 0 } };
+}
+
 template<typename T, typename... Ts>
 static inline bool is_one_of(T const& value, Ts const&... values)
 {
@@ -22,7 +27,8 @@ static inline bool is_io_operator(Shell::Posix::Token const& token)
         Token::Type::Less, Token::Type::Great,
         Token::Type::LessAnd, Token::Type::GreatAnd,
         Token::Type::DoubleLess, Token::Type::DoubleGreat,
-        Token::Type::LessGreat, Token::Type::Clobber);
+        Token::Type::DoubleLessDash, Token::Type::LessGreat,
+        Token::Type::Clobber);
 }
 
 static inline bool is_separator(Shell::Posix::Token const& token)
@@ -95,10 +101,10 @@ static inline bool is_valid_name(StringView word)
 }
 
 namespace Shell::Posix {
-void Parser::fill_token_buffer()
+void Parser::fill_token_buffer(Optional<Reduction> starting_reduction)
 {
     for (;;) {
-        auto token = next_expanded_token();
+        auto token = next_expanded_token(starting_reduction);
         if (!token.has_value())
             break;
 #if SHELL_POSIX_PARSER_DEBUG
@@ -126,10 +132,36 @@ RefPtr<AST::Node> Parser::parse()
     return parse_complete_command();
 }
 
-Optional<Token> Parser::next_expanded_token()
+void Parser::handle_heredoc_contents()
+{
+    while (!eof() && m_token_buffer[m_token_index].type == Token::Type::HeredocContents) {
+        auto& token = m_token_buffer[m_token_index++];
+        auto entry = m_unprocessed_heredoc_entries.get(token.relevant_heredoc_key.value());
+        if (!entry.has_value()) {
+            error(token, "Discarding unexpected heredoc contents for key '{}'", *token.relevant_heredoc_key);
+            continue;
+        }
+
+        auto& heredoc = **entry;
+
+        RefPtr<AST::Node> contents;
+        if (heredoc.allow_interpolation()) {
+            Parser parser { token.value, m_in_interactive_mode, Reduction::HeredocContents };
+            contents = parser.parse_word();
+        } else {
+            contents = make_ref_counted<AST::StringLiteral>(token.position.value_or(empty_position()), token.value, AST::StringLiteral::EnclosureType::None);
+        }
+
+        if (contents)
+            heredoc.set_contents(contents);
+        m_unprocessed_heredoc_entries.remove(*token.relevant_heredoc_key);
+    }
+}
+
+Optional<Token> Parser::next_expanded_token(Optional<Reduction> starting_reduction)
 {
     while (m_token_buffer.find_if([](auto& token) { return token.type == Token::Type::Eof; }).is_end()) {
-        auto tokens = m_lexer.batch_next();
+        auto tokens = m_lexer.batch_next(starting_reduction);
         auto expanded = perform_expansions(move(tokens));
         m_token_buffer.extend(expanded);
     }
@@ -589,11 +621,6 @@ Vector<Token> Parser::perform_expansions(Vector<Token> tokens)
     return tokens;
 }
 
-static AST::Position empty_position()
-{
-    return { 0, 0, { 0, 0 }, { 0, 0 } };
-}
-
 RefPtr<AST::Node> Parser::parse_complete_command()
 {
     auto list = [&] {
@@ -1835,13 +1862,47 @@ RefPtr<AST::Node> Parser::parse_io_redirect()
     if (auto io_file = parse_io_file(start_position, io_number))
         return io_file;
 
-    // if (auto io_here = parse_io_here(start_position, io_number))
-    //     return io_here;
+    if (auto io_here = parse_io_here(start_position, io_number))
+        return io_here;
 
     m_token_index = start_index;
     return nullptr;
 }
 
+RefPtr<AST::Node> Parser::parse_io_here(AST::Position start_position, Optional<int> fd)
+{
+    // io_here: IO_NUMBER? (DLESS | DLESSDASH) WORD
+    auto io_operator = peek().type;
+    if (!is_one_of(io_operator, Token::Type::DoubleLess, Token::Type::DoubleLessDash))
+        return nullptr;
+
+    auto io_operator_token = consume();
+
+    auto redirection_fd = fd.value_or(0);
+
+    auto end_keyword = consume();
+    if (!is_one_of(end_keyword.type, Token::Type::Word, Token::Type::Token))
+        return make_ref_counted<AST::SyntaxError>(io_operator_token.position.value_or(start_position), "Expected a heredoc keyword", true);
+
+    auto [end_keyword_text, allow_interpolation] = Lexer::process_heredoc_key(end_keyword);
+    RefPtr<AST::SyntaxError> error;
+
+    auto position = start_position.with_end(peek().position.value_or(empty_position()));
+    auto result = make_ref_counted<AST::Heredoc>(
+        position,
+        end_keyword_text,
+        allow_interpolation,
+        io_operator == Token::Type::DoubleLessDash,
+        Optional<int> { redirection_fd });
+
+    m_unprocessed_heredoc_entries.set(end_keyword_text, result);
+
+    if (error)
+        result->set_is_syntax_error(*error);
+
+    return result;
+}
+
 RefPtr<AST::Node> Parser::parse_io_file(AST::Position start_position, Optional<int> fd)
 {
     auto start_index = m_token_index;
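
parse_io_here above implements the grammar rule io_here: IO_NUMBER? (DLESS | DLESSDASH) WORD, defaulting the file descriptor to 0 and registering the new AST::Heredoc under its key so the body can be filled in later. A couple of illustrative forms it now accepts:

    // cat <<EOF       -- fd defaults to 0 (stdin), interpolating
    // 3<<-'KEY'       -- explicit fd 3, dedenting, no interpolation
    // Each call stores its node in m_unprocessed_heredoc_entries under the
    // key until the matching HeredocContents token arrives.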

+ 9 - 5
Userland/Shell/PosixParser.h

@@ -13,12 +13,12 @@ namespace Shell::Posix {
 
 class Parser {
 public:
-    Parser(StringView input, bool interactive = false)
+    Parser(StringView input, bool interactive = false, Optional<Reduction> starting_reduction = {})
         : m_lexer(input)
         , m_in_interactive_mode(interactive)
         , m_eof_token(Token::eof())
     {
-        fill_token_buffer();
+        fill_token_buffer(starting_reduction);
     }
 
     RefPtr<AST::Node> parse();
@@ -31,20 +31,23 @@ public:
     auto& errors() const { return m_errors; }
 
 private:
-    Optional<Token> next_expanded_token();
+    Optional<Token> next_expanded_token(Optional<Reduction> starting_reduction = {});
     Vector<Token> perform_expansions(Vector<Token> tokens);
-    void fill_token_buffer();
+    void fill_token_buffer(Optional<Reduction> starting_reduction = {});
+    void handle_heredoc_contents();
 
-    Token const& peek() const
+    Token const& peek()
     {
         if (eof())
             return m_eof_token;
+        handle_heredoc_contents();
         return m_token_buffer[m_token_index];
     }
     Token const& consume()
     {
         if (eof())
             return m_eof_token;
+        handle_heredoc_contents();
         return m_token_buffer[m_token_index++];
     }
     void skip()
@@ -108,6 +111,7 @@ private:
     Vector<Token> m_previous_token_buffer;
 
     Vector<Error> m_errors;
+    HashMap<DeprecatedString, NonnullRefPtr<AST::Heredoc>> m_unprocessed_heredoc_entries;
 
     Token m_eof_token;
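
End to end: parse_io_here records a pending AST::Heredoc, reduce_start captures the body into a HeredocContents token at the next fresh line, and handle_heredoc_contents joins the two. One detail worth keeping in mind, sketched from the terminator scan in reduce_start (illustrative input; the literal tabs matter):

    // With <<- the closing key may be tab-indented, since the scan calls
    // m_lexer.ignore_while(is_any_of("\t"sv)) before matching the key:
    //
    //   cat <<-END
    //   <TAB>indented body
    //   <TAB>END        -- accepted; with plain << this would not terminate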