123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371 |
- /*
- * Copyright (c) 2023, Sam Atkins <atkinssj@serenityos.org>
- *
- * SPDX-License-Identifier: BSD-2-Clause
- */
- #include "Lexer.h"
- #include <AK/CharacterTypes.h>
- #include <AK/Debug.h>
- #include <AK/Format.h>
- #include <AK/ScopeLogger.h>
- namespace CMake {
- static bool is_valid_identifier_initial_char(char c)
- {
- return is_ascii_alpha(c) || c == '_';
- }
- static bool is_valid_identifier_char(char c)
- {
- return is_ascii_alphanumeric(c) || c == '_';
- }
- ErrorOr<Vector<Token>> Lexer::lex(StringView input)
- {
- Lexer lexer { input };
- return lexer.lex_file();
- }
- Lexer::Lexer(StringView input)
- : GenericLexer(input)
- {
- }
- ErrorOr<Vector<Token>> Lexer::lex_file()
- {
- m_tokens.clear_with_capacity();
- while (!is_eof()) {
- consume_whitespace_or_comments();
- if (is_eof())
- break;
- if (is_valid_identifier_initial_char(peek())) {
- consume_command_invocation();
- } else {
- consume_garbage();
- }
- }
- return m_tokens;
- }
- void Lexer::skip_whitespace()
- {
- while (!is_eof()) {
- if (next_is('\n')) {
- next_line();
- continue;
- }
- auto consumed = consume_while([&](char c) {
- return c == ' ' || c == '\t';
- });
- if (consumed.is_empty())
- break;
- }
- }
- void Lexer::consume_whitespace_or_comments()
- {
- ScopeLogger<CMAKE_DEBUG> log;
- while (!is_eof()) {
- skip_whitespace();
- if (next_is('#')) {
- consume_comment();
- } else {
- break;
- }
- }
- }
- // https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#command-invocations
- void Lexer::consume_command_invocation()
- {
- ScopeLogger<CMAKE_DEBUG> log;
- auto identifier_start = position();
- auto identifier = consume_while(is_valid_identifier_char);
- auto control_keyword = control_keyword_from_string(identifier);
- if (control_keyword.has_value()) {
- emit_token(Token::Type::ControlKeyword, identifier, identifier_start, position(), control_keyword.release_value());
- } else {
- emit_token(Token::Type::Identifier, identifier, identifier_start, position());
- }
- consume_whitespace_or_comments();
- if (next_is('('))
- consume_open_paren();
- consume_arguments();
- if (next_is(')'))
- consume_close_paren();
- }
- void Lexer::consume_arguments()
- {
- ScopeLogger<CMAKE_DEBUG> log;
- while (!is_eof()) {
- consume_whitespace_or_comments();
- if (next_is('(')) {
- consume_open_paren();
- consume_whitespace_or_comments();
- consume_arguments();
- consume_whitespace_or_comments();
- if (next_is(')'))
- consume_close_paren();
- continue;
- }
- if (next_is(')'))
- return;
- consume_argument();
- }
- }
- // https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#command-arguments
- void Lexer::consume_argument()
- {
- ScopeLogger<CMAKE_DEBUG> log;
- consume_whitespace_or_comments();
- if (next_is('[')) {
- consume_bracket_argument();
- return;
- }
- if (next_is('"')) {
- consume_quoted_argument();
- return;
- }
- consume_unquoted_argument();
- }
- // https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#bracket-argument
- void Lexer::consume_bracket_argument()
- {
- ScopeLogger<CMAKE_DEBUG> log;
- auto start = position();
- auto value = read_bracket_argument();
- emit_token(Token::Type::BracketArgument, value, start, position());
- }
- // https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#quoted-argument
- void Lexer::consume_quoted_argument()
- {
- ScopeLogger<CMAKE_DEBUG> log;
- auto start = position();
- auto start_offset = tell();
- VERIFY(consume_specific('"'));
- while (!is_eof()) {
- if (next_is('"')) {
- ignore();
- break;
- }
- if (next_is("\\\""sv)) {
- ignore(2);
- continue;
- }
- if (next_is('\n')) {
- next_line();
- continue;
- }
- ignore();
- }
- auto whole_token = m_input.substring_view(start_offset, tell() - start_offset);
- auto value = whole_token.substring_view(1, whole_token.length() - 2);
- auto variable_references = parse_variable_references_from_argument(whole_token, start);
- emit_token(Token::Type::QuotedArgument, value, start, position(), {}, move(variable_references));
- }
- // https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#unquoted-argument
- void Lexer::consume_unquoted_argument()
- {
- ScopeLogger<CMAKE_DEBUG> log;
- auto start_offset = tell();
- auto start = position();
- while (!is_eof()) {
- if (next_is('\\')) {
- consume_escaped_character('\\');
- continue;
- }
- auto consumed = consume_until([](char c) { return is_ascii_space(c) || "()#\"\\'"sv.contains(c); });
- if (consumed.is_empty())
- break;
- // FIXME: `unquoted_legacy`
- }
- auto value = m_input.substring_view(start_offset, tell() - start_offset);
- auto variable_references = parse_variable_references_from_argument(value, start);
- emit_token(Token::Type::UnquotedArgument, value, start, position(), {}, move(variable_references));
- }
- // https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#comments
- void Lexer::consume_comment()
- {
- ScopeLogger<CMAKE_DEBUG> log;
- auto start = position();
- VERIFY(consume_specific('#'));
- if (next_is('[')) {
- // Bracket comment
- // https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#bracket-comment
- auto comment = read_bracket_argument();
- emit_token(Token::Type::BracketComment, comment, start, position());
- return;
- }
- // Line comment
- // https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#line-comment
- auto comment = consume_until('\n');
- emit_token(Token::Type::LineComment, comment, start, position());
- }
- void Lexer::consume_open_paren()
- {
- auto start = position();
- VERIFY(consume_specific('('));
- emit_token(Token::Type::OpenParen, "("sv, start, position());
- }
- void Lexer::consume_close_paren()
- {
- auto start = position();
- VERIFY(consume_specific(')'));
- emit_token(Token::Type::CloseParen, ")"sv, start, position());
- }
- void Lexer::consume_garbage()
- {
- ScopeLogger<CMAKE_DEBUG> log;
- auto start = position();
- auto contents = consume_until(is_ascii_space);
- if (!contents.is_empty())
- emit_token(Token::Type::Garbage, contents, start, position());
- }
- // https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#bracket-argument
- // Used by both bracket arguments and bracket comments.
- StringView Lexer::read_bracket_argument()
- {
- VERIFY(consume_specific('['));
- auto leading_equals_signs = consume_while([](char c) { return c == '='; });
- consume_specific('[');
- auto start = tell();
- auto end = start;
- while (!is_eof()) {
- // Read everything until we see `]={len}]`.
- ignore_until(']');
- end = tell();
- ignore();
- if (next_is(leading_equals_signs))
- ignore(leading_equals_signs.length());
- if (consume_specific(']'))
- break;
- }
- return m_input.substring_view(start, end - start);
- }
- // https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#variable-references
- Vector<VariableReference> Lexer::parse_variable_references_from_argument(StringView argument_value, Position argument_start)
- {
- auto position = argument_start;
- GenericLexer lexer { argument_value };
- Vector<VariableReference> variable_references;
- while (!lexer.is_eof()) {
- if (lexer.next_is('\n')) {
- lexer.ignore();
- position.column = 0;
- position.line++;
- continue;
- }
- if (lexer.next_is('\\')) {
- lexer.ignore();
- if (lexer.next_is('\n')) {
- lexer.ignore();
- position.column = 0;
- position.line++;
- continue;
- }
- lexer.ignore();
- position.column += 2;
- }
- if (lexer.next_is('$')) {
- auto start = position;
- lexer.ignore();
- position.column++;
- if (lexer.next_is("ENV{"sv)) {
- lexer.ignore(4);
- position.column += 4;
- } else if (lexer.next_is('{')) {
- lexer.ignore();
- position.column++;
- } else {
- auto skipped = lexer.consume_until(is_any_of("$ \n"sv));
- position.column += skipped.length();
- continue;
- }
- auto variable_name = lexer.consume_until(is_any_of("} \n"sv));
- position.column += variable_name.length();
- if (lexer.next_is('}')) {
- lexer.ignore();
- position.column++;
- variable_references.empend(variable_name, start, position);
- }
- continue;
- }
- lexer.ignore();
- position.column++;
- }
- return variable_references;
- }
- Position Lexer::position() const
- {
- return Position {
- .line = m_line,
- .column = tell() - m_string_offset_after_previous_newline,
- };
- }
- void Lexer::next_line()
- {
- VERIFY(consume_specific('\n'));
- m_string_offset_after_previous_newline = tell();
- m_line++;
- }
- void Lexer::emit_token(Token::Type type, StringView value, Position start, Position end, Optional<ControlKeywordType> control_keyword, Vector<VariableReference> variable_references)
- {
- dbgln_if(CMAKE_DEBUG, "Emitting {} token: `{}` ({}:{} to {}:{})", to_string(type), value, start.line, start.column, end.line, end.column);
- m_tokens.empend(type, value, start, end, move(control_keyword), move(variable_references));
- }
- }
|