/*
 * Copyright (c) 2023, Sam Atkins
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

// NOTE(review): the header names and template arguments in this file (Vector<Token>,
// Vector<VariableReference>, Optional<ControlKeywordType>, ScopeLogger<CMAKE_DEBUG>) were
// restored after being stripped from the text under review - confirm against Lexer.h / Token.h.
#include "Lexer.h"
#include <AK/CharacterTypes.h>
#include <AK/Debug.h>
#include <AK/Format.h>
#include <AK/ScopeLogger.h>

namespace CMake {

// Returns true if `c` may start a command identifier (ASCII letter or underscore).
static bool is_valid_identifier_initial_char(char c)
{
    return is_ascii_alpha(c) || c == '_';
}

// Returns true if `c` may appear inside a command identifier (ASCII letter, digit or underscore).
static bool is_valid_identifier_char(char c)
{
    return is_ascii_alphanumeric(c) || c == '_';
}

// Convenience entry point: lexes `input` with a temporary Lexer and returns its tokens.
ErrorOr<Vector<Token>> Lexer::lex(StringView input)
{
    Lexer lexer { input };
    return lexer.lex_file();
}

Lexer::Lexer(StringView input)
    : GenericLexer(input)
{
}

// Lexes the whole input. Anything at the top level that is not whitespace, a comment or the
// start of a command invocation is emitted as a Garbage token.
ErrorOr<Vector<Token>> Lexer::lex_file()
{
    m_tokens.clear_with_capacity();

    while (!is_eof()) {
        consume_whitespace_or_comments();

        if (is_eof())
            break;

        if (is_valid_identifier_initial_char(peek())) {
            consume_command_invocation();
        } else {
            consume_garbage();
        }
    }

    return m_tokens;
}

// Skips every kind of ASCII whitespace. Newlines go through next_line() so that line/column
// tracking stays correct.
void Lexer::skip_whitespace()
{
    while (!is_eof()) {
        if (next_is('\n')) {
            next_line();
            continue;
        }

        // This must cover everything is_ascii_space() accepts (including '\r', '\f' and '\v'):
        // consume_garbage() and consume_unquoted_argument() stop at any such character without
        // consuming it, so leaving one behind here would make their callers loop forever.
        if (!is_ascii_space(peek()))
            break;
        ignore();
    }
}

// Skips any run of whitespace and comments. Comments are still emitted as tokens by
// consume_comment().
void Lexer::consume_whitespace_or_comments()
{
    ScopeLogger<CMAKE_DEBUG> log;
    while (!is_eof()) {
        skip_whitespace();

        if (next_is('#')) {
            consume_comment();
        } else {
            break;
        }
    }
}

// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#command-invocations
// Lexes `identifier ( arguments )`. The identifier becomes a ControlKeyword token if
// control_keyword_from_string() recognizes it, and an Identifier token otherwise.
// Missing parentheses are tolerated: each one is only consumed if present.
void Lexer::consume_command_invocation()
{
    ScopeLogger<CMAKE_DEBUG> log;
    auto identifier_start = position();
    auto identifier = consume_while(is_valid_identifier_char);
    auto control_keyword = control_keyword_from_string(identifier);
    if (control_keyword.has_value()) {
        emit_token(Token::Type::ControlKeyword, identifier, identifier_start, position(), control_keyword.release_value());
    } else {
        emit_token(Token::Type::Identifier, identifier, identifier_start, position());
    }

    consume_whitespace_or_comments();

    if (next_is('('))
        consume_open_paren();

    consume_arguments();

    if (next_is(')'))
        consume_close_paren();
}

// Lexes arguments until an unmatched ')' (which is left for the caller) or the end of input.
// Nested parenthesized groups are handled by recursion and emit their own paren tokens.
void Lexer::consume_arguments()
{
    ScopeLogger<CMAKE_DEBUG> log;
    while (!is_eof()) {
        consume_whitespace_or_comments();

        if (next_is('(')) {
            consume_open_paren();

            consume_whitespace_or_comments();
            consume_arguments();
            consume_whitespace_or_comments();

            if (next_is(')'))
                consume_close_paren();

            continue;
        }

        if (next_is(')'))
            return;

        consume_argument();
    }
}

// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#command-arguments
// Dispatches on the first character: '[' -> bracket argument, '"' -> quoted argument,
// anything else -> unquoted argument.
void Lexer::consume_argument()
{
    ScopeLogger<CMAKE_DEBUG> log;
    consume_whitespace_or_comments();

    if (next_is('[')) {
        consume_bracket_argument();
        return;
    }

    if (next_is('"')) {
        consume_quoted_argument();
        return;
    }

    consume_unquoted_argument();
}

// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#bracket-argument
// Emits a BracketArgument token whose value is the text between the bracket delimiters.
void Lexer::consume_bracket_argument()
{
    ScopeLogger<CMAKE_DEBUG> log;
    auto start = position();
    auto value = read_bracket_argument();
    emit_token(Token::Type::BracketArgument, value, start, position());
}

// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#quoted-argument
// Emits a QuotedArgument token. The token value excludes the surrounding quotes; variable
// references are located relative to the opening quote.
void Lexer::consume_quoted_argument()
{
    ScopeLogger<CMAKE_DEBUG> log;
    auto start = position();
    auto start_offset = tell();

    VERIFY(consume_specific('"'));

    bool is_terminated = false;
    while (!is_eof()) {
        if (next_is('"')) {
            ignore();
            is_terminated = true;
            break;
        }

        if (next_is('\\')) {
            // A backslash escapes whatever follows it. Treating every escape alike (not just
            // `\"`) keeps `\\"` from being misread as an escaped quote. An escaped newline is
            // left for the next iteration so that it still goes through next_line().
            ignore();
            if (!is_eof() && !next_is('\n'))
                ignore();
            continue;
        }

        if (next_is('\n')) {
            next_line();
            continue;
        }

        ignore();
    }

    auto whole_token = m_input.substring_view(start_offset, tell() - start_offset);
    // Strip the opening quote, and the closing one only if we actually saw it. An unterminated
    // string may consist of just the opening quote, where subtracting 2 would underflow.
    auto quote_count = is_terminated ? 2u : 1u;
    auto value = whole_token.substring_view(1, whole_token.length() - quote_count);
    auto variable_references = parse_variable_references_from_argument(whole_token, start);
    emit_token(Token::Type::QuotedArgument, value, start, position(), {}, move(variable_references));
}

// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#unquoted-argument
// Emits an UnquotedArgument token covering everything up to the next whitespace or one of
// the characters `()#"`. Backslash escapes are kept verbatim in the token value.
void Lexer::consume_unquoted_argument()
{
    ScopeLogger<CMAKE_DEBUG> log;
    auto start_offset = tell();
    auto start = position();

    while (!is_eof()) {
        if (next_is('\\')) {
            // Skip the escape by hand: the unescaped character is not needed here, and this way
            // an escaped newline updates the line tracking and a trailing backslash at the end
            // of input is harmless.
            ignore();
            if (next_is('\n'))
                next_line();
            else
                ignore();
            continue;
        }

        // The grammar excludes whitespace and the characters ()#"\ from unquoted elements.
        // A single quote is an ordinary character in CMake and must not terminate the argument;
        // stopping on it without consuming it would also stall the caller's loop.
        auto consumed = consume_until([](char c) { return is_ascii_space(c) || "()#\"\\"sv.contains(c); });
        if (consumed.is_empty())
            break;

        // FIXME: `unquoted_legacy`
    }

    auto value = m_input.substring_view(start_offset, tell() - start_offset);
    auto variable_references = parse_variable_references_from_argument(value, start);
    emit_token(Token::Type::UnquotedArgument, value, start, position(), {}, move(variable_references));
}

// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#comments
// Emits either a BracketComment or a LineComment token. The token value excludes the leading
// '#' (and the bracket delimiters, for bracket comments).
void Lexer::consume_comment()
{
    ScopeLogger<CMAKE_DEBUG> log;
    auto start = position();
    VERIFY(consume_specific('#'));
    if (next_is('[')) {
        // Bracket comment
        // https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#bracket-comment
        auto comment = read_bracket_argument();
        emit_token(Token::Type::BracketComment, comment, start, position());
        return;
    }

    // Line comment
    // https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#line-comment
    auto comment = consume_until('\n');
    emit_token(Token::Type::LineComment, comment, start, position());
}

// Consumes a '(' (which must be next) and emits an OpenParen token.
void Lexer::consume_open_paren()
{
    auto start = position();
    VERIFY(consume_specific('('));
    emit_token(Token::Type::OpenParen, "("sv, start, position());
}

// Consumes a ')' (which must be next) and emits a CloseParen token.
void Lexer::consume_close_paren()
{
    auto start = position();
    VERIFY(consume_specific(')'));
    emit_token(Token::Type::CloseParen, ")"sv, start, position());
}

// Consumes everything up to the next whitespace and emits it as a Garbage token.
// Nothing is emitted if the lexer is already positioned at whitespace.
void Lexer::consume_garbage()
{
    ScopeLogger<CMAKE_DEBUG> log;
    auto start = position();
    auto contents = consume_until(is_ascii_space);
    if (!contents.is_empty())
        emit_token(Token::Type::Garbage, contents, start, position());
}

// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#bracket-argument
// Used by both bracket arguments and bracket comments.
// Expects to be positioned at the first '[' and returns the text between `[={len}[` and the
// matching `]={len}]`. If the closing delimiter is missing, everything up to the end of the
// input is returned.
StringView Lexer::read_bracket_argument()
{
    VERIFY(consume_specific('['));
    auto leading_equals_signs = consume_while([](char c) { return c == '='; });
    // FIXME: A '[' that is not followed by `={len}[` is not a bracket_open, but is still
    //        treated as one here (the result of this call is ignored).
    consume_specific('[');
    auto start = tell();

    while (!is_eof()) {
        // Bracket contents may span lines; route newlines through next_line() so that the
        // positions of this and all following tokens stay correct.
        if (next_is('\n')) {
            next_line();
            continue;
        }

        if (!next_is(']')) {
            ignore();
            continue;
        }

        // Candidate for the closing delimiter `]={len}]`.
        auto end = tell();
        ignore();

        // The number of '=' signs has to match the opening delimiter exactly; a plain `]]`
        // does not close `[=[`.
        if (!leading_equals_signs.is_empty()) {
            if (!next_is(leading_equals_signs))
                continue;
            ignore(leading_equals_signs.length());
        }

        if (consume_specific(']'))
            return m_input.substring_view(start, end - start);
    }

    // Unterminated: treat the rest of the input as the contents.
    return m_input.substring_view(start, tell() - start);
}

// https://cmake.org/cmake/help/latest/manual/cmake-language.7.html#variable-references
// Scans `argument_value` for `${NAME}` and `$ENV{NAME}` references and returns one entry per
// reference found, with start/end positions computed from `argument_start` (the position of
// the first character of `argument_value`). Escaped characters are skipped, and a reference
// is only recorded if its closing '}' appears before any space or newline.
Vector<VariableReference> Lexer::parse_variable_references_from_argument(StringView argument_value, Position argument_start)
{
    auto position = argument_start;
    GenericLexer lexer { argument_value };
    Vector<VariableReference> variable_references;

    while (!lexer.is_eof()) {
        if (lexer.next_is('\n')) {
            lexer.ignore();
            position.column = 0;
            position.line++;
            continue;
        }

        if (lexer.next_is('\\')) {
            lexer.ignore();
            if (lexer.next_is('\n')) {
                lexer.ignore();
                position.column = 0;
                position.line++;
                continue;
            }
            lexer.ignore();
            position.column += 2;
            // The escape is fully handled; start over so that the character after it gets the
            // normal treatment (it may be a newline, another escape, or a '$').
            continue;
        }

        if (lexer.next_is('$')) {
            auto start = position;
            lexer.ignore();
            position.column++;

            if (lexer.next_is("ENV{"sv)) {
                lexer.ignore(4);
                position.column += 4;
            } else if (lexer.next_is('{')) {
                lexer.ignore();
                position.column++;
            } else {
                // Not a reference; skip ahead to the next place one could start.
                auto skipped = lexer.consume_until(is_any_of("$ \n"sv));
                position.column += skipped.length();
                continue;
            }

            auto variable_name = lexer.consume_until(is_any_of("} \n"sv));
            position.column += variable_name.length();
            if (lexer.next_is('}')) {
                lexer.ignore();
                position.column++;
                variable_references.empend(variable_name, start, position);
            }

            continue;
        }

        lexer.ignore();
        position.column++;
    }

    return variable_references;
}

// Returns the current line, and the column as an offset from the start of that line.
Position Lexer::position() const
{
    return Position {
        .line = m_line,
        .column = tell() - m_string_offset_after_previous_newline,
    };
}

// Consumes a '\n' (which must be next) and updates the state used by position().
void Lexer::next_line()
{
    VERIFY(consume_specific('\n'));
    m_string_offset_after_previous_newline = tell();
    m_line++;
}

// Appends a token to the output, logging it first when CMAKE_DEBUG is enabled.
void Lexer::emit_token(Token::Type type, StringView value, Position start, Position end, Optional<ControlKeywordType> control_keyword, Vector<VariableReference> variable_references)
{
    dbgln_if(CMAKE_DEBUG, "Emitting {} token: `{}` ({}:{} to {}:{})", to_string(type), value, start.line, start.column, end.line, end.column);
    m_tokens.empend(type, value, start, end, move(control_keyword), move(variable_references));
}

}