author     Andreas Kling <kling@serenityos.org>    2021-01-12 12:17:30 +0100
committer  Andreas Kling <kling@serenityos.org>    2021-01-12 12:17:46 +0100
commit     13d7c09125f8eec703d0a43a9a87fc8aa08f7319 (patch)
tree       70fd643c429cea5c1f9362c2674511d17a53f3b5 /Userland/Libraries/LibCpp/Lexer.cpp
parent     dc28c07fa526841e05e16161c74a6c23984f1dd5 (diff)
download   serenity-13d7c09125f8eec703d0a43a9a87fc8aa08f7319.zip
Libraries: Move to Userland/Libraries/
Diffstat (limited to 'Userland/Libraries/LibCpp/Lexer.cpp')
-rw-r--r--  Userland/Libraries/LibCpp/Lexer.cpp | 789
1 file changed, 789 insertions(+), 0 deletions(-)
diff --git a/Userland/Libraries/LibCpp/Lexer.cpp b/Userland/Libraries/LibCpp/Lexer.cpp
new file mode 100644
index 0000000000..831822e5a5
--- /dev/null
+++ b/Userland/Libraries/LibCpp/Lexer.cpp
@@ -0,0 +1,789 @@
+/*
+ * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "Lexer.h"
+#include <AK/HashTable.h>
+#include <AK/StdLibExtras.h>
+#include <AK/String.h>
+#include <ctype.h>
+
+namespace Cpp {
+
+Lexer::Lexer(const StringView& input)
+ : m_input(input)
+{
+}
+
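+// Returns the character at the given offset from the current index, or 0 once we are past the end of the input.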
+char Lexer::peek(size_t offset) const
+{
+ if ((m_index + offset) >= m_input.length())
+ return 0;
+ return m_input[m_index + offset];
+}
+
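+// Consumes and returns the current character, updating the line/column bookkeeping used for token positions.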
+char Lexer::consume()
+{
+ ASSERT(m_index < m_input.length());
+ char ch = m_input[m_index++];
+ m_previous_position = m_position;
+ if (ch == '\n') {
+ m_position.line++;
+ m_position.column = 0;
+ } else {
+ m_position.column++;
+ }
+ return ch;
+}
+
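+// An identifier may start with a letter, '_' or '$'; subsequent characters may additionally be digits.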
+static bool is_valid_first_character_of_identifier(char ch)
+{
+ return isalpha(ch) || ch == '_' || ch == '$';
+}
+
+static bool is_valid_nonfirst_character_of_identifier(char ch)
+{
+ return is_valid_first_character_of_identifier(ch) || isdigit(ch);
+}
+
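+// C++ keywords (including alternative operator spellings such as "and" and "bitor") emitted as Token::Type::Keyword.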
+constexpr const char* s_known_keywords[] = {
+ "alignas",
+ "alignof",
+ "and",
+ "and_eq",
+ "asm",
+ "bitand",
+ "bitor",
+ "bool",
+ "break",
+ "case",
+ "catch",
+ "class",
+ "compl",
+ "const",
+ "const_cast",
+ "constexpr",
+ "continue",
+ "decltype",
+ "default",
+ "delete",
+ "do",
+ "dynamic_cast",
+ "else",
+ "enum",
+ "explicit",
+ "export",
+ "extern",
+ "false",
+ "final",
+ "for",
+ "friend",
+ "goto",
+ "if",
+ "inline",
+ "mutable",
+ "namespace",
+ "new",
+ "noexcept",
+ "not",
+ "not_eq",
+ "nullptr",
+ "operator",
+ "or",
+ "or_eq",
+ "override",
+ "private",
+ "protected",
+ "public",
+ "register",
+ "reinterpret_cast",
+ "return",
+ "signed",
+ "sizeof",
+ "static",
+ "static_assert",
+ "static_cast",
+ "struct",
+ "switch",
+ "template",
+ "this",
+ "thread_local",
+ "throw",
+ "true",
+ "try",
+ "typedef",
+ "typeid",
+ "typename",
+ "union",
+ "using",
+ "virtual",
+ "volatile",
+ "while",
+ "xor",
+ "xor_eq"
+};
+
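+// Built-in types, the fixed-width integer aliases (i8..u64), and common AK container names emitted as Token::Type::KnownType.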
+constexpr const char* s_known_types[] = {
+ "ByteBuffer",
+ "CircularDeque",
+ "CircularQueue",
+ "Deque",
+ "DoublyLinkedList",
+ "FileSystemPath",
+ "Array",
+ "Function",
+ "HashMap",
+ "HashTable",
+ "IPv4Address",
+ "InlineLinkedList",
+ "IntrusiveList",
+ "JsonArray",
+ "JsonObject",
+ "JsonValue",
+ "MappedFile",
+ "NetworkOrdered",
+ "NonnullOwnPtr",
+ "NonnullOwnPtrVector",
+ "NonnullRefPtr",
+ "NonnullRefPtrVector",
+ "Optional",
+ "OwnPtr",
+ "RefPtr",
+ "Result",
+ "ScopeGuard",
+ "SinglyLinkedList",
+ "String",
+ "StringBuilder",
+ "StringImpl",
+ "StringView",
+ "Utf8View",
+ "Vector",
+ "WeakPtr",
+ "auto",
+ "char",
+ "char16_t",
+ "char32_t",
+ "char8_t",
+ "double",
+ "float",
+ "i16",
+ "i32",
+ "i64",
+ "i8",
+ "int",
+ "long",
+ "short",
+ "signed",
+ "u16",
+ "u32",
+ "u64",
+ "u8",
+ "unsigned",
+ "void",
+ "wchar_t"
+};
+
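+// The keyword and type tables are materialized into function-local static HashTables on first lookup.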
+static bool is_keyword(const StringView& string)
+{
+ static HashTable<String> keywords(array_size(s_known_keywords));
+ if (keywords.is_empty()) {
+ keywords.set_from(s_known_keywords);
+ }
+ return keywords.contains(string);
+}
+
+static bool is_known_type(const StringView& string)
+{
+ static HashTable<String> types(array_size(s_known_types));
+ if (types.is_empty()) {
+ types.set_from(s_known_types);
+ }
+ return types.contains(string);
+}
+
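+// Tokenizes the entire input, emitting tokens for whitespace and comments as well as for code.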
+Vector<Token> Lexer::lex()
+{
+ Vector<Token> tokens;
+
+ size_t token_start_index = 0;
+ Position token_start_position;
+
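+    // emit_token: emits a single-character token at the current position and consumes that character.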
+ auto emit_token = [&](auto type) {
+ Token token;
+ token.m_type = type;
+ token.m_start = m_position;
+ token.m_end = m_position;
+ tokens.append(token);
+ consume();
+ };
+
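+    // begin_token/commit_token bracket multi-character tokens, recording their start and end source positions.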
+ auto begin_token = [&] {
+ token_start_index = m_index;
+ token_start_position = m_position;
+ };
+ auto commit_token = [&](auto type) {
+ Token token;
+ token.m_type = type;
+ token.m_start = token_start_position;
+ token.m_end = m_previous_position;
+ tokens.append(token);
+ };
+
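+    // Helper for operators that also have a compound-assignment form, e.g. '*' vs. '*='.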
+ auto emit_token_equals = [&](auto type, auto equals_type) {
+ if (peek(1) == '=') {
+ begin_token();
+ consume();
+ consume();
+ commit_token(equals_type);
+ return;
+ }
+ emit_token(type);
+ };
+
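+    // Returns the length of the escape sequence starting at the current backslash (simple, octal, hex, or universal character name), or 0 if it is not a recognized escape.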
+ auto match_escape_sequence = [&]() -> size_t {
+ switch (peek(1)) {
+ case '\'':
+ case '"':
+ case '?':
+ case '\\':
+ case 'a':
+ case 'b':
+ case 'f':
+ case 'n':
+ case 'r':
+ case 't':
+ case 'v':
+ return 2;
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7': {
+ size_t octal_digits = 1;
+ for (size_t i = 0; i < 2; ++i) {
+ char next = peek(2 + i);
+ if (next < '0' || next > '7')
+ break;
+ ++octal_digits;
+ }
+ return 1 + octal_digits;
+ }
+ case 'x': {
+ size_t hex_digits = 0;
+ while (isxdigit(peek(2 + hex_digits)))
+ ++hex_digits;
+ return 2 + hex_digits;
+ }
+ case 'u':
+ case 'U': {
+ bool is_unicode = true;
+ size_t number_of_digits = peek(1) == 'u' ? 4 : 8;
+ for (size_t i = 0; i < number_of_digits; ++i) {
+ if (!isxdigit(peek(2 + i))) {
+ is_unicode = false;
+ break;
+ }
+ }
+ return is_unicode ? 2 + number_of_digits : 0;
+ }
+ default:
+ return 0;
+ }
+ };
+
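+    // Returns the length of a string-literal prefix ending in the given quote character (plain, L, u, u8, or U), or 0 if none matches.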
+ auto match_string_prefix = [&](char quote) -> size_t {
+ if (peek() == quote)
+ return 1;
+ if (peek() == 'L' && peek(1) == quote)
+ return 2;
+ if (peek() == 'u') {
+ if (peek(1) == quote)
+ return 2;
+ if (peek(1) == '8' && peek(2) == quote)
+ return 3;
+ }
+ if (peek() == 'U' && peek(1) == quote)
+ return 2;
+ return 0;
+ };
+
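+    // Main loop: dispatch on the current character until the input is exhausted.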
+ while (m_index < m_input.length()) {
+ auto ch = peek();
+ if (isspace(ch)) {
+ begin_token();
+ while (isspace(peek()))
+ consume();
+ commit_token(Token::Type::Whitespace);
+ continue;
+ }
+ if (ch == '(') {
+ emit_token(Token::Type::LeftParen);
+ continue;
+ }
+ if (ch == ')') {
+ emit_token(Token::Type::RightParen);
+ continue;
+ }
+ if (ch == '{') {
+ emit_token(Token::Type::LeftCurly);
+ continue;
+ }
+ if (ch == '}') {
+ emit_token(Token::Type::RightCurly);
+ continue;
+ }
+ if (ch == '[') {
+ emit_token(Token::Type::LeftBracket);
+ continue;
+ }
+ if (ch == ']') {
+ emit_token(Token::Type::RightBracket);
+ continue;
+ }
+ if (ch == '<') {
+ begin_token();
+ consume();
+ if (peek() == '<') {
+ consume();
+ if (peek() == '=') {
+ consume();
+ commit_token(Token::Type::LessLessEquals);
+ continue;
+ }
+ commit_token(Token::Type::LessLess);
+ continue;
+ }
+ if (peek() == '=') {
+ consume();
+ commit_token(Token::Type::LessEquals);
+ continue;
+ }
+ if (peek() == '>') {
+ consume();
+ commit_token(Token::Type::LessGreater);
+ continue;
+ }
+ commit_token(Token::Type::Less);
+ continue;
+ }
+ if (ch == '>') {
+ begin_token();
+ consume();
+ if (peek() == '>') {
+ consume();
+ if (peek() == '=') {
+ consume();
+ commit_token(Token::Type::GreaterGreaterEquals);
+ continue;
+ }
+ commit_token(Token::Type::GreaterGreater);
+ continue;
+ }
+ if (peek() == '=') {
+ consume();
+ commit_token(Token::Type::GreaterEquals);
+ continue;
+ }
+ commit_token(Token::Type::Greater);
+ continue;
+ }
+ if (ch == ',') {
+ emit_token(Token::Type::Comma);
+ continue;
+ }
+ if (ch == '+') {
+ begin_token();
+ consume();
+ if (peek() == '+') {
+ consume();
+ commit_token(Token::Type::PlusPlus);
+ continue;
+ }
+ if (peek() == '=') {
+ consume();
+ commit_token(Token::Type::PlusEquals);
+ continue;
+ }
+ commit_token(Token::Type::Plus);
+ continue;
+ }
+ if (ch == '-') {
+ begin_token();
+ consume();
+ if (peek() == '-') {
+ consume();
+ commit_token(Token::Type::MinusMinus);
+ continue;
+ }
+ if (peek() == '=') {
+ consume();
+ commit_token(Token::Type::MinusEquals);
+ continue;
+ }
+ if (peek() == '>') {
+ consume();
+ if (peek() == '*') {
+ consume();
+ commit_token(Token::Type::ArrowAsterisk);
+ continue;
+ }
+ commit_token(Token::Type::Arrow);
+ continue;
+ }
+ commit_token(Token::Type::Minus);
+ continue;
+ }
+ if (ch == '*') {
+ emit_token_equals(Token::Type::Asterisk, Token::Type::AsteriskEquals);
+ continue;
+ }
+ if (ch == '%') {
+ emit_token_equals(Token::Type::Percent, Token::Type::PercentEquals);
+ continue;
+ }
+ if (ch == '^') {
+ emit_token_equals(Token::Type::Caret, Token::Type::CaretEquals);
+ continue;
+ }
+ if (ch == '!') {
+ emit_token_equals(Token::Type::ExclamationMark, Token::Type::ExclamationMarkEquals);
+ continue;
+ }
+ if (ch == '=') {
+ emit_token_equals(Token::Type::Equals, Token::Type::EqualsEquals);
+ continue;
+ }
+ if (ch == '&') {
+ begin_token();
+ consume();
+ if (peek() == '&') {
+ consume();
+ commit_token(Token::Type::AndAnd);
+ continue;
+ }
+ if (peek() == '=') {
+ consume();
+ commit_token(Token::Type::AndEquals);
+ continue;
+ }
+ commit_token(Token::Type::And);
+ continue;
+ }
+ if (ch == '|') {
+ begin_token();
+ consume();
+ if (peek() == '|') {
+ consume();
+ commit_token(Token::Type::PipePipe);
+ continue;
+ }
+ if (peek() == '=') {
+ consume();
+ commit_token(Token::Type::PipeEquals);
+ continue;
+ }
+ commit_token(Token::Type::Pipe);
+ continue;
+ }
+ if (ch == '~') {
+ emit_token(Token::Type::Tilde);
+ continue;
+ }
+ if (ch == '?') {
+ emit_token(Token::Type::QuestionMark);
+ continue;
+ }
+ if (ch == ':') {
+ begin_token();
+ consume();
+ if (peek() == ':') {
+ consume();
+ if (peek() == '*') {
+ consume();
+ commit_token(Token::Type::ColonColonAsterisk);
+ continue;
+ }
+ commit_token(Token::Type::ColonColon);
+ continue;
+ }
+ commit_token(Token::Type::Colon);
+ continue;
+ }
+ if (ch == ';') {
+ emit_token(Token::Type::Semicolon);
+ continue;
+ }
+ if (ch == '.') {
+ begin_token();
+ consume();
+ if (peek() == '*') {
+ consume();
+ commit_token(Token::Type::DotAsterisk);
+ continue;
+ }
+ commit_token(Token::Type::Dot);
+ continue;
+ }
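+        // Preprocessor lines: '#include' is tokenized into IncludeStatement, Whitespace, and IncludePath; other directives are committed as a single PreprocessorStatement.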
+ if (ch == '#') {
+ begin_token();
+ consume();
+
+ if (is_valid_first_character_of_identifier(peek()))
+ while (peek() && is_valid_nonfirst_character_of_identifier(peek()))
+ consume();
+
+ auto directive = StringView(m_input.characters_without_null_termination() + token_start_index, m_index - token_start_index);
+ if (directive == "#include") {
+ commit_token(Token::Type::IncludeStatement);
+
+ begin_token();
+ while (isspace(peek()))
+ consume();
+ commit_token(Token::Type::Whitespace);
+
+ begin_token();
+ if (peek() == '<' || peek() == '"') {
+ char closing = consume() == '<' ? '>' : '"';
+ while (peek() && peek() != closing && peek() != '\n')
+ consume();
+
+ if (peek() && consume() == '\n') {
+ commit_token(Token::Type::IncludePath);
+ continue;
+ }
+
+ commit_token(Token::Type::IncludePath);
+ begin_token();
+ }
+ }
+
+ while (peek() && peek() != '\n')
+ consume();
+
+ commit_token(Token::Type::PreprocessorStatement);
+ continue;
+ }
+ if (ch == '/' && peek(1) == '/') {
+ begin_token();
+ while (peek() && peek() != '\n')
+ consume();
+ commit_token(Token::Type::Comment);
+ continue;
+ }
+ if (ch == '/' && peek(1) == '*') {
+ begin_token();
+ consume();
+ consume();
+ bool comment_block_ends = false;
+ while (peek()) {
+ if (peek() == '*' && peek(1) == '/') {
+ comment_block_ends = true;
+ break;
+ }
+
+ consume();
+ }
+
+ if (comment_block_ends) {
+ consume();
+ consume();
+ }
+
+ commit_token(Token::Type::Comment);
+ continue;
+ }
+ if (ch == '/') {
+ emit_token_equals(Token::Type::Slash, Token::Type::SlashEquals);
+ continue;
+ }
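+        // Double-quoted string literals (with optional encoding prefix); escape sequences inside the literal become separate EscapeSequence tokens.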
+ if (size_t prefix = match_string_prefix('"'); prefix > 0) {
+ begin_token();
+ for (size_t i = 0; i < prefix; ++i)
+ consume();
+ while (peek()) {
+ if (peek() == '\\') {
+ if (size_t escape = match_escape_sequence(); escape > 0) {
+ commit_token(Token::Type::DoubleQuotedString);
+ begin_token();
+ for (size_t i = 0; i < escape; ++i)
+ consume();
+ commit_token(Token::Type::EscapeSequence);
+ begin_token();
+ continue;
+ }
+ }
+
+ if (consume() == '"')
+ break;
+ }
+ commit_token(Token::Type::DoubleQuotedString);
+ continue;
+ }
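+        // Raw string literals: remember the delimiter between R" and '(' so that the matching )delimiter" sequence terminates the token.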
+ if (size_t prefix = match_string_prefix('R'); prefix > 0 && peek(prefix) == '"') {
+ begin_token();
+ for (size_t i = 0; i < prefix + 1; ++i)
+ consume();
+ size_t prefix_start = m_index;
+ while (peek() && peek() != '(')
+ consume();
+ StringView prefix_string = m_input.substring_view(prefix_start, m_index - prefix_start);
+ while (peek()) {
+ if (consume() == '"') {
+ ASSERT(m_index >= prefix_string.length() + 2);
+ ASSERT(m_input[m_index - 1] == '"');
+ if (m_input[m_index - 1 - prefix_string.length() - 1] == ')') {
+ StringView suffix_string = m_input.substring_view(m_index - 1 - prefix_string.length(), prefix_string.length());
+ if (prefix_string == suffix_string)
+ break;
+ }
+ }
+ }
+ commit_token(Token::Type::RawString);
+ continue;
+ }
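+        // Single-quoted (character) literals follow the same pattern as double-quoted strings, including separate EscapeSequence tokens.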
+ if (size_t prefix = match_string_prefix('\''); prefix > 0) {
+ begin_token();
+ for (size_t i = 0; i < prefix; ++i)
+ consume();
+ while (peek()) {
+ if (peek() == '\\') {
+ if (size_t escape = match_escape_sequence(); escape > 0) {
+ commit_token(Token::Type::SingleQuotedString);
+ begin_token();
+ for (size_t i = 0; i < escape; ++i)
+ consume();
+ commit_token(Token::Type::EscapeSequence);
+ begin_token();
+ continue;
+ }
+ }
+
+ if (consume() == '\'')
+ break;
+ }
+ commit_token(Token::Type::SingleQuotedString);
+ continue;
+ }
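+        // Numeric literals: handles decimal, hex, and binary digits, digit separators ('), fractional parts and exponents (promoting the token to Float), and u/l/f suffixes.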
+ if (isdigit(ch) || (ch == '.' && isdigit(peek(1)))) {
+ begin_token();
+ consume();
+
+ auto type = ch == '.' ? Token::Type::Float : Token::Type::Integer;
+ bool is_hex = false;
+ bool is_binary = false;
+
+ auto match_exponent = [&]() -> size_t {
+ char ch = peek();
+ if (ch != 'e' && ch != 'E' && ch != 'p' && ch != 'P')
+ return 0;
+
+ type = Token::Type::Float;
+ size_t length = 1;
+ ch = peek(length);
+ if (ch == '+' || ch == '-') {
+ ++length;
+ }
+ for (ch = peek(length); isdigit(ch); ch = peek(length)) {
+ ++length;
+ }
+ return length;
+ };
+
+ auto match_type_literal = [&]() -> size_t {
+ size_t length = 0;
+ for (;;) {
+ char ch = peek(length);
+ if ((ch == 'u' || ch == 'U') && type == Token::Type::Integer) {
+ ++length;
+ } else if ((ch == 'f' || ch == 'F') && !is_binary) {
+ type = Token::Type::Float;
+ ++length;
+ } else if (ch == 'l' || ch == 'L') {
+ ++length;
+ } else
+ return length;
+ }
+ };
+
+ if (peek() == 'b' || peek() == 'B') {
+ consume();
+ is_binary = true;
+ for (char ch = peek(); ch == '0' || ch == '1' || (ch == '\'' && peek(1) != '\''); ch = peek()) {
+ consume();
+ }
+ } else {
+ if (peek() == 'x' || peek() == 'X') {
+ consume();
+ is_hex = true;
+ }
+
+ for (char ch = peek(); (is_hex ? isxdigit(ch) : isdigit(ch)) || (ch == '\'' && peek(1) != '\'') || ch == '.'; ch = peek()) {
+ if (ch == '.') {
+ if (type == Token::Type::Integer) {
+ type = Token::Type::Float;
+ } else
+ break;
+                }
+ consume();
+ }
+ }
+
+ if (!is_binary) {
+ size_t length = match_exponent();
+ for (size_t i = 0; i < length; ++i)
+ consume();
+ }
+
+ size_t length = match_type_literal();
+ for (size_t i = 0; i < length; ++i)
+ consume();
+
+ commit_token(type);
+ continue;
+ }
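+        // Identifiers are classified as Keyword, KnownType, or plain Identifier via the lookup tables above.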
+ if (is_valid_first_character_of_identifier(ch)) {
+ begin_token();
+ while (peek() && is_valid_nonfirst_character_of_identifier(peek()))
+ consume();
+ auto token_view = StringView(m_input.characters_without_null_termination() + token_start_index, m_index - token_start_index);
+ if (is_keyword(token_view))
+ commit_token(Token::Type::Keyword);
+ else if (is_known_type(token_view))
+ commit_token(Token::Type::KnownType);
+ else
+ commit_token(Token::Type::Identifier);
+ continue;
+ }
+ dbgln("Unimplemented token character: {}", ch);
+ emit_token(Token::Type::Unknown);
+ }
+ return tokens;
+}
+
+}