diff options
author | Andreas Kling <awesomekling@gmail.com> | 2019-10-25 19:52:44 +0200 |
---|---|---|
committer | Andreas Kling <awesomekling@gmail.com> | 2019-10-25 19:52:44 +0200 |
commit | 307cbf83c3834512ecf6d208cf06d177abfbfb56 (patch) | |
tree | 6ae63107f8feba2bc65923acc065144bca53596b /DevTools/HackStudio | |
parent | 51e655f9033d4c14119ab2264735545d3cd85a0a (diff) | |
download | serenity-307cbf83c3834512ecf6d208cf06d177abfbfb56.zip |
HackStudio: Start building a C++ lexer to help with syntax highlighting
Diffstat (limited to 'DevTools/HackStudio')
-rw-r--r-- | DevTools/HackStudio/CppLexer.cpp | 179 | ||||
-rw-r--r-- | DevTools/HackStudio/CppLexer.h | 61 | ||||
-rw-r--r-- | DevTools/HackStudio/Makefile | 1 |
3 files changed, 241 insertions, 0 deletions
diff --git a/DevTools/HackStudio/CppLexer.cpp b/DevTools/HackStudio/CppLexer.cpp
new file mode 100644
index 0000000000..8d4c688249
--- /dev/null
+++ b/DevTools/HackStudio/CppLexer.cpp
@@ -0,0 +1,179 @@
+#include "CppLexer.h"
+#include <AK/LogStream.h>
+#include <ctype.h>
+
+CppLexer::CppLexer(const StringView& input)
+    : m_input(input)
+{
+}
+
+char CppLexer::peek(int offset) const
+{
+    if ((m_index + offset) >= m_input.length()) // Past the end of the input: report 0 instead of indexing.
+        return 0;
+    return m_input[m_index + offset];
+}
+
+char CppLexer::consume()
+{
+    ASSERT(m_index < m_input.length());
+    return m_input[m_index++]; // Hand back the current character, then step past it.
+}
+
+static bool is_valid_first_character_of_identifier(char ch)
+{
+    return isalpha(ch) || ch == '_' || ch == '$';
+}
+
+static bool is_valid_nonfirst_character_of_identifier(char ch)
+{
+    return is_valid_first_character_of_identifier(ch) || isdigit(ch);
+}
+
+static bool is_keyword(const StringView& string)
+{
+    if (string == "int" || string == "char" || string == "return") // Only these three words are recognized so far.
+        return true;
+    return false;
+}
+
+Vector<CppToken> CppLexer::lex()
+{
+    Vector<CppToken> tokens;
+
+    auto emit_token = [&](auto type) { // Appends a one-character token at m_index and advances past it.
+        CppToken token;
+        token.m_type = type;
+        token.m_view = StringView(m_input.characters_without_null_termination() + m_index, 1);
+        tokens.append(token);
+        m_index++;
+    };
+
+    int token_start_index = 0;
+    auto begin_token = [&] { // Records where a multi-character token starts.
+        token_start_index = m_index;
+    };
+    auto commit_token = [&](auto type) { // Appends the token covering [token_start_index, m_index).
+        CppToken token;
+        token.m_type = type;
+        token.m_view = StringView(m_input.characters_without_null_termination() + token_start_index, m_index - token_start_index);
+        tokens.append(token);
+    };
+
+    while (m_index < m_input.length()) {
+        auto ch = peek();
+        if (isspace(ch)) {
+            begin_token();
+            while (isspace(peek()))
+                consume();
+            commit_token(CppToken::Type::Whitespace);
+            continue;
+        }
+        if (ch == '(') {
+            emit_token(CppToken::Type::LeftParen);
+            continue;
+        }
+        if (ch == ')') {
+            emit_token(CppToken::Type::RightParen);
+            continue;
+        }
+        if (ch == '{') {
+            emit_token(CppToken::Type::LeftCurly);
+            continue;
+        }
+        if (ch == '}') {
+            emit_token(CppToken::Type::RightCurly);
+            continue;
+        }
+        if (ch == '[') {
+            emit_token(CppToken::Type::LeftBracket);
+            continue;
+        }
+        if (ch == ']') {
+            emit_token(CppToken::Type::RightBracket);
+            continue;
+        }
+        if (ch == ',') {
+            emit_token(CppToken::Type::Comma);
+            continue;
+        }
+        if (ch == '*') {
+            emit_token(CppToken::Type::Asterisk);
+            continue;
+        }
+        if (ch == ';') {
+            emit_token(CppToken::Type::Semicolon);
+            continue;
+        }
+        if (ch == '#') { // The rest of the line, excluding the newline, becomes one token.
+            begin_token();
+            while (peek() && peek() != '\n')
+                consume();
+            commit_token(CppToken::Type::PreprocessorStatement);
+            continue;
+        }
+        if (ch == '/' && peek(1) == '/') {
+            begin_token();
+            while (peek() && peek() != '\n')
+                consume();
+            commit_token(CppToken::Type::Comment);
+            continue;
+        }
+        if (ch == '/' && peek(1) == '*') {
+            begin_token();
+            consume();
+            consume();
+            // An unterminated comment runs to the end of the input instead of asserting in consume().
+            while (peek()) {
+                if (consume() == '*' && peek() == '/') {
+                    consume();
+                    break;
+                }
+            }
+            commit_token(CppToken::Type::Comment); // Commit the whole comment, delimiters included.
+            continue;
+        }
+        if (ch == '"') { // No escape handling: the next '"' always ends the token.
+            begin_token();
+            consume();
+            while (peek()) {
+                if (consume() == '"')
+                    break;
+            }
+            commit_token(CppToken::Type::DoubleQuotedString);
+            continue;
+        }
+        if (ch == '\'') { // No escape handling: the next '\'' always ends the token.
+            begin_token();
+            consume();
+            while (peek()) {
+                if (consume() == '\'')
+                    break;
+            }
+            commit_token(CppToken::Type::SingleQuotedString);
+            continue;
+        }
+        if (isdigit(ch)) { // Only a plain run of decimal digits is recognized.
+            begin_token();
+            while (peek() && isdigit(peek())) {
+                consume();
+            }
+            commit_token(CppToken::Type::Number);
+            continue;
+        }
+        if (is_valid_first_character_of_identifier(ch)) {
+            begin_token();
+            while (peek() && is_valid_nonfirst_character_of_identifier(peek()))
+                consume();
+            auto token_view = StringView(m_input.characters_without_null_termination() + token_start_index, m_index - token_start_index);
+            if (is_keyword(token_view))
+                commit_token(CppToken::Type::Keyword);
+            else
+                commit_token(CppToken::Type::Identifier);
+            continue;
+        }
+        dbg() << "Unimplemented token character: " << ch;
+        ASSERT_NOT_REACHED();
+    }
+    return tokens;
+}
diff --git a/DevTools/HackStudio/CppLexer.h b/DevTools/HackStudio/CppLexer.h
new file mode 100644
index 0000000000..80d8515361
--- /dev/null
+++ b/DevTools/HackStudio/CppLexer.h
@@ -0,0 +1,61 @@
+#pragma once
+
+#include <AK/StringView.h>
+#include <AK/Vector.h>
+
+#define FOR_EACH_TOKEN_TYPE \
+    __TOKEN(Invalid) \
+    __TOKEN(Whitespace) \
+    __TOKEN(PreprocessorStatement) \
+    __TOKEN(LeftParen) \
+    __TOKEN(RightParen) \
+    __TOKEN(LeftCurly) \
+    __TOKEN(RightCurly) \
+    __TOKEN(LeftBracket) \
+    __TOKEN(RightBracket) \
+    __TOKEN(Comma) \
+    __TOKEN(Asterisk) \
+    __TOKEN(Semicolon) \
+    __TOKEN(DoubleQuotedString) \
+    __TOKEN(SingleQuotedString) \
+    __TOKEN(Comment) \
+    __TOKEN(Number) \
+    __TOKEN(Keyword) \
+    __TOKEN(Identifier)
+
+struct CppToken {
+    enum class Type { // One enumerator per FOR_EACH_TOKEN_TYPE entry.
+#define __TOKEN(x) x,
+        FOR_EACH_TOKEN_TYPE
+#undef __TOKEN
+    };
+
+    const char* to_string() const // Returns the enumerator name of m_type as a string literal.
+    {
+        switch (m_type) {
+#define __TOKEN(x) \
+    case Type::x:  \
+        return #x;
+            FOR_EACH_TOKEN_TYPE
+#undef __TOKEN
+        }
+        ASSERT_NOT_REACHED();
+    }
+
+    Type m_type { Type::Invalid };
+    StringView m_view; // The slice of the lexer input that this token covers.
+};
+
+class CppLexer {
+public:
+    CppLexer(const StringView&);
+
+    Vector<CppToken> lex(); // Tokenizes the input from the current index to the end.
+
+private:
+    char peek(int offset = 0) const;
+    char consume();
+
+    StringView m_input;
+    int m_index { 0 }; // Offset of the next unconsumed character in m_input.
+};
diff --git a/DevTools/HackStudio/Makefile b/DevTools/HackStudio/Makefile
index 728552f7c3..dc35d177eb 100644
--- a/DevTools/HackStudio/Makefile
+++ b/DevTools/HackStudio/Makefile
@@ -6,6 +6,7 @@ OBJS = \
     TerminalWrapper.o \
     FindInFilesWidget.o \
     ProcessStateWidget.o \
+    CppLexer.o \
     main.o
 
 APP = HackStudio