From 97a230e4efedf92879da89cce57ffcb8d26fa517 Mon Sep 17 00:00:00 2001 From: Ali Mohammad Pur Date: Thu, 20 May 2021 23:15:33 +0430 Subject: LibWeb: Add a super basic HTML syntax highlighter This can currently highlight tag names and attribute names/values. --- Userland/Libraries/LibSyntax/Highlighter.h | 9 +- Userland/Libraries/LibWeb/CMakeLists.txt | 1 + .../Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp | 5 + .../Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h | 4 + .../HTML/SyntaxHighlighter/SyntaxHighlighter.cpp | 147 +++++++++++++++++++++ .../HTML/SyntaxHighlighter/SyntaxHighlighter.h | 32 +++++ 6 files changed, 194 insertions(+), 4 deletions(-) create mode 100644 Userland/Libraries/LibWeb/HTML/SyntaxHighlighter/SyntaxHighlighter.cpp create mode 100644 Userland/Libraries/LibWeb/HTML/SyntaxHighlighter/SyntaxHighlighter.h diff --git a/Userland/Libraries/LibSyntax/Highlighter.h b/Userland/Libraries/LibSyntax/Highlighter.h index dc2fe000e6..de0b6bdc2b 100644 --- a/Userland/Libraries/LibSyntax/Highlighter.h +++ b/Userland/Libraries/LibSyntax/Highlighter.h @@ -15,13 +15,14 @@ namespace Syntax { enum class Language { - PlainText, Cpp, - JavaScript, - INI, GML, - Shell, + HTML, + INI, + JavaScript, + PlainText, SQL, + Shell, }; struct TextStyle { diff --git a/Userland/Libraries/LibWeb/CMakeLists.txt b/Userland/Libraries/LibWeb/CMakeLists.txt index 42c6de4158..20a62c948c 100644 --- a/Userland/Libraries/LibWeb/CMakeLists.txt +++ b/Userland/Libraries/LibWeb/CMakeLists.txt @@ -153,6 +153,7 @@ set(SOURCES HTML/Parser/ListOfActiveFormattingElements.cpp HTML/Parser/StackOfOpenElements.cpp HTML/SubmitEvent.cpp + HTML/SyntaxHighlighter/SyntaxHighlighter.cpp HTML/TagNames.cpp HTML/WebSocket.cpp HighResolutionTime/Performance.cpp diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp index 51ef6af76b..8dfaed9602 100644 --- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp +++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp @@ -229,6 +229,11 @@ Optional HTMLTokenizer::peek_code_point(size_t offset) const Optional HTMLTokenizer::next_token() { + { + auto last_position = m_source_positions.last(); + m_source_positions.clear(); + m_source_positions.append(move(last_position)); + } _StartOfFunction: if (!m_queued_tokens.is_empty()) return m_queued_tokens.dequeue(); diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h index 5edfaf2271..da9945ed78 100644 --- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h +++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h @@ -110,6 +110,10 @@ public: Optional next_token(); void switch_to(Badge, State new_state); + void switch_to(State new_state) + { + m_state = new_state; + } void set_blocked(bool b) { m_blocked = b; } bool is_blocked() const { return m_blocked; } diff --git a/Userland/Libraries/LibWeb/HTML/SyntaxHighlighter/SyntaxHighlighter.cpp b/Userland/Libraries/LibWeb/HTML/SyntaxHighlighter/SyntaxHighlighter.cpp new file mode 100644 index 0000000000..53dba5bf18 --- /dev/null +++ b/Userland/Libraries/LibWeb/HTML/SyntaxHighlighter/SyntaxHighlighter.cpp @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2021, Ali Mohammad Pur + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include +#include + +namespace Web::HTML { + +enum class AugmentedTokenKind : u32 { + AttributeName, + AttributeValue, + OpenTag, + CloseTag, + Comment, + Doctype, +}; + +bool SyntaxHighlighter::is_identifier(void* token) const +{ + if (!token) + return false; + return false; +} + +bool SyntaxHighlighter::is_navigatable(void*) const +{ + return false; +} + +void SyntaxHighlighter::rehighlight(const Palette& palette) +{ + (void)palette; + auto text = m_client->get_text(); + + Vector spans; + auto highlight = [&](auto start_line, auto start_column, auto end_line, auto end_column, Gfx::TextAttributes attributes, AugmentedTokenKind kind) { + spans.empend( + GUI::TextRange { + { start_line, start_column }, + { end_line, end_column }, + }, + move(attributes), + (void*)kind, + false); + }; + + HTMLTokenizer tokenizer { text, "utf-8" }; + [[maybe_unused]] enum class State { + HTML, + Javascript, + CSS, + } state { State::HTML }; + for (;;) { + auto token = tokenizer.next_token(); + if (!token.has_value()) + break; + + if (token->is_start_tag()) { + if (token->tag_name() == "script"sv) { + tokenizer.switch_to(HTMLTokenizer::State::ScriptData); + state = State::Javascript; + } else if (token->tag_name() == "style"sv) { + tokenizer.switch_to(HTMLTokenizer::State::RAWTEXT); + state = State::CSS; + } + } else if (token->is_end_tag()) { + if (token->tag_name().is_one_of("script"sv, "style"sv)) { + if (state == State::Javascript) { + // FIXME: Highlight javascript code here instead. + } else if (state == State::CSS) { + // FIXME: Highlight CSS code here instead. + } + state = State::HTML; + } + } + + size_t token_start_offset = token->is_end_tag() ? 1 : 0; + + if (token->is_comment()) { + highlight( + token->start_position().line, + token->start_position().column, + token->start_position().line, + token->start_position().column, + { palette.syntax_comment(), {} }, + AugmentedTokenKind::Comment); + } else if (token->is_start_tag() || token->is_end_tag()) { + // FIXME: This breaks with single-character tag names. + highlight( + token->start_position().line, + token->start_position().column + token_start_offset, + token->start_position().line, + token->start_position().column + token->tag_name().length() + token_start_offset - 1, + { palette.syntax_keyword(), {} }, + token->is_start_tag() ? AugmentedTokenKind::OpenTag : AugmentedTokenKind::CloseTag); + + for (auto& attribute : token->attributes()) { + highlight( + attribute.name_start_position.line, + attribute.name_start_position.column + token_start_offset, + attribute.name_end_position.line, + attribute.name_end_position.column + token_start_offset, + { palette.syntax_identifier(), {} }, + AugmentedTokenKind::AttributeName); + highlight( + attribute.value_start_position.line, + attribute.value_start_position.column + token_start_offset, + attribute.value_end_position.line, + attribute.value_end_position.column + token_start_offset, + { palette.syntax_string(), {} }, + AugmentedTokenKind::AttributeValue); + } + } else if (token->is_doctype()) { + highlight( + token->start_position().line, + token->start_position().column, + token->start_position().line, + token->start_position().column, + { palette.syntax_preprocessor_statement(), {} }, + AugmentedTokenKind::Doctype); + } + } + + m_client->do_set_spans(move(spans)); + m_has_brace_buddies = false; + highlight_matching_token_pair(); + m_client->do_update(); +} + +Vector SyntaxHighlighter::matching_token_pairs() const +{ + static Vector pairs; + if (pairs.is_empty()) { + pairs.append({ (void*)AugmentedTokenKind::OpenTag, (void*)AugmentedTokenKind::CloseTag }); + } + return pairs; +} + +bool SyntaxHighlighter::token_types_equal(void* token0, void* token1) const +{ + return token0 == token1; +} + +} diff --git a/Userland/Libraries/LibWeb/HTML/SyntaxHighlighter/SyntaxHighlighter.h b/Userland/Libraries/LibWeb/HTML/SyntaxHighlighter/SyntaxHighlighter.h new file mode 100644 index 0000000000..067c3c6677 --- /dev/null +++ b/Userland/Libraries/LibWeb/HTML/SyntaxHighlighter/SyntaxHighlighter.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2021, Ali Mohammad Pur + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include + +namespace Web::HTML { + +class SyntaxHighlighter : public Syntax::Highlighter { +public: + SyntaxHighlighter() = default; + virtual ~SyntaxHighlighter() override = default; + + virtual bool is_identifier(void*) const override; + virtual bool is_navigatable(void*) const override; + + virtual Syntax::Language language() const override { return Syntax::Language::HTML; } + virtual void rehighlight(const Palette&) override; + +protected: + virtual Vector matching_token_pairs() const override; + virtual bool token_types_equal(void*, void*) const override; + + size_t m_line { 1 }; + size_t m_column { 0 }; +}; + +} -- cgit v1.2.3