diff options
author | Andreas Kling <awesomekling@gmail.com> | 2019-10-12 23:26:47 +0200 |
---|---|---|
committer | Andreas Kling <awesomekling@gmail.com> | 2019-10-12 23:34:05 +0200 |
commit | b083a233d8482ed7106d7a102866ee0dd42acc94 (patch) | |
tree | d32b52a7cc55fcd66dcaa94bb51f01e4bd04f398 | |
parent | 6d150df58a0b64c4cb583fc374fd05e60d387dc6 (diff) | |
download | serenity-b083a233d8482ed7106d7a102866ee0dd42acc94.zip |
LibHTML: Add Comment and CharacterData nodes and improve HTML parsing
This patch adds the CharacterData subclass of Node, which is now the
parent class of Text and a new Comment class.
A Comment node represents an HTML comment, for example: <!--hello friends-->
Since these occur somewhat frequently on the web, we need to be able
to parse them.
This patch also adds a child rejection mechanism to the DOM tree.
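Nodes can now override is_child_allowed(const Node&) and return false if they
don't want a particular Node to become a child of theirs; append_child() and
prepend_child() then return without inserting the rejected node. This is used
to prevent Document from taking on unwanted children: text nodes, other
documents, and more than one doctype or more than one element.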
-rw-r--r-- | Base/home/anon/www/welcome.html | 2 | ||||
-rw-r--r-- | Libraries/LibHTML/DOM/CharacterData.cpp | 11 | ||||
-rw-r--r-- | Libraries/LibHTML/DOM/CharacterData.h | 25 | ||||
-rw-r--r-- | Libraries/LibHTML/DOM/Comment.cpp | 11 | ||||
-rw-r--r-- | Libraries/LibHTML/DOM/Comment.h | 18 | ||||
-rw-r--r-- | Libraries/LibHTML/DOM/Document.cpp | 17 | ||||
-rw-r--r-- | Libraries/LibHTML/DOM/Document.h | 2 | ||||
-rw-r--r-- | Libraries/LibHTML/DOM/DocumentType.h | 2 | ||||
-rw-r--r-- | Libraries/LibHTML/DOM/Node.h | 5 | ||||
-rw-r--r-- | Libraries/LibHTML/DOM/Text.cpp | 3 | ||||
-rw-r--r-- | Libraries/LibHTML/DOM/Text.h | 10 | ||||
-rw-r--r-- | Libraries/LibHTML/Dump.cpp | 3 | ||||
-rw-r--r-- | Libraries/LibHTML/Makefile.shared | 2 | ||||
-rw-r--r-- | Libraries/LibHTML/Parser/HTMLParser.cpp | 59 | ||||
-rw-r--r-- | Libraries/LibHTML/TreeNode.h | 13 |
15 files changed, 158 insertions, 25 deletions
diff --git a/Base/home/anon/www/welcome.html b/Base/home/anon/www/welcome.html index 24602f109e..5f6739bd16 100644 --- a/Base/home/anon/www/welcome.html +++ b/Base/home/anon/www/welcome.html @@ -1,6 +1,8 @@ +<!DOCTYPE html> <html> <head> <title>Welcome!</title> +<!-- this is a comment --> <style type="text/css"> body { background-color: #fff; diff --git a/Libraries/LibHTML/DOM/CharacterData.cpp b/Libraries/LibHTML/DOM/CharacterData.cpp new file mode 100644 index 0000000000..e4b9d74e13 --- /dev/null +++ b/Libraries/LibHTML/DOM/CharacterData.cpp @@ -0,0 +1,11 @@ +#include <LibHTML/DOM/CharacterData.h> + +CharacterData::CharacterData(Document& document, NodeType type, const String& data) + : Node(document, type) + , m_data(data) +{ +} + +CharacterData::~CharacterData() +{ +} diff --git a/Libraries/LibHTML/DOM/CharacterData.h b/Libraries/LibHTML/DOM/CharacterData.h new file mode 100644 index 0000000000..34170ef6b9 --- /dev/null +++ b/Libraries/LibHTML/DOM/CharacterData.h @@ -0,0 +1,25 @@ +#pragma once + +#include <AK/String.h> +#include <LibHTML/DOM/Node.h> + +class CharacterData : public Node { +public: + virtual ~CharacterData() override; + + const String& data() const { return m_data; } + + virtual String text_content() const override { return m_data; } + +protected: + explicit CharacterData(Document&, NodeType, const String&); + +private: + String m_data; +}; + +template<> +inline bool is<CharacterData>(const Node& node) +{ + return node.is_character_data(); +} diff --git a/Libraries/LibHTML/DOM/Comment.cpp b/Libraries/LibHTML/DOM/Comment.cpp new file mode 100644 index 0000000000..3d97339e49 --- /dev/null +++ b/Libraries/LibHTML/DOM/Comment.cpp @@ -0,0 +1,11 @@ +#include <LibHTML/DOM/Comment.h> +#include <LibHTML/Layout/LayoutText.h> + +Comment::Comment(Document& document, const String& data) + : CharacterData(document, NodeType::COMMENT_NODE, data) +{ +} + +Comment::~Comment() +{ +} diff --git a/Libraries/LibHTML/DOM/Comment.h b/Libraries/LibHTML/DOM/Comment.h new 
file mode 100644 index 0000000000..39d0cd4cad --- /dev/null +++ b/Libraries/LibHTML/DOM/Comment.h @@ -0,0 +1,18 @@ +#pragma once + +#include <AK/String.h> +#include <LibHTML/DOM/CharacterData.h> + +class Comment final : public CharacterData { +public: + explicit Comment(Document&, const String&); + virtual ~Comment() override; + + virtual String tag_name() const override { return "#comment"; } +}; + +template<> +inline bool is<Comment>(const Node& node) +{ + return node.is_comment(); +} diff --git a/Libraries/LibHTML/DOM/Document.cpp b/Libraries/LibHTML/DOM/Document.cpp index 8b4af6a0bd..4262bca329 100644 --- a/Libraries/LibHTML/DOM/Document.cpp +++ b/Libraries/LibHTML/DOM/Document.cpp @@ -29,6 +29,23 @@ StyleResolver& Document::style_resolver() return *m_style_resolver; } +bool Document::is_child_allowed(const Node& node) const +{ + switch (node.type()) { + case NodeType::DOCUMENT_NODE: + case NodeType::TEXT_NODE: + return false; + case NodeType::COMMENT_NODE: + return true; + case NodeType::DOCUMENT_TYPE_NODE: + return !first_child_of_type<DocumentType>(); + case NodeType::ELEMENT_NODE: + return !first_child_of_type<Element>(); + default: + return false; + } +} + void Document::fixup() { if (!is<DocumentType>(first_child())) diff --git a/Libraries/LibHTML/DOM/Document.h b/Libraries/LibHTML/DOM/Document.h index 485ed87cec..8f3947147c 100644 --- a/Libraries/LibHTML/DOM/Document.h +++ b/Libraries/LibHTML/DOM/Document.h @@ -67,6 +67,8 @@ public: void invalidate_layout(); Function<void()> on_invalidate_layout; + virtual bool is_child_allowed(const Node&) const override; + private: virtual RefPtr<LayoutNode> create_layout_node(const StyleResolver&, const StyleProperties* parent_style) const override; diff --git a/Libraries/LibHTML/DOM/DocumentType.h b/Libraries/LibHTML/DOM/DocumentType.h index c13bb5cbee..78f2eae5bf 100644 --- a/Libraries/LibHTML/DOM/DocumentType.h +++ b/Libraries/LibHTML/DOM/DocumentType.h @@ -7,7 +7,7 @@ public: explicit DocumentType(Document&); 
virtual ~DocumentType() override; - virtual String tag_name() const override { return "!DOCTYPE"; } + virtual String tag_name() const override { return "#doctype"; } }; template<> diff --git a/Libraries/LibHTML/DOM/Node.h b/Libraries/LibHTML/DOM/Node.h index 132bbcd63a..9efd4bf962 100644 --- a/Libraries/LibHTML/DOM/Node.h +++ b/Libraries/LibHTML/DOM/Node.h @@ -10,6 +10,7 @@ enum class NodeType : unsigned { INVALID = 0, ELEMENT_NODE = 1, TEXT_NODE = 3, + COMMENT_NODE = 8, DOCUMENT_NODE = 9, DOCUMENT_TYPE_NODE = 10, }; @@ -32,6 +33,8 @@ public: bool is_text() const { return type() == NodeType::TEXT_NODE; } bool is_document() const { return type() == NodeType::DOCUMENT_NODE; } bool is_document_type() const { return type() == NodeType::DOCUMENT_TYPE_NODE; } + bool is_comment() const { return type() == NodeType::COMMENT_NODE; } + bool is_character_data() const { return type() == NodeType::TEXT_NODE || type() == NodeType::COMMENT_NODE; } bool is_parent_node() const { return is_element() || is_document(); } virtual RefPtr<LayoutNode> create_layout_node(const StyleResolver&, const StyleProperties* parent_style) const; @@ -66,6 +69,8 @@ public: const Element* previous_element_sibling() const; const Element* next_element_sibling() const; + virtual bool is_child_allowed(const Node&) const { return true; } + protected: Node(Document&, NodeType); diff --git a/Libraries/LibHTML/DOM/Text.cpp b/Libraries/LibHTML/DOM/Text.cpp index a066a767f6..4dc3d34ff9 100644 --- a/Libraries/LibHTML/DOM/Text.cpp +++ b/Libraries/LibHTML/DOM/Text.cpp @@ -2,8 +2,7 @@ #include <LibHTML/Layout/LayoutText.h> Text::Text(Document& document, const String& data) - : Node(document, NodeType::TEXT_NODE) - , m_data(data) + : CharacterData(document, NodeType::TEXT_NODE, data) { } diff --git a/Libraries/LibHTML/DOM/Text.h b/Libraries/LibHTML/DOM/Text.h index c507dbb3ba..30dd7d02c8 100644 --- a/Libraries/LibHTML/DOM/Text.h +++ b/Libraries/LibHTML/DOM/Text.h @@ -1,23 +1,17 @@ #pragma once #include <AK/String.h> 
-#include <LibHTML/DOM/Node.h> +#include <LibHTML/DOM/CharacterData.h> -class Text final : public Node { +class Text final : public CharacterData { public: explicit Text(Document&, const String&); virtual ~Text() override; - const String& data() const { return m_data; } - virtual String tag_name() const override { return "#text"; } - virtual String text_content() const override { return m_data; } - private: virtual RefPtr<LayoutNode> create_layout_node(const StyleResolver&, const StyleProperties* parent_style) const override; - - String m_data; }; template<> diff --git a/Libraries/LibHTML/Dump.cpp b/Libraries/LibHTML/Dump.cpp index 6533b58774..2857d43e31 100644 --- a/Libraries/LibHTML/Dump.cpp +++ b/Libraries/LibHTML/Dump.cpp @@ -1,5 +1,6 @@ #include <AK/Utf8View.h> #include <LibHTML/CSS/StyleSheet.h> +#include <LibHTML/DOM/Comment.h> #include <LibHTML/DOM/Document.h> #include <LibHTML/DOM/DocumentType.h> #include <LibHTML/DOM/Element.h> @@ -27,6 +28,8 @@ void dump_tree(const Node& node) dbgprintf("\"%s\"\n", static_cast<const Text&>(node).data().characters()); } else if (is<DocumentType>(node)) { dbgprintf("<!DOCTYPE>\n"); + } else if (is<Comment>(node)) { + dbgprintf("<!--%s-->\n", to<Comment>(node).data().characters()); } ++indent; if (is<ParentNode>(node)) { diff --git a/Libraries/LibHTML/Makefile.shared b/Libraries/LibHTML/Makefile.shared index 08fc3d5500..a3af6d90d2 100644 --- a/Libraries/LibHTML/Makefile.shared +++ b/Libraries/LibHTML/Makefile.shared @@ -17,6 +17,8 @@ LIBHTML_OBJS = \ DOM/HTMLBlinkElement.o \ DOM/HTMLBRElement.o \ DOM/Document.o \ + DOM/CharacterData.o \ + DOM/Comment.o \ DOM/Text.o \ DOM/DocumentType.o \ DOM/ElementFactory.o \ diff --git a/Libraries/LibHTML/Parser/HTMLParser.cpp b/Libraries/LibHTML/Parser/HTMLParser.cpp index a69871c8a0..7f0a8c9c26 100644 --- a/Libraries/LibHTML/Parser/HTMLParser.cpp +++ b/Libraries/LibHTML/Parser/HTMLParser.cpp @@ -1,6 +1,7 @@ #include <AK/Function.h> #include <AK/NonnullRefPtrVector.h> #include 
<AK/StringBuilder.h> +#include <LibHTML/DOM/Comment.h> #include <LibHTML/DOM/DocumentType.h> #include <LibHTML/DOM/Element.h> #include <LibHTML/DOM/ElementFactory.h> @@ -44,6 +45,8 @@ NonnullRefPtr<Document> parse_html(const StringView& html, const URL& url) Free = 0, BeforeTagName, InTagName, + InDoctype, + InComment, InAttributeList, InAttributeName, BeforeAttributeValue, @@ -101,19 +104,16 @@ NonnullRefPtr<Document> parse_html(const StringView& html, const URL& url) close_tag(); }; - auto handle_exclamation_tag = [&] { - auto name = String::copy(tag_name_buffer); - tag_name_buffer.clear(); - ASSERT(name == "DOCTYPE"); - if (node_stack.size() != 1) - node_stack[node_stack.size() - 2].append_child(adopt(*new DocumentType(document)), false); - close_tag(); + auto commit_doctype = [&] { + node_stack.last().append_child(adopt(*new DocumentType(document)), false); + }; + + auto commit_comment = [&] { + node_stack.last().append_child(adopt(*new Comment(document, text_buffer.to_string())), false); }; auto commit_tag = [&] { - if (is_exclamation_tag) - handle_exclamation_tag(); - else if (is_slash_tag) + if (is_slash_tag) close_tag(); else open_tag(); @@ -124,12 +124,16 @@ NonnullRefPtr<Document> parse_html(const StringView& html, const URL& url) }; for (int i = 0; i < html.length(); ++i) { + auto peek = [&](int offset) -> char { + if (i + offset >= html.length()) + return '\0'; + return html[i + offset]; + }; char ch = html[i]; switch (state) { case State::Free: if (ch == '<') { is_slash_tag = false; - is_exclamation_tag = false; move_to_state(State::BeforeTagName); break; } @@ -165,7 +169,22 @@ NonnullRefPtr<Document> parse_html(const StringView& html, const URL& url) break; } if (ch == '!') { - is_exclamation_tag = true; + if (peek(1) == 'D' + && peek(2) == 'O' + && peek(3) == 'C' + && peek(4) == 'T' + && peek(5) == 'Y' + && peek(6) == 'P' + && peek(7) == 'E') { + i += 7; + move_to_state(State::InDoctype); + break; + } + if (peek(1) == '-' && peek(2) == '-') { + i += 
2; + move_to_state(State::InComment); + break; + } break; } if (ch == '>') { @@ -188,6 +207,22 @@ NonnullRefPtr<Document> parse_html(const StringView& html, const URL& url) } tag_name_buffer.append(ch); break; + case State::InDoctype: + if (ch == '>') { + commit_doctype(); + move_to_state(State::Free); + break; + } + break; + case State::InComment: + if (ch == '-' && peek(1) == '-' && peek(2) == '>') { + commit_comment(); + i += 2; + move_to_state(State::Free); + break; + } + text_buffer.append(ch); + break; case State::InAttributeList: if (ch == '>') { commit_tag(); diff --git a/Libraries/LibHTML/TreeNode.h b/Libraries/LibHTML/TreeNode.h index 75f878fd80..2b498720f4 100644 --- a/Libraries/LibHTML/TreeNode.h +++ b/Libraries/LibHTML/TreeNode.h @@ -50,8 +50,10 @@ public: void append_child(NonnullRefPtr<T> node, bool call_inserted_into = true); void donate_all_children_to(T& node); + bool is_child_allowed(const T&) const { return true; } + protected: - TreeNode() { } + TreeNode() {} private: int m_ref_count { 1 }; @@ -66,6 +68,10 @@ template<typename T> inline void TreeNode<T>::append_child(NonnullRefPtr<T> node, bool call_inserted_into) { ASSERT(!node->m_parent); + + if (!static_cast<T*>(this)->is_child_allowed(*node)) + return; + if (m_last_child) m_last_child->m_next_sibling = node.ptr(); node->m_previous_sibling = m_last_child; @@ -82,6 +88,10 @@ template<typename T> inline void TreeNode<T>::prepend_child(NonnullRefPtr<T> node, bool call_inserted_into) { ASSERT(!node->m_parent); + + if (!static_cast<T*>(this)->is_child_allowed(*node)) + return; + if (m_first_child) m_first_child->m_previous_sibling = node.ptr(); node->m_next_sibling = m_first_child; @@ -112,7 +122,6 @@ inline void TreeNode<T>::donate_all_children_to(T& node) m_last_child = nullptr; } - template<typename T> inline bool TreeNode<T>::is_ancestor_of(const TreeNode<T>& other) const { |