summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Andreas Kling <awesomekling@gmail.com> 2019-10-12 23:26:47 +0200
committer Andreas Kling <awesomekling@gmail.com> 2019-10-12 23:34:05 +0200
commit b083a233d8482ed7106d7a102866ee0dd42acc94 (patch)
tree d32b52a7cc55fcd66dcaa94bb51f01e4bd04f398
parent 6d150df58a0b64c4cb583fc374fd05e60d387dc6 (diff)
download serenity-b083a233d8482ed7106d7a102866ee0dd42acc94.zip
LibHTML: Add Comment and CharacterData nodes and improve HTML parsing
This patch adds the CharacterData subclass of Node, which is now the parent class of Text and a new Comment class.

A Comment node is one of these in HTML: <!--hello friends-->

Since these occur somewhat frequently on the web, we need to be able to parse them.

This patch also adds a child rejection mechanism to the DOM tree. Nodes can now override is_child_allowed(Node) and return false if they don't want a particular Node to become a child of theirs. This is used to prevent Document from taking on unwanted children.
-rw-r--r-- Base/home/anon/www/welcome.html 2
-rw-r--r-- Libraries/LibHTML/DOM/CharacterData.cpp 11
-rw-r--r-- Libraries/LibHTML/DOM/CharacterData.h 25
-rw-r--r-- Libraries/LibHTML/DOM/Comment.cpp 11
-rw-r--r-- Libraries/LibHTML/DOM/Comment.h 18
-rw-r--r-- Libraries/LibHTML/DOM/Document.cpp 17
-rw-r--r-- Libraries/LibHTML/DOM/Document.h 2
-rw-r--r-- Libraries/LibHTML/DOM/DocumentType.h 2
-rw-r--r-- Libraries/LibHTML/DOM/Node.h 5
-rw-r--r-- Libraries/LibHTML/DOM/Text.cpp 3
-rw-r--r-- Libraries/LibHTML/DOM/Text.h 10
-rw-r--r-- Libraries/LibHTML/Dump.cpp 3
-rw-r--r-- Libraries/LibHTML/Makefile.shared 2
-rw-r--r-- Libraries/LibHTML/Parser/HTMLParser.cpp 59
-rw-r--r-- Libraries/LibHTML/TreeNode.h 13
15 files changed, 158 insertions, 25 deletions
diff --git a/Base/home/anon/www/welcome.html b/Base/home/anon/www/welcome.html
index 24602f109e..5f6739bd16 100644
--- a/Base/home/anon/www/welcome.html
+++ b/Base/home/anon/www/welcome.html
@@ -1,6 +1,8 @@
+<!DOCTYPE html>
<html>
<head>
<title>Welcome!</title>
+<!-- this is a comment -->
<style type="text/css">
body {
background-color: #fff;
diff --git a/Libraries/LibHTML/DOM/CharacterData.cpp b/Libraries/LibHTML/DOM/CharacterData.cpp
new file mode 100644
index 0000000000..e4b9d74e13
--- /dev/null
+++ b/Libraries/LibHTML/DOM/CharacterData.cpp
@@ -0,0 +1,11 @@
+#include <LibHTML/DOM/CharacterData.h>
+
+CharacterData::CharacterData(Document& document, NodeType type, const String& data)
+ : Node(document, type)
+ , m_data(data)
+{
+}
+
+CharacterData::~CharacterData()
+{
+}
diff --git a/Libraries/LibHTML/DOM/CharacterData.h b/Libraries/LibHTML/DOM/CharacterData.h
new file mode 100644
index 0000000000..34170ef6b9
--- /dev/null
+++ b/Libraries/LibHTML/DOM/CharacterData.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <AK/String.h>
+#include <LibHTML/DOM/Node.h>
+
+class CharacterData : public Node {
+public:
+ virtual ~CharacterData() override;
+
+ const String& data() const { return m_data; }
+
+ virtual String text_content() const override { return m_data; }
+
+protected:
+ explicit CharacterData(Document&, NodeType, const String&);
+
+private:
+ String m_data;
+};
+
+template<>
+inline bool is<CharacterData>(const Node& node)
+{
+ return node.is_character_data();
+}
diff --git a/Libraries/LibHTML/DOM/Comment.cpp b/Libraries/LibHTML/DOM/Comment.cpp
new file mode 100644
index 0000000000..3d97339e49
--- /dev/null
+++ b/Libraries/LibHTML/DOM/Comment.cpp
@@ -0,0 +1,11 @@
+#include <LibHTML/DOM/Comment.h>
+#include <LibHTML/Layout/LayoutText.h>
+
+Comment::Comment(Document& document, const String& data)
+ : CharacterData(document, NodeType::COMMENT_NODE, data)
+{
+}
+
+Comment::~Comment()
+{
+}
diff --git a/Libraries/LibHTML/DOM/Comment.h b/Libraries/LibHTML/DOM/Comment.h
new file mode 100644
index 0000000000..39d0cd4cad
--- /dev/null
+++ b/Libraries/LibHTML/DOM/Comment.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include <AK/String.h>
+#include <LibHTML/DOM/CharacterData.h>
+
+class Comment final : public CharacterData {
+public:
+ explicit Comment(Document&, const String&);
+ virtual ~Comment() override;
+
+ virtual String tag_name() const override { return "#comment"; }
+};
+
+template<>
+inline bool is<Comment>(const Node& node)
+{
+ return node.is_comment();
+}
diff --git a/Libraries/LibHTML/DOM/Document.cpp b/Libraries/LibHTML/DOM/Document.cpp
index 8b4af6a0bd..4262bca329 100644
--- a/Libraries/LibHTML/DOM/Document.cpp
+++ b/Libraries/LibHTML/DOM/Document.cpp
@@ -29,6 +29,23 @@ StyleResolver& Document::style_resolver()
return *m_style_resolver;
}
+bool Document::is_child_allowed(const Node& node) const
+{
+ switch (node.type()) {
+ case NodeType::DOCUMENT_NODE:
+ case NodeType::TEXT_NODE:
+ return false;
+ case NodeType::COMMENT_NODE:
+ return true;
+ case NodeType::DOCUMENT_TYPE_NODE:
+ return !first_child_of_type<DocumentType>();
+ case NodeType::ELEMENT_NODE:
+ return !first_child_of_type<Element>();
+ default:
+ return false;
+ }
+}
+
void Document::fixup()
{
if (!is<DocumentType>(first_child()))
diff --git a/Libraries/LibHTML/DOM/Document.h b/Libraries/LibHTML/DOM/Document.h
index 485ed87cec..8f3947147c 100644
--- a/Libraries/LibHTML/DOM/Document.h
+++ b/Libraries/LibHTML/DOM/Document.h
@@ -67,6 +67,8 @@ public:
void invalidate_layout();
Function<void()> on_invalidate_layout;
+ virtual bool is_child_allowed(const Node&) const override;
+
private:
virtual RefPtr<LayoutNode> create_layout_node(const StyleResolver&, const StyleProperties* parent_style) const override;
diff --git a/Libraries/LibHTML/DOM/DocumentType.h b/Libraries/LibHTML/DOM/DocumentType.h
index c13bb5cbee..78f2eae5bf 100644
--- a/Libraries/LibHTML/DOM/DocumentType.h
+++ b/Libraries/LibHTML/DOM/DocumentType.h
@@ -7,7 +7,7 @@ public:
explicit DocumentType(Document&);
virtual ~DocumentType() override;
- virtual String tag_name() const override { return "!DOCTYPE"; }
+ virtual String tag_name() const override { return "#doctype"; }
};
template<>
diff --git a/Libraries/LibHTML/DOM/Node.h b/Libraries/LibHTML/DOM/Node.h
index 132bbcd63a..9efd4bf962 100644
--- a/Libraries/LibHTML/DOM/Node.h
+++ b/Libraries/LibHTML/DOM/Node.h
@@ -10,6 +10,7 @@ enum class NodeType : unsigned {
INVALID = 0,
ELEMENT_NODE = 1,
TEXT_NODE = 3,
+ COMMENT_NODE = 8,
DOCUMENT_NODE = 9,
DOCUMENT_TYPE_NODE = 10,
};
@@ -32,6 +33,8 @@ public:
bool is_text() const { return type() == NodeType::TEXT_NODE; }
bool is_document() const { return type() == NodeType::DOCUMENT_NODE; }
bool is_document_type() const { return type() == NodeType::DOCUMENT_TYPE_NODE; }
+ bool is_comment() const { return type() == NodeType::COMMENT_NODE; }
+ bool is_character_data() const { return type() == NodeType::TEXT_NODE || type() == NodeType::COMMENT_NODE; }
bool is_parent_node() const { return is_element() || is_document(); }
virtual RefPtr<LayoutNode> create_layout_node(const StyleResolver&, const StyleProperties* parent_style) const;
@@ -66,6 +69,8 @@ public:
const Element* previous_element_sibling() const;
const Element* next_element_sibling() const;
+ virtual bool is_child_allowed(const Node&) const { return true; }
+
protected:
Node(Document&, NodeType);
diff --git a/Libraries/LibHTML/DOM/Text.cpp b/Libraries/LibHTML/DOM/Text.cpp
index a066a767f6..4dc3d34ff9 100644
--- a/Libraries/LibHTML/DOM/Text.cpp
+++ b/Libraries/LibHTML/DOM/Text.cpp
@@ -2,8 +2,7 @@
#include <LibHTML/Layout/LayoutText.h>
Text::Text(Document& document, const String& data)
- : Node(document, NodeType::TEXT_NODE)
- , m_data(data)
+ : CharacterData(document, NodeType::TEXT_NODE, data)
{
}
diff --git a/Libraries/LibHTML/DOM/Text.h b/Libraries/LibHTML/DOM/Text.h
index c507dbb3ba..30dd7d02c8 100644
--- a/Libraries/LibHTML/DOM/Text.h
+++ b/Libraries/LibHTML/DOM/Text.h
@@ -1,23 +1,17 @@
#pragma once
#include <AK/String.h>
-#include <LibHTML/DOM/Node.h>
+#include <LibHTML/DOM/CharacterData.h>
-class Text final : public Node {
+class Text final : public CharacterData {
public:
explicit Text(Document&, const String&);
virtual ~Text() override;
- const String& data() const { return m_data; }
-
virtual String tag_name() const override { return "#text"; }
- virtual String text_content() const override { return m_data; }
-
private:
virtual RefPtr<LayoutNode> create_layout_node(const StyleResolver&, const StyleProperties* parent_style) const override;
-
- String m_data;
};
template<>
diff --git a/Libraries/LibHTML/Dump.cpp b/Libraries/LibHTML/Dump.cpp
index 6533b58774..2857d43e31 100644
--- a/Libraries/LibHTML/Dump.cpp
+++ b/Libraries/LibHTML/Dump.cpp
@@ -1,5 +1,6 @@
#include <AK/Utf8View.h>
#include <LibHTML/CSS/StyleSheet.h>
+#include <LibHTML/DOM/Comment.h>
#include <LibHTML/DOM/Document.h>
#include <LibHTML/DOM/DocumentType.h>
#include <LibHTML/DOM/Element.h>
@@ -27,6 +28,8 @@ void dump_tree(const Node& node)
dbgprintf("\"%s\"\n", static_cast<const Text&>(node).data().characters());
} else if (is<DocumentType>(node)) {
dbgprintf("<!DOCTYPE>\n");
+ } else if (is<Comment>(node)) {
+ dbgprintf("<!--%s-->\n", to<Comment>(node).data().characters());
}
++indent;
if (is<ParentNode>(node)) {
diff --git a/Libraries/LibHTML/Makefile.shared b/Libraries/LibHTML/Makefile.shared
index 08fc3d5500..a3af6d90d2 100644
--- a/Libraries/LibHTML/Makefile.shared
+++ b/Libraries/LibHTML/Makefile.shared
@@ -17,6 +17,8 @@ LIBHTML_OBJS = \
DOM/HTMLBlinkElement.o \
DOM/HTMLBRElement.o \
DOM/Document.o \
+ DOM/CharacterData.o \
+ DOM/Comment.o \
DOM/Text.o \
DOM/DocumentType.o \
DOM/ElementFactory.o \
diff --git a/Libraries/LibHTML/Parser/HTMLParser.cpp b/Libraries/LibHTML/Parser/HTMLParser.cpp
index a69871c8a0..7f0a8c9c26 100644
--- a/Libraries/LibHTML/Parser/HTMLParser.cpp
+++ b/Libraries/LibHTML/Parser/HTMLParser.cpp
@@ -1,6 +1,7 @@
#include <AK/Function.h>
#include <AK/NonnullRefPtrVector.h>
#include <AK/StringBuilder.h>
+#include <LibHTML/DOM/Comment.h>
#include <LibHTML/DOM/DocumentType.h>
#include <LibHTML/DOM/Element.h>
#include <LibHTML/DOM/ElementFactory.h>
@@ -44,6 +45,8 @@ NonnullRefPtr<Document> parse_html(const StringView& html, const URL& url)
Free = 0,
BeforeTagName,
InTagName,
+ InDoctype,
+ InComment,
InAttributeList,
InAttributeName,
BeforeAttributeValue,
@@ -101,19 +104,16 @@ NonnullRefPtr<Document> parse_html(const StringView& html, const URL& url)
close_tag();
};
- auto handle_exclamation_tag = [&] {
- auto name = String::copy(tag_name_buffer);
- tag_name_buffer.clear();
- ASSERT(name == "DOCTYPE");
- if (node_stack.size() != 1)
- node_stack[node_stack.size() - 2].append_child(adopt(*new DocumentType(document)), false);
- close_tag();
+ auto commit_doctype = [&] {
+ node_stack.last().append_child(adopt(*new DocumentType(document)), false);
+ };
+
+ auto commit_comment = [&] {
+ node_stack.last().append_child(adopt(*new Comment(document, text_buffer.to_string())), false);
};
auto commit_tag = [&] {
- if (is_exclamation_tag)
- handle_exclamation_tag();
- else if (is_slash_tag)
+ if (is_slash_tag)
close_tag();
else
open_tag();
@@ -124,12 +124,16 @@ NonnullRefPtr<Document> parse_html(const StringView& html, const URL& url)
};
for (int i = 0; i < html.length(); ++i) {
+ auto peek = [&](int offset) -> char {
+ if (i + offset >= html.length())
+ return '\0';
+ return html[i + offset];
+ };
char ch = html[i];
switch (state) {
case State::Free:
if (ch == '<') {
is_slash_tag = false;
- is_exclamation_tag = false;
move_to_state(State::BeforeTagName);
break;
}
@@ -165,7 +169,22 @@ NonnullRefPtr<Document> parse_html(const StringView& html, const URL& url)
break;
}
if (ch == '!') {
- is_exclamation_tag = true;
+ if (peek(1) == 'D'
+ && peek(2) == 'O'
+ && peek(3) == 'C'
+ && peek(4) == 'T'
+ && peek(5) == 'Y'
+ && peek(6) == 'P'
+ && peek(7) == 'E') {
+ i += 7;
+ move_to_state(State::InDoctype);
+ break;
+ }
+ if (peek(1) == '-' && peek(2) == '-') {
+ i += 2;
+ move_to_state(State::InComment);
+ break;
+ }
break;
}
if (ch == '>') {
@@ -188,6 +207,22 @@ NonnullRefPtr<Document> parse_html(const StringView& html, const URL& url)
}
tag_name_buffer.append(ch);
break;
+ case State::InDoctype:
+ if (ch == '>') {
+ commit_doctype();
+ move_to_state(State::Free);
+ break;
+ }
+ break;
+ case State::InComment:
+ if (ch == '-' && peek(1) == '-' && peek(2) == '>') {
+ commit_comment();
+ i += 2;
+ move_to_state(State::Free);
+ break;
+ }
+ text_buffer.append(ch);
+ break;
case State::InAttributeList:
if (ch == '>') {
commit_tag();
diff --git a/Libraries/LibHTML/TreeNode.h b/Libraries/LibHTML/TreeNode.h
index 75f878fd80..2b498720f4 100644
--- a/Libraries/LibHTML/TreeNode.h
+++ b/Libraries/LibHTML/TreeNode.h
@@ -50,8 +50,10 @@ public:
void append_child(NonnullRefPtr<T> node, bool call_inserted_into = true);
void donate_all_children_to(T& node);
+ bool is_child_allowed(const T&) const { return true; }
+
protected:
- TreeNode() { }
+ TreeNode() {}
private:
int m_ref_count { 1 };
@@ -66,6 +68,10 @@ template<typename T>
inline void TreeNode<T>::append_child(NonnullRefPtr<T> node, bool call_inserted_into)
{
ASSERT(!node->m_parent);
+
+ if (!static_cast<T*>(this)->is_child_allowed(*node))
+ return;
+
if (m_last_child)
m_last_child->m_next_sibling = node.ptr();
node->m_previous_sibling = m_last_child;
@@ -82,6 +88,10 @@ template<typename T>
inline void TreeNode<T>::prepend_child(NonnullRefPtr<T> node, bool call_inserted_into)
{
ASSERT(!node->m_parent);
+
+ if (!static_cast<T*>(this)->is_child_allowed(*node))
+ return;
+
if (m_first_child)
m_first_child->m_previous_sibling = node.ptr();
node->m_next_sibling = m_first_child;
@@ -112,7 +122,6 @@ inline void TreeNode<T>::donate_all_children_to(T& node)
m_last_child = nullptr;
}
-
template<typename T>
inline bool TreeNode<T>::is_ancestor_of(const TreeNode<T>& other) const
{