summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Andreas Kling <awesomekling@gmail.com> 2019-06-15 20:20:55 +0200
committer Andreas Kling <awesomekling@gmail.com> 2019-06-15 20:21:57 +0200
commit 581d6b00c828cd6f5920bf7de5b63023d5b52436 (patch)
tree 302a408e91d1e6fe4ec606ea4baeaa10354cddce
parent a67e823838943b31fb7cea68bd592093e197cf16 (diff)
download serenity-581d6b00c828cd6f5920bf7de5b63023d5b52436.zip
LibHTML: Start working on a very simplified HTML parser.
-rw-r--r-- Base/home/anon/lorem.html 15
-rw-r--r-- Base/home/anon/small.html 6
-rw-r--r-- LibHTML/Parser.cpp 114
-rw-r--r-- LibHTML/test.cpp 12
4 files changed, 132 insertions, 15 deletions
diff --git a/Base/home/anon/lorem.html b/Base/home/anon/lorem.html
new file mode 100644
index 0000000000..cb8f3e26c8
--- /dev/null
+++ b/Base/home/anon/lorem.html
@@ -0,0 +1,15 @@
+<html>
+ <head><title>Lorem Ipsum</title></head>
+ <body>
+ <h1>Lorem Ipsum</h1>
+ <p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. In non elit dignissim, lobortis velit id, rutrum enim. Fusce urna nulla, semper in nisl consectetur, dictum dignissim felis. Vivamus mollis porttitor neque non pulvinar. Donec sollicitudin pulvinar nisi, nec vestibulum massa rutrum id. Aenean convallis tincidunt diam vel egestas. Pellentesque laoreet commodo arcu id dignissim. Etiam mattis elementum lectus, ut ultricies nibh dapibus sit amet. Curabitur sodales cursus ipsum vitae porttitor. Vestibulum ac nulla auctor, imperdiet augue accumsan, ornare eros.</p>
+
+ <p>Proin vel orci lobortis, ultrices nunc non, placerat odio. Proin nec nibh et odio pellentesque lobortis. Donec id urna ac sapien commodo facilisis in quis magna. Ut tempus aliquet elit, ut semper ante accumsan ornare. Morbi ac egestas quam. Pellentesque ut convallis metus, sit amet dignissim turpis. Sed feugiat hendrerit nibh, id tincidunt tortor euismod et. In vel fringilla ante. Etiam volutpat risus egestas congue sollicitudin.</p>
+
+ <p>Sed libero urna, fermentum quis leo at, lacinia suscipit ipsum. Vivamus in dignissim nibh. Proin ultricies sapien quis tortor luctus vehicula. Morbi ut consequat ipsum. Morbi imperdiet lectus libero, at tristique erat scelerisque sed. Duis eu risus at lectus vehicula facilisis. In tempor felis a nulla imperdiet volutpat. Quisque at auctor libero. Nunc ornare eros eget libero faucibus, vehicula ullamcorper erat laoreet. Aliquam dignissim eget est et aliquam. Phasellus imperdiet tincidunt mi, vitae viverra enim elementum a. Nullam pellentesque odio eu mauris bibendum tempor.</p>
+
+ <p>Sed mattis, elit eu pulvinar sagittis, ipsum enim interdum nisl, eu ornare augue orci at enim. Sed cursus, dolor in vestibulum maximus, mauris magna bibendum enim, in fringilla mauris metus vel nunc. Cras in quam mi. Nullam aliquam velit mauris, quis aliquet nulla pretium auctor. Donec non lobortis tellus. Nunc sodales libero id libero ultricies cursus. Cras ipsum nibh, dictum eu augue fermentum, blandit bibendum odio. Pellentesque tincidunt hendrerit aliquam. Donec sit amet justo vel magna pretium lobortis tempus vitae lorem. Maecenas quam purus, scelerisque dapibus lectus at, mattis tempus enim. Suspendisse ac ante turpis. Suspendisse aliquet, velit at hendrerit elementum, risus tortor accumsan est, quis luctus nisl sapien sed risus. Donec cursus ex diam, nec iaculis urna bibendum eget. Cras neque lacus, ornare eget elit eu, fringilla vestibulum velit. Phasellus lacinia condimentum enim accumsan aliquam. Nulla finibus ex elit, id semper erat posuere suscipit.</p>
+
+ <p>Integer at libero purus. Maecenas eu cursus nunc, vitae pellentesque sapien. Mauris auctor condimentum massa. Sed pharetra nibh varius leo rutrum, vel auctor tortor venenatis. Donec tincidunt tempus libero vel iaculis. Nam pretium non augue et pretium. Nunc dignissim tortor venenatis, blandit sem ac, mattis dolor. Donec et lacinia nunc. Vestibulum enim eros, aliquam pulvinar cursus ornare, volutpat eu mi. Quisque semper mi id metus elementum malesuada. Suspendisse nisl felis, pretium id consectetur quis, lacinia sit amet est.</p>
+ </body>
+</html>
diff --git a/Base/home/anon/small.html b/Base/home/anon/small.html
new file mode 100644
index 0000000000..f44d46836e
--- /dev/null
+++ b/Base/home/anon/small.html
@@ -0,0 +1,6 @@
+<html>
+ <head><title>Small test page</title></head>
+ <body>
+ <p>This is a <b>very small</b> test page :^)</p>
+ </body>
+</html>
diff --git a/LibHTML/Parser.cpp b/LibHTML/Parser.cpp
index 3bc6f2a99a..ea6dd65b95 100644
--- a/LibHTML/Parser.cpp
+++ b/LibHTML/Parser.cpp
@@ -1,32 +1,120 @@
#include <LibHTML/Element.h>
#include <LibHTML/Parser.h>
#include <LibHTML/Text.h>
+#include <ctype.h>
static Retained<Element> create_element(const String& tag_name) // Builds a new Element named tag_name and returns it as a Retained<Element>.
{
return adopt(*new Element(tag_name)); // adopt() wraps the freshly allocated Element in the Retained handle that is returned.
}
+static bool is_self_closing_tag(const String& tag_name) // Returns true when tag_name is one of the 14 names listed below (the HTML void elements, which take no end tag).
+{
+ return tag_name == "area" // Plain operator== against lowercase literals; NOTE(review): presumably case-sensitive, so "BR" would not match - confirm against String::operator==.
+ || tag_name == "base"
+ || tag_name == "br"
+ || tag_name == "col"
+ || tag_name == "embed"
+ || tag_name == "hr"
+ || tag_name == "img"
+ || tag_name == "input"
+ || tag_name == "link"
+ || tag_name == "meta"
+ || tag_name == "param"
+ || tag_name == "source"
+ || tag_name == "track"
+ || tag_name == "wbr";
+}
+
Retained<Document> parse(const String& html) // Scans html one character at a time with a small state machine and returns the Document tree of Elements and Text nodes it builds.
{
+ Vector<Retained<ParentNode>> node_stack; // Stack of currently open parent nodes; the last entry receives new children.
+
auto doc = adopt(*new Document);
+ node_stack.append(doc); // The document is the bottom entry and is never popped (see close_tag).
+
+ enum class State {
+ Free, // Outside any tag: characters are collected as text.
+ BeforeTagName, // Just after '<', before the first character of the tag name.
+ InTagName, // Collecting the tag name into buffer.
+ InAttributeList, // NOTE(review): entered from InTagName on ' ', but this and the three states below have no case in the switch, so the parser never leaves it.
+ InAttributeName,
+ InAttributeValueNoQuote,
+ InAttributeValueSingleQuote,
+ InAttributeValueDoubleQuote,
+ };
- auto head = create_element("head");
- auto title = create_element("title");
- auto title_text = adopt(*new Text("Page Title"));
- title->append_child(title_text);
- head->append_child(title);
+ auto state = State::Free;
- doc->append_child(head);
+ Vector<char, 256> buffer; // Accumulates the characters of the current text run or tag name.
- auto body = create_element("body");
- auto h1 = create_element("h1");
- auto h1_text = adopt(*new Text("Hello World!"));
+ bool is_slash_tag = false; // Set when '/' is seen right after '<', i.e. the tag is a closing tag.
- h1->append_child(h1_text);
- body->append_child(h1);
- doc->append_child(body);
+ auto move_to_state = [&](State new_state) { // Switches state; flushes pending text when leaving Free and always clears buffer.
+ if (new_state == State::BeforeTagName)
+ is_slash_tag = false; // Every new tag starts out as an opening tag.
+ if (state == State::Free && !buffer.is_empty()) { // Leaving text mode with collected characters (whitespace between tags counts too).
+ auto text_node = adopt(*new Text(String::copy(buffer)));
+ node_stack.last()->append_child(text_node); // Text becomes a child of the innermost open node.
+ }
+ state = new_state;
+ buffer.clear();
+ };
+ auto close_tag = [&] { // Pops the innermost open node; the tag name is not compared against it.
+ if (node_stack.size() > 1) // Keep the document itself on the stack.
+ node_stack.take_last();
+ };
+
+ auto open_tag = [&] { // Creates an element named by buffer, attaches it to the current parent and makes it the innermost open node.
+ auto new_element = create_element(String::copy(buffer));
+ node_stack.append(new_element);
+ if (node_stack.size() != 1) // Holds whenever the document is on the stack, since the new element was just pushed above it.
+ node_stack[node_stack.size() - 2]->append_child(new_element); // Parent is the entry directly below the new element.
+
+ if (is_self_closing_tag(new_element->tag_name()))
+ close_tag(); // Elements without an end tag are popped right away so later nodes become their siblings.
+ };
+
+ for (int i = 0; i < html.length(); ++i) {
+ char ch = html[i];
+ switch (state) {
+ case State::Free:
+ if (ch == '<') {
+ move_to_state(State::BeforeTagName); // Flushes any collected text as a Text node.
+ break;
+ }
+ buffer.append(ch);
+ break;
+ case State::BeforeTagName:
+ if (ch == '/') {
+ is_slash_tag = true;
+ break;
+ }
+ if (ch == '>') { // "<>" or "</>": no tag name, go back to text mode.
+ move_to_state(State::Free);
+ break;
+ }
+ if (!isascii(ch)) // Non-ASCII characters are skipped here; NOTE(review): every ASCII character, including ' ' and digits, starts a tag name.
+ break;
+ move_to_state(State::InTagName);
+ [[fallthrough]]; // The current character is handled as the first tag-name character.
+ case State::InTagName:
+ if (ch == ' ') {
+ move_to_state(State::InAttributeList); // NOTE(review): clears the tag name without calling open_tag/close_tag; with no case for this state, the rest of the input is consumed without effect.
+ break;
+ }
+ if (ch == '>') {
+ if (is_slash_tag)
+ close_tag();
+ else
+ open_tag();
+ move_to_state(State::Free); // No text flush happens here because the previous state is InTagName.
+ break;
+ }
+ buffer.append(ch);
+ break;
+ }
+ } // NOTE(review): text still in buffer when the input ends is not turned into a Text node, since only move_to_state creates them.
return doc;
}
-
diff --git a/LibHTML/test.cpp b/LibHTML/test.cpp
index 9d774335c2..ebeda969e6 100644
--- a/LibHTML/test.cpp
+++ b/LibHTML/test.cpp
@@ -1,10 +1,18 @@
+#include <LibCore/CFile.h>
#include <LibHTML/Dump.h>
#include <LibHTML/Element.h>
#include <LibHTML/Parser.h>
+#include <stdio.h>
-int main()
+int main(int argc, char** argv) // Test driver: reads an HTML file, parses it and dumps the resulting tree.
{
- String html = "<html><head><title>my page</title></head><body><h1>Hi there</h1><p>Hello World!</p></body></html>";
+ CFile f(argc == 1 ? "/home/anon/small.html" : argv[1]); // With no arguments the bundled small.html is used; otherwise the first argument is the path.
+ if (!f.open(CIODevice::ReadOnly)) {
+ fprintf(stderr, "Error: %s\n", f.error_string());
+ return 1; // Exit status 1 when the file cannot be opened.
+ }
+ String html = String::copy(f.read_all()); // Copies the whole file contents into a String for the parser.
auto doc = parse(html);
dump_tree(doc);
+ return 0;
}