LibWeb: Start building the tree building part of the new HTML parser

This patch adds a new HTMLDocumentParser class. It keeps a tokenizer object internally and feeds itself with one token at a time from it. The names and idioms in this class are expressed as closely to the actual HTML parsing spec as possible, to make development as easy and bug free as possible. :^) This is going to become pretty large, but it's pretty cool!
author: Andreas Kling <kling@serenityos.org> 2020-05-24 00:14:23 +0200
committer: Andreas Kling <kling@serenityos.org> 2020-05-24 00:14:23 +0200
commit: fd1b31d0ff26775c6ebe125f6ff26f938598d655 (patch)
tree: 925f27756aa6fb01d65d38cdfd66271ddcb5617e
parent: 0b61e21873b9459c3ff217fa79956b357dd67d63 (diff)
download: serenity-fd1b31d0ff26775c6ebe125f6ff26f938598d655.zip
8 files changed, 515 insertions, 76 deletions
diff --git a/Libraries/LibWeb/CMakeLists.txt b/Libraries/LibWeb/CMakeLists.txt
index 631ff1e534..e2789da5aa 100644
--- a/Libraries/LibWeb/CMakeLists.txt
+++ b/Libraries/LibWeb/CMakeLists.txt
@@ -84,7 +84,9 @@ set(SOURCES
     Layout/LineBox.cpp
     Layout/LineBoxFragment.cpp
     Parser/CSSParser.cpp
+    Parser/HTMLDocumentParser.cpp
     Parser/HTMLParser.cpp
+    Parser/HTMLToken.cpp
     Parser/HTMLTokenizer.cpp
     ResourceLoader.cpp
     StylePropertiesModel.cpp
diff --git a/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp b/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp
new file mode 100644
index 0000000000..3d61d29004
--- /dev/null
+++ b/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <LibWeb/DOM/Document.h>
+#include <LibWeb/DOM/DocumentType.h>
+#include <LibWeb/DOM/ElementFactory.h>
+#include <LibWeb/DOM/HTMLFormElement.h>
+#include <LibWeb/DOM/HTMLHeadElement.h>
+#include <LibWeb/Parser/HTMLDocumentParser.h>
+#include <LibWeb/Parser/HTMLToken.h>
+
+namespace Web {
+
+HTMLDocumentParser::HTMLDocumentParser(const StringView& input)
+    : m_tokenizer(input) // The internal tokenizer is constructed from the caller's input.
+{
+}
+
+HTMLDocumentParser::~HTMLDocumentParser()
+{
+}
+
+void HTMLDocumentParser::run()
+{
+    m_document = adopt(*new Document); // Every run() starts from a freshly allocated Document.
+
+    for (;;) {
+        auto optional_token = m_tokenizer.next_token();
+        if (!optional_token.has_value())
+            return; // The tokenizer produced no token: stop.
+        auto& token = optional_token.value();
+
+        dbg() << "[" << insertion_mode_name() << "] " << token.to_string(); // Trace each token next to the mode that will receive it.
+
+        if (token.type() == HTMLToken::Type::EndOfFile)
+            return; // EndOfFile ends the loop without being dispatched to a handler.
+
+        switch (m_insertion_mode) { // Dispatch the token to the handler for the current insertion mode.
+        case InsertionMode::Initial:
+            handle_initial(token);
+            break;
+        case InsertionMode::BeforeHTML:
+            handle_before_html(token);
+            break;
+        case InsertionMode::BeforeHead:
+            handle_before_head(token);
+            break;
+        case InsertionMode::InHead:
+            handle_in_head(token);
+            break;
+        case InsertionMode::InHeadNoscript:
+            handle_in_head_noscript(token);
+            break;
+        case InsertionMode::AfterHead:
+            handle_after_head(token);
+            break;
+        case InsertionMode::InBody:
+            handle_in_body(token);
+            break;
+        case InsertionMode::Text:
+            handle_text(token);
+            break;
+        default:
+            ASSERT_NOT_REACHED(); // The remaining insertion modes have no handler yet.
+        }
+    }
+}
+
+void HTMLDocumentParser::handle_initial(HTMLToken& token)
+{
+    if (token.type() == HTMLToken::Type::DOCTYPE) { // DOCTYPE is the only token this mode handles so far.
+        auto doctype = adopt(*new DocumentType(document()));
+        doctype->set_name(token.m_doctype.name.to_string()); // Carry the name over from the token.
+        document().append_child(move(doctype));
+        m_insertion_mode = InsertionMode::BeforeHTML;
+        return;
+    }
+    ASSERT_NOT_REACHED(); // Any non-DOCTYPE token is unhandled for now.
+}
+
+void HTMLDocumentParser::handle_before_html(HTMLToken& token)
+{
+    if (token.is_start_tag() && token.tag_name() == "html") { // An <html> start tag is the only token handled so far.
+        auto element = create_element_for(token);
+        document().append_child(element); // The element is attached directly to the document...
+        m_stack_of_open_elements.append(element); // ...and pushed onto the stack of open elements.
+        m_insertion_mode = InsertionMode::BeforeHead;
+        return;
+    }
+    ASSERT_NOT_REACHED(); // Any other token is unhandled for now.
+}
+
+NonnullRefPtr<Node> HTMLDocumentParser::current_node()
+{
+    return m_stack_of_open_elements.last(); // Most recently pushed open element. NOTE(review): presumably requires a non-empty stack; confirm.
+}
+
+RefPtr<Node> HTMLDocumentParser::find_appropriate_place_for_inserting_node()
+{
+    auto target = current_node(); // The insertion target is always the current node for now.
+    if (m_foster_parenting) {
+        ASSERT_NOT_REACHED(); // Foster parenting is not implemented yet.
+    }
+    return target;
+}
+
+NonnullRefPtr<Element> HTMLDocumentParser::create_element_for(HTMLToken& token)
+{
+    auto element = create_element(document(), token.tag_name()); // Created from the token's tag name; not inserted anywhere here.
+    for (auto& attribute : token.m_tag.attributes) {
+        element->set_attribute(attribute.name_builder.to_string(), attribute.value_builder.to_string()); // Copy each token attribute onto the element.
+    }
+    return element;
+}
+
+RefPtr<Element> HTMLDocumentParser::insert_html_element(HTMLToken& token)
+{
+    auto adjusted_insertion_location = find_appropriate_place_for_inserting_node();
+    auto element = create_element_for(token);
+    // FIXME: Check if it's possible to insert `element` at `adjusted_insertion_location`
+    adjusted_insertion_location->append_child(element); // Always appended as a child of the insertion location.
+    m_stack_of_open_elements.append(element); // The new element becomes the current node.
+    return element;
+}
+
+void HTMLDocumentParser::handle_before_head(HTMLToken& token)
+{
+    if (token.is_start_tag() && token.tag_name() == "head") { // A <head> start tag is the only token handled so far.
+        auto element = insert_html_element(token);
+        m_head_element = to<HTMLHeadElement>(element); // Remember the inserted element as the head element.
+        m_insertion_mode = InsertionMode::InHead;
+        return;
+    }
+    ASSERT_NOT_REACHED(); // Any other token is unhandled for now.
+}
+
+void HTMLDocumentParser::handle_in_head(HTMLToken& token)
+{
+    if (token.is_start_tag() && token.tag_name() == "meta") {
+        insert_html_element(token); // The returned element is not needed here.
+        m_stack_of_open_elements.take_last(); // Pop <meta> again right away so it never stays open.
+        if (token.is_self_closing()) {
+            ASSERT_NOT_REACHED(); // Self-closing <meta/> is not handled yet.
+        }
+        return;
+    }
+    if (token.is_end_tag() && token.tag_name() == "head") {
+        m_stack_of_open_elements.take_last(); // Pop the current node. NOTE(review): presumably the <head> element; not checked here.
+        m_insertion_mode = InsertionMode::AfterHead;
+        return;
+    }
+    ASSERT_NOT_REACHED(); // Any other token is unhandled in this mode for now.
+}
+
+void HTMLDocumentParser::handle_in_head_noscript(HTMLToken&)
+{
+    ASSERT_NOT_REACHED(); // Stub: no token is handled in this mode yet.
+}
+
+void HTMLDocumentParser::handle_after_head(HTMLToken& token)
+{
+    if (token.is_character()) {
+        ASSERT_NOT_REACHED(); // Not handled yet; the same goes for every asserting branch below.
+    }
+
+    if (token.is_comment()) {
+        ASSERT_NOT_REACHED();
+    }
+
+    if (token.is_doctype()) {
+        ASSERT_NOT_REACHED();
+    }
+
+    if (token.is_start_tag() && token.tag_name() == "html") {
+        ASSERT_NOT_REACHED();
+    }
+
+    if (token.is_start_tag() && token.tag_name() == "body") {
+        ASSERT_NOT_REACHED();
+    }
+
+    if (token.is_start_tag() && token.tag_name() == "frameset") {
+        ASSERT_NOT_REACHED();
+    }
+
+    {
+        Vector<String> names = { "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "template", "title" };
+        if (token.is_start_tag() && names.contains_slow(token.tag_name())) { // The spec defines this rule for start tags, not end tags.
+            ASSERT_NOT_REACHED();
+        }
+    }
+
+    if (token.is_end_tag() && token.tag_name() == "template") { // The only end tag with a rule of its own in this mode.
+        ASSERT_NOT_REACHED();
+    }
+
+    if (token.is_end_tag() && (token.tag_name() == "body" || token.tag_name() == "html" || token.tag_name() == "br")) {
+        goto AnythingElse; // These three end tags share the default action below.
+    }
+
+    if ((token.is_start_tag() && token.tag_name() == "head") || token.is_end_tag()) {
+        ASSERT_NOT_REACHED();
+    }
+
+AnythingElse:
+    HTMLToken fake_body_token; // Synthesize a <body> start tag with no attributes.
+    fake_body_token.m_type = HTMLToken::Type::StartTag;
+    fake_body_token.m_tag.tag_name.append("body");
+    insert_html_element(fake_body_token);
+    m_insertion_mode = InsertionMode::InBody; // NOTE(review): the spec also reprocesses the current token in the new mode; not done here.
+}
+
+void HTMLDocumentParser::handle_in_body(HTMLToken&)
+{
+    ASSERT_NOT_REACHED(); // Stub: no token is handled in this mode yet.
+}
+
+void HTMLDocumentParser::handle_text(HTMLToken&)
+{
+    ASSERT_NOT_REACHED(); // Stub: no token is handled in this mode yet.
+}
+
+const char* HTMLDocumentParser::insertion_mode_name() const
+{
+    switch (m_insertion_mode) { // Expands to one case per listed insertion mode, each returning the mode's name as a string literal.
+#define __ENUMERATE_INSERTION_MODE(mode) \
+    case InsertionMode::mode:            \
+        return #mode;
+        ENUMERATE_INSERTION_MODES
+#undef __ENUMERATE_INSERTION_MODE
+    }
+    ASSERT_NOT_REACHED(); // Every enumerator returns above, so this is only hit for an out-of-range value.
+}
+
+Document& HTMLDocumentParser::document()
+{
+    return *m_document; // m_document is assigned in run(). NOTE(review): calling this before run() would dereference a null RefPtr; confirm callers.
+}
+
+}
diff --git a/Libraries/LibWeb/Parser/HTMLDocumentParser.h b/Libraries/LibWeb/Parser/HTMLDocumentParser.h
new file mode 100644
index 0000000000..060cc31d45
--- /dev/null
+++ b/Libraries/LibWeb/Parser/HTMLDocumentParser.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <AK/NonnullRefPtrVector.h>
+#include <LibWeb/DOM/Node.h>
+#include <LibWeb/Parser/HTMLTokenizer.h>
+// X-macro list of parser insertion modes: define __ENUMERATE_INSERTION_MODE(mode), expand this list, then #undef it.
+#define ENUMERATE_INSERTION_MODES               \
+    __ENUMERATE_INSERTION_MODE(Initial)         \
+    __ENUMERATE_INSERTION_MODE(BeforeHTML)      \
+    __ENUMERATE_INSERTION_MODE(BeforeHead)      \
+    __ENUMERATE_INSERTION_MODE(InHead)          \
+    __ENUMERATE_INSERTION_MODE(InHeadNoscript)  \
+    __ENUMERATE_INSERTION_MODE(AfterHead)       \
+    __ENUMERATE_INSERTION_MODE(InBody)          \
+    __ENUMERATE_INSERTION_MODE(Text)            \
+    __ENUMERATE_INSERTION_MODE(InTable)         \
+    __ENUMERATE_INSERTION_MODE(InTableText)     \
+    __ENUMERATE_INSERTION_MODE(InCaption)       \
+    __ENUMERATE_INSERTION_MODE(InColumnGroup)   \
+    __ENUMERATE_INSERTION_MODE(InTableBody)     \
+    __ENUMERATE_INSERTION_MODE(InRow)           \
+    __ENUMERATE_INSERTION_MODE(InCell)          \
+    __ENUMERATE_INSERTION_MODE(InSelect)        \
+    __ENUMERATE_INSERTION_MODE(InSelectInTable) \
+    __ENUMERATE_INSERTION_MODE(InTemplate)      \
+    __ENUMERATE_INSERTION_MODE(AfterBody)       \
+    __ENUMERATE_INSERTION_MODE(InFrameset)      \
+    __ENUMERATE_INSERTION_MODE(AfterFrameset)   \
+    __ENUMERATE_INSERTION_MODE(AfterAfterBody)  \
+    __ENUMERATE_INSERTION_MODE(AfterAfterFrameset)
+
+namespace Web {
+
+class HTMLDocumentParser {
+public:
+    explicit HTMLDocumentParser(const StringView& input);
+    ~HTMLDocumentParser();
+
+    void run(); // Pulls tokens from the tokenizer one at a time and dispatches them by insertion mode.
+
+    Document& document(); // The document that run() creates and populates.
+
+    enum class InsertionMode { // One enumerator per ENUMERATE_INSERTION_MODES entry.
+#define __ENUMERATE_INSERTION_MODE(mode) mode,
+        ENUMERATE_INSERTION_MODES
+#undef __ENUMERATE_INSERTION_MODE
+    };
+
+    InsertionMode insertion_mode() const { return m_insertion_mode; }
+
+private:
+    const char* insertion_mode_name() const; // Name of the current mode, used in debug output.
+
+    void handle_initial(HTMLToken&); // One handle_*() per insertion mode that run() dispatches to.
+    void handle_before_html(HTMLToken&);
+    void handle_before_head(HTMLToken&);
+    void handle_in_head(HTMLToken&);
+    void handle_in_head_noscript(HTMLToken&);
+    void handle_after_head(HTMLToken&);
+    void handle_in_body(HTMLToken&);
+    void handle_text(HTMLToken&);
+
+    NonnullRefPtr<Element> create_element_for(HTMLToken&); // Creates an element from a tag token without inserting it.
+    RefPtr<Node> find_appropriate_place_for_inserting_node();
+    RefPtr<Element> insert_html_element(HTMLToken&); // Creates, appends and pushes an element for a tag token.
+    NonnullRefPtr<Node> current_node(); // Last entry of m_stack_of_open_elements.
+
+    InsertionMode m_insertion_mode { InsertionMode::Initial };
+    NonnullRefPtrVector<Node> m_stack_of_open_elements; // Used as a stack: pushed with append(), popped with take_last().
+
+    HTMLTokenizer m_tokenizer;
+
+    bool m_foster_parenting { false }; // Never set to true yet; the foster-parenting path is unimplemented.
+
+    RefPtr<Document> m_document; // Assigned by run().
+    RefPtr<HTMLHeadElement> m_head_element; // Set when the <head> start tag is handled.
+    RefPtr<HTMLFormElement> m_form_element; // NOTE(review): not referenced by the parser code in this patch.
+};
+
+}
diff --git a/Libraries/LibWeb/Parser/HTMLToken.cpp b/Libraries/LibWeb/Parser/HTMLToken.cpp
new file mode 100644
index 0000000000..88cf5991ca
--- /dev/null
+++ b/Libraries/LibWeb/Parser/HTMLToken.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <LibWeb/Parser/HTMLToken.h>
+
+namespace Web {
+
+// Renders this token as a short human-readable string for debug tracing:
+// the type name, then the DOCTYPE name, the tag name with its attributes,
+// or the comment/character data, depending on the token type.
+String HTMLToken::to_string() const
+{
+    StringBuilder builder;
+
+    switch (type()) {
+    case HTMLToken::Type::DOCTYPE:
+        builder.append("DOCTYPE");
+        builder.append(" { name: '");
+        builder.append(m_doctype.name.to_string());
+        builder.append("' }");
+        break;
+    case HTMLToken::Type::StartTag:
+        builder.append("StartTag");
+        break;
+    case HTMLToken::Type::EndTag:
+        builder.append("EndTag");
+        break;
+    case HTMLToken::Type::Comment:
+        builder.append("Comment");
+        break;
+    case HTMLToken::Type::Character:
+        builder.append("Character");
+        break;
+    case HTMLToken::Type::EndOfFile:
+        builder.append("EndOfFile");
+        break;
+    }
+
+    if (type() == HTMLToken::Type::StartTag || type() == HTMLToken::Type::EndTag) {
+        builder.append(" { name: '");
+        builder.append(m_tag.tag_name.to_string());
+        builder.append("', { ");
+        for (auto& attribute : m_tag.attributes) {
+            builder.append(attribute.name_builder.to_string());
+            builder.append("=\"");
+            builder.append(attribute.value_builder.to_string());
+            builder.append("\" ");
+        }
+        builder.append("} }");
+    }
+
+    if (type() == HTMLToken::Type::Comment || type() == HTMLToken::Type::Character) {
+        builder.append(" { data: '");
+        builder.append(m_comment_or_character.data.to_string());
+        builder.append("' }"); // Close the quote opened above, matching the DOCTYPE branch.
+    }
+
+    return builder.to_string();
+}
+
+}
diff --git a/Libraries/LibWeb/Parser/HTMLToken.h b/Libraries/LibWeb/Parser/HTMLToken.h
index 13c773d270..e956c4ac27 100644
--- a/Libraries/LibWeb/Parser/HTMLToken.h
+++ b/Libraries/LibWeb/Parser/HTMLToken.h
@@ -34,6 +34,7 @@
 namespace Web {
 
 class HTMLToken {
+    friend class HTMLDocumentParser;
     friend class HTMLTokenizer;
 
 public:
@@ -46,8 +47,29 @@ public:
         EndOfFile,
     };
 
+    bool is_doctype() const { return m_type == Type::DOCTYPE; } // Type predicates: each compares m_type against exactly one Type value.
+    bool is_start_tag() const { return m_type == Type::StartTag; }
+    bool is_end_tag() const { return m_type == Type::EndTag; }
+    bool is_comment() const { return m_type == Type::Comment; }
+    bool is_character() const { return m_type == Type::Character; }
+    bool is_end_of_file() const { return m_type == Type::EndOfFile; }
+
+    String tag_name() const
+    {
+        ASSERT(is_start_tag() || is_end_tag()); // Only valid on StartTag/EndTag tokens.
+        return m_tag.tag_name.to_string();
+    }
+
+    bool is_self_closing() const
+    {
+        ASSERT(is_start_tag() || is_end_tag()); // Only valid on StartTag/EndTag tokens.
+        return m_tag.self_closing;
+    }
+
     Type type() const { return m_type; }
 
+    String to_string() const;
+
 private:
     struct AttributeBuilder {
         StringBuilder name_builder;
diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
index b5835446d3..7badf5af65 100644
--- a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
+++ b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
@@ -28,6 +28,8 @@
 #include <LibWeb/Parser/HTMLTokenizer.h>
 #include <ctype.h>
 
+#pragma GCC diagnostic ignored "-Wunused-label"
+
 //#define TOKENIZER_TRACE
 
 #define TODO()                                                                                              \
@@ -47,6 +49,11 @@
     m_state = State::new_state;          \
     goto new_state;
 
+#define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state) \
+    will_switch_to(State::new_state);               \
+    m_state = State::new_state;                     \
+    return m_current_token;
+
 #define DONT_CONSUME_NEXT_INPUT_CHARACTER --m_cursor;
 
 #define ON(codepoint) \
@@ -66,10 +73,12 @@
 
 #define ANYTHING_ELSE if (1)
 
-#define EMIT_EOF_AND_RETURN                       \
+#define EMIT_EOF                                  \
     create_new_token(HTMLToken::Type::EndOfFile); \
-    emit_current_token();                         \
-    return;
+    return m_current_token;
+
+#define EMIT_CURRENT_TOKEN \
+    return m_current_token;
 
 #define BEGIN_STATE(state) \
     state:                 \
@@ -100,7 +109,7 @@ Optional<u32> HTMLTokenizer::peek_codepoint(size_t offset) const
     return m_input[m_cursor + offset];
 }
 
-void HTMLTokenizer::run()
+Optional<HTMLToken> HTMLTokenizer::next_token()
 {
     for (;;) {
         auto current_input_character = next_codepoint();
@@ -118,7 +127,7 @@ void HTMLTokenizer::run()
                 }
                 ON_EOF
                 {
-                    EMIT_EOF_AND_RETURN;
+                    EMIT_EOF;
                 }
                 ANYTHING_ELSE
                 {
@@ -168,8 +177,7 @@ void HTMLTokenizer::run()
                 }
                 ON('>')
                 {
-                    emit_current_token();
-                    SWITCH_TO(Data);
+                    SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
                 }
                 ANYTHING_ELSE
                 {
@@ -266,8 +274,7 @@ void HTMLTokenizer::run()
                 }
                 ON('>')
                 {
-                    emit_current_token();
-                    SWITCH_TO(Data);
+                    SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
                 }
                 ON_ASCII_UPPER_ALPHA
                 {
@@ -297,8 +304,7 @@ void HTMLTokenizer::run()
                 }
                 ON('>')
                 {
-                    emit_current_token();
-                    SWITCH_TO(Data);
+                    SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
                 }
                 ON_EOF
                 {
@@ -473,8 +479,7 @@ void HTMLTokenizer::run()
                 }
                 ON('>')
                 {
-                    emit_current_token();
-                    SWITCH_TO(Data);
+                    SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
                 }
                 ON(0)
                 {
@@ -504,8 +509,7 @@ void HTMLTokenizer::run()
                 }
                 ON('>')
                 {
-                    emit_current_token();
-                    SWITCH_TO(Data);
+                    SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
                 }
                 ON_EOF
                 {
@@ -588,8 +592,7 @@ void HTMLTokenizer::run()
             {
                 ON('>')
                 {
-                    emit_current_token();
-                    SWITCH_TO(Data);
+                    SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
                 }
                 ON('!')
                 {
@@ -741,57 +744,6 @@ bool HTMLTokenizer::next_few_characters_are(const StringView& string) const
     return true;
 }
 
-void HTMLTokenizer::emit_current_token()
-{
-    StringBuilder builder;
-
-    switch (m_current_token.type()) {
-    case HTMLToken::Type::DOCTYPE:
-        builder.append("DOCTYPE");
-        builder.append(" { name: '");
-        builder.append(m_current_token.m_doctype.name.to_string());
-        builder.append("' }");
-        break;
-    case HTMLToken::Type::StartTag:
-        builder.append("StartTag");
-        break;
-    case HTMLToken::Type::EndTag:
-        builder.append("EndTag");
-        break;
-    case HTMLToken::Type::Comment:
-        builder.append("Comment");
-        break;
-    case HTMLToken::Type::Character:
-        builder.append("Character");
-        break;
-    case HTMLToken::Type::EndOfFile:
-        builder.append("EndOfFile");
-        break;
-    }
-
-    if (m_current_token.type() == HTMLToken::Type::StartTag || m_current_token.type() == HTMLToken::Type::EndTag) {
-        builder.append(" { name: '");
-        builder.append(m_current_token.m_tag.tag_name.to_string());
-        builder.append("', { ");
-        for (auto& attribute : m_current_token.m_tag.attributes) {
-            builder.append(attribute.name_builder.to_string());
-            builder.append("=\"");
-            builder.append(attribute.value_builder.to_string());
-            builder.append("\" ");
-        }
-        builder.append("} }");
-    }
-
-    if (m_current_token.type() == HTMLToken::Type::Comment || m_current_token.type() == HTMLToken::Type::Character) {
-        builder.append(" { data: '");
-        builder.append(m_current_token.m_comment_or_character.data.to_string());
-        builder.append(" }");
-    }
-
-    dbg() << "[" << String::format("%42s", state_name(m_state)) << "] " << builder.to_string();
-    m_current_token = {};
-}
-
 void HTMLTokenizer::create_new_token(HTMLToken::Type type)
 {
     flush_current_character_or_comment_if_needed();
@@ -822,8 +774,8 @@ void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state)
 
 void HTMLTokenizer::flush_current_character_or_comment_if_needed()
 {
-    if (m_current_token.type() == HTMLToken::Type::Character || m_current_token.type() == HTMLToken::Type::Comment)
-        emit_current_token();
+    //if (m_current_token.type() == HTMLToken::Type::Character || m_current_token.type() == HTMLToken::Type::Comment)
+//        emit_current_token();
 }
 
 }
diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.h b/Libraries/LibWeb/Parser/HTMLTokenizer.h
index 5573cdd46c..2476e85be8 100644
--- a/Libraries/LibWeb/Parser/HTMLTokenizer.h
+++ b/Libraries/LibWeb/Parser/HTMLTokenizer.h
@@ -118,14 +118,13 @@ class HTMLTokenizer {
 public:
     explicit HTMLTokenizer(const StringView& input);
 
-    void run();
+    Optional<HTMLToken> next_token();
 
 private:
     Optional<u32> next_codepoint();
     Optional<u32> peek_codepoint(size_t offset) const;
     bool next_few_characters_are(const StringView&) const;
     void consume(const StringView&);
-    void emit_current_token();
     void create_new_token(HTMLToken::Type);
 
     enum class State {
diff --git a/Userland/ht.cpp b/Userland/ht.cpp
index 02c01913ce..03f33455a0 100644
--- a/Userland/ht.cpp
+++ b/Userland/ht.cpp
@@ -24,13 +24,19 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include <LibWeb/Parser/HTMLTokenizer.h>
-#include <LibCore/File.h>
 #include <AK/ByteBuffer.h>
 #include <AK/LogStream.h>
+#include <LibCore/EventLoop.h>
+#include <LibCore/File.h>
+#include <LibWeb/DOM/Document.h>
+#include <LibWeb/Dump.h>
+#include <LibWeb/Parser/HTMLDocumentParser.h>
+#include <LibWeb/Parser/HTMLTokenizer.h>
 
 int main(int argc, char** argv)
 {
+    Core::EventLoop loop;
+
     // This is a temporary test program to aid with bringing up the new HTML parser. :^)
     const char* input_path = "/home/anon/www/simple.html";
     if (argc > 1)
@@ -40,7 +46,12 @@ int main(int argc, char** argv)
     if (file_or_error.is_error())
         return 1;
     auto contents = file_or_error.value()->read_all();
-    Web::HTMLTokenizer tokenizer(contents);
-    tokenizer.run();
+
+    Web::HTMLDocumentParser parser(contents);
+    parser.run();
+
+    auto& document = parser.document();
+    Web::dump_tree(document);
+
     return 0;
 }
author	Andreas Kling <kling@serenityos.org>	2020-05-24 00:14:23 +0200
committer	Andreas Kling <kling@serenityos.org>	2020-05-24 00:14:23 +0200
commit	fd1b31d0ff26775c6ebe125f6ff26f938598d655 (patch)
tree	925f27756aa6fb01d65d38cdfd66271ddcb5617e
parent	0b61e21873b9459c3ff217fa79956b357dd67d63 (diff)
download	serenity-fd1b31d0ff26775c6ebe125f6ff26f938598d655.zip