diff options
author | Andreas Kling <kling@serenityos.org> | 2020-05-24 00:49:22 +0200 |
---|---|---|
committer | Andreas Kling <kling@serenityos.org> | 2020-05-24 00:49:22 +0200 |
commit | e44c87cfff3cf96b07994c1690646f468e28f2dc (patch) | |
tree | 4704e0ba13ec61e288d155aca3906e8ffaa59978 | |
parent | fd1b31d0ff26775c6ebe125f6ff26f938598d655 (diff) | |
download | serenity-e44c87cfff3cf96b07994c1690646f468e28f2dc.zip |
LibWeb: Implement enough HTML parsing to handle a small simple DOM :^)
We can now parse a little DOM like this:
```html
<!DOCTYPE html>
<html>
<head></head>
<body>
<div></div>
</body>
</html>
```
This is pretty slow work, but the incremental progress is satisfying!
-rw-r--r-- | Base/home/anon/www/simple.html | 3 | ||||
-rw-r--r-- | Libraries/LibWeb/Parser/HTMLDocumentParser.cpp | 100 | ||||
-rw-r--r-- | Libraries/LibWeb/Parser/HTMLDocumentParser.h | 6 | ||||
-rw-r--r-- | Libraries/LibWeb/Parser/HTMLTokenizer.cpp | 5 | ||||
-rw-r--r-- | Libraries/LibWeb/Parser/HTMLTokenizer.h | 2 |
5 files changed, 110 insertions, 6 deletions
diff --git a/Base/home/anon/www/simple.html b/Base/home/anon/www/simple.html index 1554113580..88f308a373 100644 --- a/Base/home/anon/www/simple.html +++ b/Base/home/anon/www/simple.html @@ -1,4 +1,7 @@ <!DOCTYPE html> <html> <head><meta name="greeting" content='Hello friends!' foo=bar></head> +<body> +<div></div> +</body> </html> diff --git a/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp b/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp index 3d61d29004..c132a750db 100644 --- a/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp +++ b/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp @@ -55,9 +55,6 @@ void HTMLDocumentParser::run() dbg() << "[" << insertion_mode_name() << "] " << token.to_string(); - if (token.type() == HTMLToken::Type::EndOfFile) - return; - switch (m_insertion_mode) { case InsertionMode::Initial: handle_initial(token); @@ -80,6 +77,12 @@ void HTMLDocumentParser::run() case InsertionMode::InBody: handle_in_body(token); break; + case InsertionMode::AfterBody: + handle_after_body(token); + break; + case InsertionMode::AfterAfterBody: + handle_after_after_body(token); + break; case InsertionMode::Text: handle_text(token); break; @@ -199,7 +202,10 @@ void HTMLDocumentParser::handle_after_head(HTMLToken& token) } if (token.is_start_tag() && token.tag_name() == "body") { - ASSERT_NOT_REACHED(); + insert_html_element(token); + m_frameset_ok = false; + m_insertion_mode = InsertionMode::InBody; + return; } if (token.is_start_tag() && token.tag_name() == "frameset") { @@ -231,10 +237,94 @@ AnythingElse: fake_body_token.m_tag.tag_name.append("body"); insert_html_element(fake_body_token); m_insertion_mode = InsertionMode::InBody; + // FIXME: Reprocess the current token in InBody! 
} -void HTMLDocumentParser::handle_in_body(HTMLToken&) +void HTMLDocumentParser::generate_implied_end_tags() { + Vector<String> names { "dd", "dt", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc" }; + while (names.contains_slow(current_node()->tag_name())) + m_stack_of_open_elements.take_last(); +} + +bool HTMLDocumentParser::stack_of_open_elements_has_element_with_tag_name_in_scope(const FlyString& tag_name) +{ + Vector<String> list { "applet", "caption", "html", "table", "td", "th", "marquee", "object", "template" }; + for (ssize_t i = m_stack_of_open_elements.size() - 1; i >= 0; --i) { + auto& node = m_stack_of_open_elements.at(i); + if (node.tag_name() == tag_name) + return true; + if (list.contains_slow(node.tag_name())) + return false; + } + ASSERT_NOT_REACHED(); +} + +void HTMLDocumentParser::handle_after_body(HTMLToken& token) +{ + if (token.is_end_tag() && token.tag_name() == "html") { + if (m_parsing_fragment) { + ASSERT_NOT_REACHED(); + } + m_insertion_mode = InsertionMode::AfterAfterBody; + return; + } + ASSERT_NOT_REACHED(); +} + +void HTMLDocumentParser::handle_after_after_body(HTMLToken& token) +{ + if (token.is_end_of_file()) { + dbg() << "Stop parsing! :^)"; + return; + } + ASSERT_NOT_REACHED(); +} + +void HTMLDocumentParser::handle_in_body(HTMLToken& token) +{ + if (token.is_end_tag() && token.tag_name() == "body") { + if (!stack_of_open_elements_has_element_with_tag_name_in_scope("body")) { + ASSERT_NOT_REACHED(); + } + + // FIXME: Otherwise, if there is a node in the stack of open elements that is + // not either a dd element, a dt element, an li element, an optgroup element, + // an option element, a p element, an rb element, an rp element, an rt element, + // an rtc element, a tbody element, a td element, a tfoot element, a th element, + // a thead element, a tr element, the body element, or the html element, + // then this is a parse error. 
+ + m_insertion_mode = InsertionMode::AfterBody; + return; + } + + { + Vector<String> names { "address", "article", "aside", "blockquote", "center", "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p", "section", "summary", "ul" }; + if (token.is_start_tag() && names.contains_slow(token.tag_name())) { + // FIXME: If the stack of open elements has a p element in button scope, then close a p element. + insert_html_element(token); + return; + } + + if (token.is_end_tag() && names.contains_slow(token.tag_name())) { + // FIXME: If the stack of open elements has a p element in button scope, then close a p element. + + if (!stack_of_open_elements_has_element_with_tag_name_in_scope(token.tag_name())) { + ASSERT_NOT_REACHED(); + } + + generate_implied_end_tags(); + + if (current_node()->tag_name() != token.tag_name()) { + ASSERT_NOT_REACHED(); + } + + m_stack_of_open_elements.take_last(); + return; + } + } + ASSERT_NOT_REACHED(); } diff --git a/Libraries/LibWeb/Parser/HTMLDocumentParser.h b/Libraries/LibWeb/Parser/HTMLDocumentParser.h index 060cc31d45..e846b10a58 100644 --- a/Libraries/LibWeb/Parser/HTMLDocumentParser.h +++ b/Libraries/LibWeb/Parser/HTMLDocumentParser.h @@ -84,8 +84,12 @@ private: void handle_in_head_noscript(HTMLToken&); void handle_after_head(HTMLToken&); void handle_in_body(HTMLToken&); + void handle_after_body(HTMLToken&); + void handle_after_after_body(HTMLToken&); void handle_text(HTMLToken&); + void generate_implied_end_tags(); + bool stack_of_open_elements_has_element_with_tag_name_in_scope(const FlyString& tag_name); NonnullRefPtr<Element> create_element_for(HTMLToken&); RefPtr<Node> find_appropriate_place_for_inserting_node(); RefPtr<Element> insert_html_element(HTMLToken&); @@ -97,6 +101,8 @@ private: HTMLTokenizer m_tokenizer; bool m_foster_parenting { false }; + bool m_frameset_ok { true }; + bool m_parsing_fragment { false }; RefPtr<Document> m_document; 
RefPtr<HTMLHeadElement> m_head_element; diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp index 7badf5af65..f53d07d1db 100644 --- a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp +++ b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp @@ -74,6 +74,9 @@ #define ANYTHING_ELSE if (1) #define EMIT_EOF \ + if (m_has_emitted_eof) \ + return {}; \ + m_has_emitted_eof = true; \ create_new_token(HTMLToken::Type::EndOfFile); \ return m_current_token; @@ -775,7 +778,7 @@ void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state) void HTMLTokenizer::flush_current_character_or_comment_if_needed() { //if (m_current_token.type() == HTMLToken::Type::Character || m_current_token.type() == HTMLToken::Type::Comment) -// emit_current_token(); + // emit_current_token(); } } diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.h b/Libraries/LibWeb/Parser/HTMLTokenizer.h index 2476e85be8..17b795baa5 100644 --- a/Libraries/LibWeb/Parser/HTMLTokenizer.h +++ b/Libraries/LibWeb/Parser/HTMLTokenizer.h @@ -156,5 +156,7 @@ private: size_t m_cursor { 0 }; HTMLToken m_current_token; + + bool m_has_emitted_eof { false }; }; } |