diff options
author | Andreas Kling <awesomekling@gmail.com> | 2019-10-12 23:26:47 +0200 |
---|---|---|
committer | Andreas Kling <awesomekling@gmail.com> | 2019-10-12 23:34:05 +0200 |
commit | b083a233d8482ed7106d7a102866ee0dd42acc94 (patch) | |
tree | d32b52a7cc55fcd66dcaa94bb51f01e4bd04f398 /Libraries/LibHTML/Parser | |
parent | 6d150df58a0b64c4cb583fc374fd05e60d387dc6 (diff) | |
download | serenity-b083a233d8482ed7106d7a102866ee0dd42acc94.zip |
LibHTML: Add Comment and CharacterData nodes and improve HTML parsing
This patch adds the CharacterData subclass of Node, which is now the
parent class of Text and a new Comment class.
A Comment node is one of these in HTML: <!--hello friends-->
Since these occur somewhat frequently on the web, we need to be able
to parse them.
This patch also adds a child rejection mechanism to the DOM tree.
Nodes can now override is_child_allowed(Node) and return false if they
don't want a particular Node to become a child of theirs. This is used
to prevent Document from taking on unwanted children.
Diffstat (limited to 'Libraries/LibHTML/Parser')
-rw-r--r-- | Libraries/LibHTML/Parser/HTMLParser.cpp | 59 |
1 file changed, 47 insertions, 12 deletions
diff --git a/Libraries/LibHTML/Parser/HTMLParser.cpp b/Libraries/LibHTML/Parser/HTMLParser.cpp index a69871c8a0..7f0a8c9c26 100644 --- a/Libraries/LibHTML/Parser/HTMLParser.cpp +++ b/Libraries/LibHTML/Parser/HTMLParser.cpp @@ -1,6 +1,7 @@ #include <AK/Function.h> #include <AK/NonnullRefPtrVector.h> #include <AK/StringBuilder.h> +#include <LibHTML/DOM/Comment.h> #include <LibHTML/DOM/DocumentType.h> #include <LibHTML/DOM/Element.h> #include <LibHTML/DOM/ElementFactory.h> @@ -44,6 +45,8 @@ NonnullRefPtr<Document> parse_html(const StringView& html, const URL& url) Free = 0, BeforeTagName, InTagName, + InDoctype, + InComment, InAttributeList, InAttributeName, BeforeAttributeValue, @@ -101,19 +104,16 @@ NonnullRefPtr<Document> parse_html(const StringView& html, const URL& url) close_tag(); }; - auto handle_exclamation_tag = [&] { - auto name = String::copy(tag_name_buffer); - tag_name_buffer.clear(); - ASSERT(name == "DOCTYPE"); - if (node_stack.size() != 1) - node_stack[node_stack.size() - 2].append_child(adopt(*new DocumentType(document)), false); - close_tag(); + auto commit_doctype = [&] { + node_stack.last().append_child(adopt(*new DocumentType(document)), false); + }; + + auto commit_comment = [&] { + node_stack.last().append_child(adopt(*new Comment(document, text_buffer.to_string())), false); }; auto commit_tag = [&] { - if (is_exclamation_tag) - handle_exclamation_tag(); - else if (is_slash_tag) + if (is_slash_tag) close_tag(); else open_tag(); @@ -124,12 +124,16 @@ NonnullRefPtr<Document> parse_html(const StringView& html, const URL& url) }; for (int i = 0; i < html.length(); ++i) { + auto peek = [&](int offset) -> char { + if (i + offset >= html.length()) + return '\0'; + return html[i + offset]; + }; char ch = html[i]; switch (state) { case State::Free: if (ch == '<') { is_slash_tag = false; - is_exclamation_tag = false; move_to_state(State::BeforeTagName); break; } @@ -165,7 +169,22 @@ NonnullRefPtr<Document> parse_html(const StringView& html, const URL& 
url) break; } if (ch == '!') { - is_exclamation_tag = true; + if (peek(1) == 'D' + && peek(2) == 'O' + && peek(3) == 'C' + && peek(4) == 'T' + && peek(5) == 'Y' + && peek(6) == 'P' + && peek(7) == 'E') { + i += 7; + move_to_state(State::InDoctype); + break; + } + if (peek(1) == '-' && peek(2) == '-') { + i += 2; + move_to_state(State::InComment); + break; + } break; } if (ch == '>') { @@ -188,6 +207,22 @@ NonnullRefPtr<Document> parse_html(const StringView& html, const URL& url) } tag_name_buffer.append(ch); break; + case State::InDoctype: + if (ch == '>') { + commit_doctype(); + move_to_state(State::Free); + break; + } + break; + case State::InComment: + if (ch == '-' && peek(1) == '-' && peek(2) == '>') { + commit_comment(); + i += 2; + move_to_state(State::Free); + break; + } + text_buffer.append(ch); + break; case State::InAttributeList: if (ch == '>') { commit_tag(); |