summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Andreas Kling <kling@serenityos.org> 2020-05-23 01:54:26 +0200
committer Andreas Kling <kling@serenityos.org> 2020-05-23 01:54:26 +0200
commit a58500fdc59083f4fef12cc899ce5cf97b5173b4 (patch)
tree 2727125c6a86a6ea1779ed20813c51adb0080f57
parent 909ac2a5580138ca7d248b94e2ff096d11f153f7 (diff)
download serenity-a58500fdc59083f4fef12cc899ce5cf97b5173b4.zip
LibWeb: Teach HTMLTokenizer how to tokenize comments
We can now correctly tokenize the welcome.html test page. :^)
-rw-r--r-- Libraries/LibWeb/Parser/HTMLTokenizer.cpp 196
1 file changed, 196 insertions, 0 deletions
diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
index 73af7c0f15..7b26419593 100644
--- a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
+++ b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
@@ -171,6 +171,11 @@ void HTMLTokenizer::run()
BEGIN_STATE(MarkupDeclarationOpen)
{
DONT_CONSUME_NEXT_INPUT_CHARACTER;
+ if (next_few_characters_are("--")) {
+ consume("--");
+ create_new_token(HTMLToken::Type::Comment);
+ SWITCH_TO(CommentStart);
+ }
if (next_few_characters_are("DOCTYPE")) {
consume("DOCTYPE");
SWITCH_TO(DOCTYPE);
@@ -424,6 +429,197 @@ void HTMLTokenizer::run()
}
END_STATE
+ // Comment start state. MarkupDeclarationOpen switches here after consuming
+ // "--" and creating the Comment token, so the token's data is still empty.
+ BEGIN_STATE(CommentStart)
+ {
+ ON('-')
+ {
+ SWITCH_TO(CommentStartDash);
+ }
+ ON('>')
+ {
+ // "<!-->": the comment is closed before it received any data.
+ // Emit the (empty) comment token and go back to Data, using the
+ // same emit-then-switch sequence CommentEnd uses for '>'.
+ // FIXME: Report the abrupt-closing-of-empty-comment parse error.
+ emit_current_token();
+ SWITCH_TO(Data);
+ }
+ ANYTHING_ELSE
+ {
+ // Not a special character: let the Comment state handle it.
+ RECONSUME_IN(Comment);
+ }
+ }
+ END_STATE
+
+ // Comment start dash state. Reached from CommentStart after a single '-'
+ // that has not been added to the comment data yet.
+ BEGIN_STATE(CommentStartDash)
+ {
+ ON('-')
+ {
+ SWITCH_TO(CommentEnd);
+ }
+ ON('>')
+ {
+ // "<!--->": the comment is closed right after its opening.
+ // Emit the (empty) comment token and go back to Data, using the
+ // same emit-then-switch sequence CommentEnd uses for '>'.
+ // FIXME: Report the abrupt-closing-of-empty-comment parse error.
+ emit_current_token();
+ SWITCH_TO(Data);
+ }
+ ON_EOF
+ {
+ // End of input inside a comment is not handled yet.
+ TODO();
+ }
+ ANYTHING_ELSE
+ {
+ // The pending '-' turned out to be ordinary comment text.
+ m_current_token.m_comment_or_character.data.append('-');
+ RECONSUME_IN(Comment);
+ }
+ }
+ END_STATE
+
+ // Comment state: accumulates the text of the comment into the current
+ // token's data until a '<' or '-' needs closer inspection.
+ BEGIN_STATE(Comment)
+ {
+ ON('<')
+ {
+ // '<' is kept as comment text; the follow-up state checks whether
+ // it begins a nested "<!--".
+ m_current_token.m_comment_or_character.data.append(current_input_character.value());
+ SWITCH_TO(CommentLessThanSign);
+ }
+ ON('-')
+ {
+ // Possibly the start of "-->"; the '-' is appended later by
+ // CommentEndDash if it turns out to be plain text.
+ SWITCH_TO(CommentEndDash);
+ }
+ ON(0)
+ {
+ // NUL in comment data is not handled yet.
+ TODO();
+ }
+ ON_EOF
+ {
+ // End of input inside a comment is not handled yet.
+ TODO();
+ }
+ ANYTHING_ELSE
+ {
+ m_current_token.m_comment_or_character.data.append(current_input_character.value());
+ continue;
+ }
+ }
+ // Close the state like every other state in this function does.
+ END_STATE
+
+ // Comment end state. Reached after "--" has been seen inside a comment;
+ // neither of those two hyphens has been added to the comment data yet.
+ BEGIN_STATE(CommentEnd)
+ {
+ ON('>')
+ {
+ // "-->": the comment is complete.
+ emit_current_token();
+ SWITCH_TO(Data);
+ }
+ ON('!')
+ {
+ SWITCH_TO(CommentEndBang);
+ }
+ ON('-')
+ {
+ // A third (or later) hyphen: the oldest pending '-' becomes text
+ // and two hyphens remain pending, so no state change is made.
+ m_current_token.m_comment_or_character.data.append('-');
+ continue;
+ }
+ ON_EOF
+ {
+ // End of input inside a comment is not handled yet.
+ TODO();
+ }
+ ANYTHING_ELSE
+ {
+ // The "--" was not a terminator, so both pending hyphens belong to
+ // the comment text (e.g. "<!--a--b-->" must yield "a--b").
+ m_current_token.m_comment_or_character.data.append("--");
+ RECONSUME_IN(Comment);
+ }
+ }
+ END_STATE
+
+ // Comment end bang state. Reached from CommentEnd after "--!" has been
+ // seen; none of those three characters is in the comment data yet.
+ BEGIN_STATE(CommentEndBang)
+ {
+ ON('-')
+ {
+ // "--!" was plain text; the new '-' may start a real terminator.
+ m_current_token.m_comment_or_character.data.append("--!");
+ SWITCH_TO(CommentEndDash);
+ }
+ ON('>')
+ {
+ // "--!>" closes the comment (incorrectly, but it still closes it).
+ // Emit the token and go back to Data, using the same
+ // emit-then-switch sequence CommentEnd uses for '>'.
+ // FIXME: Report the incorrectly-closed-comment parse error.
+ emit_current_token();
+ SWITCH_TO(Data);
+ }
+ ON_EOF
+ {
+ // End of input inside a comment is not handled yet.
+ TODO();
+ }
+ ANYTHING_ELSE
+ {
+ // "--!" was plain text; continue with normal comment handling.
+ m_current_token.m_comment_or_character.data.append("--!");
+ RECONSUME_IN(Comment);
+ }
+ }
+ END_STATE
+
+ // Comment end dash state. Reached after a single '-' inside a comment
+ // that has not been added to the comment data yet.
+ BEGIN_STATE(CommentEndDash)
+ {
+ ON('-')
+ {
+ // "--" seen: CommentEnd decides whether the comment really ends.
+ SWITCH_TO(CommentEnd);
+ }
+ ON_EOF
+ {
+ // End of input inside a comment is not handled yet.
+ TODO();
+ }
+ ANYTHING_ELSE
+ {
+ // The pending '-' was ordinary comment text.
+ m_current_token.m_comment_or_character.data.append('-');
+ RECONSUME_IN(Comment);
+ }
+ }
+ // Close the state like every other state in this function does.
+ END_STATE
+
+ BEGIN_STATE(CommentLessThanSign) // Entered from the Comment state after a '<' was appended to the comment data.
+ {
+ ON('!')
+ {
+ m_current_token.m_comment_or_character.data.append(current_input_character.value()); // the '!' is kept as comment text
+ SWITCH_TO(CommentLessThanSignBang); // "<!" seen: keep checking for a nested "<!--"
+ }
+ ON('<')
+ {
+ m_current_token.m_comment_or_character.data.append(current_input_character.value()); // another '<' is kept as comment text
+ continue; // no SWITCH_TO here, so the state is left unchanged
+ }
+ ANYTHING_ELSE
+ {
+ RECONSUME_IN(Comment); // nothing is appended here; the Comment state handles this character
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(CommentLessThanSignBang) // Entered from CommentLessThanSign after "<!" was appended to the comment data.
+ {
+ ON('-')
+ {
+ SWITCH_TO(CommentLessThanSignBangDash); // the '-' is not appended to the comment data in this branch
+ }
+ ANYTHING_ELSE
+ {
+ RECONSUME_IN(Comment); // nothing is appended here; the Comment state handles this character
+ }
+ }
+ END_STATE
+
+ // Comment less-than sign bang dash state. Reached after "<!-" inside a
+ // comment; "<!" is already in the comment data, the '-' is still pending.
+ BEGIN_STATE(CommentLessThanSignBangDash)
+ {
+ ON('-')
+ {
+ SWITCH_TO(CommentLessThanSignBangDashDash);
+ }
+ ANYTHING_ELSE
+ {
+ // The pending '-' has not been appended anywhere yet. Reconsuming in
+ // CommentEndDash (rather than Comment) makes that state append the
+ // '-' before normal comment handling resumes, so it is not lost.
+ RECONSUME_IN(CommentEndDash);
+ }
+ }
+ END_STATE
+
+ // Comment less-than sign bang dash dash state. Reached after "<!--" inside
+ // a comment; the two hyphens are pending, exactly as on entry to CommentEnd.
+ BEGIN_STATE(CommentLessThanSignBangDashDash)
+ {
+ ON('>')
+ {
+ // The '>' must not be swallowed here: CommentEnd has to see it in
+ // order to emit the token and leave the comment.
+ RECONSUME_IN(CommentEnd);
+ }
+ ON_EOF
+ {
+ // End of input inside a comment is not handled yet.
+ TODO();
+ }
+ ANYTHING_ELSE
+ {
+ // A nested "<!--" is only a parse error; tokenization continues as
+ // if a plain "--" had been seen.
+ // FIXME: Report the nested-comment parse error.
+ RECONSUME_IN(CommentEnd);
+ }
+ }
+ END_STATE
+
BEGIN_STATE(CharacterReference)
{
}