summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Andreas Kling <kling@serenityos.org> 2020-05-23 01:19:42 +0200
committer: Andreas Kling <kling@serenityos.org> 2020-05-23 01:22:15 +0200
commit: 6caa5661f3833e7168387c1074379cabcd1e5acf (patch)
tree: d0c5d72c532c49da2c48c8c318b90ef983c8e950
parent: daf74838ddae1d92c19a17fa7593e118cdcb14d3 (diff)
download: serenity-6caa5661f3833e7168387c1074379cabcd1e5acf.zip
LibWeb: Teach HTMLTokenizer how to tokenize attributes
Properly tokenize single-quoted, double-quoted and unquoted attributes!
-rw-r--r-- Base/home/anon/www/simple.html | 1
-rw-r--r-- Libraries/LibWeb/Parser/HTMLToken.h | 8
-rw-r--r-- Libraries/LibWeb/Parser/HTMLTokenizer.cpp | 228
3 files changed, 232 insertions, 5 deletions
diff --git a/Base/home/anon/www/simple.html b/Base/home/anon/www/simple.html
index bd54434b15..1554113580 100644
--- a/Base/home/anon/www/simple.html
+++ b/Base/home/anon/www/simple.html
@@ -1,3 +1,4 @@
<!DOCTYPE html>
<html>
+<head><meta name="greeting" content='Hello friends!' foo=bar></head>
</html>
diff --git a/Libraries/LibWeb/Parser/HTMLToken.h b/Libraries/LibWeb/Parser/HTMLToken.h
index 93d27adf14..13c773d270 100644
--- a/Libraries/LibWeb/Parser/HTMLToken.h
+++ b/Libraries/LibWeb/Parser/HTMLToken.h
@@ -30,7 +30,6 @@
#include <AK/StringBuilder.h>
#include <AK/Types.h>
#include <AK/Vector.h>
-#include <LibWeb/DOM/Attribute.h>
namespace Web {
@@ -50,6 +49,11 @@ public:
Type type() const { return m_type; }
private:
+ struct AttributeBuilder { // Accumulates the pieces of one tag attribute while the tokenizer is still reading it.
+ StringBuilder name_builder; // Attribute name; the AttributeName state appends input characters to it one at a time.
+ StringBuilder value_builder; // Attribute value; the double-quoted, single-quoted and unquoted value states append to it.
+ };
+
Type m_type;
// Type::DOCTYPE
@@ -65,7 +69,7 @@ private:
struct {
StringBuilder tag_name;
bool self_closing { false };
- Vector<Attribute> attributes;
+ Vector<AttributeBuilder> attributes;
} m_tag;
// Type::Comment
diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
index bc7dfa2daf..73af7c0f15 100644
--- a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
+++ b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
@@ -30,6 +30,8 @@
//#define TOKENIZER_TRACE
+#define TODO ASSERT_NOT_REACHED
+
#define SWITCH_TO(new_state) \
will_switch_to(State::new_state); \
m_state = State::new_state; \
@@ -43,8 +45,6 @@
#define DONT_CONSUME_NEXT_INPUT_CHARACTER --m_cursor;
-#define IGNORE_CHARACTER_AND_CONTINUE_IN(x) SWITCH_TO(x)
-
#define ON(codepoint) \
if (current_input_character.has_value() && current_input_character.value() == codepoint)
@@ -138,6 +138,14 @@ void HTMLTokenizer::run()
BEGIN_STATE(TagName)
{
+ ON_WHITESPACE
+ {
+ SWITCH_TO(BeforeAttributeName);
+ }
+ ON('/')
+ {
+ SWITCH_TO(SelfClosingStartTag);
+ }
ON('>')
{
emit_current_token();
@@ -209,6 +217,213 @@ void HTMLTokenizer::run()
}
END_STATE
+ BEGIN_STATE(BeforeAttributeName)
+ {
+ ON_WHITESPACE
+ {
+ continue;
+ }
+ ON('/')
+ {
+ RECONSUME_IN(AfterAttributeName);
+ }
+ ON('>')
+ {
+ RECONSUME_IN(AfterAttributeName);
+ }
+ ON_EOF
+ {
+ RECONSUME_IN(AfterAttributeName);
+ }
+ ON('=')
+ {
+ TODO();
+ }
+ ANYTHING_ELSE
+ {
+ m_current_token.m_tag.attributes.append(HTMLToken::AttributeBuilder());
+ RECONSUME_IN(AttributeName);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(SelfClosingStartTag)
+ {
+ }
+ END_STATE
+
+ BEGIN_STATE(AttributeName)
+ {
+ ON_WHITESPACE
+ {
+ RECONSUME_IN(AfterAttributeName);
+ }
+ ON('/')
+ {
+ RECONSUME_IN(AfterAttributeName);
+ }
+ ON('>')
+ {
+ RECONSUME_IN(AfterAttributeName);
+ }
+ ON_EOF
+ {
+ RECONSUME_IN(AfterAttributeName);
+ }
+ ON('=')
+ {
+ SWITCH_TO(BeforeAttributeValue);
+ }
+ ANYTHING_ELSE
+ {
+ m_current_token.m_tag.attributes.last().name_builder.append(current_input_character.value());
+ continue;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(AfterAttributeName)
+ {
+ }
+ END_STATE
+
+ BEGIN_STATE(BeforeAttributeValue)
+ {
+ ON_WHITESPACE
+ {
+ continue;
+ }
+ ON('"')
+ {
+ SWITCH_TO(AttributeValueDoubleQuoted);
+ }
+ ON('\'')
+ {
+ SWITCH_TO(AttributeValueSingleQuoted);
+ }
+ ON('>')
+ {
+ TODO();
+ }
+ ANYTHING_ELSE
+ {
+ RECONSUME_IN(AttributeValueUnquoted);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(AttributeValueDoubleQuoted)
+ {
+ ON('"')
+ {
+ SWITCH_TO(AfterAttributeValueQuoted);
+ }
+ ON('&')
+ {
+ m_return_state = State::AttributeValueDoubleQuoted;
+ SWITCH_TO(CharacterReference);
+ }
+ ON(0)
+ {
+ TODO();
+ }
+ ON_EOF
+ {
+ TODO();
+ }
+ ANYTHING_ELSE
+ {
+ m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
+ continue;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(AttributeValueSingleQuoted)
+ {
+ ON('\'')
+ {
+ SWITCH_TO(AfterAttributeValueQuoted);
+ }
+ ON('&')
+ {
+ m_return_state = State::AttributeValueSingleQuoted;
+ SWITCH_TO(CharacterReference);
+ }
+ ON(0)
+ {
+ TODO();
+ }
+ ON_EOF
+ {
+ TODO();
+ }
+ ANYTHING_ELSE
+ {
+ m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
+ continue;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(AttributeValueUnquoted)
+ {
+ ON_WHITESPACE
+ {
+ SWITCH_TO(BeforeAttributeName);
+ }
+ ON('&')
+ {
+ m_return_state = State::AttributeValueUnquoted;
+ SWITCH_TO(CharacterReference);
+ }
+ ON('>')
+ {
+ emit_current_token();
+ SWITCH_TO(Data);
+ }
+ ON(0)
+ {
+ TODO();
+ }
+ ON_EOF
+ {
+ TODO();
+ }
+ ANYTHING_ELSE
+ {
+ m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
+ continue;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(AfterAttributeValueQuoted)
+ {
+ ON_WHITESPACE
+ {
+ SWITCH_TO(BeforeAttributeName);
+ }
+ ON('/')
+ {
+ SWITCH_TO(SelfClosingStartTag);
+ }
+ ON('>')
+ {
+ emit_current_token();
+ SWITCH_TO(Data);
+ }
+ ON_EOF
+ {
+ TODO();
+ }
+ ANYTHING_ELSE
+ {
+ TODO();
+ }
+ }
+ END_STATE
+
BEGIN_STATE(CharacterReference)
{
}
@@ -270,7 +485,14 @@ void HTMLTokenizer::emit_current_token()
if (m_current_token.type() == HTMLToken::Type::StartTag || m_current_token.type() == HTMLToken::Type::EndTag) {
builder.append(" { name: '");
builder.append(m_current_token.m_tag.tag_name.to_string());
- builder.append("' }");
+ builder.append("', { ");
+ for (auto& attribute : m_current_token.m_tag.attributes) {
+ builder.append(attribute.name_builder.to_string());
+ builder.append("=\"");
+ builder.append(attribute.value_builder.to_string());
+ builder.append("\" ");
+ }
+ builder.append("} }");
}
dbg() << "[" << String::format("%42s", state_name(m_state)) << "] " << builder.to_string();