diff options
author | Andreas Kling <kling@serenityos.org> | 2020-05-23 01:19:42 +0200 |
---|---|---|
committer | Andreas Kling <kling@serenityos.org> | 2020-05-23 01:22:15 +0200 |
commit | 6caa5661f3833e7168387c1074379cabcd1e5acf (patch) | |
tree | d0c5d72c532c49da2c48c8c318b90ef983c8e950 | |
parent | daf74838ddae1d92c19a17fa7593e118cdcb14d3 (diff) | |
download | serenity-6caa5661f3833e7168387c1074379cabcd1e5acf.zip |
LibWeb: Teach HTMLTokenizer how to tokenize attributes
Properly tokenize single-quoted, double-quoted and unquoted attributes!
-rw-r--r-- | Base/home/anon/www/simple.html | 1 |
-rw-r--r-- | Libraries/LibWeb/Parser/HTMLToken.h | 8 |
-rw-r--r-- | Libraries/LibWeb/Parser/HTMLTokenizer.cpp | 228 |
3 files changed, 232 insertions, 5 deletions
diff --git a/Base/home/anon/www/simple.html b/Base/home/anon/www/simple.html index bd54434b15..1554113580 100644 --- a/Base/home/anon/www/simple.html +++ b/Base/home/anon/www/simple.html @@ -1,3 +1,4 @@ <!DOCTYPE html> <html> +<head><meta name="greeting" content='Hello friends!' foo=bar></head> </html> diff --git a/Libraries/LibWeb/Parser/HTMLToken.h b/Libraries/LibWeb/Parser/HTMLToken.h index 93d27adf14..13c773d270 100644 --- a/Libraries/LibWeb/Parser/HTMLToken.h +++ b/Libraries/LibWeb/Parser/HTMLToken.h @@ -30,7 +30,6 @@ #include <AK/StringBuilder.h> #include <AK/Types.h> #include <AK/Vector.h> -#include <LibWeb/DOM/Attribute.h> namespace Web { @@ -50,6 +49,11 @@ public: Type type() const { return m_type; } private: + struct AttributeBuilder { + StringBuilder name_builder; + StringBuilder value_builder; + }; + Type m_type; // Type::DOCTYPE @@ -65,7 +69,7 @@ private: struct { StringBuilder tag_name; bool self_closing { false }; - Vector<Attribute> attributes; + Vector<AttributeBuilder> attributes; } m_tag; // Type::Comment diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp index bc7dfa2daf..73af7c0f15 100644 --- a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp +++ b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp @@ -30,6 +30,8 @@ //#define TOKENIZER_TRACE +#define TODO ASSERT_NOT_REACHED + #define SWITCH_TO(new_state) \ will_switch_to(State::new_state); \ m_state = State::new_state; \ @@ -43,8 +45,6 @@ #define DONT_CONSUME_NEXT_INPUT_CHARACTER --m_cursor; -#define IGNORE_CHARACTER_AND_CONTINUE_IN(x) SWITCH_TO(x) - #define ON(codepoint) \ if (current_input_character.has_value() && current_input_character.value() == codepoint) @@ -138,6 +138,14 @@ void HTMLTokenizer::run() BEGIN_STATE(TagName) { + ON_WHITESPACE + { + SWITCH_TO(BeforeAttributeName); + } + ON('/') + { + SWITCH_TO(SelfClosingStartTag); + } ON('>') { emit_current_token(); @@ -209,6 +217,213 @@ void HTMLTokenizer::run() } END_STATE + BEGIN_STATE(BeforeAttributeName) 
+ { + ON_WHITESPACE + { + continue; + } + ON('/') + { + RECONSUME_IN(AfterAttributeName); + } + ON('>') + { + RECONSUME_IN(AfterAttributeName); + } + ON_EOF + { + RECONSUME_IN(AfterAttributeName); + } + ON('=') + { + TODO(); + } + ANYTHING_ELSE + { + m_current_token.m_tag.attributes.append(HTMLToken::AttributeBuilder()); + RECONSUME_IN(AttributeName); + } + } + END_STATE + + BEGIN_STATE(SelfClosingStartTag) + { + } + END_STATE + + BEGIN_STATE(AttributeName) + { + ON_WHITESPACE + { + RECONSUME_IN(AfterAttributeName); + } + ON('/') + { + RECONSUME_IN(AfterAttributeName); + } + ON('>') + { + RECONSUME_IN(AfterAttributeName); + } + ON_EOF + { + RECONSUME_IN(AfterAttributeName); + } + ON('=') + { + SWITCH_TO(BeforeAttributeValue); + } + ANYTHING_ELSE + { + m_current_token.m_tag.attributes.last().name_builder.append(current_input_character.value()); + continue; + } + } + END_STATE + + BEGIN_STATE(AfterAttributeName) + { + } + END_STATE + + BEGIN_STATE(BeforeAttributeValue) + { + ON_WHITESPACE + { + continue; + } + ON('"') + { + SWITCH_TO(AttributeValueDoubleQuoted); + } + ON('\'') + { + SWITCH_TO(AttributeValueSingleQuoted); + } + ON('>') + { + TODO(); + } + ANYTHING_ELSE + { + RECONSUME_IN(AttributeValueUnquoted); + } + } + END_STATE + + BEGIN_STATE(AttributeValueDoubleQuoted) + { + ON('"') + { + SWITCH_TO(AfterAttributeValueQuoted); + } + ON('&') + { + m_return_state = State::AttributeValueDoubleQuoted; + SWITCH_TO(CharacterReference); + } + ON(0) + { + TODO(); + } + ON_EOF + { + TODO(); + } + ANYTHING_ELSE + { + m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value()); + continue; + } + } + END_STATE + + BEGIN_STATE(AttributeValueSingleQuoted) + { + ON('\'') + { + SWITCH_TO(AfterAttributeValueQuoted); + } + ON('&') + { + m_return_state = State::AttributeValueSingleQuoted; + SWITCH_TO(CharacterReference); + } + ON(0) + { + TODO(); + } + ON_EOF + { + TODO(); + } + ANYTHING_ELSE + { + 
m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value()); + continue; + } + } + END_STATE + + BEGIN_STATE(AttributeValueUnquoted) + { + ON_WHITESPACE + { + SWITCH_TO(BeforeAttributeName); + } + ON('&') + { + m_return_state = State::AttributeValueUnquoted; + SWITCH_TO(CharacterReference); + } + ON('>') + { + emit_current_token(); + SWITCH_TO(Data); + } + ON(0) + { + TODO(); + } + ON_EOF + { + TODO(); + } + ANYTHING_ELSE + { + m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value()); + continue; + } + } + END_STATE + + BEGIN_STATE(AfterAttributeValueQuoted) + { + ON_WHITESPACE + { + SWITCH_TO(BeforeAttributeName); + } + ON('/') + { + SWITCH_TO(SelfClosingStartTag); + } + ON('>') + { + emit_current_token(); + SWITCH_TO(Data); + } + ON_EOF + { + TODO(); + } + ANYTHING_ELSE + { + TODO(); + } + } + END_STATE + BEGIN_STATE(CharacterReference) { } @@ -270,7 +485,14 @@ void HTMLTokenizer::emit_current_token() if (m_current_token.type() == HTMLToken::Type::StartTag || m_current_token.type() == HTMLToken::Type::EndTag) { builder.append(" { name: '"); builder.append(m_current_token.m_tag.tag_name.to_string()); - builder.append("' }"); + builder.append("', { "); + for (auto& attribute : m_current_token.m_tag.attributes) { + builder.append(attribute.name_builder.to_string()); + builder.append("=\""); + builder.append(attribute.value_builder.to_string()); + builder.append("\" "); + } + builder.append("} }"); } dbg() << "[" << String::format("%42s", state_name(m_state)) << "] " << builder.to_string(); |