diff options
Diffstat (limited to 'Libraries')
-rw-r--r-- | Libraries/LibWeb/CMakeLists.txt | 1 | ||||
-rw-r--r-- | Libraries/LibWeb/Parser/HTMLToken.h | 76 | ||||
-rw-r--r-- | Libraries/LibWeb/Parser/HTMLTokenizer.cpp | 305 | ||||
-rw-r--r-- | Libraries/LibWeb/Parser/HTMLTokenizer.h | 160 |
4 files changed, 542 insertions, 0 deletions
diff --git a/Libraries/LibWeb/CMakeLists.txt b/Libraries/LibWeb/CMakeLists.txt index 2a53d4ed56..631ff1e534 100644 --- a/Libraries/LibWeb/CMakeLists.txt +++ b/Libraries/LibWeb/CMakeLists.txt @@ -85,6 +85,7 @@ set(SOURCES Layout/LineBoxFragment.cpp Parser/CSSParser.cpp Parser/HTMLParser.cpp + Parser/HTMLTokenizer.cpp ResourceLoader.cpp StylePropertiesModel.cpp URLEncoder.cpp diff --git a/Libraries/LibWeb/Parser/HTMLToken.h b/Libraries/LibWeb/Parser/HTMLToken.h new file mode 100644 index 0000000000..398a01ac7f --- /dev/null +++ b/Libraries/LibWeb/Parser/HTMLToken.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2020, Andreas Kling <kling@serenityos.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once + +#include <AK/String.h> +#include <AK/StringBuilder.h> +#include <AK/Types.h> +#include <AK/Vector.h> +#include <LibWeb/DOM/Attribute.h> + +namespace Web { + +class HTMLToken { + friend class HTMLTokenizer; + +public: + enum class Type { + DOCTYPE, + StartTag, + EndTag, + Comment, + Character, + EndOfFile, + }; + + Type type() const { return m_type; } + +private: + Type m_type; + + // Type::DOCTYPE + struct { + StringBuilder name; + StringBuilder public_identifier; + StringBuilder system_public_identifier; + bool force_quirks { false }; + } m_doctype; + + // Type::StartTag + // Type::EndTag + struct { + StringBuilder tag_name; + bool self_closing { false }; + Vector<Attribute> attributes; + } m_tag; + + struct { + StringBuilder data; + } m_comment_or_character; +}; + +} diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp new file mode 100644 index 0000000000..bc7dfa2daf --- /dev/null +++ b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp @@ -0,0 +1,305 @@ +/* + * Copyright (c) 2020, Andreas Kling <kling@serenityos.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <LibWeb/Parser/HTMLToken.h> +#include <LibWeb/Parser/HTMLTokenizer.h> +#include <ctype.h> + +//#define TOKENIZER_TRACE + +#define SWITCH_TO(new_state) \ + will_switch_to(State::new_state); \ + m_state = State::new_state; \ + current_input_character = next_codepoint(); \ + goto new_state; + +#define RECONSUME_IN(new_state) \ + will_reconsume_in(State::new_state); \ + m_state = State::new_state; \ + goto new_state; + +#define DONT_CONSUME_NEXT_INPUT_CHARACTER --m_cursor; + +#define IGNORE_CHARACTER_AND_CONTINUE_IN(x) SWITCH_TO(x) + +#define ON(codepoint) \ + if (current_input_character.has_value() && current_input_character.value() == codepoint) + +#define ON_EOF \ + if (!current_input_character.has_value()) + +#define ON_ASCII_ALPHA \ + if (current_input_character.has_value() && isalpha(current_input_character.value())) + +#define ON_WHITESPACE \ + if (current_input_character.has_value() && (current_input_character.value() == '\t' || current_input_character.value() == '\a' || current_input_character.value() == '\f' || current_input_character.value() == ' ')) + +#define ANYTHING_ELSE if (1) + +#define EMIT_EOF_AND_RETURN \ + create_new_token(HTMLToken::Type::EndOfFile); \ + emit_current_token(); \ + return; + +#define BEGIN_STATE(state) \ + state: \ + case State::state: + +#define END_STATE \ + ASSERT_NOT_REACHED(); \ + break; + +namespace Web { + +Optional<u32> HTMLTokenizer::next_codepoint() +{ + if (m_cursor >= m_input.length()) + return {}; + return m_input[m_cursor++]; +} + +Optional<u32> HTMLTokenizer::peek_codepoint(size_t offset) const +{ + if ((m_cursor + offset) >= m_input.length()) + return {}; + return m_input[m_cursor + offset]; +} + +void HTMLTokenizer::run() +{ + for (;;) { + auto current_input_character = next_codepoint(); + switch (m_state) { + BEGIN_STATE(Data) + { + ON('&') + { + m_return_state = State::Data; + SWITCH_TO(CharacterReference); + } + ON('<') + { + SWITCH_TO(TagOpen); + } + ON_EOF + { + EMIT_EOF_AND_RETURN; + } + ANYTHING_ELSE + { + create_new_token(HTMLToken::Type::Character); + m_current_token.m_comment_or_character.data.append(current_input_character.value()); + emit_current_token(); + continue; + } + } + END_STATE + + BEGIN_STATE(TagOpen) + { + ON('!') + { + SWITCH_TO(MarkupDeclarationOpen); + } + ON('/') + { + SWITCH_TO(EndTagOpen); + } + ON_ASCII_ALPHA + { + create_new_token(HTMLToken::Type::StartTag); + RECONSUME_IN(TagName); + } + } + END_STATE + + BEGIN_STATE(TagName) + { + ON('>') + { + emit_current_token(); + SWITCH_TO(Data); + } + ANYTHING_ELSE + { + m_current_token.m_tag.tag_name.append(current_input_character.value()); + continue; + } + } + + BEGIN_STATE(EndTagOpen) + { + ON_ASCII_ALPHA + { + create_new_token(HTMLToken::Type::EndTag); + RECONSUME_IN(TagName); + } + } + END_STATE + + BEGIN_STATE(MarkupDeclarationOpen) + { + DONT_CONSUME_NEXT_INPUT_CHARACTER; + if (next_few_characters_are("DOCTYPE")) { + consume("DOCTYPE"); + SWITCH_TO(DOCTYPE); + } + } + END_STATE + + BEGIN_STATE(DOCTYPE) + { + ON_WHITESPACE + { + SWITCH_TO(BeforeDOCTYPEName); + } + } + END_STATE + + BEGIN_STATE(BeforeDOCTYPEName) + { + ON_WHITESPACE + { + continue; + } + ANYTHING_ELSE + { + create_new_token(HTMLToken::Type::DOCTYPE); + m_current_token.m_doctype.name.append(current_input_character.value()); + SWITCH_TO(DOCTYPEName); + } + } + END_STATE + + BEGIN_STATE(DOCTYPEName) + { + ON('>') + { + emit_current_token(); + SWITCH_TO(Data); + } + ANYTHING_ELSE + { + m_current_token.m_doctype.name.append(current_input_character.value()); + continue; + } + } + END_STATE + + BEGIN_STATE(CharacterReference) + { + } + END_STATE + + default: + ASSERT_NOT_REACHED(); + } + } +} + +void HTMLTokenizer::consume(const StringView& string) +{ + ASSERT(next_few_characters_are(string)); + m_cursor += string.length(); +} + +bool HTMLTokenizer::next_few_characters_are(const StringView& string) const +{ + for (size_t i = 0; i < string.length(); ++i) { + auto codepoint = peek_codepoint(i); + if (!codepoint.has_value()) + return false; + // FIXME: This should be more Unicode-aware. + if (codepoint.value() != (u32)string[i]) + return false; + } + return true; +} + +void HTMLTokenizer::emit_current_token() +{ + StringBuilder builder; + + switch (m_current_token.type()) { + case HTMLToken::Type::DOCTYPE: + builder.append("DOCTYPE"); + builder.append(" { name: '"); + builder.append(m_current_token.m_doctype.name.to_string()); + builder.append("' }"); + break; + case HTMLToken::Type::StartTag: + builder.append("StartTag"); + break; + case HTMLToken::Type::EndTag: + builder.append("EndTag"); + break; + case HTMLToken::Type::Comment: + builder.append("Comment"); + break; + case HTMLToken::Type::Character: + builder.append("Character"); + break; + case HTMLToken::Type::EndOfFile: + builder.append("EndOfFile"); + break; + } + + if (m_current_token.type() == HTMLToken::Type::StartTag || m_current_token.type() == HTMLToken::Type::EndTag) { + builder.append(" { name: '"); + builder.append(m_current_token.m_tag.tag_name.to_string()); + builder.append("' }"); + } + + dbg() << "[" << String::format("%42s", state_name(m_state)) << "] " << builder.to_string(); + m_current_token = {}; +} + +void HTMLTokenizer::create_new_token(HTMLToken::Type type) +{ + m_current_token = {}; + m_current_token.m_type = type; +} + +HTMLTokenizer::HTMLTokenizer(const StringView& input) + : m_input(input) +{ +} + +void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state) +{ +#ifdef TOKENIZER_TRACE + dbg() << "[" << state_name(m_state) << "] Switch to " << state_name(new_state); +#endif +} + +void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state) +{ +#ifdef TOKENIZER_TRACE + dbg() << "[" << state_name(m_state) << "] Reconsume in " << state_name(new_state); +#endif +} + +} diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.h b/Libraries/LibWeb/Parser/HTMLTokenizer.h new file mode 100644 index 0000000000..ec5adecb92 --- /dev/null +++ b/Libraries/LibWeb/Parser/HTMLTokenizer.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2020, Andreas Kling <kling@serenityos.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once + +#include <AK/StringView.h> +#include <AK/Types.h> +#include <LibWeb/Parser/HTMLToken.h> + +#define ENUMERATE_TOKENIZER_STATES \ + __ENUMERATE_TOKENIZER_STATE(Data) \ + __ENUMERATE_TOKENIZER_STATE(RCDATA) \ + __ENUMERATE_TOKENIZER_STATE(RAWTEXT) \ + __ENUMERATE_TOKENIZER_STATE(ScriptData) \ + __ENUMERATE_TOKENIZER_STATE(PLAINTEXT) \ + __ENUMERATE_TOKENIZER_STATE(TagOpen) \ + __ENUMERATE_TOKENIZER_STATE(EndTagOpen) \ + __ENUMERATE_TOKENIZER_STATE(TagName) \ + __ENUMERATE_TOKENIZER_STATE(RCDATALessThanSign) \ + __ENUMERATE_TOKENIZER_STATE(RCDATAEndTagOpen) \ + __ENUMERATE_TOKENIZER_STATE(RCDATAEndTagName) \ + __ENUMERATE_TOKENIZER_STATE(RAWTEXTLessThanSign) \ + __ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagOpen) \ + __ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagName) \ + __ENUMERATE_TOKENIZER_STATE(ScriptDataLessThanSign) \ + __ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagOpen) \ + __ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagName) \ + __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStart) \ + __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStartDash) \ + __ENUMERATE_TOKENIZER_STATE(ScriptDataEscaped) \ + __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDash) \ + __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDashDash) \ + __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedLessThanSign) \ + __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagOpen) \ + __ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagName) \ + __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeStart) \ + __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscaped) \ + __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDash) \ + __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDashDash) \ + __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedLessThanSign) \ + __ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeEnd) \ + __ENUMERATE_TOKENIZER_STATE(BeforeAttributeName) \ + __ENUMERATE_TOKENIZER_STATE(AttributeName) \ + __ENUMERATE_TOKENIZER_STATE(AfterAttributeName) \ + __ENUMERATE_TOKENIZER_STATE(BeforeAttributeValue) \ + __ENUMERATE_TOKENIZER_STATE(AttributeValueDoubleQuoted) \ + __ENUMERATE_TOKENIZER_STATE(AttributeValueSingleQuoted) \ + __ENUMERATE_TOKENIZER_STATE(AttributeValueUnquoted) \ + __ENUMERATE_TOKENIZER_STATE(AfterAttributeValueQuoted) \ + __ENUMERATE_TOKENIZER_STATE(SelfClosingStartTag) \ + __ENUMERATE_TOKENIZER_STATE(BogusComment) \ + __ENUMERATE_TOKENIZER_STATE(MarkupDeclarationOpen) \ + __ENUMERATE_TOKENIZER_STATE(CommentStart) \ + __ENUMERATE_TOKENIZER_STATE(CommentStartDash) \ + __ENUMERATE_TOKENIZER_STATE(Comment) \ + __ENUMERATE_TOKENIZER_STATE(CommentLessThanSign) \ + __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBang) \ + __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDash) \ + __ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDashDash) \ + __ENUMERATE_TOKENIZER_STATE(CommentEndDash) \ + __ENUMERATE_TOKENIZER_STATE(CommentEnd) \ + __ENUMERATE_TOKENIZER_STATE(CommentEndBang) \ + __ENUMERATE_TOKENIZER_STATE(DOCTYPE) \ + __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEName) \ + __ENUMERATE_TOKENIZER_STATE(DOCTYPEName) \ + __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEName) \ + __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicKeyword) \ + __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEPublicIdentifier) \ + __ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierDoubleQuoted) \ + __ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierSingleQuoted) \ + __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicIdentifier) \ + __ENUMERATE_TOKENIZER_STATE(BetweenDOCTYPEPublicAndSystemIdentifiers) \ + __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemKeyword) \ + __ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPESystemIdentifier) \ + __ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierDoubleQuoted) \ + __ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierSingleQuoted) \ + __ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemIdentifier) \ + __ENUMERATE_TOKENIZER_STATE(BogusDOCTYPE) \ + __ENUMERATE_TOKENIZER_STATE(CDATASection) \ + __ENUMERATE_TOKENIZER_STATE(CDATASectionBracket) \ + __ENUMERATE_TOKENIZER_STATE(CDATASectionEnd) \ + __ENUMERATE_TOKENIZER_STATE(CharacterReference) \ + __ENUMERATE_TOKENIZER_STATE(NamedCharacterReference) \ + __ENUMERATE_TOKENIZER_STATE(AmbiguousAmpersand) \ + __ENUMERATE_TOKENIZER_STATE(NumericCharacterReference) \ + __ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReferenceStart) \ + __ENUMERATE_TOKENIZER_STATE(DecimalCharacterReferenceStart) \ + __ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReference) \ + __ENUMERATE_TOKENIZER_STATE(DecimalCharacterReference) \ + __ENUMERATE_TOKENIZER_STATE(NumericCharacterReferenceEnd) + +namespace Web { + +class HTMLTokenizer { +public: + explicit HTMLTokenizer(const StringView& input); + + void run(); + +private: + Optional<u32> next_codepoint(); + Optional<u32> peek_codepoint(size_t offset) const; + bool next_few_characters_are(const StringView&) const; + void consume(const StringView&); + void emit_current_token(); + void create_new_token(HTMLToken::Type); + + enum class State { +#define __ENUMERATE_TOKENIZER_STATE(state) state, + ENUMERATE_TOKENIZER_STATES +#undef __ENUMERATE_TOKENIZER_STATE + }; + + static const char* state_name(State state) + { + switch (state) { +#define __ENUMERATE_TOKENIZER_STATE(state) \ + case State::state: \ + return #state; + ENUMERATE_TOKENIZER_STATES +#undef __ENUMERATE_TOKENIZER_STATE + }; + ASSERT_NOT_REACHED(); + } + + void will_switch_to(State); + void will_reconsume_in(State); + + State m_state { State::Data }; + State m_return_state { State::Data }; + + StringView m_input; + size_t m_cursor { 0 }; + + HTMLToken m_current_token; +}; +} |