summary | refs | log | tree | commit | diff
path: root/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp')
-rw-r--r-- Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp | 2663
1 files changed, 2663 insertions, 0 deletions
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp
new file mode 100644
index 0000000000..1bf1dab3c3
--- /dev/null
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp
@@ -0,0 +1,2663 @@
+/*
+ * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <LibTextCodec/Decoder.h>
+#include <LibWeb/HTML/Parser/Entities.h>
+#include <LibWeb/HTML/Parser/HTMLToken.h>
+#include <LibWeb/HTML/Parser/HTMLTokenizer.h>
+#include <ctype.h>
+#include <string.h>
+
+namespace Web::HTML {
+
+// BEGIN_STATE() declares a goto label for every tokenizer state; presumably not
+// all of them are ever jumped to, hence the suppressed unused-label warning.
+#pragma GCC diagnostic ignored "-Wunused-label"
+
+// Uncomment to log parse errors (and consumed code points) through dbg().
+//#define TOKENIZER_TRACE
+
+// PARSE_ERROR() marks a tokenization parse error. With TOKENIZER_TRACE defined
+// it logs the enclosing function and source line; otherwise it expands to
+// nothing, so `PARSE_ERROR();` is just an empty statement.
+#ifdef TOKENIZER_TRACE
+# define PARSE_ERROR() \
+ do { \
+ dbg() << "Parse error (tokenization)" << __PRETTY_FUNCTION__ << " @ " << __LINE__; \
+ } while (0)
+#else
+# define PARSE_ERROR()
+#endif
+
+// Reads the next code point into the local `current_input_character`.
+// Note that the expansion already ends in ';', so writing
+// `CONSUME_NEXT_INPUT_CHARACTER;` produces an extra empty statement.
+#define CONSUME_NEXT_INPUT_CHARACTER \
+ current_input_character = next_code_point();
+
+// Moves to `new_state`: calls will_switch_to(), records the state in m_state,
+// consumes the next code point and jumps to the label of that state.
+#define SWITCH_TO(new_state) \
+ do { \
+ will_switch_to(State::new_state); \
+ m_state = State::new_state; \
+ CONSUME_NEXT_INPUT_CHARACTER; \
+ goto new_state; \
+ } while (0)
+
+// Like SWITCH_TO, but calls will_reconsume_in() and jumps without consuming,
+// so `new_state` sees the same `current_input_character` again.
+#define RECONSUME_IN(new_state) \
+ do { \
+ will_reconsume_in(State::new_state); \
+ m_state = State::new_state; \
+ goto new_state; \
+ } while (0)
+
+// Resumes in the state stored in m_return_state. The target is only known at
+// run time, so instead of jumping to a state label this restarts at
+// _StartOfFunction, where queued tokens are returned first and otherwise the
+// next code point is read and dispatched on m_state.
+#define SWITCH_TO_RETURN_STATE \
+ do { \
+ will_switch_to(m_return_state); \
+ m_state = m_return_state; \
+ goto _StartOfFunction; \
+ } while (0)
+
+// Same as SWITCH_TO_RETURN_STATE, but the current code point must be seen again
+// by the return state. The restart at _StartOfFunction always reads a code
+// point, so the iterator is rewound by one first. At end of input nothing was
+// consumed (current_input_character is empty), so no rewind is performed.
+#define RECONSUME_IN_RETURN_STATE \
+ do { \
+ will_reconsume_in(m_return_state); \
+ m_state = m_return_state; \
+ if (current_input_character.has_value()) \
+ m_utf8_iterator = m_prev_utf8_iterator; \
+ goto _StartOfFunction; \
+ } while (0)
+
+// Records `new_state` as the current state, reports m_current_token through
+// will_emit(), enqueues it and returns m_queued_tokens.dequeue() from the
+// enclosing function. No code point is consumed here and no jump is made; the
+// new state starts running on the next call of the enclosing function.
+#define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state) \
+ do { \
+ will_switch_to(State::new_state); \
+ m_state = State::new_state; \
+ will_emit(m_current_token); \
+ m_queued_tokens.enqueue(m_current_token); \
+ return m_queued_tokens.dequeue(); \
+ } while (0)
+
+// Enqueues a character token for `code_point` (built by
+// HTMLToken::make_character) and then reconsumes the current input character
+// in `new_state`. Nothing is returned here; the queued token stays in
+// m_queued_tokens until a later dequeue.
+#define EMIT_CHARACTER_AND_RECONSUME_IN(code_point, new_state) \
+ do { \
+ m_queued_tokens.enqueue(HTMLToken::make_character(code_point)); \
+ will_reconsume_in(State::new_state); \
+ m_state = State::new_state; \
+ goto new_state; \
+ } while (0)
+
+// Hands every code point in m_temporary_buffer to its destination: when
+// consumed_as_part_of_an_attribute() is true it is appended to the value of the
+// last attribute of the current tag token, otherwise a new Character token
+// holding that single code point is created and enqueued.
+// NOTE(review): create_new_token() presumably replaces m_current_token, so the
+// non-attribute path discards whatever token was being built - confirm.
+#define FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE \
+ do { \
+ for (auto code_point : m_temporary_buffer) { \
+ if (consumed_as_part_of_an_attribute()) { \
+ m_current_token.m_tag.attributes.last().value_builder.append_code_point(code_point); \
+ } else { \
+ create_new_token(HTMLToken::Type::Character); \
+ m_current_token.m_comment_or_character.data.append_code_point(code_point); \
+ m_queued_tokens.enqueue(m_current_token); \
+ } \
+ } \
+ } while (0)
+
+// Rewinds the input to m_prev_utf8_iterator, the position next_code_point()
+// saved before it last advanced, so that code point will be read again.
+#define DONT_CONSUME_NEXT_INPUT_CHARACTER \
+ do { \
+ m_utf8_iterator = m_prev_utf8_iterator; \
+ } while (0)
+
+// Input-character matchers used inside the tokenizer's state blocks. Each one
+// expands to a bare `if (...)` guarding the `{ ... }` block written after it and
+// reads the local `current_input_character`, which is empty at end of input.
+// They are independent `if`s, not an else-if chain, so a matched block has to
+// leave the state (continue / goto / return) before the next matcher is reached.
+
+// Matches exactly one code point. The argument is parenthesized so that an
+// expression argument cannot change the meaning of the comparison.
+#define ON(code_point) \
+    if (current_input_character.has_value() && current_input_character.value() == (code_point))
+
+// Matches end of input.
+#define ON_EOF \
+    if (!current_input_character.has_value())
+
+// The <ctype.h> classifiers are only defined for EOF and for values that fit in
+// an unsigned char, and they are locale dependent above 0x7f. Code points range
+// up to 0x10ffff, so the ASCII matchers below restrict the value to the ASCII
+// range before calling them.
+#define ON_ASCII_ALPHA \
+    if (current_input_character.has_value() && current_input_character.value() <= 0x7f \
+        && isalpha(static_cast<int>(current_input_character.value())))
+
+#define ON_ASCII_ALPHANUMERIC \
+    if (current_input_character.has_value() && current_input_character.value() <= 0x7f \
+        && isalnum(static_cast<int>(current_input_character.value())))
+
+#define ON_ASCII_UPPER_ALPHA \
+    if (current_input_character.has_value() && current_input_character.value() >= 'A' && current_input_character.value() <= 'Z')
+
+#define ON_ASCII_LOWER_ALPHA \
+    if (current_input_character.has_value() && current_input_character.value() >= 'a' && current_input_character.value() <= 'z')
+
+#define ON_ASCII_DIGIT \
+    if (current_input_character.has_value() && current_input_character.value() <= 0x7f \
+        && isdigit(static_cast<int>(current_input_character.value())))
+
+#define ON_ASCII_HEX_DIGIT \
+    if (current_input_character.has_value() && current_input_character.value() <= 0x7f \
+        && isxdigit(static_cast<int>(current_input_character.value())))
+
+// Matches tab, line feed, form feed and space. The values are compared
+// explicitly instead of searching a string with strchr(): strchr() narrows its
+// argument to char (so U+0120 would match ' ') and it also matches the string's
+// terminating NUL, which made U+0000 count as whitespace and kept it from ever
+// reaching the ON(0) handlers of states that test for whitespace first.
+#define ON_WHITESPACE \
+    if (current_input_character.has_value() \
+        && (current_input_character.value() == '\t' || current_input_character.value() == '\n' \
+            || current_input_character.value() == '\f' || current_input_character.value() == ' '))
+
+// Unconditional fallback; written as an `if` so it reads like the other matchers.
+#define ANYTHING_ELSE if (1)
+
+// Emits the end-of-file token at most once. If m_has_emitted_eof is already set
+// the enclosing function returns an empty value; otherwise the flag is set, an
+// EndOfFile token is created, reported through will_emit(), enqueued, and
+// m_queued_tokens.dequeue() is returned. Tokens enqueued before this macro runs
+// are therefore presumably handed out ahead of the EndOfFile token.
+#define EMIT_EOF \
+ do { \
+ if (m_has_emitted_eof) \
+ return {}; \
+ m_has_emitted_eof = true; \
+ create_new_token(HTMLToken::Type::EndOfFile); \
+ will_emit(m_current_token); \
+ m_queued_tokens.enqueue(m_current_token); \
+ return m_queued_tokens.dequeue(); \
+ } while (0)
+
+// Reports m_current_token through will_emit(), enqueues it and returns
+// m_queued_tokens.dequeue() from the enclosing function. The state is unchanged.
+#define EMIT_CURRENT_TOKEN \
+ do { \
+ will_emit(m_current_token); \
+ m_queued_tokens.enqueue(m_current_token); \
+ return m_queued_tokens.dequeue(); \
+ } while (0)
+
+// Creates a Character token containing `code_point`, enqueues it and returns
+// m_queued_tokens.dequeue() from the enclosing function.
+// NOTE(review): unlike EMIT_CURRENT_TOKEN this does not call will_emit() -
+// confirm whether that is intentional.
+#define EMIT_CHARACTER(code_point) \
+ do { \
+ create_new_token(HTMLToken::Type::Character); \
+ m_current_token.m_comment_or_character.data.append_code_point(code_point); \
+ m_queued_tokens.enqueue(m_current_token); \
+ return m_queued_tokens.dequeue(); \
+ } while (0)
+
+// EMIT_CHARACTER for the current input character. It calls .value(), so it must
+// not be used at end of input. The expansion already ends in ';'.
+#define EMIT_CURRENT_CHARACTER \
+ EMIT_CHARACTER(current_input_character.value());
+
+// Records `new_state` as the current state and then emits `code_point` as a
+// character token. No code point is consumed and no jump is made; the new state
+// starts running on the next call of the enclosing function.
+#define SWITCH_TO_AND_EMIT_CHARACTER(code_point, new_state) \
+ do { \
+ will_switch_to(State::new_state); \
+ m_state = State::new_state; \
+ EMIT_CHARACTER(code_point); \
+ } while (0)
+
+// SWITCH_TO_AND_EMIT_CHARACTER for the current input character (must not be EOF).
+#define SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(new_state) \
+ SWITCH_TO_AND_EMIT_CHARACTER(current_input_character.value(), new_state)
+
+// Opens the handler for one tokenizer state. It defines both a goto label named
+// after the state (the target of SWITCH_TO / RECONSUME_IN) and the matching
+// `case State::state:` of the dispatching switch, then opens three nested
+// scopes. The braces are intentionally unbalanced; END_STATE closes them.
+#define BEGIN_STATE(state) \
+ state: \
+ case State::state: { \
+ { \
+ {
+
+// Closes the handler opened by BEGIN_STATE. Every path through a state is
+// expected to leave it explicitly (continue / goto / return), so falling off
+// the end trips ASSERT_NOT_REACHED().
+#define END_STATE \
+ ASSERT_NOT_REACHED(); \
+ break; \
+ } \
+ } \
+ }
+
+// Returns true when `code_point` lies in the surrogate range U+D800..U+DFFF.
+static inline bool is_surrogate(u32 code_point)
+{
+    return code_point >= 0xd800 && code_point <= 0xdfff;
+}
+
+// Returns true for Unicode noncharacters: U+FDD0..U+FDEF and, in every plane up
+// to U+10FFFF, the two code points whose low 16 bits are FFFE or FFFF.
+static inline bool is_noncharacter(u32 code_point)
+{
+    if (code_point > 0x10ffff)
+        return false;
+    if (code_point >= 0xfdd0 && code_point <= 0xfdef)
+        return true;
+    // Any value whose low 16 bits are FFFE/FFFF is at least 0xfffe, so no
+    // separate lower-bound check is needed here.
+    return (code_point & 0xfffe) == 0xfffe;
+}
+
+// Returns true for C0 control code points (U+0000..U+001F).
+static inline bool is_c0_control(u32 code_point)
+{
+    return code_point < 0x20;
+}
+
+// Returns true for control code points: the C0 controls, plus U+007F..U+009F.
+static inline bool is_control(u32 code_point)
+{
+    if (is_c0_control(code_point))
+        return true;
+    return code_point >= 0x7f && code_point <= 0x9f;
+}
+
+// Consumes one code point from the input and returns it, or returns an empty
+// value (leaving both iterators untouched) once the end of the input is reached.
+Optional<u32> HTMLTokenizer::next_code_point()
+{
+    if (m_utf8_iterator == m_utf8_view.end())
+        return {};
+
+    // Remember where this code point starts so that callers can rewind to it.
+    m_prev_utf8_iterator = m_utf8_iterator;
+    ++m_utf8_iterator;
+
+    const auto consumed = *m_prev_utf8_iterator;
+#ifdef TOKENIZER_TRACE
+    dbg() << "(Tokenizer) Next code_point: " << (char)consumed;
+#endif
+    return consumed;
+}
+
+// Returns the code point `offset` positions ahead of the next unconsumed one
+// without consuming anything; offset 0 is the code point next_code_point() would
+// return. Returns an empty value if that position is at or past the end of input.
+Optional<u32> HTMLTokenizer::peek_code_point(size_t offset) const
+{
+    auto cursor = m_utf8_iterator;
+    size_t remaining = offset;
+    while (remaining > 0 && cursor != m_utf8_view.end()) {
+        ++cursor;
+        --remaining;
+    }
+
+    if (cursor == m_utf8_view.end())
+        return {};
+    return *cursor;
+}
+
+Optional<HTMLToken> HTMLTokenizer::next_token()
+{
+_StartOfFunction:
+ if (!m_queued_tokens.is_empty())
+ return m_queued_tokens.dequeue();
+
+ for (;;) {
+ auto current_input_character = next_code_point();
+ switch (m_state) {
+ BEGIN_STATE(Data)
+ {
+ ON('&')
+ {
+ m_return_state = State::Data;
+ SWITCH_TO(CharacterReference);
+ }
+ ON('<')
+ {
+ SWITCH_TO(TagOpen);
+ }
+ ON(0)
+ {
+ PARSE_ERROR();
+ EMIT_CURRENT_CHARACTER;
+ }
+ ON_EOF
+ {
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ EMIT_CURRENT_CHARACTER;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(TagOpen)
+ {
+ ON('!')
+ {
+ SWITCH_TO(MarkupDeclarationOpen);
+ }
+ ON('/')
+ {
+ SWITCH_TO(EndTagOpen);
+ }
+ ON_ASCII_ALPHA
+ {
+ create_new_token(HTMLToken::Type::StartTag);
+ RECONSUME_IN(TagName);
+ }
+ ON('?')
+ {
+ PARSE_ERROR();
+ create_new_token(HTMLToken::Type::Comment);
+ RECONSUME_IN(BogusComment);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ m_queued_tokens.enqueue(HTMLToken::make_character('<'));
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ PARSE_ERROR();
+ EMIT_CHARACTER_AND_RECONSUME_IN('<', Data);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(TagName)
+ {
+ ON_WHITESPACE
+ {
+ SWITCH_TO(BeforeAttributeName);
+ }
+ ON('/')
+ {
+ SWITCH_TO(SelfClosingStartTag);
+ }
+ ON('>')
+ {
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON_ASCII_UPPER_ALPHA
+ {
+ m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
+ continue;
+ }
+ ON(0)
+ {
+ PARSE_ERROR();
+ m_current_token.m_tag.tag_name.append_code_point(0xFFFD);
+ continue;
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ m_current_token.m_tag.tag_name.append_code_point(current_input_character.value());
+ continue;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(EndTagOpen)
+ {
+ ON_ASCII_ALPHA
+ {
+ create_new_token(HTMLToken::Type::EndTag);
+ RECONSUME_IN(TagName);
+ }
+ ON('>')
+ {
+ PARSE_ERROR();
+ SWITCH_TO(Data);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ m_queued_tokens.enqueue(HTMLToken::make_character('<'));
+ m_queued_tokens.enqueue(HTMLToken::make_character('/'));
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ PARSE_ERROR();
+ create_new_token(HTMLToken::Type::Comment);
+ RECONSUME_IN(BogusComment);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(MarkupDeclarationOpen)
+ {
+ DONT_CONSUME_NEXT_INPUT_CHARACTER;
+ if (consume_next_if_match("--")) {
+ create_new_token(HTMLToken::Type::Comment);
+ SWITCH_TO(CommentStart);
+ }
+ if (consume_next_if_match("DOCTYPE", CaseSensitivity::CaseInsensitive)) {
+ SWITCH_TO(DOCTYPE);
+ }
+ if (consume_next_if_match("[CDATA[")) {
+ TODO();
+ }
+ ANYTHING_ELSE
+ {
+ PARSE_ERROR();
+ create_new_token(HTMLToken::Type::Comment);
+ SWITCH_TO(BogusComment);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(BogusComment)
+ {
+ ON('>')
+ {
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON_EOF
+ {
+ m_queued_tokens.enqueue(m_current_token);
+ EMIT_EOF;
+ }
+ ON(0)
+ {
+ PARSE_ERROR();
+ m_current_token.m_comment_or_character.data.append_code_point(0xFFFD);
+ continue;
+ }
+ ANYTHING_ELSE
+ {
+ m_current_token.m_comment_or_character.data.append_code_point(current_input_character.value());
+ continue;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(DOCTYPE)
+ {
+ ON_WHITESPACE
+ {
+ SWITCH_TO(BeforeDOCTYPEName);
+ }
+ ON('>')
+ {
+ RECONSUME_IN(BeforeDOCTYPEName);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ create_new_token(HTMLToken::Type::DOCTYPE);
+ m_current_token.m_doctype.force_quirks = true;
+ m_queued_tokens.enqueue(m_current_token);
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ PARSE_ERROR();
+ RECONSUME_IN(BeforeDOCTYPEName);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(BeforeDOCTYPEName)
+ {
+ ON_WHITESPACE
+ {
+ continue;
+ }
+ ON_ASCII_UPPER_ALPHA
+ {
+ create_new_token(HTMLToken::Type::DOCTYPE);
+ m_current_token.m_doctype.name.append(tolower(current_input_character.value()));
+ m_current_token.m_doctype.missing_name = false;
+ SWITCH_TO(DOCTYPEName);
+ }
+ ON(0)
+ {
+ PARSE_ERROR();
+ create_new_token(HTMLToken::Type::DOCTYPE);
+ m_current_token.m_doctype.name.append_code_point(0xFFFD);
+ m_current_token.m_doctype.missing_name = false;
+ SWITCH_TO(DOCTYPEName);
+ }
+ ON('>')
+ {
+ PARSE_ERROR();
+ create_new_token(HTMLToken::Type::DOCTYPE);
+ m_current_token.m_doctype.force_quirks = true;
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ create_new_token(HTMLToken::Type::DOCTYPE);
+ m_current_token.m_doctype.force_quirks = true;
+ m_queued_tokens.enqueue(m_current_token);
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ create_new_token(HTMLToken::Type::DOCTYPE);
+ m_current_token.m_doctype.name.append_code_point(current_input_character.value());
+ m_current_token.m_doctype.missing_name = false;
+ SWITCH_TO(DOCTYPEName);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(DOCTYPEName)
+ {
+ ON_WHITESPACE
+ {
+ SWITCH_TO(AfterDOCTYPEName);
+ }
+ ON('>')
+ {
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON_ASCII_UPPER_ALPHA
+ {
+ m_current_token.m_doctype.name.append(tolower(current_input_character.value()));
+ continue;
+ }
+ ON(0)
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.name.append_code_point(0xFFFD);
+ continue;
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ m_queued_tokens.enqueue(m_current_token);
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ m_current_token.m_doctype.name.append_code_point(current_input_character.value());
+ continue;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(AfterDOCTYPEName)
+ {
+ ON_WHITESPACE
+ {
+ continue;
+ }
+ ON('>')
+ {
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ m_queued_tokens.enqueue(m_current_token);
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ if (toupper(current_input_character.value()) == 'P' && consume_next_if_match("UBLIC", CaseSensitivity::CaseInsensitive)) {
+ SWITCH_TO(AfterDOCTYPEPublicKeyword);
+ }
+ if (toupper(current_input_character.value()) == 'S' && consume_next_if_match("YSTEM", CaseSensitivity::CaseInsensitive)) {
+ SWITCH_TO(AfterDOCTYPESystemKeyword);
+ }
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ RECONSUME_IN(BogusDOCTYPE);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(AfterDOCTYPEPublicKeyword)
+ {
+ ON_WHITESPACE
+ {
+ SWITCH_TO(BeforeDOCTYPEPublicIdentifier);
+ }
+ ON('"')
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.public_identifier.clear();
+ m_current_token.m_doctype.missing_public_identifier = false;
+ SWITCH_TO(DOCTYPEPublicIdentifierDoubleQuoted);
+ }
+ ON('\'')
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.public_identifier.clear();
+ m_current_token.m_doctype.missing_public_identifier = false;
+ SWITCH_TO(DOCTYPEPublicIdentifierSingleQuoted);
+ }
+ ON('>')
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ m_queued_tokens.enqueue(m_current_token);
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ RECONSUME_IN(BogusDOCTYPE);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(AfterDOCTYPESystemKeyword)
+ {
+ ON_WHITESPACE
+ {
+ SWITCH_TO(BeforeDOCTYPESystemIdentifier);
+ }
+ ON('"')
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.system_identifier.clear();
+ m_current_token.m_doctype.missing_system_identifier = false;
+ SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted);
+ }
+ ON('\'')
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.system_identifier.clear();
+ m_current_token.m_doctype.missing_system_identifier = false;
+ SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted);
+ }
+ ON('>')
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ m_queued_tokens.enqueue(m_current_token);
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ RECONSUME_IN(BogusDOCTYPE);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(BeforeDOCTYPEPublicIdentifier)
+ {
+ ON_WHITESPACE
+ {
+ continue;
+ }
+ ON('"')
+ {
+ m_current_token.m_doctype.public_identifier.clear();
+ m_current_token.m_doctype.missing_public_identifier = false;
+ SWITCH_TO(DOCTYPEPublicIdentifierDoubleQuoted);
+ }
+ ON('\'')
+ {
+ m_current_token.m_doctype.public_identifier.clear();
+ m_current_token.m_doctype.missing_public_identifier = false;
+ SWITCH_TO(DOCTYPEPublicIdentifierSingleQuoted);
+ }
+ ON('>')
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ m_queued_tokens.enqueue(m_current_token);
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ RECONSUME_IN(BogusDOCTYPE);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(BeforeDOCTYPESystemIdentifier)
+ {
+ ON_WHITESPACE
+ {
+ continue;
+ }
+ ON('"')
+ {
+ m_current_token.m_doctype.system_identifier.clear();
+ m_current_token.m_doctype.missing_system_identifier = false;
+ SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted);
+ }
+ ON('\'')
+ {
+ m_current_token.m_doctype.system_identifier.clear();
+ m_current_token.m_doctype.missing_system_identifier = false;
+ SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted);
+ }
+ ON('>')
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ m_queued_tokens.enqueue(m_current_token);
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ RECONSUME_IN(BogusDOCTYPE);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuoted)
+ {
+ ON('"')
+ {
+ SWITCH_TO(AfterDOCTYPEPublicIdentifier);
+ }
+ ON(0)
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.public_identifier.append_code_point(0xFFFD);
+ continue;
+ }
+ ON('>')
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ m_queued_tokens.enqueue(m_current_token);
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ m_current_token.m_doctype.public_identifier.append_code_point(current_input_character.value());
+ continue;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuoted)
+ {
+ ON('\'')
+ {
+ SWITCH_TO(AfterDOCTYPEPublicIdentifier);
+ }
+ ON(0)
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.public_identifier.append_code_point(0xFFFD);
+ continue;
+ }
+ ON('>')
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ m_queued_tokens.enqueue(m_current_token);
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ m_current_token.m_doctype.public_identifier.append_code_point(current_input_character.value());
+ continue;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuoted)
+ {
+ ON('"')
+ {
+ SWITCH_TO(AfterDOCTYPESystemIdentifier);
+ }
+ ON(0)
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.system_identifier.append_code_point(0xFFFD);
+ continue;
+ }
+ ON('>')
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ m_queued_tokens.enqueue(m_current_token);
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ m_current_token.m_doctype.system_identifier.append_code_point(current_input_character.value());
+ continue;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(DOCTYPESystemIdentifierSingleQuoted)
+ {
+ ON('\'')
+ {
+ SWITCH_TO(AfterDOCTYPESystemIdentifier);
+ }
+ ON(0)
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.system_identifier.append_code_point(0xFFFD);
+ continue;
+ }
+ ON('>')
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ m_queued_tokens.enqueue(m_current_token);
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ m_current_token.m_doctype.system_identifier.append_code_point(current_input_character.value());
+ continue;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(AfterDOCTYPEPublicIdentifier)
+ {
+ ON_WHITESPACE
+ {
+ SWITCH_TO(BetweenDOCTYPEPublicAndSystemIdentifiers);
+ }
+ ON('>')
+ {
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON('"')
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.system_identifier.clear();
+ m_current_token.m_doctype.missing_system_identifier = false;
+ SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted);
+ }
+ ON('\'')
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.system_identifier.clear();
+ m_current_token.m_doctype.missing_system_identifier = false;
+ SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ m_queued_tokens.enqueue(m_current_token);
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ RECONSUME_IN(BogusDOCTYPE);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiers)
+ {
+ ON_WHITESPACE
+ {
+ continue;
+ }
+ ON('>')
+ {
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON('"')
+ {
+ m_current_token.m_doctype.system_identifier.clear();
+ m_current_token.m_doctype.missing_system_identifier = false;
+ SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted);
+ }
+ ON('\'')
+ {
+ m_current_token.m_doctype.system_identifier.clear();
+ m_current_token.m_doctype.missing_system_identifier = false;
+ SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ m_queued_tokens.enqueue(m_current_token);
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ RECONSUME_IN(BogusDOCTYPE);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(AfterDOCTYPESystemIdentifier)
+ {
+ ON_WHITESPACE
+ {
+ continue;
+ }
+ ON('>')
+ {
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ m_current_token.m_doctype.force_quirks = true;
+ m_queued_tokens.enqueue(m_current_token);
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ PARSE_ERROR();
+ RECONSUME_IN(BogusDOCTYPE);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(BogusDOCTYPE)
+ {
+ ON('>')
+ {
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON(0)
+ {
+ PARSE_ERROR();
+ continue;
+ }
+ ON_EOF
+ {
+ m_queued_tokens.enqueue(m_current_token);
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ continue;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(BeforeAttributeName)
+ {
+ ON_WHITESPACE
+ {
+ continue;
+ }
+ ON('/')
+ {
+ RECONSUME_IN(AfterAttributeName);
+ }
+ ON('>')
+ {
+ RECONSUME_IN(AfterAttributeName);
+ }
+ ON_EOF
+ {
+ RECONSUME_IN(AfterAttributeName);
+ }
+ ON('=')
+ {
+ PARSE_ERROR();
+ auto new_attribute = HTMLToken::AttributeBuilder();
+ new_attribute.local_name_builder.append_code_point(current_input_character.value());
+ m_current_token.m_tag.attributes.append(new_attribute);
+ SWITCH_TO(AttributeName);
+ }
+ ANYTHING_ELSE
+ {
+ m_current_token.m_tag.attributes.append(HTMLToken::AttributeBuilder());
+ RECONSUME_IN(AttributeName);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(SelfClosingStartTag)
+ {
+ ON('>')
+ {
+ m_current_token.m_tag.self_closing = true;
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ PARSE_ERROR();
+ RECONSUME_IN(BeforeAttributeName);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(AttributeName)
+ {
+ ON_WHITESPACE
+ {
+ RECONSUME_IN(AfterAttributeName);
+ }
+ ON('/')
+ {
+ RECONSUME_IN(AfterAttributeName);
+ }
+ ON('>')
+ {
+ RECONSUME_IN(AfterAttributeName);
+ }
+ ON_EOF
+ {
+ RECONSUME_IN(AfterAttributeName);
+ }
+ ON('=')
+ {
+ SWITCH_TO(BeforeAttributeValue);
+ }
+ ON_ASCII_UPPER_ALPHA
+ {
+ m_current_token.m_tag.attributes.last().local_name_builder.append_code_point(tolower(current_input_character.value()));
+ continue;
+ }
+ ON(0)
+ {
+ PARSE_ERROR();
+ m_current_token.m_tag.attributes.last().local_name_builder.append_code_point(0xFFFD);
+ continue;
+ }
+ ON('"')
+ {
+ PARSE_ERROR();
+ goto AnythingElseAttributeName;
+ }
+ ON('\'')
+ {
+ PARSE_ERROR();
+ goto AnythingElseAttributeName;
+ }
+ ON('<')
+ {
+ PARSE_ERROR();
+ goto AnythingElseAttributeName;
+ }
+ ANYTHING_ELSE
+ {
+ AnythingElseAttributeName:
+ m_current_token.m_tag.attributes.last().local_name_builder.append_code_point(current_input_character.value());
+ continue;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(AfterAttributeName)
+ {
+ ON_WHITESPACE
+ {
+ continue;
+ }
+ ON('/')
+ {
+ SWITCH_TO(SelfClosingStartTag);
+ }
+ ON('=')
+ {
+ SWITCH_TO(BeforeAttributeValue);
+ }
+ ON('>')
+ {
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ m_current_token.m_tag.attributes.append(HTMLToken::AttributeBuilder());
+ RECONSUME_IN(AttributeName);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(BeforeAttributeValue)
+ {
+ ON_WHITESPACE
+ {
+ continue;
+ }
+ ON('"')
+ {
+ SWITCH_TO(AttributeValueDoubleQuoted);
+ }
+ ON('\'')
+ {
+ SWITCH_TO(AttributeValueSingleQuoted);
+ }
+ ON('>')
+ {
+ PARSE_ERROR();
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ANYTHING_ELSE
+ {
+ RECONSUME_IN(AttributeValueUnquoted);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(AttributeValueDoubleQuoted)
+ {
+ ON('"')
+ {
+ SWITCH_TO(AfterAttributeValueQuoted);
+ }
+ ON('&')
+ {
+ m_return_state = State::AttributeValueDoubleQuoted;
+ SWITCH_TO(CharacterReference);
+ }
+ ON(0)
+ {
+ PARSE_ERROR();
+ m_current_token.m_tag.attributes.last().value_builder.append_code_point(0xFFFD);
+ continue;
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ m_current_token.m_tag.attributes.last().value_builder.append_code_point(current_input_character.value());
+ continue;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(AttributeValueSingleQuoted)
+ {
+ ON('\'')
+ {
+ SWITCH_TO(AfterAttributeValueQuoted);
+ }
+ ON('&')
+ {
+ m_return_state = State::AttributeValueSingleQuoted;
+ SWITCH_TO(CharacterReference);
+ }
+ ON(0)
+ {
+ PARSE_ERROR();
+ m_current_token.m_tag.attributes.last().value_builder.append_code_point(0xFFFD);
+ continue;
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ m_current_token.m_tag.attributes.last().value_builder.append_code_point(current_input_character.value());
+ continue;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(AttributeValueUnquoted)
+ {
+ ON_WHITESPACE
+ {
+ SWITCH_TO(BeforeAttributeName);
+ }
+ ON('&')
+ {
+ m_return_state = State::AttributeValueUnquoted;
+ SWITCH_TO(CharacterReference);
+ }
+ ON('>')
+ {
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON(0)
+ {
+ PARSE_ERROR();
+ m_current_token.m_tag.attributes.last().value_builder.append_code_point(0xFFFD);
+ continue;
+ }
+ ON('"')
+ {
+ PARSE_ERROR();
+ goto AnythingElseAttributeValueUnquoted;
+ }
+ ON('\'')
+ {
+ PARSE_ERROR();
+ goto AnythingElseAttributeValueUnquoted;
+ }
+ ON('<')
+ {
+ PARSE_ERROR();
+ goto AnythingElseAttributeValueUnquoted;
+ }
+ ON('=')
+ {
+ PARSE_ERROR();
+ goto AnythingElseAttributeValueUnquoted;
+ }
+ ON('`')
+ {
+ PARSE_ERROR();
+ goto AnythingElseAttributeValueUnquoted;
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ AnythingElseAttributeValueUnquoted:
+ m_current_token.m_tag.attributes.last().value_builder.append_code_point(current_input_character.value());
+ continue;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(AfterAttributeValueQuoted)
+ {
+ ON_WHITESPACE
+ {
+ SWITCH_TO(BeforeAttributeName);
+ }
+ ON('/')
+ {
+ SWITCH_TO(SelfClosingStartTag);
+ }
+ ON('>')
+ {
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ PARSE_ERROR();
+ RECONSUME_IN(BeforeAttributeName);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(CommentStart)
+ {
+ ON('-')
+ {
+ SWITCH_TO(CommentStartDash);
+ }
+ ON('>')
+ {
+ PARSE_ERROR();
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ANYTHING_ELSE
+ {
+ RECONSUME_IN(Comment);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(CommentStartDash)
+ {
+ ON('-')
+ {
+ SWITCH_TO(CommentEnd);
+ }
+ ON('>')
+ {
+ PARSE_ERROR();
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ m_queued_tokens.enqueue(m_current_token);
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ m_current_token.m_comment_or_character.data.append('-');
+ RECONSUME_IN(Comment);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(Comment)
+ {
+ ON('<')
+ {
+ m_current_token.m_comment_or_character.data.append_code_point(current_input_character.value());
+ SWITCH_TO(CommentLessThanSign);
+ }
+ ON('-')
+ {
+ SWITCH_TO(CommentEndDash);
+ }
+ ON(0)
+ {
+ PARSE_ERROR();
+ m_current_token.m_comment_or_character.data.append_code_point(0xFFFD);
+ continue;
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ m_queued_tokens.enqueue(m_current_token);
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ m_current_token.m_comment_or_character.data.append_code_point(current_input_character.value());
+ continue;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(CommentEnd)
+ {
+ ON('>')
+ {
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON('!')
+ {
+ SWITCH_TO(CommentEndBang);
+ }
+ ON('-')
+ {
+ m_current_token.m_comment_or_character.data.append('-');
+ continue;
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ m_queued_tokens.enqueue(m_current_token);
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ m_current_token.m_comment_or_character.data.append('-');
+ RECONSUME_IN(Comment);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(CommentEndBang)
+ {
+ ON('-')
+ {
+ m_current_token.m_comment_or_character.data.append("--!");
+ SWITCH_TO(CommentEndDash);
+ }
+ ON('>')
+ {
+ PARSE_ERROR();
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ m_queued_tokens.enqueue(m_current_token);
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ m_current_token.m_comment_or_character.data.append("--!");
+ RECONSUME_IN(Comment);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(CommentEndDash)
+ {
+ ON('-')
+ {
+ SWITCH_TO(CommentEnd);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ m_queued_tokens.enqueue(m_current_token);
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ m_current_token.m_comment_or_character.data.append('-');
+ RECONSUME_IN(Comment);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(CommentLessThanSign)
+ {
+ ON('!')
+ {
+ m_current_token.m_comment_or_character.data.append_code_point(current_input_character.value());
+ SWITCH_TO(CommentLessThanSignBang);
+ }
+ ON('<')
+ {
+ m_current_token.m_comment_or_character.data.append_code_point(current_input_character.value());
+ continue;
+ }
+ ANYTHING_ELSE
+ {
+ RECONSUME_IN(Comment);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(CommentLessThanSignBang)
+ {
+ ON('-')
+ {
+ SWITCH_TO(CommentLessThanSignBangDash);
+ }
+ ANYTHING_ELSE
+ {
+ RECONSUME_IN(Comment);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(CommentLessThanSignBangDash)
+ {
+ ON('-')
+ {
+ SWITCH_TO(CommentLessThanSignBangDashDash);
+ }
+ ANYTHING_ELSE
+ {
+ RECONSUME_IN(CommentEndDash);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(CommentLessThanSignBangDashDash)
+ {
+ ON('>')
+ {
+ RECONSUME_IN(CommentEnd);
+ }
+ ON_EOF
+ {
+ RECONSUME_IN(CommentEnd);
+ }
+ ANYTHING_ELSE
+ {
+ PARSE_ERROR();
+ RECONSUME_IN(CommentEnd);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(CharacterReference)
+ {
+ m_temporary_buffer.clear();
+ m_temporary_buffer.append('&');
+
+ ON_ASCII_ALPHANUMERIC
+ {
+ RECONSUME_IN(NamedCharacterReference);
+ }
+ ON('#')
+ {
+ m_temporary_buffer.append(current_input_character.value());
+ SWITCH_TO(NumericCharacterReference);
+ }
+ ANYTHING_ELSE
+ {
+ FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
+ RECONSUME_IN_RETURN_STATE;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(NamedCharacterReference)
+ {
+ size_t byte_offset = m_utf8_view.byte_offset_of(m_prev_utf8_iterator);
+
+ auto match = HTML::code_points_from_entity(m_decoded_input.substring_view(byte_offset, m_decoded_input.length() - byte_offset - 1));
+
+ if (match.has_value()) {
+ for (size_t i = 0; i < match.value().entity.length() - 1; ++i) {
+ m_prev_utf8_iterator = m_utf8_iterator;
+ ++m_utf8_iterator;
+ }
+ for (auto ch : match.value().entity)
+ m_temporary_buffer.append(ch);
+
+ if (consumed_as_part_of_an_attribute() && !match.value().entity.ends_with(';')) {
+ auto next_code_point = peek_code_point(0);
+ if (next_code_point.has_value() && (next_code_point.value() == '=' || isalnum(next_code_point.value()))) {
+ FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
+ SWITCH_TO_RETURN_STATE;
+ }
+ }
+
+ if (!match.value().entity.ends_with(';')) {
+ PARSE_ERROR();
+ }
+
+ m_temporary_buffer.clear();
+ m_temporary_buffer.append(match.value().code_points);
+
+ FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
+ SWITCH_TO_RETURN_STATE;
+ } else {
+ FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
+ // FIXME: This should be SWITCH_TO, but we always lose the first character on this path, so just reconsume it.
+ // I can't wrap my head around how to do it as the spec says.
+ RECONSUME_IN(AmbiguousAmpersand);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(AmbiguousAmpersand)
+ {
+ ON_ASCII_ALPHANUMERIC
+ {
+ if (consumed_as_part_of_an_attribute()) {
+ m_current_token.m_tag.attributes.last().value_builder.append_code_point(current_input_character.value());
+ continue;
+ } else {
+ EMIT_CURRENT_CHARACTER;
+ }
+ }
+ ON(';')
+ {
+ PARSE_ERROR();
+ RECONSUME_IN_RETURN_STATE;
+ }
+ ANYTHING_ELSE
+ {
+ RECONSUME_IN_RETURN_STATE;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(NumericCharacterReference)
+ {
+ m_character_reference_code = 0;
+
+ ON('X')
+ {
+ m_temporary_buffer.append(current_input_character.value());
+ SWITCH_TO(HexadecimalCharacterReferenceStart);
+ }
+ ON('x')
+ {
+ m_temporary_buffer.append(current_input_character.value());
+ SWITCH_TO(HexadecimalCharacterReferenceStart);
+ }
+ ANYTHING_ELSE
+ {
+ RECONSUME_IN(DecimalCharacterReferenceStart);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(HexadecimalCharacterReferenceStart)
+ {
+ ON_ASCII_HEX_DIGIT
+ {
+ RECONSUME_IN(HexadecimalCharacterReference);
+ }
+ ANYTHING_ELSE
+ {
+ PARSE_ERROR();
+ FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
+ RECONSUME_IN_RETURN_STATE;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(DecimalCharacterReferenceStart)
+ {
+ ON_ASCII_DIGIT
+ {
+ RECONSUME_IN(DecimalCharacterReference);
+ }
+ ANYTHING_ELSE
+ {
+ PARSE_ERROR();
+ FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
+ RECONSUME_IN_RETURN_STATE;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(HexadecimalCharacterReference)
+ {
+ ON_ASCII_DIGIT
+ {
+ m_character_reference_code *= 16;
+ m_character_reference_code += current_input_character.value() - 0x30;
+ continue;
+ }
+ ON_ASCII_UPPER_ALPHA
+ {
+ m_character_reference_code *= 16;
+ m_character_reference_code += current_input_character.value() - 0x37;
+ continue;
+ }
+ ON_ASCII_LOWER_ALPHA
+ {
+ m_character_reference_code *= 16;
+ m_character_reference_code += current_input_character.value() - 0x57;
+ continue;
+ }
+ ON(';')
+ {
+ SWITCH_TO(NumericCharacterReferenceEnd);
+ }
+ ANYTHING_ELSE
+ {
+ PARSE_ERROR();
+ RECONSUME_IN(NumericCharacterReferenceEnd);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(DecimalCharacterReference)
+ {
+ ON_ASCII_DIGIT
+ {
+ m_character_reference_code *= 10;
+ m_character_reference_code += current_input_character.value() - 0x30;
+ continue;
+ }
+ ON(';')
+ {
+ SWITCH_TO(NumericCharacterReferenceEnd);
+ }
+ ANYTHING_ELSE
+ {
+ PARSE_ERROR();
+ RECONSUME_IN(NumericCharacterReferenceEnd);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(NumericCharacterReferenceEnd)
+ {
+ DONT_CONSUME_NEXT_INPUT_CHARACTER;
+
+ if (m_character_reference_code == 0) {
+ PARSE_ERROR();
+ m_character_reference_code = 0xFFFD;
+ }
+ if (m_character_reference_code > 0x10ffff) {
+ PARSE_ERROR();
+ m_character_reference_code = 0xFFFD;
+ }
+ if (is_surrogate(m_character_reference_code)) {
+ PARSE_ERROR();
+ m_character_reference_code = 0xFFFD;
+ }
+ if (is_noncharacter(m_character_reference_code)) {
+ PARSE_ERROR();
+ }
+ if (m_character_reference_code == 0xd || (is_control(m_character_reference_code) && !isspace(m_character_reference_code))) {
+ PARSE_ERROR();
+ constexpr struct {
+ u32 number;
+ u32 code_point;
+ } conversion_table[] = {
+ { 0x80, 0x20AC },
+ { 0x82, 0x201A },
+ { 0x83, 0x0192 },
+ { 0x84, 0x201E },
+ { 0x85, 0x2026 },
+ { 0x86, 0x2020 },
+ { 0x87, 0x2021 },
+ { 0x88, 0x02C6 },
+ { 0x89, 0x2030 },
+ { 0x8A, 0x0160 },
+ { 0x8B, 0x2039 },
+ { 0x8C, 0x0152 },
+ { 0x8E, 0x017D },
+ { 0x91, 0x2018 },
+ { 0x92, 0x2019 },
+ { 0x93, 0x201C },
+ { 0x94, 0x201D },
+ { 0x95, 0x2022 },
+ { 0x96, 0x2013 },
+ { 0x97, 0x2014 },
+ { 0x98, 0x02DC },
+ { 0x99, 0x2122 },
+ { 0x9A, 0x0161 },
+ { 0x9B, 0x203A },
+ { 0x9C, 0x0153 },
+ { 0x9E, 0x017E },
+ { 0x9F, 0x0178 },
+ };
+ for (auto& entry : conversion_table) {
+ if (m_character_reference_code == entry.number) {
+ m_character_reference_code = entry.code_point;
+ break;
+ }
+ }
+ }
+
+ m_temporary_buffer.clear();
+ m_temporary_buffer.append(m_character_reference_code);
+ FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
+ SWITCH_TO_RETURN_STATE;
+ }
+ END_STATE
+
+ BEGIN_STATE(RCDATA)
+ {
+ ON('&')
+ {
+ m_return_state = State::RCDATA;
+ SWITCH_TO(CharacterReference);
+ }
+ ON('<')
+ {
+ SWITCH_TO(RCDATALessThanSign);
+ }
+ ON(0)
+ {
+ PARSE_ERROR();
+ EMIT_CHARACTER(0xFFFD);
+ }
+ ON_EOF
+ {
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ EMIT_CURRENT_CHARACTER;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(RCDATALessThanSign)
+ {
+ ON('/')
+ {
+ m_temporary_buffer.clear();
+ SWITCH_TO(RCDATAEndTagOpen);
+ }
+ ANYTHING_ELSE
+ {
+ EMIT_CHARACTER_AND_RECONSUME_IN('<', RCDATA);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(RCDATAEndTagOpen)
+ {
+ ON_ASCII_ALPHA
+ {
+ create_new_token(HTMLToken::Type::EndTag);
+ RECONSUME_IN(RCDATAEndTagName);
+ }
+ ANYTHING_ELSE
+ {
+ m_queued_tokens.enqueue(HTMLToken::make_character('<'));
+ m_queued_tokens.enqueue(HTMLToken::make_character('/'));
+ RECONSUME_IN(RCDATA);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(RCDATAEndTagName)
+ {
+ ON_WHITESPACE
+ {
+ if (!current_end_tag_token_is_appropriate()) {
+ m_queued_tokens.enqueue(HTMLToken::make_character('<'));
+ m_queued_tokens.enqueue(HTMLToken::make_character('/'));
+ for (auto code_point : m_temporary_buffer)
+ m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
+ RECONSUME_IN(RCDATA);
+ }
+ SWITCH_TO(BeforeAttributeName);
+ }
+ ON('/')
+ {
+ if (!current_end_tag_token_is_appropriate()) {
+ m_queued_tokens.enqueue(HTMLToken::make_character('<'));
+ m_queued_tokens.enqueue(HTMLToken::make_character('/'));
+ for (auto code_point : m_temporary_buffer)
+ m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
+ RECONSUME_IN(RCDATA);
+ }
+ SWITCH_TO(SelfClosingStartTag);
+ }
+ ON('>')
+ {
+ if (!current_end_tag_token_is_appropriate()) {
+ m_queued_tokens.enqueue(HTMLToken::make_character('<'));
+ m_queued_tokens.enqueue(HTMLToken::make_character('/'));
+ for (auto code_point : m_temporary_buffer)
+ m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
+ RECONSUME_IN(RCDATA);
+ }
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON_ASCII_UPPER_ALPHA
+ {
+ m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
+ m_temporary_buffer.append(current_input_character.value());
+ continue;
+ }
+ ON_ASCII_LOWER_ALPHA
+ {
+ m_current_token.m_tag.tag_name.append_code_point(current_input_character.value());
+ m_temporary_buffer.append(current_input_character.value());
+ continue;
+ }
+ ANYTHING_ELSE
+ {
+ m_queued_tokens.enqueue(HTMLToken::make_character('<'));
+ m_queued_tokens.enqueue(HTMLToken::make_character('/'));
+ for (auto code_point : m_temporary_buffer)
+ m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
+ RECONSUME_IN(RCDATA);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(RAWTEXT)
+ {
+ ON('<')
+ {
+ SWITCH_TO(RAWTEXTLessThanSign);
+ }
+ ON(0)
+ {
+ PARSE_ERROR();
+ EMIT_CHARACTER(0xFFFD);
+ }
+ ON_EOF
+ {
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ EMIT_CURRENT_CHARACTER;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(RAWTEXTLessThanSign)
+ {
+ ON('/')
+ {
+ m_temporary_buffer.clear();
+ SWITCH_TO(RAWTEXTEndTagOpen);
+ }
+ ANYTHING_ELSE
+ {
+ EMIT_CHARACTER_AND_RECONSUME_IN('<', RAWTEXT);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(RAWTEXTEndTagOpen)
+ {
+ ON_ASCII_ALPHA
+ {
+ create_new_token(HTMLToken::Type::EndTag);
+ RECONSUME_IN(RAWTEXTEndTagName);
+ }
+ ANYTHING_ELSE
+ {
+ m_queued_tokens.enqueue(HTMLToken::make_character('<'));
+ m_queued_tokens.enqueue(HTMLToken::make_character('/'));
+ RECONSUME_IN(RAWTEXT);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(RAWTEXTEndTagName)
+ {
+ ON_WHITESPACE
+ {
+ if (!current_end_tag_token_is_appropriate()) {
+ m_queued_tokens.enqueue(HTMLToken::make_character('<'));
+ m_queued_tokens.enqueue(HTMLToken::make_character('/'));
+ for (auto code_point : m_temporary_buffer)
+ m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
+ RECONSUME_IN(RAWTEXT);
+ }
+ SWITCH_TO(BeforeAttributeName);
+ }
+ ON('/')
+ {
+ if (!current_end_tag_token_is_appropriate()) {
+ m_queued_tokens.enqueue(HTMLToken::make_character('<'));
+ m_queued_tokens.enqueue(HTMLToken::make_character('/'));
+ for (auto code_point : m_temporary_buffer)
+ m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
+ RECONSUME_IN(RAWTEXT);
+ }
+ SWITCH_TO(SelfClosingStartTag);
+ }
+ ON('>')
+ {
+ if (!current_end_tag_token_is_appropriate()) {
+ m_queued_tokens.enqueue(HTMLToken::make_character('<'));
+ m_queued_tokens.enqueue(HTMLToken::make_character('/'));
+ for (auto code_point : m_temporary_buffer)
+ m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
+ RECONSUME_IN(RAWTEXT);
+ }
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ }
+ ON_ASCII_UPPER_ALPHA
+ {
+ m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
+ m_temporary_buffer.append(current_input_character.value());
+ continue;
+ }
+ ON_ASCII_LOWER_ALPHA
+ {
+ m_current_token.m_tag.tag_name.append(current_input_character.value());
+ m_temporary_buffer.append(current_input_character.value());
+ continue;
+ }
+ ANYTHING_ELSE
+ {
+ m_queued_tokens.enqueue(HTMLToken::make_character('<'));
+ m_queued_tokens.enqueue(HTMLToken::make_character('/'));
+ for (auto code_point : m_temporary_buffer)
+ m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
+ RECONSUME_IN(RAWTEXT);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(ScriptData)
+ {
+ ON('<')
+ {
+ SWITCH_TO(ScriptDataLessThanSign);
+ }
+ ON(0)
+ {
+ PARSE_ERROR();
+ EMIT_CHARACTER(0xFFFD);
+ }
+ ON_EOF
+ {
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ EMIT_CURRENT_CHARACTER;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(PLAINTEXT)
+ {
+ ON(0)
+ {
+ PARSE_ERROR();
+ EMIT_CHARACTER(0xFFFD);
+ }
+ ON_EOF
+ {
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ EMIT_CURRENT_CHARACTER;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(ScriptDataLessThanSign)
+ {
+ ON('/')
+ {
+ m_temporary_buffer.clear();
+ SWITCH_TO(ScriptDataEndTagOpen);
+ }
+ ON('!')
+ {
+ m_queued_tokens.enqueue(HTMLToken::make_character('<'));
+ m_queued_tokens.enqueue(HTMLToken::make_character('!'));
+ SWITCH_TO(ScriptDataEscapeStart);
+ }
+ ANYTHING_ELSE
+ {
+ EMIT_CHARACTER_AND_RECONSUME_IN('<', ScriptData);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(ScriptDataEscapeStart)
+ {
+ ON('-')
+ {
+ SWITCH_TO_AND_EMIT_CHARACTER('-', ScriptDataEscapeStartDash);
+ }
+ ANYTHING_ELSE
+ {
+ RECONSUME_IN(ScriptData);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(ScriptDataEscapeStartDash)
+ {
+ ON('-')
+ {
+ SWITCH_TO_AND_EMIT_CHARACTER('-', ScriptDataEscapedDashDash);
+ }
+ ANYTHING_ELSE
+ {
+ RECONSUME_IN(ScriptData);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(ScriptDataEscapedDashDash)
+ {
+ ON('-')
+ {
+ EMIT_CHARACTER('-');
+ }
+ ON('<')
+ {
+ SWITCH_TO(ScriptDataEscapedLessThanSign);
+ }
+ ON('>')
+ {
+ SWITCH_TO_AND_EMIT_CHARACTER('>', ScriptData);
+ }
+ ON(0)
+ {
+ PARSE_ERROR();
+ SWITCH_TO_AND_EMIT_CHARACTER(0xFFFD, ScriptDataEscaped);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(ScriptDataEscapedLessThanSign)
+ {
+ ON('/')
+ {
+ m_temporary_buffer.clear();
+ SWITCH_TO(ScriptDataEscapedEndTagOpen);
+ }
+ ON_ASCII_ALPHA
+ {
+ m_temporary_buffer.clear();
+ EMIT_CHARACTER_AND_RECONSUME_IN('<', ScriptDataDoubleEscapeStart);
+ }
+ ANYTHING_ELSE
+ {
+ EMIT_CHARACTER_AND_RECONSUME_IN('<', ScriptDataEscaped);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(ScriptDataEscapedEndTagOpen)
+ {
+ ON_ASCII_ALPHA
+ {
+ create_new_token(HTMLToken::Type::EndTag);
+ RECONSUME_IN(ScriptDataEscapedEndTagName);
+ }
+ ANYTHING_ELSE
+ {
+ m_queued_tokens.enqueue(HTMLToken::make_character('<'));
+ m_queued_tokens.enqueue(HTMLToken::make_character('/'));
+ RECONSUME_IN(ScriptDataEscaped);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(ScriptDataEscapedEndTagName)
+ {
+ ON_WHITESPACE
+ {
+ if (current_end_tag_token_is_appropriate())
+ SWITCH_TO(BeforeAttributeName);
+
+ m_queued_tokens.enqueue(HTMLToken::make_character('<'));
+ m_queued_tokens.enqueue(HTMLToken::make_character('/'));
+ for (auto code_point : m_temporary_buffer) {
+ m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
+ }
+ RECONSUME_IN(ScriptDataEscaped);
+ }
+ ON('/')
+ {
+ if (current_end_tag_token_is_appropriate())
+ SWITCH_TO(SelfClosingStartTag);
+
+ m_queued_tokens.enqueue(HTMLToken::make_character('<'));
+ m_queued_tokens.enqueue(HTMLToken::make_character('/'));
+ for (auto code_point : m_temporary_buffer) {
+ m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
+ }
+ RECONSUME_IN(ScriptDataEscaped);
+ }
+ ON('>')
+ {
+ if (current_end_tag_token_is_appropriate())
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+
+ m_queued_tokens.enqueue(HTMLToken::make_character('<'));
+ m_queued_tokens.enqueue(HTMLToken::make_character('/'));
+ for (auto code_point : m_temporary_buffer) {
+ m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
+ }
+ RECONSUME_IN(ScriptDataEscaped);
+ }
+ ON_ASCII_UPPER_ALPHA
+ {
+ m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
+ m_temporary_buffer.append(current_input_character.value());
+ continue;
+ }
+ ON_ASCII_LOWER_ALPHA
+ {
+ m_current_token.m_tag.tag_name.append(current_input_character.value());
+ m_temporary_buffer.append(current_input_character.value());
+ continue;
+ }
+ ANYTHING_ELSE
+ {
+ m_queued_tokens.enqueue(HTMLToken::make_character('<'));
+ m_queued_tokens.enqueue(HTMLToken::make_character('/'));
+ for (auto code_point : m_temporary_buffer) {
+ m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
+ }
+ RECONSUME_IN(ScriptDataEscaped);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(ScriptDataDoubleEscapeStart)
+ {
+ auto temporary_buffer_equal_to_script = [this]() -> bool {
+ if (m_temporary_buffer.size() != 6)
+ return false;
+
+ // FIXME: Is there a better way of doing this?
+ return m_temporary_buffer[0] == 's' && m_temporary_buffer[1] == 'c' && m_temporary_buffer[2] == 'r' && m_temporary_buffer[3] == 'i' && m_temporary_buffer[4] == 'p' && m_temporary_buffer[5] == 't';
+ };
+ ON_WHITESPACE
+ {
+ if (temporary_buffer_equal_to_script())
+ SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
+ else
+ SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
+ }
+ ON('/')
+ {
+ if (temporary_buffer_equal_to_script())
+ SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
+ else
+ SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
+ }
+ ON('>')
+ {
+ if (temporary_buffer_equal_to_script())
+ SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
+ else
+ SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
+ }
+ ON_ASCII_UPPER_ALPHA
+ {
+ m_temporary_buffer.append(tolower(current_input_character.value()));
+ EMIT_CURRENT_CHARACTER;
+ }
+ ON_ASCII_LOWER_ALPHA
+ {
+ m_temporary_buffer.append(current_input_character.value());
+ EMIT_CURRENT_CHARACTER;
+ }
+ ANYTHING_ELSE
+ {
+ RECONSUME_IN(ScriptDataEscaped);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(ScriptDataDoubleEscaped)
+ {
+ ON('-')
+ {
+ SWITCH_TO_AND_EMIT_CHARACTER('-', ScriptDataDoubleEscapedDash);
+ }
+ ON('<')
+ {
+ SWITCH_TO_AND_EMIT_CHARACTER('<', ScriptDataDoubleEscapedLessThanSign);
+ }
+ ON(0)
+ {
+ PARSE_ERROR();
+ EMIT_CHARACTER(0xFFFD);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ EMIT_CURRENT_CHARACTER;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(ScriptDataDoubleEscapedDash)
+ {
+ ON('-')
+ {
+ SWITCH_TO_AND_EMIT_CHARACTER('-', ScriptDataDoubleEscapedDashDash);
+ }
+ ON('<')
+ {
+ SWITCH_TO_AND_EMIT_CHARACTER('<', ScriptDataDoubleEscapedLessThanSign);
+ }
+ ON(0)
+ {
+ PARSE_ERROR();
+ SWITCH_TO_AND_EMIT_CHARACTER(0xFFFD, ScriptDataDoubleEscaped);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(ScriptDataDoubleEscapedDashDash)
+ {
+ ON('-')
+ {
+ EMIT_CHARACTER('-');
+ }
+ ON('<')
+ {
+ SWITCH_TO_AND_EMIT_CHARACTER('<', ScriptDataDoubleEscapedLessThanSign);
+ }
+ ON('>')
+ {
+ SWITCH_TO_AND_EMIT_CHARACTER('>', ScriptData);
+ }
+ ON(0)
+ {
+ PARSE_ERROR();
+ SWITCH_TO_AND_EMIT_CHARACTER(0xFFFD, ScriptDataDoubleEscaped);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(ScriptDataDoubleEscapedLessThanSign)
+ {
+ ON('/')
+ {
+ m_temporary_buffer.clear();
+ SWITCH_TO_AND_EMIT_CHARACTER('/', ScriptDataDoubleEscapeEnd);
+ }
+ ANYTHING_ELSE
+ {
+ RECONSUME_IN(ScriptDataDoubleEscaped);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(ScriptDataDoubleEscapeEnd)
+ {
+ auto temporary_buffer_equal_to_script = [this]() -> bool {
+ if (m_temporary_buffer.size() != 6)
+ return false;
+
+ // FIXME: Is there a better way of doing this?
+ return m_temporary_buffer[0] == 's' && m_temporary_buffer[1] == 'c' && m_temporary_buffer[2] == 'r' && m_temporary_buffer[3] == 'i' && m_temporary_buffer[4] == 'p' && m_temporary_buffer[5] == 't';
+ };
+ ON_WHITESPACE
+ {
+ if (temporary_buffer_equal_to_script())
+ SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
+ else
+ SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
+ }
+ ON('/')
+ {
+ if (temporary_buffer_equal_to_script())
+ SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
+ else
+ SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
+ }
+ ON('>')
+ {
+ if (temporary_buffer_equal_to_script())
+ SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
+ else
+ SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
+ }
+ ON_ASCII_UPPER_ALPHA
+ {
+ m_temporary_buffer.append(tolower(current_input_character.value()));
+ EMIT_CURRENT_CHARACTER;
+ }
+ ON_ASCII_LOWER_ALPHA
+ {
+ m_temporary_buffer.append(current_input_character.value());
+ EMIT_CURRENT_CHARACTER;
+ }
+ ANYTHING_ELSE
+ {
+ RECONSUME_IN(ScriptDataDoubleEscaped);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(ScriptDataEscapedDash)
+ {
+ ON('-')
+ {
+ SWITCH_TO_AND_EMIT_CHARACTER('-', ScriptDataEscapedDashDash);
+ }
+ ON('<')
+ {
+ SWITCH_TO(ScriptDataEscapedLessThanSign);
+ }
+ ON(0)
+ {
+ PARSE_ERROR();
+ SWITCH_TO_AND_EMIT_CHARACTER(0xFFFD, ScriptDataEscaped);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(ScriptDataEscaped)
+ {
+ ON('-')
+ {
+ SWITCH_TO_AND_EMIT_CHARACTER('-', ScriptDataEscapedDash);
+ }
+ ON('<')
+ {
+ SWITCH_TO(ScriptDataEscapedLessThanSign);
+ }
+ ON(0)
+ {
+ PARSE_ERROR();
+ EMIT_CHARACTER(0xFFFD);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ EMIT_CURRENT_CHARACTER;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(ScriptDataEndTagOpen)
+ {
+ ON_ASCII_ALPHA
+ {
+ create_new_token(HTMLToken::Type::EndTag);
+ RECONSUME_IN(ScriptDataEndTagName);
+ }
+ ANYTHING_ELSE
+ {
+ m_queued_tokens.enqueue(HTMLToken::make_character('<'));
+ m_queued_tokens.enqueue(HTMLToken::make_character('/'));
+ RECONSUME_IN(ScriptData);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(ScriptDataEndTagName)
+ {
+ ON_WHITESPACE
+ {
+ if (current_end_tag_token_is_appropriate())
+ SWITCH_TO(BeforeAttributeName);
+ m_queued_tokens.enqueue(HTMLToken::make_character('<'));
+ m_queued_tokens.enqueue(HTMLToken::make_character('/'));
+ for (auto code_point : m_temporary_buffer)
+ m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
+ RECONSUME_IN(ScriptData);
+ }
+ ON('/')
+ {
+ if (current_end_tag_token_is_appropriate())
+ SWITCH_TO(SelfClosingStartTag);
+ m_queued_tokens.enqueue(HTMLToken::make_character('<'));
+ m_queued_tokens.enqueue(HTMLToken::make_character('/'));
+ for (auto code_point : m_temporary_buffer)
+ m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
+ RECONSUME_IN(ScriptData);
+ }
+ ON('>')
+ {
+ if (current_end_tag_token_is_appropriate())
+ SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
+ m_queued_tokens.enqueue(HTMLToken::make_character('<'));
+ m_queued_tokens.enqueue(HTMLToken::make_character('/'));
+ for (auto code_point : m_temporary_buffer)
+ m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
+ RECONSUME_IN(ScriptData);
+ }
+ ON_ASCII_UPPER_ALPHA
+ {
+ m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
+ m_temporary_buffer.append(current_input_character.value());
+ continue;
+ }
+ ON_ASCII_LOWER_ALPHA
+ {
+ m_current_token.m_tag.tag_name.append(current_input_character.value());
+ m_temporary_buffer.append(current_input_character.value());
+ continue;
+ }
+ ANYTHING_ELSE
+ {
+ m_queued_tokens.enqueue(HTMLToken::make_character('<'));
+ m_queued_tokens.enqueue(HTMLToken::make_character('/'));
+ for (auto code_point : m_temporary_buffer)
+ m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
+ RECONSUME_IN(ScriptData);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(CDATASection)
+ {
+ ON(']')
+ {
+ SWITCH_TO(CDATASectionBracket);
+ }
+ ON_EOF
+ {
+ PARSE_ERROR();
+ EMIT_EOF;
+ }
+ ANYTHING_ELSE
+ {
+ EMIT_CURRENT_CHARACTER;
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(CDATASectionBracket)
+ {
+ ON(']')
+ {
+ SWITCH_TO(CDATASectionEnd);
+ }
+ ANYTHING_ELSE
+ {
+ EMIT_CHARACTER_AND_RECONSUME_IN(']', CDATASection);
+ }
+ }
+ END_STATE
+
+ BEGIN_STATE(CDATASectionEnd)
+ {
+ ON(']')
+ {
+ EMIT_CHARACTER(']');
+ }
+ ON('>')
+ {
+ SWITCH_TO(Data);
+ }
+ ANYTHING_ELSE
+ {
+ m_queued_tokens.enqueue(HTMLToken::make_character(']'));
+ m_queued_tokens.enqueue(HTMLToken::make_character(']'));
+ RECONSUME_IN(CDATASection);
+ }
+ }
+ END_STATE
+
+ default:
+ TODO();
+ }
+ }
+}
+
+// Checks whether the upcoming input spells out `string` and, only if it does,
+// advances the input iterator past it. Returns true when the match succeeded.
+//
+// The comparison pass below only calls peek_code_point(); the iterators are
+// touched exclusively after the whole string has matched.
+bool HTMLTokenizer::consume_next_if_match(const StringView& string, CaseSensitivity case_sensitivity)
+{
+    const bool fold_case = case_sensitivity == CaseSensitivity::CaseInsensitive;
+    const size_t needle_length = string.length();
+
+    // Pass 1: look ahead, bailing out on the first mismatch or when a peek yields no value.
+    size_t offset = 0;
+    while (offset < needle_length) {
+        auto peeked = peek_code_point(offset);
+        if (!peeked.has_value())
+            return false;
+
+        // FIXME: This should be more Unicode-aware.
+        // Case folding is applied only to code points below 0x80; anything else
+        // has to equal the corresponding element of `string` exactly.
+        bool matches;
+        if (fold_case && peeked.value() < 0x80)
+            matches = tolower(peeked.value()) == tolower(string[offset]);
+        else
+            matches = peeked.value() == (u32)string[offset];
+
+        if (!matches)
+            return false;
+        ++offset;
+    }
+
+    // Pass 2: step the iterator once per element of `string`, remembering the
+    // previous position before every step.
+    for (size_t remaining = needle_length; remaining > 0; --remaining) {
+        m_prev_utf8_iterator = m_utf8_iterator;
+        ++m_utf8_iterator;
+    }
+    return true;
+}
+
+// Replaces the token currently being built with a default-constructed one
+// and tags it with the requested type.
+void HTMLTokenizer::create_new_token(HTMLToken::Type type)
+{
+    m_current_token = HTMLToken {};
+    m_current_token.m_type = type;
+}
+
+// Builds a tokenizer over `input`: the bytes are converted to UTF-8 using the
+// decoder registered for `encoding`, and the iterator is placed at the start
+// of the converted text.
+HTMLTokenizer::HTMLTokenizer(const StringView& input, const String& encoding)
+{
+    auto* text_decoder = TextCodec::decoder_for(encoding);
+    // A missing decoder is asserted on rather than handled.
+    ASSERT(text_decoder);
+
+    m_decoded_input = text_decoder->to_utf8(input);
+
+    // The view is constructed over the member string, and the iterator is
+    // obtained from that view, so these assignments must stay in this order.
+    m_utf8_view = Utf8View(m_decoded_input);
+    m_utf8_iterator = m_utf8_view.begin();
+}
+
+// Trace helper: when TOKENIZER_TRACE is defined, logs the current state and
+// the state about to be switched to. Otherwise it does nothing.
+void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
+{
+#if defined(TOKENIZER_TRACE)
+    dbg() << "[" << state_name(m_state) << "] Switch to " << state_name(new_state);
+#endif
+}
+
+// Trace helper: when TOKENIZER_TRACE is defined, logs the current state and
+// the state in which the input is about to be reconsumed. Otherwise a no-op.
+void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state)
+{
+#if defined(TOKENIZER_TRACE)
+    dbg() << "[" << state_name(m_state) << "] Reconsume in " << state_name(new_state);
+#endif
+}
+
+// Sets the tokenizer state on behalf of the document parser. The Badge
+// parameter restricts callers to HTMLDocumentParser. With TOKENIZER_TRACE
+// defined, the transition is logged before the state is overwritten.
+void HTMLTokenizer::switch_to(Badge<HTMLDocumentParser>, State new_state)
+{
+#if defined(TOKENIZER_TRACE)
+    dbg() << "[" << state_name(m_state) << "] Parser switches tokenizer state to " << state_name(new_state);
+#endif
+    m_state = new_state;
+}
+
+// Records `token` as the last emitted start tag when it is one; tokens of any
+// other kind leave the remembered start tag untouched.
+void HTMLTokenizer::will_emit(HTMLToken& token)
+{
+    if (!token.is_start_tag())
+        return;
+
+    m_last_emitted_start_tag = token;
+}
+
+// Returns true when the end tag currently being built has the same tag name
+// as the most recently emitted start tag. If no start tag has been recorded,
+// the answer is always false. Must only be called while the current token is
+// an end tag (asserted).
+bool HTMLTokenizer::current_end_tag_token_is_appropriate() const
+{
+    ASSERT(m_current_token.is_end_tag());
+
+    const bool have_start_tag = m_last_emitted_start_tag.is_start_tag();
+    // Short-circuit keeps tag_name() from being queried on a non-start-tag token.
+    return have_start_tag && m_current_token.tag_name() == m_last_emitted_start_tag.tag_name();
+}
+
+// Reports whether the recorded return state is one of the three
+// attribute-value states (unquoted, single-quoted, double-quoted).
+bool HTMLTokenizer::consumed_as_part_of_an_attribute() const
+{
+    switch (m_return_state) {
+    case State::AttributeValueUnquoted:
+    case State::AttributeValueSingleQuoted:
+    case State::AttributeValueDoubleQuoted:
+        return true;
+    default:
+        return false;
+    }
+}
+
+}