summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Userland/Libraries/LibWeb/HTML/Parser/HTMLDocumentParser.cpp12
-rw-r--r--Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.cpp2
-rw-r--r--Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h6
-rw-r--r--Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp83
-rw-r--r--Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h2
5 files changed, 60 insertions, 45 deletions
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLDocumentParser.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLDocumentParser.cpp
index cdf0e0a4f8..06c7823523 100644
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLDocumentParser.cpp
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLDocumentParser.cpp
@@ -262,11 +262,11 @@ DOM::QuirksMode HTMLDocumentParser::which_quirks_mode(const HTMLToken& doctype_t
return DOM::QuirksMode::Yes;
// NOTE: The tokenizer puts the name into lower case for us.
- if (doctype_token.m_doctype.name.to_string() != "html")
+ if (doctype_token.m_doctype.name != "html")
return DOM::QuirksMode::Yes;
- auto public_identifier = doctype_token.m_doctype.public_identifier.to_string();
- auto system_identifier = doctype_token.m_doctype.system_identifier.to_string();
+ auto const& public_identifier = doctype_token.m_doctype.public_identifier;
+ auto const& system_identifier = doctype_token.m_doctype.system_identifier;
if (public_identifier.equals_ignoring_case("-//W3O//DTD W3 HTML Strict 3.0//EN//"))
return DOM::QuirksMode::Yes;
@@ -324,9 +324,9 @@ void HTMLDocumentParser::handle_initial(HTMLToken& token)
if (token.is_doctype()) {
auto doctype = adopt_ref(*new DOM::DocumentType(document()));
- doctype->set_name(token.m_doctype.name.to_string());
- doctype->set_public_id(token.m_doctype.public_identifier.to_string());
- doctype->set_system_id(token.m_doctype.system_identifier.to_string());
+ doctype->set_name(token.m_doctype.name);
+ doctype->set_public_id(token.m_doctype.public_identifier);
+ doctype->set_system_id(token.m_doctype.system_identifier);
document().append_child(move(doctype));
document().set_quirks_mode(which_quirks_mode(token));
m_insertion_mode = InsertionMode::BeforeHTML;
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.cpp
index 7a81844fd2..09348360fd 100644
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.cpp
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.cpp
@@ -16,7 +16,7 @@ String HTMLToken::to_string() const
case HTMLToken::Type::DOCTYPE:
builder.append("DOCTYPE");
builder.append(" { name: '");
- builder.append(m_doctype.name.to_string());
+ builder.append(m_doctype.name);
builder.append("' }");
break;
case HTMLToken::Type::StartTag:
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h
index ef8eaf78b3..9f5a64d663 100644
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h
@@ -194,11 +194,11 @@ private:
struct {
// NOTE: "Missing" is a distinct state from the empty string.
- StringBuilder name;
+ String name;
bool missing_name { true };
- StringBuilder public_identifier;
+ String public_identifier;
bool missing_public_identifier { true };
- StringBuilder system_identifier;
+ String system_identifier;
bool missing_system_identifier { true };
bool force_quirks { false };
} m_doctype;
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp
index b3f61d9e5b..541f66c7f7 100644
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp
@@ -20,12 +20,18 @@ namespace Web::HTML {
#define CONSUME_NEXT_INPUT_CHARACTER \
current_input_character = next_code_point();
-#define SWITCH_TO(new_state) \
- do { \
- will_switch_to(State::new_state); \
- m_state = State::new_state; \
- CONSUME_NEXT_INPUT_CHARACTER; \
- goto new_state; \
+#define SWITCH_TO(new_state) \
+ do { \
+ VERIFY(m_current_builder.is_empty()); /* a handler must have consumed any accumulated text before a plain switch */ \
+ SWITCH_TO_WITH_UNCLEAN_BUILDER(new_state); \
+ } while (0)
+// Same transition as SWITCH_TO but without the empty-builder check: for handlers that have just appended to m_current_builder and switch to a state that keeps appending.
+#define SWITCH_TO_WITH_UNCLEAN_BUILDER(new_state) \
+ do { \
+ will_switch_to(State::new_state); \
+ m_state = State::new_state; \
+ CONSUME_NEXT_INPUT_CHARACTER; \
+ goto new_state; \
} while (0)
#define RECONSUME_IN(new_state) \
@@ -449,17 +455,17 @@ _StartOfFunction:
ON_ASCII_UPPER_ALPHA
{
create_new_token(HTMLToken::Type::DOCTYPE);
- m_current_token.m_doctype.name.append(to_ascii_lowercase(current_input_character.value()));
+ m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value()));
m_current_token.m_doctype.missing_name = false;
- SWITCH_TO(DOCTYPEName);
+ SWITCH_TO_WITH_UNCLEAN_BUILDER(DOCTYPEName);
}
ON(0)
{
log_parse_error();
create_new_token(HTMLToken::Type::DOCTYPE);
- m_current_token.m_doctype.name.append_code_point(0xFFFD);
+ m_current_builder.append_code_point(0xFFFD);
m_current_token.m_doctype.missing_name = false;
- SWITCH_TO(DOCTYPEName);
+ SWITCH_TO_WITH_UNCLEAN_BUILDER(DOCTYPEName);
}
ON('>')
{
@@ -479,9 +485,9 @@ _StartOfFunction:
ANYTHING_ELSE
{
create_new_token(HTMLToken::Type::DOCTYPE);
- m_current_token.m_doctype.name.append_code_point(current_input_character.value());
+ m_current_builder.append_code_point(current_input_character.value());
m_current_token.m_doctype.missing_name = false;
- SWITCH_TO(DOCTYPEName);
+ SWITCH_TO_WITH_UNCLEAN_BUILDER(DOCTYPEName);
}
}
END_STATE
@@ -490,21 +496,23 @@ _StartOfFunction:
{
ON_WHITESPACE
{
+ m_current_token.m_doctype.name = consume_current_builder();
SWITCH_TO(AfterDOCTYPEName);
}
ON('>')
{
+ m_current_token.m_doctype.name = consume_current_builder();
SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
}
ON_ASCII_UPPER_ALPHA
{
- m_current_token.m_doctype.name.append(to_ascii_lowercase(current_input_character.value()));
+ m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value()));
continue;
}
ON(0)
{
log_parse_error();
- m_current_token.m_doctype.name.append_code_point(0xFFFD);
+ m_current_builder.append_code_point(0xFFFD);
continue;
}
ON_EOF
@@ -516,7 +524,7 @@ _StartOfFunction:
}
ANYTHING_ELSE
{
- m_current_token.m_doctype.name.append_code_point(current_input_character.value());
+ m_current_builder.append_code_point(current_input_character.value());
continue;
}
}
@@ -563,14 +571,12 @@ _StartOfFunction:
ON('"')
{
log_parse_error();
- m_current_token.m_doctype.public_identifier.clear();
m_current_token.m_doctype.missing_public_identifier = false;
SWITCH_TO(DOCTYPEPublicIdentifierDoubleQuoted);
}
ON('\'')
{
log_parse_error();
- m_current_token.m_doctype.public_identifier.clear();
m_current_token.m_doctype.missing_public_identifier = false;
SWITCH_TO(DOCTYPEPublicIdentifierSingleQuoted);
}
@@ -605,14 +611,14 @@ _StartOfFunction:
ON('"')
{
log_parse_error();
- m_current_token.m_doctype.system_identifier.clear();
+ m_current_token.m_doctype.system_identifier = {}; // NOTE(review): the other identifier handlers in this change drop their reset entirely; confirm this one is still needed.
m_current_token.m_doctype.missing_system_identifier = false;
SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted);
}
ON('\'')
{
log_parse_error();
- m_current_token.m_doctype.system_identifier.clear();
+ m_current_token.m_doctype.system_identifier = {}; // NOTE(review): same as above; the sibling handlers no longer reset the field.
m_current_token.m_doctype.missing_system_identifier = false;
SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted);
}
@@ -646,13 +652,11 @@ _StartOfFunction:
}
ON('"')
{
- m_current_token.m_doctype.public_identifier.clear();
m_current_token.m_doctype.missing_public_identifier = false;
SWITCH_TO(DOCTYPEPublicIdentifierDoubleQuoted);
}
ON('\'')
{
- m_current_token.m_doctype.public_identifier.clear();
m_current_token.m_doctype.missing_public_identifier = false;
SWITCH_TO(DOCTYPEPublicIdentifierSingleQuoted);
}
@@ -686,13 +690,11 @@ _StartOfFunction:
}
ON('"')
{
- m_current_token.m_doctype.system_identifier.clear();
m_current_token.m_doctype.missing_system_identifier = false;
SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted);
}
ON('\'')
{
- m_current_token.m_doctype.system_identifier.clear();
m_current_token.m_doctype.missing_system_identifier = false;
SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted);
}
@@ -722,17 +724,19 @@ _StartOfFunction:
{
ON('"')
{
+ m_current_token.m_doctype.public_identifier = consume_current_builder();
SWITCH_TO(AfterDOCTYPEPublicIdentifier);
}
ON(0)
{
log_parse_error();
- m_current_token.m_doctype.public_identifier.append_code_point(0xFFFD);
+ m_current_builder.append_code_point(0xFFFD);
continue;
}
ON('>')
{
log_parse_error();
+ m_current_token.m_doctype.public_identifier = consume_current_builder();
m_current_token.m_doctype.force_quirks = true;
SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
}
@@ -745,7 +749,7 @@ _StartOfFunction:
}
ANYTHING_ELSE
{
- m_current_token.m_doctype.public_identifier.append_code_point(current_input_character.value());
+ m_current_builder.append_code_point(current_input_character.value());
continue;
}
}
@@ -755,17 +759,19 @@ _StartOfFunction:
{
ON('\'')
{
+ m_current_token.m_doctype.public_identifier = consume_current_builder();
SWITCH_TO(AfterDOCTYPEPublicIdentifier);
}
ON(0)
{
log_parse_error();
- m_current_token.m_doctype.public_identifier.append_code_point(0xFFFD);
+ m_current_builder.append_code_point(0xFFFD);
continue;
}
ON('>')
{
log_parse_error();
+ m_current_token.m_doctype.public_identifier = consume_current_builder();
m_current_token.m_doctype.force_quirks = true;
SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
}
@@ -778,7 +784,7 @@ _StartOfFunction:
}
ANYTHING_ELSE
{
- m_current_token.m_doctype.public_identifier.append_code_point(current_input_character.value());
+ m_current_builder.append_code_point(current_input_character.value());
continue;
}
}
@@ -788,17 +794,19 @@ _StartOfFunction:
{
ON('"')
{
+ m_current_token.m_doctype.system_identifier = consume_current_builder(); // Closing quote: the builder holds the system identifier accumulated in this state.
SWITCH_TO(AfterDOCTYPESystemIdentifier);
}
ON(0)
{
log_parse_error();
- m_current_token.m_doctype.system_identifier.append_code_point(0xFFFD);
+ m_current_builder.append_code_point(0xFFFD);
continue;
}
ON('>')
{
log_parse_error();
+ m_current_token.m_doctype.system_identifier = consume_current_builder(); // Abrupt end: keep what was read so far as the system identifier, not the public one.
m_current_token.m_doctype.force_quirks = true;
SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
}
@@ -811,7 +819,7 @@ _StartOfFunction:
}
ANYTHING_ELSE
{
- m_current_token.m_doctype.system_identifier.append_code_point(current_input_character.value());
+ m_current_builder.append_code_point(current_input_character.value());
continue;
}
}
@@ -821,17 +829,19 @@ _StartOfFunction:
{
ON('\'')
{
+ m_current_token.m_doctype.system_identifier = consume_current_builder();
SWITCH_TO(AfterDOCTYPESystemIdentifier);
}
ON(0)
{
log_parse_error();
- m_current_token.m_doctype.system_identifier.append_code_point(0xFFFD);
+ m_current_builder.append_code_point(0xFFFD);
continue;
}
ON('>')
{
log_parse_error();
+ m_current_token.m_doctype.system_identifier = consume_current_builder();
m_current_token.m_doctype.force_quirks = true;
SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
}
@@ -844,7 +854,7 @@ _StartOfFunction:
}
ANYTHING_ELSE
{
- m_current_token.m_doctype.system_identifier.append_code_point(current_input_character.value());
+ m_current_builder.append_code_point(current_input_character.value());
continue;
}
}
@@ -863,14 +873,12 @@ _StartOfFunction:
ON('"')
{
log_parse_error();
- m_current_token.m_doctype.system_identifier.clear();
m_current_token.m_doctype.missing_system_identifier = false;
SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted);
}
ON('\'')
{
log_parse_error();
- m_current_token.m_doctype.system_identifier.clear();
m_current_token.m_doctype.missing_system_identifier = false;
SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted);
}
@@ -902,13 +910,11 @@ _StartOfFunction:
}
ON('"')
{
- m_current_token.m_doctype.system_identifier.clear();
m_current_token.m_doctype.missing_system_identifier = false;
SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted);
}
ON('\'')
{
- m_current_token.m_doctype.system_identifier.clear();
m_current_token.m_doctype.missing_system_identifier = false;
SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted);
}
@@ -2681,4 +2687,11 @@ void HTMLTokenizer::restore_to(const Utf8CodePointIterator& new_iterator)
m_utf8_iterator = new_iterator;
}
+String HTMLTokenizer::consume_current_builder()
+{
+ auto consumed_text = m_current_builder.to_string(); // Take the characters accumulated so far as a String...
+ m_current_builder.clear(); // ...and empty the builder, which SWITCH_TO verifies before a plain state switch.
+ return consumed_text;
+}
+
}
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h
index ebde43ba3c..0ede6cc4ec 100644
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h
@@ -127,6 +127,7 @@ private:
bool consume_next_if_match(const StringView&, CaseSensitivity = CaseSensitivity::CaseSensitive);
void create_new_token(HTMLToken::Type);
bool current_end_tag_token_is_appropriate() const;
+ String consume_current_builder();
static const char* state_name(State state)
{
@@ -163,6 +164,7 @@ private:
Utf8CodePointIterator m_prev_utf8_iterator;
HTMLToken m_current_token;
+ StringBuilder m_current_builder;
Optional<String> m_last_emitted_start_tag_name;