diff options
5 files changed, 60 insertions, 45 deletions
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLDocumentParser.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLDocumentParser.cpp index cdf0e0a4f8..06c7823523 100644 --- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLDocumentParser.cpp +++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLDocumentParser.cpp @@ -262,11 +262,11 @@ DOM::QuirksMode HTMLDocumentParser::which_quirks_mode(const HTMLToken& doctype_t return DOM::QuirksMode::Yes; // NOTE: The tokenizer puts the name into lower case for us. - if (doctype_token.m_doctype.name.to_string() != "html") + if (doctype_token.m_doctype.name != "html") return DOM::QuirksMode::Yes; - auto public_identifier = doctype_token.m_doctype.public_identifier.to_string(); - auto system_identifier = doctype_token.m_doctype.system_identifier.to_string(); + auto const& public_identifier = doctype_token.m_doctype.public_identifier; + auto const& system_identifier = doctype_token.m_doctype.system_identifier; if (public_identifier.equals_ignoring_case("-//W3O//DTD W3 HTML Strict 3.0//EN//")) return DOM::QuirksMode::Yes; @@ -324,9 +324,9 @@ void HTMLDocumentParser::handle_initial(HTMLToken& token) if (token.is_doctype()) { auto doctype = adopt_ref(*new DOM::DocumentType(document())); - doctype->set_name(token.m_doctype.name.to_string()); - doctype->set_public_id(token.m_doctype.public_identifier.to_string()); - doctype->set_system_id(token.m_doctype.system_identifier.to_string()); + doctype->set_name(token.m_doctype.name); + doctype->set_public_id(token.m_doctype.public_identifier); + doctype->set_system_id(token.m_doctype.system_identifier); document().append_child(move(doctype)); document().set_quirks_mode(which_quirks_mode(token)); m_insertion_mode = InsertionMode::BeforeHTML; diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.cpp index 7a81844fd2..09348360fd 100644 --- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.cpp +++ 
b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.cpp @@ -16,7 +16,7 @@ String HTMLToken::to_string() const case HTMLToken::Type::DOCTYPE: builder.append("DOCTYPE"); builder.append(" { name: '"); - builder.append(m_doctype.name.to_string()); + builder.append(m_doctype.name); builder.append("' }"); break; case HTMLToken::Type::StartTag: diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h index ef8eaf78b3..9f5a64d663 100644 --- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h +++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h @@ -194,11 +194,11 @@ private: struct { // NOTE: "Missing" is a distinct state from the empty string. - StringBuilder name; + String name; bool missing_name { true }; - StringBuilder public_identifier; + String public_identifier; bool missing_public_identifier { true }; - StringBuilder system_identifier; + String system_identifier; bool missing_system_identifier { true }; bool force_quirks { false }; } m_doctype; diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp index b3f61d9e5b..541f66c7f7 100644 --- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp +++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp @@ -20,12 +20,18 @@ namespace Web::HTML { #define CONSUME_NEXT_INPUT_CHARACTER \ current_input_character = next_code_point(); -#define SWITCH_TO(new_state) \ - do { \ - will_switch_to(State::new_state); \ - m_state = State::new_state; \ - CONSUME_NEXT_INPUT_CHARACTER; \ - goto new_state; \ +#define SWITCH_TO(new_state) \ + do { \ + VERIFY(m_current_builder.is_empty()); \ + SWITCH_TO_WITH_UNCLEAN_BUILDER(new_state); \ + } while (0) + +#define SWITCH_TO_WITH_UNCLEAN_BUILDER(new_state) \ + do { \ + will_switch_to(State::new_state); \ + m_state = State::new_state; \ + CONSUME_NEXT_INPUT_CHARACTER; \ + goto new_state; \ } while (0) #define RECONSUME_IN(new_state) \ @@ -449,17 +455,17 @@ 
_StartOfFunction: ON_ASCII_UPPER_ALPHA { create_new_token(HTMLToken::Type::DOCTYPE); - m_current_token.m_doctype.name.append(to_ascii_lowercase(current_input_character.value())); + m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value())); m_current_token.m_doctype.missing_name = false; - SWITCH_TO(DOCTYPEName); + SWITCH_TO_WITH_UNCLEAN_BUILDER(DOCTYPEName); } ON(0) { log_parse_error(); create_new_token(HTMLToken::Type::DOCTYPE); - m_current_token.m_doctype.name.append_code_point(0xFFFD); + m_current_builder.append_code_point(0xFFFD); m_current_token.m_doctype.missing_name = false; - SWITCH_TO(DOCTYPEName); + SWITCH_TO_WITH_UNCLEAN_BUILDER(DOCTYPEName); } ON('>') { @@ -479,9 +485,9 @@ _StartOfFunction: ANYTHING_ELSE { create_new_token(HTMLToken::Type::DOCTYPE); - m_current_token.m_doctype.name.append_code_point(current_input_character.value()); + m_current_builder.append_code_point(current_input_character.value()); m_current_token.m_doctype.missing_name = false; - SWITCH_TO(DOCTYPEName); + SWITCH_TO_WITH_UNCLEAN_BUILDER(DOCTYPEName); } } END_STATE @@ -490,21 +496,23 @@ _StartOfFunction: { ON_WHITESPACE { + m_current_token.m_doctype.name = consume_current_builder(); SWITCH_TO(AfterDOCTYPEName); } ON('>') { + m_current_token.m_doctype.name = consume_current_builder(); SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); } ON_ASCII_UPPER_ALPHA { - m_current_token.m_doctype.name.append(to_ascii_lowercase(current_input_character.value())); + m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value())); continue; } ON(0) { log_parse_error(); - m_current_token.m_doctype.name.append_code_point(0xFFFD); + m_current_builder.append_code_point(0xFFFD); continue; } ON_EOF @@ -516,7 +524,7 @@ _StartOfFunction: } ANYTHING_ELSE { - m_current_token.m_doctype.name.append_code_point(current_input_character.value()); + m_current_builder.append_code_point(current_input_character.value()); continue; } } @@ -563,14 +571,12 @@ 
_StartOfFunction: ON('"') { log_parse_error(); - m_current_token.m_doctype.public_identifier.clear(); m_current_token.m_doctype.missing_public_identifier = false; SWITCH_TO(DOCTYPEPublicIdentifierDoubleQuoted); } ON('\'') { log_parse_error(); - m_current_token.m_doctype.public_identifier.clear(); m_current_token.m_doctype.missing_public_identifier = false; SWITCH_TO(DOCTYPEPublicIdentifierSingleQuoted); } @@ -605,14 +611,14 @@ _StartOfFunction: ON('"') { log_parse_error(); - m_current_token.m_doctype.system_identifier.clear(); + m_current_token.m_doctype.system_identifier = {}; m_current_token.m_doctype.missing_system_identifier = false; SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted); } ON('\'') { log_parse_error(); - m_current_token.m_doctype.system_identifier.clear(); + m_current_token.m_doctype.system_identifier = {}; m_current_token.m_doctype.missing_system_identifier = false; SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted); } @@ -646,13 +652,11 @@ _StartOfFunction: } ON('"') { - m_current_token.m_doctype.public_identifier.clear(); m_current_token.m_doctype.missing_public_identifier = false; SWITCH_TO(DOCTYPEPublicIdentifierDoubleQuoted); } ON('\'') { - m_current_token.m_doctype.public_identifier.clear(); m_current_token.m_doctype.missing_public_identifier = false; SWITCH_TO(DOCTYPEPublicIdentifierSingleQuoted); } @@ -686,13 +690,11 @@ _StartOfFunction: } ON('"') { - m_current_token.m_doctype.system_identifier.clear(); m_current_token.m_doctype.missing_system_identifier = false; SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted); } ON('\'') { - m_current_token.m_doctype.system_identifier.clear(); m_current_token.m_doctype.missing_system_identifier = false; SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted); } @@ -722,17 +724,19 @@ _StartOfFunction: { ON('"') { + m_current_token.m_doctype.public_identifier = consume_current_builder(); SWITCH_TO(AfterDOCTYPEPublicIdentifier); } ON(0) { log_parse_error(); - 
m_current_token.m_doctype.public_identifier.append_code_point(0xFFFD); + m_current_builder.append_code_point(0xFFFD); continue; } ON('>') { log_parse_error(); + m_current_token.m_doctype.public_identifier = consume_current_builder(); m_current_token.m_doctype.force_quirks = true; SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); } @@ -745,7 +749,7 @@ _StartOfFunction: } ANYTHING_ELSE { - m_current_token.m_doctype.public_identifier.append_code_point(current_input_character.value()); + m_current_builder.append_code_point(current_input_character.value()); continue; } } @@ -755,17 +759,19 @@ _StartOfFunction: { ON('\'') { + m_current_token.m_doctype.public_identifier = consume_current_builder(); SWITCH_TO(AfterDOCTYPEPublicIdentifier); } ON(0) { log_parse_error(); - m_current_token.m_doctype.public_identifier.append_code_point(0xFFFD); + m_current_builder.append_code_point(0xFFFD); continue; } ON('>') { log_parse_error(); + m_current_token.m_doctype.public_identifier = consume_current_builder(); m_current_token.m_doctype.force_quirks = true; SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); } @@ -778,7 +784,7 @@ _StartOfFunction: } ANYTHING_ELSE { - m_current_token.m_doctype.public_identifier.append_code_point(current_input_character.value()); + m_current_builder.append_code_point(current_input_character.value()); continue; } } @@ -788,17 +794,19 @@ _StartOfFunction: { ON('"') { + m_current_token.m_doctype.system_identifier = consume_current_builder(); SWITCH_TO(AfterDOCTYPESystemIdentifier); } ON(0) { log_parse_error(); - m_current_token.m_doctype.system_identifier.append_code_point(0xFFFD); + m_current_builder.append_code_point(0xFFFD); continue; } ON('>') { log_parse_error(); + m_current_token.m_doctype.system_identifier = consume_current_builder(); m_current_token.m_doctype.force_quirks = true; SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); } @@ -811,7 +819,7 @@ _StartOfFunction: } ANYTHING_ELSE { - 
m_current_token.m_doctype.system_identifier.append_code_point(current_input_character.value()); + m_current_builder.append_code_point(current_input_character.value()); continue; } } @@ -821,17 +829,19 @@ _StartOfFunction: { ON('\'') { + m_current_token.m_doctype.system_identifier = consume_current_builder(); SWITCH_TO(AfterDOCTYPESystemIdentifier); } ON(0) { log_parse_error(); - m_current_token.m_doctype.system_identifier.append_code_point(0xFFFD); + m_current_builder.append_code_point(0xFFFD); continue; } ON('>') { log_parse_error(); + m_current_token.m_doctype.system_identifier = consume_current_builder(); m_current_token.m_doctype.force_quirks = true; SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); } @@ -844,7 +854,7 @@ _StartOfFunction: } ANYTHING_ELSE { - m_current_token.m_doctype.system_identifier.append_code_point(current_input_character.value()); + m_current_builder.append_code_point(current_input_character.value()); continue; } } @@ -863,14 +873,12 @@ _StartOfFunction: ON('"') { log_parse_error(); - m_current_token.m_doctype.system_identifier.clear(); m_current_token.m_doctype.missing_system_identifier = false; SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted); } ON('\'') { log_parse_error(); - m_current_token.m_doctype.system_identifier.clear(); m_current_token.m_doctype.missing_system_identifier = false; SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted); } @@ -902,13 +910,11 @@ _StartOfFunction: } ON('"') { - m_current_token.m_doctype.system_identifier.clear(); m_current_token.m_doctype.missing_system_identifier = false; SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted); } ON('\'') { - m_current_token.m_doctype.system_identifier.clear(); m_current_token.m_doctype.missing_system_identifier = false; SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted); } @@ -2681,4 +2687,11 @@ void HTMLTokenizer::restore_to(const Utf8CodePointIterator& new_iterator) m_utf8_iterator = new_iterator; } +String HTMLTokenizer::consume_current_builder() +{ + auto string = 
m_current_builder.to_string(); + m_current_builder.clear(); + return string; +} + } diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h index ebde43ba3c..0ede6cc4ec 100644 --- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h +++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h @@ -127,6 +127,7 @@ private: bool consume_next_if_match(const StringView&, CaseSensitivity = CaseSensitivity::CaseSensitive); void create_new_token(HTMLToken::Type); bool current_end_tag_token_is_appropriate() const; + String consume_current_builder(); static const char* state_name(State state) { @@ -163,6 +164,7 @@ private: Utf8CodePointIterator m_prev_utf8_iterator; HTMLToken m_current_token; + StringBuilder m_current_builder; Optional<String> m_last_emitted_start_tag_name; |